diff --git a/CMakeLists.txt b/CMakeLists.txt
index ead63bff83..ff42643fac 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.4)
 project(OpenBLAS)
 set(OpenBLAS_MAJOR_VERSION 0)
 set(OpenBLAS_MINOR_VERSION 2)
-set(OpenBLAS_PATCH_VERSION 18)
+set(OpenBLAS_PATCH_VERSION 19)
 set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
 enable_language(ASM)
@@ -45,8 +45,8 @@ endif()
 message(WARNING "CMake support is experimental. This will not produce the same Makefiles that OpenBLAS ships with. Only x86 support is currently available.")
-include("${CMAKE_SOURCE_DIR}/cmake/utils.cmake")
-include("${CMAKE_SOURCE_DIR}/cmake/system.cmake")
+include("${PROJECT_SOURCE_DIR}/cmake/utils.cmake")
+include("${PROJECT_SOURCE_DIR}/cmake/system.cmake")
 set(BLASDIRS interface driver/level2 driver/level3 driver/others)
@@ -123,9 +123,9 @@ endforeach ()
 # Can't just use lapack-netlib's CMake files, since they are set up to search for BLAS, build and install a binary. We just want to build a couple of lib files out of lapack and lapacke.
 # Not using add_subdirectory here because lapack-netlib already has its own CMakeLists.txt. Instead include a cmake script with the sources we want.
 if (NOT NOFORTRAN AND NOT NO_LAPACK)
- include("${CMAKE_SOURCE_DIR}/cmake/lapack.cmake")
+ include("${PROJECT_SOURCE_DIR}/cmake/lapack.cmake")
 if (NOT NO_LAPACKE)
- include("${CMAKE_SOURCE_DIR}/cmake/lapacke.cmake")
+ include("${PROJECT_SOURCE_DIR}/cmake/lapacke.cmake")
 endif ()
 endif ()
@@ -137,7 +137,7 @@ endif()
 # add objects to the openblas lib
 add_library(${OpenBLAS_LIBNAME} SHARED ${LA_SOURCES} ${LAPACKE_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE})
-include("${CMAKE_SOURCE_DIR}/cmake/export.cmake")
+include("${PROJECT_SOURCE_DIR}/cmake/export.cmake")
 # Set output for libopenblas
 set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index ebe52ea8a1..5ecf32b914 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -150,3 +150,14 @@ In chronological order:
 * theoractice
   * [2016-03-20] Fix compiler error in VisualStudio with CMake
   * [2016-03-22] Fix access violation on Windows while static linking
+
+* Paul Mustière
+  * [2016-02-04] Fix Android build on ARMV7
+  * [2016-04-26] Android build with LAPACK for ARMV7 & ARMV8
+
+* Shivraj Patil
+  * [2016-05-03] DGEMM optimization for MIPS P5600 and I6400 using MSA
+
+* Kaustubh Raste
+  * [2016-05-09] DTRSM optimization for MIPS P5600 and I6400 using MSA
+  * [2016-05-20] STRSM optimization for MIPS P5600 and I6400 using MSA
diff --git a/Changelog.txt b/Changelog.txt
index 7f82e8e883..2eb27ab04b 100644
--- a/Changelog.txt
+++ b/Changelog.txt
@@ -1,4 +1,22 @@
 OpenBLAS ChangeLog
+====================================================================
+Version 0.2.19
+1-Sep-2016
+common:
+ * Improved cross compiling.
+ * Fix the bug on musl libc.
+
+POWER:
+ * Optimize BLAS on Power8
+ * Fixed Julia+OpenBLAS bugs on Power8
+
+MIPS:
+ * Optimize BLAS on MIPS P5600 and I6400 (Thanks, Shivraj Patil, Kaustubh Raste)
+
+ARM:
+ * Improved on ARM Cortex-A57.
(Thanks, Ashwin Sekhar T K) + + ==================================================================== Version 0.2.18 12-Apr-2016 diff --git a/Makefile b/Makefile index 9ba2bffb34..2ae0047989 100644 --- a/Makefile +++ b/Makefile @@ -108,8 +108,6 @@ endif tests : ifndef NOFORTRAN -ifndef TARGET -ifndef CROSS touch $(LIBNAME) ifndef NO_FBLAS $(MAKE) -C test all @@ -119,8 +117,6 @@ ifndef NO_CBLAS $(MAKE) -C ctest all endif endif -endif -endif libs : ifeq ($(CORE), UNKOWN) diff --git a/Makefile.install b/Makefile.install index 5da4e68c9c..1b9388a8b5 100644 --- a/Makefile.install +++ b/Makefile.install @@ -20,75 +20,75 @@ lib.grd : $(error OpenBLAS: Please run "make" firstly) install : lib.grd - @-mkdir -p $(DESTDIR)$(PREFIX) - @-mkdir -p $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) - @-mkdir -p $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) - @-mkdir -p $(DESTDIR)$(OPENBLAS_BINARY_DIR) - @-mkdir -p $(DESTDIR)$(OPENBLAS_CMAKE_DIR) + @-mkdir -p "$(DESTDIR)$(PREFIX)" + @-mkdir -p "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)" + @-mkdir -p "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" + @-mkdir -p "$(DESTDIR)$(OPENBLAS_BINARY_DIR)" + @-mkdir -p "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)" @echo Generating openblas_config.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) #for inc - @echo \#ifndef OPENBLAS_CONFIG_H > $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h - @echo \#define OPENBLAS_CONFIG_H >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h - @$(AWK) 'NF {print $$1, "OPENBLAS_"$$2, $$3}' config_last.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h - @echo \#define OPENBLAS_VERSION \" OpenBLAS $(VERSION) \" >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h - @cat openblas_config_template.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h - @echo \#endif \/\* OPENBLAS_CONFIG_H \*\/ >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h + @echo \#ifndef OPENBLAS_CONFIG_H > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h" + @echo \#define OPENBLAS_CONFIG_H >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h" + @$(AWK) 'NF {print $$1, "OPENBLAS_"$$2, $$3}' config_last.h >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h" + @echo \#define OPENBLAS_VERSION \" OpenBLAS $(VERSION) \" >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h" + @cat openblas_config_template.h >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h" + @echo \#endif \/\* OPENBLAS_CONFIG_H \*\/ >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h" @echo Generating f77blas.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) - @echo \#ifndef OPENBLAS_F77BLAS_H > $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h - @echo \#define OPENBLAS_F77BLAS_H >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h - @echo \#include \"openblas_config.h\" >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h - @cat common_interface.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h - @echo \#endif >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h + @echo \#ifndef OPENBLAS_F77BLAS_H > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h" + @echo \#define OPENBLAS_F77BLAS_H >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h" + @echo \#include \"openblas_config.h\" >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h" + @cat common_interface.h >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h" + @echo \#endif >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h" ifndef NO_CBLAS @echo Generating cblas.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) - @sed 's/common/openblas_config/g' cblas.h > $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h + @sed 's/common/openblas_config/g' cblas.h > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h" endif ifndef 
NO_LAPACKE @echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) - @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h - @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h - @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h - @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h + @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h" + @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h" + @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h" + @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h" endif #for install static library ifndef NO_STATIC @echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) - @install -pm644 $(LIBNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) - @cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \ + @install -pm644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" + @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) endif #for install shared library ifndef NO_SHARED @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS)) - @install -pm755 $(LIBSONAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) - @cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \ + @install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" + @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) endif ifeq ($(OSNAME), FreeBSD) - @cp $(LIBSONAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) - @cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \ + @cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" + @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so endif ifeq ($(OSNAME), NetBSD) - @cp $(LIBSONAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) - @cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \ + @cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" + @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so endif ifeq ($(OSNAME), Darwin) - @-cp $(LIBDYNNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) - @-install_name_tool -id $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) - @cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \ + @-cp $(LIBDYNNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" + @-install_name_tool -id "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" + @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib endif ifeq ($(OSNAME), WINNT) - @-cp $(LIBDLLNAME) $(DESTDIR)$(OPENBLAS_BINARY_DIR) - @-cp $(LIBDLLNAME).a $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) + @-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)" + @-cp $(LIBDLLNAME).a "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" endif ifeq ($(OSNAME), CYGWIN_NT) @-cp $(LIBDLLNAME) $(OPENBLAS_BINARY_DIR) @@ -96,34 +96,34 @@ endif endif #Generating OpenBLASConfig.cmake @echo Generating $(OPENBLAS_CMAKE_CONFIG) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR) - @echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > 
$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) - @echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) + @echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" + @echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" ifndef NO_SHARED #ifeq logical or ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD)) - @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) + @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" endif ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT)) - @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_BINARY_DIR}/$(LIBDLLNAME))" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) + @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_BINARY_DIR}/$(LIBDLLNAME))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" endif ifeq ($(OSNAME), Darwin) - @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).dylib)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) + @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).dylib)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" endif else #only static - @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).$(LIBSUFFIX))" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) + @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).$(LIBSUFFIX))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" endif #Generating OpenBLASConfigVersion.cmake @echo Generating $(OPENBLAS_CMAKE_CONFIG_VERSION) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR) - @echo "set (PACKAGE_VERSION \"${VERSION}\")" > $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) - @echo "if (PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) - @echo " set (PACKAGE_VERSION_COMPATIBLE FALSE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) - @echo "else ()" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) - @echo " set (PACKAGE_VERSION_COMPATIBLE TRUE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) - @echo " if (PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) - @echo " set (PACKAGE_VERSION_EXACT TRUE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) - @echo " endif ()" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) - @echo "endif ()" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) + @echo "set (PACKAGE_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" + @echo "if (PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" + @echo " set (PACKAGE_VERSION_COMPATIBLE FALSE)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" + @echo "else ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" + @echo " set (PACKAGE_VERSION_COMPATIBLE TRUE)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" + @echo " if (PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" + @echo " 
set (PACKAGE_VERSION_EXACT TRUE)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" + @echo " endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" + @echo "endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" @echo Install OK! diff --git a/Makefile.mips b/Makefile.mips new file mode 100644 index 0000000000..05ea9c679d --- /dev/null +++ b/Makefile.mips @@ -0,0 +1,3 @@ +ifdef BINARY64 +else +endif diff --git a/Makefile.power b/Makefile.power index 7e2b47386b..79db83751e 100644 --- a/Makefile.power +++ b/Makefile.power @@ -1,4 +1,26 @@ -# CCOMMON_OPT += -DALLOC_SHM + +ifdef USE_THREAD +ifeq ($(USE_THREAD), 0) +USE_OPENMP = 0 +else +USE_OPENMP = 1 +endif +else +USE_OPENMP = 1 +endif + + + +ifeq ($(CORE), POWER8) +ifeq ($(USE_OPENMP), 1) +COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp +FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp +else +COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -fno-fast-math +FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fast-math +endif +endif + FLAMEPATH = $(HOME)/flame/lib @@ -16,6 +38,16 @@ else endif endif +#Either uncomment below line or run make with `USE_MASS=1` to enable support of MASS library +#USE_MASS = 1 + +ifeq ($(USE_MASS), 1) +# Path to MASS libs, change it if the libs are installed at any other location +MASSPATH = /opt/ibm/xlmass/8.1.3/lib +COMMON_OPT += -mveclibabi=mass -ftree-vectorize -funsafe-math-optimizations -DUSE_MASS +EXTRALIB += -L$(MASSPATH) -lmass -lmassvp8 -lmass_simdp8 +endif + ifdef BINARY64 diff --git a/Makefile.prebuild b/Makefile.prebuild index ee0b677879..524f0a741f 100644 --- a/Makefile.prebuild +++ b/Makefile.prebuild @@ -17,14 +17,26 @@ ifdef CPUIDEMU EXFLAGS = -DCPUIDEMU -DVENDOR=99 endif +ifeq ($(TARGET), P5600) +TARGET_FLAGS = -mips32r5 +endif + +ifeq ($(TARGET), I6400) +TARGET_FLAGS = -mips64r6 +endif + +ifeq ($(TARGET), P6600) +TARGET_FLAGS = -mips64r6 +endif + all: getarch_2nd ./getarch_2nd 0 >> $(TARGET_MAKE) ./getarch_2nd 1 >> $(TARGET_CONF) config.h : c_check f_check getarch - perl ./c_check $(TARGET_MAKE) $(TARGET_CONF) $(CC) + perl ./c_check $(TARGET_MAKE) $(TARGET_CONF) $(CC) $(TARGET_FLAGS) ifneq ($(ONLY_CBLAS), 1) - perl ./f_check $(TARGET_MAKE) $(TARGET_CONF) $(FC) + perl ./f_check $(TARGET_MAKE) $(TARGET_CONF) $(FC) $(TARGET_FLAGS) else #When we only build CBLAS, we set NOFORTRAN=2 echo "NOFORTRAN=2" >> $(TARGET_MAKE) diff --git a/Makefile.rule b/Makefile.rule index d8db6102c6..5bb9cf0b77 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.2.18 +VERSION = 0.2.19 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library @@ -52,6 +52,7 @@ VERSION = 0.2.18 # USE_THREAD = 0 # If you're going to use this library with OpenMP, please comment it in. +# This flag is always set for POWER8. Don't modify the flag # USE_OPENMP = 1 # You can define maximum number of threads. Basically it should be @@ -153,10 +154,12 @@ NO_AFFINITY = 1 # Common Optimization Flag; # The default -O2 is enough. +# Flags for POWER8 are defined in Makefile.power. 
Don't modify COMMON_OPT # COMMON_OPT = -O2 # gfortran option for LAPACK # enable this flag only on 64bit Linux and if you need a thread safe lapack library +# Flags for POWER8 are defined in Makefile.power. Don't modify FCOMMON_OPT # FCOMMON_OPT = -frecursive # Profiling flags diff --git a/Makefile.system b/Makefile.system index b89f60e963..b05177b6c3 100644 --- a/Makefile.system +++ b/Makefile.system @@ -159,7 +159,7 @@ ifndef GOTOBLAS_MAKEFILE export GOTOBLAS_MAKEFILE = 1 # Generating Makefile.conf and config.h -DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) all) +DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all) ifndef TARGET_CORE include $(TOPDIR)/Makefile.conf @@ -462,7 +462,7 @@ endif endif endif -ifeq ($(ARCH), mips64) +ifeq ($(ARCH), $(filter $(ARCH),mips64 mips)) NO_BINARY_MODE = 1 endif @@ -502,13 +502,16 @@ endif ifdef NO_BINARY_MODE -ifeq ($(ARCH), mips64) +ifeq ($(ARCH), $(filter $(ARCH),mips64)) ifdef BINARY64 CCOMMON_OPT += -mabi=64 else CCOMMON_OPT += -mabi=n32 endif BINARY_DEFINED = 1 +else ifeq ($(ARCH), $(filter $(ARCH),mips)) +CCOMMON_OPT += -mabi=32 +BINARY_DEFINED = 1 endif ifeq ($(CORE), LOONGSON3A) @@ -521,6 +524,21 @@ CCOMMON_OPT += -march=mips64 FCOMMON_OPT += -march=mips64 endif +ifeq ($(CORE), P5600) +CCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS) +FCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS) +endif + +ifeq ($(CORE), I6400) +CCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=i6400 $(MSA_FLAGS) +FCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=i6400 $(MSA_FLAGS) +endif + +ifeq ($(CORE), P6600) +CCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=p6600 $(MSA_FLAGS) +FCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=p6600 $(MSA_FLAGS) +endif + ifeq ($(OSNAME), AIX) BINARY_DEFINED = 1 endif @@ -589,12 +607,14 @@ ifneq ($(NO_LAPACK), 1) EXTRALIB += -lgfortran endif ifdef NO_BINARY_MODE -ifeq ($(ARCH), mips64) +ifeq ($(ARCH), $(filter $(ARCH),mips64)) ifdef BINARY64 FCOMMON_OPT += -mabi=64 else FCOMMON_OPT += -mabi=n32 endif +else ifeq ($(ARCH), $(filter $(ARCH),mips)) +FCOMMON_OPT += -mabi=32 endif else ifdef BINARY64 @@ -677,21 +697,7 @@ FCOMMON_OPT += -i8 endif endif endif - -ifneq ($(ARCH), mips64) -ifndef BINARY64 -FCOMMON_OPT += -m32 -else -FCOMMON_OPT += -m64 -endif -else -ifdef BINARY64 -FCOMMON_OPT += -mabi=64 -else -FCOMMON_OPT += -mabi=n32 -endif -endif - + ifeq ($(USE_OPENMP), 1) FCOMMON_OPT += -mp endif @@ -707,7 +713,7 @@ endif endif endif -ifeq ($(ARCH), mips64) +ifeq ($(ARCH), $(filter $(ARCH),mips64 mips)) ifndef BINARY64 FCOMMON_OPT += -n32 else @@ -737,7 +743,7 @@ endif ifeq ($(C_COMPILER), OPEN64) -ifeq ($(ARCH), mips64) +ifeq ($(ARCH), $(filter $(ARCH),mips64 mips)) ifndef BINARY64 CCOMMON_OPT += -n32 else @@ -1126,6 +1132,8 @@ export HAVE_VFP export HAVE_VFPV3 export HAVE_VFPV4 export HAVE_NEON +export HAVE_MSA +export MSA_FLAGS export KERNELDIR export FUNCTION_PROFILE export TARGET_CORE diff --git a/README.md b/README.md index 32a861081f..ff55edaa14 100644 --- a/README.md +++ b/README.md @@ -43,6 +43,35 @@ On X86 box, compile this library for loongson3a CPU with loongcc (based on Open6 make DEBUG=1 +### Compile with MASS Support on Power CPU (Optional dependency) + +[IBM 
MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library consists of a set of mathematical functions for C, C++, and
+Fortran-language applications that are tuned for optimum performance on POWER architectures. OpenBLAS with MASS requires 64-bit, little-endian OS on POWER.
+The library can be installed as below -
+
+ * On Ubuntu:
+
+   wget -q http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/public.gpg -O- | sudo apt-key add -
+   echo "deb http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/ trusty main" | sudo tee /etc/apt/sources.list.d/ibm-xl-compiler-eval.list
+   sudo apt-get update
+   sudo apt-get install libxlmass-devel.8.1.3
+
+ * On RHEL/CentOS:
+
+   wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/repodata/repomd.xml.key
+   sudo rpm --import repomd.xml.key
+   wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/ibm-xl-compiler-eval.repo
+   sudo cp ibm-xl-compiler-eval.repo /etc/yum.repos.d/
+   sudo yum install libxlmass-devel.8.1.3
+
+After installing MASS library, compile openblas with USE_MASS=1.
+
+Example:
+
+Compiling on Power8 with MASS support -
+
+   make USE_MASS=1 TARGET=POWER8
+
 ### Install to the directory (optional)
 
 Example:
@@ -82,6 +111,7 @@ Please read GotoBLAS_01Readme.txt
 - **MingWin or Visual Studio(CMake)/Windows**: Please read .
 - **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are the beginner on Mac OS X.
 - **FreeBSD**: Supported by community. We didn't test the library on this OS.
+- **Android**: Supported by community. Please read .
 
 ## Usages
 Link with libopenblas.a or -lopenblas for shared library.
diff --git a/TargetList.txt b/TargetList.txt
index dc1e08722e..52a60b49cc 100644
--- a/TargetList.txt
+++ b/TargetList.txt
@@ -53,26 +53,31 @@ PPC440
 PPC440FP2
 CELL
 
-3.MIPS64 CPU:
+3.MIPS CPU:
+P5600
+
+4.MIPS64 CPU:
 SICORTEX
 LOONGSON3A
 LOONGSON3B
+I6400
+P6600
 
-4.IA64 CPU:
+5.IA64 CPU:
 ITANIUM2
 
-5.SPARC CPU:
+6.SPARC CPU:
 SPARC
 SPARCV7
 
-6.ARM CPU:
+7.ARM CPU:
 CORTEXA15
 CORTEXA9
 ARMV7
 ARMV6
 ARMV5
 
-7.ARM 64-bit CPU:
+8.ARM 64-bit CPU:
 ARMV8
 CORTEXA57
diff --git a/appveyor.yml b/appveyor.yml
index 5360a9ef9b..c9d8e47ac9 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -1,4 +1,4 @@
-version: 0.2.18.{build}
+version: 0.2.19.{build}
 
 #environment:
diff --git a/benchmark/Makefile b/benchmark/Makefile
index 8166f38630..e801ce4ebe 100644
--- a/benchmark/Makefile
+++ b/benchmark/Makefile
@@ -173,7 +173,9 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
 	sgetri.goto dgetri.goto cgetri.goto zgetri.goto \
 	spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \
 	ssymm.goto dsymm.goto csymm.goto zsymm.goto \
-	smallscaling
+	smallscaling \
+	isamax.goto idamax.goto icamax.goto izamax.goto \
+	snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto
 
 acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \
 	scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \
@@ -226,7 +228,9 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \
 	sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \
 	sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \
 	spotrf.atlas dpotrf.atlas cpotrf.atlas zpotrf.atlas \
-	ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas
+	ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas \
+	isamax.atlas idamax.atlas icamax.atlas izamax.atlas \
+	snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto
 
 mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \
scholesky.mkl dcholesky.mkl ccholesky.mkl zcholesky.mkl \ @@ -261,7 +265,9 @@ endif essl :: sgemm.essl strmm.essl dgemm.essl dtrmm.essl \ cgemm.essl ctrmm.essl zgemm.essl ztrmm.essl \ - slinpack.essl clinpack.essl dlinpack.essl zlinpack.essl + slinpack.essl clinpack.essl dlinpack.essl zlinpack.essl \ + scholesky.essl ccholesky.essl dcholesky.essl zcholesky.essl \ + strsm.essl dtrsm.essl ctrsm.essl ztrsm.essl veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ scholesky.veclib dcholesky.veclib ccholesky.veclib zcholesky.veclib \ @@ -393,6 +399,9 @@ scholesky.mkl : scholesky.$(SUFFIX) scholesky.veclib : scholesky.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +scholesky.essl : scholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Dcholesky ################################################### dcholesky.goto : dcholesky.$(SUFFIX) ../$(LIBNAME) @@ -410,6 +419,9 @@ dcholesky.mkl : dcholesky.$(SUFFIX) dcholesky.veclib : dcholesky.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +dcholesky.essl : dcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Ccholesky ################################################### ccholesky.goto : ccholesky.$(SUFFIX) ../$(LIBNAME) @@ -427,6 +439,9 @@ ccholesky.mkl : ccholesky.$(SUFFIX) ccholesky.veclib : ccholesky.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +ccholesky.essl : ccholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Zcholesky ################################################### @@ -445,6 +460,9 @@ zcholesky.mkl : zcholesky.$(SUFFIX) zcholesky.veclib : zcholesky.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +zcholesky.essl : zcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Sgemm #################################################### sgemm.goto : sgemm.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -683,6 +701,9 @@ strsm.mkl : strsm.$(SUFFIX) strsm.veclib : strsm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +strsm.essl : strsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Dtrsm #################################################### dtrsm.goto : dtrsm.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -699,6 +720,9 @@ dtrsm.mkl : dtrsm.$(SUFFIX) dtrsm.veclib : dtrsm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +dtrsm.essl : dtrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Ctrsm #################################################### ctrsm.goto : ctrsm.$(SUFFIX) ../$(LIBNAME) @@ -716,6 +740,9 @@ ctrsm.mkl : ctrsm.$(SUFFIX) ctrsm.veclib : ctrsm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +ctrsm.essl : ctrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### 
Ztrsm #################################################### ztrsm.goto : ztrsm.$(SUFFIX) ../$(LIBNAME) @@ -733,6 +760,9 @@ ztrsm.mkl : ztrsm.$(SUFFIX) ztrsm.veclib : ztrsm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +ztrsm.essl : ztrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Ssyrk #################################################### ssyrk.goto : ssyrk.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -1911,6 +1941,63 @@ zgemm3m.mkl : zgemm3m.$(SUFFIX) zgemm3m.veclib : zgemm3m.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +############################################## ISAMAX ############################################## +isamax.goto : isamax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +isamax.atlas : isamax.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +############################################## IDAMAX ############################################## +idamax.goto : idamax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +idamax.atlas : idamax.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +############################################## ICAMAX ############################################## +icamax.goto : icamax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +icamax.atlas : icamax.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +############################################## IZAMAX ############################################## +izamax.goto : izamax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +izamax.atlas : izamax.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +############################################## SNRM2 ############################################## +snrm2.goto : snrm2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +snrm2.atlas : snrm2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +############################################## DNRM2 ############################################## +dnrm2.goto : dnrm2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dnrm2.atlas : dnrm2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +############################################## Sscnrm2 ############################################## +scnrm2.goto : scnrm2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +scnrm2.atlas : scnrm2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +############################################## Ddznrm2 ############################################## +dznrm2.goto : dznrm2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dznrm2.atlas : dznrm2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + + ################################################################################################### 
slinpack.$(SUFFIX) : linpack.c @@ -2217,11 +2304,38 @@ cgemm3m.$(SUFFIX) : gemm3m.c zgemm3m.$(SUFFIX) : gemm3m.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +isamax.$(SUFFIX) : iamax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +idamax.$(SUFFIX) : iamax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +icamax.$(SUFFIX) : iamax.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +izamax.$(SUFFIX) : iamax.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + +snrm2.$(SUFFIX) : nrm2.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dnrm2.$(SUFFIX) : nrm2.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +scnrm2.$(SUFFIX) : nrm2.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +dznrm2.$(SUFFIX) : nrm2.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + smallscaling: smallscaling.c ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm -lpthread clean :: - @rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl + @rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl smallscaling include $(TOPDIR)/Makefile.tail diff --git a/benchmark/asum.c b/benchmark/asum.c index beb6402f4e..78ccdf47b9 100644 --- a/benchmark/asum.c +++ b/benchmark/asum.c @@ -183,9 +183,9 @@ int main(int argc, char *argv[]){ timeg /= loops; #ifdef COMPLEX - fprintf(stderr, " %10.2f MFlops\n", 4. * (double)m / timeg * 1.e-6); + fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 4. * (double)m / timeg * 1.e-6, timeg); #else - fprintf(stderr, " %10.2f MFlops\n", 2. * (double)m / timeg * 1.e-6); + fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 2. * (double)m / timeg * 1.e-6, timeg); #endif } diff --git a/benchmark/axpy.c b/benchmark/axpy.c index a7206b690a..37c7aeb63f 100644 --- a/benchmark/axpy.c +++ b/benchmark/axpy.c @@ -190,8 +190,8 @@ int main(int argc, char *argv[]){ timeg /= loops; fprintf(stderr, - " %10.2f MFlops\n", - COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6); + " %10.2f MFlops %10.6f sec\n", + COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6, timeg); } diff --git a/benchmark/copy.c b/benchmark/copy.c index 15c45201c6..ea5b38d686 100644 --- a/benchmark/copy.c +++ b/benchmark/copy.c @@ -190,8 +190,8 @@ int main(int argc, char *argv[]){ timeg /= loops; fprintf(stderr, - " %10.2f MBytes\n", - COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6); + " %10.2f MBytes %10.6f sec\n", + COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); } diff --git a/benchmark/dot.c b/benchmark/dot.c index 4c8d6cc384..50d05e5320 100644 --- a/benchmark/dot.c +++ b/benchmark/dot.c @@ -184,8 +184,8 @@ int main(int argc, char *argv[]){ timeg /= loops; fprintf(stderr, - " %10.2f MFlops\n", - COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6); + " %10.2f MFlops %10.6f sec\n", + COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6, timeg); } diff --git a/benchmark/gemv.c b/benchmark/gemv.c index 42af2825a7..c06e829d96 100644 --- a/benchmark/gemv.c +++ b/benchmark/gemv.c @@ -221,7 +221,7 @@ int main(int argc, char *argv[]){ timeg /= loops; - fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6); + fprintf(stderr, " %10.2f MFlops %10.6f sec\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6, timeg); } } @@ -258,7 +258,7 @@ int main(int argc, char *argv[]){ timeg /= loops; - fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. 
* (double)m * (double)n / timeg * 1.e-6); + fprintf(stderr, " %10.2f MFlops %10.6f sec\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6, timeg); } } diff --git a/benchmark/iamax.c b/benchmark/iamax.c new file mode 100644 index 0000000000..c55f415796 --- /dev/null +++ b/benchmark/iamax.c @@ -0,0 +1,190 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + + +#undef IAMAX + +#ifdef COMPLEX +#ifdef DOUBLE +#define IAMAX BLASFUNC(izamax) +#else +#define IAMAX BLASFUNC(icamax) +#endif +#else +#ifdef DOUBLE +#define IAMAX BLASFUNC(idamax) +#else +#define IAMAX BLASFUNC(isamax) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Time\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6d : ", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + + +#undef NRM2 + +#ifdef COMPLEX +#ifdef DOUBLE +#define NRM2 BLASFUNC(dznrm2) +#else +#define NRM2 BLASFUNC(scnrm2) +#endif +#else +#ifdef DOUBLE +#define NRM2 BLASFUNC(dnrm2) +#else +#define NRM2 BLASFUNC(snrm2) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = 
(long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Time\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6d : ", (int)m); + + + for (l=0; l #include #include +#include #define MIN_SIZE 5 #define MAX_SIZE 60 #define NB_SIZE 10 diff --git a/benchmark/swap.c b/benchmark/swap.c index 9f108ef50e..368c59cd43 100644 --- a/benchmark/swap.c +++ b/benchmark/swap.c @@ -190,8 +190,8 @@ int main(int argc, char *argv[]){ timeg /= loops; fprintf(stderr, - " %10.2f MBytes\n", - COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6); + " %10.2f MBytes %10.6f sec\n", + COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); } diff --git a/benchmark/trmm.c b/benchmark/trmm.c index f81e9d9129..54c7972dbc 100644 --- a/benchmark/trmm.c +++ b/benchmark/trmm.c @@ -191,8 +191,8 @@ int main(int argc, char *argv[]){ gettimeofday( &start, (struct timezone *)0); fprintf(stderr, - " %10.2f MFlops\n", - COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6); + " %10.2f MFlops %10.6f sec\n", + COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6, time1); } diff --git a/benchmark/zdot.c b/benchmark/zdot.c index d5ec997261..ed9d4d2e83 100644 --- a/benchmark/zdot.c +++ b/benchmark/zdot.c @@ -184,8 +184,8 @@ int main(int argc, char *argv[]){ timeg /= loops; fprintf(stderr, - " %10.2f MFlops\n", - COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6); + " %10.2f MFlops %10.6f sec\n", + COMPSIZE * COMPSIZE * 2. 
* (double)m / timeg * 1.e-6, timeg); } diff --git a/c_check b/c_check index bcf4c2cb35..2ec9fc484b 100644 --- a/c_check +++ b/c_check @@ -1,5 +1,8 @@ #!/usr/bin/perl +use File::Basename; +use File::Temp qw(tempfile); + # Checking cross compile $hostos = `uname -s | sed -e s/\-.*//`; chop($hostos); $hostarch = `uname -m | sed -e s/i.86/x86/`;chop($hostarch); @@ -8,6 +11,7 @@ $hostarch = "arm" if ($hostarch =~ /^arm.*/); $hostarch = "arm64" if ($hostarch eq "aarch64"); $hostarch = "power" if ($hostarch =~ /^(powerpc|ppc).*/); +$tmpf = new File::Temp( UNLINK => 1 ); $binary = $ENV{"BINARY"}; $makefile = shift(@ARGV); @@ -26,14 +30,12 @@ if ($?) { $cross_suffix = ""; -if ($ARGV[0] =~ /(.*)(-[.\d]+)/) { - if ($1 =~ /(.*-)(.*)/) { - $cross_suffix = $1; - } -} else { - if ($ARGV[0] =~ /([^\/]*-)([^\/]*$)/) { - $cross_suffix = $1; - } +if (dirname($compiler_name) ne ".") { + $cross_suffix .= dirname($compiler_name) . "/"; +} + +if (basename($compiler_name) =~ /(.*-)(.*)/) { + $cross_suffix .= $1; } $compiler = ""; @@ -63,7 +65,7 @@ $os = Android if ($data =~ /OS_ANDROID/); $architecture = x86 if ($data =~ /ARCH_X86/); $architecture = x86_64 if ($data =~ /ARCH_X86_64/); $architecture = power if ($data =~ /ARCH_POWER/); -$architecture = mips32 if ($data =~ /ARCH_MIPS32/); +$architecture = mips if ($data =~ /ARCH_MIPS/); $architecture = mips64 if ($data =~ /ARCH_MIPS64/); $architecture = alpha if ($data =~ /ARCH_ALPHA/); $architecture = sparc if ($data =~ /ARCH_SPARC/); @@ -79,7 +81,12 @@ if ($os eq "AIX") { $defined = 1; } -if (($architecture eq "mips32") || ($architecture eq "mips64")) { +if ($architecture eq "mips") { + $compiler_name .= " -mabi=32"; + $defined = 1; +} + +if ($architecture eq "mips64") { $compiler_name .= " -mabi=n32" if ($binary eq "32"); $compiler_name .= " -mabi=64" if ($binary eq "64"); $defined = 1; @@ -152,10 +159,28 @@ if ($?) { die 1; } +$have_msa = 0; +if (($architecture eq "mips") || ($architecture eq "mips64")) { + $code = '"addvi.b $w0, $w1, 1"'; + $msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs"; + print $tmpf "#include \n\n"; + print $tmpf "void main(void){ __asm__ volatile($code); }\n"; + + $args = "$msa_flags -o $tmpf.o -x c $tmpf"; + my @cmd = ("$compiler_name $args"); + system(@cmd) == 0; + if ($? 
!= 0) { + $have_msa = 0; + } else { + $have_msa = 1; + } + unlink("$tmpf.o"); +} + $architecture = x86 if ($data =~ /ARCH_X86/); $architecture = x86_64 if ($data =~ /ARCH_X86_64/); $architecture = power if ($data =~ /ARCH_POWER/); -$architecture = mips32 if ($data =~ /ARCH_MIPS32/); +$architecture = mips if ($data =~ /ARCH_MIPS/); $architecture = mips64 if ($data =~ /ARCH_MIPS64/); $architecture = alpha if ($data =~ /ARCH_ALPHA/); $architecture = sparc if ($data =~ /ARCH_SPARC/); @@ -243,9 +268,11 @@ print MAKEFILE "BINARY64=\n" if $binformat ne bin64; print MAKEFILE "BINARY32=1\n" if $binformat eq bin32; print MAKEFILE "BINARY64=1\n" if $binformat eq bin64; print MAKEFILE "FU=$need_fu\n" if $need_fu ne ""; -print MAKEFILE "CROSS_SUFFIX=$cross_suffix\n" if $cross_suffix ne ""; +print MAKEFILE "CROSS_SUFFIX=$cross_suffix\n" if $cross != 0 && $cross_suffix ne ""; print MAKEFILE "CROSS=1\n" if $cross != 0; print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n"; +print MAKEFILE "HAVE_MSA=1\n" if $have_msa eq 1; +print MAKEFILE "MSA_FLAGS=$msa_flags\n" if $have_msa eq 1; $os =~ tr/[a-z]/[A-Z]/; $architecture =~ tr/[a-z]/[A-Z]/; @@ -257,6 +284,7 @@ print CONFFILE "#define C_$compiler\t1\n"; print CONFFILE "#define __32BIT__\t1\n" if $binformat eq bin32; print CONFFILE "#define __64BIT__\t1\n" if $binformat eq bin64; print CONFFILE "#define FUNDERSCORE\t$need_fu\n" if $need_fu ne ""; +print CONFFILE "#define HAVE_MSA\t1\n" if $have_msa eq 1; if ($os eq "LINUX") { diff --git a/cmake/export.cmake b/cmake/export.cmake index adf59101f1..629f8fbc25 100644 --- a/cmake/export.cmake +++ b/cmake/export.cmake @@ -53,7 +53,7 @@ endif() add_custom_command( TARGET ${OpenBLAS_LIBNAME} PRE_LINK COMMAND perl - ARGS "${CMAKE_SOURCE_DIR}/exports/gensymbol" "win2k" "${ARCH_IN}" "dummy" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" "${SYMBOLPREFIX}" "${SYMBOLSUFFIX}" > "${PROJECT_BINARY_DIR}/openblas.def" + ARGS "${PROJECT_SOURCE_DIR}/exports/gensymbol" "win2k" "${ARCH_IN}" "dummy" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" "${SYMBOLPREFIX}" "${SYMBOLSUFFIX}" > "${PROJECT_BINARY_DIR}/openblas.def" COMMENT "Create openblas.def file" VERBATIM) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index c3fa486552..471ce90e47 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -50,20 +50,20 @@ else() set(TARGET_CONF "config.h") endif () -include("${CMAKE_SOURCE_DIR}/cmake/c_check.cmake") +include("${PROJECT_SOURCE_DIR}/cmake/c_check.cmake") if (NOT NOFORTRAN) - include("${CMAKE_SOURCE_DIR}/cmake/f_check.cmake") + include("${PROJECT_SOURCE_DIR}/cmake/f_check.cmake") endif () # compile getarch set(GETARCH_SRC - ${CMAKE_SOURCE_DIR}/getarch.c + ${PROJECT_SOURCE_DIR}/getarch.c ${CPUIDEMO} ) if (NOT MSVC) - list(APPEND GETARCH_SRC ${CMAKE_SOURCE_DIR}/cpuid.S) + list(APPEND GETARCH_SRC ${PROJECT_SOURCE_DIR}/cpuid.S) endif () if (MSVC) @@ -76,7 +76,7 @@ set(GETARCH_BIN "getarch${CMAKE_EXECUTABLE_SUFFIX}") file(MAKE_DIRECTORY ${GETARCH_DIR}) try_compile(GETARCH_RESULT ${GETARCH_DIR} SOURCES ${GETARCH_SRC} - COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${CMAKE_SOURCE_DIR} + COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${PROJECT_SOURCE_DIR} OUTPUT_VARIABLE GETARCH_LOG COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN} ) @@ -97,8 +97,8 @@ set(GETARCH2_DIR "${PROJECT_BINARY_DIR}/getarch2_build") set(GETARCH2_BIN "getarch_2nd${CMAKE_EXECUTABLE_SUFFIX}") 
file(MAKE_DIRECTORY ${GETARCH2_DIR}) try_compile(GETARCH2_RESULT ${GETARCH2_DIR} - SOURCES ${CMAKE_SOURCE_DIR}/getarch_2nd.c - COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${CMAKE_SOURCE_DIR} + SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c + COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${PROJECT_SOURCE_DIR} OUTPUT_VARIABLE GETARCH2_LOG COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} ) diff --git a/cmake/system.cmake b/cmake/system.cmake index 134e9c12db..aa046a56aa 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -3,7 +3,7 @@ ## Description: Ported from OpenBLAS/Makefile.system ## -set(NETLIB_LAPACK_DIR "${CMAKE_SOURCE_DIR}/lapack-netlib") +set(NETLIB_LAPACK_DIR "${PROJECT_SOURCE_DIR}/lapack-netlib") # TODO: Makefile.system detects Darwin (mac) and switches to clang here -hpa # http://stackoverflow.com/questions/714100/os-detecting-makefile @@ -78,7 +78,7 @@ else () set(ONLY_CBLAS 0) endif () -include("${CMAKE_SOURCE_DIR}/cmake/prebuild.cmake") +include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake") if (NOT DEFINED NUM_THREADS) set(NUM_THREADS ${NUM_CORES}) @@ -124,17 +124,17 @@ set(OBJCOPY "${CROSS_SUFFIX}objcopy") set(OBJCONV "${CROSS_SUFFIX}objconv") # OS dependent settings -include("${CMAKE_SOURCE_DIR}/cmake/os.cmake") +include("${PROJECT_SOURCE_DIR}/cmake/os.cmake") # Architecture dependent settings -include("${CMAKE_SOURCE_DIR}/cmake/arch.cmake") +include("${PROJECT_SOURCE_DIR}/cmake/arch.cmake") # C Compiler dependent settings -include("${CMAKE_SOURCE_DIR}/cmake/cc.cmake") +include("${PROJECT_SOURCE_DIR}/cmake/cc.cmake") if (NOT NOFORTRAN) # Fortran Compiler dependent settings - include("${CMAKE_SOURCE_DIR}/cmake/fc.cmake") + include("${PROJECT_SOURCE_DIR}/cmake/fc.cmake") endif () if (BINARY64) @@ -247,10 +247,10 @@ if (NOT DEFINED SYMBOLSUFFIX) set(SYMBOLSUFFIX "") endif () -set(KERNELDIR "${CMAKE_SOURCE_DIR}/kernel/${ARCH}") +set(KERNELDIR "${PROJECT_SOURCE_DIR}/kernel/${ARCH}") # TODO: nead to convert these Makefiles -# include ${CMAKE_SOURCE_DIR}/cmake/${ARCH}.cmake +# include ${PROJECT_SOURCE_DIR}/cmake/${ARCH}.cmake if (${CORE} STREQUAL "PPC440") set(CCOMMON_OPT "${CCOMMON_OPT} -DALLOC_QALLOC") @@ -410,8 +410,8 @@ set(LIBDEFNAME "${LIBNAME}.${LIBSUFFIX}.def") set(LIBEXPNAME "${LIBNAME}.${LIBSUFFIX}.exp") set(LIBZIPNAME "${LIBNAME}.${LIBSUFFIX}.zip") -set(LIBS "${CMAKE_SOURCE_DIR}/${LIBNAME}") -set(LIBS_P "${CMAKE_SOURCE_DIR}/${LIBNAME_P}") +set(LIBS "${PROJECT_SOURCE_DIR}/${LIBNAME}") +set(LIBS_P "${PROJECT_SOURCE_DIR}/${LIBNAME_P}") set(LIB_COMPONENTS BLAS) diff --git a/common.h b/common.h index e045e42b2a..480174c11a 100644 --- a/common.h +++ b/common.h @@ -332,6 +332,13 @@ typedef int blasint; #endif #endif +#ifdef POWER8 +#ifndef YIELDING +#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); +#endif +#endif + + /* #ifdef PILEDRIVER #ifndef YIELDING @@ -397,6 +404,10 @@ please /~https://github.com/xianyi/OpenBLAS/issues/246 #include "common_sparc.h" #endif +#ifdef ARCH_MIPS +#include "common_mips.h" +#endif + #ifdef ARCH_MIPS64 #include "common_mips64.h" #endif @@ -615,9 +626,14 @@ void gotoblas_profile_init(void); void gotoblas_profile_quit(void); #ifdef USE_OPENMP +#ifndef C_MSVC int omp_in_parallel(void); int omp_get_num_procs(void); #else +__declspec(dllimport) int __cdecl omp_in_parallel(void); +__declspec(dllimport) int __cdecl omp_get_num_procs(void); +#endif +#else #ifdef __ELF__ int omp_in_parallel (void) __attribute__ ((weak)); int omp_get_num_procs(void) __attribute__ ((weak)); diff 
--git a/common_mips.h b/common_mips.h new file mode 100644 index 0000000000..ae126949a2 --- /dev/null +++ b/common_mips.h @@ -0,0 +1,109 @@ +/***************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +#ifndef COMMON_MIPS +#define COMMON_MIPS + +#define MB +#define WMB + +#define INLINE inline + +#define RETURN_BY_COMPLEX + +#ifndef ASSEMBLER + +static void INLINE blas_lock(volatile unsigned long *address){ + +} +#define BLAS_LOCK_DEFINED + +static inline unsigned int rpcc(void){ + unsigned long ret; + + __asm__ __volatile__(".set push \n" + "rdhwr %0, $30 \n" + ".set pop" : "=r"(ret) : : "memory"); + + return ret; +} +#define RPCC_DEFINED + +static inline int blas_quickdivide(blasint x, blasint y){ + return x / y; +} + +#define GET_IMAGE(res) + +#define GET_IMAGE_CANCEL + +#endif + + +#ifndef F_INTERFACE +#define REALNAME ASMNAME +#else +#define REALNAME ASMFNAME +#endif + +#if defined(ASSEMBLER) && !defined(NEEDPARAM) + +#define PROLOGUE \ + .arm ;\ + .global REALNAME ;\ + .func REALNAME ;\ +REALNAME: + +#define EPILOGUE + +#define PROFCODE + +#endif + + +#define SEEK_ADDRESS + +#ifndef PAGESIZE +#define PAGESIZE ( 4 << 10) +#endif +#define HUGE_PAGESIZE ( 4 << 20) + +#define BUFFER_SIZE (16 << 20) + + +#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER) + +#ifndef MAP_ANONYMOUS +#define MAP_ANONYMOUS MAP_ANON +#endif + +#endif diff --git a/common_mips64.h b/common_mips64.h index f5c0ec7cfd..6078bf35b6 100644 --- a/common_mips64.h +++ b/common_mips64.h @@ -102,7 +102,7 @@ static void INLINE blas_lock(volatile unsigned long *address){ static inline unsigned int rpcc(void){ unsigned long ret; -#if defined(LOONGSON3A) || defined(LOONGSON3B) + // unsigned long long tmp; //__asm__ __volatile__("dmfc0 %0, $25, 1": "=r"(tmp):: "memory"); //ret=tmp; @@ -111,17 +111,10 @@ static inline unsigned int rpcc(void){ "rdhwr %0, $2\n" ".set pop": "=r"(ret):: "memory"); -#else - __asm__ 
__volatile__(".set push \n" - ".set mips32r2\n" - "rdhwr %0, $30 \n" - ".set pop" : "=r"(ret) : : "memory"); -#endif return ret; } #define RPCC_DEFINED -#if defined(LOONGSON3A) || defined(LOONGSON3B) #ifndef NO_AFFINITY #define WHEREAMI static inline int WhereAmI(void){ @@ -134,7 +127,6 @@ static inline int WhereAmI(void){ } #endif -#endif static inline int blas_quickdivide(blasint x, blasint y){ return x / y; diff --git a/common_power.h b/common_power.h index 723d949f29..e3a1a7aef4 100644 --- a/common_power.h +++ b/common_power.h @@ -39,8 +39,13 @@ #ifndef COMMON_POWER #define COMMON_POWER +#if defined(POWER8) +#define MB __asm__ __volatile__ ("eieio":::"memory") +#define WMB __asm__ __volatile__ ("eieio":::"memory") +#else #define MB __asm__ __volatile__ ("sync") #define WMB __asm__ __volatile__ ("sync") +#endif #define INLINE inline @@ -798,7 +803,7 @@ Lmcount$lazy_ptr: #elif defined(PPC440FP2) #define BUFFER_SIZE ( 16 << 20) #elif defined(POWER8) -#define BUFFER_SIZE ( 32 << 20) +#define BUFFER_SIZE ( 64 << 20) #else #define BUFFER_SIZE ( 16 << 20) #endif diff --git a/cpuid_mips.c b/cpuid_mips.c index 22beff7fca..15c58959e0 100644 --- a/cpuid_mips.c +++ b/cpuid_mips.c @@ -71,15 +71,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /*********************************************************************/ #define CPU_UNKNOWN 0 -#define CPU_SICORTEX 1 -#define CPU_LOONGSON3A 2 -#define CPU_LOONGSON3B 3 +#define CPU_P5600 1 static char *cpuname[] = { "UNKOWN", - "SICORTEX", - "LOONGSON3A", - "LOONGSON3B" + "P5600" }; int detect(void){ @@ -120,7 +116,7 @@ int detect(void){ if (strstr(p, "loongson3a")) return CPU_LOONGSON3A; }else{ - return CPU_SICORTEX; + return CPU_UNKNOWN; } } //Check model name for Loongson3 @@ -149,64 +145,40 @@ char *get_corename(void){ } void get_architecture(void){ - printf("MIPS64"); + printf("MIPS"); } void get_subarchitecture(void){ - if(detect()==CPU_LOONGSON3A) { - printf("LOONGSON3A"); - }else if(detect()==CPU_LOONGSON3B){ - printf("LOONGSON3B"); + if(detect()==CPU_P5600){ + printf("P5600"); }else{ - printf("SICORTEX"); + printf("UNKNOWN"); } } void get_subdirname(void){ - printf("mips64"); + printf("mips"); } void get_cpuconfig(void){ - if(detect()==CPU_LOONGSON3A) { - printf("#define LOONGSON3A\n"); - printf("#define L1_DATA_SIZE 65536\n"); - printf("#define L1_DATA_LINESIZE 32\n"); - printf("#define L2_SIZE 512488\n"); - printf("#define L2_LINESIZE 32\n"); - printf("#define DTB_DEFAULT_ENTRIES 64\n"); - printf("#define DTB_SIZE 4096\n"); - printf("#define L2_ASSOCIATIVE 4\n"); - }else if(detect()==CPU_LOONGSON3B){ - printf("#define LOONGSON3B\n"); + if(detect()==CPU_P5600){ + printf("#define P5600\n"); printf("#define L1_DATA_SIZE 65536\n"); printf("#define L1_DATA_LINESIZE 32\n"); - printf("#define L2_SIZE 512488\n"); + printf("#define L2_SIZE 1048576\n"); printf("#define L2_LINESIZE 32\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_SIZE 4096\n"); - printf("#define L2_ASSOCIATIVE 4\n"); - }else{ - printf("#define SICORTEX\n"); - printf("#define L1_DATA_SIZE 32768\n"); - printf("#define L1_DATA_LINESIZE 32\n"); - printf("#define L2_SIZE 512488\n"); - printf("#define L2_LINESIZE 32\n"); - printf("#define DTB_DEFAULT_ENTRIES 32\n"); - printf("#define DTB_SIZE 4096\n"); printf("#define L2_ASSOCIATIVE 8\n"); + }else{ + printf("#define UNKNOWN\n"); } } void get_libname(void){ - if(detect()==CPU_LOONGSON3A) { - printf("loongson3a\n"); - }else if(detect()==CPU_LOONGSON3B) { - printf("loongson3b\n"); + 
if(detect()==CPU_P5600) { + printf("p5600\n"); }else{ -#ifdef __mips64 - printf("mips64\n"); -#else - printf("mips32\n"); -#endif + printf("mips\n"); } } diff --git a/cpuid_mips64.c b/cpuid_mips64.c new file mode 100644 index 0000000000..ac1554c797 --- /dev/null +++ b/cpuid_mips64.c @@ -0,0 +1,238 @@ +/***************************************************************************** +Copyright (c) 2011-2014, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define CPU_UNKNOWN 0 +#define CPU_SICORTEX 1 +#define CPU_LOONGSON3A 2 +#define CPU_LOONGSON3B 3 +#define CPU_I6400 4 +#define CPU_P6600 5 + +static char *cpuname[] = { + "UNKOWN", + "SICORTEX", + "LOONGSON3A", + "LOONGSON3B", + "I6400", + "P6600" +}; + +int detect(void){ + +#ifdef linux + FILE *infile; + char buffer[512], *p; + + p = (char *)NULL; + infile = fopen("/proc/cpuinfo", "r"); + while (fgets(buffer, sizeof(buffer), infile)){ + if (!strncmp("cpu", buffer, 3)){ + p = strchr(buffer, ':') + 2; +#if 0 + fprintf(stderr, "%s\n", p); +#endif + break; + } + } + + fclose(infile); + + if(p != NULL){ + if (strstr(p, "Loongson-3A")){ + return CPU_LOONGSON3A; + }else if(strstr(p, "Loongson-3B")){ + return CPU_LOONGSON3B; + }else if (strstr(p, "Loongson-3")){ + infile = fopen("/proc/cpuinfo", "r"); + p = (char *)NULL; + while (fgets(buffer, sizeof(buffer), infile)){ + if (!strncmp("system type", buffer, 11)){ + p = strchr(buffer, ':') + 2; + break; + } + } + fclose(infile); + if (strstr(p, "loongson3a")) + return CPU_LOONGSON3A; + }else{ + return CPU_SICORTEX; + } + } + //Check model name for Loongson3 + infile = fopen("/proc/cpuinfo", "r"); + p = (char *)NULL; + while (fgets(buffer, sizeof(buffer), infile)){ + if (!strncmp("model name", buffer, 10)){ + p = strchr(buffer, ':') + 2; + break; + } + } + fclose(infile); + if(p != NULL){ + if (strstr(p, "Loongson-3A")){ + return CPU_LOONGSON3A; + }else if(strstr(p, "Loongson-3B")){ + return CPU_LOONGSON3B; + } + } +#endif + return CPU_UNKNOWN; +} + +char *get_corename(void){ + return cpuname[detect()]; +} + +void get_architecture(void){ + printf("MIPS64"); +} + +void get_subarchitecture(void){ + if(detect()==CPU_LOONGSON3A) { + printf("LOONGSON3A"); + }else if(detect()==CPU_LOONGSON3B){ + printf("LOONGSON3B"); + }else if(detect()==CPU_I6400){ + printf("I6400"); + }else if(detect()==CPU_P6600){ + printf("P6600"); + }else{ + printf("SICORTEX"); + } +} + +void get_subdirname(void){ + printf("mips64"); +} + +void get_cpuconfig(void){ + if(detect()==CPU_LOONGSON3A) { + printf("#define LOONGSON3A\n"); + printf("#define L1_DATA_SIZE 65536\n"); + printf("#define L1_DATA_LINESIZE 32\n"); + printf("#define L2_SIZE 512488\n"); + printf("#define L2_LINESIZE 32\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 4\n"); + }else if(detect()==CPU_LOONGSON3B){ + printf("#define LOONGSON3B\n"); + printf("#define L1_DATA_SIZE 65536\n"); + printf("#define L1_DATA_LINESIZE 32\n"); + printf("#define L2_SIZE 512488\n"); + printf("#define L2_LINESIZE 32\n"); + printf("#define 
DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 4\n"); + }else if(detect()==CPU_I6400){ + printf("#define I6400\n"); + printf("#define L1_DATA_SIZE 65536\n"); + printf("#define L1_DATA_LINESIZE 32\n"); + printf("#define L2_SIZE 1048576\n"); + printf("#define L2_LINESIZE 32\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 8\n"); + }else if(detect()==CPU_P6600){ + printf("#define P6600\n"); + printf("#define L1_DATA_SIZE 65536\n"); + printf("#define L1_DATA_LINESIZE 32\n"); + printf("#define L2_SIZE 1048576\n"); + printf("#define L2_LINESIZE 32\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 8\n"); + }else{ + printf("#define SICORTEX\n"); + printf("#define L1_DATA_SIZE 32768\n"); + printf("#define L1_DATA_LINESIZE 32\n"); + printf("#define L2_SIZE 512488\n"); + printf("#define L2_LINESIZE 32\n"); + printf("#define DTB_DEFAULT_ENTRIES 32\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 8\n"); + } +} + +void get_libname(void){ + if(detect()==CPU_LOONGSON3A) { + printf("loongson3a\n"); + }else if(detect()==CPU_LOONGSON3B) { + printf("loongson3b\n"); + }else if(detect()==CPU_I6400) { + printf("i6400\n"); + }else if(detect()==CPU_P6600) { + printf("p6600\n"); + }else{ + printf("mips64\n"); + } +} diff --git a/cpuid_x86.c b/cpuid_x86.c index e5938803d2..bbd377f672 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1172,6 +1172,8 @@ int get_cpuname(void){ #endif else return CPUTYPE_NEHALEM; + case 12: + // Braswell case 13: // Avoton return CPUTYPE_NEHALEM; @@ -1678,6 +1680,8 @@ int get_coretype(void){ #endif else return CORE_NEHALEM; + case 12: + // Braswell case 13: // Avoton return CORE_NEHALEM; diff --git a/ctest.c b/ctest.c index b5c74f1377..e0ef46e609 100644 --- a/ctest.c +++ b/ctest.c @@ -110,7 +110,7 @@ ARCH_MIPS64 #endif #if defined(__mips32) || defined(__mips) -ARCH_MIPS32 +ARCH_MIPS #endif #ifdef __alpha diff --git a/ctest/CMakeLists.txt b/ctest/CMakeLists.txt index dbe785bcb6..addcffeace 100644 --- a/ctest/CMakeLists.txt +++ b/ctest/CMakeLists.txt @@ -1,4 +1,4 @@ -include_directories(${CMAKE_SOURCE_DIR}) +include_directories(${PROJECT_SOURCE_DIR}) enable_language(Fortran) diff --git a/ctest/Makefile b/ctest/Makefile index 7a5d236aa8..6eda438635 100644 --- a/ctest/Makefile +++ b/ctest/Makefile @@ -42,6 +42,7 @@ ztestl3o_3m = c_zblas3_3m.o c_z3chke_3m.o auxiliary.o c_xerbla.o constant.o all :: all1 all2 all3 all1: xscblat1 xdcblat1 xccblat1 xzcblat1 +ifndef CROSS ifeq ($(USE_OPENMP), 1) OMP_NUM_THREADS=2 ./xscblat1 OMP_NUM_THREADS=2 ./xdcblat1 @@ -53,8 +54,10 @@ else OPENBLAS_NUM_THREADS=2 ./xccblat1 OPENBLAS_NUM_THREADS=2 ./xzcblat1 endif +endif all2: xscblat2 xdcblat2 xccblat2 xzcblat2 +ifndef CROSS ifeq ($(USE_OPENMP), 1) OMP_NUM_THREADS=2 ./xscblat2 < sin2 OMP_NUM_THREADS=2 ./xdcblat2 < din2 @@ -66,8 +69,10 @@ else OPENBLAS_NUM_THREADS=2 ./xccblat2 < cin2 OPENBLAS_NUM_THREADS=2 ./xzcblat2 < zin2 endif +endif all3: xscblat3 xdcblat3 xccblat3 xzcblat3 +ifndef CROSS ifeq ($(USE_OPENMP), 1) OMP_NUM_THREADS=2 ./xscblat3 < sin3 OMP_NUM_THREADS=2 ./xdcblat3 < din3 @@ -88,6 +93,7 @@ else OPENBLAS_NUM_THREADS=2 ./xccblat3_3m < cin3_3m OPENBLAS_NUM_THREADS=2 ./xzcblat3_3m < zin3_3m endif +endif diff --git a/driver/level2/CMakeLists.txt b/driver/level2/CMakeLists.txt index 696767486b..f444469bd5 100644 --- a/driver/level2/CMakeLists.txt +++ b/driver/level2/CMakeLists.txt @@ -1,5 
+1,5 @@ -include_directories(${CMAKE_SOURCE_DIR}) +include_directories(${PROJECT_SOURCE_DIR}) # sources that need to be compiled twice, once with no flags and once with LOWER set(UL_SOURCES diff --git a/driver/level3/CMakeLists.txt b/driver/level3/CMakeLists.txt index 3d3303af2b..36677a9423 100644 --- a/driver/level3/CMakeLists.txt +++ b/driver/level3/CMakeLists.txt @@ -1,4 +1,4 @@ -include_directories(${CMAKE_SOURCE_DIR}) +include_directories(${PROJECT_SOURCE_DIR}) # N.B. In the original makefile there was a BLOCKS define used in the compilation of these files but I don't see any evidence of it being set anywhere. -hpa diff --git a/driver/others/CMakeLists.txt b/driver/others/CMakeLists.txt index b361f2a978..489d40c76d 100644 --- a/driver/others/CMakeLists.txt +++ b/driver/others/CMakeLists.txt @@ -1,4 +1,4 @@ -include_directories(${CMAKE_SOURCE_DIR}) +include_directories(${PROJECT_SOURCE_DIR}) if (${CORE} STREQUAL "PPC440") set(MEMORY memory_qalloc.c) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 2fde07fcc3..18f85c3168 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -261,8 +261,8 @@ static gotoblas_t *get_coretype(void){ return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } - //Intel Avoton - if (model == 13) { + //Intel Braswell / Avoton + if (model == 12 || model == 13) { openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; } @@ -439,7 +439,7 @@ static gotoblas_t *force_coretype(char *coretype){ char message[128]; //char mname[20]; - for ( i=1 ; i <= 21; i++) + for ( i=1 ; i <= 22; i++) { if (!strncasecmp(coretype,corename[i],20)) { diff --git a/driver/others/init.c b/driver/others/init.c index f134f85f74..801f939911 100644 --- a/driver/others/init.c +++ b/driver/others/init.c @@ -361,6 +361,9 @@ static void numa_mapping(void) { unsigned long work, bit; int count = 0; int bitmask_idx = 0; + int current_cpu; + int current_node = 0; + int cpu_count = 0; for (node = 0; node < common -> num_nodes; node ++) { core = 0; @@ -382,33 +385,84 @@ static void numa_mapping(void) { fprintf(stderr, "CPU (%2d) : %08lx\n", cpu, common -> cpu_info[cpu]); #endif - h = 1; - - while (h < count) h = 2 * h + 1; - - while (h > 1) { - h /= 2; - for (i = h; i < count; i++) { - work = common -> cpu_info[i]; - bit = CPU_ISSET(i, &cpu_orig_mask[0]); - j = i - h; - while (work < common -> cpu_info[j]) { - common -> cpu_info[j + h] = common -> cpu_info[j]; - if (CPU_ISSET(j, &cpu_orig_mask[0])) { - CPU_SET(j + h, &cpu_orig_mask[0]); - } else { - CPU_CLR(j + h, &cpu_orig_mask[0]); - } - j -= h; - if (j < 0) break; - } - common -> cpu_info[j + h] = work; - if (bit) { - CPU_SET(j + h, &cpu_orig_mask[0]); - } else { - CPU_CLR(j + h, &cpu_orig_mask[0]); + current_cpu = sched_getcpu(); + for (cpu = 0; cpu < count; cpu++) { + if (READ_CPU(common -> cpu_info[cpu]) == current_cpu) { + current_node = READ_NODE(common -> cpu_info[cpu]); + break; + } + } + for (i = 0; i < MAX_BITMASK_LEN; i++) + cpu_count += popcount(common -> node_info[current_node][i] & common -> avail[i]); + + /* + * If all the processes can be accommodated in the + * in the current node itself, then bind to cores + * from the current node only + */ + if (numprocs <= cpu_count) { + /* + * First sort all the cores in order from the current node. + * Then take remaining nodes one by one in order, + * and sort their cores in order. 
+ */ + for (i = 0; i < count; i++) { + for (j = 0; j < count - 1; j++) { + int node_1, node_2; + int core_1, core_2; + int swap = 0; + + node_1 = READ_NODE(common -> cpu_info[j]); + node_2 = READ_NODE(common -> cpu_info[j + 1]); + core_1 = READ_CORE(common -> cpu_info[j]); + core_2 = READ_CORE(common -> cpu_info[j + 1]); + + if (node_1 == node_2) { + if (core_1 > core_2) + swap = 1; + } else { + if ((node_2 == current_node) || + ((node_1 != current_node) && (node_1 > node_2))) + swap = 1; + } + if (swap) { + unsigned long temp; + + temp = common->cpu_info[j]; + common->cpu_info[j] = common->cpu_info[j + 1]; + common->cpu_info[j + 1] = temp; + } } + } + } else { + h = 1; + + while (h < count) h = 2 * h + 1; + + while (h > 1) { + h /= 2; + for (i = h; i < count; i++) { + work = common -> cpu_info[i]; + bit = CPU_ISSET(i, &cpu_orig_mask[0]); + j = i - h; + while (work < common -> cpu_info[j]) { + common -> cpu_info[j + h] = common -> cpu_info[j]; + if (CPU_ISSET(j, &cpu_orig_mask[0])) { + CPU_SET(j + h, &cpu_orig_mask[0]); + } else { + CPU_CLR(j + h, &cpu_orig_mask[0]); + } + j -= h; + if (j < 0) break; + } + common -> cpu_info[j + h] = work; + if (bit) { + CPU_SET(j + h, &cpu_orig_mask[0]); + } else { + CPU_CLR(j + h, &cpu_orig_mask[0]); + } + } } } @@ -416,7 +470,10 @@ static void numa_mapping(void) { fprintf(stderr, "\nSorting ...\n\n"); for (cpu = 0; cpu < count; cpu++) - fprintf(stderr, "CPU (%2d) : %08lx\n", cpu, common -> cpu_info[cpu]); + fprintf(stderr, "CPUINFO (%2d) : %08lx (CPU=%3lu CORE=%3lu NODE=%3lu)\n", cpu, common -> cpu_info[cpu], + READ_CPU(common -> cpu_info[cpu]), + READ_CORE(common -> cpu_info[cpu]), + READ_NODE(common -> cpu_info[cpu])); #endif } diff --git a/driver/others/parameter.c b/driver/others/parameter.c index f4b1a80ad3..f22c6b69ae 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -167,7 +167,7 @@ int get_L2_size(void){ #if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \ defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \ - defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) + defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) cpuid(0x80000006, &eax, &ebx, &ecx, &edx); @@ -251,7 +251,7 @@ int get_L2_size(void){ void blas_set_parameter(void){ int factor; -#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) +#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) int size = 16; #else int size = get_L2_size(); diff --git a/exports/Makefile b/exports/Makefile index c2b8d9c1ca..5632b6fff7 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -110,9 +110,9 @@ $(LIBDYNNAME) : ../$(LIBNAME).osx.renamed osx.def endif ifeq ($(NOFORTRAN), $(filter $(NOFORTRAN),1 2)) #only build without Fortran - $(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) + $(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) else - $(FC) $(FFLAGS) -all_load -headerpad_max_install_names -install_name 
$(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) + $(FC) $(FFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) endif dllinit.$(SUFFIX) : dllinit.c diff --git a/f_check b/f_check index 4c9d81e9f3..2f01f1c447 100644 --- a/f_check +++ b/f_check @@ -114,7 +114,7 @@ if ($compiler eq "") { $openmp = "-mp"; } - if ($data =~ /IBM/) { + if ($data =~ /IBM XL/) { $vendor = IBM; $openmp = "-openmp"; } @@ -223,7 +223,12 @@ if (!$?) { } #For gfortran MIPS if ($?) { - $link = `$compiler $openmp -mabi=n32 -v ftest2.f 2>&1 && rm -f a.out a.exe`; + $mips_data = `$compiler_bin -E -dM - < /dev/null`; + if ($mips_data =~ /_MIPS_ISA_MIPS64/) { + $link = `$compiler $openmp -mabi=n32 -v ftest2.f 2>&1 && rm -f a.out a.exe`; + } else { + $link = `$compiler $openmp -mabi=32 -v ftest2.f 2>&1 && rm -f a.out a.exe`; + } } $binary = "" if ($?); } diff --git a/getarch.c b/getarch.c index 1e0b086757..f8069e5078 100644 --- a/getarch.c +++ b/getarch.c @@ -131,6 +131,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* #define FORCE_SICORTEX */ /* #define FORCE_LOONGSON3A */ /* #define FORCE_LOONGSON3B */ +/* #define FORCE_I6400 */ +/* #define FORCE_P6600 */ +/* #define FORCE_P5600 */ /* #define FORCE_ITANIUM2 */ /* #define FORCE_SPARC */ /* #define FORCE_SPARCV7 */ @@ -699,6 +702,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #endif +#ifdef FORCE_I6400 +#define FORCE +#define ARCHITECTURE "MIPS" +#define SUBARCHITECTURE "I6400" +#define SUBDIRNAME "mips64" +#define ARCHCONFIG "-DI6400 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " +#define LIBNAME "i6400" +#define CORENAME "I6400" +#else +#endif + +#ifdef FORCE_P6600 +#define FORCE +#define ARCHITECTURE "MIPS" +#define SUBARCHITECTURE "P6600" +#define SUBDIRNAME "mips64" +#define ARCHCONFIG "-DP6600 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " +#define LIBNAME "p6600" +#define CORENAME "P6600" +#else +#endif + +#ifdef FORCE_P5600 +#define FORCE +#define ARCHITECTURE "MIPS" +#define SUBARCHITECTURE "P5600" +#define SUBDIRNAME "mips" +#define ARCHCONFIG "-DP5600 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " +#define LIBNAME "p5600" +#define CORENAME "P5600" +#else +#endif + #ifdef FORCE_ITANIUM2 #define FORCE #define ARCHITECTURE "IA64" @@ -888,7 +933,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef __mips__ +#ifdef __mips64 +#include "cpuid_mips64.c" +#else #include "cpuid_mips.c" +#endif #define OPENBLAS_SUPPORTED #endif diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index 9ff924e5f1..1722dc6611 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -1,5 +1,5 @@ -include_directories(${CMAKE_SOURCE_DIR}) +include_directories(${PROJECT_SOURCE_DIR}) set(BLAS1_SOURCES diff --git a/interface/lapack/fortran/dlaqr5.f b/interface/lapack/fortran/dlaqr5.f new file mode 100644 index 0000000000..a8fad0a79b --- /dev/null +++ b/interface/lapack/fortran/dlaqr5.f @@ -0,0 +1,1083 @@ +! 
Copyright (c) 2013-2016, The OpenBLAS Project +! All rights reserved. +! Redistribution and use in source and binary forms, with or without +! modification, are permitted provided that the following conditions are +! met: +! 1. Redistributions of source code must retain the above copyright +! notice, this list of conditions and the following disclaimer. +! 2. Redistributions in binary form must reproduce the above copyright +! notice, this list of conditions and the following disclaimer in +! the documentation and/or other materials provided with the +! distribution. +! 3. Neither the name of the OpenBLAS project nor the names of +! its contributors may be used to endorse or promote products +! derived from this software without specific prior written permission. +! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +! AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +! IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +! ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +! LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +! DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +! SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +! OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +! USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*> \brief \b DLAQR5 performs a single small-bulge multi-shift QR sweep. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download DLAQR5 + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE DLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, +* SR, SI, H, LDH, ILOZ, IHIZ, Z, LDZ, V, LDV, U, +* LDU, NV, WV, LDWV, NH, WH, LDWH ) +* +* .. Scalar Arguments .. +* INTEGER IHIZ, ILOZ, KACC22, KBOT, KTOP, LDH, LDU, LDV, +* $ LDWH, LDWV, LDZ, N, NH, NSHFTS, NV +* LOGICAL WANTT, WANTZ +* .. +* .. Array Arguments .. +* DOUBLE PRECISION H( LDH, * ), SI( * ), SR( * ), U( LDU, * ), +* $ V( LDV, * ), WH( LDWH, * ), WV( LDWV, * ), +* $ Z( LDZ, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DLAQR5, called by DLAQR0, performs a +*> single small-bulge multi-shift QR sweep. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] WANTT +*> \verbatim +*> WANTT is logical scalar +*> WANTT = .true. if the quasi-triangular Schur factor +*> is being computed. WANTT is set to .false. otherwise. +*> \endverbatim +*> +*> \param[in] WANTZ +*> \verbatim +*> WANTZ is logical scalar +*> WANTZ = .true. if the orthogonal Schur factor is being +*> computed. WANTZ is set to .false. otherwise. +*> \endverbatim +*> +*> \param[in] KACC22 +*> \verbatim +*> KACC22 is integer with value 0, 1, or 2. +*> Specifies the computation mode of far-from-diagonal +*> orthogonal updates. +*> = 0: DLAQR5 does not accumulate reflections and does not +*> use matrix-matrix multiply to update far-from-diagonal +*> matrix entries. +*> = 1: DLAQR5 accumulates reflections and uses matrix-matrix +*> multiply to update the far-from-diagonal matrix entries. 
+*> = 2: DLAQR5 accumulates reflections, uses matrix-matrix +*> multiply to update the far-from-diagonal matrix entries, +*> and takes advantage of 2-by-2 block structure during +*> matrix multiplies. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is integer scalar +*> N is the order of the Hessenberg matrix H upon which this +*> subroutine operates. +*> \endverbatim +*> +*> \param[in] KTOP +*> \verbatim +*> KTOP is integer scalar +*> \endverbatim +*> +*> \param[in] KBOT +*> \verbatim +*> KBOT is integer scalar +*> These are the first and last rows and columns of an +*> isolated diagonal block upon which the QR sweep is to be +*> applied. It is assumed without a check that +*> either KTOP = 1 or H(KTOP,KTOP-1) = 0 +*> and +*> either KBOT = N or H(KBOT+1,KBOT) = 0. +*> \endverbatim +*> +*> \param[in] NSHFTS +*> \verbatim +*> NSHFTS is integer scalar +*> NSHFTS gives the number of simultaneous shifts. NSHFTS +*> must be positive and even. +*> \endverbatim +*> +*> \param[in,out] SR +*> \verbatim +*> SR is DOUBLE PRECISION array of size (NSHFTS) +*> \endverbatim +*> +*> \param[in,out] SI +*> \verbatim +*> SI is DOUBLE PRECISION array of size (NSHFTS) +*> SR contains the real parts and SI contains the imaginary +*> parts of the NSHFTS shifts of origin that define the +*> multi-shift QR sweep. On output SR and SI may be +*> reordered. +*> \endverbatim +*> +*> \param[in,out] H +*> \verbatim +*> H is DOUBLE PRECISION array of size (LDH,N) +*> On input H contains a Hessenberg matrix. On output a +*> multi-shift QR sweep with shifts SR(J)+i*SI(J) is applied +*> to the isolated diagonal block in rows and columns KTOP +*> through KBOT. +*> \endverbatim +*> +*> \param[in] LDH +*> \verbatim +*> LDH is integer scalar +*> LDH is the leading dimension of H just as declared in the +*> calling procedure. LDH.GE.MAX(1,N). +*> \endverbatim +*> +*> \param[in] ILOZ +*> \verbatim +*> ILOZ is INTEGER +*> \endverbatim +*> +*> \param[in] IHIZ +*> \verbatim +*> IHIZ is INTEGER +*> Specify the rows of Z to which transformations must be +*> applied if WANTZ is .TRUE.. 1 .LE. ILOZ .LE. IHIZ .LE. N +*> \endverbatim +*> +*> \param[in,out] Z +*> \verbatim +*> Z is DOUBLE PRECISION array of size (LDZ,IHI) +*> If WANTZ = .TRUE., then the QR Sweep orthogonal +*> similarity transformation is accumulated into +*> Z(ILOZ:IHIZ,ILO:IHI) from the right. +*> If WANTZ = .FALSE., then Z is unreferenced. +*> \endverbatim +*> +*> \param[in] LDZ +*> \verbatim +*> LDZ is integer scalar +*> LDA is the leading dimension of Z just as declared in +*> the calling procedure. LDZ.GE.N. +*> \endverbatim +*> +*> \param[out] V +*> \verbatim +*> V is DOUBLE PRECISION array of size (LDV,NSHFTS/2) +*> \endverbatim +*> +*> \param[in] LDV +*> \verbatim +*> LDV is integer scalar +*> LDV is the leading dimension of V as declared in the +*> calling procedure. LDV.GE.3. +*> \endverbatim +*> +*> \param[out] U +*> \verbatim +*> U is DOUBLE PRECISION array of size +*> (LDU,3*NSHFTS-3) +*> \endverbatim +*> +*> \param[in] LDU +*> \verbatim +*> LDU is integer scalar +*> LDU is the leading dimension of U just as declared in the +*> in the calling subroutine. LDU.GE.3*NSHFTS-3. +*> \endverbatim +*> +*> \param[in] NH +*> \verbatim +*> NH is integer scalar +*> NH is the number of columns in array WH available for +*> workspace. NH.GE.1. 
+*> \endverbatim +*> +*> \param[out] WH +*> \verbatim +*> WH is DOUBLE PRECISION array of size (LDWH,NH) +*> \endverbatim +*> +*> \param[in] LDWH +*> \verbatim +*> LDWH is integer scalar +*> Leading dimension of WH just as declared in the +*> calling procedure. LDWH.GE.3*NSHFTS-3. +*> \endverbatim +*> +*> \param[in] NV +*> \verbatim +*> NV is integer scalar +*> NV is the number of rows in WV agailable for workspace. +*> NV.GE.1. +*> \endverbatim +*> +*> \param[out] WV +*> \verbatim +*> WV is DOUBLE PRECISION array of size +*> (LDWV,3*NSHFTS-3) +*> \endverbatim +*> +*> \param[in] LDWV +*> \verbatim +*> LDWV is integer scalar +*> LDWV is the leading dimension of WV as declared in the +*> in the calling subroutine. LDWV.GE.NV. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date September 2012 +* +*> \ingroup doubleOTHERauxiliary +* +*> \par Contributors: +* ================== +*> +*> Karen Braman and Ralph Byers, Department of Mathematics, +*> University of Kansas, USA +* +*> \par References: +* ================ +*> +*> K. Braman, R. Byers and R. Mathias, The Multi-Shift QR +*> Algorithm Part I: Maintaining Well Focused Shifts, and Level 3 +*> Performance, SIAM Journal of Matrix Analysis, volume 23, pages +*> 929--947, 2002. +*> +* ===================================================================== + SUBROUTINE DLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, + $ SR, SI, H, LDH, ILOZ, IHIZ, Z, LDZ, V, LDV, U, + $ LDU, NV, WV, LDWV, NH, WH, LDWH ) +* +* -- LAPACK auxiliary routine (version 3.4.2) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* September 2012 +* +* .. Scalar Arguments .. + INTEGER IHIZ, ILOZ, KACC22, KBOT, KTOP, LDH, LDU, LDV, + $ LDWH, LDWV, LDZ, N, NH, NSHFTS, NV + LOGICAL WANTT, WANTZ +* .. +* .. Array Arguments .. + DOUBLE PRECISION H( LDH, * ), SI( * ), SR( * ), U( LDU, * ), + $ V( LDV, * ), WH( LDWH, * ), WV( LDWV, * ), + $ Z( LDZ, * ) +* .. +* +* ================================================================ +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0d0, ONE = 1.0d0 ) +* .. +* .. Local Scalars .. + DOUBLE PRECISION ALPHA, BETA, H11, H12, H21, H22, REFSUM, + $ SAFMAX, SAFMIN, SCL, SMLNUM, SWAP, TST1, TST2, + $ ULP + INTEGER I, I2, I4, INCOL, J, J2, J4, JBOT, JCOL, JLEN, + $ JROW, JTOP, K, K1, KDU, KMS, KNZ, KRCOL, KZS, + $ M, M22, MBOT, MEND, MSTART, MTOP, NBMPS, NDCOL, + $ NS, NU + LOGICAL ACCUM, BLK22, BMP22 +* .. +* .. External Functions .. + DOUBLE PRECISION DLAMCH + EXTERNAL DLAMCH +* .. +* .. Intrinsic Functions .. +* + INTRINSIC ABS, DBLE, MAX, MIN, MOD +* .. +* .. Local Arrays .. + DOUBLE PRECISION VT( 3 ) +* temp scalars + DOUBLE PRECISION tempv1, tempv2, tempv3, + $ tempv4, tempv5, tempv6, + $ temph1, temph2, temph3, + $ temph4, temph5, temph6, + $ tempz1, tempz2, tempz3, + $ tempz4, tempz5, tempz6, + $ tempu1, tempu2, tempu3, + $ tempu4, tempu5, tempu6, + $ REFSU1 + INTEGER JBEGIN, M1 +* .. +* .. External Subroutines .. + EXTERNAL DGEMM, DLABAD, DLACPY, DLAQR1, DLARFG, DLASET, + $ DTRMM +* .. +* .. Executable Statements .. +* +* ==== If there are no shifts, then there is nothing to do. ==== +* + IF( NSHFTS.LT.2 ) + $ RETURN +* +* ==== If the active block is empty or 1-by-1, then there +* . is nothing to do. 
==== +* + IF( KTOP.GE.KBOT ) + $ RETURN +* +* ==== Shuffle shifts into pairs of real shifts and pairs +* . of complex conjugate shifts assuming complex +* . conjugate shifts are already adjacent to one +* . another. ==== +* + DO 10 I = 1, NSHFTS - 2, 2 + IF( SI( I ).NE.-SI( I+1 ) ) THEN +* + SWAP = SR( I ) + SR( I ) = SR( I+1 ) + SR( I+1 ) = SR( I+2 ) + SR( I+2 ) = SWAP +* + SWAP = SI( I ) + SI( I ) = SI( I+1 ) + SI( I+1 ) = SI( I+2 ) + SI( I+2 ) = SWAP + END IF + 10 CONTINUE +* +* ==== NSHFTS is supposed to be even, but if it is odd, +* . then simply reduce it by one. The shuffle above +* . ensures that the dropped shift is real and that +* . the remaining shifts are paired. ==== +* + NS = NSHFTS - MOD( NSHFTS, 2 ) +* +* ==== Machine constants for deflation ==== +* + SAFMIN = DLAMCH( 'SAFE MINIMUM' ) + SAFMAX = ONE / SAFMIN + CALL DLABAD( SAFMIN, SAFMAX ) + ULP = DLAMCH( 'PRECISION' ) + SMLNUM = SAFMIN*( DBLE( N ) / ULP ) +* +* ==== Use accumulated reflections to update far-from-diagonal +* . entries ? ==== +* + ACCUM = ( KACC22.EQ.1 ) .OR. ( KACC22.EQ.2 ) +* +* ==== If so, exploit the 2-by-2 block structure? ==== +* + BLK22 = ( NS.GT.2 ) .AND. ( KACC22.EQ.2 ) +* +* ==== clear trash ==== +* + IF( KTOP+2.LE.KBOT ) + $ H( KTOP+2, KTOP ) = ZERO +* +* ==== NBMPS = number of 2-shift bulges in the chain ==== +* + NBMPS = NS / 2 +* +* ==== KDU = width of slab ==== +* + KDU = 6*NBMPS - 3 +* +* ==== Create and chase chains of NBMPS bulges ==== +* + DO 220 INCOL = 3*( 1-NBMPS ) + KTOP - 1, KBOT - 2, 3*NBMPS - 2 + NDCOL = INCOL + KDU + IF( ACCUM ) + $ CALL DLASET( 'ALL', KDU, KDU, ZERO, ONE, U, LDU ) +* +* ==== Near-the-diagonal bulge chase. The following loop +* . performs the near-the-diagonal part of a small bulge +* . multi-shift QR sweep. Each 6*NBMPS-2 column diagonal +* . chunk extends from column INCOL to column NDCOL +* . (including both column INCOL and column NDCOL). The +* . following loop chases a 3*NBMPS column long chain of +* . NBMPS bulges 3*NBMPS-2 columns to the right. (INCOL +* . may be less than KTOP and and NDCOL may be greater than +* . KBOT indicating phantom columns from which to chase +* . bulges before they are actually introduced or to which +* . to chase bulges beyond column KBOT.) ==== +* + DO 150 KRCOL = INCOL, MIN( INCOL+3*NBMPS-3, KBOT-2 ) +* +* ==== Bulges number MTOP to MBOT are active double implicit +* . shift bulges. There may or may not also be small +* . 2-by-2 bulge, if there is room. The inactive bulges +* . (if any) must wait until the active bulges have moved +* . down the diagonal to make room. The phantom matrix +* . paradigm described above helps keep track. ==== +* + MTOP = MAX( 1, ( ( KTOP-1 )-KRCOL+2 ) / 3+1 ) + MBOT = MIN( NBMPS, ( KBOT-KRCOL ) / 3 ) + M22 = MBOT + 1 + BMP22 = ( MBOT.LT.NBMPS ) .AND. ( KRCOL+3*( M22-1 ) ).EQ. + $ ( KBOT-2 ) +* +* ==== Generate reflections to chase the chain right +* . one column. (The minimum value of K is KTOP-1.) ==== +* + DO 20 M = MTOP, MBOT + K = KRCOL + 3*( M-1 ) + IF( K.EQ.KTOP-1 ) THEN + CALL DLAQR1( 3, H( KTOP, KTOP ), LDH, SR( 2*M-1 ), + $ SI( 2*M-1 ), SR( 2*M ), SI( 2*M ), + $ V( 1, M ) ) + ALPHA = V( 1, M ) + CALL DLARFG( 3, ALPHA, V( 2, M ), 1, V( 1, M ) ) + ELSE + BETA = H( K+1, K ) + V( 2, M ) = H( K+2, K ) + V( 3, M ) = H( K+3, K ) + CALL DLARFG( 3, BETA, V( 2, M ), 1, V( 1, M ) ) +* +* ==== A Bulge may collapse because of vigilant +* . deflation or destructive underflow. In the +* . underflow case, try the two-small-subdiagonals +* . trick to try to reinflate the bulge. 
==== +* + IF( H( K+3, K ).NE.ZERO .OR. H( K+3, K+1 ).NE. + $ ZERO .OR. H( K+3, K+2 ).EQ.ZERO ) THEN +* +* ==== Typical case: not collapsed (yet). ==== +* + H( K+1, K ) = BETA + H( K+2, K ) = ZERO + H( K+3, K ) = ZERO + ELSE +* +* ==== Atypical case: collapsed. Attempt to +* . reintroduce ignoring H(K+1,K) and H(K+2,K). +* . If the fill resulting from the new +* . reflector is too large, then abandon it. +* . Otherwise, use the new one. ==== +* + CALL DLAQR1( 3, H( K+1, K+1 ), LDH, SR( 2*M-1 ), + $ SI( 2*M-1 ), SR( 2*M ), SI( 2*M ), + $ VT ) + ALPHA = VT( 1 ) + CALL DLARFG( 3, ALPHA, VT( 2 ), 1, VT( 1 ) ) + REFSUM = VT( 1 )*( H( K+1, K )+VT( 2 )* + $ H( K+2, K ) ) +* + IF( ABS( H( K+2, K )-REFSUM*VT( 2 ) )+ + $ ABS( REFSUM*VT( 3 ) ).GT.ULP* + $ ( ABS( H( K, K ) )+ABS( H( K+1, + $ K+1 ) )+ABS( H( K+2, K+2 ) ) ) ) THEN +* +* ==== Starting a new bulge here would +* . create non-negligible fill. Use +* . the old one with trepidation. ==== +* + H( K+1, K ) = BETA + H( K+2, K ) = ZERO + H( K+3, K ) = ZERO + ELSE +* +* ==== Stating a new bulge here would +* . create only negligible fill. +* . Replace the old reflector with +* . the new one. ==== +* + H( K+1, K ) = H( K+1, K ) - REFSUM + H( K+2, K ) = ZERO + H( K+3, K ) = ZERO + V( 1, M ) = VT( 1 ) + V( 2, M ) = VT( 2 ) + V( 3, M ) = VT( 3 ) + END IF + END IF + END IF + 20 CONTINUE +* +* ==== Generate a 2-by-2 reflection, if needed. ==== +* + K = KRCOL + 3*( M22-1 ) + IF( BMP22 ) THEN + IF( K.EQ.KTOP-1 ) THEN + CALL DLAQR1( 2, H( K+1, K+1 ), LDH, SR( 2*M22-1 ), + $ SI( 2*M22-1 ), SR( 2*M22 ), SI( 2*M22 ), + $ V( 1, M22 ) ) + BETA = V( 1, M22 ) + CALL DLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) + ELSE + BETA = H( K+1, K ) + V( 2, M22 ) = H( K+2, K ) + CALL DLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) + H( K+1, K ) = BETA + H( K+2, K ) = ZERO + END IF + END IF +* +* ==== Multiply H by reflections from the left ==== +* + IF( ACCUM ) THEN + JBOT = MIN( NDCOL, KBOT ) + ELSE IF( WANTT ) THEN + JBOT = N + ELSE + JBOT = KBOT + END IF + DO 40 J = MAX( KTOP, KRCOL ), JBOT + MEND = MIN( MBOT, ( J-KRCOL+2 ) / 3 ) + + DO 30 M = MTOP, MEND + + M1 = M -1 + + tempv1 = V( 1, M ) + K = KRCOL + 2*M1 + tempv2 = V( 2, M ) + K = K + M1 + tempv3 = V( 3, M ) + temph1 = H( K+1, J ) + temph2 = H( K+2, J ) + temph3 = H( K+3, J ) + + REFSUM = tempv1*( temph1+tempv2* + $ temph2+tempv3*temph3 ) + + + H( K+1, J ) = temph1 - REFSUM + H( K+2, J ) = temph2 - REFSUM*tempv2 + H( K+3, J ) = temph3 - REFSUM*tempv3 + + 30 CONTINUE + + 40 CONTINUE + IF( BMP22 ) THEN + K = KRCOL + 3*( M22-1 ) + DO 50 J = MAX( K+1, KTOP ), JBOT + REFSUM = V( 1, M22 )*( H( K+1, J )+V( 2, M22 )* + $ H( K+2, J ) ) + H( K+1, J ) = H( K+1, J ) - REFSUM + H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) + 50 CONTINUE + END IF +* +* ==== Multiply H by reflections from the right. +* . Delay filling in the last row until the +* . vigilant deflation check is complete. 
==== +* + IF( ACCUM ) THEN + JTOP = MAX( KTOP, INCOL ) + ELSE IF( WANTT ) THEN + JTOP = 1 + ELSE + JTOP = KTOP + END IF + DO 90 M = MTOP, MBOT + IF( V( 1, M ).NE.ZERO ) THEN + tempv1 = V( 1, M ) + tempv2 = V( 2, M ) + tempv3 = V( 3, M ) + K = KRCOL + 3*( M-1 ) + JBEGIN = JTOP + + IF ( MOD( MIN( KBOT, K+3 )-JTOP+1, 2).GT.0 ) THEN + J = JBEGIN + + temph1 = H( J, K+1 ) + temph2 = H( J, K+2 ) + temph3 = H( J, K+3 ) + REFSUM = tempv1* ( temph1+tempv2*temph2+ + $ tempv3*temph3 ) + H( J, K+1 ) = temph1 - REFSUM + H( J, K+2 ) = temph2 - REFSUM*tempv2 + H( J, K+3 ) = temph3 - REFSUM*tempv3 + + JBEGIN = JBEGIN + 1 + + END IF + + + DO 60 J = JBEGIN, MIN( KBOT, K+3 ), 2 + + temph1 = H( J, K+1 ) + temph4 = H( J+1, K+1 ) + temph2 = H( J, K+2 ) + temph5 = H( J+1, K+2 ) + temph3 = H( J, K+3 ) + temph6 = H( J+1, K+3 ) + + REFSUM = tempv1* ( temph1+tempv2*temph2+ + $ tempv3*temph3 ) + + REFSU1 = tempv1* ( temph4+tempv2*temph5+ + $ tempv3*temph6 ) + + H( J, K+1 ) = temph1 - REFSUM + H( J+1, K+1 ) = temph4 - REFSU1 + H( J, K+2 ) = temph2 - REFSUM*tempv2 + H( J+1, K+2 ) = temph5 - REFSU1*tempv2 + H( J, K+3 ) = temph3 - REFSUM*tempv3 + H( J+1, K+3 ) = temph6 - REFSU1*tempv3 + + 60 CONTINUE +* + IF( ACCUM ) THEN +* +* ==== Accumulate U. (If necessary, update Z later +* . with with an efficient matrix-matrix +* . multiply.) ==== +* + KMS = K - INCOL + JBEGIN=MAX( 1, KTOP-INCOL ) + + IF ( MOD(KDU-JBEGIN+1,2).GT.0 ) THEN + J = JBEGIN + tempu1 = U( J, KMS+1 ) + tempu2 = U( J, KMS+2 ) + tempu3 = U( J, KMS+3 ) + REFSUM = tempv1* ( tempu1+tempv2*tempu2+ + $ tempv3*tempu3 ) + U( J, KMS+1 ) = tempu1 - REFSUM + U( J, KMS+2 ) = tempu2 - REFSUM*tempv2 + U( J, KMS+3 ) = tempu3 - REFSUM*tempv3 + JBEGIN = JBEGIN + 1 + + END IF + + + DO 70 J = JBEGIN, KDU , 2 + + tempu1 = U( J, KMS+1 ) + tempu4 = U( J+1, KMS+1 ) + tempu2 = U( J, KMS+2 ) + tempu5 = U( J+1, KMS+2 ) + tempu3 = U( J, KMS+3 ) + tempu6 = U( J+1, KMS+3 ) + REFSUM = tempv1* ( tempu1+tempv2*tempu2+ + $ tempv3*tempu3 ) + + REFSU1 = tempv1* ( tempu4+tempv2*tempu5+ + $ tempv3*tempu6 ) + + U( J, KMS+1 ) = tempu1 - REFSUM + U( J+1, KMS+1 ) = tempu4 - REFSU1 + U( J, KMS+2 ) = tempu2 - REFSUM*tempv2 + U( J+1, KMS+2 ) = tempu5 - REFSU1*tempv2 + U( J, KMS+3 ) = tempu3 - REFSUM*tempv3 + U( J+1, KMS+3 ) = tempu6 - REFSU1*tempv3 + + 70 CONTINUE + + + ELSE IF( WANTZ ) THEN +* +* ==== U is not accumulated, so update Z +* . now by multiplying by reflections +* . from the right. 
==== +* + JBEGIN = ILOZ + + IF ( MOD(IHIZ-ILOZ+1,2).GT.0 ) THEN + J = JBEGIN + + tempz1 = Z( J, K+1 ) + tempz2 = Z( J, K+2 ) + tempz3 = Z( J, K+3 ) + REFSUM = tempv1* ( tempz1+tempv2*tempz2+ + $ tempv3*tempz3 ) + Z( J, K+1 ) = tempz1 - REFSUM + Z( J, K+2 ) = tempz2 - REFSUM*tempv2 + Z( J, K+3 ) = tempz3 - REFSUM*tempv3 + + JBEGIN = JBEGIN + 1 + + END IF + + DO 80 J = JBEGIN, IHIZ, 2 + + tempz1 = Z( J, K+1 ) + tempz4 = Z( J+1, K+1 ) + tempz2 = Z( J, K+2 ) + tempz5 = Z( J+1, K+2 ) + tempz3 = Z( J, K+3 ) + tempz6 = Z( J+1, K+3 ) + + REFSUM = tempv1* ( tempz1+tempv2*tempz2+ + $ tempv3*tempz3 ) + + REFSU1 = tempv1* ( tempz4+tempv2*tempz5+ + $ tempv3*tempz6 ) + + Z( J, K+1 ) = tempz1 - REFSUM + Z( J, K+2 ) = tempz2 - REFSUM*tempv2 + Z( J, K+3 ) = tempz3 - REFSUM*tempv3 + + + Z( J+1, K+1 ) = tempz4 - REFSU1 + Z( J+1, K+2 ) = tempz5 - REFSU1*tempv2 + Z( J+1, K+3 ) = tempz6 - REFSU1*tempv3 + + + 80 CONTINUE + + END IF + END IF + 90 CONTINUE +* +* ==== Special case: 2-by-2 reflection (if needed) ==== +* + K = KRCOL + 3*( M22-1 ) + IF( BMP22 ) THEN + IF ( V( 1, M22 ).NE.ZERO ) THEN + DO 100 J = JTOP, MIN( KBOT, K+3 ) + REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* + $ H( J, K+2 ) ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM + H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M22 ) + 100 CONTINUE +* + IF( ACCUM ) THEN + KMS = K - INCOL + DO 110 J = MAX( 1, KTOP-INCOL ), KDU + REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ + $ V( 2, M22 )*U( J, KMS+2 ) ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM + U( J, KMS+2 ) = U( J, KMS+2 ) - + $ REFSUM*V( 2, M22 ) + 110 CONTINUE + ELSE IF( WANTZ ) THEN + DO 120 J = ILOZ, IHIZ + REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* + $ Z( J, K+2 ) ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM + Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M22 ) + 120 CONTINUE + END IF + END IF + END IF +* +* ==== Vigilant deflation check ==== +* + MSTART = MTOP + IF( KRCOL+3*( MSTART-1 ).LT.KTOP ) + $ MSTART = MSTART + 1 + MEND = MBOT + IF( BMP22 ) + $ MEND = MEND + 1 + IF( KRCOL.EQ.KBOT-2 ) + $ MEND = MEND + 1 + DO 130 M = MSTART, MEND + K = MIN( KBOT-1, KRCOL+3*( M-1 ) ) +* +* ==== The following convergence test requires that +* . the tradition small-compared-to-nearby-diagonals +* . criterion and the Ahues & Tisseur (LAWN 122, 1997) +* . criteria both be satisfied. The latter improves +* . accuracy in some examples. Falling back on an +* . alternate convergence criterion when TST1 or TST2 +* . is zero (as done here) is traditional but probably +* . unnecessary. ==== +* + IF( H( K+1, K ).NE.ZERO ) THEN + TST1 = ABS( H( K, K ) ) + ABS( H( K+1, K+1 ) ) + IF( TST1.EQ.ZERO ) THEN + IF( K.GE.KTOP+1 ) + $ TST1 = TST1 + ABS( H( K, K-1 ) ) + IF( K.GE.KTOP+2 ) + $ TST1 = TST1 + ABS( H( K, K-2 ) ) + IF( K.GE.KTOP+3 ) + $ TST1 = TST1 + ABS( H( K, K-3 ) ) + IF( K.LE.KBOT-2 ) + $ TST1 = TST1 + ABS( H( K+2, K+1 ) ) + IF( K.LE.KBOT-3 ) + $ TST1 = TST1 + ABS( H( K+3, K+1 ) ) + IF( K.LE.KBOT-4 ) + $ TST1 = TST1 + ABS( H( K+4, K+1 ) ) + END IF + IF( ABS( H( K+1, K ) ).LE.MAX( SMLNUM, ULP*TST1 ) ) + $ THEN + H12 = MAX( ABS( H( K+1, K ) ), ABS( H( K, K+1 ) ) ) + H21 = MIN( ABS( H( K+1, K ) ), ABS( H( K, K+1 ) ) ) + H11 = MAX( ABS( H( K+1, K+1 ) ), + $ ABS( H( K, K )-H( K+1, K+1 ) ) ) + H22 = MIN( ABS( H( K+1, K+1 ) ), + $ ABS( H( K, K )-H( K+1, K+1 ) ) ) + SCL = H11 + H12 + TST2 = H22*( H11 / SCL ) +* + IF( TST2.EQ.ZERO .OR. H21*( H12 / SCL ).LE. + $ MAX( SMLNUM, ULP*TST2 ) )H( K+1, K ) = ZERO + END IF + END IF + 130 CONTINUE +* +* ==== Fill in the last row of each bulge. 
==== +* + MEND = MIN( NBMPS, ( KBOT-KRCOL-1 ) / 3 ) + DO 140 M = MTOP, MEND + K = KRCOL + 3*( M-1 ) + REFSUM = V( 1, M )*V( 3, M )*H( K+4, K+3 ) + H( K+4, K+1 ) = -REFSUM + H( K+4, K+2 ) = -REFSUM*V( 2, M ) + H( K+4, K+3 ) = H( K+4, K+3 ) - REFSUM*V( 3, M ) + 140 CONTINUE +* +* ==== End of near-the-diagonal bulge chase. ==== +* + 150 CONTINUE +* +* ==== Use U (if accumulated) to update far-from-diagonal +* . entries in H. If required, use U to update Z as +* . well. ==== +* + IF( ACCUM ) THEN + IF( WANTT ) THEN + JTOP = 1 + JBOT = N + ELSE + JTOP = KTOP + JBOT = KBOT + END IF + IF( ( .NOT.BLK22 ) .OR. ( INCOL.LT.KTOP ) .OR. + $ ( NDCOL.GT.KBOT ) .OR. ( NS.LE.2 ) ) THEN +* +* ==== Updates not exploiting the 2-by-2 block +* . structure of U. K1 and NU keep track of +* . the location and size of U in the special +* . cases of introducing bulges and chasing +* . bulges off the bottom. In these special +* . cases and in case the number of shifts +* . is NS = 2, there is no 2-by-2 block +* . structure to exploit. ==== +* + K1 = MAX( 1, KTOP-INCOL ) + NU = ( KDU-MAX( 0, NDCOL-KBOT ) ) - K1 + 1 +* +* ==== Horizontal Multiply ==== +* + DO 160 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH + JLEN = MIN( NH, JBOT-JCOL+1 ) + CALL DGEMM( 'C', 'N', NU, JLEN, NU, ONE, U( K1, K1 ), + $ LDU, H( INCOL+K1, JCOL ), LDH, ZERO, WH, + $ LDWH ) + CALL DLACPY( 'ALL', NU, JLEN, WH, LDWH, + $ H( INCOL+K1, JCOL ), LDH ) + 160 CONTINUE +* +* ==== Vertical multiply ==== +* + DO 170 JROW = JTOP, MAX( KTOP, INCOL ) - 1, NV + JLEN = MIN( NV, MAX( KTOP, INCOL )-JROW ) + CALL DGEMM( 'N', 'N', JLEN, NU, NU, ONE, + $ H( JROW, INCOL+K1 ), LDH, U( K1, K1 ), + $ LDU, ZERO, WV, LDWV ) + CALL DLACPY( 'ALL', JLEN, NU, WV, LDWV, + $ H( JROW, INCOL+K1 ), LDH ) + 170 CONTINUE +* +* ==== Z multiply (also vertical) ==== +* + IF( WANTZ ) THEN + DO 180 JROW = ILOZ, IHIZ, NV + JLEN = MIN( NV, IHIZ-JROW+1 ) + CALL DGEMM( 'N', 'N', JLEN, NU, NU, ONE, + $ Z( JROW, INCOL+K1 ), LDZ, U( K1, K1 ), + $ LDU, ZERO, WV, LDWV ) + CALL DLACPY( 'ALL', JLEN, NU, WV, LDWV, + $ Z( JROW, INCOL+K1 ), LDZ ) + 180 CONTINUE + END IF + ELSE +* +* ==== Updates exploiting U's 2-by-2 block structure. +* . (I2, I4, J2, J4 are the last rows and columns +* . of the blocks.) ==== +* + I2 = ( KDU+1 ) / 2 + I4 = KDU + J2 = I4 - I2 + J4 = KDU +* +* ==== KZS and KNZ deal with the band of zeros +* . along the diagonal of one of the triangular +* . blocks. ==== +* + KZS = ( J4-J2 ) - ( NS+1 ) + KNZ = NS + 1 +* +* ==== Horizontal multiply ==== +* + DO 190 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH + JLEN = MIN( NH, JBOT-JCOL+1 ) +* +* ==== Copy bottom of H to top+KZS of scratch ==== +* (The first KZS rows get multiplied by zero.) 
==== +* + CALL DLACPY( 'ALL', KNZ, JLEN, H( INCOL+1+J2, JCOL ), + $ LDH, WH( KZS+1, 1 ), LDWH ) +* +* ==== Multiply by U21**T ==== +* + CALL DLASET( 'ALL', KZS, JLEN, ZERO, ZERO, WH, LDWH ) + CALL DTRMM( 'L', 'U', 'C', 'N', KNZ, JLEN, ONE, + $ U( J2+1, 1+KZS ), LDU, WH( KZS+1, 1 ), + $ LDWH ) +* +* ==== Multiply top of H by U11**T ==== +* + CALL DGEMM( 'C', 'N', I2, JLEN, J2, ONE, U, LDU, + $ H( INCOL+1, JCOL ), LDH, ONE, WH, LDWH ) +* +* ==== Copy top of H to bottom of WH ==== +* + CALL DLACPY( 'ALL', J2, JLEN, H( INCOL+1, JCOL ), LDH, + $ WH( I2+1, 1 ), LDWH ) +* +* ==== Multiply by U21**T ==== +* + CALL DTRMM( 'L', 'L', 'C', 'N', J2, JLEN, ONE, + $ U( 1, I2+1 ), LDU, WH( I2+1, 1 ), LDWH ) +* +* ==== Multiply by U22 ==== +* + CALL DGEMM( 'C', 'N', I4-I2, JLEN, J4-J2, ONE, + $ U( J2+1, I2+1 ), LDU, + $ H( INCOL+1+J2, JCOL ), LDH, ONE, + $ WH( I2+1, 1 ), LDWH ) +* +* ==== Copy it back ==== +* + CALL DLACPY( 'ALL', KDU, JLEN, WH, LDWH, + $ H( INCOL+1, JCOL ), LDH ) + 190 CONTINUE +* +* ==== Vertical multiply ==== +* + DO 200 JROW = JTOP, MAX( INCOL, KTOP ) - 1, NV + JLEN = MIN( NV, MAX( INCOL, KTOP )-JROW ) +* +* ==== Copy right of H to scratch (the first KZS +* . columns get multiplied by zero) ==== +* + CALL DLACPY( 'ALL', JLEN, KNZ, H( JROW, INCOL+1+J2 ), + $ LDH, WV( 1, 1+KZS ), LDWV ) +* +* ==== Multiply by U21 ==== +* + CALL DLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, LDWV ) + CALL DTRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE, + $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ), + $ LDWV ) +* +* ==== Multiply by U11 ==== +* + CALL DGEMM( 'N', 'N', JLEN, I2, J2, ONE, + $ H( JROW, INCOL+1 ), LDH, U, LDU, ONE, WV, + $ LDWV ) +* +* ==== Copy left of H to right of scratch ==== +* + CALL DLACPY( 'ALL', JLEN, J2, H( JROW, INCOL+1 ), LDH, + $ WV( 1, 1+I2 ), LDWV ) +* +* ==== Multiply by U21 ==== +* + CALL DTRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE, + $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), LDWV ) +* +* ==== Multiply by U22 ==== +* + CALL DGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE, + $ H( JROW, INCOL+1+J2 ), LDH, + $ U( J2+1, I2+1 ), LDU, ONE, WV( 1, 1+I2 ), + $ LDWV ) +* +* ==== Copy it back ==== +* + CALL DLACPY( 'ALL', JLEN, KDU, WV, LDWV, + $ H( JROW, INCOL+1 ), LDH ) + 200 CONTINUE +* +* ==== Multiply Z (also vertical) ==== +* + IF( WANTZ ) THEN + DO 210 JROW = ILOZ, IHIZ, NV + JLEN = MIN( NV, IHIZ-JROW+1 ) +* +* ==== Copy right of Z to left of scratch (first +* . 
KZS columns get multiplied by zero) ==== +* + CALL DLACPY( 'ALL', JLEN, KNZ, + $ Z( JROW, INCOL+1+J2 ), LDZ, + $ WV( 1, 1+KZS ), LDWV ) +* +* ==== Multiply by U12 ==== +* + CALL DLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, + $ LDWV ) + CALL DTRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE, + $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ), + $ LDWV ) +* +* ==== Multiply by U11 ==== +* + CALL DGEMM( 'N', 'N', JLEN, I2, J2, ONE, + $ Z( JROW, INCOL+1 ), LDZ, U, LDU, ONE, + $ WV, LDWV ) +* +* ==== Copy left of Z to right of scratch ==== +* + CALL DLACPY( 'ALL', JLEN, J2, Z( JROW, INCOL+1 ), + $ LDZ, WV( 1, 1+I2 ), LDWV ) +* +* ==== Multiply by U21 ==== +* + CALL DTRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE, + $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), + $ LDWV ) +* +* ==== Multiply by U22 ==== +* + CALL DGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE, + $ Z( JROW, INCOL+1+J2 ), LDZ, + $ U( J2+1, I2+1 ), LDU, ONE, + $ WV( 1, 1+I2 ), LDWV ) +* +* ==== Copy the result back to Z ==== +* + CALL DLACPY( 'ALL', JLEN, KDU, WV, LDWV, + $ Z( JROW, INCOL+1 ), LDZ ) + 210 CONTINUE + END IF + END IF + END IF + 220 CONTINUE +* +* ==== End of DLAQR5 ==== +* + END diff --git a/interface/swap.c b/interface/swap.c index 23b2e4ec8b..7d47d600bd 100644 --- a/interface/swap.c +++ b/interface/swap.c @@ -42,6 +42,10 @@ #include "functable.h" #endif +// Disable multi-threading as it does not show any performance +// benefits. Keep the multi-threading code for the record. +#undef SMP + #ifndef CBLAS void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ diff --git a/interface/ztrmv.c b/interface/ztrmv.c index 2be915c323..1721afc1ca 100644 --- a/interface/ztrmv.c +++ b/interface/ztrmv.c @@ -243,6 +243,8 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #endif { buffer_size = ((n - 1) / DTB_ENTRIES) * 2 * DTB_ENTRIES + 32 / sizeof(FLOAT); + // It seems to be required for some K8 or Barcelona CPU + buffer_size += 8; if(incx != 1) buffer_size += n * 2; } diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index fc4c4028b2..17c2b1b89c 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -1,6 +1,6 @@ -include_directories(${CMAKE_SOURCE_DIR}) -include("${CMAKE_SOURCE_DIR}/cmake/kernel.cmake") +include_directories(${PROJECT_SOURCE_DIR}) +include("${PROJECT_SOURCE_DIR}/cmake/kernel.cmake") # Makefile diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 8e68274240..e55f153f59 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -12,10 +12,6 @@ ifeq ($(ARCH), ia64) USE_GEMM3M = 1 endif -ifeq ($(ARCH), MIPS) -USE_GEMM3M = 1 -endif - ifeq ($(ARCH), arm) USE_TRMM = 1 endif diff --git a/kernel/arm/scal.c b/kernel/arm/scal.c index 91ca76569b..4ef49e2934 100644 --- a/kernel/arm/scal.c +++ b/kernel/arm/scal.c @@ -40,6 +40,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS { BLASLONG i=0,j=0; + if ( (n <= 0) || (inc_x <= 0)) + return(0); + + while(j < n) { diff --git a/kernel/arm/zscal.c b/kernel/arm/zscal.c index f543edc046..0521aaa0bd 100644 --- a/kernel/arm/zscal.c +++ b/kernel/arm/zscal.c @@ -43,6 +43,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F BLASLONG ip = 0; FLOAT temp; + if ( (n <= 0) || (inc_x <= 0)) + return(0); + + inc_x2 = 2 * inc_x; for ( i=0; i ALPHA0_R +//v11 must save pB0_02_I, pB0_03_I --> ALPHA0_I +//v12 must save pB1_00_R, pB1_01_R +//v13 must save pB1_00_I, pB1_01_I +//v14 must save pB1_02_R, pB1_03_R +//v15 must save pB1_02_I, pB1_03_I //v16 must save pC_00_R, pC_01_R, pC_02_R, pC_03_R //v17 must save pC_00_I, 
pC_01_I, pC_02_I, pC_03_I //v18 pC_04_R, pC_05_R, pC_06_R, pC_07_R @@ -171,8 +173,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_I - ld2 {v8.4s, v9.4s}, [pB] - add pB, pB, #32 + ld2 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 ld2 {v2.4s, v3.4s}, [pA] @@ -189,6 +192,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v17.4s, v1.4s, v8.s[0] + ld2 {v10.2s, v11.2s}, [pB] + add pB, pB, #16 + fmul v18.4s, v2.4s, v8.s[0] OP_ii v18.4s, v3.4s, v9.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -200,6 +206,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v19.4s, v3.4s, v8.s[0] + ld2 {v12.2s, v13.2s}, [pB] + add pB, pB, #16 + fmul v20.4s, v0.4s, v8.s[1] OP_ii v20.4s, v1.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -211,6 +220,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v21.4s, v1.4s, v8.s[1] + ld2 {v14.2s, v15.2s}, [pB] + add pB, pB, #16 + fmul v22.4s, v2.4s, v8.s[1] OP_ii v22.4s, v3.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -222,56 +234,59 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v23.4s, v3.4s, v8.s[1] - fmul v24.4s, v0.4s, v8.s[2] - OP_ii v24.4s, v1.4s, v9.s[2] + ld2 {v4.4s, v5.4s}, [pA] + add pA, pA, #32 + + fmul v24.4s, v0.4s, v10.s[0] + OP_ii v24.4s, v1.4s, v11.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v25.16b, v25.16b, v25.16b - fmls v25.4s, v0.4s, v9.s[2] + fmls v25.4s, v0.4s, v11.s[0] #else - fmul v25.4s, v0.4s, v9.s[2] + fmul v25.4s, v0.4s, v11.s[0] #endif - OP_ir v25.4s, v1.4s, v8.s[2] + OP_ir v25.4s, v1.4s, v10.s[0] - fmul v26.4s, v2.4s, v8.s[2] - OP_ii v26.4s, v3.4s, v9.s[2] + ld2 {v6.4s, v7.4s}, [pA] + add pA, pA, #32 + + fmul v26.4s, v2.4s, v10.s[0] + OP_ii v26.4s, v3.4s, v11.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v27.16b, v27.16b, v27.16b - fmls v27.4s, v2.4s, v9.s[2] + fmls v27.4s, v2.4s, v11.s[0] #else - fmul v27.4s, v2.4s, v9.s[2] + fmul v27.4s, v2.4s, v11.s[0] #endif - OP_ir v27.4s, v3.4s, v8.s[2] + OP_ir v27.4s, v3.4s, v10.s[0] - fmul v28.4s, v0.4s, v8.s[3] - OP_ii v28.4s, v1.4s, v9.s[3] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + + fmul v28.4s, v0.4s, v10.s[1] + OP_ii v28.4s, v1.4s, v11.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v29.16b, v29.16b, v29.16b - fmls v29.4s, v0.4s, v9.s[3] + fmls v29.4s, v0.4s, v11.s[1] #else - fmul v29.4s, v0.4s, v9.s[3] + fmul v29.4s, v0.4s, v11.s[1] #endif - OP_ir v29.4s, v1.4s, v8.s[3] + OP_ir v29.4s, v1.4s, v10.s[1] - fmul v30.4s, v2.4s, v8.s[3] - OP_ii v30.4s, v3.4s, v9.s[3] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + + fmul v30.4s, v2.4s, v10.s[1] + OP_ii v30.4s, v3.4s, v11.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v31.16b, v31.16b, v31.16b - fmls v31.4s, v2.4s, v9.s[3] + fmls v31.4s, v2.4s, v11.s[1] #else - fmul v31.4s, v2.4s, v9.s[3] + fmul v31.4s, v2.4s, v11.s[1] #endif - OP_ir v31.4s, v3.4s, v8.s[3] - - ld2 {v12.4s, v13.4s}, [pB] - add pB, pB, #32 - ld2 {v4.4s, v5.4s}, [pA] - add pA, pA, #32 - ld2 {v6.4s, v7.4s}, [pA] - add pA, pA, 
#32 + OP_ir v31.4s, v3.4s, v10.s[1] .endm .macro KERNEL8x4_M1 @@ -280,47 +295,56 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v17.4s, v0.4s, v9.s[0] OP_ir v17.4s, v1.4s, v8.s[0] + ld2 {v12.2s, v13.2s}, [pB] + add pB, pB, #16 + OP_rr v18.4s, v2.4s, v8.s[0] OP_ii v18.4s, v3.4s, v9.s[0] OP_ri v19.4s, v2.4s, v9.s[0] OP_ir v19.4s, v3.4s, v8.s[0] + ld2 {v4.4s, v5.4s}, [pA] + add pA, pA, #32 + OP_rr v20.4s, v0.4s, v8.s[1] OP_ii v20.4s, v1.4s, v9.s[1] OP_ri v21.4s, v0.4s, v9.s[1] OP_ir v21.4s, v1.4s, v8.s[1] + ld2 {v6.4s, v7.4s}, [pA] + add pA, pA, #32 + OP_rr v22.4s, v2.4s, v8.s[1] OP_ii v22.4s, v3.4s, v9.s[1] OP_ri v23.4s, v2.4s, v9.s[1] OP_ir v23.4s, v3.4s, v8.s[1] - OP_rr v24.4s, v0.4s, v8.s[2] - OP_ii v24.4s, v1.4s, v9.s[2] - OP_ri v25.4s, v0.4s, v9.s[2] - OP_ir v25.4s, v1.4s, v8.s[2] + ld2 {v14.2s, v15.2s}, [pB] + add pB, pB, #16 - OP_rr v26.4s, v2.4s, v8.s[2] - OP_ii v26.4s, v3.4s, v9.s[2] - OP_ri v27.4s, v2.4s, v9.s[2] - OP_ir v27.4s, v3.4s, v8.s[2] + OP_rr v24.4s, v0.4s, v10.s[0] + OP_ii v24.4s, v1.4s, v11.s[0] + OP_ri v25.4s, v0.4s, v11.s[0] + OP_ir v25.4s, v1.4s, v10.s[0] - OP_rr v28.4s, v0.4s, v8.s[3] - OP_ii v28.4s, v1.4s, v9.s[3] - OP_ri v29.4s, v0.4s, v9.s[3] - OP_ir v29.4s, v1.4s, v8.s[3] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] - OP_rr v30.4s, v2.4s, v8.s[3] - OP_ii v30.4s, v3.4s, v9.s[3] - OP_ri v31.4s, v2.4s, v9.s[3] - OP_ir v31.4s, v3.4s, v8.s[3] + OP_rr v26.4s, v2.4s, v10.s[0] + OP_ii v26.4s, v3.4s, v11.s[0] + OP_ri v27.4s, v2.4s, v11.s[0] + OP_ir v27.4s, v3.4s, v10.s[0] - ld2 {v12.4s, v13.4s}, [pB] // For next round - add pB, pB, #32 - ld2 {v4.4s, v5.4s}, [pA] // For next round - add pA, pA, #32 - ld2 {v6.4s, v7.4s}, [pA] - add pA, pA, #32 + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + + OP_rr v28.4s, v0.4s, v10.s[1] + OP_ii v28.4s, v1.4s, v11.s[1] + OP_ri v29.4s, v0.4s, v11.s[1] + OP_ir v29.4s, v1.4s, v10.s[1] + + OP_rr v30.4s, v2.4s, v10.s[1] + OP_ii v30.4s, v3.4s, v11.s[1] + OP_ri v31.4s, v2.4s, v11.s[1] + OP_ir v31.4s, v3.4s, v10.s[1] .endm .macro KERNEL8x4_M2 @@ -329,47 +353,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
OP_ri v17.4s, v4.4s, v13.s[0] OP_ir v17.4s, v5.4s, v12.s[0] + ld2 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + OP_rr v18.4s, v6.4s, v12.s[0] OP_ii v18.4s, v7.4s, v13.s[0] OP_ri v19.4s, v6.4s, v13.s[0] OP_ir v19.4s, v7.4s, v12.s[0] + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + OP_rr v20.4s, v4.4s, v12.s[1] OP_ii v20.4s, v5.4s, v13.s[1] OP_ri v21.4s, v4.4s, v13.s[1] OP_ir v21.4s, v5.4s, v12.s[1] + ld2 {v2.4s, v3.4s}, [pA] + add pA, pA, #32 + OP_rr v22.4s, v6.4s, v12.s[1] OP_ii v22.4s, v7.4s, v13.s[1] OP_ri v23.4s, v6.4s, v13.s[1] OP_ir v23.4s, v7.4s, v12.s[1] - OP_rr v24.4s, v4.4s, v12.s[2] - OP_ii v24.4s, v5.4s, v13.s[2] - OP_ri v25.4s, v4.4s, v13.s[2] - OP_ir v25.4s, v5.4s, v12.s[2] + ld2 {v10.2s, v11.2s}, [pB] + add pB, pB, #16 - OP_rr v26.4s, v6.4s, v12.s[2] - OP_ii v26.4s, v7.4s, v13.s[2] - OP_ri v27.4s, v6.4s, v13.s[2] - OP_ir v27.4s, v7.4s, v12.s[2] + OP_rr v24.4s, v4.4s, v14.s[0] + OP_ii v24.4s, v5.4s, v15.s[0] + OP_ri v25.4s, v4.4s, v15.s[0] + OP_ir v25.4s, v5.4s, v14.s[0] - OP_rr v28.4s, v4.4s, v12.s[3] - OP_ii v28.4s, v5.4s, v13.s[3] - OP_ri v29.4s, v4.4s, v13.s[3] - OP_ir v29.4s, v5.4s, v12.s[3] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - OP_rr v30.4s, v6.4s, v12.s[3] - OP_ii v30.4s, v7.4s, v13.s[3] - OP_ri v31.4s, v6.4s, v13.s[3] - OP_ir v31.4s, v7.4s, v12.s[3] + OP_rr v26.4s, v6.4s, v14.s[0] + OP_ii v26.4s, v7.4s, v15.s[0] + OP_ri v27.4s, v6.4s, v15.s[0] + OP_ir v27.4s, v7.4s, v14.s[0] - ld2 {v8.4s, v9.4s}, [pB] - add pB, pB, #32 - ld2 {v0.4s, v1.4s}, [pA] - add pA, pA, #32 - ld2 {v2.4s, v3.4s}, [pA] - add pA, pA, #32 + OP_rr v28.4s, v4.4s, v14.s[1] + OP_ii v28.4s, v5.4s, v15.s[1] + OP_ri v29.4s, v4.4s, v15.s[1] + OP_ir v29.4s, v5.4s, v14.s[1] + + OP_rr v30.4s, v6.4s, v14.s[1] + OP_ii v30.4s, v7.4s, v15.s[1] + OP_ri v31.4s, v6.4s, v15.s[1] + OP_ir v31.4s, v7.4s, v14.s[1] .endm .macro KERNEL8x4_E @@ -388,157 +419,174 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
OP_ri v21.4s, v4.4s, v13.s[1] OP_ir v21.4s, v5.4s, v12.s[1] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + OP_rr v22.4s, v6.4s, v12.s[1] OP_ii v22.4s, v7.4s, v13.s[1] OP_ri v23.4s, v6.4s, v13.s[1] OP_ir v23.4s, v7.4s, v12.s[1] - OP_rr v24.4s, v4.4s, v12.s[2] - OP_ii v24.4s, v5.4s, v13.s[2] - OP_ri v25.4s, v4.4s, v13.s[2] - OP_ir v25.4s, v5.4s, v12.s[2] - - OP_rr v26.4s, v6.4s, v12.s[2] - OP_ii v26.4s, v7.4s, v13.s[2] - OP_ri v27.4s, v6.4s, v13.s[2] - OP_ir v27.4s, v7.4s, v12.s[2] - - OP_rr v28.4s, v4.4s, v12.s[3] - OP_ii v28.4s, v5.4s, v13.s[3] - OP_ri v29.4s, v4.4s, v13.s[3] - OP_ir v29.4s, v5.4s, v12.s[3] - - OP_rr v30.4s, v6.4s, v12.s[3] - OP_ii v30.4s, v7.4s, v13.s[3] - OP_ri v31.4s, v6.4s, v13.s[3] - OP_ir v31.4s, v7.4s, v12.s[3] - + OP_rr v24.4s, v4.4s, v14.s[0] + OP_ii v24.4s, v5.4s, v15.s[0] + OP_ri v25.4s, v4.4s, v15.s[0] + OP_ir v25.4s, v5.4s, v14.s[0] + + OP_rr v26.4s, v6.4s, v14.s[0] + OP_ii v26.4s, v7.4s, v15.s[0] + OP_ri v27.4s, v6.4s, v15.s[0] + OP_ir v27.4s, v7.4s, v14.s[0] + + OP_rr v28.4s, v4.4s, v14.s[1] + OP_ii v28.4s, v5.4s, v15.s[1] + OP_ri v29.4s, v4.4s, v15.s[1] + OP_ir v29.4s, v5.4s, v14.s[1] + + OP_rr v30.4s, v6.4s, v14.s[1] + OP_ii v30.4s, v7.4s, v15.s[1] + OP_ri v31.4s, v6.4s, v15.s[1] + OP_ir v31.4s, v7.4s, v14.s[1] .endm .macro KERNEL8x4_SUB - ld2 {v8.4s, v9.4s}, [pB] - add pB, pB, #32 + ld2 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 - ld2 {v2.4s, v3.4s}, [pA] - add pA, pA, #32 OP_rr v16.4s, v0.4s, v8.s[0] OP_ii v16.4s, v1.4s, v9.s[0] OP_ri v17.4s, v0.4s, v9.s[0] OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v18.4s, v2.4s, v8.s[0] - OP_ii v18.4s, v3.4s, v9.s[0] - OP_ri v19.4s, v2.4s, v9.s[0] - OP_ir v19.4s, v3.4s, v8.s[0] + ld2 {v2.4s, v3.4s}, [pA] + add pA, pA, #32 OP_rr v20.4s, v0.4s, v8.s[1] OP_ii v20.4s, v1.4s, v9.s[1] OP_ri v21.4s, v0.4s, v9.s[1] OP_ir v21.4s, v1.4s, v8.s[1] + ld2 {v10.2s, v11.2s}, [pB] + add pB, pB, #16 + + OP_rr v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v9.s[0] + OP_ri v19.4s, v2.4s, v9.s[0] + OP_ir v19.4s, v3.4s, v8.s[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + OP_rr v22.4s, v2.4s, v8.s[1] OP_ii v22.4s, v3.4s, v9.s[1] OP_ri v23.4s, v2.4s, v9.s[1] OP_ir v23.4s, v3.4s, v8.s[1] - OP_rr v24.4s, v0.4s, v8.s[2] - OP_ii v24.4s, v1.4s, v9.s[2] - OP_ri v25.4s, v0.4s, v9.s[2] - OP_ir v25.4s, v1.4s, v8.s[2] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] - OP_rr v26.4s, v2.4s, v8.s[2] - OP_ii v26.4s, v3.4s, v9.s[2] - OP_ri v27.4s, v2.4s, v9.s[2] - OP_ir v27.4s, v3.4s, v8.s[2] + OP_rr v24.4s, v0.4s, v10.s[0] + OP_ii v24.4s, v1.4s, v11.s[0] + OP_ri v25.4s, v0.4s, v11.s[0] + OP_ir v25.4s, v1.4s, v10.s[0] - OP_rr v28.4s, v0.4s, v8.s[3] - OP_ii v28.4s, v1.4s, v9.s[3] - OP_ri v29.4s, v0.4s, v9.s[3] - OP_ir v29.4s, v1.4s, v8.s[3] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + OP_rr v26.4s, v2.4s, v10.s[0] + OP_ii v26.4s, v3.4s, v11.s[0] + OP_ri v27.4s, v2.4s, v11.s[0] + OP_ir v27.4s, v3.4s, v10.s[0] - OP_rr v30.4s, v2.4s, v8.s[3] - OP_ii v30.4s, v3.4s, v9.s[3] - OP_ri v31.4s, v2.4s, v9.s[3] - OP_ir v31.4s, v3.4s, v8.s[3] + OP_rr v28.4s, v0.4s, v10.s[1] + OP_ii v28.4s, v1.4s, v11.s[1] + OP_ri v29.4s, v0.4s, v11.s[1] + OP_ir v29.4s, v1.4s, v10.s[1] + OP_rr v30.4s, v2.4s, v10.s[1] + OP_ii v30.4s, v3.4s, v11.s[1] + OP_ri v31.4s, v2.4s, v11.s[1] + OP_ir v31.4s, v3.4s, v10.s[1] .endm .macro SAVE8x4 - mov pCRow1, pCRow0 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI - ld2 {v0.4s, v1.4s}, [pCRow1] + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ld2 {v0.4s, v1.4s}, [pCRow0] fmla v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I - fmla v1.4s, v16.4s, 
alphaV1_I - fmla v1.4s, v17.4s, alphaV1_R - st2 {v0.4s, v1.4s}, [pCRow1] + fmla v1.4s, v16.4s, alphaV0_I + fmla v1.4s, v17.4s, alphaV0_R + st2 {v0.4s, v1.4s}, [pCRow0] - add pCRow2, pCRow1, #32 + add pCRow0, pCRow0, #32 - ld2 {v2.4s, v3.4s}, [pCRow2] + ld2 {v2.4s, v3.4s}, [pCRow0] fmla v2.4s, v18.4s, alphaV0_R fmls v2.4s, v19.4s, alphaV0_I - fmla v3.4s, v18.4s, alphaV1_I - fmla v3.4s, v19.4s, alphaV1_R - st2 {v2.4s, v3.4s}, [pCRow2] + fmla v3.4s, v18.4s, alphaV0_I + fmla v3.4s, v19.4s, alphaV0_R + st2 {v2.4s, v3.4s}, [pCRow0] - add pCRow1, pCRow1, LDC + add pCRow0, pCRow0, #32 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] ld2 {v4.4s, v5.4s}, [pCRow1] fmla v4.4s, v20.4s, alphaV0_R fmls v4.4s, v21.4s, alphaV0_I - fmla v5.4s, v20.4s, alphaV1_I - fmla v5.4s, v21.4s, alphaV1_R + fmla v5.4s, v20.4s, alphaV0_I + fmla v5.4s, v21.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow1] - add pCRow2, pCRow1, #32 + add pCRow1, pCRow1, #32 - ld2 {v6.4s, v7.4s}, [pCRow2] + ld2 {v6.4s, v7.4s}, [pCRow1] fmla v6.4s, v22.4s, alphaV0_R fmls v6.4s, v23.4s, alphaV0_I - fmla v7.4s, v22.4s, alphaV1_I - fmla v7.4s, v23.4s, alphaV1_R - st2 {v6.4s, v7.4s}, [pCRow2] + fmla v7.4s, v22.4s, alphaV0_I + fmla v7.4s, v23.4s, alphaV0_R + st2 {v6.4s, v7.4s}, [pCRow1] - add pCRow1, pCRow1, LDC + add pCRow1, pCRow1, #32 + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] - ld2 {v0.4s, v1.4s}, [pCRow1] + ld2 {v0.4s, v1.4s}, [pCRow2] fmla v0.4s, v24.4s, alphaV0_R fmls v0.4s, v25.4s, alphaV0_I - fmla v1.4s, v24.4s, alphaV1_I - fmla v1.4s, v25.4s, alphaV1_R - st2 {v0.4s, v1.4s}, [pCRow1] + fmla v1.4s, v24.4s, alphaV0_I + fmla v1.4s, v25.4s, alphaV0_R + st2 {v0.4s, v1.4s}, [pCRow2] - add pCRow2, pCRow1, #32 + add pCRow2, pCRow2, #32 ld2 {v2.4s, v3.4s}, [pCRow2] fmla v2.4s, v26.4s, alphaV0_R fmls v2.4s, v27.4s, alphaV0_I - fmla v3.4s, v26.4s, alphaV1_I - fmla v3.4s, v27.4s, alphaV1_R + fmla v3.4s, v26.4s, alphaV0_I + fmla v3.4s, v27.4s, alphaV0_R st2 {v2.4s, v3.4s}, [pCRow2] - add pCRow1, pCRow1, LDC + add pCRow2, pCRow2, #32 + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] - ld2 {v4.4s, v5.4s}, [pCRow1] + ld2 {v4.4s, v5.4s}, [pCRow3] fmla v4.4s, v28.4s, alphaV0_R fmls v4.4s, v29.4s, alphaV0_I - fmla v5.4s, v28.4s, alphaV1_I - fmla v5.4s, v29.4s, alphaV1_R - st2 {v4.4s, v5.4s}, [pCRow1] + fmla v5.4s, v28.4s, alphaV0_I + fmla v5.4s, v29.4s, alphaV0_R + st2 {v4.4s, v5.4s}, [pCRow3] - add pCRow2, pCRow1, #32 + add pCRow3, pCRow3, #32 - ld2 {v6.4s, v7.4s}, [pCRow2] + ld2 {v6.4s, v7.4s}, [pCRow3] fmla v6.4s, v30.4s, alphaV0_R fmls v6.4s, v31.4s, alphaV0_I - fmla v7.4s, v30.4s, alphaV1_I - fmla v7.4s, v31.4s, alphaV1_R - st2 {v6.4s, v7.4s}, [pCRow2] + fmla v7.4s, v30.4s, alphaV0_I + fmla v7.4s, v31.4s, alphaV0_R + st2 {v6.4s, v7.4s}, [pCRow3] - add pCRow0, pCRow0, #64 + add pCRow3, pCRow3, #32 .endm /******************************************************************************/ @@ -720,13 +768,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x4 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 ld2 {v0.4s, v1.4s}, [pCRow1] fmla v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I - fmla v1.4s, v16.4s, alphaV1_I - fmla v1.4s, v17.4s, alphaV1_R + fmla v1.4s, v16.4s, alphaV0_I + fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -734,8 +785,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v4.4s, v5.4s}, [pCRow1] fmla v4.4s, v20.4s, alphaV0_R fmls v4.4s, v21.4s, alphaV0_I - fmla v5.4s, v20.4s, alphaV1_I - fmla v5.4s, v21.4s, alphaV1_R + fmla v5.4s, v20.4s, alphaV0_I + fmla v5.4s, v21.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -743,8 +794,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.4s, v1.4s}, [pCRow1] fmla v0.4s, v24.4s, alphaV0_R fmls v0.4s, v25.4s, alphaV0_I - fmla v1.4s, v24.4s, alphaV1_I - fmla v1.4s, v25.4s, alphaV1_R + fmla v1.4s, v24.4s, alphaV0_I + fmla v1.4s, v25.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -752,8 +803,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.4s, v5.4s}, [pCRow1] fmla v4.4s, v28.4s, alphaV0_R fmls v4.4s, v29.4s, alphaV0_I - fmla v5.4s, v28.4s, alphaV1_I - fmla v5.4s, v29.4s, alphaV1_R + fmla v5.4s, v28.4s, alphaV0_I + fmla v5.4s, v29.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow1] add pCRow0, pCRow0, #32 @@ -800,13 +851,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x4 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 ld2 {v0.2s, v1.2s}, [pCRow1] fmla v0.2s, v16.2s, alphaV0_R fmls v0.2s, v17.2s, alphaV0_I - fmla v1.2s, v16.2s, alphaV1_I - fmla v1.2s, v17.2s, alphaV1_R + fmla v1.2s, v16.2s, alphaV0_I + fmla v1.2s, v17.2s, alphaV0_R st2 {v0.2s, v1.2s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -814,8 +868,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.2s, v5.2s}, [pCRow1] fmla v4.2s, v20.2s, alphaV0_R fmls v4.2s, v21.2s, alphaV0_I - fmla v5.2s, v20.2s, alphaV1_I - fmla v5.2s, v21.2s, alphaV1_R + fmla v5.2s, v20.2s, alphaV0_I + fmla v5.2s, v21.2s, alphaV0_R st2 {v4.2s, v5.2s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -823,8 +877,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.2s, v1.2s}, [pCRow1] fmla v0.2s, v24.2s, alphaV0_R fmls v0.2s, v25.2s, alphaV0_I - fmla v1.2s, v24.2s, alphaV1_I - fmla v1.2s, v25.2s, alphaV1_R + fmla v1.2s, v24.2s, alphaV0_I + fmla v1.2s, v25.2s, alphaV0_R st2 {v0.2s, v1.2s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -832,8 +886,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.2s, v5.2s}, [pCRow1] fmla v4.2s, v28.2s, alphaV0_R fmls v4.2s, v29.2s, alphaV0_I - fmla v5.2s, v28.2s, alphaV1_I - fmla v5.2s, v29.2s, alphaV1_R + fmla v5.2s, v28.2s, alphaV0_I + fmla v5.2s, v29.2s, alphaV0_R st2 {v4.2s, v5.2s}, [pCRow1] add pCRow0, pCRow0, #16 @@ -880,13 +934,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x4 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 ld2 {v0.s, v1.s}[0], [pCRow1] fmla s0, s16, alphaV0_R fmls s0, s17, alphaV0_I - fmla s1, s16, alphaV1_I - fmla s1, s17, alphaV1_R + fmla s1, s16, alphaV0_I + fmla s1, s17, alphaV0_R st2 {v0.s, v1.s}[0], [pCRow1] add pCRow1, pCRow1, LDC @@ -894,8 +951,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.s, v5.s}[0], [pCRow1] fmla s4, s20, alphaV0_R fmls s4, s21, alphaV0_I - fmla s5, s20, alphaV1_I - fmla s5, s21, alphaV1_R + fmla s5, s20, alphaV0_I + fmla s5, s21, alphaV0_R st2 {v4.s, v5.s}[0], [pCRow1] add pCRow1, pCRow1, LDC @@ -903,8 +960,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v0.s, v1.s}[0], [pCRow1] fmla s0, s24, alphaV0_R fmls s0, s25, alphaV0_I - fmla s1, s24, alphaV1_I - fmla s1, s25, alphaV1_R + fmla s1, s24, alphaV0_I + fmla s1, s25, alphaV0_R st2 {v0.s, v1.s}[0], [pCRow1] add pCRow1, pCRow1, LDC @@ -912,8 +969,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.s, v5.s}[0], [pCRow1] fmla s4, s28, alphaV0_R fmls s4, s29, alphaV0_I - fmla s5, s28, alphaV1_I - fmla s5, s29, alphaV1_R + fmla s5, s28, alphaV0_I + fmla s5, s29, alphaV0_R st2 {v4.s, v5.s}[0], [pCRow1] add pCRow0, pCRow0, #8 @@ -962,13 +1019,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE8x2 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 ld2 {v0.4s, v1.4s}, [pCRow1] fmla v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I - fmla v1.4s, v16.4s, alphaV1_I - fmla v1.4s, v17.4s, alphaV1_R + fmla v1.4s, v16.4s, alphaV0_I + fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow2, pCRow1, #32 @@ -976,8 +1036,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v2.4s, v3.4s}, [pCRow2] fmla v2.4s, v18.4s, alphaV0_R fmls v2.4s, v19.4s, alphaV0_I - fmla v3.4s, v18.4s, alphaV1_I - fmla v3.4s, v19.4s, alphaV1_R + fmla v3.4s, v18.4s, alphaV0_I + fmla v3.4s, v19.4s, alphaV0_R st2 {v2.4s, v3.4s}, [pCRow2] add pCRow1, pCRow1, LDC @@ -985,8 +1045,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.4s, v5.4s}, [pCRow1] fmla v4.4s, v20.4s, alphaV0_R fmls v4.4s, v21.4s, alphaV0_I - fmla v5.4s, v20.4s, alphaV1_I - fmla v5.4s, v21.4s, alphaV1_R + fmla v5.4s, v20.4s, alphaV0_I + fmla v5.4s, v21.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow1] add pCRow2, pCRow1, #32 @@ -994,8 +1054,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v6.4s, v7.4s}, [pCRow2] fmla v6.4s, v22.4s, alphaV0_R fmls v6.4s, v23.4s, alphaV0_I - fmla v7.4s, v22.4s, alphaV1_I - fmla v7.4s, v23.4s, alphaV1_R + fmla v7.4s, v22.4s, alphaV0_I + fmla v7.4s, v23.4s, alphaV0_R st2 {v6.4s, v7.4s}, [pCRow2] add pCRow0, pCRow0, #64 @@ -1028,13 +1088,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x2 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 ld2 {v0.4s, v1.4s}, [pCRow1] fmla v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I - fmla v1.4s, v16.4s, alphaV1_I - fmla v1.4s, v17.4s, alphaV1_R + fmla v1.4s, v16.4s, alphaV0_I + fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -1042,8 +1105,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.4s, v5.4s}, [pCRow1] fmla v4.4s, v20.4s, alphaV0_R fmls v4.4s, v21.4s, alphaV0_I - fmla v5.4s, v20.4s, alphaV1_I - fmla v5.4s, v21.4s, alphaV1_R + fmla v5.4s, v20.4s, alphaV0_I + fmla v5.4s, v21.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow1] add pCRow0, pCRow0, #32 @@ -1076,13 +1139,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x2 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 ld2 {v0.2s, v1.2s}, [pCRow1] fmla v0.2s, v16.2s, alphaV0_R fmls v0.2s, v17.2s, alphaV0_I - fmla v1.2s, v16.2s, alphaV1_I - fmla v1.2s, v17.2s, alphaV1_R + fmla v1.2s, v16.2s, alphaV0_I + fmla v1.2s, v17.2s, alphaV0_R st2 {v0.2s, v1.2s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -1090,8 +1156,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v4.2s, v5.2s}, [pCRow1] fmla v4.2s, v20.2s, alphaV0_R fmls v4.2s, v21.2s, alphaV0_I - fmla v5.2s, v20.2s, alphaV1_I - fmla v5.2s, v21.2s, alphaV1_R + fmla v5.2s, v20.2s, alphaV0_I + fmla v5.2s, v21.2s, alphaV0_R st2 {v4.2s, v5.2s}, [pCRow1] add pCRow0, pCRow0, #16 @@ -1124,13 +1190,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x2 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 ld2 {v0.s, v1.s}[0], [pCRow1] fmla s0, s16, alphaV0_R fmls s0, s17, alphaV0_I - fmla s1, s16, alphaV1_I - fmla s1, s17, alphaV1_R + fmla s1, s16, alphaV0_I + fmla s1, s17, alphaV0_R st2 {v0.s, v1.s}[0], [pCRow1] add pCRow1, pCRow1, LDC @@ -1138,8 +1207,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.s, v5.s}[0], [pCRow1] fmla s4, s20, alphaV0_R fmls s4, s21, alphaV0_I - fmla s5, s20, alphaV1_I - fmla s5, s21, alphaV1_R + fmla s5, s20, alphaV0_I + fmla s5, s21, alphaV0_R st2 {v4.s, v5.s}[0], [pCRow1] add pCRow0, pCRow0, #8 @@ -1174,13 +1243,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE8x1 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 ld2 {v0.4s, v1.4s}, [pCRow1] fmla v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I - fmla v1.4s, v16.4s, alphaV1_I - fmla v1.4s, v17.4s, alphaV1_R + fmla v1.4s, v16.4s, alphaV0_I + fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow1, pCRow1, #32 @@ -1188,8 +1260,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v2.4s, v3.4s}, [pCRow1] fmla v2.4s, v18.4s, alphaV0_R fmls v2.4s, v19.4s, alphaV0_I - fmla v3.4s, v18.4s, alphaV1_I - fmla v3.4s, v19.4s, alphaV1_R + fmla v3.4s, v18.4s, alphaV0_I + fmla v3.4s, v19.4s, alphaV0_R st2 {v2.4s, v3.4s}, [pCRow1] add pCRow0, pCRow0, #64 @@ -1216,13 +1288,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x1 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 ld2 {v0.4s, v1.4s}, [pCRow1] fmla v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I - fmla v1.4s, v16.4s, alphaV1_I - fmla v1.4s, v17.4s, alphaV1_R + fmla v1.4s, v16.4s, alphaV0_I + fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow0, pCRow0, #32 @@ -1248,13 +1323,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x1 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 ld2 {v0.2s, v1.2s}, [pCRow1] fmla v0.2s, v16.2s, alphaV0_R fmls v0.2s, v17.2s, alphaV0_I - fmla v1.2s, v16.2s, alphaV1_I - fmla v1.2s, v17.2s, alphaV1_R + fmla v1.2s, v16.2s, alphaV0_I + fmla v1.2s, v17.2s, alphaV0_R st2 {v0.2s, v1.2s}, [pCRow1] add pCRow0, pCRow0, #16 @@ -1281,13 +1359,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x1 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 ld2 {v0.s, v1.s}[0], [pCRow1] fmla s0, s16, alphaV0_R fmls s0, s17, alphaV0_I - fmla s1, s16, alphaV1_I - fmla s1, s17, alphaV1_R + fmla s1, s16, alphaV0_I + fmla s1, s17, alphaV0_R st2 {v0.s, v1.s}[0], [pCRow1] add pCRow0, pCRow0, #8 @@ -1313,10 +1394,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] - fmov alpha0_R, s0 - fmov alpha0_I, s1 - fmov alpha1_R, s0 - fmov alpha1_I, s1 + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alphaR, s0 + fmov alphaI, s1 lsl LDC, LDC, #3 // ldc = ldc * 8 @@ -1330,8 +1412,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /******************************************************************************/ cgemm_kernel_L4_BEGIN: - mov pCRow0, pC // pCRow0 = C - add pC, pC, LDC, lsl #2 + mov pCRow0, pC + add pCRow1, pCRow0, LDC + add pCRow2, pCRow1, LDC + add pCRow3, pCRow2, LDC + + add pC, pCRow3, LDC mov pA, origPA // pA = start of A array @@ -1342,44 +1428,69 @@ cgemm_kernel_L4_M8_BEGIN: cmp counterI, #0 ble cgemm_kernel_L4_M4_BEGIN + .align 5 cgemm_kernel_L4_M8_20: mov pB, origPB - asr counterL , origK, #1 // L = K / 2 - cmp counterL , #2 // is there at least 4 to do? + asr counterL , origK, #3 + cmp counterL , #2 blt cgemm_kernel_L4_M8_32 - KERNEL8x4_I // do one in the K - KERNEL8x4_M2 // do another in the K + KERNEL8x4_I + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 subs counterL, counterL, #2 // subtract 2 ble cgemm_kernel_L4_M8_22a - .align 5 + .align 5 cgemm_kernel_L4_M8_22: KERNEL8x4_M1 KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 subs counterL, counterL, #1 bgt cgemm_kernel_L4_M8_22 - + .align 5 cgemm_kernel_L4_M8_22a: + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_E b cgemm_kernel_L4_M8_44 + .align 5 cgemm_kernel_L4_M8_32: tst counterL, #1 ble cgemm_kernel_L4_M8_40 KERNEL8x4_I - + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 KERNEL8x4_E b cgemm_kernel_L4_M8_44 @@ -1390,14 +1501,21 @@ cgemm_kernel_L4_M8_40: cgemm_kernel_L4_M8_44: - ands counterL , origK, #1 + ands counterL , origK, #7 ble cgemm_kernel_L4_M8_100 + .align 5 cgemm_kernel_L4_M8_46: KERNEL8x4_SUB + subs counterL, counterL, #1 + bne cgemm_kernel_L4_M8_46 + cgemm_kernel_L4_M8_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] SAVE8x4 diff --git a/kernel/arm64/copy.S b/kernel/arm64/copy.S index 17aa5a1e87..70eab96fb6 100644 --- a/kernel/arm64/copy.S +++ b/kernel/arm64/copy.S @@ -58,43 +58,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
str TMPF, [Y], #SZ #else #if !defined(DOUBLE) - ld1 {v0.2s}, [X], #8 - st1 {v0.2s}, [Y], #8 + ldr d0, [X], #8 + str d0, [Y], #8 #else - ld1 {v0.2d}, [X], #16 - st1 {v0.2d}, [Y], #16 + ldr q0, [X], #16 + str q0, [Y], #16 #endif #endif .endm .macro KERNEL_F4 - #if !defined(COMPLEX) #if !defined(DOUBLE) - ld1 {v0.4s}, [X], #16 - st1 {v0.4s}, [Y], #16 + ldr q0, [X], #16 + str q0, [Y], #16 #else // DOUBLE - ld1 {v0.4s}, [X], #16 - ld1 {v1.4s}, [X], #16 - st1 {v0.4s}, [Y], #16 - st1 {v1.4s}, [Y], #16 + ldr q0, [X], #16 + str q0, [Y], #16 + ldr q1, [X], #16 + str q1, [Y], #16 + #endif #else // COMPLEX #if !defined(DOUBLE) - ld1 {v0.4s}, [X], #16 - ld1 {v1.4s}, [X], #16 - st1 {v0.4s}, [Y], #16 - st1 {v1.4s}, [Y], #16 + ldr q0, [X], #16 + str q0, [Y], #16 + ldr q1, [X], #16 + str q1, [Y], #16 #else // DOUBLE - ld1 {v0.4s}, [X], #16 - ld1 {v1.4s}, [X], #16 - ld1 {v2.4s}, [X], #16 - ld1 {v3.4s}, [X], #16 - st1 {v0.4s}, [Y], #16 - st1 {v1.4s}, [Y], #16 - st1 {v2.4s}, [Y], #16 - st1 {v3.4s}, [Y], #16 + ldr q0, [X], #16 + str q0, [Y], #16 + ldr q1, [X], #16 + str q1, [Y], #16 + ldr q2, [X], #16 + str q2, [Y], #16 + ldr q3, [X], #16 + str q3, [Y], #16 #endif #endif diff --git a/kernel/arm64/ctrmm_kernel_8x4.S b/kernel/arm64/ctrmm_kernel_8x4.S index ce5cb04060..680fb56c3c 100644 --- a/kernel/arm64/ctrmm_kernel_8x4.S +++ b/kernel/arm64/ctrmm_kernel_8x4.S @@ -46,20 +46,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define pCRow0 x12 #define pCRow1 x13 #define pCRow2 x14 -#define pA x15 -#define temp x16 -#define tempOffset x17 -#define tempK x18 +#define pCRow3 x15 +#define pA x16 +#define alphaR w17 +#define alphaI w18 +#define temp x19 +#define tempOffset x20 +#define tempK x21 #define alpha0_R s10 #define alphaV0_R v10.s[0] #define alpha0_I s11 #define alphaV0_I v11.s[0] -#define alpha1_R s14 -#define alphaV1_R v14.s[0] -#define alpha1_I s15 -#define alphaV1_I v15.s[0] +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 448 +#define C_PRE_SIZE 128 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define OP_rr fmla @@ -124,14 +126,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //v05 pA1_00_I, pA1_01_I, pA1_02_I, pA1_03_I //v06 pA1_04_R, pA1_05_R, pA1_06_R, pA1_07_R //v07 pA1_04_I, pA1_05_I, pA1_06_I, pA1_07_I -//v08 must save pB0_00_R, pB0_01_R, pB0_02_R, pB0_03_R -//v09 must save pB0_00_I, pB0_01_I, pB0_02_I, pB0_03_I -//v10 must save ALPHA0_R -//v11 must save ALPHA0_I -//v12 must save pB1_00_R, pB1_01_R, pB1_02_R, pB1_03_R -//v13 must save pB1_00_I, pB1_01_I, pB1_02_I, pB1_03_I -//v14 must save ALPHA1_R -//v15 must save ALPHA1_I +//v08 must save pB0_00_R, pB0_01_R +//v09 must save pB0_00_I, pB0_01_I +//v10 must save pB0_02_R, pB0_03_R --> ALPHA0_R +//v11 must save pB0_02_I, pB0_03_I --> ALPHA0_I +//v12 must save pB1_00_R, pB1_01_R +//v13 must save pB1_00_I, pB1_01_I +//v14 must save pB1_02_R, pB1_03_R +//v15 must save pB1_02_I, pB1_03_I //v16 must save pC_00_R, pC_01_R, pC_02_R, pC_03_R //v17 must save pC_00_I, pC_01_I, pC_02_I, pC_03_I //v18 pC_04_R, pC_05_R, pC_06_R, pC_07_R @@ -149,6 +151,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //v30 pC_28_R, pC_29_R, pC_30_R, pC_31_R //v31 pC_28_I, pC_29_I, pC_30_I, pC_31_I + /******************************************************************************* * Macro definitions *******************************************************************************/ @@ -173,8 +176,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL8x4_I - ld2 {v8.4s, v9.4s}, [pB] - add pB, pB, #32 + ld2 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 ld2 {v2.4s, v3.4s}, [pA] @@ -191,6 +195,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v17.4s, v1.4s, v8.s[0] + ld2 {v10.2s, v11.2s}, [pB] + add pB, pB, #16 + fmul v18.4s, v2.4s, v8.s[0] OP_ii v18.4s, v3.4s, v9.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -202,6 +209,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v19.4s, v3.4s, v8.s[0] + ld2 {v12.2s, v13.2s}, [pB] + add pB, pB, #16 + fmul v20.4s, v0.4s, v8.s[1] OP_ii v20.4s, v1.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -213,6 +223,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v21.4s, v1.4s, v8.s[1] + ld2 {v14.2s, v15.2s}, [pB] + add pB, pB, #16 + fmul v22.4s, v2.4s, v8.s[1] OP_ii v22.4s, v3.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -224,56 +237,59 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v23.4s, v3.4s, v8.s[1] - fmul v24.4s, v0.4s, v8.s[2] - OP_ii v24.4s, v1.4s, v9.s[2] + ld2 {v4.4s, v5.4s}, [pA] + add pA, pA, #32 + + fmul v24.4s, v0.4s, v10.s[0] + OP_ii v24.4s, v1.4s, v11.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v25.16b, v25.16b, v25.16b - fmls v25.4s, v0.4s, v9.s[2] + fmls v25.4s, v0.4s, v11.s[0] #else - fmul v25.4s, v0.4s, v9.s[2] + fmul v25.4s, v0.4s, v11.s[0] #endif - OP_ir v25.4s, v1.4s, v8.s[2] + OP_ir v25.4s, v1.4s, v10.s[0] - fmul v26.4s, v2.4s, v8.s[2] - OP_ii v26.4s, v3.4s, v9.s[2] + ld2 {v6.4s, v7.4s}, [pA] + add pA, pA, #32 + + fmul v26.4s, v2.4s, v10.s[0] + OP_ii v26.4s, v3.4s, v11.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v27.16b, v27.16b, v27.16b - fmls v27.4s, v2.4s, v9.s[2] + fmls v27.4s, v2.4s, v11.s[0] #else - fmul v27.4s, v2.4s, v9.s[2] + fmul v27.4s, v2.4s, v11.s[0] #endif - OP_ir v27.4s, v3.4s, v8.s[2] + OP_ir v27.4s, v3.4s, v10.s[0] - fmul v28.4s, v0.4s, v8.s[3] - OP_ii v28.4s, v1.4s, v9.s[3] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + + fmul v28.4s, v0.4s, v10.s[1] + OP_ii v28.4s, v1.4s, v11.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v29.16b, v29.16b, v29.16b - fmls v29.4s, v0.4s, v9.s[3] + fmls v29.4s, v0.4s, v11.s[1] #else - fmul v29.4s, v0.4s, v9.s[3] + fmul v29.4s, v0.4s, v11.s[1] #endif - OP_ir v29.4s, v1.4s, v8.s[3] + OP_ir v29.4s, v1.4s, v10.s[1] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] - fmul v30.4s, v2.4s, v8.s[3] - OP_ii v30.4s, v3.4s, v9.s[3] + fmul v30.4s, v2.4s, v10.s[1] + OP_ii v30.4s, v3.4s, v11.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v31.16b, v31.16b, v31.16b - fmls v31.4s, v2.4s, v9.s[3] + fmls v31.4s, v2.4s, v11.s[1] #else - fmul v31.4s, v2.4s, v9.s[3] + fmul v31.4s, v2.4s, v11.s[1] #endif - OP_ir v31.4s, v3.4s, v8.s[3] - - ld2 {v12.4s, v13.4s}, [pB] - add pB, pB, #32 - ld2 {v4.4s, v5.4s}, [pA] - add pA, pA, #32 - ld2 {v6.4s, v7.4s}, [pA] - add pA, pA, #32 + OP_ir v31.4s, v3.4s, v10.s[1] .endm .macro KERNEL8x4_M1 @@ -282,47 +298,56 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
OP_ri v17.4s, v0.4s, v9.s[0] OP_ir v17.4s, v1.4s, v8.s[0] + ld2 {v12.2s, v13.2s}, [pB] + add pB, pB, #16 + OP_rr v18.4s, v2.4s, v8.s[0] OP_ii v18.4s, v3.4s, v9.s[0] OP_ri v19.4s, v2.4s, v9.s[0] OP_ir v19.4s, v3.4s, v8.s[0] + ld2 {v4.4s, v5.4s}, [pA] + add pA, pA, #32 + OP_rr v20.4s, v0.4s, v8.s[1] OP_ii v20.4s, v1.4s, v9.s[1] OP_ri v21.4s, v0.4s, v9.s[1] OP_ir v21.4s, v1.4s, v8.s[1] + ld2 {v6.4s, v7.4s}, [pA] + add pA, pA, #32 + OP_rr v22.4s, v2.4s, v8.s[1] OP_ii v22.4s, v3.4s, v9.s[1] OP_ri v23.4s, v2.4s, v9.s[1] OP_ir v23.4s, v3.4s, v8.s[1] - OP_rr v24.4s, v0.4s, v8.s[2] - OP_ii v24.4s, v1.4s, v9.s[2] - OP_ri v25.4s, v0.4s, v9.s[2] - OP_ir v25.4s, v1.4s, v8.s[2] + ld2 {v14.2s, v15.2s}, [pB] + add pB, pB, #16 - OP_rr v26.4s, v2.4s, v8.s[2] - OP_ii v26.4s, v3.4s, v9.s[2] - OP_ri v27.4s, v2.4s, v9.s[2] - OP_ir v27.4s, v3.4s, v8.s[2] + OP_rr v24.4s, v0.4s, v10.s[0] + OP_ii v24.4s, v1.4s, v11.s[0] + OP_ri v25.4s, v0.4s, v11.s[0] + OP_ir v25.4s, v1.4s, v10.s[0] - OP_rr v28.4s, v0.4s, v8.s[3] - OP_ii v28.4s, v1.4s, v9.s[3] - OP_ri v29.4s, v0.4s, v9.s[3] - OP_ir v29.4s, v1.4s, v8.s[3] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] - OP_rr v30.4s, v2.4s, v8.s[3] - OP_ii v30.4s, v3.4s, v9.s[3] - OP_ri v31.4s, v2.4s, v9.s[3] - OP_ir v31.4s, v3.4s, v8.s[3] + OP_rr v26.4s, v2.4s, v10.s[0] + OP_ii v26.4s, v3.4s, v11.s[0] + OP_ri v27.4s, v2.4s, v11.s[0] + OP_ir v27.4s, v3.4s, v10.s[0] - ld2 {v12.4s, v13.4s}, [pB] // For next round - add pB, pB, #32 - ld2 {v4.4s, v5.4s}, [pA] // For next round - add pA, pA, #32 - ld2 {v6.4s, v7.4s}, [pA] - add pA, pA, #32 + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + + OP_rr v28.4s, v0.4s, v10.s[1] + OP_ii v28.4s, v1.4s, v11.s[1] + OP_ri v29.4s, v0.4s, v11.s[1] + OP_ir v29.4s, v1.4s, v10.s[1] + + OP_rr v30.4s, v2.4s, v10.s[1] + OP_ii v30.4s, v3.4s, v11.s[1] + OP_ri v31.4s, v2.4s, v11.s[1] + OP_ir v31.4s, v3.4s, v10.s[1] .endm .macro KERNEL8x4_M2 @@ -331,47 +356,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
OP_ri v17.4s, v4.4s, v13.s[0] OP_ir v17.4s, v5.4s, v12.s[0] + ld2 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + OP_rr v18.4s, v6.4s, v12.s[0] OP_ii v18.4s, v7.4s, v13.s[0] OP_ri v19.4s, v6.4s, v13.s[0] OP_ir v19.4s, v7.4s, v12.s[0] + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + OP_rr v20.4s, v4.4s, v12.s[1] OP_ii v20.4s, v5.4s, v13.s[1] OP_ri v21.4s, v4.4s, v13.s[1] OP_ir v21.4s, v5.4s, v12.s[1] + ld2 {v2.4s, v3.4s}, [pA] + add pA, pA, #32 + OP_rr v22.4s, v6.4s, v12.s[1] OP_ii v22.4s, v7.4s, v13.s[1] OP_ri v23.4s, v6.4s, v13.s[1] OP_ir v23.4s, v7.4s, v12.s[1] - OP_rr v24.4s, v4.4s, v12.s[2] - OP_ii v24.4s, v5.4s, v13.s[2] - OP_ri v25.4s, v4.4s, v13.s[2] - OP_ir v25.4s, v5.4s, v12.s[2] + ld2 {v10.2s, v11.2s}, [pB] + add pB, pB, #16 - OP_rr v26.4s, v6.4s, v12.s[2] - OP_ii v26.4s, v7.4s, v13.s[2] - OP_ri v27.4s, v6.4s, v13.s[2] - OP_ir v27.4s, v7.4s, v12.s[2] + OP_rr v24.4s, v4.4s, v14.s[0] + OP_ii v24.4s, v5.4s, v15.s[0] + OP_ri v25.4s, v4.4s, v15.s[0] + OP_ir v25.4s, v5.4s, v14.s[0] - OP_rr v28.4s, v4.4s, v12.s[3] - OP_ii v28.4s, v5.4s, v13.s[3] - OP_ri v29.4s, v4.4s, v13.s[3] - OP_ir v29.4s, v5.4s, v12.s[3] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - OP_rr v30.4s, v6.4s, v12.s[3] - OP_ii v30.4s, v7.4s, v13.s[3] - OP_ri v31.4s, v6.4s, v13.s[3] - OP_ir v31.4s, v7.4s, v12.s[3] + OP_rr v26.4s, v6.4s, v14.s[0] + OP_ii v26.4s, v7.4s, v15.s[0] + OP_ri v27.4s, v6.4s, v15.s[0] + OP_ir v27.4s, v7.4s, v14.s[0] - ld2 {v8.4s, v9.4s}, [pB] - add pB, pB, #32 - ld2 {v0.4s, v1.4s}, [pA] - add pA, pA, #32 - ld2 {v2.4s, v3.4s}, [pA] - add pA, pA, #32 + OP_rr v28.4s, v4.4s, v14.s[1] + OP_ii v28.4s, v5.4s, v15.s[1] + OP_ri v29.4s, v4.4s, v15.s[1] + OP_ir v29.4s, v5.4s, v14.s[1] + + OP_rr v30.4s, v6.4s, v14.s[1] + OP_ii v30.4s, v7.4s, v15.s[1] + OP_ri v31.4s, v6.4s, v15.s[1] + OP_ir v31.4s, v7.4s, v14.s[1] .endm .macro KERNEL8x4_E @@ -390,157 +422,166 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
OP_ri v21.4s, v4.4s, v13.s[1] OP_ir v21.4s, v5.4s, v12.s[1] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + OP_rr v22.4s, v6.4s, v12.s[1] OP_ii v22.4s, v7.4s, v13.s[1] OP_ri v23.4s, v6.4s, v13.s[1] OP_ir v23.4s, v7.4s, v12.s[1] - OP_rr v24.4s, v4.4s, v12.s[2] - OP_ii v24.4s, v5.4s, v13.s[2] - OP_ri v25.4s, v4.4s, v13.s[2] - OP_ir v25.4s, v5.4s, v12.s[2] - - OP_rr v26.4s, v6.4s, v12.s[2] - OP_ii v26.4s, v7.4s, v13.s[2] - OP_ri v27.4s, v6.4s, v13.s[2] - OP_ir v27.4s, v7.4s, v12.s[2] - - OP_rr v28.4s, v4.4s, v12.s[3] - OP_ii v28.4s, v5.4s, v13.s[3] - OP_ri v29.4s, v4.4s, v13.s[3] - OP_ir v29.4s, v5.4s, v12.s[3] - - OP_rr v30.4s, v6.4s, v12.s[3] - OP_ii v30.4s, v7.4s, v13.s[3] - OP_ri v31.4s, v6.4s, v13.s[3] - OP_ir v31.4s, v7.4s, v12.s[3] - + OP_rr v24.4s, v4.4s, v14.s[0] + OP_ii v24.4s, v5.4s, v15.s[0] + OP_ri v25.4s, v4.4s, v15.s[0] + OP_ir v25.4s, v5.4s, v14.s[0] + + OP_rr v26.4s, v6.4s, v14.s[0] + OP_ii v26.4s, v7.4s, v15.s[0] + OP_ri v27.4s, v6.4s, v15.s[0] + OP_ir v27.4s, v7.4s, v14.s[0] + + OP_rr v28.4s, v4.4s, v14.s[1] + OP_ii v28.4s, v5.4s, v15.s[1] + OP_ri v29.4s, v4.4s, v15.s[1] + OP_ir v29.4s, v5.4s, v14.s[1] + + OP_rr v30.4s, v6.4s, v14.s[1] + OP_ii v30.4s, v7.4s, v15.s[1] + OP_ri v31.4s, v6.4s, v15.s[1] + OP_ir v31.4s, v7.4s, v14.s[1] .endm .macro KERNEL8x4_SUB - ld2 {v8.4s, v9.4s}, [pB] - add pB, pB, #32 + ld2 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 - ld2 {v2.4s, v3.4s}, [pA] - add pA, pA, #32 OP_rr v16.4s, v0.4s, v8.s[0] OP_ii v16.4s, v1.4s, v9.s[0] OP_ri v17.4s, v0.4s, v9.s[0] OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v18.4s, v2.4s, v8.s[0] - OP_ii v18.4s, v3.4s, v9.s[0] - OP_ri v19.4s, v2.4s, v9.s[0] - OP_ir v19.4s, v3.4s, v8.s[0] + ld2 {v2.4s, v3.4s}, [pA] + add pA, pA, #32 OP_rr v20.4s, v0.4s, v8.s[1] OP_ii v20.4s, v1.4s, v9.s[1] OP_ri v21.4s, v0.4s, v9.s[1] OP_ir v21.4s, v1.4s, v8.s[1] + ld2 {v10.2s, v11.2s}, [pB] + add pB, pB, #16 + + OP_rr v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v9.s[0] + OP_ri v19.4s, v2.4s, v9.s[0] + OP_ir v19.4s, v3.4s, v8.s[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + OP_rr v22.4s, v2.4s, v8.s[1] OP_ii v22.4s, v3.4s, v9.s[1] OP_ri v23.4s, v2.4s, v9.s[1] OP_ir v23.4s, v3.4s, v8.s[1] - OP_rr v24.4s, v0.4s, v8.s[2] - OP_ii v24.4s, v1.4s, v9.s[2] - OP_ri v25.4s, v0.4s, v9.s[2] - OP_ir v25.4s, v1.4s, v8.s[2] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] - OP_rr v26.4s, v2.4s, v8.s[2] - OP_ii v26.4s, v3.4s, v9.s[2] - OP_ri v27.4s, v2.4s, v9.s[2] - OP_ir v27.4s, v3.4s, v8.s[2] + OP_rr v24.4s, v0.4s, v10.s[0] + OP_ii v24.4s, v1.4s, v11.s[0] + OP_ri v25.4s, v0.4s, v11.s[0] + OP_ir v25.4s, v1.4s, v10.s[0] - OP_rr v28.4s, v0.4s, v8.s[3] - OP_ii v28.4s, v1.4s, v9.s[3] - OP_ri v29.4s, v0.4s, v9.s[3] - OP_ir v29.4s, v1.4s, v8.s[3] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - OP_rr v30.4s, v2.4s, v8.s[3] - OP_ii v30.4s, v3.4s, v9.s[3] - OP_ri v31.4s, v2.4s, v9.s[3] - OP_ir v31.4s, v3.4s, v8.s[3] + OP_rr v26.4s, v2.4s, v10.s[0] + OP_ii v26.4s, v3.4s, v11.s[0] + OP_ri v27.4s, v2.4s, v11.s[0] + OP_ir v27.4s, v3.4s, v10.s[0] + OP_rr v28.4s, v0.4s, v10.s[1] + OP_ii v28.4s, v1.4s, v11.s[1] + OP_ri v29.4s, v0.4s, v11.s[1] + OP_ir v29.4s, v1.4s, v10.s[1] + + OP_rr v30.4s, v2.4s, v10.s[1] + OP_ii v30.4s, v3.4s, v11.s[1] + OP_ri v31.4s, v2.4s, v11.s[1] + OP_ir v31.4s, v3.4s, v10.s[1] .endm .macro SAVE8x4 - mov pCRow1, pCRow0 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] fmul v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I - fmul v1.4s, v16.4s, alphaV1_I - fmla v1.4s, v17.4s, alphaV1_R - st2 {v0.4s, v1.4s}, 
[pCRow1] - - add pCRow2, pCRow1, #32 + fmul v1.4s, v16.4s, alphaV0_I + fmla v1.4s, v17.4s, alphaV0_R + st2 {v0.4s, v1.4s}, [pCRow0] + add pCRow0, pCRow0, #32 fmul v2.4s, v18.4s, alphaV0_R fmls v2.4s, v19.4s, alphaV0_I - fmul v3.4s, v18.4s, alphaV1_I - fmla v3.4s, v19.4s, alphaV1_R - st2 {v2.4s, v3.4s}, [pCRow2] - - add pCRow1, pCRow1, LDC + fmul v3.4s, v18.4s, alphaV0_I + fmla v3.4s, v19.4s, alphaV0_R + st2 {v2.4s, v3.4s}, [pCRow0] + add pCRow0, pCRow0, #32 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] fmul v4.4s, v20.4s, alphaV0_R fmls v4.4s, v21.4s, alphaV0_I - fmul v5.4s, v20.4s, alphaV1_I - fmla v5.4s, v21.4s, alphaV1_R + fmul v5.4s, v20.4s, alphaV0_I + fmla v5.4s, v21.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow1] - add pCRow2, pCRow1, #32 - + add pCRow1, pCRow1, #32 fmul v6.4s, v22.4s, alphaV0_R fmls v6.4s, v23.4s, alphaV0_I - fmul v7.4s, v22.4s, alphaV1_I - fmla v7.4s, v23.4s, alphaV1_R - st2 {v6.4s, v7.4s}, [pCRow2] - - add pCRow1, pCRow1, LDC + fmul v7.4s, v22.4s, alphaV0_I + fmla v7.4s, v23.4s, alphaV0_R + st2 {v6.4s, v7.4s}, [pCRow1] + add pCRow1, pCRow1, #32 + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] fmul v0.4s, v24.4s, alphaV0_R fmls v0.4s, v25.4s, alphaV0_I - fmul v1.4s, v24.4s, alphaV1_I - fmla v1.4s, v25.4s, alphaV1_R - st2 {v0.4s, v1.4s}, [pCRow1] - - add pCRow2, pCRow1, #32 + fmul v1.4s, v24.4s, alphaV0_I + fmla v1.4s, v25.4s, alphaV0_R + st2 {v0.4s, v1.4s}, [pCRow2] + add pCRow2, pCRow2, #32 fmul v2.4s, v26.4s, alphaV0_R fmls v2.4s, v27.4s, alphaV0_I - fmul v3.4s, v26.4s, alphaV1_I - fmla v3.4s, v27.4s, alphaV1_R + fmul v3.4s, v26.4s, alphaV0_I + fmla v3.4s, v27.4s, alphaV0_R st2 {v2.4s, v3.4s}, [pCRow2] - add pCRow1, pCRow1, LDC - + add pCRow2, pCRow2, #32 + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] fmul v4.4s, v28.4s, alphaV0_R fmls v4.4s, v29.4s, alphaV0_I - fmul v5.4s, v28.4s, alphaV1_I - fmla v5.4s, v29.4s, alphaV1_R - st2 {v4.4s, v5.4s}, [pCRow1] - - add pCRow2, pCRow1, #32 + fmul v5.4s, v28.4s, alphaV0_I + fmla v5.4s, v29.4s, alphaV0_R + st2 {v4.4s, v5.4s}, [pCRow3] + add pCRow3, pCRow3, #32 fmul v6.4s, v30.4s, alphaV0_R fmls v6.4s, v31.4s, alphaV0_I - fmul v7.4s, v30.4s, alphaV1_I - fmla v7.4s, v31.4s, alphaV1_R - st2 {v6.4s, v7.4s}, [pCRow2] + fmul v7.4s, v30.4s, alphaV0_I + fmla v7.4s, v31.4s, alphaV0_R + st2 {v6.4s, v7.4s}, [pCRow3] - add pCRow0, pCRow0, #64 + add pCRow3, pCRow3, #32 .endm /******************************************************************************/ @@ -722,13 +763,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x4 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 fmul v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I - fmul v1.4s, v16.4s, alphaV1_I - fmla v1.4s, v17.4s, alphaV1_R + fmul v1.4s, v16.4s, alphaV0_I + fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -736,8 +780,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul v4.4s, v20.4s, alphaV0_R fmls v4.4s, v21.4s, alphaV0_I - fmul v5.4s, v20.4s, alphaV1_I - fmla v5.4s, v21.4s, alphaV1_R + fmul v5.4s, v20.4s, alphaV0_I + fmla v5.4s, v21.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -745,8 +789,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
fmul v0.4s, v24.4s, alphaV0_R fmls v0.4s, v25.4s, alphaV0_I - fmul v1.4s, v24.4s, alphaV1_I - fmla v1.4s, v25.4s, alphaV1_R + fmul v1.4s, v24.4s, alphaV0_I + fmla v1.4s, v25.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -754,8 +798,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul v4.4s, v28.4s, alphaV0_R fmls v4.4s, v29.4s, alphaV0_I - fmul v5.4s, v28.4s, alphaV1_I - fmla v5.4s, v29.4s, alphaV1_R + fmul v5.4s, v28.4s, alphaV0_I + fmla v5.4s, v29.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow1] add pCRow0, pCRow0, #32 @@ -802,13 +846,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x4 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 fmul v0.2s, v16.2s, alphaV0_R fmls v0.2s, v17.2s, alphaV0_I - fmul v1.2s, v16.2s, alphaV1_I - fmla v1.2s, v17.2s, alphaV1_R + fmul v1.2s, v16.2s, alphaV0_I + fmla v1.2s, v17.2s, alphaV0_R st2 {v0.2s, v1.2s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -816,8 +863,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul v4.2s, v20.2s, alphaV0_R fmls v4.2s, v21.2s, alphaV0_I - fmul v5.2s, v20.2s, alphaV1_I - fmla v5.2s, v21.2s, alphaV1_R + fmul v5.2s, v20.2s, alphaV0_I + fmla v5.2s, v21.2s, alphaV0_R st2 {v4.2s, v5.2s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -825,8 +872,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul v0.2s, v24.2s, alphaV0_R fmls v0.2s, v25.2s, alphaV0_I - fmul v1.2s, v24.2s, alphaV1_I - fmla v1.2s, v25.2s, alphaV1_R + fmul v1.2s, v24.2s, alphaV0_I + fmla v1.2s, v25.2s, alphaV0_R st2 {v0.2s, v1.2s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -834,8 +881,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul v4.2s, v28.2s, alphaV0_R fmls v4.2s, v29.2s, alphaV0_I - fmul v5.2s, v28.2s, alphaV1_I - fmla v5.2s, v29.2s, alphaV1_R + fmul v5.2s, v28.2s, alphaV0_I + fmla v5.2s, v29.2s, alphaV0_R st2 {v4.2s, v5.2s}, [pCRow1] add pCRow0, pCRow0, #16 @@ -882,13 +929,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x4 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 fmul s0, s16, alphaV0_R fmls s0, s17, alphaV0_I - fmul s1, s16, alphaV1_I - fmla s1, s17, alphaV1_R + fmul s1, s16, alphaV0_I + fmla s1, s17, alphaV0_R st2 {v0.s, v1.s}[0], [pCRow1] add pCRow1, pCRow1, LDC @@ -896,8 +946,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul s4, s20, alphaV0_R fmls s4, s21, alphaV0_I - fmul s5, s20, alphaV1_I - fmla s5, s21, alphaV1_R + fmul s5, s20, alphaV0_I + fmla s5, s21, alphaV0_R st2 {v4.s, v5.s}[0], [pCRow1] add pCRow1, pCRow1, LDC @@ -905,8 +955,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul s0, s24, alphaV0_R fmls s0, s25, alphaV0_I - fmul s1, s24, alphaV1_I - fmla s1, s25, alphaV1_R + fmul s1, s24, alphaV0_I + fmla s1, s25, alphaV0_R st2 {v0.s, v1.s}[0], [pCRow1] add pCRow1, pCRow1, LDC @@ -914,8 +964,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul s4, s28, alphaV0_R fmls s4, s29, alphaV0_I - fmul s5, s28, alphaV1_I - fmla s5, s29, alphaV1_R + fmul s5, s28, alphaV0_I + fmla s5, s29, alphaV0_R st2 {v4.s, v5.s}[0], [pCRow1] add pCRow0, pCRow0, #8 @@ -964,13 +1014,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro SAVE8x2 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 fmul v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I - fmul v1.4s, v16.4s, alphaV1_I - fmla v1.4s, v17.4s, alphaV1_R + fmul v1.4s, v16.4s, alphaV0_I + fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow2, pCRow1, #32 @@ -978,8 +1031,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul v2.4s, v18.4s, alphaV0_R fmls v2.4s, v19.4s, alphaV0_I - fmul v3.4s, v18.4s, alphaV1_I - fmla v3.4s, v19.4s, alphaV1_R + fmul v3.4s, v18.4s, alphaV0_I + fmla v3.4s, v19.4s, alphaV0_R st2 {v2.4s, v3.4s}, [pCRow2] add pCRow1, pCRow1, LDC @@ -987,8 +1040,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul v4.4s, v20.4s, alphaV0_R fmls v4.4s, v21.4s, alphaV0_I - fmul v5.4s, v20.4s, alphaV1_I - fmla v5.4s, v21.4s, alphaV1_R + fmul v5.4s, v20.4s, alphaV0_I + fmla v5.4s, v21.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow1] add pCRow2, pCRow1, #32 @@ -996,8 +1049,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul v6.4s, v22.4s, alphaV0_R fmls v6.4s, v23.4s, alphaV0_I - fmul v7.4s, v22.4s, alphaV1_I - fmla v7.4s, v23.4s, alphaV1_R + fmul v7.4s, v22.4s, alphaV0_I + fmla v7.4s, v23.4s, alphaV0_R st2 {v6.4s, v7.4s}, [pCRow2] add pCRow0, pCRow0, #64 @@ -1030,13 +1083,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x2 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 fmul v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I - fmul v1.4s, v16.4s, alphaV1_I - fmla v1.4s, v17.4s, alphaV1_R + fmul v1.4s, v16.4s, alphaV0_I + fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -1044,8 +1100,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul v4.4s, v20.4s, alphaV0_R fmls v4.4s, v21.4s, alphaV0_I - fmul v5.4s, v20.4s, alphaV1_I - fmla v5.4s, v21.4s, alphaV1_R + fmul v5.4s, v20.4s, alphaV0_I + fmla v5.4s, v21.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow1] add pCRow0, pCRow0, #32 @@ -1078,13 +1134,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x2 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 fmul v0.2s, v16.2s, alphaV0_R fmls v0.2s, v17.2s, alphaV0_I - fmul v1.2s, v16.2s, alphaV1_I - fmla v1.2s, v17.2s, alphaV1_R + fmul v1.2s, v16.2s, alphaV0_I + fmla v1.2s, v17.2s, alphaV0_R st2 {v0.2s, v1.2s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -1092,8 +1151,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul v4.2s, v20.2s, alphaV0_R fmls v4.2s, v21.2s, alphaV0_I - fmul v5.2s, v20.2s, alphaV1_I - fmla v5.2s, v21.2s, alphaV1_R + fmul v5.2s, v20.2s, alphaV0_I + fmla v5.2s, v21.2s, alphaV0_R st2 {v4.2s, v5.2s}, [pCRow1] add pCRow0, pCRow0, #16 @@ -1126,13 +1185,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x2 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 fmul s0, s16, alphaV0_R fmls s0, s17, alphaV0_I - fmul s1, s16, alphaV1_I - fmla s1, s17, alphaV1_R + fmul s1, s16, alphaV0_I + fmla s1, s17, alphaV0_R st2 {v0.s, v1.s}[0], [pCRow1] add pCRow1, pCRow1, LDC @@ -1140,8 +1202,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
fmul s4, s20, alphaV0_R fmls s4, s21, alphaV0_I - fmul s5, s20, alphaV1_I - fmla s5, s21, alphaV1_R + fmul s5, s20, alphaV0_I + fmla s5, s21, alphaV0_R st2 {v4.s, v5.s}[0], [pCRow1] add pCRow0, pCRow0, #8 @@ -1176,13 +1238,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE8x1 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 fmul v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I - fmul v1.4s, v16.4s, alphaV1_I - fmla v1.4s, v17.4s, alphaV1_R + fmul v1.4s, v16.4s, alphaV0_I + fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow1, pCRow1, #32 @@ -1190,8 +1255,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul v2.4s, v18.4s, alphaV0_R fmls v2.4s, v19.4s, alphaV0_I - fmul v3.4s, v18.4s, alphaV1_I - fmla v3.4s, v19.4s, alphaV1_R + fmul v3.4s, v18.4s, alphaV0_I + fmla v3.4s, v19.4s, alphaV0_R st2 {v2.4s, v3.4s}, [pCRow1] add pCRow0, pCRow0, #64 @@ -1218,13 +1283,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x1 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 fmul v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I - fmul v1.4s, v16.4s, alphaV1_I - fmla v1.4s, v17.4s, alphaV1_R + fmul v1.4s, v16.4s, alphaV0_I + fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow0, pCRow0, #32 @@ -1250,13 +1318,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x1 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 fmul v0.2s, v16.2s, alphaV0_R fmls v0.2s, v17.2s, alphaV0_I - fmul v1.2s, v16.2s, alphaV1_I - fmla v1.2s, v17.2s, alphaV1_R + fmul v1.2s, v16.2s, alphaV0_I + fmla v1.2s, v17.2s, alphaV0_R st2 {v0.2s, v1.2s}, [pCRow1] add pCRow0, pCRow0, #16 @@ -1283,13 +1354,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x1 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 fmul s0, s16, alphaV0_R fmls s0, s17, alphaV0_I - fmul s1, s16, alphaV1_I - fmla s1, s17, alphaV1_R + fmul s1, s16, alphaV0_I + fmla s1, s17, alphaV0_R st2 {v0.s, v1.s}[0], [pCRow1] add pCRow0, pCRow0, #8 @@ -1315,10 +1389,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] - fmov alpha0_R, s0 - fmov alpha0_I, s1 - fmov alpha1_R, s0 - fmov alpha1_I, s1 + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alphaR, s0 + fmov alphaI, s1 lsl LDC, LDC, #3 // ldc = ldc * 8 @@ -1335,8 +1410,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /******************************************************************************/ ctrmm_kernel_L4_BEGIN: - mov pCRow0, pC // pCRow0 = C - add pC, pC, LDC, lsl #2 + mov pCRow0, pC + add pCRow1, pCRow0, LDC + add pCRow2, pCRow1, LDC + add pCRow3, pCRow2, LDC + + add pC, pCRow3, LDC + #if defined(LEFT) mov tempOffset, offset @@ -1370,40 +1450,64 @@ ctrmm_kernel_L4_M8_20: add tempK, tempOffset, #4 #endif - asr counterL , tempK, #1 // L = K / 2 - cmp counterL , #2 // is there at least 4 to do? 
+ asr counterL , tempK, #3 + cmp counterL , #2 blt ctrmm_kernel_L4_M8_32 - KERNEL8x4_I // do one in the K - KERNEL8x4_M2 // do another in the K + KERNEL8x4_I + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 subs counterL, counterL, #2 // subtract 2 ble ctrmm_kernel_L4_M8_22a - .align 5 + .align 5 ctrmm_kernel_L4_M8_22: KERNEL8x4_M1 KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 subs counterL, counterL, #1 bgt ctrmm_kernel_L4_M8_22 - + .align 5 ctrmm_kernel_L4_M8_22a: + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_E b ctrmm_kernel_L4_M8_44 + .align 5 ctrmm_kernel_L4_M8_32: tst counterL, #1 ble ctrmm_kernel_L4_M8_40 KERNEL8x4_I - + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 KERNEL8x4_E b ctrmm_kernel_L4_M8_44 @@ -1414,13 +1518,17 @@ ctrmm_kernel_L4_M8_40: ctrmm_kernel_L4_M8_44: - ands counterL , tempK, #1 + ands counterL , tempK, #7 ble ctrmm_kernel_L4_M8_100 + .align 5 ctrmm_kernel_L4_M8_46: KERNEL8x4_SUB + subs counterL, counterL, #1 + bne ctrmm_kernel_L4_M8_46 + ctrmm_kernel_L4_M8_100: SAVE8x4 @@ -1440,6 +1548,9 @@ ctrmm_kernel_L4_M8_100: #if defined(LEFT) add tempOffset, tempOffset, #8 #endif + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] ctrmm_kernel_L4_M8_END: subs counterI, counterI, #1 @@ -1454,9 +1565,8 @@ ctrmm_kernel_L4_M4_BEGIN: tst counterI, #4 ble ctrmm_kernel_L4_M2_BEGIN -ctrmm_kernel_L4_M4_20: - INIT4x4 +ctrmm_kernel_L4_M4_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB @@ -1475,38 +1585,47 @@ ctrmm_kernel_L4_M4_20: add tempK, tempOffset, #4 #endif - asr counterL , tempK, #3 // counterL = counterL / 8 - cmp counterL , #0 - ble ctrmm_kernel_L4_M4_40 + asr counterL , tempK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? + blt ctrmm_kernel_L4_M4_32 -ctrmm_kernel_L4_M4_22: + KERNEL4x4_I // do one in the K + KERNEL4x4_M2 // do another in the K - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB + subs counterL, counterL, #2 + ble ctrmm_kernel_L4_M4_22a + .align 5 - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB + +ctrmm_kernel_L4_M4_22: + + KERNEL4x4_M1 + KERNEL4x4_M2 subs counterL, counterL, #1 bgt ctrmm_kernel_L4_M4_22 - +ctrmm_kernel_L4_M4_22a: + KERNEL4x4_M1 + KERNEL4x4_E + b ctrmm_kernel_L4_M4_44 +ctrmm_kernel_L4_M4_32: + tst counterL, #1 + ble ctrmm_kernel_L4_M4_40 + KERNEL4x4_I + KERNEL4x4_E + b ctrmm_kernel_L4_M4_44 ctrmm_kernel_L4_M4_40: - ands counterL , tempK, #7 // counterL = counterL % 8 - ble ctrmm_kernel_L4_M4_100 + INIT4x4 -ctrmm_kernel_L4_M4_42: +ctrmm_kernel_L4_M4_44: + ands counterL , tempK, #1 + ble ctrmm_kernel_L4_M4_100 +ctrmm_kernel_L4_M4_46: KERNEL4x4_SUB - subs counterL, counterL, #1 - bgt ctrmm_kernel_L4_M4_42 - ctrmm_kernel_L4_M4_100: SAVE4x4 @@ -1528,7 +1647,6 @@ ctrmm_kernel_L4_M4_100: ctrmm_kernel_L4_M4_END: - ctrmm_kernel_L4_M2_BEGIN: mov counterI, origM diff --git a/kernel/arm64/dgemm_kernel_8x4.S b/kernel/arm64/dgemm_kernel_8x4.S index f3c3d5c35d..3fd74fc3ba 100644 --- a/kernel/arm64/dgemm_kernel_8x4.S +++ b/kernel/arm64/dgemm_kernel_8x4.S @@ -339,7 +339,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
stp q0, q1, [pCRow0] add pCRow0, pCRow0, #32 - prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] ldp q2, q3, [pCRow0] fmla v2.2d, v18.2d, alphaV0 @@ -356,7 +355,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stp q4, q5, [pCRow1] add pCRow1, pCRow1, #32 - prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] ldp q6, q7, [pCRow1] fmla v6.2d, v22.2d, alphaV0 @@ -373,7 +371,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stp q0, q1, [pCRow2] add pCRow2, pCRow2, #32 - prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] ldp q2, q3, [pCRow2] fmla v2.2d, v26.2d, alphaV0 @@ -390,7 +387,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stp q4, q5, [pCRow3] add pCRow3, pCRow3, #32 - prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] ldp q6, q7, [pCRow3] fmla v6.2d, v30.2d, alphaV0 @@ -434,33 +430,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE4x4 fmov alpha0, alpha + ld1 {v8.2d, v9.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 fmla v9.2d, v17.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow0] - add pCRow1, pCRow0, LDC + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + add pCRow0, pCRow0, #32 ld1 {v12.2d, v13.2d}, [pCRow1] fmla v12.2d, v20.2d, alphaV0 fmla v13.2d, v21.2d, alphaV0 st1 {v12.2d, v13.2d}, [pCRow1] - add pCRow2, pCRow1, LDC + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + add pCRow1, pCRow1, #32 ld1 {v8.2d, v9.2d}, [pCRow2] fmla v8.2d, v24.2d, alphaV0 fmla v9.2d, v25.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow2] - add pCRow1, pCRow2, LDC + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + add pCRow2, pCRow2, #32 - ld1 {v12.2d, v13.2d}, [pCRow1] + ld1 {v12.2d, v13.2d}, [pCRow3] fmla v12.2d, v28.2d, alphaV0 fmla v13.2d, v29.2d, alphaV0 - st1 {v12.2d, v13.2d}, [pCRow1] + st1 {v12.2d, v13.2d}, [pCRow3] - add pCRow0, pCRow0, #32 + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + add pCRow3, pCRow3, #32 .endm /******************************************************************************/ @@ -487,29 +488,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE2x4 fmov alpha0, alpha + ld1 {v8.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 st1 {v8.2d}, [pCRow0] - add pCRow1, pCRow0, LDC + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + add pCRow0, pCRow0, #16 ld1 {v12.2d}, [pCRow1] fmla v12.2d, v20.2d, alphaV0 st1 {v12.2d}, [pCRow1] - add pCRow2, pCRow1, LDC + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + add pCRow1, pCRow1, #16 ld1 {v8.2d}, [pCRow2] fmla v8.2d, v24.2d, alphaV0 st1 {v8.2d}, [pCRow2] - add pCRow1, pCRow2, LDC + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + add pCRow2, pCRow2, #16 - ld1 {v12.2d}, [pCRow1] + ld1 {v12.2d}, [pCRow3] fmla v12.2d, v28.2d, alphaV0 - st1 {v12.2d}, [pCRow1] + st1 {v12.2d}, [pCRow3] - add pCRow0, pCRow0, #16 + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + add pCRow3, pCRow3, #16 .endm /******************************************************************************/ @@ -532,7 +538,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE1x4 fmov alpha0, alpha - add pCRow1, pCRow0, LDC ld1 {v8.d}[0], [pCRow0] ld1 {v8.d}[1], [pCRow1] @@ -540,16 +545,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
st1 {v8.d}[0], [pCRow0] st1 {v8.d}[1], [pCRow1] - add pCRow2, pCRow1, LDC - add pCRow1, pCRow2, LDC + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + add pCRow0, pCRow0, #8 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + add pCRow1, pCRow1, #8 ld1 {v12.d}[0], [pCRow2] - ld1 {v12.d}[1], [pCRow1] + ld1 {v12.d}[1], [pCRow3] fmla v12.2d, v20.2d, alphaV0 st1 {v12.d}[0], [pCRow2] - st1 {v12.d}[1], [pCRow1] + st1 {v12.d}[1], [pCRow3] - add pCRow0, pCRow0, #8 + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + add pCRow2, pCRow2, #8 + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + add pCRow3, pCRow3, #8 .endm /******************************************************************************/ @@ -578,6 +588,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmla v18.2d, v2.2d, v8.d[0] fmla v19.2d, v3.2d, v8.d[0] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla v20.2d, v0.2d, v8.d[1] fmla v21.2d, v1.2d, v8.d[1] fmla v22.2d, v2.2d, v8.d[1] @@ -586,7 +598,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE8x2 fmov alpha0, alpha - add pCRow1, pCRow0, LDC ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] fmla v0.2d, v16.2d, alphaV0 @@ -595,6 +606,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmla v3.2d, v19.2d, alphaV0 st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + add pCRow0, pCRow0, #64 + ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] fmla v4.2d, v20.2d, alphaV0 fmla v5.2d, v21.2d, alphaV0 @@ -602,7 +616,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmla v7.2d, v23.2d, alphaV0 st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] - add pCRow0, pCRow0, #64 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + add pCRow1, pCRow1, #64 .endm /******************************************************************************/ @@ -628,19 +643,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE4x2 fmov alpha0, alpha + ld1 {v8.2d, v9.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 fmla v9.2d, v17.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow0] - add pCRow1, pCRow0, LDC + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + add pCRow0, pCRow0, #32 ld1 {v12.2d, v13.2d}, [pCRow1] fmla v12.2d, v20.2d, alphaV0 fmla v13.2d, v21.2d, alphaV0 st1 {v12.2d, v13.2d}, [pCRow1] - add pCRow0, pCRow0, #32 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + add pCRow1, pCRow1, #32 .endm /******************************************************************************/ @@ -663,17 +681,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE2x2 fmov alpha0, alpha + ld1 {v8.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 st1 {v8.2d}, [pCRow0] - add pCRow1 , pCRow0, LDC + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + add pCRow0, pCRow0, #16 ld1 {v12.2d}, [pCRow1] fmla v12.2d, v20.2d, alphaV0 st1 {v12.2d}, [pCRow1] - add pCRow0, pCRow0, #16 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + add pCRow1, pCRow1, #16 .endm /******************************************************************************/ @@ -694,7 +715,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE1x2 fmov alpha0, alpha - add pCRow1 , pCRow0, LDC ld1 {v8.d}[0], [pCRow0] ld1 {v8.d}[1], [pCRow1] @@ -702,7 +722,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
st1 {v8.d}[0], [pCRow0] st1 {v8.d}[1], [pCRow1] + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] add pCRow0, pCRow0, #8 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + add pCRow1, pCRow1, #8 .endm /******************************************************************************/ @@ -726,12 +749,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmla v16.2d, v0.2d, v8.d[0] fmla v17.2d, v1.2d, v8.d[0] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] fmla v18.2d, v2.2d, v8.d[0] fmla v19.2d, v3.2d, v8.d[0] .endm .macro SAVE8x1 fmov alpha0, alpha + ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] fmla v0.2d, v16.2d, alphaV0 fmla v1.2d, v17.2d, alphaV0 @@ -739,6 +764,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmla v3.2d, v19.2d, alphaV0 st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] add pCRow0, pCRow0, #64 .endm @@ -763,11 +789,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE4x1 fmov alpha0, alpha + ld1 {v8.2d, v9.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 fmla v9.2d, v17.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow0] + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] add pCRow0, pCRow0, #32 .endm @@ -790,10 +818,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE2x1 fmov alpha0, alpha + ld1 {v8.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 st1 {v8.2d}, [pCRow0] + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] add pCRow0, pCRow0, #16 .endm @@ -819,6 +849,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmadd d8, d16, alpha0, d8 str d8, [pCRow0] + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] add pCRow0, pCRow0, #8 .endm @@ -858,6 +889,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /******************************************************************************/ + .align 5 dgemm_kernel_L4_BEGIN: mov pCRow0, pC add pCRow1, pCRow0, LDC @@ -989,17 +1021,26 @@ dgemm_kernel_L4_M4_20: cmp counterL , #0 ble dgemm_kernel_L4_M4_40 + .align 5 dgemm_kernel_L4_M4_22: KERNEL4x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL4x4_SUB + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL4x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL4x4_SUB + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL4x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL4x4_SUB + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL4x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL4x4_SUB + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] subs counterL, counterL, #1 bgt dgemm_kernel_L4_M4_22 @@ -1012,6 +1053,8 @@ dgemm_kernel_L4_M4_40: dgemm_kernel_L4_M4_42: KERNEL4x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] subs counterL, counterL, #1 bgt dgemm_kernel_L4_M4_42 @@ -1022,7 +1065,6 @@ dgemm_kernel_L4_M4_100: dgemm_kernel_L4_M4_END: - dgemm_kernel_L4_M2_BEGIN: mov counterI, origM @@ -1042,16 +1084,23 @@ dgemm_kernel_L4_M2_20: cmp counterL , #0 ble dgemm_kernel_L4_M2_40 + .align 5 dgemm_kernel_L4_M2_22: KERNEL2x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL2x4_SUB + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL2x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL2x4_SUB KERNEL2x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL2x4_SUB + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL2x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL2x4_SUB subs counterL, counterL, #1 @@ -1063,9 +1112,12 @@ dgemm_kernel_L4_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 ble dgemm_kernel_L4_M2_100 + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] dgemm_kernel_L4_M2_42: KERNEL2x4_SUB + prfm 
PLDL1KEEP, [pB, #B_PRE_SIZE] subs counterL, counterL, #1 bgt dgemm_kernel_L4_M2_42 @@ -1092,15 +1144,22 @@ dgemm_kernel_L4_M1_20: cmp counterL , #0 ble dgemm_kernel_L4_M1_40 + .align 5 dgemm_kernel_L4_M1_22: KERNEL1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL1x4_SUB KERNEL1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL1x4_SUB + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + KERNEL1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL1x4_SUB KERNEL1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL1x4_SUB subs counterL, counterL, #1 @@ -1112,9 +1171,11 @@ dgemm_kernel_L4_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 ble dgemm_kernel_L4_M1_100 + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] dgemm_kernel_L4_M1_42: KERNEL1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] subs counterL, counterL, #1 bgt dgemm_kernel_L4_M1_42 @@ -1143,9 +1204,10 @@ dgemm_kernel_L2_BEGIN: // less than 2 left in N direction tst counterJ , #2 ble dgemm_kernel_L1_BEGIN - mov pCRow0, pC // pCRow0 = pC + mov pCRow0, pC + add pCRow1, pCRow0, LDC - add pC,pC,LDC, lsl #1 + add pC, pCRow1, LDC mov pA, origPA // pA = A @@ -1156,6 +1218,7 @@ dgemm_kernel_L2_M8_BEGIN: cmp counterI, #0 ble dgemm_kernel_L2_M4_BEGIN + .align 5 dgemm_kernel_L2_M8_20: INIT8x2 @@ -1165,28 +1228,31 @@ dgemm_kernel_L2_M8_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 ble dgemm_kernel_L2_M8_40 - .align 5 + .align 5 dgemm_kernel_L2_M8_22: KERNEL8x2_SUB KERNEL8x2_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL8x2_SUB KERNEL8x2_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L2_M8_22 - dgemm_kernel_L2_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 ble dgemm_kernel_L2_M8_100 + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] dgemm_kernel_L2_M8_42: KERNEL8x2_SUB @@ -1221,17 +1287,23 @@ dgemm_kernel_L2_M4_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 ble dgemm_kernel_L2_M4_40 - .align 5 + .align 5 dgemm_kernel_L2_M4_22: KERNEL4x2_SUB + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL4x2_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL4x2_SUB + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL4x2_SUB KERNEL4x2_SUB + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL4x2_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL4x2_SUB + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL4x2_SUB subs counterL, counterL, #1 @@ -1243,9 +1315,12 @@ dgemm_kernel_L2_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 ble dgemm_kernel_L2_M4_100 + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] dgemm_kernel_L2_M4_42: KERNEL4x2_SUB + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] subs counterL, counterL, #1 bgt dgemm_kernel_L2_M4_42 @@ -1279,19 +1354,26 @@ dgemm_kernel_L2_M2_20: dgemm_kernel_L2_M2_22: KERNEL2x2_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL2x2_SUB KERNEL2x2_SUB + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL2x2_SUB KERNEL2x2_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL2x2_SUB KERNEL2x2_SUB + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL2x2_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L2_M2_22 - + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] dgemm_kernel_L2_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 @@ -1329,18 +1411,24 @@ dgemm_kernel_L2_M1_20: dgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL1x2_SUB KERNEL1x2_SUB + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + 
KERNEL1x2_SUB KERNEL1x2_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL1x2_SUB KERNEL1x2_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L2_M1_22 - + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] dgemm_kernel_L2_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 @@ -1380,6 +1468,7 @@ dgemm_kernel_L1_M8_BEGIN: cmp counterI, #0 ble dgemm_kernel_L1_M4_BEGIN + .align 5 dgemm_kernel_L1_M8_20: INIT8x1 @@ -1388,14 +1477,16 @@ dgemm_kernel_L1_M8_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble dgemm_kernel_L1_M8_40 - .align 5 + .align 5 dgemm_kernel_L1_M8_22: KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB @@ -1410,6 +1501,7 @@ dgemm_kernel_L1_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 ble dgemm_kernel_L1_M8_100 + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] dgemm_kernel_L1_M8_42: KERNEL8x1_SUB @@ -1443,17 +1535,23 @@ dgemm_kernel_L1_M4_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble dgemm_kernel_L1_M4_40 - .align 5 + .align 5 dgemm_kernel_L1_M4_22: KERNEL4x1_SUB + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL4x1_SUB KERNEL4x1_SUB + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL4x1_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNEL4x1_SUB + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL4x1_SUB KERNEL4x1_SUB + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL4x1_SUB subs counterL, counterL, #1 @@ -1465,9 +1563,11 @@ dgemm_kernel_L1_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 ble dgemm_kernel_L1_M4_100 + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] dgemm_kernel_L1_M4_42: KERNEL4x1_SUB + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] subs counterL, counterL, #1 bgt dgemm_kernel_L1_M4_42 @@ -1501,18 +1601,24 @@ dgemm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL2x1_SUB KERNEL2x1_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNEL2x1_SUB KERNEL2x1_SUB + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL2x1_SUB KERNEL2x1_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L1_M2_22 - + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] dgemm_kernel_L1_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 @@ -1547,14 +1653,17 @@ dgemm_kernel_L1_M1_20: cmp counterL , #0 ble dgemm_kernel_L1_M1_40 + dgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL1x1_SUB KERNEL1x1_SUB @@ -1567,6 +1676,8 @@ dgemm_kernel_L1_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 ble dgemm_kernel_L1_M1_100 + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] dgemm_kernel_L1_M1_42: KERNEL1x1_SUB diff --git a/kernel/arm64/dtrmm_kernel_8x4.S b/kernel/arm64/dtrmm_kernel_8x4.S index b06c7560d6..2b8173715d 100644 --- a/kernel/arm64/dtrmm_kernel_8x4.S +++ b/kernel/arm64/dtrmm_kernel_8x4.S @@ -46,19 +46,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define pCRow0 x12 #define pCRow1 x13 #define pCRow2 x14 -#define pA x15 -#define temp x16 -#define tempOffset x17 -#define tempK x18 +#define pCRow3 x15 +#define pA x16 +#define alpha x17 +#define temp x18 +#define tempOffset x19 +#define tempK x20 #define alpha0 d10 #define alphaV0 v10.d[0] -#define alpha1 d11 -#define alphaV1 v11.d[0] -#define alpha2 d14 -#define alphaV2 v14.d[0] -#define alpha3 d15 -#define alphaV3 v15.d[0] + +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 448 +#define C_PRE_SIZE 128 // 00 origM // 01 origN @@ -101,14 +101,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //v05 pA1_2, pA1_3 //v06 pA1_4, pA1_5 //v07 pA1_6, pA1_7 -//v08 must save pB0_0, pB0_1 -//v09 must save pB0_2, pB0_3 -//v10 must save ALPHA0 -//v11 must save ALPHA1 -//v12 must save pB1_0, pB1_1 -//v13 must save pB1_2, pB1_3 -//v14 must save ALPHA2 -//v15 must save ALPHA3 +//v08 must save pB0_0 +//v09 must save pB0_1 +//v10 must save pB0_2 --> ALPHA0 +//v11 must save pB0_3 +//v12 must save pB1_0 +//v13 must save pB1_1 +//v14 must save pB1_2 +//v15 must save pB1_3 //v16 must save C00, C01 //v17 must save C02, C03 //v18 C04, C05 @@ -150,186 +150,249 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_I - ld1 {v0.2d, v1.2d}, [pA] - add pA, pA, #32 - ld1 {v8.2d, v9.2d}, [pB] - add pB, pB, #32 - ld1 {v2.2d, v3.2d}, [pA] - add pA, pA, #32 + ldp q0, q1, [pA], #32 + + ldp d8, d9, [pB], #16 fmul v16.2d, v0.2d, v8.d[0] + fmul v20.2d, v0.2d, v9.d[0] + + ldp d10, d11, [pB], #16 + fmul v17.2d, v1.2d, v8.d[0] + fmul v21.2d, v1.2d, v9.d[0] + + ldp q2, q3, [pA], #32 + + fmul v24.2d, v0.2d, v10.d[0] + fmul v28.2d, v0.2d, v11.d[0] + + ldp q4, q5, [pA], #32 + + fmul v25.2d, v1.2d, v10.d[0] + fmul v29.2d, v1.2d, v11.d[0] + + ldp d12, d13, [pB], #16 + fmul v18.2d, v2.2d, v8.d[0] - fmul v19.2d, v3.2d, v8.d[0] + fmul v22.2d, v2.2d, v9.d[0] - fmul v20.2d, v0.2d, v8.d[1] - fmul v21.2d, v1.2d, v8.d[1] - fmul v22.2d, v2.2d, v8.d[1] - fmul v23.2d, v3.2d, v8.d[1] + ldp d14, d15, [pB], #16 - fmul v24.2d, v0.2d, v9.d[0] - fmul v25.2d, v1.2d, v9.d[0] - fmul v26.2d, v2.2d, v9.d[0] - fmul v27.2d, v3.2d, v9.d[0] + fmul v26.2d, v2.2d, v10.d[0] + fmul v30.2d, v2.2d, v11.d[0] - fmul v28.2d, v0.2d, v9.d[1] - fmul v29.2d, v1.2d, v9.d[1] - fmul v30.2d, v2.2d, v9.d[1] - fmul v31.2d, v3.2d, v9.d[1] + ldp q6, q7, [pA], #32 - ld1 {v4.2d, v5.2d}, [pA] - add pA, pA, #32 - ld1 {v12.2d, v13.2d}, [pB] - add pB, pB, #32 - ld1 {v6.2d, v7.2d}, [pA] - add pA, pA, #32 + fmul v19.2d, v3.2d, v8.d[0] + fmul v27.2d, v3.2d, v10.d[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + + fmul v31.2d, v3.2d, v11.d[0] + fmul v23.2d, v3.2d, v9.d[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] .endm .macro KERNEL8x4_M1 fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v9.d[0] + + ldp q4, q5, [pA], #32 + + fmla v24.2d, v0.2d, v10.d[0] + fmla v28.2d, v0.2d, v11.d[0] + + ldp d12, d13, [pB], #16 + fmla v17.2d, v1.2d, v8.d[0] - fmla v18.2d, v2.2d, v8.d[0] - fmla v19.2d, v3.2d, v8.d[0] + fmla v25.2d, v1.2d, v10.d[0] - fmla v20.2d, v0.2d, v8.d[1] - fmla v21.2d, v1.2d, v8.d[1] - fmla v22.2d, v2.2d, v8.d[1] - fmla v23.2d, v3.2d, v8.d[1] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] - fmla v24.2d, v0.2d, v9.d[0] - fmla v25.2d, v1.2d, v9.d[0] - fmla v26.2d, v2.2d, v9.d[0] - fmla v27.2d, v3.2d, v9.d[0] + fmla v21.2d, v1.2d, v9.d[0] + fmla v29.2d, v1.2d, v11.d[0] - fmla v28.2d, v0.2d, v9.d[1] - fmla v29.2d, v1.2d, v9.d[1] - fmla v30.2d, v2.2d, v9.d[1] - fmla v31.2d, v3.2d, v9.d[1] + ldp d14, d15, [pB], #16 - ld1 {v4.2d, v5.2d}, 
[pA] - add pA, pA, #32 - ld1 {v12.2d, v13.2d}, [pB] - add pB, pB, #32 - ld1 {v6.2d, v7.2d}, [pA] - add pA, pA, #32 + fmla v18.2d, v2.2d, v8.d[0] + fmla v22.2d, v2.2d, v9.d[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + + fmla v26.2d, v2.2d, v10.d[0] + fmla v30.2d, v2.2d, v11.d[0] + fmla v19.2d, v3.2d, v8.d[0] + fmla v23.2d, v3.2d, v9.d[0] + + ldp q6, q7, [pA], #32 - prfm PLDL1KEEP, [pA, #512] + fmla v27.2d, v3.2d, v10.d[0] + fmla v31.2d, v3.2d, v11.d[0] .endm .macro KERNEL8x4_M2 fmla v16.2d, v4.2d, v12.d[0] + fmla v20.2d, v4.2d, v13.d[0] + fmla v24.2d, v4.2d, v14.d[0] + fmla v28.2d, v4.2d, v15.d[0] + + ldp q0, q1, [pA], #32 + fmla v17.2d, v5.2d, v12.d[0] + fmla v25.2d, v5.2d, v14.d[0] + + ldp d8, d9, [pB], #16 + + fmla v21.2d, v5.2d, v13.d[0] + fmla v29.2d, v5.2d, v15.d[0] + + ldp d10, d11, [pB], #16 + fmla v18.2d, v6.2d, v12.d[0] - fmla v19.2d, v7.2d, v12.d[0] + fmla v22.2d, v6.2d, v13.d[0] - fmla v20.2d, v4.2d, v12.d[1] - fmla v21.2d, v5.2d, v12.d[1] - fmla v22.2d, v6.2d, v12.d[1] - fmla v23.2d, v7.2d, v12.d[1] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - fmla v24.2d, v4.2d, v13.d[0] - fmla v25.2d, v5.2d, v13.d[0] - fmla v26.2d, v6.2d, v13.d[0] - fmla v27.2d, v7.2d, v13.d[0] + fmla v26.2d, v6.2d, v14.d[0] + fmla v30.2d, v6.2d, v15.d[0] - fmla v28.2d, v4.2d, v13.d[1] - fmla v29.2d, v5.2d, v13.d[1] - fmla v30.2d, v6.2d, v13.d[1] - fmla v31.2d, v7.2d, v13.d[1] + fmla v19.2d, v7.2d, v12.d[0] + fmla v23.2d, v7.2d, v13.d[0] - ld1 {v0.2d, v1.2d}, [pA] - add pA, pA, #32 - ld1 {v8.2d, v9.2d}, [pB] - add pB, pB, #32 - ld1 {v2.2d, v3.2d}, [pA] - add pA, pA, #32 + ldp q2, q3, [pA], #32 - prfm PLDL1KEEP, [pB, #512] + fmla v27.2d, v7.2d, v14.d[0] + fmla v31.2d, v7.2d, v15.d[0] .endm .macro KERNEL8x4_E fmla v16.2d, v4.2d, v12.d[0] + fmla v20.2d, v4.2d, v13.d[0] + fmla v24.2d, v4.2d, v14.d[0] + fmla v28.2d, v4.2d, v15.d[0] + fmla v17.2d, v5.2d, v12.d[0] - fmla v18.2d, v6.2d, v12.d[0] - fmla v19.2d, v7.2d, v12.d[0] + fmla v25.2d, v5.2d, v14.d[0] + fmla v21.2d, v5.2d, v13.d[0] + fmla v29.2d, v5.2d, v15.d[0] - fmla v20.2d, v4.2d, v12.d[1] - fmla v21.2d, v5.2d, v12.d[1] - fmla v22.2d, v6.2d, v12.d[1] - fmla v23.2d, v7.2d, v12.d[1] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - fmla v24.2d, v4.2d, v13.d[0] - fmla v25.2d, v5.2d, v13.d[0] - fmla v26.2d, v6.2d, v13.d[0] - fmla v27.2d, v7.2d, v13.d[0] + fmla v18.2d, v6.2d, v12.d[0] + fmla v22.2d, v6.2d, v13.d[0] + fmla v26.2d, v6.2d, v14.d[0] + fmla v30.2d, v6.2d, v15.d[0] - fmla v28.2d, v4.2d, v13.d[1] - fmla v29.2d, v5.2d, v13.d[1] - fmla v30.2d, v6.2d, v13.d[1] - fmla v31.2d, v7.2d, v13.d[1] + fmla v19.2d, v7.2d, v12.d[0] + fmla v23.2d, v7.2d, v13.d[0] + fmla v27.2d, v7.2d, v14.d[0] + fmla v31.2d, v7.2d, v15.d[0] .endm .macro KERNEL8x4_SUB - ld1 {v0.2d, v1.2d}, [pA] - add pA, pA, #32 - ld1 {v8.2d, v9.2d}, [pB] - add pB, pB, #32 - ld1 {v2.2d, v3.2d}, [pA] - add pA, pA, #32 + ldp q0, q1, [pA], #32 + + ldp d8, d9, [pB], #16 fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v9.d[0] + + ldp d10, d11, [pB], #16 + fmla v17.2d, v1.2d, v8.d[0] + fmla v21.2d, v1.2d, v9.d[0] + + ldp q2, q3, [pA], #32 + + fmla v24.2d, v0.2d, v10.d[0] + fmla v28.2d, v0.2d, v11.d[0] + + fmla v25.2d, v1.2d, v10.d[0] + fmla v29.2d, v1.2d, v11.d[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla v18.2d, v2.2d, v8.d[0] - fmla v19.2d, v3.2d, v8.d[0] + fmla v22.2d, v2.2d, v9.d[0] - fmla v20.2d, v0.2d, v8.d[1] - fmla v21.2d, v1.2d, v8.d[1] - fmla v22.2d, v2.2d, v8.d[1] - fmla v23.2d, v3.2d, v8.d[1] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] - fmla v24.2d, v0.2d, v9.d[0] - fmla v25.2d, v1.2d, v9.d[0] - fmla v26.2d, v2.2d, 
v9.d[0] - fmla v27.2d, v3.2d, v9.d[0] + fmla v26.2d, v2.2d, v10.d[0] + fmla v30.2d, v2.2d, v11.d[0] - fmla v28.2d, v0.2d, v9.d[1] - fmla v29.2d, v1.2d, v9.d[1] - fmla v30.2d, v2.2d, v9.d[1] - fmla v31.2d, v3.2d, v9.d[1] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + fmla v19.2d, v3.2d, v8.d[0] + fmla v27.2d, v3.2d, v10.d[0] + + fmla v31.2d, v3.2d, v11.d[0] + fmla v23.2d, v3.2d, v9.d[0] .endm .macro SAVE8x4 - add pCRow1, pCRow0, LDC + fmov alpha0, alpha + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] fmul v0.2d, v16.2d, alphaV0 - fmul v1.2d, v17.2d, alphaV1 - fmul v2.2d, v18.2d, alphaV2 - fmul v3.2d, v19.2d, alphaV3 - st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] + fmul v1.2d, v17.2d, alphaV0 + stp q0, q1, [pCRow0] - add pCRow2, pCRow1, LDC + add pCRow0, pCRow0, #32 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + fmul v2.2d, v18.2d, alphaV0 + fmul v3.2d, v19.2d, alphaV0 + stp q2, q3, [pCRow0] + + add pCRow0, pCRow0, #32 + + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] fmul v4.2d, v20.2d, alphaV0 - fmul v5.2d, v21.2d, alphaV1 - fmul v6.2d, v22.2d, alphaV2 - fmul v7.2d, v23.2d, alphaV3 - st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] + fmul v5.2d, v21.2d, alphaV0 + stp q4, q5, [pCRow1] - add pCRow1, pCRow2, LDC + add pCRow1, pCRow1, #32 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + fmul v6.2d, v22.2d, alphaV0 + fmul v7.2d, v23.2d, alphaV0 + stp q6, q7, [pCRow1] + + add pCRow1, pCRow1, #32 + + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] fmul v0.2d, v24.2d, alphaV0 - fmul v1.2d, v25.2d, alphaV1 - fmul v2.2d, v26.2d, alphaV2 - fmul v3.2d, v27.2d, alphaV3 - st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow2] + fmul v1.2d, v25.2d, alphaV0 + stp q0, q1, [pCRow2] + + add pCRow2, pCRow2, #32 + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + fmul v2.2d, v26.2d, alphaV0 + fmul v3.2d, v27.2d, alphaV0 + stp q2, q3, [pCRow2] + + add pCRow2, pCRow2, #32 + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] fmul v4.2d, v28.2d, alphaV0 - fmul v5.2d, v29.2d, alphaV1 - fmul v6.2d, v30.2d, alphaV2 - fmul v7.2d, v31.2d, alphaV3 - st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] + fmul v5.2d, v29.2d, alphaV0 + stp q4, q5, [pCRow3] - add pCRow0, pCRow0, #64 + add pCRow3, pCRow3, #32 + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + + fmul v6.2d, v30.2d, alphaV0 + fmul v7.2d, v31.2d, alphaV0 + stp q6, q7, [pCRow3] + + add pCRow3, pCRow3, #32 .endm /******************************************************************************/ @@ -365,26 +428,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x4 + fmov alpha0, alpha fmul v8.2d, v16.2d, alphaV0 - fmul v9.2d, v17.2d, alphaV1 + fmul v9.2d, v17.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow0] add pCRow1, pCRow0, LDC - fmul v12.2d, v20.2d, alphaV2 - fmul v13.2d, v21.2d, alphaV3 + fmul v12.2d, v20.2d, alphaV0 + fmul v13.2d, v21.2d, alphaV0 st1 {v12.2d, v13.2d}, [pCRow1] add pCRow2, pCRow1, LDC fmul v8.2d, v24.2d, alphaV0 - fmul v9.2d, v25.2d, alphaV1 + fmul v9.2d, v25.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow2] add pCRow1, pCRow2, LDC - fmul v12.2d, v28.2d, alphaV2 - fmul v13.2d, v29.2d, alphaV3 + fmul v12.2d, v28.2d, alphaV0 + fmul v13.2d, v29.2d, alphaV0 st1 {v12.2d, v13.2d}, [pCRow1] add pCRow0, pCRow0, #32 @@ -413,22 +477,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro SAVE2x4 + fmov alpha0, alpha fmul v8.2d, v16.2d, alphaV0 st1 {v8.2d}, [pCRow0] add pCRow1, pCRow0, LDC - fmul v12.2d, v20.2d, alphaV1 + fmul v12.2d, v20.2d, alphaV0 st1 {v12.2d}, [pCRow1] add pCRow2, pCRow1, LDC - fmul v8.2d, v24.2d, alphaV2 + fmul v8.2d, v24.2d, alphaV0 st1 {v8.2d}, [pCRow2] add pCRow1, pCRow2, LDC - fmul v12.2d, v28.2d, alphaV3 + fmul v12.2d, v28.2d, alphaV0 st1 {v12.2d}, [pCRow1] add pCRow0, pCRow0, #16 @@ -453,6 +518,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x4 + fmov alpha0, alpha + add pCRow1, pCRow0, LDC fmul v8.2d, v16.2d, alphaV0 @@ -462,7 +529,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add pCRow2, pCRow1, LDC add pCRow1, pCRow2, LDC - fmul v12.2d, v20.2d, alphaV1 + fmul v12.2d, v20.2d, alphaV0 st1 {v12.d}[0], [pCRow2] st1 {v12.d}[1], [pCRow1] @@ -502,18 +569,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE8x2 + fmov alpha0, alpha add pCRow1, pCRow0, LDC fmul v0.2d, v16.2d, alphaV0 - fmul v1.2d, v17.2d, alphaV1 - fmul v2.2d, v18.2d, alphaV2 - fmul v3.2d, v19.2d, alphaV3 + fmul v1.2d, v17.2d, alphaV0 + fmul v2.2d, v18.2d, alphaV0 + fmul v3.2d, v19.2d, alphaV0 st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] fmul v4.2d, v20.2d, alphaV0 - fmul v5.2d, v21.2d, alphaV1 - fmul v6.2d, v22.2d, alphaV2 - fmul v7.2d, v23.2d, alphaV3 + fmul v5.2d, v21.2d, alphaV0 + fmul v6.2d, v22.2d, alphaV0 + fmul v7.2d, v23.2d, alphaV0 st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] add pCRow0, pCRow0, #64 @@ -541,14 +609,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x2 + fmov alpha0, alpha fmul v8.2d, v16.2d, alphaV0 - fmul v9.2d, v17.2d, alphaV1 + fmul v9.2d, v17.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow0] add pCRow1, pCRow0, LDC - fmul v12.2d, v20.2d, alphaV2 - fmul v13.2d, v21.2d, alphaV3 + fmul v12.2d, v20.2d, alphaV0 + fmul v13.2d, v21.2d, alphaV0 st1 {v12.2d, v13.2d}, [pCRow1] add pCRow0, pCRow0, #32 @@ -573,12 +642,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x2 + fmov alpha0, alpha fmul v8.2d, v16.2d, alphaV0 st1 {v8.2d}, [pCRow0] add pCRow1 , pCRow0, LDC - fmul v12.2d, v20.2d, alphaV1 + fmul v12.2d, v20.2d, alphaV0 st1 {v12.2d}, [pCRow1] add pCRow0, pCRow0, #16 @@ -601,6 +671,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x2 + fmov alpha0, alpha add pCRow1 , pCRow0, LDC fmul v8.2d, v16.2d, alphaV0 @@ -636,10 +707,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE8x1 + fmov alpha0, alpha fmul v0.2d, v16.2d, alphaV0 - fmul v1.2d, v17.2d, alphaV1 - fmul v2.2d, v18.2d, alphaV2 - fmul v3.2d, v19.2d, alphaV3 + fmul v1.2d, v17.2d, alphaV0 + fmul v2.2d, v18.2d, alphaV0 + fmul v3.2d, v19.2d, alphaV0 st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] add pCRow0, pCRow0, #64 @@ -665,8 +737,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x1 + fmov alpha0, alpha fmul v8.2d, v16.2d, alphaV0 - fmul v9.2d, v17.2d, alphaV1 + fmul v9.2d, v17.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow0] add pCRow0, pCRow0, #32 @@ -690,6 +763,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x1 + fmov alpha0, alpha fmul v8.2d, v16.2d, alphaV0 st1 {v8.2d}, [pCRow0] @@ -713,6 +787,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro SAVE1x1 + fmov alpha0, alpha fmul d8, d16, alpha0 str d8, [pCRow0] @@ -739,10 +814,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] - fmov alpha0, d0 - fmov alpha1, d0 - fmov alpha2, d0 - fmov alpha3, d0 + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alpha, d0 lsl LDC, LDC, #3 // ldc = ldc * 8 @@ -759,8 +834,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /******************************************************************************/ dtrmm_kernel_L4_BEGIN: - mov pCRow0, pC // pCRow0 = C - add pC, pC, LDC, lsl #2 + mov pCRow0, pC + add pCRow1, pCRow0, LDC + add pCRow2, pCRow1, LDC + add pCRow3, pCRow2, LDC + + add pC, pCRow3, LDC + #if defined(LEFT) mov tempOffset, offset @@ -774,6 +854,7 @@ dtrmm_kernel_L4_M8_BEGIN: cmp counterI, #0 ble dtrmm_kernel_L4_M4_BEGIN + .align 5 dtrmm_kernel_L4_M8_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -794,40 +875,64 @@ dtrmm_kernel_L4_M8_20: add tempK, tempOffset, #4 #endif - asr counterL , tempK, #1 // L = K / 2 + asr counterL , tempK, #3 // L = K / 8 cmp counterL , #2 // is there at least 4 to do? blt dtrmm_kernel_L4_M8_32 KERNEL8x4_I // do one in the K KERNEL8x4_M2 // do another in the K + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 subs counterL, counterL, #2 // subtract 2 ble dtrmm_kernel_L4_M8_22a - .align 5 + .align 5 dtrmm_kernel_L4_M8_22: KERNEL8x4_M1 KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 subs counterL, counterL, #1 bgt dtrmm_kernel_L4_M8_22 - + .align 5 dtrmm_kernel_L4_M8_22a: + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_E b dtrmm_kernel_L4_M8_44 + .align 5 dtrmm_kernel_L4_M8_32: tst counterL, #1 ble dtrmm_kernel_L4_M8_40 KERNEL8x4_I - + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 KERNEL8x4_E b dtrmm_kernel_L4_M8_44 @@ -838,13 +943,17 @@ dtrmm_kernel_L4_M8_40: dtrmm_kernel_L4_M8_44: - ands counterL , tempK, #1 + ands counterL , tempK, #7 ble dtrmm_kernel_L4_M8_100 + .align 5 dtrmm_kernel_L4_M8_46: KERNEL8x4_SUB + subs counterL, counterL, #1 + bne dtrmm_kernel_L4_M8_46 + dtrmm_kernel_L4_M8_100: SAVE8x4 @@ -864,6 +973,9 @@ dtrmm_kernel_L4_M8_100: #if defined(LEFT) add tempOffset, tempOffset, #8 #endif + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] dtrmm_kernel_L4_M8_END: subs counterI, counterI, #1 diff --git a/kernel/arm64/gemv_n.S b/kernel/arm64/gemv_n.S index 6279c22506..162f721c38 100644 --- a/kernel/arm64/gemv_n.S +++ b/kernel/arm64/gemv_n.S @@ -68,6 +68,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SHZ 3 #endif +#define A_PRE_SIZE 768 +#define Y_PRE_SIZE 768 + /******************************************************************************/ .macro SAVE_REGS @@ -105,36 +108,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v2.4s, v3.4s}, [A_PTR], #32 ld1 {v4.4s, v5.4s}, [Y_IPTR], #32 fmla v4.4s, v1.4s, v2.4s + prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE] fmla v5.4s, v1.4s, v3.4s st1 {v4.4s, v5.4s}, [Y_OPTR], #32 ld1 {v6.4s, v7.4s}, [A_PTR], #32 ld1 {v8.4s, v9.4s}, [Y_IPTR], #32 fmla v8.4s, v1.4s, v6.4s + prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE] fmla v9.4s, v1.4s, v7.4s st1 {v8.4s, v9.4s}, [Y_OPTR], #32 #else //DOUBLE ld1 {v2.2d, v3.2d}, [A_PTR], #32 ld1 {v4.2d, v5.2d}, [Y_IPTR], #32 fmla v4.2d, v1.2d, v2.2d + prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE] fmla v5.2d, v1.2d, v3.2d st1 {v4.2d, v5.2d}, [Y_OPTR], #32 ld1 {v6.2d, v7.2d}, [A_PTR], #32 ld1 {v8.2d, v9.2d}, [Y_IPTR], #32 fmla v8.2d, v1.2d, v6.2d + prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE] fmla v9.2d, v1.2d, v7.2d st1 {v8.2d, v9.2d}, [Y_OPTR], #32 ld1 {v10.2d, v11.2d}, [A_PTR], #32 ld1 {v12.2d, v13.2d}, [Y_IPTR], #32 fmla v12.2d, v1.2d, v10.2d + prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE] fmla v13.2d, v1.2d, v11.2d st1 {v12.2d, v13.2d}, [Y_OPTR], #32 ld1 {v14.2d, v15.2d}, [A_PTR], #32 ld1 {v16.2d, v17.2d}, [Y_IPTR], #32 fmla v16.2d, v1.2d, v14.2d + prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE] fmla v17.2d, v1.2d, v15.2d st1 {v16.2d, v17.2d}, [Y_OPTR], #32 #endif diff --git a/kernel/arm64/gemv_t.S b/kernel/arm64/gemv_t.S index 0145af6216..28325f784b 100644 --- a/kernel/arm64/gemv_t.S +++ b/kernel/arm64/gemv_t.S @@ -41,6 +41,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define J x11 /* loop variable */ #define I x12 /* loop variable */ +#define X_PREFETCH_SIZE 768 +#define A_PREFETCH_SIZE 768 + /******************************************************************************* * Macro definitions *******************************************************************************/ @@ -112,42 +115,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v5.4s, v6.4s, v7.4s, v8.4s}, [A_PTR], #64 ld1 {v9.4s, v10.4s, v11.4s, v12.4s}, [X_PTR], #64 fmla v1.4s, v5.4s, v9.4s + prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE] fmla v2.4s, v6.4s, v10.4s + prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE] fmla v3.4s, v7.4s, v11.4s + ld1 {v13.4s, v14.4s, v15.4s, v16.4s}, [A_PTR], #64 fmla v4.4s, v8.4s, v12.4s - ld1 {v13.4s, v14.4s, v15.4s, v16.4s}, [A_PTR], #64 ld1 {v17.4s, v18.4s, v19.4s, v20.4s}, [X_PTR], #64 fmla v1.4s, v13.4s, v17.4s + prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE] fmla v2.4s, v14.4s, v18.4s + prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE] fmla v3.4s, v15.4s, v19.4s fmla v4.4s, v16.4s, v20.4s #else ld1 {v5.2d, v6.2d, v7.2d, v8.2d}, [A_PTR], #64 ld1 {v9.2d, v10.2d, v11.2d, v12.2d}, [X_PTR], #64 fmla v1.2d, v5.2d, v9.2d + prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE] fmla v2.2d, v6.2d, v10.2d + prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE] fmla v3.2d, v7.2d, v11.2d fmla v4.2d, v8.2d, v12.2d ld1 {v13.2d, v14.2d, v15.2d, v16.2d}, [A_PTR], #64 ld1 {v17.2d, v18.2d, v19.2d, v20.2d}, [X_PTR], #64 fmla v1.2d, v13.2d, v17.2d + prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE] fmla v2.2d, v14.2d, v18.2d + prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE] fmla v3.2d, v15.2d, v19.2d fmla v4.2d, v16.2d, v20.2d ld1 {v5.2d, v6.2d, v7.2d, v8.2d}, [A_PTR], #64 ld1 {v9.2d, v10.2d, v11.2d, v12.2d}, [X_PTR], #64 fmla v1.2d, v5.2d, v9.2d + prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE] fmla v2.2d, v6.2d, v10.2d + prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE] fmla v3.2d, v7.2d, v11.2d fmla v4.2d, v8.2d, v12.2d ld1 {v13.2d, v14.2d, v15.2d, v16.2d}, [A_PTR], #64 ld1 {v17.2d, v18.2d, v19.2d, v20.2d}, [X_PTR], #64 fmla v1.2d, v13.2d, v17.2d + prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE] fmla v2.2d, v14.2d, v18.2d + prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE] fmla v3.2d, v15.2d, v19.2d fmla v4.2d, v16.2d, v20.2d #endif diff --git a/kernel/arm64/iamax.S b/kernel/arm64/iamax.S index 575c15e534..6c0d84f988 100644 --- a/kernel/arm64/iamax.S +++ b/kernel/arm64/iamax.S @@ -72,6 +72,148 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
fabs MAXF, MAXF .endm +.macro KERNEL_F8 +#if !defined(DOUBLE) + ldp q2, q3, [X], #32 + fabs v2.4s, v2.4s + fabs v3.4s, v3.4s + fmax v2.4s, v2.4s, v3.4s + fmaxv TMPF, v2.4s + fcmp MAXF, TMPF + fcsel MAXF, MAXF, TMPF, COND + csel INDEX, INDEX, Z, COND + add Z, Z, #8 +#else + ldp q2, q3, [X], #32 + ldp q4, q5, [X], #32 + fabs v2.2d, v2.2d + fabs v3.2d, v3.2d + fabs v4.2d, v4.2d + fabs v5.2d, v5.2d + + fmax v2.2d, v2.2d, v3.2d + fmax v4.2d, v4.2d, v5.2d + fmax v2.2d, v2.2d, v4.2d + fmaxp TMPF, v2.2d + + fcmp MAXF, TMPF + fcsel MAXF, MAXF, TMPF, COND + csel INDEX, INDEX, Z, COND + add Z, Z, #8 +#endif + PRFM PLDL1KEEP, [X, #1024] +.endm + +.macro KERNEL_F8_FINALIZE + sub x6, INDEX, #1 +#if !defined(DOUBLE) + lsl x6, x6, #2 + add x7, x7, x6 + ldp q2, q3, [x7] + fabs v2.4s, v2.4s + fabs v3.4s, v3.4s + + ins v4.s[0], v3.s[0] + ins v5.s[0], v3.s[1] + ins v6.s[0], v3.s[2] + ins v7.s[0], v3.s[3] + + add x6, INDEX, #7 + fcmp MAXF, s7 + csel INDEX, x6, INDEX, eq + + sub x6, x6, #1 + fcmp MAXF, s6 + csel INDEX, x6, INDEX, eq + + sub x6, x6, #1 + fcmp MAXF, s5 + csel INDEX, x6, INDEX, eq + + sub x6, x6, #1 + fcmp MAXF, s4 + csel INDEX, x6, INDEX, eq + + ins v4.s[0], v2.s[0] + ins v5.s[0], v2.s[1] + ins v6.s[0], v2.s[2] + ins v7.s[0], v2.s[3] + + sub x6, x6, #1 + fcmp MAXF, s7 + csel INDEX, x6, INDEX, eq + + sub x6, x6, #1 + fcmp MAXF, s6 + csel INDEX, x6, INDEX, eq + + sub x6, x6, #1 + fcmp MAXF, s5 + csel INDEX, x6, INDEX, eq + + sub x6, x6, #1 + fcmp MAXF, s4 + csel INDEX, x6, INDEX, eq +#else + add x6, x6, #4 + lsl x6, x6, #3 + add x7, x7, x6 + ldp q2, q3, [x7] + + fabs v2.2d, v2.2d + fabs v3.2d, v3.2d + + ins v4.d[0], v2.d[0] + ins v5.d[0], v2.d[1] + ins v6.d[0], v3.d[0] + ins v7.d[0], v3.d[1] + + add x6, INDEX, #7 + fcmp MAXF, d7 + csel INDEX, x6, INDEX, eq + + sub x6, x6, #1 + fcmp MAXF, d6 + csel INDEX, x6, INDEX, eq + + sub x6, x6, #1 + fcmp MAXF, d5 + csel INDEX, x6, INDEX, eq + + sub x6, x6, #1 + fcmp MAXF, d4 + csel INDEX, x6, INDEX, eq + + sub x7, x7, #32 + ldp q2, q3, [x7] + + fabs v2.2d, v2.2d + fabs v3.2d, v3.2d + + ins v4.d[0], v2.d[0] + ins v5.d[0], v2.d[1] + ins v6.d[0], v3.d[0] + ins v7.d[0], v3.d[1] + + sub x6, x6, #1 + fcmp MAXF, d7 + csel INDEX, x6, INDEX, eq + + sub x6, x6, #1 + fcmp MAXF, d6 + csel INDEX, x6, INDEX, eq + + sub x6, x6, #1 + fcmp MAXF, d5 + csel INDEX, x6, INDEX, eq + + sub x6, x6, #1 + fcmp MAXF, d4 + csel INDEX, x6, INDEX, eq +#endif +.endm + + .macro KERNEL_S1 ld1 TMPVF, [X], INC_X add Z, Z, #1 @@ -92,6 +234,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmp INC_X, xzr ble iamax_kernel_zero + cmp INC_X, #1 + bne iamax_kernel_S_BEGIN + mov x7, X + +iamax_kernel_F_BEGIN: + + INIT_S + + subs N, N, #1 + ble iamax_kernel_L999 + + asr I, N, #3 + cmp I, xzr + beq iamax_kernel_F1 + + add Z, Z, #1 +iamax_kernel_F8: + + KERNEL_F8 + + subs I, I, #1 + bne iamax_kernel_F8 + + KERNEL_F8_FINALIZE + + sub Z, Z, #1 +iamax_kernel_F1: + + ands I, N, #7 + ble iamax_kernel_L999 + +iamax_kernel_F10: + + KERNEL_S1 + + subs I, I, #1 + bne iamax_kernel_F10 + + b iamax_kernel_L999 + +iamax_kernel_S_BEGIN: + INIT_S subs N, N, #1 diff --git a/kernel/arm64/izamax.S b/kernel/arm64/izamax.S index ebdc671e0f..9b252ec98c 100644 --- a/kernel/arm64/izamax.S +++ b/kernel/arm64/izamax.S @@ -78,6 +78,179 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif .endm +.macro KERNEL_F8 +#if !defined(DOUBLE) + ldp q2, q3, [X], #32 + ldp q4, q5, [X], #32 + + fabs v2.4s, v2.4s + fabs v3.4s, v3.4s + fabs v4.4s, v4.4s + fabs v5.4s, v5.4s + + faddp v2.4s, v2.4s, v3.4s + faddp v3.4s, v4.4s, v5.4s + + fmax v2.4s, v2.4s, v3.4s + fmaxv TMPF, v2.4s + fcmp MAXF, TMPF + fcsel MAXF, MAXF, TMPF, COND + csel INDEX, INDEX, Z, COND + add Z, Z, #8 +#else + ldp q2, q3, [X], #32 + ldp q4, q5, [X], #32 + ldp q16, q17, [X], #32 + ldp q18, q19, [X], #32 + + fabs v2.2d, v2.2d + fabs v3.2d, v3.2d + fabs v4.2d, v4.2d + fabs v5.2d, v5.2d + fabs v16.2d, v16.2d + fabs v17.2d, v17.2d + fabs v18.2d, v18.2d + fabs v19.2d, v19.2d + + faddp v2.2d, v2.2d, v3.2d + faddp v3.2d, v4.2d, v5.2d + faddp v4.2d, v16.2d, v17.2d + faddp v5.2d, v18.2d, v19.2d + + fmax v2.2d, v2.2d, v3.2d + fmax v4.2d, v4.2d, v5.2d + fmax v2.2d, v2.2d, v4.2d + fmaxp TMPF, v2.2d + + fcmp MAXF, TMPF + fcsel MAXF, MAXF, TMPF, COND + csel INDEX, INDEX, Z, COND + add Z, Z, #8 +#endif + PRFM PLDL1KEEP, [X, #1024] +.endm + +.macro KERNEL_F8_FINALIZE + sub x6, INDEX, #1 +#if !defined(DOUBLE) + lsl x6, x6, #3 + add x7, x7, x6 + + ldp q2, q3, [x7] + ldp q4, q5, [x7, #32] + + fabs v2.4s, v2.4s + fabs v3.4s, v3.4s + fabs v4.4s, v4.4s + fabs v5.4s, v5.4s + + faddp v2.4s, v2.4s, v3.4s + faddp v3.4s, v4.4s, v5.4s + + ins v4.s[0], v3.s[3] + add x6, INDEX, #7 + fcmp MAXF, s4 + csel INDEX, x6, INDEX, eq + + ins v4.s[0], v3.s[2] + sub x6, x6, #1 + fcmp MAXF, s4 + csel INDEX, x6, INDEX, eq + + ins v4.s[0], v3.s[1] + sub x6, x6, #1 + fcmp MAXF, s4 + csel INDEX, x6, INDEX, eq + + ins v4.s[0], v3.s[0] + sub x6, x6, #1 + fcmp MAXF, s4 + csel INDEX, x6, INDEX, eq + + ins v4.s[0], v2.s[3] + sub x6, x6, #1 + fcmp MAXF, s4 + csel INDEX, x6, INDEX, eq + + ins v4.s[0], v2.s[2] + sub x6, x6, #1 + fcmp MAXF, s4 + csel INDEX, x6, INDEX, eq + + ins v4.s[0], v2.s[1] + sub x6, x6, #1 + fcmp MAXF, s4 + csel INDEX, x6, INDEX, eq + + ins v4.s[0], v2.s[0] + sub x6, x6, #1 + fcmp MAXF, s4 + csel INDEX, x6, INDEX, eq +#else + lsl x6, x6, #4 + add x7, x7, x6 + + ldp q2, q3, [x7] + ldp q4, q5, [x7, #32] + ldp q16, q17, [x7, #64] + ldp q18, q19, [x7, #96] + + fabs v2.2d, v2.2d + fabs v3.2d, v3.2d + fabs v4.2d, v4.2d + fabs v5.2d, v5.2d + fabs v16.2d, v16.2d + fabs v17.2d, v17.2d + fabs v18.2d, v18.2d + fabs v19.2d, v19.2d + + faddp v2.2d, v2.2d, v3.2d + faddp v3.2d, v4.2d, v5.2d + faddp v4.2d, v16.2d, v17.2d + faddp v5.2d, v18.2d, v19.2d + + ins v7.d[0], v5.d[1] + add x6, INDEX, #7 + fcmp MAXF, d7 + csel INDEX, x6, INDEX, eq + + ins v7.d[0], v5.d[0] + sub x6, x6, #1 + fcmp MAXF, d7 + csel INDEX, x6, INDEX, eq + + ins v7.d[0], v4.d[1] + sub x6, x6, #1 + fcmp MAXF, d7 + csel INDEX, x6, INDEX, eq + + ins v7.d[0], v4.d[0] + sub x6, x6, #1 + fcmp MAXF, d7 + csel INDEX, x6, INDEX, eq + + ins v7.d[0], v3.d[1] + sub x6, x6, #1 + fcmp MAXF, d7 + csel INDEX, x6, INDEX, eq + + ins v7.d[0], v3.d[0] + sub x6, x6, #1 + fcmp MAXF, d7 + csel INDEX, x6, INDEX, eq + + ins v7.d[0], v2.d[1] + sub x6, x6, #1 + fcmp MAXF, d7 + csel INDEX, x6, INDEX, eq + + ins v7.d[0], v2.d[0] + sub x6, x6, #1 + fcmp MAXF, d7 + csel INDEX, x6, INDEX, eq +#endif +.endm + .macro KERNEL_S1 #if !defined(DOUBLE) ld1 {v1.2s}, [X], INC_X @@ -107,6 +280,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
cmp INC_X, xzr ble iamax_kernel_zero + cmp INC_X, #1 + bne iamax_kernel_S_BEGIN + mov x7, X + + +iamax_kernel_F_BEGIN: + + INIT_S + + subs N, N, #1 + ble iamax_kernel_L999 + + asr I, N, #3 + cmp I, xzr + ble iamax_kernel_F1 + + add Z, Z, #1 + +iamax_kernel_F8: + + KERNEL_F8 + + subs I, I, #1 + bne iamax_kernel_F8 + + KERNEL_F8_FINALIZE + + sub Z, Z, #1 +iamax_kernel_F1: + + ands I, N, #7 + ble iamax_kernel_L999 + +iamax_kernel_F10: + + KERNEL_S1 + + subs I, I, #1 + bne iamax_kernel_F10 + + b iamax_kernel_L999 + +iamax_kernel_S_BEGIN: + INIT_S subs N, N, #1 diff --git a/kernel/arm64/sgemm_kernel_16x4.S b/kernel/arm64/sgemm_kernel_16x4.S index 68366d9f2c..6e3645b767 100644 --- a/kernel/arm64/sgemm_kernel_16x4.S +++ b/kernel/arm64/sgemm_kernel_16x4.S @@ -46,16 +46,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define pCRow0 x12 #define pCRow1 x13 #define pCRow2 x14 -#define pA x15 +#define pCRow3 x15 +#define pA x16 +#define alpha w17 #define alpha0 s10 #define alphaV0 v10.s[0] -#define alpha1 s11 -#define alphaV1 v11.s[0] -#define alpha2 s14 -#define alphaV2 v14.s[0] -#define alpha3 s15 -#define alphaV3 v15.s[0] + +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 224 +#define C_PRE_SIZE 160 + // 00 origM // 01 origN @@ -98,14 +99,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //v05 pA1_04, pA1_05, pA1_06, pA1_07 //v06 pA1_08, pA1_09, pA1_10, pA1_11 //v07 pA1_12, pA1_13, pA1_14, pA1_15 -//v08 must save pB00, pB01 -//v09 must save pB02, pB03 -//v10 must save ALPHA0 -//v11 must save ALPHA1 -//v12 must save pB10, pB11 -//v13 must save pB12, pB13 -//v14 must save ALPHA2 -//v15 must save ALPHA3 +//v08 must save pB00 +//v09 must save pB01 +//v10 must save pB02 +//v11 must save pB03 +//v12 must save pB10 +//v13 must save pB11 +//v14 must save pB12 +//v15 must save pB13 //v16 must save C00, C01, C02, C03 //v17 must save C04, C05, C06, C07 //v18 C08, C09, C10, C11 @@ -147,206 +148,249 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL16x4_I - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 - ld1 {v2.4s}, [pA] - add pA, pA, #16 - ld1 {v3.4s}, [pA] - add pA, pA, #16 + ldp q0, q1, [pA], #32 + + ldp s8, s9, [pB], #8 fmul v16.4s, v0.4s, v8.s[0] + fmul v20.4s, v0.4s, v9.s[0] + + ldp s10, s11, [pB], #8 + + fmul v24.4s, v0.4s, v10.s[0] + fmul v28.4s, v0.4s, v11.s[0] + + ldp q2, q3, [pA], #32 + fmul v17.4s, v1.4s, v8.s[0] + fmul v21.4s, v1.4s, v9.s[0] + + ldp q4, q5, [pA], #32 + + fmul v25.4s, v1.4s, v10.s[0] + fmul v29.4s, v1.4s, v11.s[0] + + ldp s12, s13, [pB], #8 + fmul v18.4s, v2.4s, v8.s[0] + fmul v22.4s, v2.4s, v9.s[0] + + ldp s14, s15, [pB], #8 + fmul v19.4s, v3.4s, v8.s[0] + fmul v23.4s, v3.4s, v9.s[0] - fmul v20.4s, v0.4s, v8.s[1] - fmul v21.4s, v1.4s, v8.s[1] - fmul v22.4s, v2.4s, v8.s[1] - fmul v23.4s, v3.4s, v8.s[1] + ldp q6, q7, [pA], #32 - fmul v24.4s, v0.4s, v9.s[0] - fmul v25.4s, v1.4s, v9.s[0] - fmul v26.4s, v2.4s, v9.s[0] - fmul v27.4s, v3.4s, v9.s[0] + fmul v26.4s, v2.4s, v10.s[0] + fmul v30.4s, v2.4s, v11.s[0] - fmul v28.4s, v0.4s, v9.s[1] - fmul v29.4s, v1.4s, v9.s[1] - fmul v30.4s, v2.4s, v9.s[1] - fmul v31.4s, v3.4s, v9.s[1] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] - ld1 {v12.2s, v13.2s}, [pB] - add pB, pB, #16 - ld1 {v4.4s}, [pA] - add pA, pA, #16 - ld1 {v5.4s}, [pA] - add pA, pA, #16 - ld1 {v6.4s}, [pA] - add pA, pA, #16 - ld1 {v7.4s}, [pA] - add pA, pA, #16 + fmul v27.4s, v3.4s, v10.s[0] + fmul v31.4s, v3.4s, v11.s[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] .endm .macro KERNEL16x4_M1 fmla v16.4s, v0.4s, v8.s[0] fmla v17.4s, v1.4s, v8.s[0] + + ldp q4, q5, [pA], #32 + fmla v18.4s, v2.4s, v8.s[0] fmla v19.4s, v3.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.s[1] - fmla v21.4s, v1.4s, v8.s[1] - fmla v22.4s, v2.4s, v8.s[1] - fmla v23.4s, v3.4s, v8.s[1] + fmla v20.4s, v0.4s, v9.s[0] + fmla v21.4s, v1.4s, v9.s[0] - fmla v24.4s, v0.4s, v9.s[0] - fmla v25.4s, v1.4s, v9.s[0] - fmla v26.4s, v2.4s, v9.s[0] - fmla v27.4s, v3.4s, v9.s[0] + ldp s12, s13, [pB], #8 - fmla v28.4s, v0.4s, v9.s[1] - fmla v29.4s, v1.4s, v9.s[1] - fmla v30.4s, v2.4s, v9.s[1] - fmla v31.4s, v3.4s, v9.s[1] + fmla v22.4s, v2.4s, v9.s[0] + fmla v23.4s, v3.4s, v9.s[0] - ld1 {v12.2s, v13.2s}, [pB] - add pB, pB, #16 - ld1 {v4.4s}, [pA] - add pA, pA, #16 - ld1 {v5.4s}, [pA] - add pA, pA, #16 - ld1 {v6.4s}, [pA] - add pA, pA, #16 - ld1 {v7.4s}, [pA] - add pA, pA, #16 + ldp s14, s15, [pB], #8 + + fmla v24.4s, v0.4s, v10.s[0] + fmla v25.4s, v1.4s, v10.s[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + + fmla v26.4s, v2.4s, v10.s[0] + fmla v27.4s, v3.4s, v10.s[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + + fmla v28.4s, v0.4s, v11.s[0] + fmla v29.4s, v1.4s, v11.s[0] + + ldp q6, q7, [pA], #32 + + fmla v30.4s, v2.4s, v11.s[0] + fmla v31.4s, v3.4s, v11.s[0] .endm .macro KERNEL16x4_M2 fmla v16.4s, v4.4s, v12.s[0] fmla v17.4s, v5.4s, v12.s[0] + + ldp q0, q1, [pA], #32 + fmla v18.4s, v6.4s, v12.s[0] fmla v19.4s, v7.4s, v12.s[0] - fmla v20.4s, v4.4s, v12.s[1] - fmla v21.4s, v5.4s, v12.s[1] - fmla v22.4s, v6.4s, v12.s[1] - fmla v23.4s, v7.4s, v12.s[1] + fmla v20.4s, v4.4s, v13.s[0] + fmla v21.4s, v5.4s, v13.s[0] - fmla v24.4s, v4.4s, v13.s[0] - fmla v25.4s, v5.4s, v13.s[0] - fmla v26.4s, v6.4s, v13.s[0] - fmla v27.4s, v7.4s, v13.s[0] + ldp s8, s9, [pB], #8 - fmla v28.4s, v4.4s, v13.s[1] - fmla v29.4s, v5.4s, v13.s[1] - fmla v30.4s, v6.4s, v13.s[1] - fmla v31.4s, v7.4s, v13.s[1] + fmla v22.4s, v6.4s, v13.s[0] + fmla v23.4s, v7.4s, v13.s[0] - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 
{v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 - ld1 {v2.4s}, [pA] - add pA, pA, #16 - ld1 {v3.4s}, [pA] - add pA, pA, #16 + ldp s10, s11, [pB], #8 + + fmla v24.4s, v4.4s, v14.s[0] + fmla v25.4s, v5.4s, v14.s[0] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + fmla v26.4s, v6.4s, v14.s[0] + fmla v27.4s, v7.4s, v14.s[0] + + ldp q2, q3, [pA], #32 + + fmla v28.4s, v4.4s, v15.s[0] + fmla v29.4s, v5.4s, v15.s[0] + + fmla v30.4s, v6.4s, v15.s[0] + fmla v31.4s, v7.4s, v15.s[0] .endm .macro KERNEL16x4_E fmla v16.4s, v4.4s, v12.s[0] + fmla v20.4s, v4.4s, v13.s[0] + fmla v24.4s, v4.4s, v14.s[0] + fmla v28.4s, v4.4s, v15.s[0] + fmla v17.4s, v5.4s, v12.s[0] - fmla v18.4s, v6.4s, v12.s[0] - fmla v19.4s, v7.4s, v12.s[0] + fmla v21.4s, v5.4s, v13.s[0] + fmla v25.4s, v5.4s, v14.s[0] + fmla v29.4s, v5.4s, v15.s[0] - fmla v20.4s, v4.4s, v12.s[1] - fmla v21.4s, v5.4s, v12.s[1] - fmla v22.4s, v6.4s, v12.s[1] - fmla v23.4s, v7.4s, v12.s[1] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - fmla v24.4s, v4.4s, v13.s[0] - fmla v25.4s, v5.4s, v13.s[0] - fmla v26.4s, v6.4s, v13.s[0] - fmla v27.4s, v7.4s, v13.s[0] + fmla v18.4s, v6.4s, v12.s[0] + fmla v22.4s, v6.4s, v13.s[0] + fmla v26.4s, v6.4s, v14.s[0] + fmla v30.4s, v6.4s, v15.s[0] - fmla v28.4s, v4.4s, v13.s[1] - fmla v29.4s, v5.4s, v13.s[1] - fmla v30.4s, v6.4s, v13.s[1] - fmla v31.4s, v7.4s, v13.s[1] + fmla v19.4s, v7.4s, v12.s[0] + fmla v23.4s, v7.4s, v13.s[0] + fmla v27.4s, v7.4s, v14.s[0] + fmla v31.4s, v7.4s, v15.s[0] .endm .macro KERNEL16x4_SUB - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 - ld1 {v2.4s}, [pA] - add pA, pA, #16 - ld1 {v3.4s}, [pA] - add pA, pA, #16 + ldp q0, q1, [pA], #32 + ldp s8, s9, [pB], #8 fmla v16.4s, v0.4s, v8.s[0] + fmla v20.4s, v0.4s, v9.s[0] + + ldp s10, s11, [pB], #8 + + fmla v24.4s, v0.4s, v10.s[0] + fmla v28.4s, v0.4s, v11.s[0] + + ldp q2, q3, [pA], #32 + fmla v17.4s, v1.4s, v8.s[0] + fmla v21.4s, v1.4s, v9.s[0] + + fmla v25.4s, v1.4s, v10.s[0] + fmla v29.4s, v1.4s, v11.s[0] + fmla v18.4s, v2.4s, v8.s[0] + fmla v22.4s, v2.4s, v9.s[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla v19.4s, v3.4s, v8.s[0] + fmla v23.4s, v3.4s, v9.s[0] - fmla v20.4s, v0.4s, v8.s[1] - fmla v21.4s, v1.4s, v8.s[1] - fmla v22.4s, v2.4s, v8.s[1] - fmla v23.4s, v3.4s, v8.s[1] + fmla v26.4s, v2.4s, v10.s[0] + fmla v30.4s, v2.4s, v11.s[0] - fmla v24.4s, v0.4s, v9.s[0] - fmla v25.4s, v1.4s, v9.s[0] - fmla v26.4s, v2.4s, v9.s[0] - fmla v27.4s, v3.4s, v9.s[0] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - fmla v28.4s, v0.4s, v9.s[1] - fmla v29.4s, v1.4s, v9.s[1] - fmla v30.4s, v2.4s, v9.s[1] - fmla v31.4s, v3.4s, v9.s[1] + fmla v27.4s, v3.4s, v10.s[0] + fmla v31.4s, v3.4s, v11.s[0] .endm .macro SAVE16x4 - add pCRow1, pCRow0, LDC + fmov alpha0, alpha - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ldp q0, q1, [pCRow0] fmla v0.4s, v16.4s, alphaV0 - fmla v1.4s, v17.4s, alphaV1 - fmla v2.4s, v18.4s, alphaV2 - fmla v3.4s, v19.4s, alphaV3 - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] + fmla v1.4s, v17.4s, alphaV0 + stp q0, q1, [pCRow0] - add pCRow2, pCRow1, LDC + add pCRow0, pCRow0, #32 - ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] + ldp q2, q3, [pCRow0] + fmla v2.4s, v18.4s, alphaV0 + fmla v3.4s, v19.4s, alphaV0 + stp q2, q3, [pCRow0] + + add pCRow0, pCRow0, #32 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ldp q4, q5, [pCRow1] fmla v4.4s, v20.4s, alphaV0 - fmla v5.4s, v21.4s, alphaV1 - fmla v6.4s, v22.4s, alphaV2 - fmla v7.4s, v23.4s, alphaV3 - st1 {v4.4s, 
v5.4s, v6.4s, v7.4s}, [pCRow1] + fmla v5.4s, v21.4s, alphaV0 + stp q4, q5, [pCRow1] + + add pCRow1, pCRow1, #32 + + ldp q6, q7, [pCRow1] + fmla v6.4s, v22.4s, alphaV0 + fmla v7.4s, v23.4s, alphaV0 + stp q6, q7, [pCRow1] - add pCRow1, pCRow2, LDC + add pCRow1, pCRow1, #32 + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow2] + ldp q0, q1, [pCRow2] fmla v0.4s, v24.4s, alphaV0 - fmla v1.4s, v25.4s, alphaV1 - fmla v2.4s, v26.4s, alphaV2 - fmla v3.4s, v27.4s, alphaV3 - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow2] + fmla v1.4s, v25.4s, alphaV0 + stp q0, q1, [pCRow2] - ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] + add pCRow2, pCRow2, #32 + + ldp q2, q3, [pCRow2] + fmla v2.4s, v26.4s, alphaV0 + fmla v3.4s, v27.4s, alphaV0 + stp q2, q3, [pCRow2] + + add pCRow2, pCRow2, #32 + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + + ldp q4, q5, [pCRow3] fmla v4.4s, v28.4s, alphaV0 - fmla v5.4s, v29.4s, alphaV1 - fmla v6.4s, v30.4s, alphaV2 - fmla v7.4s, v31.4s, alphaV3 - st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] + fmla v5.4s, v29.4s, alphaV0 + stp q4, q5, [pCRow3] - add pCRow0, pCRow0, #64 + add pCRow3, pCRow3, #32 + + ldp q6, q7, [pCRow3] + fmla v6.4s, v30.4s, alphaV0 + fmla v7.4s, v31.4s, alphaV0 + stp q6, q7, [pCRow3] + + add pCRow3, pCRow3, #32 .endm /******************************************************************************/ @@ -363,264 +407,217 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_I - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 + ldp s8, s9, [pB], #8 + ldp s10, s11, [pB], #8 + + ldr q0, [pA], #16 + ldr q1, [pA], #16 fmul v16.4s, v0.4s, v8.s[0] fmul v17.4s, v1.4s, v8.s[0] - fmul v20.4s, v0.4s, v8.s[1] - fmul v21.4s, v1.4s, v8.s[1] - fmul v24.4s, v0.4s, v9.s[0] - fmul v25.4s, v1.4s, v9.s[0] - fmul v28.4s, v0.4s, v9.s[1] - fmul v29.4s, v1.4s, v9.s[1] - - ld1 {v12.2s, v13.2s}, [pB] - add pB, pB, #16 - ld1 {v4.4s}, [pA] - add pA, pA, #16 - ld1 {v5.4s}, [pA] - add pA, pA, #16 + fmul v20.4s, v0.4s, v9.s[0] + fmul v21.4s, v1.4s, v9.s[0] + fmul v24.4s, v0.4s, v10.s[0] + fmul v25.4s, v1.4s, v10.s[0] + fmul v28.4s, v0.4s, v11.s[0] + fmul v29.4s, v1.4s, v11.s[0] + + ldp s12, s13, [pB], #8 + ldp s14, s15, [pB], #8 + + ldr q4, [pA], #16 + ldr q5, [pA], #16 .endm .macro KERNEL8x4_M1 fmla v16.4s, v0.4s, v8.s[0] fmla v17.4s, v1.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.s[1] - fmla v21.4s, v1.4s, v8.s[1] - fmla v24.4s, v0.4s, v9.s[0] - fmla v25.4s, v1.4s, v9.s[0] - fmla v28.4s, v0.4s, v9.s[1] - fmla v29.4s, v1.4s, v9.s[1] - - ld1 {v12.2s, v13.2s}, [pB] - add pB, pB, #16 - ld1 {v4.4s}, [pA] - add pA, pA, #16 - ld1 {v5.4s}, [pA] - add pA, pA, #16 + fmla v20.4s, v0.4s, v9.s[0] + fmla v21.4s, v1.4s, v9.s[0] + fmla v24.4s, v0.4s, v10.s[0] + fmla v25.4s, v1.4s, v10.s[0] + fmla v28.4s, v0.4s, v11.s[0] + fmla v29.4s, v1.4s, v11.s[0] + + ldp s12, s13, [pB], #8 + ldp s14, s15, [pB], #8 + + ldr q4, [pA], #16 + ldr q5, [pA], #16 .endm .macro KERNEL8x4_M2 fmla v16.4s, v4.4s, v12.s[0] fmla v17.4s, v5.4s, v12.s[0] - fmla v20.4s, v4.4s, v12.s[1] - fmla v21.4s, v5.4s, v12.s[1] - fmla v24.4s, v4.4s, v13.s[0] - fmla v25.4s, v5.4s, v13.s[0] - fmla v28.4s, v4.4s, v13.s[1] - fmla v29.4s, v5.4s, v13.s[1] - - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 + fmla v20.4s, v4.4s, v13.s[0] + fmla v21.4s, v5.4s, v13.s[0] + fmla v24.4s, v4.4s, v14.s[0] + fmla v25.4s, v5.4s, v14.s[0] + fmla v28.4s, v4.4s, v15.s[0] + fmla 
v29.4s, v5.4s, v15.s[0] + + ldp s8, s9, [pB], #8 + ldp s10, s11, [pB], #8 + + ldr q0, [pA], #16 + ldr q1, [pA], #16 .endm .macro KERNEL8x4_E fmla v16.4s, v4.4s, v12.s[0] fmla v17.4s, v5.4s, v12.s[0] - fmla v20.4s, v4.4s, v12.s[1] - fmla v21.4s, v5.4s, v12.s[1] - fmla v24.4s, v4.4s, v13.s[0] - fmla v25.4s, v5.4s, v13.s[0] - fmla v28.4s, v4.4s, v13.s[1] - fmla v29.4s, v5.4s, v13.s[1] + fmla v20.4s, v4.4s, v13.s[0] + fmla v21.4s, v5.4s, v13.s[0] + fmla v24.4s, v4.4s, v14.s[0] + fmla v25.4s, v5.4s, v14.s[0] + fmla v28.4s, v4.4s, v15.s[0] + fmla v29.4s, v5.4s, v15.s[0] .endm .macro KERNEL8x4_SUB - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 + ldp s8, s9, [pB], #8 + ldp s10, s11, [pB], #8 + + ldr q0, [pA], #16 + ldr q1, [pA], #16 fmla v16.4s, v0.4s, v8.s[0] fmla v17.4s, v1.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.s[1] - fmla v21.4s, v1.4s, v8.s[1] - fmla v24.4s, v0.4s, v9.s[0] - fmla v25.4s, v1.4s, v9.s[0] - fmla v28.4s, v0.4s, v9.s[1] - fmla v29.4s, v1.4s, v9.s[1] + fmla v20.4s, v0.4s, v9.s[0] + fmla v21.4s, v1.4s, v9.s[0] + fmla v24.4s, v0.4s, v10.s[0] + fmla v25.4s, v1.4s, v10.s[0] + fmla v28.4s, v0.4s, v11.s[0] + fmla v29.4s, v1.4s, v11.s[0] .endm .macro SAVE8x4 - add pCRow1, pCRow0, LDC + fmov alpha0, alpha - ld1 {v0.4s, v1.4s}, [pCRow0] + ldp q0, q1, [pCRow0] fmla v0.4s, v16.4s, alphaV0 - fmla v1.4s, v17.4s, alphaV1 - st1 {v0.4s, v1.4s}, [pCRow0] + fmla v1.4s, v17.4s, alphaV0 + stp q0, q1, [pCRow0] - add pCRow2, pCRow1, LDC + add pCRow0, pCRow0, #32 - ld1 {v4.4s, v5.4s}, [pCRow1] - fmla v4.4s, v20.4s, alphaV0 - fmla v5.4s, v21.4s, alphaV1 - st1 {v4.4s, v5.4s}, [pCRow1] + ldp q2, q3, [pCRow1] + fmla v2.4s, v20.4s, alphaV0 + fmla v3.4s, v21.4s, alphaV0 + stp q2, q3, [pCRow1] - add pCRow1, pCRow2, LDC + add pCRow1, pCRow1, #32 - ld1 {v0.4s, v1.4s}, [pCRow2] - fmla v0.4s, v24.4s, alphaV0 - fmla v1.4s, v25.4s, alphaV1 - st1 {v0.4s, v1.4s}, [pCRow2] + ldp q4, q5, [pCRow2] + fmla v4.4s, v24.4s, alphaV0 + fmla v5.4s, v25.4s, alphaV0 + stp q4, q5, [pCRow2] - ld1 {v4.4s, v5.4s}, [pCRow1] - fmla v4.4s, v28.4s, alphaV0 - fmla v5.4s, v29.4s, alphaV1 - st1 {v4.4s, v5.4s}, [pCRow1] + add pCRow2, pCRow2, #32 - add pCRow0, pCRow0, #32 + ldp q6, q7, [pCRow3] + fmla v6.4s, v28.4s, alphaV0 + fmla v7.4s, v29.4s, alphaV0 + stp q6, q7, [pCRow3] + + add pCRow3, pCRow3, #32 .endm /******************************************************************************/ .macro INIT4x4 fmov s16, wzr - fmov s17, s16 - fmov s20, s17 - fmov s21, s16 - fmov s24, s17 - fmov s25, s16 - fmov s28, s17 - fmov s29, s16 + fmov s20, wzr + fmov s24, wzr + fmov s28, wzr .endm .macro KERNEL4x4_I - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.2s, v1.2s}, [pA] - add pA, pA, #16 + ldp s8, s9, [pB], #8 + ldp s10, s11, [pB], #8 - fmul v16.2s, v0.2s, v8.s[0] - fmul v29.2s, v1.2s, v9.s[1] + ldr q0, [pA], #16 - fmul v20.2s, v0.2s, v8.s[1] - fmul v25.2s, v1.2s, v9.s[0] - - fmul v24.2s, v0.2s, v9.s[0] - fmul v21.2s, v1.2s, v8.s[1] + fmul v16.4s, v0.4s, v8.s[0] + fmul v20.4s, v0.4s, v9.s[0] + fmul v24.4s, v0.4s, v10.s[0] + fmul v28.4s, v0.4s, v11.s[0] - fmul v28.2s, v0.2s, v9.s[1] - fmul v17.2s, v1.2s, v8.s[0] + ldp s12, s13, [pB], #8 + ldp s14, s15, [pB], #8 - ld1 {v12.2s, v13.2s}, [pB] - add pB, pB, #16 - ld1 {v4.2s, v5.2s}, [pA] - add pA, pA, #16 + ldr q1, [pA], #16 .endm .macro KERNEL4x4_M1 - fmla v16.2s, v0.2s, v8.s[0] - fmla v29.2s, v1.2s, v9.s[1] - - ld1 {v12.2s, v13.2s}, [pB] // For next round - add pB, pB, #16 - - fmla v20.2s, v0.2s, v8.s[1] - fmla v25.2s, v1.2s, 
v9.s[0] - - ld1 {v4.2s, v5.2s}, [pA] // For next round - add pA, pA, #16 - - fmla v24.2s, v0.2s, v9.s[0] - fmla v21.2s, v1.2s, v8.s[1] + fmla v16.4s, v0.4s, v8.s[0] + fmla v20.4s, v0.4s, v9.s[0] + fmla v24.4s, v0.4s, v10.s[0] + fmla v28.4s, v0.4s, v11.s[0] - prfm PLDL1KEEP, [pB, #512] + ldp s12, s13, [pB], #8 + ldp s14, s15, [pB], #8 - fmla v28.2s, v0.2s, v9.s[1] - fmla v17.2s, v1.2s, v8.s[0] + ldr q1, [pA], #16 .endm .macro KERNEL4x4_M2 - fmla v16.2s, v4.2s, v12.s[0] - fmla v29.2s, v5.2s, v13.s[1] + fmla v16.4s, v1.4s, v12.s[0] + fmla v20.4s, v1.4s, v13.s[0] + fmla v24.4s, v1.4s, v14.s[0] + fmla v28.4s, v1.4s, v15.s[0] - ld1 {v8.2s, v9.2s}, [pB] // For next round - add pB, pB, #16 + ldp s8, s9, [pB], #8 + ldp s10, s11, [pB], #8 - fmla v20.2s, v4.2s, v12.s[1] - fmla v25.2s, v5.2s, v13.s[0] - - ld1 {v0.2s, v1.2s}, [pA] // For next round - add pA, pA, #16 - - fmla v24.2s, v4.2s, v13.s[0] - fmla v21.2s, v5.2s, v12.s[1] - - prfm PLDL1KEEP, [pA, #512] - - fmla v28.2s, v4.2s, v13.s[1] - fmla v17.2s, v5.2s, v12.s[0] + ldr q0, [pA], #16 .endm .macro KERNEL4x4_E - fmla v16.2s, v4.2s, v12.s[0] - fmla v29.2s, v5.2s, v13.s[1] + fmla v16.4s, v1.4s, v12.s[0] + fmla v20.4s, v1.4s, v13.s[0] + fmla v24.4s, v1.4s, v14.s[0] + fmla v28.4s, v1.4s, v15.s[0] +.endm - fmla v20.2s, v4.2s, v12.s[1] - fmla v25.2s, v5.2s, v13.s[0] +.macro KERNEL4x4_SUB + ldp s8, s9, [pB], #8 + ldp s10, s11, [pB], #8 - fmla v24.2s, v4.2s, v13.s[0] - fmla v21.2s, v5.2s, v12.s[1] + ldr q0, [pA], #16 - fmla v28.2s, v4.2s, v13.s[1] - fmla v17.2s, v5.2s, v12.s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v20.4s, v0.4s, v9.s[0] + fmla v24.4s, v0.4s, v10.s[0] + fmla v28.4s, v0.4s, v11.s[0] .endm -.macro KERNEL4x4_SUB - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.2s, v1.2s}, [pA] - add pA, pA, #16 - - fmla v16.2s, v0.2s, v8.s[0] - fmla v29.2s, v1.2s, v9.s[1] +.macro SAVE4x4 + fmov alpha0, alpha - fmla v20.2s, v0.2s, v8.s[1] - fmla v25.2s, v1.2s, v9.s[0] + ldr q0, [pCRow0] + fmla v0.4s, v16.4s, alphaV0 + str q0, [pCRow0] - fmla v24.2s, v0.2s, v9.s[0] - fmla v21.2s, v1.2s, v8.s[1] + add pCRow0, pCRow0, #16 - fmla v28.2s, v0.2s, v9.s[1] - fmla v17.2s, v1.2s, v8.s[0] -.endm + ldr q1, [pCRow1] + fmla v1.4s, v20.4s, alphaV0 + str q1, [pCRow1] -.macro SAVE4x4 - ld1 {v8.2s, v9.2s}, [pCRow0] - fmla v8.2s, v16.2s, alphaV0 - fmla v9.2s, v17.2s, alphaV1 - st1 {v8.2s, v9.2s}, [pCRow0] + add pCRow1, pCRow1, #16 - add pCRow1, pCRow0, LDC - ld1 {v12.2s, v13.2s}, [pCRow1] - fmla v12.2s, v20.2s, alphaV2 - fmla v13.2s, v21.2s, alphaV3 - st1 {v12.2s, v13.2s}, [pCRow1] + ldr q2, [pCRow2] + fmla v2.4s, v24.4s, alphaV0 + str q2, [pCRow2] - add pCRow2, pCRow1, LDC - ld1 {v8.2s, v9.2s}, [pCRow2] - fmla v8.2s, v24.2s, alphaV0 - fmla v9.2s, v25.2s, alphaV1 - st1 {v8.2s, v9.2s}, [pCRow2] + add pCRow2, pCRow2, #16 - add pCRow1, pCRow2, LDC - ld1 {v12.2s, v13.2s}, [pCRow1] - fmla v12.2s, v28.2s, alphaV2 - fmla v13.2s, v29.2s, alphaV3 - st1 {v12.2s, v13.2s}, [pCRow1] + ldr q3, [pCRow3] + fmla v3.4s, v28.4s, alphaV0 + str q3, [pCRow3] - add pCRow0, pCRow0, #16 + add pCRow3, pCRow3, #16 .endm /******************************************************************************/ @@ -633,38 +630,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL2x4_SUB - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.2s}, [pA] - add pA, pA, #8 + ldp s8, s9, [pB], #8 + ldp s10, s11, [pB], #8 + + ldr d0, [pA], #8 fmla v16.2s, v0.2s, v8.s[0] - fmla v20.2s, v0.2s, v8.s[1] - fmla v24.2s, v0.2s, v9.s[0] - fmla v28.2s, v0.2s, v9.s[1] + fmla v20.2s, v0.2s, v9.s[0] + fmla v24.2s, v0.2s, v10.s[0] + fmla v28.2s, v0.2s, v11.s[0] .endm .macro SAVE2x4 - ld1 {v8.2s}, [pCRow0] - fmla v8.2s, v16.2s, alphaV0 - st1 {v8.2s}, [pCRow0] + fmov alpha0, alpha - add pCRow1, pCRow0, LDC - ld1 {v12.2s}, [pCRow1] - fmla v12.2s, v20.2s, alphaV1 - st1 {v12.2s}, [pCRow1] + ldr d0, [pCRow0] + fmla v0.2s, v16.2s, alphaV0 + str d0, [pCRow0] - add pCRow2, pCRow1, LDC - ld1 {v8.2s}, [pCRow2] - fmla v8.2s, v24.2s, alphaV2 - st1 {v8.2s}, [pCRow2] + add pCRow0, pCRow0, #8 - add pCRow1, pCRow2, LDC - ld1 {v12.2s}, [pCRow1] - fmla v12.2s, v28.2s, alphaV3 - st1 {v12.2s}, [pCRow1] + ldr d1, [pCRow1] + fmla v1.2s, v20.2s, alphaV0 + str d1, [pCRow1] - add pCRow0, pCRow0, #8 + add pCRow1, pCRow1, #8 + + ldr d0, [pCRow2] + fmla v0.2s, v24.2s, alphaV0 + str d0, [pCRow2] + + add pCRow2, pCRow2, #8 + + ldr d1, [pCRow3] + fmla v1.2s, v28.2s, alphaV0 + str d1, [pCRow3] + + add pCRow3, pCRow3, #8 .endm /******************************************************************************/ @@ -686,22 +688,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x4 - add pCRow1, pCRow0, LDC + fmov alpha0, alpha + ld1 {v8.s}[0], [pCRow0] ld1 {v8.s}[1], [pCRow1] fmla v8.2s, v16.2s, alphaV0 st1 {v8.s}[0], [pCRow0] st1 {v8.s}[1], [pCRow1] - add pCRow2, pCRow1, LDC - add pCRow1, pCRow2, LDC + add pCRow0, pCRow0, #4 + add pCRow1, pCRow1, #4 + ld1 {v12.s}[0], [pCRow2] - ld1 {v12.s}[1], [pCRow1] - fmla v12.2s, v20.2s, alphaV1 + ld1 {v12.s}[1], [pCRow3] + fmla v12.2s, v20.2s, alphaV0 st1 {v12.s}[0], [pCRow2] - st1 {v12.s}[1], [pCRow1] + st1 {v12.s}[1], [pCRow3] - add pCRow0, pCRow0, #4 + add pCRow2, pCRow2, #4 + add pCRow3, pCRow3, #4 .endm /******************************************************************************/ @@ -741,20 +746,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE16x2 + fmov alpha0, alpha + add pCRow1, pCRow0, LDC ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] fmla v0.4s, v16.4s, alphaV0 - fmla v1.4s, v17.4s, alphaV1 - fmla v2.4s, v18.4s, alphaV2 - fmla v3.4s, v19.4s, alphaV3 + fmla v1.4s, v17.4s, alphaV0 + fmla v2.4s, v18.4s, alphaV0 + fmla v3.4s, v19.4s, alphaV0 st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] fmla v4.4s, v20.4s, alphaV0 - fmla v5.4s, v21.4s, alphaV1 - fmla v6.4s, v22.4s, alphaV2 - fmla v7.4s, v23.4s, alphaV3 + fmla v5.4s, v21.4s, alphaV0 + fmla v6.4s, v22.4s, alphaV0 + fmla v7.4s, v23.4s, alphaV0 st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] add pCRow0, pCRow0, #64 @@ -785,18 +792,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE8x2 + fmov alpha0, alpha + add pCRow1, pCRow0, LDC ld1 {v0.4s, v1.4s}, [pCRow0] fmla v0.4s, v16.4s, alphaV0 - fmla v1.4s, v17.4s, alphaV1 + fmla v1.4s, v17.4s, alphaV0 st1 {v0.4s, v1.4s}, [pCRow0] add pCRow2, pCRow1, LDC ld1 {v4.4s, v5.4s}, [pCRow1] fmla v4.4s, v20.4s, alphaV0 - fmla v5.4s, v21.4s, alphaV1 + fmla v5.4s, v21.4s, alphaV0 st1 {v4.4s, v5.4s}, [pCRow1] add pCRow0, pCRow0, #32 @@ -824,15 +833,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro SAVE4x2 + fmov alpha0, alpha + ld1 {v8.2s, v9.2s}, [pCRow0] fmla v8.2s, v16.2s, alphaV0 - fmla v9.2s, v17.2s, alphaV1 + fmla v9.2s, v17.2s, alphaV0 st1 {v8.2s, v9.2s}, [pCRow0] add pCRow1, pCRow0, LDC ld1 {v12.2s, v13.2s}, [pCRow1] - fmla v12.2s, v20.2s, alphaV2 - fmla v13.2s, v21.2s, alphaV3 + fmla v12.2s, v20.2s, alphaV0 + fmla v13.2s, v21.2s, alphaV0 st1 {v12.2s, v13.2s}, [pCRow1] add pCRow0, pCRow0, #16 @@ -857,13 +868,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x2 + fmov alpha0, alpha + ld1 {v8.2s}, [pCRow0] fmla v8.2s, v16.2s, alphaV0 st1 {v8.2s}, [pCRow0] add pCRow1 , pCRow0, LDC ld1 {v12.2s}, [pCRow1] - fmla v12.2s, v20.2s, alphaV1 + fmla v12.2s, v20.2s, alphaV0 st1 {v12.2s}, [pCRow1] add pCRow0, pCRow0, #8 @@ -886,6 +899,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x2 + fmov alpha0, alpha + add pCRow1 , pCRow0, LDC ld1 {v8.s}[0], [pCRow0] ld1 {v8.s}[1], [pCRow1] @@ -925,11 +940,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE16x1 + fmov alpha0, alpha + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] fmla v0.4s, v16.4s, alphaV0 - fmla v1.4s, v17.4s, alphaV1 - fmla v2.4s, v18.4s, alphaV2 - fmla v3.4s, v19.4s, alphaV3 + fmla v1.4s, v17.4s, alphaV0 + fmla v2.4s, v18.4s, alphaV0 + fmla v3.4s, v19.4s, alphaV0 st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] add pCRow0, pCRow0, #64 @@ -956,9 +973,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE8x1 + fmov alpha0, alpha + ld1 {v0.4s, v1.4s}, [pCRow0] fmla v0.4s, v16.4s, alphaV0 - fmla v1.4s, v17.4s, alphaV1 + fmla v1.4s, v17.4s, alphaV0 st1 {v0.4s, v1.4s}, [pCRow0] add pCRow0, pCRow0, #32 @@ -983,9 +1002,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x1 + fmov alpha0, alpha + ld1 {v8.2s, v9.2s}, [pCRow0] fmla v8.2s, v16.2s, alphaV0 - fmla v9.2s, v17.2s, alphaV1 + fmla v9.2s, v17.2s, alphaV0 st1 {v8.2s, v9.2s}, [pCRow0] add pCRow0, pCRow0, #16 @@ -1008,6 +1029,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x1 + fmov alpha0, alpha + ld1 {v8.2s}, [pCRow0] fmla v8.2s, v16.2s, alphaV0 st1 {v8.2s}, [pCRow0] @@ -1032,6 +1055,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x1 + fmov alpha0, alpha + ldr s8, [pCRow0] fmla s8, s16, alphaV0 str s8, [pCRow0] @@ -1061,10 +1086,10 @@ sgemm_kernel_begin: stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] - fmov alpha0, s0 - fmov alpha1, s0 - fmov alpha2, s0 - fmov alpha3, s0 + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alpha, s0 lsl LDC, LDC, #2 // ldc = ldc * 4 @@ -1078,8 +1103,12 @@ sgemm_kernel_begin: /******************************************************************************/ sgemm_kernel_L4_BEGIN: - mov pCRow0, pC // pCRow0 = C - add pC, pC, LDC, lsl #2 + mov pCRow0, pC + add pCRow1, pCRow0, LDC + add pCRow2, pCRow1, LDC + add pCRow3, pCRow2, LDC + + add pC, pCRow3, LDC mov pA, origPA // pA = start of A array @@ -1090,42 +1119,69 @@ sgemm_kernel_L4_M16_BEGIN: cmp counterI, #0 ble sgemm_kernel_L4_M8_BEGIN + .align 5 sgemm_kernel_L4_M16_20: mov pB, origPB - asr counterL , origK, #1 // L = K / 2 - cmp counterL , #2 // is there at least 4 to do? 
+ asr counterL , origK, #3 + cmp counterL , #2 blt sgemm_kernel_L4_M16_32 - KERNEL16x4_I // do one in the K - KERNEL16x4_M2 // do another in the K + KERNEL16x4_I + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 subs counterL, counterL, #2 ble sgemm_kernel_L4_M16_22a - .align 5 + .align 5 sgemm_kernel_L4_M16_22: KERNEL16x4_M1 KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 subs counterL, counterL, #1 bgt sgemm_kernel_L4_M16_22 + .align 5 sgemm_kernel_L4_M16_22a: + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 KERNEL16x4_M1 KERNEL16x4_E b sgemm_kernel_L4_M16_44 + .align 5 sgemm_kernel_L4_M16_32: tst counterL, #1 ble sgemm_kernel_L4_M16_40 KERNEL16x4_I + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 KERNEL16x4_E b sgemm_kernel_L4_M16_44 @@ -1136,14 +1192,20 @@ sgemm_kernel_L4_M16_40: sgemm_kernel_L4_M16_44: - ands counterL , origK, #1 + ands counterL , origK, #7 ble sgemm_kernel_L4_M16_100 + .align 5 sgemm_kernel_L4_M16_46: KERNEL16x4_SUB + subs counterL, counterL, #1 + bne sgemm_kernel_L4_M16_46 sgemm_kernel_L4_M16_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] SAVE16x4 diff --git a/kernel/arm64/strmm_kernel_16x4.S b/kernel/arm64/strmm_kernel_16x4.S index 28b3216513..77e05103d7 100644 --- a/kernel/arm64/strmm_kernel_16x4.S +++ b/kernel/arm64/strmm_kernel_16x4.S @@ -46,19 +46,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define pCRow0 x12 #define pCRow1 x13 #define pCRow2 x14 -#define pA x15 -#define temp x16 -#define tempOffset x17 -#define tempK x18 +#define pCRow3 x15 +#define pA x16 +#define alpha w17 +#define temp x18 +#define tempOffset x19 +#define tempK x20 #define alpha0 s10 #define alphaV0 v10.s[0] -#define alpha1 s11 -#define alphaV1 v11.s[0] -#define alpha2 s14 -#define alphaV2 v14.s[0] -#define alpha3 s15 -#define alphaV3 v15.s[0] + +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 224 +#define C_PRE_SIZE 160 + // 00 origM // 01 origN @@ -101,14 +102,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //v05 pA1_04, pA1_05, pA1_06, pA1_07 //v06 pA1_08, pA1_09, pA1_10, pA1_11 //v07 pA1_12, pA1_13, pA1_14, pA1_15 -//v08 must save pB00, pB01 -//v09 must save pB02, pB03 -//v10 must save ALPHA0 -//v11 must save ALPHA1 -//v12 must save pB10, pB11 -//v13 must save pB12, pB13 -//v14 must save ALPHA2 -//v15 must save ALPHA3 +//v08 must save pB00 +//v09 must save pB01 +//v10 must save pB02 +//v11 must save pB03 +//v12 must save pB10 +//v13 must save pB11 +//v14 must save pB12 +//v15 must save pB13 //v16 must save C00, C01, C02, C03 //v17 must save C04, C05, C06, C07 //v18 C08, C09, C10, C11 @@ -150,202 +151,240 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL16x4_I - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 - ld1 {v2.4s}, [pA] - add pA, pA, #16 - ld1 {v3.4s}, [pA] - add pA, pA, #16 + ldp q0, q1, [pA], #32 + + ldp s8, s9, [pB], #8 fmul v16.4s, v0.4s, v8.s[0] + fmul v20.4s, v0.4s, v9.s[0] + + ldp s10, s11, [pB], #8 + + fmul v24.4s, v0.4s, v10.s[0] + fmul v28.4s, v0.4s, v11.s[0] + + ldp q2, q3, [pA], #32 + fmul v17.4s, v1.4s, v8.s[0] + fmul v21.4s, v1.4s, v9.s[0] + + ldp q4, q5, [pA], #32 + + fmul v25.4s, v1.4s, v10.s[0] + fmul v29.4s, v1.4s, v11.s[0] + + ldp s12, s13, [pB], #8 + fmul v18.4s, v2.4s, v8.s[0] + fmul v22.4s, v2.4s, v9.s[0] + + ldp s14, s15, [pB], #8 + fmul v19.4s, v3.4s, v8.s[0] + fmul v23.4s, v3.4s, v9.s[0] - fmul v20.4s, v0.4s, v8.s[1] - fmul v21.4s, v1.4s, v8.s[1] - fmul v22.4s, v2.4s, v8.s[1] - fmul v23.4s, v3.4s, v8.s[1] + ldp q6, q7, [pA], #32 - fmul v24.4s, v0.4s, v9.s[0] - fmul v25.4s, v1.4s, v9.s[0] - fmul v26.4s, v2.4s, v9.s[0] - fmul v27.4s, v3.4s, v9.s[0] + fmul v26.4s, v2.4s, v10.s[0] + fmul v30.4s, v2.4s, v11.s[0] - fmul v28.4s, v0.4s, v9.s[1] - fmul v29.4s, v1.4s, v9.s[1] - fmul v30.4s, v2.4s, v9.s[1] - fmul v31.4s, v3.4s, v9.s[1] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] - ld1 {v12.2s, v13.2s}, [pB] - add pB, pB, #16 - ld1 {v4.4s}, [pA] - add pA, pA, #16 - ld1 {v5.4s}, [pA] - add pA, pA, #16 - ld1 {v6.4s}, [pA] - add pA, pA, #16 - ld1 {v7.4s}, [pA] - add pA, pA, #16 + fmul v27.4s, v3.4s, v10.s[0] + fmul v31.4s, v3.4s, v11.s[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] .endm .macro KERNEL16x4_M1 fmla v16.4s, v0.4s, v8.s[0] fmla v17.4s, v1.4s, v8.s[0] + + ldp q4, q5, [pA], #32 + fmla v18.4s, v2.4s, v8.s[0] fmla v19.4s, v3.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.s[1] - fmla v21.4s, v1.4s, v8.s[1] - fmla v22.4s, v2.4s, v8.s[1] - fmla v23.4s, v3.4s, v8.s[1] + fmla v20.4s, v0.4s, v9.s[0] + fmla v21.4s, v1.4s, v9.s[0] - fmla v24.4s, v0.4s, v9.s[0] - fmla v25.4s, v1.4s, v9.s[0] - fmla v26.4s, v2.4s, v9.s[0] - fmla v27.4s, v3.4s, v9.s[0] + ldp s12, s13, [pB], #8 - fmla v28.4s, v0.4s, v9.s[1] - fmla v29.4s, v1.4s, v9.s[1] - fmla v30.4s, v2.4s, v9.s[1] - fmla v31.4s, v3.4s, v9.s[1] + fmla v22.4s, v2.4s, v9.s[0] + fmla v23.4s, v3.4s, v9.s[0] - ld1 {v12.2s, v13.2s}, [pB] - add pB, pB, #16 - ld1 {v4.4s}, [pA] - add pA, pA, #16 - ld1 {v5.4s}, [pA] - add pA, pA, #16 - ld1 {v6.4s}, [pA] - add pA, pA, #16 - ld1 {v7.4s}, [pA] - add pA, pA, #16 + ldp s14, s15, [pB], #8 + + fmla v24.4s, v0.4s, v10.s[0] + fmla v25.4s, v1.4s, v10.s[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + + fmla v26.4s, v2.4s, v10.s[0] + fmla v27.4s, v3.4s, v10.s[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + + fmla v28.4s, v0.4s, v11.s[0] + fmla v29.4s, v1.4s, v11.s[0] + + ldp q6, q7, [pA], #32 + + fmla v30.4s, v2.4s, v11.s[0] + fmla v31.4s, v3.4s, v11.s[0] .endm .macro KERNEL16x4_M2 fmla v16.4s, v4.4s, v12.s[0] fmla v17.4s, v5.4s, v12.s[0] + + ldp q0, q1, [pA], #32 + fmla v18.4s, v6.4s, v12.s[0] fmla v19.4s, v7.4s, v12.s[0] - fmla v20.4s, v4.4s, v12.s[1] - fmla v21.4s, v5.4s, v12.s[1] - fmla v22.4s, v6.4s, v12.s[1] - fmla v23.4s, v7.4s, v12.s[1] + fmla v20.4s, v4.4s, v13.s[0] + fmla v21.4s, v5.4s, v13.s[0] - fmla v24.4s, v4.4s, v13.s[0] - fmla v25.4s, v5.4s, v13.s[0] - fmla v26.4s, v6.4s, v13.s[0] - fmla v27.4s, v7.4s, v13.s[0] + ldp s8, s9, [pB], #8 - fmla v28.4s, v4.4s, v13.s[1] - fmla v29.4s, v5.4s, v13.s[1] - fmla v30.4s, v6.4s, v13.s[1] - fmla v31.4s, v7.4s, v13.s[1] + fmla v22.4s, v6.4s, v13.s[0] + fmla v23.4s, v7.4s, v13.s[0] - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 
{v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 - ld1 {v2.4s}, [pA] - add pA, pA, #16 - ld1 {v3.4s}, [pA] - add pA, pA, #16 + ldp s10, s11, [pB], #8 + + fmla v24.4s, v4.4s, v14.s[0] + fmla v25.4s, v5.4s, v14.s[0] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + fmla v26.4s, v6.4s, v14.s[0] + fmla v27.4s, v7.4s, v14.s[0] + + ldp q2, q3, [pA], #32 + + fmla v28.4s, v4.4s, v15.s[0] + fmla v29.4s, v5.4s, v15.s[0] + + fmla v30.4s, v6.4s, v15.s[0] + fmla v31.4s, v7.4s, v15.s[0] .endm .macro KERNEL16x4_E fmla v16.4s, v4.4s, v12.s[0] + fmla v20.4s, v4.4s, v13.s[0] + fmla v24.4s, v4.4s, v14.s[0] + fmla v28.4s, v4.4s, v15.s[0] + fmla v17.4s, v5.4s, v12.s[0] - fmla v18.4s, v6.4s, v12.s[0] - fmla v19.4s, v7.4s, v12.s[0] + fmla v21.4s, v5.4s, v13.s[0] + fmla v25.4s, v5.4s, v14.s[0] + fmla v29.4s, v5.4s, v15.s[0] - fmla v20.4s, v4.4s, v12.s[1] - fmla v21.4s, v5.4s, v12.s[1] - fmla v22.4s, v6.4s, v12.s[1] - fmla v23.4s, v7.4s, v12.s[1] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - fmla v24.4s, v4.4s, v13.s[0] - fmla v25.4s, v5.4s, v13.s[0] - fmla v26.4s, v6.4s, v13.s[0] - fmla v27.4s, v7.4s, v13.s[0] + fmla v18.4s, v6.4s, v12.s[0] + fmla v22.4s, v6.4s, v13.s[0] + fmla v26.4s, v6.4s, v14.s[0] + fmla v30.4s, v6.4s, v15.s[0] - fmla v28.4s, v4.4s, v13.s[1] - fmla v29.4s, v5.4s, v13.s[1] - fmla v30.4s, v6.4s, v13.s[1] - fmla v31.4s, v7.4s, v13.s[1] + fmla v19.4s, v7.4s, v12.s[0] + fmla v23.4s, v7.4s, v13.s[0] + fmla v27.4s, v7.4s, v14.s[0] + fmla v31.4s, v7.4s, v15.s[0] .endm .macro KERNEL16x4_SUB - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 - ld1 {v2.4s}, [pA] - add pA, pA, #16 - ld1 {v3.4s}, [pA] - add pA, pA, #16 + ldp q0, q1, [pA], #32 + ldp s8, s9, [pB], #8 fmla v16.4s, v0.4s, v8.s[0] + fmla v20.4s, v0.4s, v9.s[0] + + ldp s10, s11, [pB], #8 + + fmla v24.4s, v0.4s, v10.s[0] + fmla v28.4s, v0.4s, v11.s[0] + + ldp q2, q3, [pA], #32 + fmla v17.4s, v1.4s, v8.s[0] + fmla v21.4s, v1.4s, v9.s[0] + + fmla v25.4s, v1.4s, v10.s[0] + fmla v29.4s, v1.4s, v11.s[0] + fmla v18.4s, v2.4s, v8.s[0] + fmla v22.4s, v2.4s, v9.s[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla v19.4s, v3.4s, v8.s[0] + fmla v23.4s, v3.4s, v9.s[0] - fmla v20.4s, v0.4s, v8.s[1] - fmla v21.4s, v1.4s, v8.s[1] - fmla v22.4s, v2.4s, v8.s[1] - fmla v23.4s, v3.4s, v8.s[1] + fmla v26.4s, v2.4s, v10.s[0] + fmla v30.4s, v2.4s, v11.s[0] - fmla v24.4s, v0.4s, v9.s[0] - fmla v25.4s, v1.4s, v9.s[0] - fmla v26.4s, v2.4s, v9.s[0] - fmla v27.4s, v3.4s, v9.s[0] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - fmla v28.4s, v0.4s, v9.s[1] - fmla v29.4s, v1.4s, v9.s[1] - fmla v30.4s, v2.4s, v9.s[1] - fmla v31.4s, v3.4s, v9.s[1] + fmla v27.4s, v3.4s, v10.s[0] + fmla v31.4s, v3.4s, v11.s[0] .endm .macro SAVE16x4 - add pCRow1, pCRow0, LDC + fmov alpha0, alpha + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] fmul v0.4s, v16.4s, alphaV0 - fmul v1.4s, v17.4s, alphaV1 - fmul v2.4s, v18.4s, alphaV2 - fmul v3.4s, v19.4s, alphaV3 - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] + fmul v1.4s, v17.4s, alphaV0 + stp q0, q1, [pCRow0] - add pCRow2, pCRow1, LDC + add pCRow0, pCRow0, #32 + + fmul v2.4s, v18.4s, alphaV0 + fmul v3.4s, v19.4s, alphaV0 + stp q2, q3, [pCRow0] + + add pCRow0, pCRow0, #32 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] fmul v4.4s, v20.4s, alphaV0 - fmul v5.4s, v21.4s, alphaV1 - fmul v6.4s, v22.4s, alphaV2 - fmul v7.4s, v23.4s, alphaV3 - st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] + fmul v5.4s, v21.4s, alphaV0 + stp q4, q5, [pCRow1] + + add pCRow1, pCRow1, #32 - add pCRow1, pCRow2, LDC + fmul v6.4s, v22.4s, 
alphaV0 + fmul v7.4s, v23.4s, alphaV0 + stp q6, q7, [pCRow1] + + add pCRow1, pCRow1, #32 + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] fmul v0.4s, v24.4s, alphaV0 - fmul v1.4s, v25.4s, alphaV1 - fmul v2.4s, v26.4s, alphaV2 - fmul v3.4s, v27.4s, alphaV3 - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow2] + fmul v1.4s, v25.4s, alphaV0 + stp q0, q1, [pCRow2] + + add pCRow2, pCRow2, #32 + + fmul v2.4s, v26.4s, alphaV0 + fmul v3.4s, v27.4s, alphaV0 + stp q2, q3, [pCRow2] + + add pCRow2, pCRow2, #32 + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] fmul v4.4s, v28.4s, alphaV0 - fmul v5.4s, v29.4s, alphaV1 - fmul v6.4s, v30.4s, alphaV2 - fmul v7.4s, v31.4s, alphaV3 - st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] + fmul v5.4s, v29.4s, alphaV0 + stp q4, q5, [pCRow3] - add pCRow0, pCRow0, #64 + add pCRow3, pCRow3, #32 + fmul v6.4s, v30.4s, alphaV0 + fmul v7.4s, v31.4s, alphaV0 + stp q6, q7, [pCRow3] + + add pCRow3, pCRow3, #32 .endm /******************************************************************************/ @@ -362,260 +401,209 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_I - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 + ldp s8, s9, [pB], #8 + ldp s10, s11, [pB], #8 + + ldr q0, [pA], #16 + ldr q1, [pA], #16 fmul v16.4s, v0.4s, v8.s[0] fmul v17.4s, v1.4s, v8.s[0] - fmul v20.4s, v0.4s, v8.s[1] - fmul v21.4s, v1.4s, v8.s[1] - fmul v24.4s, v0.4s, v9.s[0] - fmul v25.4s, v1.4s, v9.s[0] - fmul v28.4s, v0.4s, v9.s[1] - fmul v29.4s, v1.4s, v9.s[1] - - ld1 {v12.2s, v13.2s}, [pB] - add pB, pB, #16 - ld1 {v4.4s}, [pA] - add pA, pA, #16 - ld1 {v5.4s}, [pA] - add pA, pA, #16 + fmul v20.4s, v0.4s, v9.s[0] + fmul v21.4s, v1.4s, v9.s[0] + fmul v24.4s, v0.4s, v10.s[0] + fmul v25.4s, v1.4s, v10.s[0] + fmul v28.4s, v0.4s, v11.s[0] + fmul v29.4s, v1.4s, v11.s[0] + + ldp s12, s13, [pB], #8 + ldp s14, s15, [pB], #8 + + ldr q4, [pA], #16 + ldr q5, [pA], #16 .endm .macro KERNEL8x4_M1 fmla v16.4s, v0.4s, v8.s[0] fmla v17.4s, v1.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.s[1] - fmla v21.4s, v1.4s, v8.s[1] - fmla v24.4s, v0.4s, v9.s[0] - fmla v25.4s, v1.4s, v9.s[0] - fmla v28.4s, v0.4s, v9.s[1] - fmla v29.4s, v1.4s, v9.s[1] - - ld1 {v12.2s, v13.2s}, [pB] - add pB, pB, #16 - ld1 {v4.4s}, [pA] - add pA, pA, #16 - ld1 {v5.4s}, [pA] - add pA, pA, #16 + fmla v20.4s, v0.4s, v9.s[0] + fmla v21.4s, v1.4s, v9.s[0] + fmla v24.4s, v0.4s, v10.s[0] + fmla v25.4s, v1.4s, v10.s[0] + fmla v28.4s, v0.4s, v11.s[0] + fmla v29.4s, v1.4s, v11.s[0] + + ldp s12, s13, [pB], #8 + ldp s14, s15, [pB], #8 + + ldr q4, [pA], #16 + ldr q5, [pA], #16 .endm .macro KERNEL8x4_M2 fmla v16.4s, v4.4s, v12.s[0] fmla v17.4s, v5.4s, v12.s[0] - fmla v20.4s, v4.4s, v12.s[1] - fmla v21.4s, v5.4s, v12.s[1] - fmla v24.4s, v4.4s, v13.s[0] - fmla v25.4s, v5.4s, v13.s[0] - fmla v28.4s, v4.4s, v13.s[1] - fmla v29.4s, v5.4s, v13.s[1] - - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 + fmla v20.4s, v4.4s, v13.s[0] + fmla v21.4s, v5.4s, v13.s[0] + fmla v24.4s, v4.4s, v14.s[0] + fmla v25.4s, v5.4s, v14.s[0] + fmla v28.4s, v4.4s, v15.s[0] + fmla v29.4s, v5.4s, v15.s[0] + + ldp s8, s9, [pB], #8 + ldp s10, s11, [pB], #8 + + ldr q0, [pA], #16 + ldr q1, [pA], #16 .endm .macro KERNEL8x4_E fmla v16.4s, v4.4s, v12.s[0] fmla v17.4s, v5.4s, v12.s[0] - fmla v20.4s, v4.4s, v12.s[1] - fmla v21.4s, v5.4s, v12.s[1] - fmla v24.4s, v4.4s, v13.s[0] - fmla v25.4s, v5.4s, v13.s[0] - fmla v28.4s, v4.4s, v13.s[1] - fmla v29.4s, 
v5.4s, v13.s[1] + fmla v20.4s, v4.4s, v13.s[0] + fmla v21.4s, v5.4s, v13.s[0] + fmla v24.4s, v4.4s, v14.s[0] + fmla v25.4s, v5.4s, v14.s[0] + fmla v28.4s, v4.4s, v15.s[0] + fmla v29.4s, v5.4s, v15.s[0] .endm .macro KERNEL8x4_SUB - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 + ldp s8, s9, [pB], #8 + ldp s10, s11, [pB], #8 + + ldr q0, [pA], #16 + ldr q1, [pA], #16 fmla v16.4s, v0.4s, v8.s[0] fmla v17.4s, v1.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.s[1] - fmla v21.4s, v1.4s, v8.s[1] - fmla v24.4s, v0.4s, v9.s[0] - fmla v25.4s, v1.4s, v9.s[0] - fmla v28.4s, v0.4s, v9.s[1] - fmla v29.4s, v1.4s, v9.s[1] + fmla v20.4s, v0.4s, v9.s[0] + fmla v21.4s, v1.4s, v9.s[0] + fmla v24.4s, v0.4s, v10.s[0] + fmla v25.4s, v1.4s, v10.s[0] + fmla v28.4s, v0.4s, v11.s[0] + fmla v29.4s, v1.4s, v11.s[0] .endm .macro SAVE8x4 - add pCRow1, pCRow0, LDC + fmov alpha0, alpha fmul v0.4s, v16.4s, alphaV0 - fmul v1.4s, v17.4s, alphaV1 - st1 {v0.4s, v1.4s}, [pCRow0] + fmul v1.4s, v17.4s, alphaV0 + stp q0, q1, [pCRow0] - add pCRow2, pCRow1, LDC + add pCRow0, pCRow0, #32 - fmul v4.4s, v20.4s, alphaV0 - fmul v5.4s, v21.4s, alphaV1 - st1 {v4.4s, v5.4s}, [pCRow1] + fmul v2.4s, v20.4s, alphaV0 + fmul v3.4s, v21.4s, alphaV0 + stp q2, q3, [pCRow1] - add pCRow1, pCRow2, LDC + add pCRow1, pCRow1, #32 - fmul v0.4s, v24.4s, alphaV0 - fmul v1.4s, v25.4s, alphaV1 - st1 {v0.4s, v1.4s}, [pCRow2] + fmul v4.4s, v24.4s, alphaV0 + fmul v5.4s, v25.4s, alphaV0 + stp q4, q5, [pCRow2] - fmul v4.4s, v28.4s, alphaV0 - fmul v5.4s, v29.4s, alphaV1 - st1 {v4.4s, v5.4s}, [pCRow1] + add pCRow2, pCRow2, #32 - add pCRow0, pCRow0, #32 + fmul v6.4s, v28.4s, alphaV0 + fmul v7.4s, v29.4s, alphaV0 + stp q6, q7, [pCRow3] + + add pCRow3, pCRow3, #32 .endm /******************************************************************************/ .macro INIT4x4 fmov s16, wzr - fmov s17, s16 - fmov s20, s17 - fmov s21, s16 - fmov s24, s17 - fmov s25, s16 - fmov s28, s17 - fmov s29, s16 + fmov s20, wzr + fmov s24, wzr + fmov s28, wzr .endm .macro KERNEL4x4_I - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.2s, v1.2s}, [pA] - add pA, pA, #16 + ldp s8, s9, [pB], #8 + ldp s10, s11, [pB], #8 - fmul v16.2s, v0.2s, v8.s[0] - fmul v29.2s, v1.2s, v9.s[1] + ldr q0, [pA], #16 - fmul v20.2s, v0.2s, v8.s[1] - fmul v25.2s, v1.2s, v9.s[0] - - fmul v24.2s, v0.2s, v9.s[0] - fmul v21.2s, v1.2s, v8.s[1] + fmul v16.4s, v0.4s, v8.s[0] + fmul v20.4s, v0.4s, v9.s[0] + fmul v24.4s, v0.4s, v10.s[0] + fmul v28.4s, v0.4s, v11.s[0] - fmul v28.2s, v0.2s, v9.s[1] - fmul v17.2s, v1.2s, v8.s[0] + ldp s12, s13, [pB], #8 + ldp s14, s15, [pB], #8 - ld1 {v12.2s, v13.2s}, [pB] - add pB, pB, #16 - ld1 {v4.2s, v5.2s}, [pA] - add pA, pA, #16 + ldr q1, [pA], #16 .endm .macro KERNEL4x4_M1 - fmla v16.2s, v0.2s, v8.s[0] - fmla v29.2s, v1.2s, v9.s[1] - - ld1 {v12.2s, v13.2s}, [pB] // For next round - add pB, pB, #16 - - fmla v20.2s, v0.2s, v8.s[1] - fmla v25.2s, v1.2s, v9.s[0] - - ld1 {v4.2s, v5.2s}, [pA] // For next round - add pA, pA, #16 - - fmla v24.2s, v0.2s, v9.s[0] - fmla v21.2s, v1.2s, v8.s[1] + fmla v16.4s, v0.4s, v8.s[0] + fmla v20.4s, v0.4s, v9.s[0] + fmla v24.4s, v0.4s, v10.s[0] + fmla v28.4s, v0.4s, v11.s[0] - prfm PLDL1KEEP, [pB, #512] + ldp s12, s13, [pB], #8 + ldp s14, s15, [pB], #8 - fmla v28.2s, v0.2s, v9.s[1] - fmla v17.2s, v1.2s, v8.s[0] + ldr q1, [pA], #16 .endm .macro KERNEL4x4_M2 - fmla v16.2s, v4.2s, v12.s[0] - fmla v29.2s, v5.2s, v13.s[1] - - ld1 {v8.2s, v9.2s}, [pB] // For next round - add pB, pB, #16 + fmla v16.4s, 
v1.4s, v12.s[0] + fmla v20.4s, v1.4s, v13.s[0] + fmla v24.4s, v1.4s, v14.s[0] + fmla v28.4s, v1.4s, v15.s[0] - fmla v20.2s, v4.2s, v12.s[1] - fmla v25.2s, v5.2s, v13.s[0] + ldp s8, s9, [pB], #8 + ldp s10, s11, [pB], #8 - ld1 {v0.2s, v1.2s}, [pA] // For next round - add pA, pA, #16 - - fmla v24.2s, v4.2s, v13.s[0] - fmla v21.2s, v5.2s, v12.s[1] - - prfm PLDL1KEEP, [pA, #512] - - fmla v28.2s, v4.2s, v13.s[1] - fmla v17.2s, v5.2s, v12.s[0] + ldr q0, [pA], #16 .endm .macro KERNEL4x4_E - fmla v16.2s, v4.2s, v12.s[0] - fmla v29.2s, v5.2s, v13.s[1] - - fmla v20.2s, v4.2s, v12.s[1] - fmla v25.2s, v5.2s, v13.s[0] - - fmla v24.2s, v4.2s, v13.s[0] - fmla v21.2s, v5.2s, v12.s[1] - - fmla v28.2s, v4.2s, v13.s[1] - fmla v17.2s, v5.2s, v12.s[0] + fmla v16.4s, v1.4s, v12.s[0] + fmla v20.4s, v1.4s, v13.s[0] + fmla v24.4s, v1.4s, v14.s[0] + fmla v28.4s, v1.4s, v15.s[0] .endm .macro KERNEL4x4_SUB - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.2s, v1.2s}, [pA] - add pA, pA, #16 + ldp s8, s9, [pB], #8 + ldp s10, s11, [pB], #8 - fmla v16.2s, v0.2s, v8.s[0] - fmla v29.2s, v1.2s, v9.s[1] + ldr q0, [pA], #16 - fmla v20.2s, v0.2s, v8.s[1] - fmla v25.2s, v1.2s, v9.s[0] - - fmla v24.2s, v0.2s, v9.s[0] - fmla v21.2s, v1.2s, v8.s[1] - - fmla v28.2s, v0.2s, v9.s[1] - fmla v17.2s, v1.2s, v8.s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v20.4s, v0.4s, v9.s[0] + fmla v24.4s, v0.4s, v10.s[0] + fmla v28.4s, v0.4s, v11.s[0] .endm .macro SAVE4x4 + fmov alpha0, alpha - fmul v8.2s, v16.2s, alphaV0 - fmul v9.2s, v17.2s, alphaV1 - st1 {v8.2s, v9.2s}, [pCRow0] + fmul v0.4s, v16.4s, alphaV0 + str q0, [pCRow0] - add pCRow1, pCRow0, LDC + add pCRow0, pCRow0, #16 - fmul v12.2s, v20.2s, alphaV2 - fmul v13.2s, v21.2s, alphaV3 - st1 {v12.2s, v13.2s}, [pCRow1] + fmul v1.4s, v20.4s, alphaV0 + str q1, [pCRow1] - add pCRow2, pCRow1, LDC + add pCRow1, pCRow1, #16 - fmul v8.2s, v24.2s, alphaV0 - fmul v9.2s, v25.2s, alphaV1 - st1 {v8.2s, v9.2s}, [pCRow2] + fmul v2.4s, v24.4s, alphaV0 + str q2, [pCRow2] - add pCRow1, pCRow2, LDC + add pCRow2, pCRow2, #16 - fmul v12.2s, v28.2s, alphaV2 - fmul v13.2s, v29.2s, alphaV3 - st1 {v12.2s, v13.2s}, [pCRow1] + fmul v3.4s, v28.4s, alphaV0 + str q3, [pCRow3] - add pCRow0, pCRow0, #16 + add pCRow3, pCRow3, #16 .endm /******************************************************************************/ @@ -628,34 +616,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL2x4_SUB - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.2s}, [pA] - add pA, pA, #8 + ldp s8, s9, [pB], #8 + ldp s10, s11, [pB], #8 + + ldr d0, [pA], #8 fmla v16.2s, v0.2s, v8.s[0] - fmla v20.2s, v0.2s, v8.s[1] - fmla v24.2s, v0.2s, v9.s[0] - fmla v28.2s, v0.2s, v9.s[1] + fmla v20.2s, v0.2s, v9.s[0] + fmla v24.2s, v0.2s, v10.s[0] + fmla v28.2s, v0.2s, v11.s[0] .endm .macro SAVE2x4 - fmul v8.2s, v16.2s, alphaV0 - st1 {v8.2s}, [pCRow0] + fmov alpha0, alpha - add pCRow1, pCRow0, LDC - fmul v12.2s, v20.2s, alphaV1 - st1 {v12.2s}, [pCRow1] + fmul v0.2s, v16.2s, alphaV0 + str d0, [pCRow0] - add pCRow2, pCRow1, LDC - fmul v8.2s, v24.2s, alphaV2 - st1 {v8.2s}, [pCRow2] + add pCRow0, pCRow0, #8 - add pCRow1, pCRow2, LDC - fmul v12.2s, v28.2s, alphaV3 - st1 {v12.2s}, [pCRow1] + fmul v1.2s, v20.2s, alphaV0 + str d1, [pCRow1] - add pCRow0, pCRow0, #8 + add pCRow1, pCRow1, #8 + + fmul v0.2s, v24.2s, alphaV0 + str d0, [pCRow2] + + add pCRow2, pCRow2, #8 + + fmul v1.2s, v28.2s, alphaV0 + str d1, [pCRow3] + + add pCRow3, pCRow3, #8 .endm /******************************************************************************/ @@ -677,20 +670,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x4 - add pCRow1, pCRow0, LDC + fmov alpha0, alpha fmul v8.2s, v16.2s, alphaV0 st1 {v8.s}[0], [pCRow0] st1 {v8.s}[1], [pCRow1] - add pCRow2, pCRow1, LDC - add pCRow1, pCRow2, LDC + add pCRow0, pCRow0, #4 + add pCRow1, pCRow1, #4 - fmul v12.2s, v20.2s, alphaV1 + fmul v12.2s, v20.2s, alphaV0 st1 {v12.s}[0], [pCRow2] - st1 {v12.s}[1], [pCRow1] + st1 {v12.s}[1], [pCRow3] - add pCRow0, pCRow0, #4 + add pCRow2, pCRow2, #4 + add pCRow3, pCRow3, #4 .endm /******************************************************************************/ @@ -730,18 +724,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE16x2 + fmov alpha0, alpha + add pCRow1, pCRow0, LDC fmul v0.4s, v16.4s, alphaV0 - fmul v1.4s, v17.4s, alphaV1 - fmul v2.4s, v18.4s, alphaV2 - fmul v3.4s, v19.4s, alphaV3 + fmul v1.4s, v17.4s, alphaV0 + fmul v2.4s, v18.4s, alphaV0 + fmul v3.4s, v19.4s, alphaV0 st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] fmul v4.4s, v20.4s, alphaV0 - fmul v5.4s, v21.4s, alphaV1 - fmul v6.4s, v22.4s, alphaV2 - fmul v7.4s, v23.4s, alphaV3 + fmul v5.4s, v21.4s, alphaV0 + fmul v6.4s, v22.4s, alphaV0 + fmul v7.4s, v23.4s, alphaV0 st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] add pCRow0, pCRow0, #64 @@ -772,16 +768,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE8x2 + fmov alpha0, alpha + add pCRow1, pCRow0, LDC fmul v0.4s, v16.4s, alphaV0 - fmul v1.4s, v17.4s, alphaV1 + fmul v1.4s, v17.4s, alphaV0 st1 {v0.4s, v1.4s}, [pCRow0] add pCRow2, pCRow1, LDC fmul v4.4s, v20.4s, alphaV0 - fmul v5.4s, v21.4s, alphaV1 + fmul v5.4s, v21.4s, alphaV0 st1 {v4.4s, v5.4s}, [pCRow1] add pCRow0, pCRow0, #32 @@ -809,15 +807,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x2 + fmov alpha0, alpha fmul v8.2s, v16.2s, alphaV0 - fmul v9.2s, v17.2s, alphaV1 + fmul v9.2s, v17.2s, alphaV0 st1 {v8.2s, v9.2s}, [pCRow0] add pCRow1, pCRow0, LDC - fmul v12.2s, v20.2s, alphaV2 - fmul v13.2s, v21.2s, alphaV3 + fmul v12.2s, v20.2s, alphaV0 + fmul v13.2s, v21.2s, alphaV0 st1 {v12.2s, v13.2s}, [pCRow1] add pCRow0, pCRow0, #16 @@ -842,12 +841,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro SAVE2x2 + fmov alpha0, alpha + fmul v8.2s, v16.2s, alphaV0 st1 {v8.2s}, [pCRow0] add pCRow1 , pCRow0, LDC - fmul v12.2s, v20.2s, alphaV1 + fmul v12.2s, v20.2s, alphaV0 st1 {v12.2s}, [pCRow1] add pCRow0, pCRow0, #8 @@ -870,6 +871,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x2 + fmov alpha0, alpha + add pCRow1 , pCRow0, LDC fmul v8.2s, v16.2s, alphaV0 @@ -908,11 +911,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE16x1 + fmov alpha0, alpha fmul v0.4s, v16.4s, alphaV0 - fmul v1.4s, v17.4s, alphaV1 - fmul v2.4s, v18.4s, alphaV2 - fmul v3.4s, v19.4s, alphaV3 + fmul v1.4s, v17.4s, alphaV0 + fmul v2.4s, v18.4s, alphaV0 + fmul v3.4s, v19.4s, alphaV0 st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] add pCRow0, pCRow0, #64 @@ -939,9 +943,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE8x1 + fmov alpha0, alpha fmul v0.4s, v16.4s, alphaV0 - fmul v1.4s, v17.4s, alphaV1 + fmul v1.4s, v17.4s, alphaV0 st1 {v0.4s, v1.4s}, [pCRow0] add pCRow0, pCRow0, #32 @@ -966,9 +971,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x1 + fmov alpha0, alpha fmul v8.2s, v16.2s, alphaV0 - fmul v9.2s, v17.2s, alphaV1 + fmul v9.2s, v17.2s, alphaV0 st1 {v8.2s, v9.2s}, [pCRow0] add pCRow0, pCRow0, #16 @@ -991,6 +997,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x1 + fmov alpha0, alpha fmul v8.2s, v16.2s, alphaV0 st1 {v8.2s}, [pCRow0] @@ -1015,6 +1022,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x1 + fmov alpha0, alpha fmul s8, s16, alpha0 str s8, [pCRow0] @@ -1043,10 +1051,10 @@ strmm_kernel_begin: stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] - fmov alpha0, s0 - fmov alpha1, s0 - fmov alpha2, s0 - fmov alpha3, s0 + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alpha, s0 lsl LDC, LDC, #2 // ldc = ldc * 4 @@ -1063,8 +1071,13 @@ strmm_kernel_begin: /******************************************************************************/ strmm_kernel_L4_BEGIN: - mov pCRow0, pC // pCRow0 = C - add pC, pC, LDC, lsl #2 + mov pCRow0, pC + add pCRow1, pCRow0, LDC + add pCRow2, pCRow1, LDC + add pCRow3, pCRow2, LDC + + add pC, pCRow3, LDC + #if defined(LEFT) mov tempOffset, offset @@ -1078,6 +1091,7 @@ strmm_kernel_L4_M16_BEGIN: cmp counterI, #0 ble strmm_kernel_L4_M8_BEGIN + .align 5 strmm_kernel_L4_M16_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -1098,38 +1112,64 @@ strmm_kernel_L4_M16_20: add tempK, tempOffset, #4 #endif - asr counterL , tempK, #1 // L = K / 2 - cmp counterL , #2 // is there at least 4 to do? 
+ asr counterL , tempK, #3 + cmp counterL , #2 blt strmm_kernel_L4_M16_32 - KERNEL16x4_I // do one in the K - KERNEL16x4_M2 // do another in the K + KERNEL16x4_I + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 subs counterL, counterL, #2 ble strmm_kernel_L4_M16_22a - .align 5 + .align 5 strmm_kernel_L4_M16_22: KERNEL16x4_M1 KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 subs counterL, counterL, #1 bgt strmm_kernel_L4_M16_22 + .align 5 strmm_kernel_L4_M16_22a: + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 KERNEL16x4_M1 KERNEL16x4_E b strmm_kernel_L4_M16_44 + .align 5 strmm_kernel_L4_M16_32: tst counterL, #1 ble strmm_kernel_L4_M16_40 KERNEL16x4_I + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 KERNEL16x4_E b strmm_kernel_L4_M16_44 @@ -1140,12 +1180,15 @@ strmm_kernel_L4_M16_40: strmm_kernel_L4_M16_44: - ands counterL , tempK, #1 + ands counterL , tempK, #7 ble strmm_kernel_L4_M16_100 + .align 5 strmm_kernel_L4_M16_46: KERNEL16x4_SUB + subs counterL, counterL, #1 + bne strmm_kernel_L4_M16_46 strmm_kernel_L4_M16_100: @@ -1166,6 +1209,9 @@ strmm_kernel_L4_M16_100: #if defined(LEFT) add tempOffset, tempOffset, #16 #endif + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] strmm_kernel_L4_M16_END: subs counterI, counterI, #1 diff --git a/kernel/arm64/zgemm_kernel_4x4.S b/kernel/arm64/zgemm_kernel_4x4.S index 1cb695e562..08a1531cff 100644 --- a/kernel/arm64/zgemm_kernel_4x4.S +++ b/kernel/arm64/zgemm_kernel_4x4.S @@ -46,20 +46,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define pCRow0 x12 #define pCRow1 x13 #define pCRow2 x14 -#define pA x15 -#define alpha_save_R x16 -#define alpha_save_I x17 +#define pCRow3 x15 +#define pA x16 +#define alphaR x17 +#define alphaI x18 #define alpha0_R d10 #define alphaV0_R v10.d[0] #define alpha0_I d11 #define alphaV0_I v11.d[0] -#define alpha1_R d14 -#define alphaV1_R v14.d[0] -#define alpha1_I d15 -#define alphaV1_I v15.d[0] - +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 448 +#define C_PRE_SIZE 128 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define OP_rr fmla @@ -98,10 +97,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 12 pCRow0 // 13 pCRow1 // 14 pCRow2 -// 15 pA -// 16 alpha_save_R -// 17 alpha_save_I -// 18 must save +// 15 pCRow3 +// 16 pA +// 17 alpha_save_R +// 18 must save alpha_save_I // 19 must save // 20 must save // 21 must save @@ -175,12 +174,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_I ld2 {v8.2d, v9.2d}, [pB] add pB, pB, #32 - ld2 {v10.2d, v11.2d}, [pB] - add pB, pB, #32 ld2 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - ld2 {v2.2d, v3.2d}, [pA] - add pA, pA, #32 fmul v16.2d, v0.2d, v8.d[0] OP_ii v16.2d, v1.2d, v9.d[0] @@ -193,16 +188,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif OP_ir v17.2d, v1.2d, v8.d[0] - fmul v18.2d, v2.2d, v8.d[0] - OP_ii v18.2d, v3.2d, v9.d[0] -#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ - defined(RR) || defined(RC) || defined(CR) || defined(CC) - eor v19.16b, v19.16b, v19.16b - fmls v19.2d, v2.2d, v9.d[0] -#else - fmul v19.2d, v2.2d, v9.d[0] -#endif - OP_ir v19.2d, v3.2d, v8.d[0] + ld2 {v2.2d, v3.2d}, [pA] + add pA, pA, #32 fmul v20.2d, v0.2d, v8.d[1] OP_ii v20.2d, v1.2d, v9.d[1] @@ -215,6 +202,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v21.2d, v1.2d, v8.d[1] + ld2 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 + fmul v22.2d, v2.2d, v8.d[1] OP_ii v22.2d, v3.2d, v9.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -226,6 +216,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v23.2d, v3.2d, v8.d[1] + ld2 {v12.2d, v13.2d}, [pB] + add pB, pB, #32 + + fmul v18.2d, v2.2d, v8.d[0] + OP_ii v18.2d, v3.2d, v9.d[0] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v19.16b, v19.16b, v19.16b + fmls v19.2d, v2.2d, v9.d[0] +#else + fmul v19.2d, v2.2d, v9.d[0] +#endif + OP_ir v19.2d, v3.2d, v8.d[0] + + ld2 {v4.2d, v5.2d} , [pA] + add pA, pA, #32 + fmul v24.2d, v0.2d, v10.d[0] OP_ii v24.2d, v1.2d, v11.d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -237,6 +244,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v25.2d, v1.2d, v10.d[0] + ld2 {v6.2d, v7.2d} , [pA] + add pA, pA, #32 + fmul v26.2d, v2.2d, v10.d[0] OP_ii v26.2d, v3.2d, v11.d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -248,6 +258,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v27.2d, v3.2d, v10.d[0] + ld2 {v14.2d, v15.2d}, [pB] + add pB, pB, #32 + fmul v28.2d, v0.2d, v10.d[1] OP_ii v28.2d, v1.2d, v11.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -259,6 +272,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v29.2d, v1.2d, v10.d[1] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmul v30.2d, v2.2d, v10.d[1] OP_ii v30.2d, v3.2d, v11.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -270,14 +285,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v31.2d, v3.2d, v10.d[1] - ld2 {v12.2d, v13.2d}, [pB] - add pB, pB, #32 - ld2 {v14.2d, v15.2d}, [pB] - add pB, pB, #32 - ld2 {v4.2d, v5.2d} , [pA] - add pA, pA, #32 - ld2 {v6.2d, v7.2d} , [pA] - add pA, pA, #32 + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] .endm .macro KERNEL4x4_M1 @@ -286,7 +294,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v17.2d, v0.2d, v9.d[0] OP_ir v17.2d, v1.2d, v8.d[0] - ld2 {v12.2d, v13.2d}, [pB] // For next round + ld2 {v12.2d, v13.2d}, [pB] add pB, pB, #32 OP_rr v18.2d, v2.2d, v8.d[0] @@ -294,15 +302,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
OP_ri v19.2d, v2.2d, v9.d[0] OP_ir v19.2d, v3.2d, v8.d[0] - ld2 {v14.2d, v15.2d}, [pB] // For next round - add pB, pB, #32 + ld2 {v4.2d, v5.2d} , [pA] + add pA, pA, #32 OP_rr v20.2d, v0.2d, v8.d[1] OP_ii v20.2d, v1.2d, v9.d[1] OP_ri v21.2d, v0.2d, v9.d[1] OP_ir v21.2d, v1.2d, v8.d[1] - ld2 {v4.2d, v5.2d} , [pA] // For next round + ld2 {v6.2d, v7.2d} , [pA] add pA, pA, #32 OP_rr v22.2d, v2.2d, v8.d[1] @@ -310,22 +318,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v23.2d, v2.2d, v9.d[1] OP_ir v23.2d, v3.2d, v8.d[1] - ld2 {v6.2d, v7.2d} , [pA] // For next round - add pA, pA, #32 + ld2 {v14.2d, v15.2d}, [pB] + add pB, pB, #32 OP_rr v24.2d, v0.2d, v10.d[0] OP_ii v24.2d, v1.2d, v11.d[0] OP_ri v25.2d, v0.2d, v11.d[0] OP_ir v25.2d, v1.2d, v10.d[0] - prfm PLDL1KEEP, [pA, #512] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] OP_rr v26.2d, v2.2d, v10.d[0] OP_ii v26.2d, v3.2d, v11.d[0] OP_ri v27.2d, v2.2d, v11.d[0] OP_ir v27.2d, v3.2d, v10.d[0] - prfm PLDL1KEEP, [pB, #512] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] OP_rr v28.2d, v0.2d, v10.d[1] OP_ii v28.2d, v1.2d, v11.d[1] @@ -344,7 +352,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v17.2d, v4.2d, v13.d[0] OP_ir v17.2d, v5.2d, v12.d[0] - ld2 {v8.2d, v9.2d}, [pB] // For next round + ld2 {v8.2d, v9.2d}, [pB] add pB, pB, #32 OP_rr v18.2d, v6.2d, v12.d[0] @@ -352,15 +360,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v19.2d, v6.2d, v13.d[0] OP_ir v19.2d, v7.2d, v12.d[0] - ld2 {v10.2d, v11.2d}, [pB] // For next round - add pB, pB, #32 + ld2 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 OP_rr v20.2d, v4.2d, v12.d[1] OP_ii v20.2d, v5.2d, v13.d[1] OP_ri v21.2d, v4.2d, v13.d[1] OP_ir v21.2d, v5.2d, v12.d[1] - ld2 {v0.2d, v1.2d}, [pA] // For next round + ld2 {v2.2d, v3.2d}, [pA] add pA, pA, #32 OP_rr v22.2d, v6.2d, v12.d[1] @@ -368,22 +376,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v23.2d, v6.2d, v13.d[1] OP_ir v23.2d, v7.2d, v12.d[1] - ld2 {v2.2d, v3.2d}, [pA] // For next round - add pA, pA, #32 + ld2 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 OP_rr v24.2d, v4.2d, v14.d[0] OP_ii v24.2d, v5.2d, v15.d[0] OP_ri v25.2d, v4.2d, v15.d[0] OP_ir v25.2d, v5.2d, v14.d[0] - prfm PLDL1KEEP, [pA, #512] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] OP_rr v26.2d, v6.2d, v14.d[0] OP_ii v26.2d, v7.2d, v15.d[0] OP_ri v27.2d, v6.2d, v15.d[0] OP_ir v27.2d, v7.2d, v14.d[0] - prfm PLDL1KEEP, [pB, #512] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] OP_rr v28.2d, v4.2d, v14.d[1] OP_ii v28.2d, v5.2d, v15.d[1] @@ -412,6 +420,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v21.2d, v4.2d, v13.d[1] OP_ir v21.2d, v5.2d, v12.d[1] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + OP_rr v22.2d, v6.2d, v12.d[1] OP_ii v22.2d, v7.2d, v13.d[1] OP_ri v23.2d, v6.2d, v13.d[1] @@ -422,6 +432,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v25.2d, v4.2d, v15.d[0] OP_ir v25.2d, v5.2d, v14.d[0] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] + OP_rr v26.2d, v6.2d, v14.d[0] OP_ii v26.2d, v7.2d, v15.d[0] OP_ri v27.2d, v6.2d, v15.d[0] @@ -441,33 +453,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL4x4_SUB ld2 {v8.2d, v9.2d}, [pB] add pB, pB, #32 - ld2 {v10.2d, v11.2d}, [pB] - add pB, pB, #32 + ld2 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - ld2 {v2.2d, v3.2d}, [pA] - add pA, pA, #32 OP_rr v16.2d, v0.2d, v8.d[0] OP_ii v16.2d, v1.2d, v9.d[0] OP_ri v17.2d, v0.2d, v9.d[0] OP_ir v17.2d, v1.2d, v8.d[0] - OP_rr v18.2d, v2.2d, v8.d[0] - OP_ii v18.2d, v3.2d, v9.d[0] - OP_ri v19.2d, v2.2d, v9.d[0] - OP_ir v19.2d, v3.2d, v8.d[0] + ld2 {v2.2d, v3.2d}, [pA] + add pA, pA, #32 OP_rr v20.2d, v0.2d, v8.d[1] OP_ii v20.2d, v1.2d, v9.d[1] OP_ri v21.2d, v0.2d, v9.d[1] OP_ir v21.2d, v1.2d, v8.d[1] + ld2 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 + + OP_rr v18.2d, v2.2d, v8.d[0] + OP_ii v18.2d, v3.2d, v9.d[0] + OP_ri v19.2d, v2.2d, v9.d[0] + OP_ir v19.2d, v3.2d, v8.d[0] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + OP_rr v22.2d, v2.2d, v8.d[1] OP_ii v22.2d, v3.2d, v9.d[1] OP_ri v23.2d, v2.2d, v9.d[1] OP_ir v23.2d, v3.2d, v8.d[1] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + OP_rr v24.2d, v0.2d, v10.d[0] OP_ii v24.2d, v1.2d, v11.d[0] OP_ri v25.2d, v0.2d, v11.d[0] @@ -490,74 +509,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x4 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI - mov pCRow1, pCRow0 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] - ld2 {v0.2d, v1.2d}, [pCRow1] + ld2 {v0.2d, v1.2d}, [pCRow0] fmla v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I - fmla v1.2d, v16.2d, alphaV1_I - fmla v1.2d, v17.2d, alphaV1_R - st2 {v0.2d, v1.2d}, [pCRow1] - add pCRow2, pCRow1, #32 - ld2 {v2.2d, v3.2d}, [pCRow2] + fmla v1.2d, v16.2d, alphaV0_I + fmla v1.2d, v17.2d, alphaV0_R + st2 {v0.2d, v1.2d}, [pCRow0] + + add pCRow0, pCRow0, #32 + + ld2 {v2.2d, v3.2d}, [pCRow0] fmla v2.2d, v18.2d, alphaV0_R fmls v2.2d, v19.2d, alphaV0_I - fmla v3.2d, v18.2d, alphaV1_I - fmla v3.2d, v19.2d, alphaV1_R - st2 {v2.2d, v3.2d}, [pCRow2] + fmla v3.2d, v18.2d, alphaV0_I + fmla v3.2d, v19.2d, alphaV0_R + st2 {v2.2d, v3.2d}, [pCRow0] + + add pCRow0, pCRow0, #32 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] - add pCRow1, pCRow1, LDC ld2 {v4.2d, v5.2d}, [pCRow1] fmla v4.2d, v20.2d, alphaV0_R fmls v4.2d, v21.2d, alphaV0_I - fmla v5.2d, v20.2d, alphaV1_I - fmla v5.2d, v21.2d, alphaV1_R + fmla v5.2d, v20.2d, alphaV0_I + fmla v5.2d, v21.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow1] - add pCRow2, pCRow1, #32 - ld2 {v6.2d, v7.2d}, [pCRow2] + + add pCRow1, pCRow1, #32 + + ld2 {v6.2d, v7.2d}, [pCRow1] fmla v6.2d, v22.2d, alphaV0_R fmls v6.2d, v23.2d, alphaV0_I - fmla v7.2d, v22.2d, alphaV1_I - fmla v7.2d, v23.2d, alphaV1_R - st2 {v6.2d, v7.2d}, [pCRow2] + fmla v7.2d, v22.2d, alphaV0_I + fmla v7.2d, v23.2d, alphaV0_R + st2 {v6.2d, v7.2d}, [pCRow1] - add pCRow1, pCRow1, LDC - ld2 {v0.2d, v1.2d}, [pCRow1] + add pCRow1, pCRow1, #32 + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + ld2 {v0.2d, v1.2d}, [pCRow2] fmla v0.2d, v24.2d, alphaV0_R fmls v0.2d, v25.2d, alphaV0_I - fmla v1.2d, v24.2d, alphaV1_I - fmla v1.2d, v25.2d, alphaV1_R - st2 {v0.2d, v1.2d}, [pCRow1] - add pCRow2, pCRow1, #32 + fmla v1.2d, v24.2d, alphaV0_I + fmla v1.2d, v25.2d, alphaV0_R + st2 {v0.2d, v1.2d}, [pCRow2] + + add pCRow2, pCRow2, #32 + ld2 {v2.2d, v3.2d}, [pCRow2] fmla v2.2d, v26.2d, alphaV0_R fmls v2.2d, v27.2d, alphaV0_I - fmla v3.2d, v26.2d, alphaV1_I - fmla v3.2d, v27.2d, alphaV1_R + fmla v3.2d, v26.2d, alphaV0_I + fmla v3.2d, v27.2d, alphaV0_R st2 {v2.2d, v3.2d}, [pCRow2] - add pCRow1, pCRow1, LDC + add pCRow2, pCRow2, #32 + prfm 
PLDL2KEEP, [pCRow3, #C_PRE_SIZE] - ld2 {v4.2d, v5.2d}, [pCRow1] + ld2 {v4.2d, v5.2d}, [pCRow3] fmla v4.2d, v28.2d, alphaV0_R fmls v4.2d, v29.2d, alphaV0_I - fmla v5.2d, v28.2d, alphaV1_I - fmla v5.2d, v29.2d, alphaV1_R - st2 {v4.2d, v5.2d}, [pCRow1] - add pCRow2, pCRow1, #32 - ld2 {v6.2d, v7.2d}, [pCRow2] + fmla v5.2d, v28.2d, alphaV0_I + fmla v5.2d, v29.2d, alphaV0_R + st2 {v4.2d, v5.2d}, [pCRow3] + + add pCRow3, pCRow3, #32 + + ld2 {v6.2d, v7.2d}, [pCRow3] fmla v6.2d, v30.2d, alphaV0_R fmls v6.2d, v31.2d, alphaV0_I - fmla v7.2d, v30.2d, alphaV1_I - fmla v7.2d, v31.2d, alphaV1_R - st2 {v6.2d, v7.2d}, [pCRow2] + fmla v7.2d, v30.2d, alphaV0_I + fmla v7.2d, v31.2d, alphaV0_R + st2 {v6.2d, v7.2d}, [pCRow3] - add pCRow0, pCRow0, #64 + add pCRow3, pCRow3, #32 .endm /******************************************************************************/ @@ -604,18 +634,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x4 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.2d, v1.2d}, [pCRow1] fmla v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I - fmla v1.2d, v16.2d, alphaV1_I - fmla v1.2d, v17.2d, alphaV1_R + fmla v1.2d, v16.2d, alphaV0_I + fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow1, pCRow1, LDC @@ -623,8 +651,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.2d, v5.2d}, [pCRow1] fmla v4.2d, v20.2d, alphaV0_R fmls v4.2d, v21.2d, alphaV0_I - fmla v5.2d, v20.2d, alphaV1_I - fmla v5.2d, v21.2d, alphaV1_R + fmla v5.2d, v20.2d, alphaV0_I + fmla v5.2d, v21.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow1] add pCRow1, pCRow1, LDC @@ -632,8 +660,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.2d, v1.2d}, [pCRow1] fmla v0.2d, v24.2d, alphaV0_R fmls v0.2d, v25.2d, alphaV0_I - fmla v1.2d, v24.2d, alphaV1_I - fmla v1.2d, v25.2d, alphaV1_R + fmla v1.2d, v24.2d, alphaV0_I + fmla v1.2d, v25.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow1, pCRow1, LDC @@ -641,8 +669,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.2d, v5.2d}, [pCRow1] fmla v4.2d, v28.2d, alphaV0_R fmls v4.2d, v29.2d, alphaV0_I - fmla v5.2d, v28.2d, alphaV1_I - fmla v5.2d, v29.2d, alphaV1_R + fmla v5.2d, v28.2d, alphaV0_I + fmla v5.2d, v29.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow1] add pCRow0, pCRow0, #32 @@ -691,18 +719,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x4 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.d, v1.d}[0], [pCRow1] fmla d0, d16, alphaV0_R fmls d0, d17, alphaV0_I - fmla d1, d16, alphaV1_I - fmla d1, d17, alphaV1_R + fmla d1, d16, alphaV0_I + fmla d1, d17, alphaV0_R st2 {v0.d, v1.d}[0], [pCRow1] add pCRow1, pCRow1, LDC @@ -710,8 +736,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.d, v5.d}[0], [pCRow1] fmla d4, d20, alphaV0_R fmls d4, d21, alphaV0_I - fmla d5, d20, alphaV1_I - fmla d5, d21, alphaV1_R + fmla d5, d20, alphaV0_I + fmla d5, d21, alphaV0_R st2 {v4.d, v5.d}[0], [pCRow1] add pCRow1, pCRow1, LDC @@ -719,8 +745,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v0.d, v1.d}[0], [pCRow1] fmla d0, d24, alphaV0_R fmls d0, d25, alphaV0_I - fmla d1, d24, alphaV1_I - fmla d1, d25, alphaV1_R + fmla d1, d24, alphaV0_I + fmla d1, d25, alphaV0_R st2 {v0.d, v1.d}[0], [pCRow1] add pCRow1, pCRow1, LDC @@ -728,8 +754,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.d, v5.d}[0], [pCRow1] fmla d4, d28, alphaV0_R fmls d4, d29, alphaV0_I - fmla d5, d28, alphaV1_I - fmla d5, d29, alphaV1_R + fmla d5, d28, alphaV0_I + fmla d5, d29, alphaV0_R st2 {v4.d, v5.d}[0], [pCRow1] add pCRow0, pCRow0, #16 @@ -778,25 +804,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x2 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.2d, v1.2d}, [pCRow1] fmla v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I - fmla v1.2d, v16.2d, alphaV1_I - fmla v1.2d, v17.2d, alphaV1_R + fmla v1.2d, v16.2d, alphaV0_I + fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow2, pCRow1, #32 ld2 {v2.2d, v3.2d}, [pCRow2] fmla v2.2d, v18.2d, alphaV0_R fmls v2.2d, v19.2d, alphaV0_I - fmla v3.2d, v18.2d, alphaV1_I - fmla v3.2d, v19.2d, alphaV1_R + fmla v3.2d, v18.2d, alphaV0_I + fmla v3.2d, v19.2d, alphaV0_R st2 {v2.2d, v3.2d}, [pCRow2] add pCRow1, pCRow1, LDC @@ -804,15 +828,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.2d, v5.2d}, [pCRow1] fmla v4.2d, v20.2d, alphaV0_R fmls v4.2d, v21.2d, alphaV0_I - fmla v5.2d, v20.2d, alphaV1_I - fmla v5.2d, v21.2d, alphaV1_R + fmla v5.2d, v20.2d, alphaV0_I + fmla v5.2d, v21.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow1] add pCRow2, pCRow1, #32 ld2 {v6.2d, v7.2d}, [pCRow2] fmla v6.2d, v22.2d, alphaV0_R fmls v6.2d, v23.2d, alphaV0_I - fmla v7.2d, v22.2d, alphaV1_I - fmla v7.2d, v23.2d, alphaV1_R + fmla v7.2d, v22.2d, alphaV0_I + fmla v7.2d, v23.2d, alphaV0_R st2 {v6.2d, v7.2d}, [pCRow2] add pCRow0, pCRow0, #64 @@ -845,18 +869,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x2 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.2d, v1.2d}, [pCRow1] fmla v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I - fmla v1.2d, v16.2d, alphaV1_I - fmla v1.2d, v17.2d, alphaV1_R + fmla v1.2d, v16.2d, alphaV0_I + fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow1, pCRow1, LDC @@ -864,8 +886,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.2d, v5.2d}, [pCRow1] fmla v4.2d, v20.2d, alphaV0_R fmls v4.2d, v21.2d, alphaV0_I - fmla v5.2d, v20.2d, alphaV1_I - fmla v5.2d, v21.2d, alphaV1_R + fmla v5.2d, v20.2d, alphaV0_I + fmla v5.2d, v21.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow1] add pCRow0, pCRow0, #32 @@ -898,18 +920,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro SAVE1x2 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.d, v1.d}[0], [pCRow1] fmla d0, d16, alphaV0_R fmls d0, d17, alphaV0_I - fmla d1, d16, alphaV1_I - fmla d1, d17, alphaV1_R + fmla d1, d16, alphaV0_I + fmla d1, d17, alphaV0_R st2 {v0.d, v1.d}[0], [pCRow1] add pCRow1, pCRow1, LDC @@ -917,8 +937,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.d, v5.d}[0], [pCRow1] fmla d4, d20, alphaV0_R fmls d4, d21, alphaV0_I - fmla d5, d20, alphaV1_I - fmla d5, d21, alphaV1_R + fmla d5, d20, alphaV0_I + fmla d5, d21, alphaV0_R st2 {v4.d, v5.d}[0], [pCRow1] add pCRow0, pCRow0, #16 @@ -953,25 +973,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x1 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.2d, v1.2d}, [pCRow1] fmla v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I - fmla v1.2d, v16.2d, alphaV1_I - fmla v1.2d, v17.2d, alphaV1_R + fmla v1.2d, v16.2d, alphaV0_I + fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow2, pCRow1, #32 ld2 {v2.2d, v3.2d}, [pCRow2] fmla v2.2d, v18.2d, alphaV0_R fmls v2.2d, v19.2d, alphaV0_I - fmla v3.2d, v18.2d, alphaV1_I - fmla v3.2d, v19.2d, alphaV1_R + fmla v3.2d, v18.2d, alphaV0_I + fmla v3.2d, v19.2d, alphaV0_R st2 {v2.2d, v3.2d}, [pCRow2] add pCRow0, pCRow0, #64 @@ -997,18 +1015,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x1 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.2d, v1.2d}, [pCRow1] fmla v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I - fmla v1.2d, v16.2d, alphaV1_I - fmla v1.2d, v17.2d, alphaV1_R + fmla v1.2d, v16.2d, alphaV0_I + fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow0, pCRow0, #32 @@ -1035,18 +1051,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x1 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.d, v1.d}[0], [pCRow1] fmla d0, d16, alphaV0_R fmls d0, d17, alphaV0_I - fmla d1, d16, alphaV1_I - fmla d1, d17, alphaV1_R + fmla d1, d16, alphaV0_I + fmla d1, d17, alphaV0_R st2 {v0.d, v1.d}[0], [pCRow1] add pCRow0, pCRow0, #16 @@ -1072,8 +1086,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] - fmov alpha_save_R, d0 - fmov alpha_save_I, d1 + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alphaR, d0 + fmov alphaI, d1 lsl LDC, LDC, #4 // ldc = ldc * 2 * 8 @@ -1085,8 +1102,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ble zgemm_kernel_L2_BEGIN zgemm_kernel_L4_BEGIN: - mov pCRow0, pC // pCRow0 = C - add pC, pC, LDC, lsl #2 + mov pCRow0, pC + add pCRow1, pCRow0, LDC + add pCRow2, pCRow1, LDC + add pCRow3, pCRow2, LDC + + add pC, pCRow3, LDC + mov pA, origPA // pA = start of A array zgemm_kernel_L4_M4_BEGIN: @@ -1096,42 +1118,68 @@ zgemm_kernel_L4_M4_BEGIN: cmp counterI, #0 ble zgemm_kernel_L4_M2_BEGIN + .align 5 zgemm_kernel_L4_M4_20: mov pB, origPB - asr counterL , origK, #1 // L = K / 2 - cmp counterL , #2 // is there at least 4 to do? + asr counterL , origK, #3 + cmp counterL , #2 blt zgemm_kernel_L4_M4_32 - KERNEL4x4_I // do one in the K - KERNEL4x4_M2 // do another in the K + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 subs counterL, counterL, #2 // subtract 2 ble zgemm_kernel_L4_M4_22a - .align 5 + .align 5 zgemm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 subs counterL, counterL, #1 bgt zgemm_kernel_L4_M4_22 - + .align 5 zgemm_kernel_L4_M4_22a: + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_E b zgemm_kernel_L4_M4_44 + .align 5 zgemm_kernel_L4_M4_32: tst counterL, #1 ble zgemm_kernel_L4_M4_40 KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 KERNEL4x4_E b zgemm_kernel_L4_M4_44 @@ -1143,13 +1191,20 @@ zgemm_kernel_L4_M4_40: zgemm_kernel_L4_M4_44: - ands counterL , origK, #1 + ands counterL , origK, #7 ble zgemm_kernel_L4_M4_100 + .align 5 zgemm_kernel_L4_M4_46: KERNEL4x4_SUB + subs counterL, counterL, #1 + bne zgemm_kernel_L4_M4_46 + zgemm_kernel_L4_M4_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] SAVE4x4 diff --git a/kernel/arm64/zgemv_n.S b/kernel/arm64/zgemv_n.S index 9e285e2995..a28d1b0cee 100644 --- a/kernel/arm64/zgemv_n.S +++ b/kernel/arm64/zgemv_n.S @@ -43,6 +43,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define Y_OPTR x13 /* loop Y vector address */ #define X_PTR x14 /* loop X vector address */ +#define A_PRE_SIZE 768 +#define Y_PRE_SIZE 768 + /******************************************************************************* * Macro definitions *******************************************************************************/ @@ -50,14 +53,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define ALPHA_R s0 #define ALPHA_I s1 -#define ALPHA_R_COPY s7 -#define ALPHA_I_COPY s8 #define SHZ 3 #else #define ALPHA_R d0 #define ALPHA_I d1 -#define ALPHA_R_COPY d7 -#define ALPHA_I_COPY d8 #define SHZ 4 #endif @@ -95,20 +94,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT - /********** INIT FOR F4 LOOP **********/ - fmov ALPHA_R_COPY, ALPHA_R - fmov ALPHA_I_COPY, ALPHA_I -#if !defined(DOUBLE) - ins v7.s[1], v7.s[0] // R(ALPHA), R(ALPHA) - ins v8.s[1], v8.s[0] // I(ALPHA), I(ALPHA) - ins v7.d[1], v7.d[0] - ins v8.d[1], v8.d[0] -#else - ins v7.d[1], v7.d[0] // R(ALPHA), R(ALPHA) - ins v8.d[1], v8.d[0] // I(ALPHA), I(ALPHA) -#endif - - /******* INIT FOR F1 AND S1 LOOP ******/ #if !defined(DOUBLE) ins v0.s[1], v0.s[0] // R(ALPHA), R(ALPHA) eor v2.16b, v2.16b, v2.16b @@ -129,47 +114,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro INIT_LOOP - /********** INIT_LOOP FOR F4 LOOP **********/ #if !defined(DOUBLE) - ld1 {v9.2s}, [X_PTR] // [I(X), R(X)] - ins v10.s[0], v9.s[1] - ins v9.s[1], v9.s[0] // [R(X), R(X)] - ins v10.s[1], v10.s[0] // [I(X), I(X)] - ins v9.d[1], v9.d[0] - ins v10.d[1], v10.d[0] + ld1 {v2.2s}, [X_PTR] // [I(X), R(X)] + ext v3.8b, v2.8b, v2.8b, #4 // [R(X), I(X)] + fmul v2.2s, v0.2s, v2.2s + fmla v2.2s, v1.2s, v3.2s // [I(TEMP), R(TEMP)] + ins v3.s[0], v2.s[1] + + /********** INIT_LOOP FOR F4 LOOP **********/ #if !defined(CONJ) #if !defined(XCONJ) - fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)] - fmls v11.4s, v10.4s, v8.4s // [- I(X) * I(ALPHA)] - fmul v12.4s, v9.4s, v8.4s // [+ R(X) * I(ALPHA)] - fmla v12.4s, v10.4s, v7.4s // [+ I(X) * R(ALPHA)] + dup v21.4s, v2.s[0] // R[TEMP] + dup v22.4s, v2.s[0] // R[TEMP] + eor v25.16b, v25.16b, v25.16b + fsub s25, s25, s3 + dup v23.4s, v25.s[0] // -I[TEMP] + dup v24.4s, v3.s[0] // I[TEMP] #else - fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)] - fmla v11.4s, v10.4s, v8.4s // [+ I(X) * I(ALPHA)] - fmul v12.4s, v9.4s, v8.4s // [+ R(X) * I(ALPHA)] - fmls v12.4s, v10.4s, v7.4s // [- I(X) * R(ALPHA)] + dup v21.4s, v2.s[0] // R[TEMP] + dup v22.4s, v2.s[0] // R[TEMP] + dup v23.4s, v3.s[0] // I[TEMP] + eor v25.16b, v25.16b, v25.16b + fsub s25, s25, s3 + dup v24.4s, v25.s[0] // -I[TEMP] #endif #else // CONJ #if !defined(XCONJ) - fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)] - fmls v11.4s, v10.4s, v8.4s // [+ I(X) * I(ALPHA)] - fmul v12.4s, v10.4s, v7.4s // [+ I(X) * R(ALPHA)] - fmls v12.4s, v9.4s, v8.4s // [- R(X) * I(ALPHA)] + dup v21.4s, v2.s[0] // R[TEMP] + eor v25.16b, v25.16b, v25.16b + fsub s25, s25, s2 + dup v22.4s, v25.s[0] // R[TEMP] + dup v23.4s, v3.s[0] // I[TEMP] + dup v24.4s, v3.s[0] // I[TEMP] #else - fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)] - fmls v11.4s, v10.4s, v8.4s // [- I(X) * I(ALPHA)] - eor v12.16b, v12.16b, v12.16b - fmls v12.4s, v9.4s, v8.4s // [- R(X) * I(ALPHA)] - fmla v12.4s, v10.4s, v7.4s // [- I(X) * R(ALPHA)] + dup v21.4s, v2.s[0] // R[TEMP] + eor v25.16b, v25.16b, v25.16b + fsub s25, s25, s2 + dup v22.4s, v25.s[0] // R[TEMP] + + eor v25.16b, v25.16b, v25.16b + fsub s25, s25, s3 + dup v23.4s, v25.s[0] // I[TEMP] + dup v24.4s, v25.s[0] // I[TEMP] #endif #endif // CONJ + /****** INIT_LOOP FOR F1 AND S1 LOOP ******/ - ld1 {v2.2s}, [X_PTR] // [I(X), R(X)] - ext v3.8b, v2.8b, v2.8b, #4 // [R(X), I(X)] - fmul v2.2s, v0.2s, v2.2s - fmla v2.2s, v1.2s, v3.2s // [I(TEMP), R(TEMP)] - ins v3.s[0], v2.s[1] #if !defined(CONJ) #if !defined(XCONJ) eor v4.16b, v4.16b, v4.16b @@ -200,45 +191,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif // CONJ #else // DOUBLE + ld1 {v2.2d}, [X_PTR] // [I(X), R(X)] + ext v3.16b, v2.16b, v2.16b, #8 // [R(X), I(X)] + fmul v2.2d, v0.2d, v2.2d + fmla v2.2d, v1.2d, v3.2d // [I(TEMP), R(TEMP)] + ins v3.d[0], v2.d[1] // I(TEMP) - /********** INIT_LOOP FOR F4 LOOP **********/ - ld1 {v9.2d}, [X_PTR] // [I(X), R(X)] - ins v10.d[0], v9.d[1] - ins v9.d[1], v9.d[0] // [R(X), R(X)] - ins v10.d[1], v10.d[0] // [I(X), I(X)] + /****** INIT_LOOP FOR F4 LOOP ******/ #if !defined(CONJ) #if !defined(XCONJ) - fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)] - fmls v11.2d, v10.2d, v8.2d // [- I(X) * I(ALPHA)] - fmul v12.2d, v9.2d, v8.2d // [+ R(X) * I(ALPHA)] - fmla v12.2d, v10.2d, v7.2d // [+ I(X) * R(ALPHA)] + dup v21.2d, v2.d[0] // R[TEMP] + dup v22.2d, v2.d[0] // R[TEMP] + eor v25.16b, v25.16b, v25.16b + fsub d25, d25, d3 + dup v23.2d, v25.d[0] // -I[TEMP] + dup v24.2d, v3.d[0] // I[TEMP] #else - fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)] - fmla v11.2d, v10.2d, v8.2d // [+ I(X) * I(ALPHA)] - fmul v12.2d, v9.2d, v8.2d // [+ R(X) * I(ALPHA)] - fmls v12.2d, v10.2d, v7.2d // [- I(X) * R(ALPHA)] + dup v21.2d, v2.d[0] // R[TEMP] + dup v22.2d, v2.d[0] // R[TEMP] + dup v23.2d, v3.d[0] // I[TEMP] + eor v25.16b, v25.16b, v25.16b + fsub d25, d25, d3 + dup v24.2d, v25.d[0] // -I[TEMP] #endif #else // CONJ #if !defined(XCONJ) - fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)] - fmls v11.2d, v10.2d, v8.2d // [+ I(X) * I(ALPHA)] - fmul v12.2d, v10.2d, v7.2d // [+ I(X) * R(ALPHA)] - fmls v12.2d, v9.2d, v8.2d // [- R(X) * I(ALPHA)] + dup v21.2d, v2.d[0] // R[TEMP] + eor v25.16b, v25.16b, v25.16b + fsub d25, d25, d2 + dup v22.2d, v25.d[0] // R[TEMP] + dup v23.2d, v3.d[0] // I[TEMP] + dup v24.2d, v3.d[0] // I[TEMP] #else - fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)] - fmls v11.2d, v10.2d, v8.2d // [- I(X) * I(ALPHA)] - eor v12.16b, v12.16b, v12.16b - fmls v12.2d, v9.2d, v8.2d // [- R(X) * I(ALPHA)] - fmla v12.2d, v10.2d, v7.2d // [- I(X) * R(ALPHA)] + dup v21.2d, v2.d[0] // R[TEMP] + eor v25.16b, v25.16b, v25.16b + fsub d25, d25, d2 + dup v22.2d, v25.d[0] // R[TEMP] + + eor v25.16b, v25.16b, v25.16b + fsub d25, d25, d3 + dup v23.2d, v25.d[0] // I[TEMP] + dup v24.2d, v25.d[0] // I[TEMP] #endif #endif // CONJ + /****** INIT_LOOP FOR F1 AND S1 LOOP ******/ - ld1 {v2.2d}, [X_PTR] // [I(X), R(X)] - ext v3.16b, v2.16b, v2.16b, #8 // [R(X), I(X)] - fmul v2.2d, v0.2d, v2.2d - fmla v2.2d, v1.2d, v3.2d // [I(TEMP), R(TEMP)] - ins v3.d[0], v2.d[1] // I(TEMP) #if !defined(CONJ) #if !defined(XCONJ) eor v4.16b, v4.16b, v4.16b @@ -276,91 +274,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
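Both the single- and double-precision INIT_LOOP now compute temp = alpha * x[j] once per column and then broadcast R(temp) and a pre-negated copy of I(temp) (or of R(temp), in the CONJ builds) into v21..v24, so the inner loop no longer needs per-variant #if branches: the conjugation is folded into the signs of the broadcast constants. A scalar sketch of that idea (illustrative names; the XCONJ variants similarly only change a sign when temp itself is formed):

    /* Fold the conjugation variant into broadcast constants so the inner
       update is the same four multiply-adds in every case (cf. v21..v24). */
    typedef struct { float r21, r22, i23, i24; } bcast_t;

    static bcast_t make_broadcast(float temp_r, float temp_i, int conj_a)
    {
        bcast_t b;
        if (!conj_a) {                    /* y += temp * a       */
            b.r21 = temp_r;  b.r22 = temp_r;
            b.i23 = -temp_i; b.i24 = temp_i;
        } else {                          /* y += temp * conj(a) */
            b.r21 = temp_r;  b.r22 = -temp_r;
            b.i23 = temp_i;  b.i24 = temp_i;
        }
        return b;
    }

    /* One element of what KERNEL_F4 then does unconditionally: */
    static void update(const bcast_t *b, float a_r, float a_i,
                       float *y_r, float *y_i)
    {
        *y_r += b->r21 * a_r + b->i23 * a_i;
        *y_i += b->r22 * a_i + b->i24 * a_r;
    }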
ld2 {v13.4s, v14.4s}, [A_PTR], #32 ld2 {v15.4s, v16.4s}, [Y_IPTR], #32 -#if !defined(CONJ) -#if !defined(XCONJ) - fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R] - fmls v15.4s, v12.4s, v14.4s // [- I(ALPHA * X) * A_I] - fmla v16.4s, v11.4s, v14.4s // [+ R(ALPHA * X) * A_I] - fmla v16.4s, v12.4s, v13.4s // [+ I(ALPHA * X) * A_R] -#else - fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R] - fmla v15.4s, v12.4s, v14.4s // [+ I(ALPHA * X) * A_I] - fmla v16.4s, v11.4s, v14.4s // [+ R(ALPHA * X) * A_I] - fmls v16.4s, v12.4s, v13.4s // [- I(ALPHA * X) * A_R] -#endif -#else // CONJ -#if !defined(XCONJ) - fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R] - fmla v15.4s, v12.4s, v14.4s // [+ I(ALPHA * X) * A_I] - fmls v16.4s, v11.4s, v14.4s // [- R(ALPHA * X) * A_I] - fmla v16.4s, v12.4s, v13.4s // [+ I(ALPHA * X) * A_R] -#else - fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R] - fmls v15.4s, v12.4s, v14.4s // [- I(ALPHA * X) * A_I] - fmls v16.4s, v11.4s, v14.4s // [- R(ALPHA * X) * A_I] - fmls v16.4s, v12.4s, v13.4s // [- I(ALPHA * X) * A_R] -#endif -#endif // CONJ + + prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE] + prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE] + + fmla v15.4s, v21.4s, v13.4s + fmla v15.4s, v23.4s, v14.4s + fmla v16.4s, v22.4s, v14.4s + fmla v16.4s, v24.4s, v13.4s + st2 {v15.4s, v16.4s}, [Y_OPTR], #32 #else // DOUBLE ld2 {v13.2d, v14.2d}, [A_PTR], #32 ld2 {v15.2d, v16.2d}, [Y_IPTR], #32 -#if !defined(CONJ) -#if !defined(XCONJ) - fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R] - fmls v15.2d, v12.2d, v14.2d // [- I(ALPHA * X) * A_I] - fmla v16.2d, v11.2d, v14.2d // [+ R(ALPHA * X) * A_I] - fmla v16.2d, v12.2d, v13.2d // [+ I(ALPHA * X) * A_R] -#else - fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R] - fmla v15.2d, v12.2d, v14.2d // [+ I(ALPHA * X) * A_I] - fmla v16.2d, v11.2d, v14.2d // [+ R(ALPHA * X) * A_I] - fmls v16.2d, v12.2d, v13.2d // [- I(ALPHA * X) * A_R] -#endif -#else // CONJ -#if !defined(XCONJ) - fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R] - fmla v15.2d, v12.2d, v14.2d // [+ I(ALPHA * X) * A_I] - fmls v16.2d, v11.2d, v14.2d // [- R(ALPHA * X) * A_I] - fmla v16.2d, v12.2d, v13.2d // [+ I(ALPHA * X) * A_R] -#else - fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R] - fmls v15.2d, v12.2d, v14.2d // [- I(ALPHA * X) * A_I] - fmls v16.2d, v11.2d, v14.2d // [- R(ALPHA * X) * A_I] - fmls v16.2d, v12.2d, v13.2d // [- I(ALPHA * X) * A_R] -#endif -#endif // CONJ + prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE] + + fmla v15.2d, v21.2d, v13.2d + fmla v15.2d, v23.2d, v14.2d + fmla v16.2d, v22.2d, v14.2d + fmla v16.2d, v24.2d, v13.2d + st2 {v15.2d, v16.2d}, [Y_OPTR], #32 ld2 {v17.2d, v18.2d}, [A_PTR], #32 ld2 {v19.2d, v20.2d}, [Y_IPTR], #32 -#if !defined(CONJ) -#if !defined(XCONJ) - fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R] - fmls v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I] - fmla v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I] - fmla v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R] -#else - fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R] - fmla v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I] - fmla v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I] - fmls v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R] -#endif -#else // CONJ -#if !defined(XCONJ) - fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R] - fmla v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I] - fmls v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I] - fmla v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R] -#else - fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R] - fmls v19.2d, 
v12.2d, v18.2d // [- I(ALPHA * X) * A_I] - fmls v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I] - fmls v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R] -#endif -#endif // CONJ + prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE] + + fmla v19.2d, v21.2d, v17.2d + fmla v19.2d, v23.2d, v18.2d + fmla v20.2d, v22.2d, v18.2d + fmla v20.2d, v24.2d, v17.2d + st2 {v19.2d, v20.2d}, [Y_OPTR], #32 #endif @@ -445,10 +391,7 @@ zgemv_n_kernel_F_LOOP: zgemv_n_kernel_F4: - KERNEL_F1 - KERNEL_F1 - KERNEL_F1 - KERNEL_F1 + KERNEL_F4 subs I, I, #1 bne zgemv_n_kernel_F4 diff --git a/kernel/arm64/zgemv_t.S b/kernel/arm64/zgemv_t.S index e61c171520..79ce9bcf28 100644 --- a/kernel/arm64/zgemv_t.S +++ b/kernel/arm64/zgemv_t.S @@ -41,6 +41,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define J x11 /* loop variable */ #define I x12 /* loop variable */ +#define A_PRE_SIZE 768 +#define X_PRE_SIZE 768 + /******************************************************************************* * Macro definitions *******************************************************************************/ @@ -139,6 +142,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v11.4s, v12.4s}, [X_PTR], #32 ld2 {v13.4s, v14.4s}, [A_PTR], #32 + prfm PLDL1STRM, [X_PTR, #X_PRE_SIZE] + prfm PLDL1STRM, [A_PTR, #A_PRE_SIZE] #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R] @@ -155,7 +160,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else // DOUBLE ld2 {v11.2d, v12.2d}, [X_PTR], #32 ld2 {v13.2d, v14.2d}, [A_PTR], #32 - prfm PLDL1STRM, [X_PTR, #512] + prfm PLDL1STRM, [X_PTR, #X_PRE_SIZE] #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R] @@ -171,7 +176,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v17.2d, v18.2d}, [X_PTR], #32 ld2 {v19.2d, v20.2d}, [A_PTR], #32 - prfm PLDL1STRM, [A_PTR, #512] + prfm PLDL1STRM, [A_PTR, #A_PRE_SIZE] #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R] diff --git a/kernel/arm64/ztrmm_kernel_4x4.S b/kernel/arm64/ztrmm_kernel_4x4.S index 7945870d67..77a7857ffe 100644 --- a/kernel/arm64/ztrmm_kernel_4x4.S +++ b/kernel/arm64/ztrmm_kernel_4x4.S @@ -46,23 +46,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define pCRow0 x12 #define pCRow1 x13 #define pCRow2 x14 -#define pA x15 -#define alpha_save_R x16 -#define alpha_save_I x17 -#define temp x18 -#define tempOffset x19 -#define tempK x20 +#define pCRow3 x15 +#define pA x16 +#define alphaR x17 +#define alphaI x18 +#define temp x19 +#define tempOffset x20 +#define tempK x21 #define alpha0_R d10 #define alphaV0_R v10.d[0] #define alpha0_I d11 #define alphaV0_I v11.d[0] -#define alpha1_R d14 -#define alphaV1_R v14.d[0] -#define alpha1_I d15 -#define alphaV1_I v15.d[0] - +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 448 +#define C_PRE_SIZE 128 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define OP_rr fmla @@ -93,7 +92,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 04 origPB // 05 pC // 06 origLDC -> LDC -// 07 offset +// 07 offset -> temp // 08 counterL // 09 counterI // 10 counterJ @@ -101,13 +100,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
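zgemv_t and ztrmm follow the same pattern used throughout this patch: the hard-coded prefetch offsets (#512) are replaced by named per-kernel constants (A_PRE_SIZE, X_PRE_SIZE, B_PRE_SIZE, C_PRE_SIZE), i.e. how many bytes ahead of each streaming pointer to prefetch, presumably tuned per core. The same idea in plain C with the GCC/Clang builtin (a sketch; the distances are simply the values from the diff, not derived here):

    /* Prefetch a fixed byte distance ahead of two streamed operands. */
    #define A_PRE_SIZE 768   /* 12 cache lines of 64 B ahead of A */
    #define X_PRE_SIZE 768

    static double dot_stream(const double *a, const double *x, long n)
    {
        double s = 0.0;
        for (long i = 0; i < n; i++) {
            __builtin_prefetch((const char *)(a + i) + A_PRE_SIZE, 0, 3);  /* ~PLDL1KEEP */
            __builtin_prefetch((const char *)(x + i) + X_PRE_SIZE, 0, 0);  /* ~PLDL1STRM */
            s += a[i] * x[i];
        }
        return s;
    }

The assembly issues one prfm per unrolled block rather than per element; the per-element prefetch above is only to keep the sketch short.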
// 12 pCRow0 // 13 pCRow1 // 14 pCRow2 -// 15 pA -// 16 alpha_save_R -// 17 alpha_save_I -// 18 must save temp -// 19 must save tempOffset -// 20 must save tempK -// 21 must save +// 15 pCRow3 +// 16 pA +// 17 alpha_save_R +// 18 must save alpha_save_I +// 19 must save temp +// 20 must save tempOffset +// 21 must save tempK // 22 must save // 23 must save // 24 must save @@ -178,12 +177,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_I ld2 {v8.2d, v9.2d}, [pB] add pB, pB, #32 - ld2 {v10.2d, v11.2d}, [pB] - add pB, pB, #32 ld2 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - ld2 {v2.2d, v3.2d}, [pA] - add pA, pA, #32 fmul v16.2d, v0.2d, v8.d[0] OP_ii v16.2d, v1.2d, v9.d[0] @@ -196,16 +191,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v17.2d, v1.2d, v8.d[0] - fmul v18.2d, v2.2d, v8.d[0] - OP_ii v18.2d, v3.2d, v9.d[0] -#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ - defined(RR) || defined(RC) || defined(CR) || defined(CC) - eor v19.16b, v19.16b, v19.16b - fmls v19.2d, v2.2d, v9.d[0] -#else - fmul v19.2d, v2.2d, v9.d[0] -#endif - OP_ir v19.2d, v3.2d, v8.d[0] + ld2 {v2.2d, v3.2d}, [pA] + add pA, pA, #32 fmul v20.2d, v0.2d, v8.d[1] OP_ii v20.2d, v1.2d, v9.d[1] @@ -218,6 +205,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v21.2d, v1.2d, v8.d[1] + ld2 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 + fmul v22.2d, v2.2d, v8.d[1] OP_ii v22.2d, v3.2d, v9.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -229,6 +219,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v23.2d, v3.2d, v8.d[1] + ld2 {v12.2d, v13.2d}, [pB] + add pB, pB, #32 + + fmul v18.2d, v2.2d, v8.d[0] + OP_ii v18.2d, v3.2d, v9.d[0] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v19.16b, v19.16b, v19.16b + fmls v19.2d, v2.2d, v9.d[0] +#else + fmul v19.2d, v2.2d, v9.d[0] +#endif + OP_ir v19.2d, v3.2d, v8.d[0] + + ld2 {v4.2d, v5.2d} , [pA] + add pA, pA, #32 + fmul v24.2d, v0.2d, v10.d[0] OP_ii v24.2d, v1.2d, v11.d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -240,6 +247,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v25.2d, v1.2d, v10.d[0] + ld2 {v6.2d, v7.2d} , [pA] + add pA, pA, #32 + fmul v26.2d, v2.2d, v10.d[0] OP_ii v26.2d, v3.2d, v11.d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -251,6 +261,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v27.2d, v3.2d, v10.d[0] + ld2 {v14.2d, v15.2d}, [pB] + add pB, pB, #32 + fmul v28.2d, v0.2d, v10.d[1] OP_ii v28.2d, v1.2d, v11.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -262,6 +275,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v29.2d, v1.2d, v10.d[1] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmul v30.2d, v2.2d, v10.d[1] OP_ii v30.2d, v3.2d, v11.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -273,14 +288,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif OP_ir v31.2d, v3.2d, v10.d[1] - ld2 {v12.2d, v13.2d}, [pB] - add pB, pB, #32 - ld2 {v14.2d, v15.2d}, [pB] - add pB, pB, #32 - ld2 {v4.2d, v5.2d} , [pA] - add pA, pA, #32 - ld2 {v6.2d, v7.2d} , [pA] - add pA, pA, #32 + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] .endm .macro KERNEL4x4_M1 @@ -289,7 +297,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v17.2d, v0.2d, v9.d[0] OP_ir v17.2d, v1.2d, v8.d[0] - ld2 {v12.2d, v13.2d}, [pB] // For next round + ld2 {v12.2d, v13.2d}, [pB] add pB, pB, #32 OP_rr v18.2d, v2.2d, v8.d[0] @@ -297,15 +305,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v19.2d, v2.2d, v9.d[0] OP_ir v19.2d, v3.2d, v8.d[0] - ld2 {v14.2d, v15.2d}, [pB] // For next round - add pB, pB, #32 + ld2 {v4.2d, v5.2d} , [pA] + add pA, pA, #32 OP_rr v20.2d, v0.2d, v8.d[1] OP_ii v20.2d, v1.2d, v9.d[1] OP_ri v21.2d, v0.2d, v9.d[1] OP_ir v21.2d, v1.2d, v8.d[1] - ld2 {v4.2d, v5.2d} , [pA] // For next round + ld2 {v6.2d, v7.2d} , [pA] add pA, pA, #32 OP_rr v22.2d, v2.2d, v8.d[1] @@ -313,22 +321,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v23.2d, v2.2d, v9.d[1] OP_ir v23.2d, v3.2d, v8.d[1] - ld2 {v6.2d, v7.2d} , [pA] // For next round - add pA, pA, #32 + ld2 {v14.2d, v15.2d}, [pB] + add pB, pB, #32 OP_rr v24.2d, v0.2d, v10.d[0] OP_ii v24.2d, v1.2d, v11.d[0] OP_ri v25.2d, v0.2d, v11.d[0] OP_ir v25.2d, v1.2d, v10.d[0] - prfm PLDL1KEEP, [pA, #512] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] OP_rr v26.2d, v2.2d, v10.d[0] OP_ii v26.2d, v3.2d, v11.d[0] OP_ri v27.2d, v2.2d, v11.d[0] OP_ir v27.2d, v3.2d, v10.d[0] - prfm PLDL1KEEP, [pB, #512] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] OP_rr v28.2d, v0.2d, v10.d[1] OP_ii v28.2d, v1.2d, v11.d[1] @@ -347,7 +355,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v17.2d, v4.2d, v13.d[0] OP_ir v17.2d, v5.2d, v12.d[0] - ld2 {v8.2d, v9.2d}, [pB] // For next round + ld2 {v8.2d, v9.2d}, [pB] add pB, pB, #32 OP_rr v18.2d, v6.2d, v12.d[0] @@ -355,15 +363,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v19.2d, v6.2d, v13.d[0] OP_ir v19.2d, v7.2d, v12.d[0] - ld2 {v10.2d, v11.2d}, [pB] // For next round - add pB, pB, #32 + ld2 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 OP_rr v20.2d, v4.2d, v12.d[1] OP_ii v20.2d, v5.2d, v13.d[1] OP_ri v21.2d, v4.2d, v13.d[1] OP_ir v21.2d, v5.2d, v12.d[1] - ld2 {v0.2d, v1.2d}, [pA] // For next round + ld2 {v2.2d, v3.2d}, [pA] add pA, pA, #32 OP_rr v22.2d, v6.2d, v12.d[1] @@ -371,22 +379,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v23.2d, v6.2d, v13.d[1] OP_ir v23.2d, v7.2d, v12.d[1] - ld2 {v2.2d, v3.2d}, [pA] // For next round - add pA, pA, #32 + ld2 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 OP_rr v24.2d, v4.2d, v14.d[0] OP_ii v24.2d, v5.2d, v15.d[0] OP_ri v25.2d, v4.2d, v15.d[0] OP_ir v25.2d, v5.2d, v14.d[0] - prfm PLDL1KEEP, [pA, #512] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] OP_rr v26.2d, v6.2d, v14.d[0] OP_ii v26.2d, v7.2d, v15.d[0] OP_ri v27.2d, v6.2d, v15.d[0] OP_ir v27.2d, v7.2d, v14.d[0] - prfm PLDL1KEEP, [pB, #512] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] OP_rr v28.2d, v4.2d, v14.d[1] OP_ii v28.2d, v5.2d, v15.d[1] @@ -415,6 +423,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
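The reordering inside KERNEL4x4_I/M1/M2 is classic software pipelining: the ld2 loads that feed the next iteration are spread out between the fmla groups of the current one instead of being queued at the end, and the prefetches now walk pA and pB with the named distances. Reduced to scalar C, the structure looks roughly like this (illustrative only; the real rotation alternates between the _M1 and _M2 register sets):

    /* Rotated loop: issue the next load early so it overlaps current math. */
    static double pipelined_sum_sq(const double *a, long n)
    {
        if (n <= 0) return 0.0;
        double cur = a[0];            /* KERNEL4x4_I: first operands loaded up front */
        double s = 0.0;
        for (long i = 1; i < n; i++) {
            double next = a[i];       /* load for iteration i, issued early          */
            s += cur * cur;           /* arithmetic for iteration i-1 overlaps it    */
            cur = next;
        }
        s += cur * cur;               /* KERNEL4x4_E: drain the last operands        */
        return s;
    }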
OP_ri v21.2d, v4.2d, v13.d[1] OP_ir v21.2d, v5.2d, v12.d[1] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + OP_rr v22.2d, v6.2d, v12.d[1] OP_ii v22.2d, v7.2d, v13.d[1] OP_ri v23.2d, v6.2d, v13.d[1] @@ -425,6 +435,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v25.2d, v4.2d, v15.d[0] OP_ir v25.2d, v5.2d, v14.d[0] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] + OP_rr v26.2d, v6.2d, v14.d[0] OP_ii v26.2d, v7.2d, v15.d[0] OP_ri v27.2d, v6.2d, v15.d[0] @@ -444,33 +456,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_SUB ld2 {v8.2d, v9.2d}, [pB] add pB, pB, #32 - ld2 {v10.2d, v11.2d}, [pB] - add pB, pB, #32 + ld2 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - ld2 {v2.2d, v3.2d}, [pA] - add pA, pA, #32 OP_rr v16.2d, v0.2d, v8.d[0] OP_ii v16.2d, v1.2d, v9.d[0] OP_ri v17.2d, v0.2d, v9.d[0] OP_ir v17.2d, v1.2d, v8.d[0] - OP_rr v18.2d, v2.2d, v8.d[0] - OP_ii v18.2d, v3.2d, v9.d[0] - OP_ri v19.2d, v2.2d, v9.d[0] - OP_ir v19.2d, v3.2d, v8.d[0] + ld2 {v2.2d, v3.2d}, [pA] + add pA, pA, #32 OP_rr v20.2d, v0.2d, v8.d[1] OP_ii v20.2d, v1.2d, v9.d[1] OP_ri v21.2d, v0.2d, v9.d[1] OP_ir v21.2d, v1.2d, v8.d[1] + ld2 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 + + OP_rr v18.2d, v2.2d, v8.d[0] + OP_ii v18.2d, v3.2d, v9.d[0] + OP_ri v19.2d, v2.2d, v9.d[0] + OP_ir v19.2d, v3.2d, v8.d[0] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + OP_rr v22.2d, v2.2d, v8.d[1] OP_ii v22.2d, v3.2d, v9.d[1] OP_ri v23.2d, v2.2d, v9.d[1] OP_ir v23.2d, v3.2d, v8.d[1] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + OP_rr v24.2d, v0.2d, v10.d[0] OP_ii v24.2d, v1.2d, v11.d[0] OP_ri v25.2d, v0.2d, v11.d[0] @@ -493,66 +512,77 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x4 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI - mov pCRow1, pCRow0 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] fmul v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I - fmul v1.2d, v16.2d, alphaV1_I - fmla v1.2d, v17.2d, alphaV1_R - st2 {v0.2d, v1.2d}, [pCRow1] - add pCRow2, pCRow1, #32 + fmul v1.2d, v16.2d, alphaV0_I + fmla v1.2d, v17.2d, alphaV0_R + st2 {v0.2d, v1.2d}, [pCRow0] + + add pCRow0, pCRow0, #32 + fmul v2.2d, v18.2d, alphaV0_R fmls v2.2d, v19.2d, alphaV0_I - fmul v3.2d, v18.2d, alphaV1_I - fmla v3.2d, v19.2d, alphaV1_R - st2 {v2.2d, v3.2d}, [pCRow2] + fmul v3.2d, v18.2d, alphaV0_I + fmla v3.2d, v19.2d, alphaV0_R + st2 {v2.2d, v3.2d}, [pCRow0] + + add pCRow0, pCRow0, #32 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] - add pCRow1, pCRow1, LDC fmul v4.2d, v20.2d, alphaV0_R fmls v4.2d, v21.2d, alphaV0_I - fmul v5.2d, v20.2d, alphaV1_I - fmla v5.2d, v21.2d, alphaV1_R + fmul v5.2d, v20.2d, alphaV0_I + fmla v5.2d, v21.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow1] - add pCRow2, pCRow1, #32 + + add pCRow1, pCRow1, #32 + fmul v6.2d, v22.2d, alphaV0_R fmls v6.2d, v23.2d, alphaV0_I - fmul v7.2d, v22.2d, alphaV1_I - fmla v7.2d, v23.2d, alphaV1_R - st2 {v6.2d, v7.2d}, [pCRow2] + fmul v7.2d, v22.2d, alphaV0_I + fmla v7.2d, v23.2d, alphaV0_R + st2 {v6.2d, v7.2d}, [pCRow1] + + add pCRow1, pCRow1, #32 + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] - add pCRow1, pCRow1, LDC fmul v0.2d, v24.2d, alphaV0_R fmls v0.2d, v25.2d, alphaV0_I - fmul v1.2d, v24.2d, alphaV1_I - fmla v1.2d, v25.2d, alphaV1_R - st2 {v0.2d, v1.2d}, [pCRow1] - add pCRow2, pCRow1, #32 + fmul v1.2d, v24.2d, alphaV0_I + fmla v1.2d, v25.2d, alphaV0_R + st2 {v0.2d, v1.2d}, [pCRow2] + + add pCRow2, pCRow2, #32 + fmul v2.2d, v26.2d, 
alphaV0_R fmls v2.2d, v27.2d, alphaV0_I - fmul v3.2d, v26.2d, alphaV1_I - fmla v3.2d, v27.2d, alphaV1_R + fmul v3.2d, v26.2d, alphaV0_I + fmla v3.2d, v27.2d, alphaV0_R st2 {v2.2d, v3.2d}, [pCRow2] - add pCRow1, pCRow1, LDC + add pCRow2, pCRow2, #32 + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] fmul v4.2d, v28.2d, alphaV0_R fmls v4.2d, v29.2d, alphaV0_I - fmul v5.2d, v28.2d, alphaV1_I - fmla v5.2d, v29.2d, alphaV1_R - st2 {v4.2d, v5.2d}, [pCRow1] - add pCRow2, pCRow1, #32 + fmul v5.2d, v28.2d, alphaV0_I + fmla v5.2d, v29.2d, alphaV0_R + st2 {v4.2d, v5.2d}, [pCRow3] + + add pCRow3, pCRow3, #32 + fmul v6.2d, v30.2d, alphaV0_R fmls v6.2d, v31.2d, alphaV0_I - fmul v7.2d, v30.2d, alphaV1_I - fmla v7.2d, v31.2d, alphaV1_R - st2 {v6.2d, v7.2d}, [pCRow2] + fmul v7.2d, v30.2d, alphaV0_I + fmla v7.2d, v31.2d, alphaV0_R + st2 {v6.2d, v7.2d}, [pCRow3] - add pCRow0, pCRow0, #64 + add pCRow3, pCRow3, #32 .endm /******************************************************************************/ @@ -599,41 +629,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x4 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 fmul v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I - fmul v1.2d, v16.2d, alphaV1_I - fmla v1.2d, v17.2d, alphaV1_R + fmul v1.2d, v16.2d, alphaV0_I + fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow1, pCRow1, LDC fmul v4.2d, v20.2d, alphaV0_R fmls v4.2d, v21.2d, alphaV0_I - fmul v5.2d, v20.2d, alphaV1_I - fmla v5.2d, v21.2d, alphaV1_R + fmul v5.2d, v20.2d, alphaV0_I + fmla v5.2d, v21.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow1] add pCRow1, pCRow1, LDC fmul v0.2d, v24.2d, alphaV0_R fmls v0.2d, v25.2d, alphaV0_I - fmul v1.2d, v24.2d, alphaV1_I - fmla v1.2d, v25.2d, alphaV1_R + fmul v1.2d, v24.2d, alphaV0_I + fmla v1.2d, v25.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow1, pCRow1, LDC fmul v4.2d, v28.2d, alphaV0_R fmls v4.2d, v29.2d, alphaV0_I - fmul v5.2d, v28.2d, alphaV1_I - fmla v5.2d, v29.2d, alphaV1_R + fmul v5.2d, v28.2d, alphaV0_I + fmla v5.2d, v29.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow1] add pCRow0, pCRow0, #32 @@ -682,41 +710,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x4 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 fmul d0, d16, alphaV0_R fmls d0, d17, alphaV0_I - fmul d1, d16, alphaV1_I - fmla d1, d17, alphaV1_R + fmul d1, d16, alphaV0_I + fmla d1, d17, alphaV0_R st2 {v0.d, v1.d}[0], [pCRow1] add pCRow1, pCRow1, LDC fmul d4, d20, alphaV0_R fmls d4, d21, alphaV0_I - fmul d5, d20, alphaV1_I - fmla d5, d21, alphaV1_R + fmul d5, d20, alphaV0_I + fmla d5, d21, alphaV0_R st2 {v4.d, v5.d}[0], [pCRow1] add pCRow1, pCRow1, LDC fmul d0, d24, alphaV0_R fmls d0, d25, alphaV0_I - fmul d1, d24, alphaV1_I - fmla d1, d25, alphaV1_R + fmul d1, d24, alphaV0_I + fmla d1, d25, alphaV0_R st2 {v0.d, v1.d}[0], [pCRow1] add pCRow1, pCRow1, LDC fmul d4, d28, alphaV0_R fmls d4, d29, alphaV0_I - fmul d5, d28, alphaV1_I - fmla d5, d29, alphaV1_R + fmul d5, d28, alphaV0_I + fmla d5, d29, alphaV0_R st2 {v4.d, v5.d}[0], [pCRow1] add pCRow0, pCRow0, #16 @@ -765,37 +791,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
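SAVE4x4 now writes through the four per-panel pointers pCRow0..pCRow3 (set up once in the L4_BEGIN block), bumping each one by 32 bytes after every st2 and prefetching each C row with PLDL2KEEP before it is touched; the old version re-derived every address from pCRow0 and LDC. In C terms the addressing becomes roughly (a sketch with hypothetical names; the pointers are called "rows" only to mirror the pCRow register names, and ldc is in elements here):

    #include <complex.h>

    /* Derive the four C pointers once per 4-column panel ... */
    static void panel_rows(double complex *C, long ldc, double complex *rows[4])
    {
        rows[0] = C;              /* mov pCRow0, pC          */
        rows[1] = rows[0] + ldc;  /* add pCRow1, pCRow0, LDC */
        rows[2] = rows[1] + ldc;
        rows[3] = rows[2] + ldc;
    }

    /* ... then each SAVE4x4 just walks them across the 4-wide tile. */
    static void save4x4(double complex *rows[4],
                        const double complex acc[4][4], double complex alpha)
    {
        for (int j = 0; j < 4; j++) {
            for (int i = 0; i < 4; i++)
                rows[j][i] += alpha * acc[j][i];   /* the fmla/fmls pairs above   */
            rows[j] += 4;                          /* two "add pCRow<j>, #32" steps */
        }
    }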
.endm .macro SAVE4x2 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 fmul v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I - fmul v1.2d, v16.2d, alphaV1_I - fmla v1.2d, v17.2d, alphaV1_R + fmul v1.2d, v16.2d, alphaV0_I + fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow2, pCRow1, #32 fmul v2.2d, v18.2d, alphaV0_R fmls v2.2d, v19.2d, alphaV0_I - fmul v3.2d, v18.2d, alphaV1_I - fmla v3.2d, v19.2d, alphaV1_R + fmul v3.2d, v18.2d, alphaV0_I + fmla v3.2d, v19.2d, alphaV0_R st2 {v2.2d, v3.2d}, [pCRow2] add pCRow1, pCRow1, LDC fmul v4.2d, v20.2d, alphaV0_R fmls v4.2d, v21.2d, alphaV0_I - fmul v5.2d, v20.2d, alphaV1_I - fmla v5.2d, v21.2d, alphaV1_R + fmul v5.2d, v20.2d, alphaV0_I + fmla v5.2d, v21.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow1] add pCRow2, pCRow1, #32 fmul v6.2d, v22.2d, alphaV0_R fmls v6.2d, v23.2d, alphaV0_I - fmul v7.2d, v22.2d, alphaV1_I - fmla v7.2d, v23.2d, alphaV1_R + fmul v7.2d, v22.2d, alphaV0_I + fmla v7.2d, v23.2d, alphaV0_R st2 {v6.2d, v7.2d}, [pCRow2] add pCRow0, pCRow0, #64 @@ -828,25 +852,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x2 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 fmul v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I - fmul v1.2d, v16.2d, alphaV1_I - fmla v1.2d, v17.2d, alphaV1_R + fmul v1.2d, v16.2d, alphaV0_I + fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow1, pCRow1, LDC fmul v4.2d, v20.2d, alphaV0_R fmls v4.2d, v21.2d, alphaV0_I - fmul v5.2d, v20.2d, alphaV1_I - fmla v5.2d, v21.2d, alphaV1_R + fmul v5.2d, v20.2d, alphaV0_I + fmla v5.2d, v21.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow1] add pCRow0, pCRow0, #32 @@ -879,25 +901,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x2 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 fmul d0, d16, alphaV0_R fmls d0, d17, alphaV0_I - fmul d1, d16, alphaV1_I - fmla d1, d17, alphaV1_R + fmul d1, d16, alphaV0_I + fmla d1, d17, alphaV0_R st2 {v0.d, v1.d}[0], [pCRow1] add pCRow1, pCRow1, LDC fmul d4, d20, alphaV0_R fmls d4, d21, alphaV0_I - fmul d5, d20, alphaV1_I - fmla d5, d21, alphaV1_R + fmul d5, d20, alphaV0_I + fmla d5, d21, alphaV0_R st2 {v4.d, v5.d}[0], [pCRow1] add pCRow0, pCRow0, #16 @@ -932,23 +952,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x1 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 fmul v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I - fmul v1.2d, v16.2d, alphaV1_I - fmla v1.2d, v17.2d, alphaV1_R + fmul v1.2d, v16.2d, alphaV0_I + fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow2, pCRow1, #32 fmul v2.2d, v18.2d, alphaV0_R fmls v2.2d, v19.2d, alphaV0_I - fmul v3.2d, v18.2d, alphaV1_I - fmla v3.2d, v19.2d, alphaV1_R + fmul v3.2d, v18.2d, alphaV0_I + fmla v3.2d, v19.2d, alphaV0_R st2 {v2.2d, v3.2d}, [pCRow2] add pCRow0, pCRow0, #64 @@ -974,17 +992,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro SAVE2x1 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 fmul v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I - fmul v1.2d, v16.2d, alphaV1_I - fmla v1.2d, v17.2d, alphaV1_R + fmul v1.2d, v16.2d, alphaV0_I + fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow0, pCRow0, #32 @@ -1011,17 +1027,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x1 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 fmul d0, d16, alphaV0_R fmls d0, d17, alphaV0_I - fmul d1, d16, alphaV1_I - fmla d1, d17, alphaV1_R + fmul d1, d16, alphaV0_I + fmla d1, d17, alphaV0_R st2 {v0.d, v1.d}[0], [pCRow1] add pCRow0, pCRow0, #16 @@ -1047,8 +1061,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] - fmov alpha_save_R, d0 - fmov alpha_save_I, d1 + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alphaR, d0 + fmov alphaI, d1 lsl LDC, LDC, #4 // ldc = ldc * 2 * 8 @@ -1064,8 +1081,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ble ztrmm_kernel_L2_BEGIN ztrmm_kernel_L4_BEGIN: - mov pCRow0, pC // pCRow0 = C - add pC, pC, LDC, lsl #2 + mov pCRow0, pC + add pCRow1, pCRow0, LDC + add pCRow2, pCRow1, LDC + add pCRow3, pCRow2, LDC + + add pC, pCRow3, LDC + #if defined(LEFT) mov tempOffset, offset @@ -1079,6 +1101,7 @@ ztrmm_kernel_L4_M4_BEGIN: cmp counterI, #0 ble ztrmm_kernel_L4_M2_BEGIN + .align 5 ztrmm_kernel_L4_M4_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -1098,39 +1121,64 @@ ztrmm_kernel_L4_M4_20: add tempK, tempOffset, #4 #endif - asr counterL , tempK, #1 // L = K / 2 - cmp counterL , #2 // is there at least 4 to do? 
+ asr counterL , tempK, #3 + cmp counterL , #2 blt ztrmm_kernel_L4_M4_32 - KERNEL4x4_I // do one in the K - KERNEL4x4_M2 // do another in the K + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 subs counterL, counterL, #2 ble ztrmm_kernel_L4_M4_22a - .align 5 + .align 5 ztrmm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 subs counterL, counterL, #1 bgt ztrmm_kernel_L4_M4_22 - + .align 5 ztrmm_kernel_L4_M4_22a: + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_E b ztrmm_kernel_L4_M4_44 + .align 5 ztrmm_kernel_L4_M4_32: tst counterL, #1 ble ztrmm_kernel_L4_M4_40 KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 KERNEL4x4_E b ztrmm_kernel_L4_M4_44 @@ -1142,12 +1190,16 @@ ztrmm_kernel_L4_M4_40: ztrmm_kernel_L4_M4_44: - ands counterL , tempK, #1 + ands counterL , tempK, #7 ble ztrmm_kernel_L4_M4_100 + .align 5 ztrmm_kernel_L4_M4_46: KERNEL4x4_SUB + subs counterL, counterL, #1 + bne ztrmm_kernel_L4_M4_46 + ztrmm_kernel_L4_M4_100: SAVE4x4 @@ -1167,6 +1219,10 @@ ztrmm_kernel_L4_M4_100: add tempOffset, tempOffset, #4 #endif + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + ztrmm_kernel_L4_M4_END: subs counterI, counterI, #1 bne ztrmm_kernel_L4_M4_20 diff --git a/kernel/mips/KERNEL b/kernel/mips/KERNEL new file mode 100644 index 0000000000..aeccfbf4c8 --- /dev/null +++ b/kernel/mips/KERNEL @@ -0,0 +1,46 @@ +ifndef SNRM2KERNEL +SNRM2KERNEL = nrm2.c +endif + +ifndef DNRM2KERNEL +DNRM2KERNEL = nrm2.c +endif + +ifndef CNRM2KERNEL +CNRM2KERNEL = znrm2.c +endif + +ifndef ZNRM2KERNEL +ZNRM2KERNEL = znrm2.c +endif + +ifndef SCABS_KERNEL +SCABS_KERNEL = ../generic/cabs.c +endif + +ifndef DCABS_KERNEL +DCABS_KERNEL = ../generic/cabs.c +endif + +ifndef QCABS_KERNEL +QCABS_KERNEL = ../generic/cabs.c +endif + +ifndef LSAME_KERNEL +LSAME_KERNEL = ../generic/lsame.c +endif + +ifndef SGEMM_BETA +SGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef DGEMM_BETA +DGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef CGEMM_BETA +CGEMM_BETA = ../generic/zgemm_beta.c +endif +ifndef ZGEMM_BETA +ZGEMM_BETA = ../generic/zgemm_beta.c +endif + + diff --git a/kernel/mips/KERNEL.P5600 b/kernel/mips/KERNEL.P5600 new file mode 100644 index 0000000000..6835792211 --- /dev/null +++ b/kernel/mips/KERNEL.P5600 @@ -0,0 +1,221 @@ +SAMAXKERNEL = ../mips/amax.c +DAMAXKERNEL = ../mips/amax.c +CAMAXKERNEL = ../mips/zamax.c +ZAMAXKERNEL = ../mips/zamax.c + +SAMINKERNEL = ../mips/amin.c +DAMINKERNEL = ../mips/amin.c +CAMINKERNEL = ../mips/zamin.c +ZAMINKERNEL = ../mips/zamin.c + +SMAXKERNEL = ../mips/max.c +DMAXKERNEL = ../mips/max.c + +SMINKERNEL = ../mips/min.c +DMINKERNEL = ../mips/min.c + +ISAMAXKERNEL = ../mips/iamax.c +IDAMAXKERNEL = ../mips/iamax.c +ICAMAXKERNEL = ../mips/izamax.c +IZAMAXKERNEL = ../mips/izamax.c + +ISAMINKERNEL = ../mips/iamin.c +IDAMINKERNEL = ../mips/iamin.c +ICAMINKERNEL = ../mips/izamin.c +IZAMINKERNEL = ../mips/izamin.c + +ISMAXKERNEL = ../mips/imax.c +IDMAXKERNEL = ../mips/imax.c + +ISMINKERNEL = ../mips/imin.c +IDMINKERNEL = ../mips/imin.c + +ifdef HAVE_MSA +SASUMKERNEL = ../mips/sasum_msa.c +DASUMKERNEL = ../mips/dasum_msa.c +CASUMKERNEL = ../mips/casum_msa.c +ZASUMKERNEL = ../mips/zasum_msa.c +else +SASUMKERNEL = ../mips/asum.c +DASUMKERNEL = ../mips/asum.c +CASUMKERNEL = ../mips/asum.c +ZASUMKERNEL = ../mips/asum.c 
+endif + +SAXPYKERNEL = ../mips/axpy.c +DAXPYKERNEL = ../mips/axpy.c +CAXPYKERNEL = ../mips/zaxpy.c +ZAXPYKERNEL = ../mips/zaxpy.c + +SCOPYKERNEL = ../mips/copy.c +DCOPYKERNEL = ../mips/copy.c +CCOPYKERNEL = ../mips/zcopy.c +ZCOPYKERNEL = ../mips/zcopy.c + +ifdef HAVE_MSA +SDOTKERNEL = ../mips/sdot_msa.c +DDOTKERNEL = ../mips/ddot_msa.c +CDOTKERNEL = ../mips/cdot_msa.c +ZDOTKERNEL = ../mips/zdot_msa.c +else +SDOTKERNEL = ../mips/dot.c +DDOTKERNEL = ../mips/dot.c +CDOTKERNEL = ../mips/zdot.c +ZDOTKERNEL = ../mips/zdot.c +endif + +SNRM2KERNEL = ../mips/nrm2.c +DNRM2KERNEL = ../mips/nrm2.c +CNRM2KERNEL = ../mips/znrm2.c +ZNRM2KERNEL = ../mips/znrm2.c + +SROTKERNEL = ../mips/rot.c +DROTKERNEL = ../mips/rot.c +CROTKERNEL = ../mips/zrot.c +ZROTKERNEL = ../mips/zrot.c + +SSCALKERNEL = ../mips/scal.c +DSCALKERNEL = ../mips/scal.c +CSCALKERNEL = ../mips/zscal.c +ZSCALKERNEL = ../mips/zscal.c + +SSWAPKERNEL = ../mips/swap.c +DSWAPKERNEL = ../mips/swap.c +CSWAPKERNEL = ../mips/zswap.c +ZSWAPKERNEL = ../mips/zswap.c + +ifdef HAVE_MSA +SGEMVNKERNEL = ../mips/sgemv_n_msa.c +DGEMVNKERNEL = ../mips/dgemv_n_msa.c +CGEMVNKERNEL = ../mips/cgemv_n_msa.c +ZGEMVNKERNEL = ../mips/zgemv_n_msa.c +else +SGEMVNKERNEL = ../mips/gemv_n.c +DGEMVNKERNEL = ../mips/gemv_n.c +CGEMVNKERNEL = ../mips/zgemv_n.c +ZGEMVNKERNEL = ../mips/zgemv_n.c +endif + +ifdef HAVE_MSA +SGEMVTKERNEL = ../mips/sgemv_t_msa.c +DGEMVTKERNEL = ../mips/dgemv_t_msa.c +CGEMVTKERNEL = ../mips/cgemv_t_msa.c +ZGEMVTKERNEL = ../mips/zgemv_t_msa.c +else +SGEMVTKERNEL = ../mips/gemv_t.c +DGEMVTKERNEL = ../mips/gemv_t.c +CGEMVTKERNEL = ../mips/zgemv_t.c +ZGEMVTKERNEL = ../mips/zgemv_t.c +endif + +ifdef HAVE_MSA +SGEMMKERNEL = ../mips/sgemm_kernel_8x8_msa.c +SGEMMONCOPY = ../mips/sgemm_ncopy_8_msa.c +SGEMMOTCOPY = ../mips/sgemm_tcopy_8_msa.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o +else +SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o +endif + +ifdef HAVE_MSA +DGEMMKERNEL = ../mips/dgemm_kernel_8x4_msa.c +DGEMMINCOPY = ../mips/dgemm_ncopy_8_msa.c +DGEMMITCOPY = ../mips/dgemm_tcopy_8_msa.c +DGEMMONCOPY = ../mips/dgemm_ncopy_4_msa.c +DGEMMOTCOPY = ../mips/dgemm_tcopy_4_msa.c +DGEMMINCOPYOBJ = dgemm_incopy.o +DGEMMITCOPYOBJ = dgemm_itcopy.o +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o +else +DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o +endif + +ifdef HAVE_MSA +CGEMMKERNEL = ../mips/cgemm_kernel_8x4_msa.c +CGEMMINCOPY = ../mips/cgemm_ncopy_8_msa.c +CGEMMITCOPY = ../mips/cgemm_tcopy_8_msa.c +CGEMMONCOPY = ../mips/cgemm_ncopy_4_msa.c +CGEMMOTCOPY = ../mips/cgemm_tcopy_4_msa.c +CGEMMINCOPYOBJ = cgemm_incopy.o +CGEMMITCOPYOBJ = cgemm_itcopy.o +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o +else +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o +endif + +ifdef HAVE_MSA +ZGEMMKERNEL = ../mips/zgemm_kernel_4x4_msa.c +ZGEMMONCOPY = ../mips/zgemm_ncopy_4_msa.c +ZGEMMOTCOPY = ../mips/zgemm_tcopy_4_msa.c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o +else +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = 
../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o +endif + +ifdef HAVE_MSA +STRSMKERNEL_LN = ../mips/strsm_kernel_LN_8x8_msa.c +STRSMKERNEL_LT = ../mips/strsm_kernel_LT_8x8_msa.c +STRSMKERNEL_RN = ../mips/strsm_kernel_RN_8x8_msa.c +STRSMKERNEL_RT = ../mips/strsm_kernel_RT_8x8_msa.c +else +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif + +ifdef HAVE_MSA +DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c +DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c +DTRSMKERNEL_RN = ../mips/dtrsm_kernel_RN_8x4_msa.c +DTRSMKERNEL_RT = ../mips/dtrsm_kernel_RT_8x4_msa.c +else +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif + +ifdef HAVE_MSA +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +else +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif + +ifdef HAVE_MSA +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +else +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif \ No newline at end of file diff --git a/kernel/mips/Makefile b/kernel/mips/Makefile new file mode 100644 index 0000000000..efae70d7b7 --- /dev/null +++ b/kernel/mips/Makefile @@ -0,0 +1,2 @@ +clean :: + diff --git a/kernel/mips/amax.c b/kernel/mips/amax.c new file mode 100644 index 0000000000..ad14081f5c --- /dev/null +++ b/kernel/mips/amax.c @@ -0,0 +1,66 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include <math.h> + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + + if (n <= 0 || inc_x <= 0) return(maxf); + + maxf=ABS(x[0]); + ix += inc_x; + i++; + + while(i < n) + { + if( ABS(x[ix]) > maxf ) + { + maxf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(maxf); +} + + diff --git a/kernel/mips/amin.c b/kernel/mips/amin.c new file mode 100644 index 0000000000..8079450ff5 --- /dev/null +++ b/kernel/mips/amin.c @@ -0,0 +1,66 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/ + +#include "common.h" +#include <math.h> + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + + if (n <= 0 || inc_x <= 0) return(minf); + + minf=ABS(x[0]); + ix += inc_x; + i++; + + while(i < n) + { + if( ABS(x[ix]) < minf ) + { + minf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(minf); +} + + diff --git a/kernel/mips/asum.c b/kernel/mips/asum.c new file mode 100644 index 0000000000..d221464de0 --- /dev/null +++ b/kernel/mips/asum.c @@ -0,0 +1,57 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include <math.h> + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT sumf = 0.0; + if (n <= 0 || inc_x <= 0) return(sumf); + + n *= inc_x; + while(i < n) + { + sumf += ABS(x[i]); + i += inc_x; + } + return(sumf); +} + + diff --git a/kernel/mips/axpby.c b/kernel/mips/axpby.c new file mode 100644 index 0000000000..af4fccde21 --- /dev/null +++ b/kernel/mips/axpby.c @@ -0,0 +1,95 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3.
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix,iy; + + if ( n < 0 ) return(0); + + ix = 0; + iy = 0; + + if ( beta == 0.0 ) + { + + if ( alpha == 0.0 ) + { + while(i < n) + { + y[iy] = 0.0 ; + iy += inc_y ; + i++ ; + } + } + else + { + while(i < n) + { + y[iy] = alpha * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + } + + + } + + } + else + { + + if ( alpha == 0.0 ) + { + while(i < n) + { + y[iy] = beta * y[iy] ; + iy += inc_y ; + i++ ; + } + } + else + { + while(i < n) + { + y[iy] = alpha * x[ix] + beta * y[iy] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + } + } + + } + + return(0); + +} + + diff --git a/kernel/mips/axpy.c b/kernel/mips/axpy.c new file mode 100644 index 0000000000..42f181ee13 --- /dev/null +++ b/kernel/mips/axpy.c @@ -0,0 +1,54 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
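The new axpby kernel computes y := alpha*x + beta*y with arbitrary strides; its four branches special-case alpha == 0 and beta == 0, so that beta == 0 overwrites y outright (the old values, possibly uninitialised, are never read) and alpha == 0 never touches x. A small usage sketch of the general case (reference loop only, unit strides, values chosen arbitrarily):

    #include <stdio.h>

    int main(void)
    {
        double x[4] = { 1, 2, 3, 4 };
        double y[4] = { 10, 20, 30, 40 };
        double alpha = 2.0, beta = 0.5;

        /* what the kernel computes for n = 4, inc_x = inc_y = 1 */
        for (int i = 0; i < 4; i++)
            y[i] = alpha * x[i] + beta * y[i];

        for (int i = 0; i < 4; i++)
            printf("%g ", y[i]);               /* prints: 7 14 21 28 */
        printf("\n");
        return 0;
    }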
+*****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix,iy; + + if ( n < 0 ) return(0); + if ( da == 0.0 ) return(0); + + ix = 0; + iy = 0; + + while(i < n) + { + + y[iy] += da * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/mips/casum_msa.c b/kernel/mips/casum_msa.c new file mode 100644 index 0000000000..454573d56d --- /dev/null +++ b/kernel/mips/casum_msa.c @@ -0,0 +1,338 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#include "common.h" +#include +#include "macros_msa.h" + +#define AND_VEC_W(in) ((v4f32) ((v4i32) in & and_vec)) + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i, inc_x2; + FLOAT sumf = 0.0; + v4f32 src0, src1, src2, src3, src4, src5, src6, src7; + v4f32 sum_abs0, sum_abs1, sum_abs2, sum_abs3; + v4f32 zero_v = {0}; + v4i32 and_vec = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF}; + + if (n <= 0 || inc_x <= 0) return (sumf); + + if (1 == inc_x) + { + if (n > 15) + { + n -= 16; + + LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7); + + sum_abs0 = AND_VEC_W(src0); + sum_abs1 = AND_VEC_W(src1); + sum_abs2 = AND_VEC_W(src2); + sum_abs3 = AND_VEC_W(src3); + sum_abs0 += AND_VEC_W(src4); + sum_abs1 += AND_VEC_W(src5); + sum_abs2 += AND_VEC_W(src6); + sum_abs3 += AND_VEC_W(src7); + } + else + { + sum_abs0 = zero_v; + sum_abs1 = zero_v; + sum_abs2 = zero_v; + sum_abs3 = zero_v; + } + + for (i = (n >> 4); i--;) + { + LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7); + + sum_abs0 += AND_VEC_W(src0); + sum_abs1 += AND_VEC_W(src1); + sum_abs2 += AND_VEC_W(src2); + sum_abs3 += AND_VEC_W(src3); + sum_abs0 += AND_VEC_W(src4); + sum_abs1 += AND_VEC_W(src5); + sum_abs2 += AND_VEC_W(src6); + sum_abs3 += AND_VEC_W(src7); + } + + if (n & 15) + { + if ((n & 8) && (n & 4) && (n & 2)) + { + LD_SP7_INC(x, 4, src0, src1, src2, src3, src4, src5, src6); + + sum_abs0 += AND_VEC_W(src0); + sum_abs1 += AND_VEC_W(src1); + sum_abs2 += AND_VEC_W(src2); + sum_abs3 += AND_VEC_W(src3); + sum_abs0 += AND_VEC_W(src4); + sum_abs1 += AND_VEC_W(src5); + sum_abs2 += AND_VEC_W(src6); + + sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; + + sumf = sum_abs0[0]; + sumf += sum_abs0[1]; + sumf += sum_abs0[2]; + sumf += sum_abs0[3]; + } + else if ((n & 8) && (n & 4)) + { + LD_SP6_INC(x, 4, src0, src1, src2, src3, src4, src5); + + sum_abs0 += AND_VEC_W(src0); + sum_abs1 += AND_VEC_W(src1); + sum_abs2 += AND_VEC_W(src2); + sum_abs3 += AND_VEC_W(src3); + sum_abs0 += AND_VEC_W(src4); + sum_abs1 += AND_VEC_W(src5); + + sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; + + sumf = sum_abs0[0]; + sumf += sum_abs0[1]; + sumf += sum_abs0[2]; + sumf += sum_abs0[3]; + } + else if ((n & 8) && (n & 2)) + { + LD_SP5_INC(x, 4, src0, src1, src2, src3, src4); + + sum_abs0 += AND_VEC_W(src0); + sum_abs1 += AND_VEC_W(src1); + sum_abs2 += AND_VEC_W(src2); + sum_abs3 += AND_VEC_W(src3); + sum_abs0 += AND_VEC_W(src4); + + sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; + + sumf = sum_abs0[0]; + sumf += sum_abs0[1]; + sumf += sum_abs0[2]; + sumf += sum_abs0[3]; + } + else if ((n & 4) && (n & 2)) + { + LD_SP3_INC(x, 4, src0, src1, src2); + + sum_abs0 += AND_VEC_W(src0); + sum_abs1 += AND_VEC_W(src1); + sum_abs2 += AND_VEC_W(src2); + + sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; + + sumf = sum_abs0[0]; + sumf += sum_abs0[1]; + sumf += sum_abs0[2]; + sumf += sum_abs0[3]; + } + else if (n & 8) + { + LD_SP4_INC(x, 4, src0, src1, src2, src3); + + sum_abs0 += AND_VEC_W(src0); + sum_abs1 += AND_VEC_W(src1); + sum_abs2 += AND_VEC_W(src2); + sum_abs3 += AND_VEC_W(src3); + + sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; + + sumf = sum_abs0[0]; + sumf += sum_abs0[1]; + sumf += sum_abs0[2]; + sumf += sum_abs0[3]; + } + else if (n & 4) + { + LD_SP2_INC(x, 4, src0, src1); + + sum_abs0 += AND_VEC_W(src0); + sum_abs1 += AND_VEC_W(src1); + + sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; + + sumf = 
sum_abs0[0]; + sumf += sum_abs0[1]; + sumf += sum_abs0[2]; + sumf += sum_abs0[3]; + } + else if (n & 2) + { + src0 = LD_SP(x); x += 4; + + sum_abs0 += AND_VEC_W(src0); + + sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; + + sumf = sum_abs0[0]; + sumf += sum_abs0[1]; + sumf += sum_abs0[2]; + sumf += sum_abs0[3]; + } + else + { + sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; + + sumf = sum_abs0[0]; + sumf += sum_abs0[1]; + sumf += sum_abs0[2]; + sumf += sum_abs0[3]; + } + + if (n & 1) + { + sumf += fabsf(*(x + 0)); + sumf += fabsf(*(x + 1)); + } + } + else + { + sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; + + sumf = sum_abs0[0]; + sumf += sum_abs0[1]; + sumf += sum_abs0[2]; + sumf += sum_abs0[3]; + } + } + else + { + inc_x2 = 2 * inc_x; + + if (n > 8) + { + n -= 8; + + LD_SP8_INC(x, inc_x2, src0, src1, src2, src3, src4, src5, src6, src7); + + sum_abs0 = AND_VEC_W(src0); + sum_abs1 = AND_VEC_W(src1); + sum_abs2 = AND_VEC_W(src2); + sum_abs3 = AND_VEC_W(src3); + sum_abs0 += AND_VEC_W(src4); + sum_abs1 += AND_VEC_W(src5); + sum_abs2 += AND_VEC_W(src6); + sum_abs3 += AND_VEC_W(src7); + } + else + { + sum_abs0 = zero_v; + sum_abs1 = zero_v; + sum_abs2 = zero_v; + sum_abs3 = zero_v; + } + + for (i = (n >> 3); i--;) + { + LD_SP8_INC(x, inc_x2, src0, src1, src2, src3, src4, src5, src6, src7); + + sum_abs0 += AND_VEC_W(src0); + sum_abs1 += AND_VEC_W(src1); + sum_abs2 += AND_VEC_W(src2); + sum_abs3 += AND_VEC_W(src3); + sum_abs0 += AND_VEC_W(src4); + sum_abs1 += AND_VEC_W(src5); + sum_abs2 += AND_VEC_W(src6); + sum_abs3 += AND_VEC_W(src7); + } + + if (n & 7) + { + if ((n & 4) && (n & 2) && (n & 1)) + { + LD_SP7_INC(x, inc_x2, src0, src1, src2, src3, src4, src5, src6); + + sum_abs0 += AND_VEC_W(src0); + sum_abs1 += AND_VEC_W(src1); + sum_abs2 += AND_VEC_W(src2); + sum_abs3 += AND_VEC_W(src3); + sum_abs0 += AND_VEC_W(src4); + sum_abs1 += AND_VEC_W(src5); + sum_abs2 += AND_VEC_W(src6); + } + else if ((n & 4) && (n & 2)) + { + LD_SP6_INC(x, inc_x2, src0, src1, src2, src3, src4, src5); + + sum_abs0 += AND_VEC_W(src0); + sum_abs1 += AND_VEC_W(src1); + sum_abs2 += AND_VEC_W(src2); + sum_abs3 += AND_VEC_W(src3); + sum_abs0 += AND_VEC_W(src4); + sum_abs1 += AND_VEC_W(src5); + } + else if ((n & 4) && (n & 1)) + { + LD_SP5_INC(x, inc_x2, src0, src1, src2, src3, src4); + + sum_abs0 += AND_VEC_W(src0); + sum_abs1 += AND_VEC_W(src1); + sum_abs2 += AND_VEC_W(src2); + sum_abs3 += AND_VEC_W(src3); + sum_abs0 += AND_VEC_W(src4); + } + else if ((n & 2) && (n & 1)) + { + LD_SP3_INC(x, inc_x2, src0, src1, src2); + + sum_abs0 += AND_VEC_W(src0); + sum_abs1 += AND_VEC_W(src1); + sum_abs2 += AND_VEC_W(src2); + } + else if (n & 4) + { + LD_SP4_INC(x, inc_x2, src0, src1, src2, src3); + + sum_abs0 += AND_VEC_W(src0); + sum_abs1 += AND_VEC_W(src1); + sum_abs2 += AND_VEC_W(src2); + sum_abs3 += AND_VEC_W(src3); + } + else if (n & 2) + { + LD_SP2_INC(x, inc_x2, src0, src1); + + sum_abs0 += AND_VEC_W(src0); + sum_abs1 += AND_VEC_W(src1); + } + else if (n & 1) + { + src0 = LD_SP(x); x += inc_x2; + + sum_abs0 += AND_VEC_W(src0); + } + } + + sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; + + sumf = sum_abs0[0] + sum_abs0[1]; + } + + return (sumf); +} diff --git a/kernel/mips/cdot_msa.c b/kernel/mips/cdot_msa.c new file mode 100644 index 0000000000..bf9f6b7e2f --- /dev/null +++ b/kernel/mips/cdot_msa.c @@ -0,0 +1,361 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +#if !defined(CONJ) + #define OP2 += + #define OP3 - + #define OP4 + +#else + #define OP2 -= + #define OP3 + + #define OP4 - +#endif + +#define DOT16_KERNEL(OPR0, OPR1) \ + dot0 += (vx0r * vy0r); \ + dot0 OPR0## = (vx0i * vy0i); \ + dot1 OPR1## = (vx0i * vy0r); \ + dot1 += (vx0r * vy0i); \ + \ + dot0 += (vx1r * vy1r); \ + dot0 OPR0## = (vx1i * vy1i); \ + dot1 OPR1## = (vx1i * vy1r); \ + dot1 += (vx1r * vy1i); \ + \ + dot0 += (vx2r * vy2r); \ + dot0 OPR0## = (vx2i * vy2i); \ + dot1 OPR1## = (vx2i * vy2r); \ + dot1 += (vx2r * vy2i); \ + \ + dot0 += (vx3r * vy3r); \ + dot0 OPR0## = (vx3i * vy3i); \ + dot1 OPR1## = (vx3i * vy3r); \ + dot1 += (vx3r * vy3i); + +#define DOT12_KERNEL(OPR0, OPR1) \ + dot0 += (vx0r * vy0r); \ + dot0 OPR0## = (vx0i * vy0i); \ + dot1 OPR1## = (vx0i * vy0r); \ + dot1 += (vx0r * vy0i); \ + \ + dot0 += (vx1r * vy1r); \ + dot0 OPR0## = (vx1i * vy1i); \ + dot1 OPR1## = (vx1i * vy1r); \ + dot1 += (vx1r * vy1i); \ + \ + dot0 += (vx2r * vy2r); \ + dot0 OPR0## = (vx2i * vy2i); \ + dot1 OPR1## = (vx2i * vy2r); \ + dot1 += (vx2r * vy2i); + +#define DOT8_KERNEL(OPR0, OPR1) \ + dot0 += (vx0r * vy0r); \ + dot0 OPR0## = (vx0i * vy0i); \ + dot1 OPR1## = (vx0i * vy0r); \ + dot1 += (vx0r * vy0i); \ + \ + dot0 += (vx1r * vy1r); \ + dot0 OPR0## = (vx1i * vy1i); \ + dot1 OPR1## = (vx1i * vy1r); \ + dot1 += (vx1r * vy1i); + +#define DOT4_KERNEL(OPR0, OPR1) \ + dot0 += (vx0r * vy0r); \ + dot0 OPR0## = (vx0i * vy0i); \ + dot1 OPR1## = (vx0i * vy0r); \ + dot1 += (vx0r * vy0i); + +/* return float, x,y float */ +/* cdotc - CONJ */ +/* cdotu - !CONJ */ +#ifndef _MSC_VER +#include +FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +#else +OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +#endif +{ + BLASLONG i = 0; + FLOAT dot[2]; + BLASLONG inc_x2; + BLASLONG inc_y2; + FLOAT x0, x1, x2, x3, x4, x5, x6, x7; + FLOAT y0, y1, y2, y3, y4, y5, y6, y7; + v4f32 vx0, vx1, vx2, 
vx3, vx4, vx5, vx6, vx7; + v4f32 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; + v4f32 vx0r, vx0i, vx1r, vx1i, vx2r, vx2i, vx3r, vx3i; + v4f32 vy0r, vy0i, vy1r, vy1i, vy2r, vy2i, vy3r, vy3i; + v4f32 dot0 = {0, 0, 0, 0}; + v4f32 dot1 = {0, 0, 0, 0}; + openblas_complex_float result; + + dot[0] = 0.0; + dot[1] = 0.0; + + __real__(result) = 0.0; + __imag__(result) = 0.0; + + if ( n < 1 ) return(result); + + if ((1 == inc_x) && (1 == inc_y)) + { + for (i = (n >> 4); i--;) + { + LD_SP8_INC(x, 4, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7); + LD_SP8_INC(y, 4, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7); + + PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i); + PCKEVOD_W2_SP(vx3, vx2, vx1r, vx1i); + PCKEVOD_W2_SP(vx5, vx4, vx2r, vx2i); + PCKEVOD_W2_SP(vx7, vx6, vx3r, vx3i); + + PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i); + PCKEVOD_W2_SP(vy3, vy2, vy1r, vy1i); + PCKEVOD_W2_SP(vy5, vy4, vy2r, vy2i); + PCKEVOD_W2_SP(vy7, vy6, vy3r, vy3i); + + #if !defined(CONJ) + DOT16_KERNEL(-, +); + #else + DOT16_KERNEL(+, -); + #endif + } + + if (n & 15) + { + if ((n & 8) && (n & 4)) + { + LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3); + LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3); + LD_SP2_INC(x, 4, vx4, vx5); + LD_SP2_INC(y, 4, vy4, vy5); + + PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i); + PCKEVOD_W2_SP(vx3, vx2, vx1r, vx1i); + PCKEVOD_W2_SP(vx5, vx4, vx2r, vx2i); + + PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i); + PCKEVOD_W2_SP(vy3, vy2, vy1r, vy1i); + PCKEVOD_W2_SP(vy5, vy4, vy2r, vy2i); + + #if !defined(CONJ) + DOT12_KERNEL(-, +); + #else + DOT12_KERNEL(+, -); + #endif + } + else if (n & 8) + { + LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3); + LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3); + + PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i); + PCKEVOD_W2_SP(vx3, vx2, vx1r, vx1i); + + PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i); + PCKEVOD_W2_SP(vy3, vy2, vy1r, vy1i); + + #if !defined(CONJ) + DOT8_KERNEL(-, +); + #else + DOT8_KERNEL(+, -); + #endif + } + else if (n & 4) + { + LD_SP2_INC(x, 4, vx0, vx1); + LD_SP2_INC(y, 4, vy0, vy1); + PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i); + PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i); + + #if !defined(CONJ) + DOT4_KERNEL(-, +); + #else + DOT4_KERNEL(+, -); + #endif + } + + if ((n & 2) && (n & 1)) + { + LD_GP6_INC(x, 1, x0, x1, x2, x3, x4, x5); + LD_GP6_INC(y, 1, y0, y1, y2, y3, y4, y5); + + dot[0] += ( x0 * y0 OP3 x1 * y1 ); + dot[1] OP2 ( x1 * y0 OP4 x0 * y1 ); + + dot[0] += ( x2 * y2 OP3 x3 * y3 ); + dot[1] OP2 ( x3 * y2 OP4 x2 * y3 ); + + dot[0] += ( x4 * y4 OP3 x5 * y5 ); + dot[1] OP2 ( x5 * y4 OP4 x4 * y5 ); + } + else if (n & 2) + { + LD_GP4_INC(x, 1, x0, x1, x2, x3); + LD_GP4_INC(y, 1, y0, y1, y2, y3); + + dot[0] += ( x0 * y0 OP3 x1 * y1 ); + dot[1] OP2 ( x1 * y0 OP4 x0 * y1 ); + + dot[0] += ( x2 * y2 OP3 x3 * y3 ); + dot[1] OP2 ( x3 * y2 OP4 x2 * y3 ); + } + else if (n & 1) + { + LD_GP2_INC(x, 1, x0, x1); + LD_GP2_INC(y, 1, y0, y1); + + dot[0] += ( x0 * y0 OP3 x1 * y1 ); + dot[1] OP2 ( x1 * y0 OP4 x0 * y1 ); + } + } + + dot[0] += (dot0[0] + dot0[1] + dot0[2] + dot0[3]); + dot[1] += (dot1[0] + dot1[1] + dot1[2] + dot1[3]); + } + else + { + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + for (i = (n >> 2); i--;) + { + x0 = *x; + x1 = *(x + 1); + x += inc_x2; + x2 = *x; + x3 = *(x + 1); + x += inc_x2; + x4 = *x; + x5 = *(x + 1); + x += inc_x2; + x6 = *x; + x7 = *(x + 1); + x += inc_x2; + + y0 = *y; + y1 = *(y + 1); + y += inc_y2; + y2 = *y; + y3 = *(y + 1); + y += inc_y2; + y4 = *y; + y5 = *(y + 1); + y += inc_y2; + y6 = *y; + y7 = *(y + 1); + y += inc_y2; + + dot[0] += ( x0 * y0 OP3 x1 * y1 ); + dot[1] OP2 ( x1 * y0 OP4 x0 * y1 ); + + dot[0] += ( x2 * y2 OP3 x3 * y3 ); + dot[1] OP2 ( x3 
* y2 OP4 x2 * y3 ); + + dot[0] += ( x4 * y4 OP3 x5 * y5 ); + dot[1] OP2 ( x5 * y4 OP4 x4 * y5 ); + + dot[0] += ( x6 * y6 OP3 x7 * y7 ); + dot[1] OP2 ( x7 * y6 OP4 x6 * y7 ); + } + + if ((n & 2) && (n & 1)) + { + x0 = *x; + x1 = *(x + 1); + x += inc_x2; + x2 = *x; + x3 = *(x + 1); + x += inc_x2; + x4 = *x; + x5 = *(x + 1); + x += inc_x2; + + y0 = *y; + y1 = *(y + 1); + y += inc_y2; + y2 = *y; + y3 = *(y + 1); + y += inc_y2; + y4 = *y; + y5 = *(y + 1); + y += inc_y2; + + dot[0] += ( x0 * y0 OP3 x1 * y1 ); + dot[1] OP2 ( x1 * y0 OP4 x0 * y1 ); + + dot[0] += ( x2 * y2 OP3 x3 * y3 ); + dot[1] OP2 ( x3 * y2 OP4 x2 * y3 ); + + dot[0] += ( x4 * y4 OP3 x5 * y5 ); + dot[1] OP2 ( x5 * y4 OP4 x4 * y5 ); + } + else if (n & 2) + { + x0 = *x; + x1 = *(x + 1); + x += inc_x2; + x2 = *x; + x3 = *(x + 1); + x += inc_x2; + + y0 = *y; + y1 = *(y + 1); + y += inc_y2; + y2 = *y; + y3 = *(y + 1); + y += inc_y2; + + dot[0] += ( x0 * y0 OP3 x1 * y1 ); + dot[1] OP2 ( x1 * y0 OP4 x0 * y1 ); + + dot[0] += ( x2 * y2 OP3 x3 * y3 ); + dot[1] OP2 ( x3 * y2 OP4 x2 * y3 ); + } + else if (n & 1) + { + x0 = *x; + x1 = *(x + 1); + x += inc_x2; + + y0 = *y; + y1 = *(y + 1); + y += inc_y2; + + dot[0] += ( x0 * y0 OP3 x1 * y1 ); + dot[1] OP2 ( x1 * y0 OP4 x0 * y1 ); + } + } + + __real__(result) = dot[0]; + __imag__(result) = dot[1]; + + return(result); +} diff --git a/kernel/mips/cgemm_kernel_8x4_msa.c b/kernel/mips/cgemm_kernel_8x4_msa.c new file mode 100644 index 0000000000..cd1fa45b37 --- /dev/null +++ b/kernel/mips/cgemm_kernel_8x4_msa.c @@ -0,0 +1,2154 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
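/* Reviewer note (not part of the patch): both the cdot kernel above and the
   cgemm kernel that follows fold the conjugation variants (cdotu/cdotc, and
   the NN/NR/RN/RR.. GEMM forms) into one code body by passing the +/- signs
   of the complex multiply-accumulate as macro arguments (OP0..OP4).  The
   scalar operation being parameterized is sketched below with illustrative
   names (cmla_ref, conj_a/conj_b flags): */

static void cmla_ref(float ar, float ai, float br, float bi,
                     int conj_a, int conj_b,
                     float *acc_r, float *acc_i)
{
    if (conj_a) ai = -ai;              /* conjugate the A (or x) operand */
    if (conj_b) bi = -bi;              /* conjugate the B (or y) operand */

    *acc_r += ar * br - ai * bi;       /* real part of a * b */
    *acc_i += ar * bi + ai * br;       /* imaginary part of a * b */
}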
+*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +#define CGEMM_KERNEL_8X4_MSA(OP0, OP1, OP2, OP3, OP4) \ +{ \ + LD_SP4_INC(pa0, 4, src_a0, src_a1, src_a2, src_a3); \ + LD_SP2_INC(pb0, 4, src_b0, src_b1); \ + \ + PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \ + PCKEVOD_W2_SP(src_a3, src_a2, src_a1r, src_a1i); \ + \ + /* 0th col */ \ + SPLATI_W2_SP(src_b0, 0, src_br, src_bi); \ + res0_r OP0## = src_a0r * src_br; \ + res0_r OP1## = src_a0i * src_bi; \ + res0_i OP2## = (OP4 src_a0r) * src_bi; \ + res0_i OP3## = src_a0i * src_br; \ + \ + res1_r OP0## = src_a1r * src_br; \ + res1_r OP1## = src_a1i * src_bi; \ + res1_i OP2## = (OP4 src_a1r) * src_bi; \ + res1_i OP3## = src_a1i * src_br; \ + \ + /* 1st col */ \ + SPLATI_W2_SP(src_b0, 2, src_br, src_bi); \ + res2_r OP0## = src_a0r * src_br; \ + res2_r OP1## = src_a0i * src_bi; \ + res2_i OP2## = (OP4 src_a0r) * src_bi; \ + res2_i OP3## = src_a0i * src_br; \ + \ + res3_r OP0## = src_a1r * src_br; \ + res3_r OP1## = src_a1i * src_bi; \ + res3_i OP2## = (OP4 src_a1r) * src_bi; \ + res3_i OP3## = src_a1i * src_br; \ + \ + /* 2nd col */ \ + SPLATI_W2_SP(src_b1, 0, src_br, src_bi); \ + res4_r OP0## = src_a0r * src_br; \ + res4_r OP1## = src_a0i * src_bi; \ + res4_i OP2## = (OP4 src_a0r) * src_bi; \ + res4_i OP3## = src_a0i * src_br; \ + \ + res5_r OP0## = src_a1r * src_br; \ + res5_r OP1## = src_a1i * src_bi; \ + res5_i OP2## = (OP4 src_a1r) * src_bi; \ + res5_i OP3## = src_a1i * src_br; \ + \ + /* 3rd col */ \ + SPLATI_W2_SP(src_b1, 2, src_br, src_bi); \ + res6_r OP0## = src_a0r * src_br; \ + res6_r OP1## = src_a0i * src_bi; \ + res6_i OP2## = (OP4 src_a0r) * src_bi; \ + res6_i OP3## = src_a0i * src_br; \ + \ + res7_r OP0## = src_a1r * src_br; \ + res7_r OP1## = src_a1i * src_bi; \ + res7_i OP2## = (OP4 src_a1r) * src_bi; \ + res7_i OP3## = src_a1i * src_br; \ +} + +#define CGEMM_KERNEL_8X2_MSA(OP0, OP1, OP2, OP3, OP4) \ +{ \ + LD_SP4_INC(pa0, 4, src_a0, src_a1, src_a2, src_a3); \ + src_b0 = LD_SP(pb0); \ + \ + PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \ + PCKEVOD_W2_SP(src_a3, src_a2, src_a1r, src_a1i); \ + \ + /* 0th col */ \ + SPLATI_W2_SP(src_b0, 0, src_br, src_bi); \ + res0_r OP0## = src_a0r * src_br; \ + res0_r OP1## = src_a0i * src_bi; \ + res0_i OP2## = (OP4 src_a0r) * src_bi; \ + res0_i OP3## = src_a0i * src_br; \ + \ + res1_r OP0## = src_a1r * src_br; \ + res1_r OP1## = src_a1i * src_bi; \ + res1_i OP2## = (OP4 src_a1r) * src_bi; \ + res1_i OP3## = src_a1i * src_br; \ + \ + /* 1st col */ \ + SPLATI_W2_SP(src_b0, 2, src_br, src_bi); \ + res2_r OP0## = src_a0r * src_br; \ + res2_r OP1## = src_a0i * src_bi; \ + res2_i OP2## = (OP4 src_a0r) * src_bi; \ + res2_i OP3## = src_a0i * src_br; \ + \ + res3_r OP0## = src_a1r * src_br; \ + res3_r OP1## = src_a1i * src_bi; \ + res3_i OP2## = (OP4 src_a1r) * src_bi; \ + res3_i OP3## = src_a1i * src_br; \ +} + +#define CGEMM_KERNEL_8X1_MSA(OP0, OP1, OP2, OP3, OP4) \ +{ \ + LD_SP4_INC(pa0, 4, src_a0, src_a1, src_a2, src_a3); \ + src_bi = (v4f32) __msa_cast_to_vector_double(*((double *) pb0)); \ + SPLATI_W2_SP(src_bi, 0, src_br, src_bi); \ + \ + PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \ + PCKEVOD_W2_SP(src_a3, src_a2, src_a1r, src_a1i); \ + \ + /* 0th col */ \ + res0_r OP0## = src_a0r * src_br; \ + res0_r OP1## = src_a0i * src_bi; \ + res0_i OP2## = (OP4 src_a0r) * src_bi; \ + res0_i OP3## = src_a0i * src_br; \ + \ + res1_r OP0## = src_a1r * src_br; \ + res1_r OP1## = src_a1i * src_bi; \ + res1_i OP2## = (OP4 
src_a1r) * src_bi; \ + res1_i OP3## = src_a1i * src_br; \ +} + +#define CGEMM_KERNEL_4X4_MSA(OP0, OP1, OP2, OP3, OP4) \ +{ \ + LD_SP2_INC(pa0, 4, src_a0, src_a1); \ + LD_SP2_INC(pb0, 4, src_b0, src_b1); \ + \ + PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \ + \ + /* 0th col */ \ + SPLATI_W2_SP(src_b0, 0, src_br, src_bi); \ + res0_r OP0## = src_a0r * src_br; \ + res0_r OP1## = src_a0i * src_bi; \ + res0_i OP2## = OP4 src_a0r * src_bi; \ + res0_i OP3## = src_a0i * src_br; \ + \ + /* 1st col */ \ + SPLATI_W2_SP(src_b0, 2, src_br, src_bi); \ + res2_r OP0## = src_a0r * src_br; \ + res2_r OP1## = src_a0i * src_bi; \ + res2_i OP2## = OP4 src_a0r * src_bi; \ + res2_i OP3## = src_a0i * src_br; \ + \ + /* 2nd col */ \ + SPLATI_W2_SP(src_b1, 0, src_br, src_bi); \ + res4_r OP0## = src_a0r * src_br; \ + res4_r OP1## = src_a0i * src_bi; \ + res4_i OP2## = OP4 src_a0r * src_bi; \ + res4_i OP3## = src_a0i * src_br; \ + \ + /* 3rd col */ \ + SPLATI_W2_SP(src_b1, 2, src_br, src_bi); \ + res6_r OP0## = src_a0r * src_br; \ + res6_r OP1## = src_a0i * src_bi; \ + res6_i OP2## = OP4 src_a0r * src_bi; \ + res6_i OP3## = src_a0i * src_br; \ +} + +#define CGEMM_KERNEL_4X2_MSA(OP0, OP1, OP2, OP3, OP4) \ +{ \ + LD_SP2_INC(pa0, 4, src_a0, src_a1); \ + src_b0 = LD_SP(pb0); \ + \ + PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \ + \ + /* 0th col */ \ + SPLATI_W2_SP(src_b0, 0, src_br, src_bi); \ + res0_r OP0## = src_a0r * src_br; \ + res0_r OP1## = src_a0i * src_bi; \ + res0_i OP2## = OP4 src_a0r * src_bi; \ + res0_i OP3## = src_a0i * src_br; \ + \ + /* 1st col */ \ + SPLATI_W2_SP(src_b0, 2, src_br, src_bi); \ + res2_r OP0## = src_a0r * src_br; \ + res2_r OP1## = src_a0i * src_bi; \ + res2_i OP2## = OP4 src_a0r * src_bi; \ + res2_i OP3## = src_a0i * src_br; \ +} + +#define CGEMM_KERNEL_4X1_MSA(OP0, OP1, OP2, OP3, OP4) \ +{ \ + LD_SP2_INC(pa0, 4, src_a0, src_a1); \ + src_bi = (v4f32) __msa_cast_to_vector_double(*((double *) pb0)); \ + SPLATI_W2_SP(src_bi, 0, src_br, src_bi); \ + \ + PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \ + \ + /* 0th col */ \ + res0_r OP0## = src_a0r * src_br; \ + res0_r OP1## = src_a0i * src_bi; \ + res0_i OP2## = OP4 src_a0r * src_bi; \ + res0_i OP3## = src_a0i * src_br; \ +} + +#define CGEMM_KERNEL_2X4(OP0, OP1, OP2, OP3, OP4) \ +{ \ + a0_r = pa0[0]; \ + a0_i = pa0[1]; \ + b0_r = pb0[0]; \ + b0_i = pb0[1]; \ + \ + res0 OP0## = a0_r * b0_r; \ + res0 OP1## = a0_i * b0_i; \ + res1 OP2## = OP4 a0_r * b0_i; \ + res1 OP3## = a0_i * b0_r; \ + \ + a1_r = pa0[2]; \ + a1_i = pa0[3]; \ + res2 OP0## = a1_r * b0_r; \ + res2 OP1## = a1_i * b0_i; \ + res3 OP2## = OP4 a1_r * b0_i; \ + res3 OP3## = a1_i * b0_r; \ + \ + /* 1st col */ \ + b1_r = pb0[2]; \ + b1_i = pb0[3]; \ + res4 OP0## = a0_r * b1_r; \ + res4 OP1## = a0_i * b1_i; \ + res5 OP2## = OP4 a0_r * b1_i; \ + res5 OP3## = a0_i * b1_r; \ + \ + res6 OP0## = a1_r * b1_r; \ + res6 OP1## = a1_i * b1_i; \ + res7 OP2## = OP4 a1_r * b1_i; \ + res7 OP3## = a1_i * b1_r; \ + \ + /* 2nd col */ \ + b2_r = pb0[4]; \ + b2_i = pb0[5]; \ + res8 OP0## = a0_r * b2_r; \ + res8 OP1## = a0_i * b2_i; \ + res9 OP2## = OP4 a0_r * b2_i; \ + res9 OP3## = a0_i * b2_r; \ + \ + res10 OP0## = a1_r * b2_r; \ + res10 OP1## = a1_i * b2_i; \ + res11 OP2## = OP4 a1_r * b2_i; \ + res11 OP3## = a1_i * b2_r; \ + \ + /* 3rd col */ \ + b3_r = pb0[6]; \ + b3_i = pb0[7]; \ + res12 OP0## = a0_r * b3_r; \ + res12 OP1## = a0_i * b3_i; \ + res13 OP2## = OP4 a0_r * b3_i; \ + res13 OP3## = a0_i * b3_r; \ + \ + res14 OP0## = a1_r * b3_r; \ + res14 OP1## = a1_i * b3_i; \ + res15 OP2## = OP4 
a1_r * b3_i; \ + res15 OP3## = a1_i * b3_r; \ +} + +#define CGEMM_KERNEL_2X2(OP0, OP1, OP2, OP3, OP4) \ +{ \ + /* 0th col */ \ + a0_r = pa0[0]; \ + a0_i = pa0[1]; \ + b0_r = pb0[0]; \ + b0_i = pb0[1]; \ + \ + res0 OP0## = a0_r * b0_r; \ + res0 OP1## = a0_i * b0_i; \ + res1 OP2## = OP4 a0_r * b0_i; \ + res1 OP3## = a0_i * b0_r; \ + \ + a1_r = pa0[2]; \ + a1_i = pa0[3]; \ + res2 OP0## = a1_r * b0_r; \ + res2 OP1## = a1_i * b0_i; \ + res3 OP2## = OP4 a1_r * b0_i; \ + res3 OP3## = a1_i * b0_r; \ + \ + /* 1st col */ \ + b1_r = pb0[2]; \ + b1_i = pb0[3]; \ + res4 OP0## = a0_r * b1_r; \ + res4 OP1## = a0_i * b1_i; \ + res5 OP2## = OP4 a0_r * b1_i; \ + res5 OP3## = a0_i * b1_r; \ + \ + res6 OP0## = a1_r * b1_r; \ + res6 OP1## = a1_i * b1_i; \ + res7 OP2## = OP4 a1_r * b1_i; \ + res7 OP3## = a1_i * b1_r; \ +} + +#define CGEMM_KERNEL_2X1(OP0, OP1, OP2, OP3, OP4) \ +{ \ + /* 0th col */ \ + a0_r = pa0[0]; \ + a0_i = pa0[1]; \ + b0_r = pb0[0]; \ + b0_i = pb0[1]; \ + \ + res0 OP0## = a0_r * b0_r; \ + res0 OP1## = a0_i * b0_i; \ + res1 OP2## = OP4 a0_r * b0_i; \ + res1 OP3## = a0_i * b0_r; \ + \ + a1_r = pa0[2]; \ + a1_i = pa0[3]; \ + res2 OP0## = a1_r * b0_r; \ + res2 OP1## = a1_i * b0_i; \ + res3 OP2## = OP4 a1_r * b0_i; \ + res3 OP3## = a1_i * b0_r; \ +} + +#define CGEMM_KERNEL_1X4(OP0, OP1, OP2, OP3, OP4) \ +{ \ + /* 0th col */ \ + a0_r = pa0[0]; \ + a0_i = pa0[1]; \ + b0_r = pb0[0]; \ + b0_i = pb0[1]; \ + \ + res0 OP0## = a0_r * b0_r; \ + res0 OP1## = a0_i * b0_i; \ + res1 OP2## = OP4 a0_r * b0_i; \ + res1 OP3## = a0_i * b0_r; \ + \ + /* 1st col */ \ + b1_r = pb0[2]; \ + b1_i = pb0[3]; \ + res2 OP0## = a0_r * b1_r; \ + res2 OP1## = a0_i * b1_i; \ + res3 OP2## = OP4 a0_r * b1_i; \ + res3 OP3## = a0_i * b1_r; \ + \ + /* 2nd col */ \ + b2_r = pb0[4]; \ + b2_i = pb0[5]; \ + res4 OP0## = a0_r * b2_r; \ + res4 OP1## = a0_i * b2_i; \ + res5 OP2## = OP4 a0_r * b2_i; \ + res5 OP3## = a0_i * b2_r; \ + \ + /* 3rd col */ \ + b3_r = pb0[6]; \ + b3_i = pb0[7]; \ + res6 OP0## = a0_r * b3_r; \ + res6 OP1## = a0_i * b3_i; \ + res7 OP2## = OP4 a0_r * b3_i; \ + res7 OP3## = a0_i * b3_r; \ +} + +#define CGEMM_KERNEL_1X2(OP0, OP1, OP2, OP3, OP4) \ +{ \ + /* 0th col */ \ + a0_r = pa0[0]; \ + a0_i = pa0[1]; \ + b0_r = pb0[0]; \ + b0_i = pb0[1]; \ + \ + res0 OP0## = a0_r * b0_r; \ + res0 OP1## = a0_i * b0_i; \ + res1 OP2## = OP4 a0_r * b0_i; \ + res1 OP3## = a0_i * b0_r; \ + \ + /* 1st col */ \ + b1_r = pb0[2]; \ + b1_i = pb0[3]; \ + res2 OP0## = a0_r * b1_r; \ + res2 OP1## = a0_i * b1_i; \ + res3 OP2## = OP4 a0_r * b1_i; \ + res3 OP3## = a0_i * b1_r; \ +} + +#define CGEMM_KERNEL_1X1(OP0, OP1, OP2, OP3, OP4) \ +{ \ + /* 0th col */ \ + a0_r = pa0[0]; \ + a0_i = pa0[1]; \ + b0_r = pb0[0]; \ + b0_i = pb0[1]; \ + \ + res0 OP0## = a0_r * b0_r; \ + res0 OP1## = a0_i * b0_i; \ + res1 OP2## = OP4 a0_r * b0_i; \ + res1 OP3## = a0_i * b0_r; \ +} + +#define CGEMM_SCALE_8X4_MSA \ +{ \ + LD_SP4(pc0, 4, dst0, dst1, dst2, dst3); \ + \ + PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \ + PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i); \ + \ + dst0_r += alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i += alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + dst1_r += alpha_r * res1_r; \ + dst1_r -= alpha_i * res1_i; \ + dst1_i += alpha_r * res1_i; \ + dst1_i += alpha_i * res1_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \ + \ + ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4); \ + \ + LD_SP4(pc1, 4, dst0, dst1, dst2, dst3); \ + \ + PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \ + 
PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i); \ + \ + dst0_r += alpha_r * res2_r; \ + dst0_r -= alpha_i * res2_i; \ + dst0_i += alpha_r * res2_i; \ + dst0_i += alpha_i * res2_r; \ + \ + dst1_r += alpha_r * res3_r; \ + dst1_r -= alpha_i * res3_i; \ + dst1_i += alpha_r * res3_i; \ + dst1_i += alpha_i * res3_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \ + \ + ST_SP4_INC(dst0, dst1, dst2, dst3, pc1, 4); \ + \ + LD_SP4(pc2, 4, dst0, dst1, dst2, dst3); \ + \ + PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \ + PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i); \ + \ + dst0_r += alpha_r * res4_r; \ + dst0_r -= alpha_i * res4_i; \ + dst0_i += alpha_r * res4_i; \ + dst0_i += alpha_i * res4_r; \ + \ + dst1_r += alpha_r * res5_r; \ + dst1_r -= alpha_i * res5_i; \ + dst1_i += alpha_r * res5_i; \ + dst1_i += alpha_i * res5_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \ + \ + ST_SP4_INC(dst0, dst1, dst2, dst3, pc2, 4); \ + \ + LD_SP4(pc3, 4, dst0, dst1, dst2, dst3); \ + \ + PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \ + PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i); \ + \ + dst0_r += alpha_r * res6_r; \ + dst0_r -= alpha_i * res6_i; \ + dst0_i += alpha_r * res6_i; \ + dst0_i += alpha_i * res6_r; \ + \ + dst1_r += alpha_r * res7_r; \ + dst1_r -= alpha_i * res7_i; \ + dst1_i += alpha_r * res7_i; \ + dst1_i += alpha_i * res7_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \ + \ + ST_SP4_INC(dst0, dst1, dst2, dst3, pc3, 4); \ +} + +#define CGEMM_SCALE_8X2_MSA \ +{ \ + LD_SP4(pc0, 4, dst0, dst1, dst2, dst3); \ + \ + PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \ + PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i); \ + \ + dst0_r += alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i += alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + dst1_r += alpha_r * res1_r; \ + dst1_r -= alpha_i * res1_i; \ + dst1_i += alpha_r * res1_i; \ + dst1_i += alpha_i * res1_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \ + \ + ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4); \ + \ + LD_SP4(pc1, 4, dst0, dst1, dst2, dst3); \ + \ + PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \ + PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i); \ + \ + dst0_r += alpha_r * res2_r; \ + dst0_r -= alpha_i * res2_i; \ + dst0_i += alpha_r * res2_i; \ + dst0_i += alpha_i * res2_r; \ + \ + dst1_r += alpha_r * res3_r; \ + dst1_r -= alpha_i * res3_i; \ + dst1_i += alpha_r * res3_i; \ + dst1_i += alpha_i * res3_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \ + \ + ST_SP4_INC(dst0, dst1, dst2, dst3, pc1, 4); \ +} + +#define CGEMM_SCALE_8X1_MSA \ +{ \ + LD_SP4(pc0, 4, dst0, dst1, dst2, dst3); \ + \ + PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \ + PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i); \ + \ + dst0_r += alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i += alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + dst1_r += alpha_r * res1_r; \ + dst1_r -= alpha_i * res1_i; \ + dst1_i += alpha_r * res1_i; \ + dst1_i += alpha_i * res1_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \ + \ + ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4); \ +} + +#define CGEMM_SCALE_4X4_MSA \ +{ \ + LD_SP2(pc0, 4, dst0, dst1); \ + \ + PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \ + \ + dst0_r += alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i += alpha_r * res0_i; \ + dst0_i += alpha_i * 
res0_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_SP2_INC(dst0, dst1, pc0, 4); \ + \ + LD_SP2(pc1, 4, dst0, dst1); \ + \ + PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \ + \ + dst0_r += alpha_r * res2_r; \ + dst0_r -= alpha_i * res2_i; \ + dst0_i += alpha_r * res2_i; \ + dst0_i += alpha_i * res2_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_SP2_INC(dst0, dst1, pc1, 4); \ + \ + LD_SP2(pc2, 4, dst0, dst1); \ + \ + PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \ + \ + dst0_r += alpha_r * res4_r; \ + dst0_r -= alpha_i * res4_i; \ + dst0_i += alpha_r * res4_i; \ + dst0_i += alpha_i * res4_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_SP2_INC(dst0, dst1, pc2, 4); \ + \ + LD_SP2(pc3, 4, dst0, dst1); \ + \ + PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \ + \ + dst0_r += alpha_r * res6_r; \ + dst0_r -= alpha_i * res6_i; \ + dst0_i += alpha_r * res6_i; \ + dst0_i += alpha_i * res6_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_SP2_INC(dst0, dst1, pc3, 4); \ +} + +#define CGEMM_SCALE_4X2_MSA \ +{ \ + LD_SP2(pc0, 4, dst0, dst1); \ + \ + PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \ + \ + dst0_r += alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i += alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_SP2_INC(dst0, dst1, pc0, 4); \ + \ + LD_SP2(pc1, 4, dst0, dst1); \ + \ + PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \ + \ + dst0_r += alpha_r * res2_r; \ + dst0_r -= alpha_i * res2_i; \ + dst0_i += alpha_r * res2_i; \ + dst0_i += alpha_i * res2_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_SP2_INC(dst0, dst1, pc1, 4); \ +} + +#define CGEMM_SCALE_4X1_MSA \ +{ \ + LD_SP2(pc0, 4, dst0, dst1); \ + \ + PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \ + \ + dst0_r += alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i += alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_SP2_INC(dst0, dst1, pc0, 4); \ +} + +#define CGEMM_SCALE_2X4 \ +{ \ + /* 0th col */ \ + pc0[0] += alphar * res0; \ + pc0[0] -= alphai * res1; \ + pc0[1] += alphar * res1; \ + pc0[1] += alphai * res0; \ + pc0[2] += alphar * res2; \ + pc0[2] -= alphai * res3; \ + pc0[3] += alphar * res3; \ + pc0[3] += alphai * res2; \ + \ + /* 1st col */ \ + pc1[0] += alphar * res4; \ + pc1[0] -= alphai * res5; \ + pc1[1] += alphar * res5; \ + pc1[1] += alphai * res4; \ + pc1[2] += alphar * res6; \ + pc1[2] -= alphai * res7; \ + pc1[3] += alphar * res7; \ + pc1[3] += alphai * res6; \ + \ + /* 2nd col */ \ + pc2[0] += alphar * res8; \ + pc2[0] -= alphai * res9; \ + pc2[1] += alphar * res9; \ + pc2[1] += alphai * res8; \ + pc2[2] += alphar * res10; \ + pc2[2] -= alphai * res11; \ + pc2[3] += alphar * res11; \ + pc2[3] += alphai * res10; \ + \ + /* 3rd col */ \ + pc3[0] += alphar * res12; \ + pc3[0] -= alphai * res13; \ + pc3[1] += alphar * res13; \ + pc3[1] += alphai * res12; \ + pc3[2] += alphar * res14; \ + pc3[2] -= alphai * res15; \ + pc3[3] += alphar * res15; \ + pc3[3] += alphai * res14; \ +} + +#define CGEMM_SCALE_2X2 \ +{ \ + /* 0th col */ \ + pc0[0] += alphar * res0; \ + pc0[0] -= alphai * res1; \ + pc0[1] += alphar * res1; \ + pc0[1] += alphai * res0; \ + pc0[2] += alphar * res2; \ + pc0[2] -= alphai * res3; \ + pc0[3] += alphar * res3; \ + pc0[3] += alphai * res2; \ + \ + /* 1st col */ \ + pc1[0] += alphar * res4; \ + pc1[0] -= alphai * res5; \ + pc1[1] += alphar * res5; \ + pc1[1] += alphai * res4; \ + pc1[2] += alphar * res6; \ + pc1[2] -= alphai 
* res7; \ + pc1[3] += alphar * res7; \ + pc1[3] += alphai * res6; \ +} + +#define CGEMM_SCALE_2X1 \ +{ \ + pc0[0] += alphar * res0; \ + pc0[0] -= alphai * res1; \ + pc0[1] += alphar * res1; \ + pc0[1] += alphai * res0; \ + \ + pc0[2] += alphar * res2; \ + pc0[2] -= alphai * res3; \ + pc0[3] += alphar * res3; \ + pc0[3] += alphai * res2; \ +} + +#define CGEMM_SCALE_1X4 \ +{ \ + pc0[0] += alphar * res0; \ + pc0[0] -= alphai * res1; \ + pc0[1] += alphar * res1; \ + pc0[1] += alphai * res0; \ + \ + pc1[0] += alphar * res2; \ + pc1[0] -= alphai * res3; \ + pc1[1] += alphar * res3; \ + pc1[1] += alphai * res2; \ + \ + pc2[0] += alphar * res4; \ + pc2[0] -= alphai * res5; \ + pc2[1] += alphar * res5; \ + pc2[1] += alphai * res4; \ + \ + pc3[0] += alphar * res6; \ + pc3[0] -= alphai * res7; \ + pc3[1] += alphar * res7; \ + pc3[1] += alphai * res6; \ +} + +#define CGEMM_SCALE_1X2 \ +{ \ + pc0[0] += alphar * res0; \ + pc0[0] -= alphai * res1; \ + pc0[1] += alphar * res1; \ + pc0[1] += alphai * res0; \ + \ + pc1[2] += alphar * res2; \ + pc1[2] -= alphai * res3; \ + pc1[3] += alphar * res3; \ + pc1[3] += alphai * res2; \ +} + +#define CGEMM_SCALE_1X1 \ +{ \ + pc0[0] += alphar * res0; \ + pc0[0] -= alphai * res1; \ + pc0[1] += alphar * res1; \ + pc0[1] += alphai * res0; \ +} + +#define CGEMM_TRMM_SCALE_8X4_MSA \ +{ \ + dst0_r = alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i = alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + dst1_r = alpha_r * res1_r; \ + dst1_r -= alpha_i * res1_i; \ + dst1_i = alpha_r * res1_i; \ + dst1_i += alpha_i * res1_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \ + \ + ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4); \ + \ + dst0_r = alpha_r * res2_r; \ + dst0_r -= alpha_i * res2_i; \ + dst0_i = alpha_r * res2_i; \ + dst0_i += alpha_i * res2_r; \ + \ + dst1_r = alpha_r * res3_r; \ + dst1_r -= alpha_i * res3_i; \ + dst1_i = alpha_r * res3_i; \ + dst1_i += alpha_i * res3_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \ + \ + ST_SP4_INC(dst0, dst1, dst2, dst3, pc1, 4); \ + \ + dst0_r = alpha_r * res4_r; \ + dst0_r -= alpha_i * res4_i; \ + dst0_i = alpha_r * res4_i; \ + dst0_i += alpha_i * res4_r; \ + \ + dst1_r = alpha_r * res5_r; \ + dst1_r -= alpha_i * res5_i; \ + dst1_i = alpha_r * res5_i; \ + dst1_i += alpha_i * res5_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \ + \ + ST_SP4_INC(dst0, dst1, dst2, dst3, pc2, 4); \ + \ + dst0_r = alpha_r * res6_r; \ + dst0_r -= alpha_i * res6_i; \ + dst0_i = alpha_r * res6_i; \ + dst0_i += alpha_i * res6_r; \ + \ + dst1_r = alpha_r * res7_r; \ + dst1_r -= alpha_i * res7_i; \ + dst1_i = alpha_r * res7_i; \ + dst1_i += alpha_i * res7_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \ + \ + ST_SP4_INC(dst0, dst1, dst2, dst3, pc3, 4); \ +} + +#define CGEMM_TRMM_SCALE_8X2_MSA \ +{ \ + dst0_r = alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i = alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + dst1_r = alpha_r * res1_r; \ + dst1_r -= alpha_i * res1_i; \ + dst1_i = alpha_r * res1_i; \ + dst1_i += alpha_i * res1_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \ + \ + ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4); \ + \ + dst0_r = alpha_r * res2_r; \ + dst0_r -= alpha_i * res2_i; \ + dst0_i = alpha_r * res2_i; \ + dst0_i += alpha_i * res2_r; \ + \ + dst1_r = alpha_r * 
res3_r; \ + dst1_r -= alpha_i * res3_i; \ + dst1_i = alpha_r * res3_i; \ + dst1_i += alpha_i * res3_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \ + \ + ST_SP4_INC(dst0, dst1, dst2, dst3, pc1, 4); \ +} + +#define CGEMM_TRMM_SCALE_8X1_MSA \ +{ \ + dst0_r = alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i = alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + dst1_r = alpha_r * res1_r; \ + dst1_r -= alpha_i * res1_i; \ + dst1_i = alpha_r * res1_i; \ + dst1_i += alpha_i * res1_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \ + \ + ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4); \ +} + +#define CGEMM_TRMM_SCALE_4X4_MSA \ +{ \ + dst0_r = alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i = alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_SP2_INC(dst0, dst1, pc0, 4); \ + \ + dst0_r = alpha_r * res2_r; \ + dst0_r -= alpha_i * res2_i; \ + dst0_i = alpha_r * res2_i; \ + dst0_i += alpha_i * res2_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_SP2_INC(dst0, dst1, pc1, 4); \ + \ + dst0_r = alpha_r * res4_r; \ + dst0_r -= alpha_i * res4_i; \ + dst0_i = alpha_r * res4_i; \ + dst0_i += alpha_i * res4_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_SP2_INC(dst0, dst1, pc2, 4); \ + \ + dst0_r = alpha_r * res6_r; \ + dst0_r -= alpha_i * res6_i; \ + dst0_i = alpha_r * res6_i; \ + dst0_i += alpha_i * res6_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_SP2_INC(dst0, dst1, pc3, 4); \ +} + +#define CGEMM_TRMM_SCALE_4X2_MSA \ +{ \ + dst0_r = alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i = alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_SP2_INC(dst0, dst1, pc0, 4); \ + \ + dst0_r = alpha_r * res2_r; \ + dst0_r -= alpha_i * res2_i; \ + dst0_i = alpha_r * res2_i; \ + dst0_i += alpha_i * res2_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_SP2_INC(dst0, dst1, pc1, 4); \ +} + +#define CGEMM_TRMM_SCALE_4X1_MSA \ +{ \ + dst0_r = alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i = alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_SP2_INC(dst0, dst1, pc0, 4); \ +} + +#define CGEMM_TRMM_SCALE_2X4 \ +{ \ + /* 0th col */ \ + pc0[0] = alphar * res0; \ + pc0[0] -= alphai * res1; \ + pc0[1] = alphar * res1; \ + pc0[1] += alphai * res0; \ + pc0[2] = alphar * res2; \ + pc0[2] -= alphai * res3; \ + pc0[3] = alphar * res3; \ + pc0[3] += alphai * res2; \ + \ + /* 1st col */ \ + pc1[0] = alphar * res4; \ + pc1[0] -= alphai * res5; \ + pc1[1] = alphar * res5; \ + pc1[1] += alphai * res4; \ + pc1[2] = alphar * res6; \ + pc1[2] -= alphai * res7; \ + pc1[3] = alphar * res7; \ + pc1[3] += alphai * res6; \ + \ + /* 2nd col */ \ + pc2[0] = alphar * res8; \ + pc2[0] -= alphai * res9; \ + pc2[1] = alphar * res9; \ + pc2[1] += alphai * res8; \ + pc2[2] = alphar * res10; \ + pc2[2] -= alphai * res11; \ + pc2[3] = alphar * res11; \ + pc2[3] += alphai * res10; \ + \ + /* 3rd col */ \ + pc3[0] = alphar * res12; \ + pc3[0] -= alphai * res13; \ + pc3[1] = alphar * res13; \ + pc3[1] += alphai * res12; \ + pc3[2] = alphar * res14; \ + pc3[2] -= alphai * res15; \ + pc3[3] = alphar * res15; \ + pc3[3] += alphai * res14; \ +} + +#define CGEMM_TRMM_SCALE_2X2 \ +{ \ + /* 0th col */ \ + pc0[0] = alphar * res0; \ + pc0[0] -= alphai * res1; \ + pc0[1] = alphar * res1; \ + 
pc0[1] += alphai * res0; \ + pc0[2] = alphar * res2; \ + pc0[2] -= alphai * res3; \ + pc0[3] = alphar * res3; \ + pc0[3] += alphai * res2; \ + \ + /* 1st col */ \ + pc1[0] = alphar * res4; \ + pc1[0] -= alphai * res5; \ + pc1[1] = alphar * res5; \ + pc1[1] += alphai * res4; \ + pc1[2] = alphar * res6; \ + pc1[2] -= alphai * res7; \ + pc1[3] = alphar * res7; \ + pc1[3] += alphai * res6; \ +} + +#define CGEMM_TRMM_SCALE_2X1 \ +{ \ + pc0[0] = alphar * res0; \ + pc0[0] -= alphai * res1; \ + pc0[1] = alphar * res1; \ + pc0[1] += alphai * res0; \ + \ + pc0[2] = alphar * res2; \ + pc0[2] -= alphai * res3; \ + pc0[3] = alphar * res3; \ + pc0[3] += alphai * res2; \ +} + +#define CGEMM_TRMM_SCALE_1X4 \ +{ \ + pc0[0] = alphar * res0; \ + pc0[0] -= alphai * res1; \ + pc0[1] = alphar * res1; \ + pc0[1] += alphai * res0; \ + \ + pc1[0] = alphar * res2; \ + pc1[0] -= alphai * res3; \ + pc1[1] = alphar * res3; \ + pc1[1] += alphai * res2; \ + \ + pc2[0] = alphar * res4; \ + pc2[0] -= alphai * res5; \ + pc2[1] = alphar * res5; \ + pc2[1] += alphai * res4; \ + \ + pc3[0] = alphar * res6; \ + pc3[0] -= alphai * res7; \ + pc3[1] = alphar * res7; \ + pc3[1] += alphai * res6; \ +} + +#define CGEMM_TRMM_SCALE_1X2 \ +{ \ + pc0[0] = alphar * res0; \ + pc0[0] -= alphai * res1; \ + pc0[1] = alphar * res1; \ + pc0[1] += alphai * res0; \ + \ + pc1[2] = alphar * res2; \ + pc1[2] -= alphai * res3; \ + pc1[3] = alphar * res3; \ + pc1[3] += alphai * res2; \ +} + +#define CGEMM_TRMM_SCALE_1X1 \ +{ \ + pc0[0] = alphar * res0; \ + pc0[0] -= alphai * res1; \ + pc0[1] = alphar * res1; \ + pc0[1] += alphai * res0; \ +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, + FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc +#ifdef TRMMKERNEL + , BLASLONG offset +#endif + ) +{ + BLASLONG i, j, l, temp; +#if defined(TRMMKERNEL) + BLASLONG off; +#endif + FLOAT *pc0, *pc1, *pc2, *pc3; + FLOAT *pa0, *pb0; + FLOAT res0, res1, res2, res3, res4, res5, res6, res7; + FLOAT res8, res9, res10, res11, res12, res13, res14, res15; + FLOAT a0_r, a1_r; + FLOAT a0_i, a1_i; + FLOAT b0_r, b1_r, b2_r, b3_r; + FLOAT b0_i, b1_i, b2_i, b3_i; + v4f32 src_a0, src_a1, src_a2, src_a3, src_b0, src_b1; + v4f32 src_a0r, src_a0i, src_a1r, src_a1i, src_br, src_bi; + v4f32 dst0, dst1, dst2, dst3; + v4f32 alpha_r, alpha_i; + v4f32 res0_r, res0_i, res1_r, res1_i, res2_r, res2_i, res3_r, res3_i; + v4f32 res4_r, res4_i, res5_r, res5_i, res6_r, res6_i, res7_r, res7_i; + v4f32 dst0_r, dst0_i, dst1_r, dst1_i; + + alpha_r = COPY_FLOAT_TO_VECTOR(alphar); + alpha_i = COPY_FLOAT_TO_VECTOR(alphai); + +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#endif + + for (j = (n >> 2); j--;) + { + pc0 = C; + pc1 = pc0 + 2 * ldc; + pc2 = pc1 + 2 * ldc; + pc3 = pc2 + 2 * ldc; + + pa0 = A; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + for (i = (m >> 3); i--;) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 8; + pb0 = B + off * 2 * 4; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 8; // number of values in A +#else + temp = off + 4; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_8X4_MSA(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_8X4_MSA(, +, , +, -); +#endif +#if defined(RN) || 
defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_8X4_MSA(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_8X4_MSA(, -, , -, -); +#endif + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_8X4_MSA(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_8X4_MSA(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_8X4_MSA(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_8X4_MSA(+, -, -, -,); +#endif + } + +#if defined(TRMMKERNEL) + CGEMM_TRMM_SCALE_8X4_MSA +#else + CGEMM_SCALE_8X4_MSA +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 8; // number of values in A +#else + temp -= 4; // number of values in B +#endif + pa0 += temp * 2 * 8; + pb0 += temp * 2 * 4; +#endif + +#ifdef LEFT + off += 8; // number of values in A +#endif +#endif + } + + if (m & 4) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 4; + pb0 = B + off * 2 * 4; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 4; // number of values in A +#else + temp = off + 4; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_4X4_MSA(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_4X4_MSA(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_4X4_MSA(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_4X4_MSA(, -, , -, -); +#endif + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_4X4_MSA(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_4X4_MSA(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_4X4_MSA(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_4X4_MSA(+, -, -, -,); +#endif + } + +#if defined(TRMMKERNEL) + CGEMM_TRMM_SCALE_4X4_MSA +#else + CGEMM_SCALE_4X4_MSA +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 4; // number of values in A +#else + temp -= 4; // number of values in B +#endif + pa0 += temp * 2 * 4; + pb0 += temp * 2 * 4; +#endif + +#ifdef LEFT + off += 4; // number of values in A +#endif +#endif + } + + if (m & 2) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 2; + pb0 = B + off * 2 * 4; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 2; // number of values in A +#else + temp = off + 4; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_2X4(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_2X4(, +, 
, +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_2X4(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_2X4(, -, , -, -); +#endif + + pa0 += 4; + pb0 += 8; + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_2X4(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_2X4(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_2X4(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_2X4(+, -, -, -,); +#endif + + pa0 += 4; + pb0 += 8; + } + +#if defined(TRMMKERNEL) + CGEMM_TRMM_SCALE_2X4 +#else + CGEMM_SCALE_2X4 +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 2; // number of values in A +#else + temp -= 4; // number of values in B +#endif + pa0 += temp * 2 * 2; + pb0 += temp * 2 * 4; +#endif + +#ifdef LEFT + off += 2; // number of values in A +#endif +#endif + + pc0 += 4; + pc1 += 4; + pc2 += 4; + pc3 += 4; + } + + if (m & 1) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 1; + pb0 = B + off * 2 * 4; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 1; // number of values in A +#else + temp = off + 4; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_1X4(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_1X4(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_1X4(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_1X4(, -, , -, -); +#endif + + pa0 += 2; + pb0 += 8; + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_1X4(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_1X4(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_1X4(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_1X4(+, -, -, -,); +#endif + + pa0 += 2; + pb0 += 8; + } + +#if defined(TRMMKERNEL) + CGEMM_TRMM_SCALE_1X4 +#else + CGEMM_SCALE_1X4 +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 1; // number of values in A +#else + temp -= 4; // number of values in B +#endif + pa0 += temp * 2 * 1; + pb0 += temp * 2 * 4; +#endif + +#ifdef LEFT + off += 1; // number of values in A +#endif +#endif + + pc0 += 2; + pc1 += 2; + pc2 += 2; + pc3 += 2; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 4; // number of values in A +#endif + + l = k << 3; + B = B + l; + i = ldc << 3; + C = C + i; + } + + if (n & 2) + { + pc0 = C; + pc1 = pc0 + 2 * ldc; + + pa0 = A; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + for (i = (m >> 3); i--;) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 8; + pb0 = B + 
off * 2 * 2; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 8; // number of values in A +#else + temp = off + 2; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_8X2_MSA(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_8X2_MSA(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_8X2_MSA(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_8X2_MSA(, -, , -, -); +#endif + + pb0 += 4; + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_8X2_MSA(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_8X2_MSA(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_8X2_MSA(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_8X2_MSA(+, -, -, -,); +#endif + + pb0 += 4; + } + +#if defined(TRMMKERNEL) + CGEMM_TRMM_SCALE_8X2_MSA +#else + CGEMM_SCALE_8X2_MSA +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 8; // number of values in A +#else + temp -= 2; // number of values in B +#endif + pa0 += temp * 2 * 8; + pb0 += temp * 2 * 2; +#endif + +#ifdef LEFT + off += 8; // number of values in A +#endif +#endif + } + + if (m & 4) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 4; + pb0 = B + off * 2 * 2; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 4; // number of values in A +#else + temp = off + 2; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_4X2_MSA(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_4X2_MSA(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_4X2_MSA(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_4X2_MSA(, -, , -, -); +#endif + + pb0 += 4; + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_4X2_MSA(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_4X2_MSA(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_4X2_MSA(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_4X2_MSA(+, -, -, -,); +#endif + + pb0 += 4; + } + +#if defined(TRMMKERNEL) + CGEMM_TRMM_SCALE_4X2_MSA +#else + CGEMM_SCALE_4X2_MSA +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 4; // number of values in A +#else + temp -= 2; // number of values in B +#endif + pa0 += temp * 2 * 4; + pb0 += temp * 2 * 2; +#endif + +#ifdef LEFT + off += 4; // number of values in A +#endif +#endif + } + + if (m & 2) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && 
defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 2; + pb0 = B + off * 2 * 2; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 2; // number of values in A +#else + temp = off + 2; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_2X2(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_2X2(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_2X2(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_2X2(, -, , -, -); +#endif + + pa0 += 4; + pb0 += 4; + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_2X2(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_2X2(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_2X2(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_2X2(+, -, -, -,); +#endif + + pa0 += 4; + pb0 += 4; + } + +#if defined(TRMMKERNEL) + CGEMM_TRMM_SCALE_2X2 +#else + CGEMM_SCALE_2X2 +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 2; // number of values in A +#else + temp -= 2; // number of values in B +#endif + pa0 += temp * 2 * 2; + pb0 += temp * 2 * 2; +#endif + +#ifdef LEFT + off += 2; // number of values in A +#endif +#endif + + pc0 += 4; + pc1 += 4; + } + + if (m & 1) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 1; + pb0 = B + off * 2 * 2; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 1; // number of values in A +#else + temp = off + 2; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_1X2(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_1X2(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_1X2(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_1X2(, -, , -, -); +#endif + + pa0 += 2; + pb0 += 4; + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_1X2(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_1X2(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_1X2(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_1X2(+, -, -, -,); +#endif + + pa0 += 2; + pb0 += 4; + } + +#if defined(TRMMKERNEL) + CGEMM_TRMM_SCALE_1X2 +#else + CGEMM_SCALE_1X2 +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 1; // number of values in A +#else + temp -= 2; // number of values in B +#endif + pa0 += temp * 2 * 1; + pb0 += temp * 2 * 2; +#endif + +#ifdef LEFT + off += 1; // number of 
values in A +#endif +#endif + + pc0 += 2; + pc1 += 2; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; // number of values in A +#endif + + l = k << 2; + B = B + l; + i = ldc << 2; + C = C + i; + } + + if (n & 1) + { + pc0 = C; + pa0 = A; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + for (i = (m >> 3); i--;) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 8; + pb0 = B + off * 2 * 1; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 8; // number of values in A +#else + temp = off + 1; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_8X1_MSA(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_8X1_MSA(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_8X1_MSA(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_8X1_MSA(, -, , -, -); +#endif + + pb0 += 2; + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_8X1_MSA(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_8X1_MSA(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_8X1_MSA(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_8X1_MSA(+, -, -, -,); +#endif + + pb0 += 2; + } + +#if defined(TRMMKERNEL) + CGEMM_TRMM_SCALE_8X1_MSA +#else + CGEMM_SCALE_8X1_MSA +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 8; // number of values in A +#else + temp -= 1; // number of values in B +#endif + pa0 += temp * 2 * 8; + pb0 += temp * 2 * 1; +#endif + +#ifdef LEFT + off += 8; // number of values in A +#endif +#endif + } + + if (m & 4) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 4; + pb0 = B + off * 2 * 1; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 4; // number of values in A +#else + temp = off + 1; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_4X1_MSA(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_4X1_MSA(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_4X1_MSA(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_4X1_MSA(, -, , -, -); +#endif + + pb0 += 2; + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_4X1_MSA(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_4X1_MSA(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_4X1_MSA(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_4X1_MSA(+, -, -, -,); +#endif + + pb0 += 2; + } + +#if 
defined(TRMMKERNEL) + CGEMM_TRMM_SCALE_4X1_MSA +#else + CGEMM_SCALE_4X1_MSA +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 4; // number of values in A +#else + temp -= 1; // number of values in B +#endif + pa0 += temp * 2 * 4; + pb0 += temp * 2 * 1; +#endif + +#ifdef LEFT + off += 4; // number of values in A +#endif +#endif + } + + if (m & 2) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 2; + pb0 = B + off * 2 * 1; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 2; // number of values in A +#else + temp = off + 1; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_2X1(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_2X1(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_2X1(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_2X1(, -, , -, -); +#endif + + pa0 += 4; + pb0 += 2; + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_2X1(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_2X1(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_2X1(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_2X1(+, -, -, -,); +#endif + + pa0 += 4; + pb0 += 2; + } + +#if defined(TRMMKERNEL) + CGEMM_TRMM_SCALE_2X1 +#else + CGEMM_SCALE_2X1 +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 2; // number of values in A +#else + temp -= 1; // number of values in B +#endif + pa0 += temp * 2 * 2; + pb0 += temp * 2 * 1; +#endif + +#ifdef LEFT + off += 2; // number of values in A +#endif +#endif + + pc0 += 4; + } + + if (m & 1) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 1; + pb0 = B + off * 2 * 1; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 1; // number of values in A +#else + temp = off + 1; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_1X1(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_1X1(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_1X1(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_1X1(, -, , -, -); +#endif + + pa0 += 2; + pb0 += 2; + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_1X1(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_1X1(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_1X1(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) 
|| defined(CC) + CGEMM_KERNEL_1X1(+, -, -, -,); +#endif + + pa0 += 2; + pb0 += 2; + } + +#if defined(TRMMKERNEL) + CGEMM_TRMM_SCALE_1X1 +#else + CGEMM_SCALE_1X1 +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 1; // number of values in A +#else + temp -= 1; // number of values in B +#endif + pa0 += temp * 2 * 1; + pb0 += temp * 2 * 1; +#endif + +#ifdef LEFT + off += 1; // number of values in A +#endif +#endif + + pc0 += 2; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 1; // number of values in A +#endif + + l = k << 1; + B = B + l; + i = ldc << 1; + C = C + i; + } + + return 0; +} diff --git a/kernel/mips/cgemm_ncopy_4_msa.c b/kernel/mips/cgemm_ncopy_4_msa.c new file mode 100644 index 0000000000..b38290b3d3 --- /dev/null +++ b/kernel/mips/cgemm_ncopy_4_msa.c @@ -0,0 +1,195 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst) +{ + BLASLONG i, j; + FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *pdst; + FLOAT ctemp01, ctemp02, ctemp03, ctemp04; + FLOAT ctemp05, ctemp06, ctemp07, ctemp08; + v4f32 src0, src1, src2, src3, src4, src5, src6, src7; + v4f32 dst0, dst1, dst4, dst5; + + psrc0 = src; + pdst = dst; + lda *= 2; + + for (j = (n >> 2); j--;) + { + psrc1 = psrc0; + psrc2 = psrc1 + lda; + psrc3 = psrc2 + lda; + psrc4 = psrc3 + lda; + psrc0 += 4 * lda; + + for (i = (m >> 2); i--;) + { + LD_SP2_INC(psrc1, 4, src0, src1); + LD_SP2_INC(psrc2, 4, src2, src3); + LD_SP2_INC(psrc3, 4, src4, src5); + LD_SP2_INC(psrc4, 4, src6, src7); + + ILVRL_D2_SP(src2, src0, dst0, dst4); + ILVRL_D2_SP(src6, src4, dst1, dst5); + + ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4); + + ILVRL_D2_SP(src3, src1, dst0, dst4); + ILVRL_D2_SP(src7, src5, dst1, dst5); + + ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4); + } + + if (m & 2) + { + src0 = LD_SP(psrc1); + src2 = LD_SP(psrc2); + src4 = LD_SP(psrc3); + src6 = LD_SP(psrc4); + psrc1 += 4; + psrc2 += 4; + psrc3 += 4; + psrc4 += 4; + + ILVRL_D2_SP(src2, src0, dst0, dst4); + ILVRL_D2_SP(src6, src4, dst1, dst5); + + ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4); + } + + if (m & 1) + { + ctemp01 = *(psrc1 + 0); + ctemp02 = *(psrc1 + 1); + ctemp03 = *(psrc2 + 0); + ctemp04 = *(psrc2 + 1); + ctemp05 = *(psrc3 + 0); + ctemp06 = *(psrc3 + 1); + ctemp07 = *(psrc4 + 0); + ctemp08 = *(psrc4 + 1); + psrc1 += 2; + psrc2 += 2; + psrc3 += 2; + psrc4 += 2; + + *(pdst + 0) = ctemp01; + *(pdst + 1) = ctemp02; + *(pdst + 2) = ctemp03; + *(pdst + 3) = ctemp04; + *(pdst + 4) = ctemp05; + *(pdst + 5) = ctemp06; + *(pdst + 6) = ctemp07; + *(pdst + 7) = ctemp08; + pdst += 8; + } + } + + if (n & 2) + { + psrc1 = psrc0; + psrc2 = psrc1 + lda; + psrc0 += 2 * lda; + + for (i = (m >> 2); i--;) + { + LD_SP2_INC(psrc1, 4, src0, src1); + LD_SP2_INC(psrc2, 4, src2, src3); + + ILVRL_D2_SP(src2, src0, dst0, dst4); + + ST_SP2_INC(dst0, dst4, pdst, 4); + + ILVRL_D2_SP(src3, src1, dst0, dst4); + + ST_SP2_INC(dst0, dst4, pdst, 4); + } + + if (m & 2) + { + src0 = LD_SP(psrc1); + src2 = LD_SP(psrc2); + psrc1 += 4; + psrc2 += 4; + + ILVRL_D2_SP(src2, src0, dst0, dst4); + + ST_SP2_INC(dst0, dst4, pdst, 4); + } + + if (m & 1) + { + ctemp01 = *(psrc1 + 0); + ctemp02 = *(psrc1 + 1); + ctemp03 = *(psrc2 + 0); + ctemp04 = *(psrc2 + 1); + psrc1 += 2; + psrc2 += 2; + + *(pdst + 0) = ctemp01; + *(pdst + 1) = ctemp02; + *(pdst + 2) = ctemp03; + *(pdst + 3) = ctemp04; + pdst += 4; + } + } + + if (n & 1) + { + psrc1 = psrc0; + + for (i = (m >> 2); i--;) + { + LD_SP2_INC(psrc1, 4, src0, src1); + ST_SP2_INC(src0, src1, pdst, 4); + } + + if (m & 2) + { + src0 = LD_SP(psrc1); + psrc1 += 4; + + ST_SP(src0, pdst); + pdst += 4; + } + + if (m & 1) + { + ctemp01 = *(psrc1 + 0); + ctemp02 = *(psrc1 + 1); + psrc1 += 2; + + *(pdst + 0) = ctemp01; + *(pdst + 1) = ctemp02; + pdst += 2; + } + } + + return 0; +} diff --git a/kernel/mips/cgemm_ncopy_8_msa.c b/kernel/mips/cgemm_ncopy_8_msa.c new file mode 100644 index 0000000000..9ea749069b --- /dev/null +++ b/kernel/mips/cgemm_ncopy_8_msa.c @@ -0,0 +1,310 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst) +{ + BLASLONG i, j; + FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *psrc5, *psrc6, *psrc7; + FLOAT *psrc8, *pdst; + FLOAT ctemp01, ctemp02, ctemp03, ctemp04, ctemp05, ctemp06, ctemp07; + FLOAT ctemp08, ctemp09, ctemp10, ctemp11, ctemp12, ctemp13, ctemp14; + FLOAT ctemp15, ctemp16; + v4f32 src0, src1, src2, src3, src4, src5, src6, src7; + v4f32 src8, src9, src10, src11, src12, src13, src14, src15; + v4f32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + + psrc0 = src; + pdst = dst; + lda *= 2; + + for (j = (n >> 3); j--;) + { + psrc1 = psrc0; + psrc2 = psrc1 + lda; + psrc3 = psrc2 + lda; + psrc4 = psrc3 + lda; + psrc5 = psrc4 + lda; + psrc6 = psrc5 + lda; + psrc7 = psrc6 + lda; + psrc8 = psrc7 + lda; + psrc0 += 8 * lda; + + for (i = (m >> 2); i--;) + { + LD_SP2_INC(psrc1, 4, src0, src1); + LD_SP2_INC(psrc2, 4, src2, src3); + LD_SP2_INC(psrc3, 4, src4, src5); + LD_SP2_INC(psrc4, 4, src6, src7); + LD_SP2_INC(psrc5, 4, src8, src9); + LD_SP2_INC(psrc6, 4, src10, src11); + LD_SP2_INC(psrc7, 4, src12, src13); + LD_SP2_INC(psrc8, 4, src14, src15); + + ILVRL_D2_SP(src2, src0, dst0, dst4); + ILVRL_D2_SP(src6, src4, dst1, dst5); + ILVRL_D2_SP(src10, src8, dst2, dst6); + ILVRL_D2_SP(src14, src12, dst3, dst7); + + ST_SP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 4); + + ILVRL_D2_SP(src3, src1, dst0, dst4); + ILVRL_D2_SP(src7, src5, dst1, dst5); + ILVRL_D2_SP(src11, src9, dst2, dst6); + ILVRL_D2_SP(src15, src13, dst3, dst7); + + ST_SP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 4); + } + + if (m & 2) + { + src0 = LD_SP(psrc1); + src2 = LD_SP(psrc2); + src4 = LD_SP(psrc3); + src6 = LD_SP(psrc4); + src8 = LD_SP(psrc5); + src10 = LD_SP(psrc6); + src12 = LD_SP(psrc7); + src14 = LD_SP(psrc8); + psrc1 += 4; + psrc2 += 4; + psrc3 += 4; + psrc4 += 4; + psrc5 += 4; + psrc6 += 4; + psrc7 += 4; + psrc8 += 4; + + ILVRL_D2_SP(src2, src0, dst0, dst4); + 
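+            /* ILVRL_D2_SP interleaves its two inputs at 64-bit (one complex
+               float element) granularity, pairing one element from each
+               column of a column pair; the ST_SP* stores that follow then
+               write out the column-interleaved panel consumed by the MSA
+               GEMM kernels. */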
ILVRL_D2_SP(src6, src4, dst1, dst5); + ILVRL_D2_SP(src10, src8, dst2, dst6); + ILVRL_D2_SP(src14, src12, dst3, dst7); + + ST_SP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 4); + } + + if (m & 1) + { + ctemp01 = *(psrc1 + 0); + ctemp02 = *(psrc1 + 1); + ctemp03 = *(psrc2 + 0); + ctemp04 = *(psrc2 + 1); + ctemp05 = *(psrc3 + 0); + ctemp06 = *(psrc3 + 1); + ctemp07 = *(psrc4 + 0); + ctemp08 = *(psrc4 + 1); + ctemp09 = *(psrc5 + 0); + ctemp10 = *(psrc5 + 1); + ctemp11 = *(psrc6 + 0); + ctemp12 = *(psrc6 + 1); + ctemp13 = *(psrc7 + 0); + ctemp14 = *(psrc7 + 1); + ctemp15 = *(psrc8 + 0); + ctemp16 = *(psrc8 + 1); + psrc1 += 2; + psrc2 += 2; + psrc3 += 2; + psrc4 += 2; + psrc5 += 2; + psrc6 += 2; + psrc7 += 2; + psrc8 += 2; + + *(pdst + 0) = ctemp01; + *(pdst + 1) = ctemp02; + *(pdst + 2) = ctemp03; + *(pdst + 3) = ctemp04; + *(pdst + 4) = ctemp05; + *(pdst + 5) = ctemp06; + *(pdst + 6) = ctemp07; + *(pdst + 7) = ctemp08; + *(pdst + 8) = ctemp09; + *(pdst + 9) = ctemp10; + *(pdst + 10) = ctemp11; + *(pdst + 11) = ctemp12; + *(pdst + 12) = ctemp13; + *(pdst + 13) = ctemp14; + *(pdst + 14) = ctemp15; + *(pdst + 15) = ctemp16; + pdst += 16; + } + } + + if (n & 4) + { + psrc1 = psrc0; + psrc2 = psrc1 + lda; + psrc3 = psrc2 + lda; + psrc4 = psrc3 + lda; + psrc0 += 4 * lda; + + for (i = (m >> 2); i--;) + { + LD_SP2_INC(psrc1, 4, src0, src1); + LD_SP2_INC(psrc2, 4, src2, src3); + LD_SP2_INC(psrc3, 4, src4, src5); + LD_SP2_INC(psrc4, 4, src6, src7); + + ILVRL_D2_SP(src2, src0, dst0, dst4); + ILVRL_D2_SP(src6, src4, dst1, dst5); + + ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4); + + ILVRL_D2_SP(src3, src1, dst0, dst4); + ILVRL_D2_SP(src7, src5, dst1, dst5); + + ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4); + } + + if (m & 2) + { + src0 = LD_SP(psrc1); + src2 = LD_SP(psrc2); + src4 = LD_SP(psrc3); + src6 = LD_SP(psrc4); + psrc1 += 4; + psrc2 += 4; + psrc3 += 4; + psrc4 += 4; + + ILVRL_D2_SP(src2, src0, dst0, dst4); + ILVRL_D2_SP(src6, src4, dst1, dst5); + + ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4); + } + + if (m & 1) + { + ctemp01 = *(psrc1 + 0); + ctemp02 = *(psrc1 + 1); + ctemp03 = *(psrc2 + 0); + ctemp04 = *(psrc2 + 1); + ctemp05 = *(psrc3 + 0); + ctemp06 = *(psrc3 + 1); + ctemp07 = *(psrc4 + 0); + ctemp08 = *(psrc4 + 1); + psrc1 += 2; + psrc2 += 2; + psrc3 += 2; + psrc4 += 2; + + *(pdst + 0) = ctemp01; + *(pdst + 1) = ctemp02; + *(pdst + 2) = ctemp03; + *(pdst + 3) = ctemp04; + *(pdst + 4) = ctemp05; + *(pdst + 5) = ctemp06; + *(pdst + 6) = ctemp07; + *(pdst + 7) = ctemp08; + pdst += 8; + } + } + + if (n & 2) + { + psrc1 = psrc0; + psrc2 = psrc1 + lda; + psrc0 += 2 * lda; + + for (i = (m >> 2); i--;) + { + LD_SP2_INC(psrc1, 4, src0, src1); + LD_SP2_INC(psrc2, 4, src2, src3); + + ILVRL_D2_SP(src2, src0, dst0, dst4); + + ST_SP2_INC(dst0, dst4, pdst, 4); + + ILVRL_D2_SP(src3, src1, dst0, dst4); + + ST_SP2_INC(dst0, dst4, pdst, 4); + } + + if (m & 2) + { + src0 = LD_SP(psrc1); + src2 = LD_SP(psrc2); + psrc1 += 4; + psrc2 += 4; + + ILVRL_D2_SP(src2, src0, dst0, dst4); + + ST_SP2_INC(dst0, dst4, pdst, 4); + } + + if (m & 1) + { + ctemp01 = *(psrc1 + 0); + ctemp02 = *(psrc1 + 1); + ctemp03 = *(psrc2 + 0); + ctemp04 = *(psrc2 + 1); + psrc1 += 2; + psrc2 += 2; + + *(pdst + 0) = ctemp01; + *(pdst + 1) = ctemp02; + *(pdst + 2) = ctemp03; + *(pdst + 3) = ctemp04; + pdst += 4; + } + } + + if (n & 1) + { + psrc1 = psrc0; + + for (i = (m >> 2); i--;) + { + LD_SP2_INC(psrc1, 4, src0, src1); + ST_SP2_INC(src0, src1, pdst, 4); + } + + if (m & 2) + { + src0 = LD_SP(psrc1); + psrc1 += 4; + + ST_SP(src0, 
pdst); + pdst += 4; + } + + if (m & 1) + { + ctemp01 = *(psrc1 + 0); + ctemp02 = *(psrc1 + 1); + psrc1 += 2; + + *(pdst + 0) = ctemp01; + *(pdst + 1) = ctemp02; + pdst += 2; + } + } + + return 0; +} diff --git a/kernel/mips/cgemm_tcopy_4_msa.c b/kernel/mips/cgemm_tcopy_4_msa.c new file mode 100644 index 0000000000..12aaa979e4 --- /dev/null +++ b/kernel/mips/cgemm_tcopy_4_msa.c @@ -0,0 +1,125 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst) +{ + BLASLONG i, j; + FLOAT *psrc0; + FLOAT *psrc1, *psrc2; + FLOAT *pdst0; + FLOAT ctemp01, ctemp02, ctemp03, ctemp04; + v4f32 src0, src1, src2, src3; + + psrc0 = src; + pdst0 = dst; + lda *= 2; + + for (j = (n >> 2); j--;) + { + psrc1 = psrc0; + psrc2 = psrc0 + lda; + psrc0 += 8; + + for (i = (m >> 1); i--;) + { + LD_SP2(psrc1, 4, src0, src1); + LD_SP2(psrc2, 4, src2, src3); + ST_SP4_INC(src0, src1, src2, src3, pdst0, 4); + psrc1 += 2 * lda; + psrc2 += 2 * lda; + } + + if (m & 1) + { + LD_SP2(psrc1, 4, src0, src1); + ST_SP2_INC(src0, src1, pdst0, 4); + } + } + + if (n & 2) + { + psrc1 = psrc0; + psrc2 = psrc0 + lda; + psrc0 += 4; + + for (i = (m >> 1); i--;) + { + src0 = LD_SP(psrc1); + src1 = LD_SP(psrc2); + ST_SP2_INC(src0, src1, pdst0, 4); + + psrc1 += 2 * lda; + psrc2 += 2 * lda; + } + + if (m & 1) + { + src0 = LD_SP(psrc1); + ST_SP(src0, pdst0); + pdst0 += 4; + } + } + + if (n & 1) + { + psrc1 = psrc0; + psrc2 = psrc0 + lda; + psrc0 += 2; + + for (i = (m >> 1); i--;) + { + ctemp01 = *(psrc1 + 0); + ctemp02 = *(psrc1 + 1); + ctemp03 = *(psrc2 + 0); + ctemp04 = *(psrc2 + 1); + + *(pdst0 + 0) = ctemp01; + *(pdst0 + 1) = ctemp02; + *(pdst0 + 2) = ctemp03; + *(pdst0 + 3) = ctemp04; + + psrc1 += 2 * lda; + psrc2 += 2 * lda; + pdst0 += 4; + } + + if (m & 1) + { + ctemp01 = *(psrc1 + 0); + ctemp02 = *(psrc1 + 1); + + *(pdst0 + 0) = ctemp01; + *(pdst0 + 1) = ctemp02; + pdst0 += 2; + } + } + + return 0; +} diff --git a/kernel/mips/cgemm_tcopy_8_msa.c b/kernel/mips/cgemm_tcopy_8_msa.c new file mode 100644 index 0000000000..9f78fa73a6 --- /dev/null +++ b/kernel/mips/cgemm_tcopy_8_msa.c @@ -0,0 +1,214 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst) +{ + BLASLONG i, j; + FLOAT *psrc0, *psrc1, *psrc2, *pdst0; + FLOAT ctemp01, ctemp02, ctemp03, ctemp04; + v4f32 src0, src1, src2, src3, src4, src5, src6, src7; + v4f32 src8, src9, src10, src11, src12, src13, src14, src15; + + psrc0 = src; + pdst0 = dst; + lda *= 2; + + for (j = (n >> 3); j--;) + { + psrc1 = psrc0; + psrc2 = psrc0 + lda; + psrc0 += 16; + + for (i = (m >> 2); i--;) + { + LD_SP4(psrc1, 4, src0, src1, src2, src3); + LD_SP4(psrc2, 4, src4, src5, src6, src7); + LD_SP4(psrc1 + 2 * lda, 4, src8, src9, src10, src11); + LD_SP4(psrc2 + 2 * lda, 4, src12, src13, src14, src15); + ST_SP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst0, 4); + ST_SP8_INC(src8, src9, src10, src11, src12, src13, src14, src15, pdst0, 4); + psrc1 += 4 * lda; + psrc2 += 4 * lda; + } + + if (m & 2) + { + LD_SP4(psrc1, 4, src0, src1, src2, src3); + LD_SP4(psrc2, 4, src4, src5, src6, src7); + ST_SP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst0, 4); + psrc1 += 2 * lda; + psrc2 += 2 * lda; + } + + if (m & 1) + { + LD_SP4(psrc1, 4, src0, src1, src2, src3); + ST_SP4_INC(src0, src1, src2, src3, pdst0, 4); + } + } + + if (n & 4) + { + psrc1 = psrc0; + psrc2 = psrc0 + lda; + psrc0 += 8; + + for (i = (m >> 2); i--;) + { + LD_SP2(psrc1, 4, src0, src1); + LD_SP2(psrc2, 4, src2, src3); + LD_SP2(psrc1 + 2 * lda, 4, src4, src5); + LD_SP2(psrc2 + 2 * lda, 4, src6, src7); + + ST_SP4_INC(src0, src1, src2, src3, pdst0, 4); + ST_SP4_INC(src4, src5, src6, src7, pdst0, 4); + psrc1 += 4 * lda; + psrc2 += 4 * lda; + } + + if (m & 2) + { + LD_SP2(psrc1, 4, src0, src1); + LD_SP2(psrc2, 4, src2, src3); + ST_SP4_INC(src0, src1, src2, src3, pdst0, 4); + psrc1 += 2 * lda; + psrc2 += 2 * lda; + } + + if (m & 1) + { + LD_SP2(psrc1, 4, src0, src1); + ST_SP2_INC(src0, src1, pdst0, 4); + } + } + + if (n & 2) + { + psrc1 = psrc0; + psrc2 = psrc0 + lda; + psrc0 += 4; + + for (i = (m >> 2); i--;) + { + src0 = LD_SP(psrc1); + src1 = LD_SP(psrc2); + src2 = LD_SP(psrc1 + 2 * lda); + src3 = LD_SP(psrc2 + 2 * lda); + ST_SP4_INC(src0, src1, src2, src3, pdst0, 4); + + psrc1 += 4 * lda; + psrc2 += 4 * lda; + } + + if (m & 2) + { + src0 = LD_SP(psrc1); + src1 = LD_SP(psrc2); + ST_SP2_INC(src0, src1, pdst0, 4); + + psrc1 += 2 * lda; + psrc2 += 2 * lda; + } + + if (m & 1) + { + src0 = LD_SP(psrc1); + ST_SP(src0, pdst0); + pdst0 += 4; + } + } + + if (n & 1) + { + psrc1 = psrc0; + psrc2 = psrc0 + lda; + psrc0 += 2; + + for (i = (m >> 2); i--;) + { + ctemp01 = *(psrc1 + 0); + ctemp02 = *(psrc1 + 1); + ctemp03 = *(psrc2 + 0); + ctemp04 = *(psrc2 + 1); + + *(pdst0 + 0) = ctemp01; + *(pdst0 + 1) = ctemp02; + *(pdst0 + 2) = ctemp03; + *(pdst0 + 3) = ctemp04; + + psrc1 += 2 * lda; + psrc2 += 2 * lda; + pdst0 += 4; + + ctemp01 = *(psrc1 + 0); + ctemp02 = *(psrc1 + 1); + ctemp03 = *(psrc2 + 0); + ctemp04 = *(psrc2 + 1); + + *(pdst0 + 0) = ctemp01; + *(pdst0 + 1) = ctemp02; + *(pdst0 + 2) = ctemp03; + *(pdst0 + 3) = ctemp04; + + psrc1 += 2 * lda; + psrc2 += 2 * lda; + pdst0 += 4; + } + + if (m & 2) + { + ctemp01 = *(psrc1 + 0); + ctemp02 = *(psrc1 + 1); + ctemp03 = *(psrc2 + 0); + ctemp04 = *(psrc2 + 1); + + *(pdst0 + 0) = ctemp01; + *(pdst0 + 1) = ctemp02; + *(pdst0 + 2) = ctemp03; + *(pdst0 + 3) = ctemp04; + + psrc1 += 2 * lda; + psrc2 += 2 * lda; + pdst0 += 4; + } + + if (m & 1) + { + ctemp01 = *(psrc1 + 0); + ctemp02 = *(psrc1 + 1); + + 
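+            /* Remainder case: the single complex element left over is copied
+               scalar-wise, real part followed by imaginary part. */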
*(pdst0 + 0) = ctemp01; + *(pdst0 + 1) = ctemp02; + pdst0 += 2; + } + } + + return 0; +} diff --git a/kernel/mips/cgemv_n_msa.c b/kernel/mips/cgemv_n_msa.c new file mode 100644 index 0000000000..f1879ba003 --- /dev/null +++ b/kernel/mips/cgemv_n_msa.c @@ -0,0 +1,611 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +#undef OP0 +#undef OP1 +#undef OP2 +#undef OP3 +#undef OP4 + +#if !defined(XCONJ) + #define OP3 -= + #define OP4 += +#else + #define OP3 += + #define OP4 -= +#endif + +#if !defined(CONJ) + #if !defined(XCONJ) + #define OP0 -= + #define OP1 += + #define OP2 += + #else + #define OP0 += + #define OP1 += + #define OP2 -= + #endif +#else + #if !defined(XCONJ) + #define OP0 += + #define OP1 -= + #define OP2 -= + #else + #define OP0 -= + #define OP1 -= + #define OP2 += + #endif +#endif + +#define CGEMV_N_8x4() \ + LD_SP4(pa0 + k, 4, t0, t1, t2, t3); \ + LD_SP4(pa1 + k, 4, t4, t5, t6, t7); \ + LD_SP4(pa2 + k, 4, t8, t9, t10, t11); \ + LD_SP4(pa3 + k, 4, t12, t13, t14, t15); \ + \ + PCKEVOD_W2_SP(t1, t0, src0r, src0i); \ + PCKEVOD_W2_SP(t3, t2, src1r, src1i); \ + PCKEVOD_W2_SP(t5, t4, src2r, src2i); \ + PCKEVOD_W2_SP(t7, t6, src3r, src3i); \ + PCKEVOD_W2_SP(t9, t8, src4r, src4i); \ + PCKEVOD_W2_SP(t11, t10, src5r, src5i); \ + PCKEVOD_W2_SP(t13, t12, src6r, src6i); \ + PCKEVOD_W2_SP(t15, t14, src7r, src7i); \ + \ + y0r += tp0r * src0r; \ + y1r += tp0r * src1r; \ + y0r += tp1r * src2r; \ + y1r += tp1r * src3r; \ + y0r += tp2r * src4r; \ + y1r += tp2r * src5r; \ + y0r += tp3r * src6r; \ + y1r += tp3r * src7r; \ + \ + y0r OP0 tp0i * src0i; \ + y1r OP0 tp0i * src1i; \ + y0r OP0 tp1i * src2i; \ + y1r OP0 tp1i * src3i; \ + y0r OP0 tp2i * src4i; \ + y1r OP0 tp2i * src5i; \ + y0r OP0 tp3i * src6i; \ + y1r OP0 tp3i * src7i; \ + \ + y0i OP1 tp0r * src0i; \ + y1i OP1 tp0r * src1i; \ + y0i OP1 tp1r * src2i; \ + y1i OP1 tp1r * src3i; \ + y0i OP1 tp2r * src4i; \ + y1i OP1 tp2r * src5i; \ + y0i OP1 tp3r * src6i; \ + y1i OP1 tp3r * src7i; \ + \ + y0i OP2 tp0i * src0r; \ + y1i OP2 tp0i * src1r; \ + y0i OP2 tp1i * src2r; \ + y1i OP2 tp1i * src3r; \ + y0i OP2 tp2i * src4r; \ + y1i OP2 tp2i * src5r; \ + y0i OP2 tp3i * src6r; \ + y1i OP2 tp3i * src7r; \ + +#define CGEMV_N_4x4() \ + LD_SP2(pa0 + k, 4, t0, t1); \ + LD_SP2(pa1 + k, 4, t4, t5); \ + LD_SP2(pa2 + k, 4, t8, t9); \ + LD_SP2(pa3 + k, 4, t12, t13); \ + \ + PCKEVOD_W2_SP(t1, t0, src0r, src0i); \ + PCKEVOD_W2_SP(t5, t4, src2r, src2i); \ + PCKEVOD_W2_SP(t9, t8, src4r, src4i); \ + PCKEVOD_W2_SP(t13, t12, src6r, src6i); \ + \ + y0r += tp0r * src0r; \ + y0r += tp1r * src2r; \ + y0r += tp2r * src4r; \ + y0r += tp3r * src6r; \ + \ + y0r OP0 tp0i * src0i; \ + y0r OP0 tp1i * src2i; \ + y0r OP0 tp2i * src4i; \ + y0r OP0 tp3i * src6i; \ + \ + y0i OP1 tp0r * src0i; \ + y0i OP1 tp1r * src2i; \ + y0i OP1 tp2r * src4i; \ + y0i OP1 tp3r * src6i; \ + \ + y0i OP2 tp0i * src0r; \ + y0i OP2 tp1i * src2r; \ + y0i OP2 tp2i * src4r; \ + y0i OP2 tp3i * src6r; \ + +#define CGEMV_N_1x4() \ + res0 = y[0 * inc_y2]; \ + res1 = y[0 * inc_y2 + 1]; \ + \ + res0 += temp0_r * pa0[k]; \ + res0 OP0 temp0_i * pa0[k + 1]; \ + res0 += temp1_r * pa1[k]; \ + res0 OP0 temp1_i * pa1[k + 1]; \ + res0 += temp2_r * pa2[k]; \ + res0 OP0 temp2_i * pa2[k + 1]; \ + res0 += temp3_r * pa3[k]; \ + res0 OP0 temp3_i * pa3[k + 1]; \ + \ + res1 OP1 temp0_r * pa0[k + 1]; \ + res1 OP2 temp0_i * pa0[k]; \ + res1 OP1 temp1_r * pa1[k + 1]; \ + res1 OP2 temp1_i * pa1[k]; \ + res1 OP1 temp2_r * pa2[k + 1]; \ + res1 OP2 temp2_i * pa2[k]; \ + res1 OP1 temp3_r * pa3[k + 1]; \ + res1 OP2 temp3_i * pa3[k]; \ + \ + y[0 * inc_y2] = res0; \ + y[0 * inc_y2 + 1] = res1; \ + +#define CGEMV_N_8x2() \ + LD_SP4(pa0 + k, 4, t0, t1, t2, t3); \ + LD_SP4(pa1 + k, 4, t4, t5, t6, t7); \ + \ + PCKEVOD_W2_SP(t1, t0, 
src0r, src0i); \ + PCKEVOD_W2_SP(t3, t2, src1r, src1i); \ + PCKEVOD_W2_SP(t5, t4, src2r, src2i); \ + PCKEVOD_W2_SP(t7, t6, src3r, src3i); \ + \ + y0r += tp0r * src0r; \ + y1r += tp0r * src1r; \ + y0r += tp1r * src2r; \ + y1r += tp1r * src3r; \ + \ + y0r OP0 tp0i * src0i; \ + y1r OP0 tp0i * src1i; \ + y0r OP0 tp1i * src2i; \ + y1r OP0 tp1i * src3i; \ + \ + y0i OP1 tp0r * src0i; \ + y1i OP1 tp0r * src1i; \ + y0i OP1 tp1r * src2i; \ + y1i OP1 tp1r * src3i; \ + \ + y0i OP2 tp0i * src0r; \ + y1i OP2 tp0i * src1r; \ + y0i OP2 tp1i * src2r; \ + y1i OP2 tp1i * src3r; \ + +#define CGEMV_N_4x2() \ + LD_SP2(pa0 + k, 4, t0, t1); \ + LD_SP2(pa1 + k, 4, t4, t5); \ + \ + PCKEVOD_W2_SP(t1, t0, src0r, src0i); \ + PCKEVOD_W2_SP(t5, t4, src2r, src2i); \ + \ + y0r += tp0r * src0r; \ + y0r += tp1r * src2r; \ + \ + y0r OP0 tp0i * src0i; \ + y0r OP0 tp1i * src2i; \ + \ + y0i OP1 tp0r * src0i; \ + y0i OP1 tp1r * src2i; \ + \ + y0i OP2 tp0i * src0r; \ + y0i OP2 tp1i * src2r; \ + +#define CGEMV_N_1x2() \ + res0 = y[0 * inc_y2]; \ + res1 = y[0 * inc_y2 + 1]; \ + \ + res0 += temp0_r * pa0[k]; \ + res0 OP0 temp0_i * pa0[k + 1]; \ + res0 += temp1_r * pa1[k]; \ + res0 OP0 temp1_i * pa1[k + 1]; \ + \ + res1 OP1 temp0_r * pa0[k + 1]; \ + res1 OP2 temp0_i * pa0[k]; \ + res1 OP1 temp1_r * pa1[k + 1]; \ + res1 OP2 temp1_i * pa1[k]; \ + \ + y[0 * inc_y2] = res0; \ + y[0 * inc_y2 + 1] = res1; \ + +#define CGEMV_N_1x1() \ + res0 = y[0 * inc_y2]; \ + res1 = y[0 * inc_y2 + 1]; \ + \ + res0 += temp_r * pa0[k]; \ + res0 OP0 temp_i * pa0[k + 1]; \ + \ + res1 OP1 temp_r * pa0[k + 1]; \ + res1 OP2 temp_i * pa0[k]; \ + \ + y[0 * inc_y2] = res0; \ + y[0 * inc_y2 + 1] = res1; \ + +#define CLOAD_X4_SCALE_VECTOR() \ + LD_SP2(x, 4, x0, x1); \ + \ + PCKEVOD_W2_SP(x1, x0, x0r, x0i); \ + \ + tp4r = alphar * x0r; \ + tp4r OP3 alphai * x0i; \ + tp4i = alphar * x0i; \ + tp4i OP4 alphai * x0r; \ + \ + SPLATI_W4_SP(tp4r, tp0r, tp1r, tp2r, tp3r); \ + SPLATI_W4_SP(tp4i, tp0i, tp1i, tp2i, tp3i); \ + +#define CLOAD_X4_SCALE_GP() \ + x0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2))); \ + x0r = (v4f32) __msa_insert_w((v4i32) x0r, 1, *((int *) (x + 1 * inc_x2))); \ + x0r = (v4f32) __msa_insert_w((v4i32) x0r, 2, *((int *) (x + 2 * inc_x2))); \ + x0r = (v4f32) __msa_insert_w((v4i32) x0r, 3, *((int *) (x + 3 * inc_x2))); \ + x0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2 + 1))); \ + x0i = (v4f32) __msa_insert_w((v4i32) x0i, 1, *((int *) (x + 1 * inc_x2 + 1))); \ + x0i = (v4f32) __msa_insert_w((v4i32) x0i, 2, *((int *) (x + 2 * inc_x2 + 1))); \ + x0i = (v4f32) __msa_insert_w((v4i32) x0i, 3, *((int *) (x + 3 * inc_x2 + 1))); \ + \ + tp4r = alphar * x0r; \ + tp4r OP3 alphai * x0i; \ + tp4i = alphar * x0i; \ + tp4i OP4 alphai * x0r; \ + \ + SPLATI_W4_SP(tp4r, tp0r, tp1r, tp2r, tp3r); \ + SPLATI_W4_SP(tp4i, tp0i, tp1i, tp2i, tp3i); \ + +#define CLOAD_X2_SCALE_GP() \ + temp0_r = alpha_r * x[0 * inc_x2]; \ + temp0_r OP3 alpha_i * x[0 * inc_x2 + 1]; \ + temp0_i = alpha_r * x[0 * inc_x2 + 1]; \ + temp0_i OP4 alpha_i * x[0 * inc_x2]; \ + \ + temp1_r = alpha_r * x[1 * inc_x2]; \ + temp1_r OP3 alpha_i * x[1 * inc_x2 + 1]; \ + temp1_i = alpha_r * x[1 * inc_x2 + 1]; \ + temp1_i OP4 alpha_i * x[1 * inc_x2]; \ + \ + tp0r = (v4f32) COPY_FLOAT_TO_VECTOR(temp0_r); \ + tp0i = (v4f32) COPY_FLOAT_TO_VECTOR(temp0_i); \ + tp1r = (v4f32) COPY_FLOAT_TO_VECTOR(temp1_r); \ + tp1i = (v4f32) COPY_FLOAT_TO_VECTOR(temp1_i); \ + +#define CLOAD_X1_SCALE_GP() \ + temp_r = alpha_r * x[0 * inc_x2]; \ + temp_r OP3 alpha_i * x[0 * inc_x2 + 1]; \ + 
temp_i = alpha_r * x[0 * inc_x2 + 1]; \ + temp_i OP4 alpha_i * x[0 * inc_x2]; \ + +#define CLOAD_Y8_VECTOR() \ + LD_SP4(y, 4, y0, y1, y2, y3); \ + PCKEVOD_W2_SP(y1, y0, y0r, y0i); \ + PCKEVOD_W2_SP(y3, y2, y1r, y1i); \ + +#define CLOAD_Y4_VECTOR() \ + LD_SP2(y, 4, y0, y1); \ + PCKEVOD_W2_SP(y1, y0, y0r, y0i); \ + +#define CSTORE_Y8_VECTOR() \ + ILVRL_W2_SP(y0i, y0r, y0, y1); \ + ILVRL_W2_SP(y1i, y1r, y2, y3); \ + ST_SP4(y0, y1, y2, y3, y, 4); \ + +#define CSTORE_Y4_VECTOR() \ + ILVRL_W2_SP(y0i, y0r, y0, y1); \ + ST_SP2(y0, y1, y, 4); \ + +#define CLOAD_Y8_GP() \ + y0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 0 * inc_y2))); \ + y0r = (v4f32) __msa_insert_w((v4i32) y0r, 1, *((int *)(y + 1 * inc_y2))); \ + y0r = (v4f32) __msa_insert_w((v4i32) y0r, 2, *((int *)(y + 2 * inc_y2))); \ + y0r = (v4f32) __msa_insert_w((v4i32) y0r, 3, *((int *)(y + 3 * inc_y2))); \ + y1r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 4 * inc_y2))); \ + y1r = (v4f32) __msa_insert_w((v4i32) y1r, 1, *((int *)(y + 5 * inc_y2))); \ + y1r = (v4f32) __msa_insert_w((v4i32) y1r, 2, *((int *)(y + 6 * inc_y2))); \ + y1r = (v4f32) __msa_insert_w((v4i32) y1r, 3, *((int *)(y + 7 * inc_y2))); \ + y0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 0 * inc_y2 + 1))); \ + y0i = (v4f32) __msa_insert_w((v4i32) y0i, 1, *((int *)(y + 1 * inc_y2 + 1))); \ + y0i = (v4f32) __msa_insert_w((v4i32) y0i, 2, *((int *)(y + 2 * inc_y2 + 1))); \ + y0i = (v4f32) __msa_insert_w((v4i32) y0i, 3, *((int *)(y + 3 * inc_y2 + 1))); \ + y1i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 4 * inc_y2 + 1))); \ + y1i = (v4f32) __msa_insert_w((v4i32) y1i, 1, *((int *)(y + 5 * inc_y2 + 1))); \ + y1i = (v4f32) __msa_insert_w((v4i32) y1i, 2, *((int *)(y + 6 * inc_y2 + 1))); \ + y1i = (v4f32) __msa_insert_w((v4i32) y1i, 3, *((int *)(y + 7 * inc_y2 + 1))); \ + +#define CLOAD_Y4_GP() \ + y0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 0 * inc_y2))); \ + y0r = (v4f32) __msa_insert_w((v4i32) y0r, 1, *((int *)(y + 1 * inc_y2))); \ + y0r = (v4f32) __msa_insert_w((v4i32) y0r, 2, *((int *)(y + 2 * inc_y2))); \ + y0r = (v4f32) __msa_insert_w((v4i32) y0r, 3, *((int *)(y + 3 * inc_y2))); \ + y0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 0 * inc_y2 + 1))); \ + y0i = (v4f32) __msa_insert_w((v4i32) y0i, 1, *((int *)(y + 1 * inc_y2 + 1))); \ + y0i = (v4f32) __msa_insert_w((v4i32) y0i, 2, *((int *)(y + 2 * inc_y2 + 1))); \ + y0i = (v4f32) __msa_insert_w((v4i32) y0i, 3, *((int *)(y + 3 * inc_y2 + 1))); \ + +#define CSTORE_Y8_GP() \ + *((int *)(y + 0 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 0); \ + *((int *)(y + 1 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 1); \ + *((int *)(y + 2 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 2); \ + *((int *)(y + 3 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 3); \ + *((int *)(y + 4 * inc_y2)) = __msa_copy_s_w((v4i32) y1r, 0); \ + *((int *)(y + 5 * inc_y2)) = __msa_copy_s_w((v4i32) y1r, 1); \ + *((int *)(y + 6 * inc_y2)) = __msa_copy_s_w((v4i32) y1r, 2); \ + *((int *)(y + 7 * inc_y2)) = __msa_copy_s_w((v4i32) y1r, 3); \ + *((int *)(y + 0 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 0); \ + *((int *)(y + 1 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 1); \ + *((int *)(y + 2 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 2); \ + *((int *)(y + 3 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 3); \ + *((int *)(y + 4 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y1i, 0); \ + *((int *)(y + 5 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y1i, 1); \ + *((int *)(y + 6 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y1i, 2); \ + 
*((int *)(y + 7 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y1i, 3); \ + +#define CSTORE_Y4_GP() \ + *((int *)(y + 0 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 0); \ + *((int *)(y + 1 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 1); \ + *((int *)(y + 2 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 2); \ + *((int *)(y + 3 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 3); \ + *((int *)(y + 0 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 0); \ + *((int *)(y + 1 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 1); \ + *((int *)(y + 2 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 2); \ + *((int *)(y + 3 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 3); \ + +#define CGEMV_N_MSA() \ + for (j = (n >> 2); j--;) \ + { \ + CLOAD_X4_SCALE(); \ + \ + k = 0; \ + y = y_org; \ + \ + for (i = (m >> 3); i--;) \ + { \ + CLOAD_Y8() \ + CGEMV_N_8x4(); \ + CSTORE_Y8(); \ + \ + k += 2 * 8; \ + y += inc_y2 * 8; \ + } \ + \ + if (m & 4) \ + { \ + CLOAD_Y4(); \ + CGEMV_N_4x4(); \ + CSTORE_Y4(); \ + \ + k += 2 * 4; \ + y += inc_y2 * 4; \ + } \ + \ + if (m & 3) \ + { \ + temp0_r = tp4r[0]; \ + temp1_r = tp4r[1]; \ + temp2_r = tp4r[2]; \ + temp3_r = tp4r[3]; \ + \ + temp0_i = tp4i[0]; \ + temp1_i = tp4i[1]; \ + temp2_i = tp4i[2]; \ + temp3_i = tp4i[3]; \ + \ + for (i = (m & 3); i--;) \ + { \ + CGEMV_N_1x4(); \ + \ + k += 2; \ + y += inc_y2; \ + } \ + } \ + \ + pa0 += 4 * lda2; \ + pa1 += 4 * lda2; \ + pa2 += 4 * lda2; \ + pa3 += 4 * lda2; \ + \ + x += 4 * inc_x2; \ + } \ + \ + if (n & 2) \ + { \ + CLOAD_X2_SCALE(); \ + \ + k = 0; \ + y = y_org; \ + \ + for (i = (m >> 3); i--;) \ + { \ + CLOAD_Y8(); \ + CGEMV_N_8x2(); \ + CSTORE_Y8(); \ + \ + k += 2 * 8; \ + y += inc_y2 * 8; \ + } \ + \ + if (m & 4) \ + { \ + CLOAD_Y4(); \ + CGEMV_N_4x2(); \ + CSTORE_Y4(); \ + \ + k += 2 * 4; \ + y += inc_y2 * 4; \ + } \ + \ + for (i = (m & 3); i--;) \ + { \ + CGEMV_N_1x2(); \ + \ + k += 2; \ + y += inc_y2; \ + } \ + \ + pa0 += 2 * lda2; \ + pa1 += 2 * lda2; \ + \ + x += 2 * inc_x2; \ + } \ + \ + if (n & 1) \ + { \ + CLOAD_X1_SCALE(); \ + \ + k = 0; \ + y = y_org; \ + \ + for (i = m; i--;) \ + { \ + CGEMV_N_1x1(); \ + \ + k += 2; \ + y += inc_y2; \ + } \ + \ + pa0 += lda2; \ + x += inc_x2; \ + } \ + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, + FLOAT *A, BLASLONG lda2, FLOAT *x, BLASLONG inc_x2, FLOAT *y, + BLASLONG inc_y2, FLOAT *buffer) +{ + BLASLONG i, j, k; + FLOAT *y_org = y; + FLOAT *pa0, *pa1, *pa2, *pa3; + FLOAT temp_r, temp_i, res0, res1, temp0_r; + FLOAT temp0_i, temp1_r, temp1_i, temp2_r, temp2_i, temp3_r, temp3_i; + v4f32 alphar, alphai; + v4f32 x0, x1, y0, y1, y2, y3, x0r, x0i, y0r, y1r, y0i, y1i; + v4f32 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15; + v4f32 src0r, src1r, src2r, src3r, src4r, src5r, src6r, src7r; + v4f32 src0i, src1i, src2i, src3i, src4i, src5i, src6i, src7i; + v4f32 tp0r, tp1r, tp2r, tp3r, tp4r, tp0i, tp1i, tp2i, tp3i, tp4i; + + lda2 = 2 * lda2; + inc_x2 = 2 * inc_x2; + inc_y2 = 2 * inc_y2; + + pa0 = A; + pa1 = A + lda2; + pa2 = A + 2 * lda2; + pa3 = A + 3 * lda2; + + alphar = COPY_FLOAT_TO_VECTOR(alpha_r); + alphai = COPY_FLOAT_TO_VECTOR(alpha_i); + + if ((2 == inc_x2) && (2 == inc_y2)) + { + #define CLOAD_X4_SCALE CLOAD_X4_SCALE_VECTOR + #define CLOAD_X2_SCALE CLOAD_X2_SCALE_GP + #define CLOAD_X1_SCALE CLOAD_X1_SCALE_GP + #define CLOAD_Y8 CLOAD_Y8_VECTOR + #define CLOAD_Y4 CLOAD_Y4_VECTOR + #define CSTORE_Y8 CSTORE_Y8_VECTOR + #define CSTORE_Y4 CSTORE_Y4_VECTOR + + CGEMV_N_MSA(); + + #undef CLOAD_X4_SCALE + #undef CLOAD_X2_SCALE + #undef CLOAD_X1_SCALE + #undef 
CLOAD_Y8 + #undef CLOAD_Y4 + #undef CSTORE_Y8 + #undef CSTORE_Y4 + } + else if (2 == inc_x2) + { + #define CLOAD_X4_SCALE CLOAD_X4_SCALE_VECTOR + #define CLOAD_X2_SCALE CLOAD_X2_SCALE_GP + #define CLOAD_X1_SCALE CLOAD_X1_SCALE_GP + #define CLOAD_Y8 CLOAD_Y8_GP + #define CLOAD_Y4 CLOAD_Y4_GP + #define CSTORE_Y8 CSTORE_Y8_GP + #define CSTORE_Y4 CSTORE_Y4_GP + + CGEMV_N_MSA(); + + #undef CLOAD_X4_SCALE + #undef CLOAD_X2_SCALE + #undef CLOAD_X1_SCALE + #undef CLOAD_Y8 + #undef CLOAD_Y4 + #undef CSTORE_Y8 + #undef CSTORE_Y4 + } + else if (2 == inc_y2) + { + #define CLOAD_X4_SCALE CLOAD_X4_SCALE_GP + #define CLOAD_X2_SCALE CLOAD_X2_SCALE_GP + #define CLOAD_X1_SCALE CLOAD_X1_SCALE_GP + #define CLOAD_Y8 CLOAD_Y8_VECTOR + #define CLOAD_Y4 CLOAD_Y4_VECTOR + #define CSTORE_Y8 CSTORE_Y8_VECTOR + #define CSTORE_Y4 CSTORE_Y4_VECTOR + + CGEMV_N_MSA(); + + #undef CLOAD_X4_SCALE + #undef CLOAD_X2_SCALE + #undef CLOAD_X1_SCALE + #undef CLOAD_Y8 + #undef CLOAD_Y4 + #undef CSTORE_Y8 + #undef CSTORE_Y4 + } + else + { + #define CLOAD_X4_SCALE CLOAD_X4_SCALE_GP + #define CLOAD_X2_SCALE CLOAD_X2_SCALE_GP + #define CLOAD_X1_SCALE CLOAD_X1_SCALE_GP + #define CLOAD_Y8 CLOAD_Y8_GP + #define CLOAD_Y4 CLOAD_Y4_GP + #define CSTORE_Y8 CSTORE_Y8_GP + #define CSTORE_Y4 CSTORE_Y4_GP + + CGEMV_N_MSA(); + + #undef CLOAD_X4_SCALE + #undef CLOAD_X2_SCALE + #undef CLOAD_X1_SCALE + #undef CLOAD_Y8 + #undef CLOAD_Y4 + #undef CSTORE_Y8 + #undef CSTORE_Y4 + } + return(0); +} + +#undef OP0 +#undef OP1 +#undef OP2 +#undef OP3 +#undef OP4 diff --git a/kernel/mips/cgemv_t_msa.c b/kernel/mips/cgemv_t_msa.c new file mode 100644 index 0000000000..b9620bfb90 --- /dev/null +++ b/kernel/mips/cgemv_t_msa.c @@ -0,0 +1,583 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +#undef OP0 +#undef OP1 +#undef OP2 + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + #define OP0 -= + #define OP1 += + #define OP2 += +#else + #define OP0 += + #define OP1 += + #define OP2 -= +#endif + +#define CGEMV_T_8x4() \ + LD_SP4(pa0 + k, 4, t0, t1, t2, t3); \ + LD_SP4(pa1 + k, 4, t4, t5, t6, t7); \ + LD_SP4(pa2 + k, 4, t8, t9, t10, t11); \ + LD_SP4(pa3 + k, 4, t12, t13, t14, t15); \ + \ + PCKEVOD_W2_SP(t1, t0, src0r, src0i); \ + PCKEVOD_W2_SP(t3, t2, src1r, src1i); \ + PCKEVOD_W2_SP(t5, t4, src2r, src2i); \ + PCKEVOD_W2_SP(t7, t6, src3r, src3i); \ + PCKEVOD_W2_SP(t9, t8, src4r, src4i); \ + PCKEVOD_W2_SP(t11, t10, src5r, src5i); \ + PCKEVOD_W2_SP(t13, t12, src6r, src6i); \ + PCKEVOD_W2_SP(t15, t14, src7r, src7i); \ + \ + tp0r += src0r * x0r; \ + tp0r += src1r * x1r; \ + tp0r OP0 src0i * x0i; \ + tp0r OP0 src1i * x1i; \ + \ + tp1r += src2r * x0r; \ + tp1r += src3r * x1r; \ + tp1r OP0 src2i * x0i; \ + tp1r OP0 src3i * x1i; \ + \ + tp2r += src4r * x0r; \ + tp2r += src5r * x1r; \ + tp2r OP0 src4i * x0i; \ + tp2r OP0 src5i * x1i; \ + \ + tp3r += src6r * x0r; \ + tp3r += src7r * x1r; \ + tp3r OP0 src6i * x0i; \ + tp3r OP0 src7i * x1i; \ + \ + tp0i OP1 src0r * x0i; \ + tp0i OP1 src1r * x1i; \ + tp0i OP2 src0i * x0r; \ + tp0i OP2 src1i * x1r; \ + \ + tp1i OP1 src2r * x0i; \ + tp1i OP1 src3r * x1i; \ + tp1i OP2 src2i * x0r; \ + tp1i OP2 src3i * x1r; \ + \ + tp2i OP1 src4r * x0i; \ + tp2i OP1 src5r * x1i; \ + tp2i OP2 src4i * x0r; \ + tp2i OP2 src5i * x1r; \ + \ + tp3i OP1 src6r * x0i; \ + tp3i OP1 src7r * x1i; \ + tp3i OP2 src6i * x0r; \ + tp3i OP2 src7i * x1r; \ + +#define CGEMV_T_8x2() \ + LD_SP4(pa0 + k, 4, t0, t1, t2, t3); \ + LD_SP4(pa1 + k, 4, t4, t5, t6, t7); \ + \ + PCKEVOD_W2_SP(t1, t0, src0r, src0i); \ + PCKEVOD_W2_SP(t3, t2, src1r, src1i); \ + PCKEVOD_W2_SP(t5, t4, src2r, src2i); \ + PCKEVOD_W2_SP(t7, t6, src3r, src3i); \ + \ + tp0r += src0r * x0r; \ + tp0r += src1r * x1r; \ + tp0r OP0 src0i * x0i; \ + tp0r OP0 src1i * x1i; \ + \ + tp1r += src2r * x0r; \ + tp1r += src3r * x1r; \ + tp1r OP0 src2i * x0i; \ + tp1r OP0 src3i * x1i; \ + \ + tp0i OP1 src0r * x0i; \ + tp0i OP1 src1r * x1i; \ + tp0i OP2 src0i * x0r; \ + tp0i OP2 src1i * x1r; \ + \ + tp1i OP1 src2r * x0i; \ + tp1i OP1 src3r * x1i; \ + tp1i OP2 src2i * x0r; \ + tp1i OP2 src3i * x1r; \ + +#define CGEMV_T_8x1() \ + LD_SP4(pa0 + k, 4, t0, t1, t2, t3); \ + \ + PCKEVOD_W2_SP(t1, t0, src0r, src0i); \ + PCKEVOD_W2_SP(t3, t2, src1r, src1i); \ + \ + tp0r += src0r * x0r; \ + tp0r += src1r * x1r; \ + tp0r OP0 src0i * x0i; \ + tp0r OP0 src1i * x1i; \ + \ + tp0i OP1 src0r * x0i; \ + tp0i OP1 src1r * x1i; \ + tp0i OP2 src0i * x0r; \ + tp0i OP2 src1i * x1r; \ + +#define CGEMV_T_4x4() \ + LD_SP2(pa0 + k, 4, t0, t1); \ + LD_SP2(pa1 + k, 4, t4, t5); \ + LD_SP2(pa2 + k, 4, t8, t9); \ + LD_SP2(pa3 + k, 4, t12, t13); \ + \ + PCKEVOD_W2_SP(t1, t0, src0r, src0i); \ + PCKEVOD_W2_SP(t5, t4, src2r, src2i); \ + PCKEVOD_W2_SP(t9, t8, src4r, src4i); \ + PCKEVOD_W2_SP(t13, t12, src6r, src6i); \ + \ + tp0r += src0r * x0r; \ + tp0r OP0 src0i * x0i; \ + \ + tp1r += src2r * x0r; \ + tp1r OP0 src2i * x0i; \ + \ + tp2r += src4r * x0r; \ + tp2r OP0 src4i * x0i; \ + \ + tp3r += src6r * x0r; \ + tp3r OP0 src6i * x0i; \ + \ + tp0i OP1 src0r * x0i; \ + tp0i OP2 src0i * x0r; \ + \ + tp1i OP1 src2r * x0i; \ + tp1i OP2 src2i * x0r; \ + \ + tp2i OP1 src4r * x0i; \ + tp2i OP2 src4i * x0r; \ + \ + tp3i 
OP1 src6r * x0i; \ + tp3i OP2 src6i * x0r; \ + +#define CGEMV_T_4x2() \ + LD_SP2(pa0 + k, 4, t0, t1); \ + LD_SP2(pa1 + k, 4, t4, t5); \ + \ + PCKEVOD_W2_SP(t1, t0, src0r, src0i); \ + PCKEVOD_W2_SP(t5, t4, src2r, src2i); \ + \ + tp0r += src0r * x0r; \ + tp0r OP0 src0i * x0i; \ + \ + tp1r += src2r * x0r; \ + tp1r OP0 src2i * x0i; \ + \ + tp0i OP1 src0r * x0i; \ + tp0i OP2 src0i * x0r; \ + \ + tp1i OP1 src2r * x0i; \ + tp1i OP2 src2i * x0r; \ + +#define CGEMV_T_4x1() \ + LD_SP2(pa0 + k, 4, t0, t1); \ + \ + PCKEVOD_W2_SP(t1, t0, src0r, src0i); \ + \ + tp0r += src0r * x0r; \ + tp0r OP0 src0i * x0i; \ + \ + tp0i OP1 src0r * x0i; \ + tp0i OP2 src0i * x0r; \ + +#define CGEMV_T_1x4() \ + temp0r += pa0[k + 0] * x[0 * inc_x2]; \ + temp0r OP0 pa0[k + 1] * x[0 * inc_x2 + 1]; \ + temp1r += pa1[k + 0] * x[0 * inc_x2]; \ + temp1r OP0 pa1[k + 1] * x[0 * inc_x2 + 1]; \ + temp2r += pa2[k + 0] * x[0 * inc_x2]; \ + temp2r OP0 pa2[k + 1] * x[0 * inc_x2 + 1]; \ + temp3r += pa3[k + 0] * x[0 * inc_x2]; \ + temp3r OP0 pa3[k + 1] * x[0 * inc_x2 + 1]; \ + \ + temp0i OP1 pa0[k + 0] * x[0 * inc_x2 + 1]; \ + temp0i OP2 pa0[k + 1] * x[0 * inc_x2]; \ + temp1i OP1 pa1[k + 0] * x[0 * inc_x2 + 1]; \ + temp1i OP2 pa1[k + 1] * x[0 * inc_x2]; \ + temp2i OP1 pa2[k + 0] * x[0 * inc_x2 + 1]; \ + temp2i OP2 pa2[k + 1] * x[0 * inc_x2]; \ + temp3i OP1 pa3[k + 0] * x[0 * inc_x2 + 1]; \ + temp3i OP2 pa3[k + 1] * x[0 * inc_x2]; \ + +#define CGEMV_T_1x2() \ + temp0r += pa0[k + 0] * x[0 * inc_x2]; \ + temp0r OP0 pa0[k + 1] * x[0 * inc_x2 + 1]; \ + temp1r += pa1[k + 0] * x[0 * inc_x2]; \ + temp1r OP0 pa1[k + 1] * x[0 * inc_x2 + 1]; \ + \ + temp0i OP1 pa0[k + 0] * x[0 * inc_x2 + 1]; \ + temp0i OP2 pa0[k + 1] * x[0 * inc_x2]; \ + temp1i OP1 pa1[k + 0] * x[0 * inc_x2 + 1]; \ + temp1i OP2 pa1[k + 1] * x[0 * inc_x2]; \ + +#define CGEMV_T_1x1() \ + temp0r += pa0[k + 0] * x[0 * inc_x2]; \ + temp0r OP0 pa0[k + 1] * x[0 * inc_x2 + 1]; \ + \ + temp0i OP1 pa0[k + 0] * x[0 * inc_x2 + 1]; \ + temp0i OP2 pa0[k + 1] * x[0 * inc_x2]; \ + +#define CSCALE_STORE_Y4_GP() \ + res0r = y[0 * inc_y2]; \ + res1r = y[1 * inc_y2]; \ + res2r = y[2 * inc_y2]; \ + res3r = y[3 * inc_y2]; \ + \ + res0i = y[0 * inc_y2 + 1]; \ + res1i = y[1 * inc_y2 + 1]; \ + res2i = y[2 * inc_y2 + 1]; \ + res3i = y[3 * inc_y2 + 1]; \ + \ + res0r += alphar * temp0r; \ + res0r OP0 alphai * temp0i; \ + res1r += alphar * temp1r; \ + res1r OP0 alphai * temp1i; \ + res2r += alphar * temp2r; \ + res2r OP0 alphai * temp2i; \ + res3r += alphar * temp3r; \ + res3r OP0 alphai * temp3i; \ + \ + res0i OP1 alphar * temp0i; \ + res0i OP2 alphai * temp0r; \ + res1i OP1 alphar * temp1i; \ + res1i OP2 alphai * temp1r; \ + res2i OP1 alphar * temp2i; \ + res2i OP2 alphai * temp2r; \ + res3i OP1 alphar * temp3i; \ + res3i OP2 alphai * temp3r; \ + \ + y[0 * inc_y2] = res0r; \ + y[1 * inc_y2] = res1r; \ + y[2 * inc_y2] = res2r; \ + y[3 * inc_y2] = res3r; \ + \ + y[0 * inc_y2 + 1] = res0i; \ + y[1 * inc_y2 + 1] = res1i; \ + y[2 * inc_y2 + 1] = res2i; \ + y[3 * inc_y2 + 1] = res3i; \ + +#define CSCALE_STORE_Y2_GP() \ + res0r = y[0 * inc_y2]; \ + res1r = y[1 * inc_y2]; \ + \ + res0i = y[0 * inc_y2 + 1]; \ + res1i = y[1 * inc_y2 + 1]; \ + \ + res0r += alphar * temp0r; \ + res0r OP0 alphai * temp0i; \ + res1r += alphar * temp1r; \ + res1r OP0 alphai * temp1i; \ + \ + res0i OP1 alphar * temp0i; \ + res0i OP2 alphai * temp0r; \ + res1i OP1 alphar * temp1i; \ + res1i OP2 alphai * temp1r; \ + \ + y[0 * inc_y2] = res0r; \ + y[1 * inc_y2] = res1r; \ + \ + y[0 * inc_y2 + 1] = res0i; \ + y[1 * inc_y2 + 1] = res1i; \ + + 
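+/* The CSCALE_STORE_Y*_GP macros finish a block of columns: each accumulated
+   dot product (tempNr, tempNi) is multiplied by alpha in complex arithmetic
+   and added into y, i.e. y_r gets alphar*tempr combined with alphai*tempi and
+   y_i gets alphar*tempi combined with alphai*tempr, with the signs supplied
+   by OP0/OP1/OP2 according to the CONJ/XCONJ variant being compiled. */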
+#define CSCALE_STORE_Y1_GP() \ + res0r = y[0 * inc_y2]; \ + res0i = y[0 * inc_y2 + 1]; \ + \ + res0r += alphar * temp0r; \ + res0r OP0 alphai * temp0i; \ + \ + res0i OP1 alphar * temp0i; \ + res0i OP2 alphai * temp0r; \ + \ + y[0 * inc_y2] = res0r; \ + y[0 * inc_y2 + 1] = res0i; \ + +#define CLOAD_X8_VECTOR() \ + LD_SP4(x, 4, x0, x1, x2, x3); \ + PCKEVOD_W2_SP(x1, x0, x0r, x0i); \ + PCKEVOD_W2_SP(x3, x2, x1r, x1i); \ + +#define CLOAD_X4_VECTOR() \ + LD_SP2(x, 4, x0, x1); \ + PCKEVOD_W2_SP(x1, x0, x0r, x0i); \ + +#define CLOAD_X8_GP() \ + x0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2))); \ + x0r = (v4f32) __msa_insert_w((v4i32) x0r, 1, *((int *) (x + 1 * inc_x2))); \ + x0r = (v4f32) __msa_insert_w((v4i32) x0r, 2, *((int *) (x + 2 * inc_x2))); \ + x0r = (v4f32) __msa_insert_w((v4i32) x0r, 3, *((int *) (x + 3 * inc_x2))); \ + x1r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 4 * inc_x2))); \ + x1r = (v4f32) __msa_insert_w((v4i32) x1r, 1, *((int *) (x + 5 * inc_x2))); \ + x1r = (v4f32) __msa_insert_w((v4i32) x1r, 2, *((int *) (x + 6 * inc_x2))); \ + x1r = (v4f32) __msa_insert_w((v4i32) x1r, 3, *((int *) (x + 7 * inc_x2))); \ + x0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2 + 1))); \ + x0i = (v4f32) __msa_insert_w((v4i32) x0i, 1, *((int *) (x + 1 * inc_x2 + 1))); \ + x0i = (v4f32) __msa_insert_w((v4i32) x0i, 2, *((int *) (x + 2 * inc_x2 + 1))); \ + x0i = (v4f32) __msa_insert_w((v4i32) x0i, 3, *((int *) (x + 3 * inc_x2 + 1))); \ + x1i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 4 * inc_x2 + 1))); \ + x1i = (v4f32) __msa_insert_w((v4i32) x1i, 1, *((int *) (x + 5 * inc_x2 + 1))); \ + x1i = (v4f32) __msa_insert_w((v4i32) x1i, 2, *((int *) (x + 6 * inc_x2 + 1))); \ + x1i = (v4f32) __msa_insert_w((v4i32) x1i, 3, *((int *) (x + 7 * inc_x2 + 1))); \ + +#define CLOAD_X4_GP() \ + x0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2))); \ + x0r = (v4f32) __msa_insert_w((v4i32) x0r, 1, *((int *) (x + 1 * inc_x2))); \ + x0r = (v4f32) __msa_insert_w((v4i32) x0r, 2, *((int *) (x + 2 * inc_x2))); \ + x0r = (v4f32) __msa_insert_w((v4i32) x0r, 3, *((int *) (x + 3 * inc_x2))); \ + x0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2 + 1))); \ + x0i = (v4f32) __msa_insert_w((v4i32) x0i, 1, *((int *) (x + 1 * inc_x2 + 1))); \ + x0i = (v4f32) __msa_insert_w((v4i32) x0i, 2, *((int *) (x + 2 * inc_x2 + 1))); \ + x0i = (v4f32) __msa_insert_w((v4i32) x0i, 3, *((int *) (x + 3 * inc_x2 + 1))); \ + +#define CGEMV_T_MSA() \ + for (j = (n >> 2); j--;) \ + { \ + tp0r = tp1r = tp2r = tp3r = zero; \ + tp0i = tp1i = tp2i = tp3i = zero; \ + \ + k = 0; \ + x = srcx_org; \ + \ + for (i = (m >> 3); i--;) \ + { \ + CLOAD_X8() \ + CGEMV_T_8x4(); \ + \ + k += 2 * 8; \ + x += inc_x2 * 8; \ + } \ + \ + if (m & 4) \ + { \ + CLOAD_X4(); \ + \ + CGEMV_T_4x4(); \ + \ + k += 2 * 4; \ + x += inc_x2 * 4; \ + } \ + \ + TRANSPOSE4x4_SP_SP(tp0r, tp1r, tp2r, tp3r, \ + tp0r, tp1r, tp2r, tp3r); \ + TRANSPOSE4x4_SP_SP(tp0i, tp1i, tp2i, tp3i, \ + tp0i, tp1i, tp2i, tp3i); \ + \ + tp0r += tp1r; \ + tp0r += tp2r; \ + tp0r += tp3r; \ + tp0i += tp1i; \ + tp0i += tp2i; \ + tp0i += tp3i; \ + \ + temp0r = tp0r[0]; \ + temp1r = tp0r[1]; \ + temp2r = tp0r[2]; \ + temp3r = tp0r[3]; \ + temp0i = tp0i[0]; \ + temp1i = tp0i[1]; \ + temp2i = tp0i[2]; \ + temp3i = tp0i[3]; \ + \ + for (i = (m & 3); i--;) \ + { \ + CGEMV_T_1x4(); \ + \ + k += 2; \ + x += inc_x2; \ + } \ + \ + CSCALE_STORE_Y4_GP(); \ + \ + pa0 += 4 * lda2; \ + pa1 += 4 * lda2; \ + pa2 += 4 * 
lda2; \ + pa3 += 4 * lda2; \ + y += 4 * inc_y2; \ + } \ + \ + if (n & 2) \ + { \ + tp0r = tp1r = zero; \ + tp0i = tp1i = zero; \ + \ + k = 0; \ + x = srcx_org; \ + \ + for (i = (m >> 3); i--;) \ + { \ + CLOAD_X8(); \ + \ + CGEMV_T_8x2(); \ + \ + k += 2 * 8; \ + x += inc_x2 * 8; \ + } \ + \ + if (m & 4) \ + { \ + CLOAD_X4(); \ + \ + CGEMV_T_4x2(); \ + \ + k += 2 * 4; \ + x += inc_x2 * 4; \ + } \ + \ + TRANSPOSE4x4_SP_SP(tp0r, tp1r, tp0i, tp1i, \ + tp0r, tp1r, tp0i, tp1i); \ + \ + tp0r += tp1r; \ + tp0r += tp0i; \ + tp0r += tp1i; \ + \ + temp0r = tp0r[0]; \ + temp1r = tp0r[1]; \ + temp0i = tp0r[2]; \ + temp1i = tp0r[3]; \ + \ + for (i = (m & 3); i--;) \ + { \ + CGEMV_T_1x2(); \ + \ + k += 2; \ + x += inc_x2; \ + } \ + \ + CSCALE_STORE_Y2_GP(); \ + \ + pa0 += 2 * lda2; \ + pa1 += 2 * lda2; \ + y += 2 * inc_y2; \ + } \ + \ + if (n & 1) \ + { \ + tp0r = zero; \ + tp0i = zero; \ + \ + k = 0; \ + x = srcx_org; \ + \ + for (i = (m >> 3); i--;) \ + { \ + CLOAD_X8(); \ + \ + CGEMV_T_8x1(); \ + \ + k += 2 * 8; \ + x += inc_x2 * 8; \ + } \ + \ + if (m & 4) \ + { \ + CLOAD_X4(); \ + \ + CGEMV_T_4x1(); \ + \ + k += 2 * 4; \ + x += inc_x2 * 4; \ + } \ + \ + ILVRL_W2_SP(tp0i, tp0r, t0, t1); \ + \ + t0 += t1; \ + \ + temp0r = t0[0] + t0[2]; \ + temp0i = t0[1] + t0[3]; \ + \ + for (i = (m & 3); i--;) \ + { \ + CGEMV_T_1x1(); \ + \ + k += 2; \ + x += inc_x2; \ + } \ + \ + CSCALE_STORE_Y1_GP(); \ + \ + pa0 += lda2; \ + y += inc_y2; \ + } \ + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alphar, FLOAT alphai, + FLOAT *A, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, + BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i, j, k; + FLOAT *pa0, *pa1, *pa2, *pa3; + FLOAT *srcx_org = x; + FLOAT temp0r, temp0i, temp2r, temp2i, temp1r, temp1i, temp3r, temp3i; + FLOAT res0r, res0i, res2r, res2i, res1r, res1i, res3r, res3i; + BLASLONG inc_x2, inc_y2, lda2; + v4f32 zero = {0}; + v4f32 x0, x1, x2, x3, x0r, x1r, x0i, x1i; + v4f32 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15; + v4f32 src0r, src1r, src2r, src3r, src4r, src5r, src6r, src7r; + v4f32 src0i, src1i, src2i, src3i, src4i, src5i, src6i, src7i; + v4f32 tp0r, tp1r, tp2r, tp3r, tp0i, tp1i, tp2i, tp3i; + + lda2 = 2 * lda; + + pa0 = A; + pa1 = A + lda2; + pa2 = A + 2 * lda2; + pa3 = A + 3 * lda2; + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + if (2 == inc_x2) + { + #define CLOAD_X8 CLOAD_X8_VECTOR + #define CLOAD_X4 CLOAD_X4_VECTOR + + CGEMV_T_MSA(); + + #undef CLOAD_X8 + #undef CLOAD_X4 + } + else + { + #define CLOAD_X8 CLOAD_X8_GP + #define CLOAD_X4 CLOAD_X4_GP + + CGEMV_T_MSA(); + + #undef CLOAD_X8 + #undef CLOAD_X4 + } + + return(0); +} + +#undef OP0 +#undef OP1 +#undef OP2 diff --git a/kernel/mips/copy.c b/kernel/mips/copy.c new file mode 100644 index 0000000000..9f488ddb38 --- /dev/null +++ b/kernel/mips/copy.c @@ -0,0 +1,50 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + if ( n < 0 ) return(0); + + while(i < n) + { + + y[iy] = x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/mips/dasum_msa.c b/kernel/mips/dasum_msa.c new file mode 100644 index 0000000000..a3641cd50c --- /dev/null +++ b/kernel/mips/dasum_msa.c @@ -0,0 +1,278 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#include "common.h" +#include +#include "macros_msa.h" + +#define AND_VEC_D(in) ((v2f64) ((v2i64) in & and_vec)) + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i; + FLOAT sumf = 0.0; + v2f64 src0, src1, src2, src3, src4, src5, src6, src7; + v2f64 sum_abs0, sum_abs1, sum_abs2, sum_abs3; + v2f64 zero_v = {0}; + v2i64 and_vec = {0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF}; + + if (n <= 0 || inc_x <= 0) return (sumf); + + if (1 == inc_x) + { + if (n > 15) + { + n -= 16; + + LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7); + + sum_abs0 = AND_VEC_D(src0); + sum_abs1 = AND_VEC_D(src1); + sum_abs2 = AND_VEC_D(src2); + sum_abs3 = AND_VEC_D(src3); + sum_abs0 += AND_VEC_D(src4); + sum_abs1 += AND_VEC_D(src5); + sum_abs2 += AND_VEC_D(src6); + sum_abs3 += AND_VEC_D(src7); + } + else + { + sum_abs0 = zero_v; + sum_abs1 = zero_v; + sum_abs2 = zero_v; + sum_abs3 = zero_v; + } + + for (i = (n >> 4); i--;) + { + LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7); + + sum_abs0 += AND_VEC_D(src0); + sum_abs1 += AND_VEC_D(src1); + sum_abs2 += AND_VEC_D(src2); + sum_abs3 += AND_VEC_D(src3); + sum_abs0 += AND_VEC_D(src4); + sum_abs1 += AND_VEC_D(src5); + sum_abs2 += AND_VEC_D(src6); + sum_abs3 += AND_VEC_D(src7); + } + + if (n & 15) + { + if ((n & 8) && (n & 4) && (n & 2)) + { + LD_DP7_INC(x, 2, src0, src1, src2, src3, src4, src5, src6); + + sum_abs0 += AND_VEC_D(src0); + sum_abs1 += AND_VEC_D(src1); + sum_abs2 += AND_VEC_D(src2); + sum_abs3 += AND_VEC_D(src3); + sum_abs0 += AND_VEC_D(src4); + sum_abs1 += AND_VEC_D(src5); + sum_abs2 += AND_VEC_D(src6); + } + else if ((n & 8) && (n & 4)) + { + LD_DP6_INC(x, 2, src0, src1, src2, src3, src4, src5); + + sum_abs0 += AND_VEC_D(src0); + sum_abs1 += AND_VEC_D(src1); + sum_abs2 += AND_VEC_D(src2); + sum_abs3 += AND_VEC_D(src3); + sum_abs0 += AND_VEC_D(src4); + sum_abs1 += AND_VEC_D(src5); + } + else if ((n & 8) && (n & 2)) + { + LD_DP5_INC(x, 2, src0, src1, src2, src3, src4); + + sum_abs0 += AND_VEC_D(src0); + sum_abs1 += AND_VEC_D(src1); + sum_abs2 += AND_VEC_D(src2); + sum_abs3 += AND_VEC_D(src3); + sum_abs0 += AND_VEC_D(src4); + } + else if ((n & 4) && (n & 2)) + { + LD_DP3_INC(x, 2, src0, src1, src2); + + sum_abs0 += AND_VEC_D(src0); + sum_abs1 += AND_VEC_D(src1); + sum_abs2 += AND_VEC_D(src2); + } + else if (n & 8) + { + LD_DP4_INC(x, 2, src0, src1, src2, src3); + + sum_abs0 += AND_VEC_D(src0); + sum_abs1 += AND_VEC_D(src1); + sum_abs2 += AND_VEC_D(src2); + sum_abs3 += AND_VEC_D(src3); + } + else if (n & 4) + { + LD_DP2_INC(x, 2, src0, src1); + + sum_abs0 += AND_VEC_D(src0); + sum_abs1 += AND_VEC_D(src1); + } + else if (n & 2) + { + src0 = LD_DP(x); x += 2; + + sum_abs0 += AND_VEC_D(src0); + } + + sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; + + sumf = sum_abs0[0] + sum_abs0[1]; + + if (n & 1) + { + sumf += fabs(*x); + } + } + else + { + sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; + + sumf = sum_abs0[0] + sum_abs0[1]; + } + } + else + { + if (n > 8) + { + n -= 8; + + LD_DP8_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6, src7); + + sum_abs0 = AND_VEC_D(src0); + sum_abs1 = AND_VEC_D(src1); + sum_abs2 = AND_VEC_D(src2); + sum_abs3 = AND_VEC_D(src3); + sum_abs0 += AND_VEC_D(src4); + sum_abs1 += AND_VEC_D(src5); + sum_abs2 += AND_VEC_D(src6); + sum_abs3 += AND_VEC_D(src7); + } + else + { + sum_abs0 = zero_v; + sum_abs1 = zero_v; + sum_abs2 = zero_v; + sum_abs3 = zero_v; + } + + for (i = (n >> 3); i--;) + { + 
LD_DP8_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6, src7); + + sum_abs0 += AND_VEC_D(src0); + sum_abs1 += AND_VEC_D(src1); + sum_abs2 += AND_VEC_D(src2); + sum_abs3 += AND_VEC_D(src3); + sum_abs0 += AND_VEC_D(src4); + sum_abs1 += AND_VEC_D(src5); + sum_abs2 += AND_VEC_D(src6); + sum_abs3 += AND_VEC_D(src7); + } + + if (n & 7) + { + if ((n & 4) && (n & 2) && (n & 1)) + { + LD_DP7_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6); + + sum_abs0 += AND_VEC_D(src0); + sum_abs1 += AND_VEC_D(src1); + sum_abs2 += AND_VEC_D(src2); + sum_abs3 += AND_VEC_D(src3); + sum_abs0 += AND_VEC_D(src4); + sum_abs1 += AND_VEC_D(src5); + sum_abs2 += AND_VEC_D(src6); + } + else if ((n & 4) && (n & 2)) + { + LD_DP6_INC(x, inc_x, src0, src1, src2, src3, src4, src5); + + sum_abs0 += AND_VEC_D(src0); + sum_abs1 += AND_VEC_D(src1); + sum_abs2 += AND_VEC_D(src2); + sum_abs3 += AND_VEC_D(src3); + sum_abs0 += AND_VEC_D(src4); + sum_abs1 += AND_VEC_D(src5); + } + else if ((n & 4) && (n & 1)) + { + LD_DP5_INC(x, inc_x, src0, src1, src2, src3, src4); + + sum_abs0 += AND_VEC_D(src0); + sum_abs1 += AND_VEC_D(src1); + sum_abs2 += AND_VEC_D(src2); + sum_abs3 += AND_VEC_D(src3); + sum_abs0 += AND_VEC_D(src4); + } + else if ((n & 2) && (n & 1)) + { + LD_DP3_INC(x, inc_x, src0, src1, src2); + + sum_abs0 += AND_VEC_D(src0); + sum_abs1 += AND_VEC_D(src1); + sum_abs2 += AND_VEC_D(src2); + } + else if (n & 4) + { + LD_DP4_INC(x, inc_x, src0, src1, src2, src3); + + sum_abs0 += AND_VEC_D(src0); + sum_abs1 += AND_VEC_D(src1); + sum_abs2 += AND_VEC_D(src2); + sum_abs3 += AND_VEC_D(src3); + } + else if (n & 2) + { + LD_DP2_INC(x, inc_x, src0, src1); + + sum_abs0 += AND_VEC_D(src0); + sum_abs1 += AND_VEC_D(src1); + } + else if (n & 1) + { + src0 = LD_DP(x); + + sum_abs0 += AND_VEC_D(src0); + } + } + + sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; + + sumf = sum_abs0[0]; + } + + return (sumf); +} diff --git a/kernel/mips/ddot_msa.c b/kernel/mips/ddot_msa.c new file mode 100644 index 0000000000..b56e101358 --- /dev/null +++ b/kernel/mips/ddot_msa.c @@ -0,0 +1,189 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +/* return float, x,y float */ +#if defined(DSDOT) +double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +#else +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +#endif +{ + BLASLONG i = 0; + double dot = 0.0; + FLOAT x0, x1, x2, x3, y0, y1, y2, y3; + v2f64 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7; + v2f64 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; + v2f64 dot0 = {0, 0}; + + if (n < 0) return (dot); + + if ((1 == inc_x) && (1 == inc_y)) + { + for (i = (n >> 4); i--;) + { + LD_DP8_INC(x, 2, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7); + LD_DP8_INC(y, 2, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7); + + dot0 += (vy0 * vx0); + dot0 += (vy1 * vx1); + dot0 += (vy2 * vx2); + dot0 += (vy3 * vx3); + dot0 += (vy4 * vx4); + dot0 += (vy5 * vx5); + dot0 += (vy6 * vx6); + dot0 += (vy7 * vx7); + } + + if (n & 15) + { + if ((n & 8) && (n & 4) && (n & 2)) + { + LD_DP7_INC(x, 2, vx0, vx1, vx2, vx3, vx4, vx5, vx6); + LD_DP7_INC(y, 2, vy0, vy1, vy2, vy3, vy4, vy5, vy6); + + dot0 += (vy0 * vx0); + dot0 += (vy1 * vx1); + dot0 += (vy2 * vx2); + dot0 += (vy3 * vx3); + dot0 += (vy4 * vx4); + dot0 += (vy5 * vx5); + dot0 += (vy6 * vx6); + } + else if ((n & 8) && (n & 4)) + { + LD_DP6_INC(x, 2, vx0, vx1, vx2, vx3, vx4, vx5); + LD_DP6_INC(y, 2, vy0, vy1, vy2, vy3, vy4, vy5); + + dot0 += (vy0 * vx0); + dot0 += (vy1 * vx1); + dot0 += (vy2 * vx2); + dot0 += (vy3 * vx3); + dot0 += (vy4 * vx4); + dot0 += (vy5 * vx5); + } + else if ((n & 8) && (n & 2)) + { + LD_DP5_INC(x, 2, vx0, vx1, vx2, vx3, vx4); + LD_DP5_INC(y, 2, vy0, vy1, vy2, vy3, vy4); + + dot0 += (vy0 * vx0); + dot0 += (vy1 * vx1); + dot0 += (vy2 * vx2); + dot0 += (vy3 * vx3); + dot0 += (vy4 * vx4); + } + else if ((n & 4) && (n & 2)) + { + LD_DP3_INC(x, 2, vx0, vx1, vx2); + LD_DP3_INC(y, 2, vy0, vy1, vy2); + + dot0 += (vy0 * vx0); + dot0 += (vy1 * vx1); + dot0 += (vy2 * vx2); + } + else if (n & 8) + { + LD_DP4_INC(x, 2, vx0, vx1, vx2, vx3); + LD_DP4_INC(y, 2, vy0, vy1, vy2, vy3); + + dot0 += (vy0 * vx0); + dot0 += (vy1 * vx1); + dot0 += (vy2 * vx2); + dot0 += (vy3 * vx3); + } + else if (n & 4) + { + LD_DP2_INC(x, 2, vx0, vx1); + LD_DP2_INC(y, 2, vy0, vy1); + + dot0 += (vy0 * vx0); + dot0 += (vy1 * vx1); + } + else if (n & 2) + { + vx0 = LD_DP(x); x += 2; + vy0 = LD_DP(y); y += 2; + + dot0 += (vy0 * vx0); + } + + if (n & 1) + { + x0 = *x; + y0 = *y; + + dot += (y0 * x0); + } + } + + dot += dot0[0]; + dot += dot0[1]; + } + else + { + for (i = (n >> 2); i--;) + { + LD_GP4_INC(x, inc_x, x0, x1, x2, x3); + LD_GP4_INC(y, inc_y, y0, y1, y2, y3); + + dot += (y0 * x0); + dot += (y1 * x1); + dot += (y2 * x2); + dot += (y3 * x3); + } + + if ((n & 2) && (n & 1)) + { + LD_GP3_INC(x, inc_x, x0, x1, x2); + LD_GP3_INC(y, inc_y, y0, y1, y2); + + dot += (y0 * x0); + dot += (y1 * x1); + dot += (y2 * x2); + } + else if (n & 2) + { + LD_GP2_INC(x, inc_x, x0, x1); + LD_GP2_INC(y, inc_y, y0, y1); + + dot += 
(y0 * x0); + dot += (y1 * x1); + } + else if (n & 1) + { + x0 = *x; + y0 = *y; + + dot += (y0 * x0); + } + } + + return (dot); +} diff --git a/kernel/mips/dgemm_kernel_8x4_msa.c b/kernel/mips/dgemm_kernel_8x4_msa.c new file mode 100644 index 0000000000..9286e74694 --- /dev/null +++ b/kernel/mips/dgemm_kernel_8x4_msa.c @@ -0,0 +1,1566 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, + FLOAT *C, BLASLONG ldc +#ifdef TRMMKERNEL + , BLASLONG offset +#endif + ) +{ + BLASLONG i, j, l, temp; +#if defined(TRMMKERNEL) + BLASLONG off; +#endif + FLOAT *pc0, *pc1, *pc2, *pc3, *pa0, *pb0; + FLOAT tmp0, tmp1, tmp2, tmp3; + FLOAT a0, b0, b1, b2, b3; + v2f64 v_alpha = {alpha, alpha}; + v2f64 src_a0, src_a1, src_a2, src_a3, src_b, src_b0, src_b1; + v2f64 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v2f64 res0, res1, res2, res3, res4, res5, res6, res7; + v2f64 res8, res9, res10, res11, res12, res13, res14, res15; + +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#endif + + for (j = (n >> 2); j--;) + { + pc0 = C; + pc1 = pc0 + ldc; + pc2 = pc1 + ldc; + pc3 = pc2 + ldc; + + pa0 = A; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + for (i = (m >> 3); i--;) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 8; + pb0 = B + off * 4; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 8; // number of values in A +#else + temp = off + 4; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + + LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); + LD_DP2_INC(pb0, 2, src_b0, src_b1); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 = src_a0 * src_b; + res1 = src_a1 * src_b; + res2 = src_a2 * src_b; + res3 = src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res4 = src_a0 * src_b; + res5 = src_a1 * src_b; + res6 = src_a2 * src_b; + res7 = src_a3 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + res8 = src_a0 * src_b; + res9 = src_a1 * src_b; + res10 = src_a2 * src_b; + res11 = src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + res12 = src_a0 * src_b; + res13 = src_a1 * src_b; + res14 = src_a2 * src_b; + res15 = src_a3 * src_b; + + for (l = ((temp - 1) >> 1); l--;) + { + LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); + LD_DP2_INC(pb0, 2, src_b0, src_b1); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 += src_a0 * src_b; + res1 += src_a1 * src_b; + res2 += src_a2 * src_b; + res3 += src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res4 += src_a0 * src_b; + res5 += src_a1 * src_b; + res6 += src_a2 * src_b; + res7 += src_a3 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + res8 += src_a0 * src_b; + res9 += src_a1 * src_b; + res10 += src_a2 * src_b; + res11 += src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + res12 += src_a0 * src_b; + res13 += src_a1 * src_b; + res14 += src_a2 * src_b; + res15 += src_a3 * src_b; + + LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); + LD_DP2_INC(pb0, 2, src_b0, src_b1); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 += src_a0 * src_b; + res1 += src_a1 * src_b; + res2 += src_a2 * src_b; + res3 += src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res4 += src_a0 * src_b; + res5 += src_a1 * src_b; + res6 += src_a2 * src_b; + res7 += src_a3 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + res8 += 
src_a0 * src_b; + res9 += src_a1 * src_b; + res10 += src_a2 * src_b; + res11 += src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + res12 += src_a0 * src_b; + res13 += src_a1 * src_b; + res14 += src_a2 * src_b; + res15 += src_a3 * src_b; + } + + if ((temp - 1) & 1) + { + LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); + LD_DP2_INC(pb0, 2, src_b0, src_b1); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 += src_a0 * src_b; + res1 += src_a1 * src_b; + res2 += src_a2 * src_b; + res3 += src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res4 += src_a0 * src_b; + res5 += src_a1 * src_b; + res6 += src_a2 * src_b; + res7 += src_a3 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + res8 += src_a0 * src_b; + res9 += src_a1 * src_b; + res10 += src_a2 * src_b; + res11 += src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + res12 += src_a0 * src_b; + res13 += src_a1 * src_b; + res14 += src_a2 * src_b; + res15 += src_a3 * src_b; + } + +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; + dst1 = res1 * v_alpha; + dst2 = res2 * v_alpha; + dst3 = res3 * v_alpha; + dst4 = res4 * v_alpha; + dst5 = res5 * v_alpha; + dst6 = res6 * v_alpha; + dst7 = res7 * v_alpha; +#else + LD_DP4(pc0, 2, dst0, dst1, dst2, dst3); + LD_DP4(pc1, 2, dst4, dst5, dst6, dst7); + + dst0 += res0 * v_alpha; + dst1 += res1 * v_alpha; + dst2 += res2 * v_alpha; + dst3 += res3 * v_alpha; + dst4 += res4 * v_alpha; + dst5 += res5 * v_alpha; + dst6 += res6 * v_alpha; + dst7 += res7 * v_alpha; +#endif + ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); + ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2); + +#if defined(TRMMKERNEL) + dst0 = res8 * v_alpha; + dst1 = res9 * v_alpha; + dst2 = res10 * v_alpha; + dst3 = res11 * v_alpha; + dst4 = res12 * v_alpha; + dst5 = res13 * v_alpha; + dst6 = res14 * v_alpha; + dst7 = res15 * v_alpha; +#else + LD_DP4(pc2, 2, dst0, dst1, dst2, dst3); + LD_DP4(pc3, 2, dst4, dst5, dst6, dst7); + + dst0 += res8 * v_alpha; + dst1 += res9 * v_alpha; + dst2 += res10 * v_alpha; + dst3 += res11 * v_alpha; + dst4 += res12 * v_alpha; + dst5 += res13 * v_alpha; + dst6 += res14 * v_alpha; + dst7 += res15 * v_alpha; +#endif + + ST_DP4_INC(dst0, dst1, dst2, dst3, pc2, 2); + ST_DP4_INC(dst4, dst5, dst6, dst7, pc3, 2); + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 8; // number of values in A +#else + temp -= 4; // number of values in B +#endif + pa0 += temp * 8; + pb0 += temp * 4; +#endif + +#ifdef LEFT + off += 8; // number of values in A +#endif +#endif + } + + if (m & 4) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 4; + pb0 = B + off * 4; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 4; // number of values in A +#else + temp = off + 4; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + + LD_DP2_INC(pa0, 2, src_a0, src_a1); + LD_DP2_INC(pb0, 2, src_b0, src_b1); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 = src_a0 * src_b; + res1 = src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res2 = src_a0 * src_b; + res3 = src_a1 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + res4 = src_a0 * src_b; + 
res5 = src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + res6 = src_a0 * src_b; + res7 = src_a1 * src_b; + + for (l = ((temp - 1) >> 1); l--;) + { + LD_DP2_INC(pa0, 2, src_a0, src_a1); + LD_DP2_INC(pb0, 2, src_b0, src_b1); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 += src_a0 * src_b; + res1 += src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res2 += src_a0 * src_b; + res3 += src_a1 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + res4 += src_a0 * src_b; + res5 += src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + res6 += src_a0 * src_b; + res7 += src_a1 * src_b; + + LD_DP2_INC(pa0, 2, src_a0, src_a1); + LD_DP2_INC(pb0, 2, src_b0, src_b1); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 += src_a0 * src_b; + res1 += src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res2 += src_a0 * src_b; + res3 += src_a1 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + res4 += src_a0 * src_b; + res5 += src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + res6 += src_a0 * src_b; + res7 += src_a1 * src_b; + } + + if ((temp - 1) & 1) + { + LD_DP2_INC(pa0, 2, src_a0, src_a1); + LD_DP2_INC(pb0, 2, src_b0, src_b1); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 += src_a0 * src_b; + res1 += src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res2 += src_a0 * src_b; + res3 += src_a1 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + res4 += src_a0 * src_b; + res5 += src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + res6 += src_a0 * src_b; + res7 += src_a1 * src_b; + } + +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; + dst1 = res1 * v_alpha; + dst2 = res2 * v_alpha; + dst3 = res3 * v_alpha; + dst4 = res4 * v_alpha; + dst5 = res5 * v_alpha; + dst6 = res6 * v_alpha; + dst7 = res7 * v_alpha; +#else + LD_DP2(pc0, 2, dst0, dst1); + LD_DP2(pc1, 2, dst2, dst3); + LD_DP2(pc2, 2, dst4, dst5); + LD_DP2(pc3, 2, dst6, dst7); + + dst0 += res0 * v_alpha; + dst1 += res1 * v_alpha; + dst2 += res2 * v_alpha; + dst3 += res3 * v_alpha; + dst4 += res4 * v_alpha; + dst5 += res5 * v_alpha; + dst6 += res6 * v_alpha; + dst7 += res7 * v_alpha; +#endif + ST_DP2_INC(dst0, dst1, pc0, 2); + ST_DP2_INC(dst2, dst3, pc1, 2); + ST_DP2_INC(dst4, dst5, pc2, 2); + ST_DP2_INC(dst6, dst7, pc3, 2); + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 4; // number of values in A +#else + temp -= 4; // number of values in B +#endif + pa0 += temp * 4; + pb0 += temp * 4; +#endif + +#ifdef LEFT + off += 4; // number of values in A +#endif +#endif + } + + if (m & 2) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2; + pb0 = B + off * 4; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 2; // number of values in A +#else + temp = off + 4; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + + src_a0 = LD_DP(pa0); + pa0 += 2; + LD_DP2_INC(pb0, 2, src_b0, src_b1); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 = src_a0 * src_b; + + src_b = (v2f64) 
__msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res1 = src_a0 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + res2 = src_a0 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + res3 = src_a0 * src_b; + + for (l = ((temp - 1) >> 1); l--;) + { + src_a0 = LD_DP(pa0); + pa0 += 2; + LD_DP2_INC(pb0, 2, src_b0, src_b1); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 += src_a0 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res1 += src_a0 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + res2 += src_a0 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + res3 += src_a0 * src_b; + + src_a0 = LD_DP(pa0); + pa0 += 2; + LD_DP2_INC(pb0, 2, src_b0, src_b1); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 += src_a0 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res1 += src_a0 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + res2 += src_a0 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + res3 += src_a0 * src_b; + } + + if ((temp - 1) & 1) + { + src_a0 = LD_DP(pa0); + pa0 += 2; + LD_DP2_INC(pb0, 2, src_b0, src_b1); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 += src_a0 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res1 += src_a0 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + res2 += src_a0 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + res3 += src_a0 * src_b; + } + +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; + dst1 = res1 * v_alpha; + dst2 = res2 * v_alpha; + dst3 = res3 * v_alpha; +#else + dst0 = LD_DP(pc0); + dst1 = LD_DP(pc1); + dst2 = LD_DP(pc2); + dst3 = LD_DP(pc3); + + dst0 += res0 * v_alpha; + dst1 += res1 * v_alpha; + dst2 += res2 * v_alpha; + dst3 += res3 * v_alpha; +#endif + ST_DP(dst0, pc0); + ST_DP(dst1, pc1); + ST_DP(dst2, pc2); + ST_DP(dst3, pc3); + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 2; // number of values in A +#else + temp -= 4; // number of values in B +#endif + pa0 += temp * 2; + pb0 += temp * 4; +#endif + +#ifdef LEFT + off += 2; // number of values in A +#endif +#endif + pc0 += 2; + pc1 += 2; + pc2 += 2; + pc3 += 2; + } + + if (m & 1) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 1; + pb0 = B + off * 4; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 1; // number of values in A +#else + temp = off + 4; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 = a0 * b0; + + b1 = pb0[1]; + tmp1 = a0 * b1; + + b2 = pb0[2]; + tmp2 = a0 * b2; + + b3 = pb0[3]; + tmp3 = a0 * b3; + + pa0 += 1; + pb0 += 4; + + for (l = ((temp - 1) >> 1); l--;) + { + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 += a0 * b0; + + b1 = pb0[1]; + tmp1 += a0 * b1; + + b2 = pb0[2]; + tmp2 += a0 * b2; + + b3 = pb0[3]; + tmp3 += a0 * b3; + + pa0 += 1; + pb0 += 4; + + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 += a0 * b0; + + b1 = pb0[1]; + tmp1 += a0 * b1; + + b2 = pb0[2]; + tmp2 += a0 * b2; + + b3 = pb0[3]; + tmp3 += a0 * b3; + + pa0 += 1; + pb0 += 4; + } + + if ((temp - 1) & 1) + { + a0 = pa0[0]; + b0 
= pb0[0]; + tmp0 += a0 * b0; + + b1 = pb0[1]; + tmp1 += a0 * b1; + + b2 = pb0[2]; + tmp2 += a0 * b2; + + b3 = pb0[3]; + tmp3 += a0 * b3; + + pa0 += 1; + pb0 += 4; + } + + tmp0 = alpha * tmp0; + tmp1 = alpha * tmp1; + tmp2 = alpha * tmp2; + tmp3 = alpha * tmp3; + +#if defined(TRMMKERNEL) + pc0[0] = tmp0; + pc1[0] = tmp1; + pc2[0] = tmp2; + pc3[0] = tmp3; +#else + pc0[0] += tmp0; + pc1[0] += tmp1; + pc2[0] += tmp2; + pc3[0] += tmp3; +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 1; // number of values in A +#else + temp -= 4; // number of values in B +#endif + pa0 += temp * 1; + pb0 += temp * 4; +#endif + +#ifdef LEFT + off += 1; // number of values in A +#endif +#endif + + pc0 += 1; + pc1 += 1; + pc2 += 1; + pc3 += 1; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 4; // number of values in A +#endif + + l = (k << 2); + B = B + l; + i = (ldc << 2); + C = C + i; + } + + if (n & 2) + { + pc0 = C; + pc1 = pc0 + ldc; + + pa0 = A; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + for (i = (m >> 3); i--;) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 8; + pb0 = B + off * 2; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 8; // number of values in A +#else + temp = off + 2; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + + + LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); + src_b0 = LD_DP(pb0); + pb0 += 2; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 = src_a0 * src_b; + res1 = src_a1 * src_b; + res2 = src_a2 * src_b; + res3 = src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res4 = src_a0 * src_b; + res5 = src_a1 * src_b; + res6 = src_a2 * src_b; + res7 = src_a3 * src_b; + + for (l = ((temp - 1) >> 1); l--;) + { + LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); + src_b0 = LD_DP(pb0); + pb0 += 2; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 += src_a0 * src_b; + res1 += src_a1 * src_b; + res2 += src_a2 * src_b; + res3 += src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res4 += src_a0 * src_b; + res5 += src_a1 * src_b; + res6 += src_a2 * src_b; + res7 += src_a3 * src_b; + + LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); + src_b0 = LD_DP(pb0); + pb0 += 2; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 += src_a0 * src_b; + res1 += src_a1 * src_b; + res2 += src_a2 * src_b; + res3 += src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res4 += src_a0 * src_b; + res5 += src_a1 * src_b; + res6 += src_a2 * src_b; + res7 += src_a3 * src_b; + } + + if ((temp - 1) & 1) + { + LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); + src_b0 = LD_DP(pb0); + pb0 += 2; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 += src_a0 * src_b; + res1 += src_a1 * src_b; + res2 += src_a2 * src_b; + res3 += src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res4 += src_a0 * src_b; + res5 += src_a1 * src_b; + res6 += src_a2 * src_b; + res7 += src_a3 * src_b; + } + +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; + dst1 = res1 * v_alpha; + dst2 = res2 * v_alpha; + dst3 = res3 * v_alpha; + dst4 = res4 * v_alpha; + dst5 = res5 * 
v_alpha; + dst6 = res6 * v_alpha; + dst7 = res7 * v_alpha; +#else + LD_DP4(pc0, 2, dst0, dst1, dst2, dst3); + LD_DP4(pc1, 2, dst4, dst5, dst6, dst7); + + dst0 += res0 * v_alpha; + dst1 += res1 * v_alpha; + dst2 += res2 * v_alpha; + dst3 += res3 * v_alpha; + dst4 += res4 * v_alpha; + dst5 += res5 * v_alpha; + dst6 += res6 * v_alpha; + dst7 += res7 * v_alpha; +#endif + ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); + ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2); + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 8; // number of values in A +#else + temp -= 2; // number of values in B +#endif + pa0 += temp * 8; + pb0 += temp * 2; +#endif + +#ifdef LEFT + off += 8; // number of values in A +#endif +#endif + } + + if (m & 4) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 4; + pb0 = B + off * 2; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 4; // number of values in A +#else + temp = off + 2; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + + LD_DP2_INC(pa0, 2, src_a0, src_a1); + src_b0 = LD_DP(pb0); + pb0 += 2; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 = src_a0 * src_b; + res1 = src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res2 = src_a0 * src_b; + res3 = src_a1 * src_b; + + for (l = ((temp - 1) >> 1); l--;) + { + LD_DP2_INC(pa0, 2, src_a0, src_a1); + src_b0 = LD_DP(pb0); + pb0 += 2; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 += src_a0 * src_b; + res1 += src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res2 += src_a0 * src_b; + res3 += src_a1 * src_b; + + LD_DP2_INC(pa0, 2, src_a0, src_a1); + src_b0 = LD_DP(pb0); + pb0 += 2; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 += src_a0 * src_b; + res1 += src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res2 += src_a0 * src_b; + res3 += src_a1 * src_b; + } + + if ((temp - 1) & 1) + { + LD_DP2_INC(pa0, 2, src_a0, src_a1); + src_b0 = LD_DP(pb0); + pb0 += 2; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 += src_a0 * src_b; + res1 += src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res2 += src_a0 * src_b; + res3 += src_a1 * src_b; + } + +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; + dst1 = res1 * v_alpha; + dst2 = res2 * v_alpha; + dst3 = res3 * v_alpha; +#else + LD_DP2(pc0, 2, dst0, dst1); + LD_DP2(pc1, 2, dst2, dst3); + + dst0 += res0 * v_alpha; + dst1 += res1 * v_alpha; + dst2 += res2 * v_alpha; + dst3 += res3 * v_alpha; +#endif + ST_DP2_INC(dst0, dst1, pc0, 2); + ST_DP2_INC(dst2, dst3, pc1, 2); + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 4; // number of values in A +#else + temp -= 2; // number of values in B +#endif + pa0 += temp * 4; + pb0 += temp * 2; +#endif + +#ifdef LEFT + off += 4; // number of values in A +#endif +#endif + } + + if (m & 2) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2; + pb0 = B + off * 2; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && 
defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 2; // number of values in A +#else + temp = off + 2; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + + src_a0 = LD_DP(pa0); + pa0 += 2; + src_b0 = LD_DP(pb0); + pb0 += 2; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 = src_a0 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res1 = src_a0 * src_b; + + for (l = ((temp - 1) >> 1); l--;) + { + src_a0 = LD_DP(pa0); + pa0 += 2; + src_b0 = LD_DP(pb0); + pb0 += 2; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 += src_a0 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res1 += src_a0 * src_b; + + src_a0 = LD_DP(pa0); + pa0 += 2; + src_b0 = LD_DP(pb0); + pb0 += 2; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 += src_a0 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res1 += src_a0 * src_b; + } + + if ((temp - 1) & 1) + { + src_a0 = LD_DP(pa0); + pa0 += 2; + src_b0 = LD_DP(pb0); + pb0 += 2; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 += src_a0 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res1 += src_a0 * src_b; + } + +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; + dst1 = res1 * v_alpha; +#else + dst0 = LD_DP(pc0); + dst1 = LD_DP(pc1); + + dst0 += res0 * v_alpha; + dst1 += res1 * v_alpha; +#endif + ST_DP(dst0, pc0); + ST_DP(dst1, pc1); + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 2; // number of values in A +#else + temp -= 2; // number of values in B +#endif + pa0 += temp * 2; + pb0 += temp * 2; +#endif + +#ifdef LEFT + off += 2; // number of values in A +#endif +#endif + pc0 += 2; + pc1 += 2; + } + + if (m & 1) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 1; + pb0 = B + off * 2; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 1; // number of values in A +#else + temp = off + 2; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 = a0 * b0; + + b1 = pb0[1]; + tmp1 = a0 * b1; + + pa0 += 1; + pb0 += 2; + + for (l = ((temp - 1) >> 1); l--;) + { + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 += a0 * b0; + + b1 = pb0[1]; + tmp1 += a0 * b1; + + pa0 += 1; + pb0 += 2; + + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 += a0 * b0; + + b1 = pb0[1]; + tmp1 += a0 * b1; + + pa0 += 1; + pb0 += 2; + } + + if ((temp - 1) & 1) + { + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 += a0 * b0; + + b1 = pb0[1]; + tmp1 += a0 * b1; + + pa0 += 1; + pb0 += 2; + } + + tmp0 = alpha * tmp0; + tmp1 = alpha * tmp1; + +#if defined(TRMMKERNEL) + pc0[0] = tmp0; + pc1[0] = tmp1; +#else + pc0[0] += tmp0; + pc1[0] += tmp1; +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 1; // number of values in A +#else + temp -= 2; // number of values in B +#endif + pa0 += temp * 1; + pb0 += temp * 2; +#endif + +#ifdef LEFT + off += 1; // number of values in A +#endif +#endif + + pc0 += 1; + pc1 += 1; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; // number of values in A +#endif + + l = (k << 1); + B = B + l; + i = (ldc << 1); + C = C + i; + } + 
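+ /* Final single column of B: the scalar b value is written into both lanes of src_b, and the same 8/4/2/1 row blocking is repeated for this one column. */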
+ if (n & 1) + { + pc0 = C; + pa0 = A; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + for (i = (m >> 3); i--;) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 8; + pb0 = B + off * 1; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 8; // number of values in A +#else + temp = off + 1; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + + LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); + src_b[0] = pb0[0]; + src_b[1] = pb0[0]; + + res0 = src_a0 * src_b; + res1 = src_a1 * src_b; + res2 = src_a2 * src_b; + res3 = src_a3 * src_b; + + pb0 += 1; + + for (l = ((temp - 1) >> 1); l--;) + { + LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); + src_b[0] = pb0[0]; + src_b[1] = pb0[0]; + + res0 += src_a0 * src_b; + res1 += src_a1 * src_b; + res2 += src_a2 * src_b; + res3 += src_a3 * src_b; + + pb0 += 1; + + LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); + src_b[0] = pb0[0]; + src_b[1] = pb0[0]; + + res0 += src_a0 * src_b; + res1 += src_a1 * src_b; + res2 += src_a2 * src_b; + res3 += src_a3 * src_b; + + pb0 += 1; + } + + if ((temp - 1) & 1) + { + LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); + src_b[0] = pb0[0]; + src_b[1] = pb0[0]; + + res0 += src_a0 * src_b; + res1 += src_a1 * src_b; + res2 += src_a2 * src_b; + res3 += src_a3 * src_b; + + pb0 += 1; + } + +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; + dst1 = res1 * v_alpha; + dst2 = res2 * v_alpha; + dst3 = res3 * v_alpha; +#else + LD_DP4(pc0, 2, dst0, dst1, dst2, dst3); + + dst0 += res0 * v_alpha; + dst1 += res1 * v_alpha; + dst2 += res2 * v_alpha; + dst3 += res3 * v_alpha; +#endif + ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 8; // number of values in A +#else + temp -= 1; // number of values in B +#endif + pa0 += temp * 8; + pb0 += temp * 1; +#endif + +#ifdef LEFT + off += 8; // number of values in A +#endif +#endif + } + + if (m & 4) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 4; + pb0 = B + off * 1; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 4; // number of values in A +#else + temp = off + 1; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + + LD_DP2_INC(pa0, 2, src_a0, src_a1); + src_b[0] = pb0[0]; + src_b[1] = pb0[0]; + + res0 = src_a0 * src_b; + res1 = src_a1 * src_b; + + pb0 += 1; + + for (l = ((temp - 1) >> 1); l--;) + { + LD_DP2_INC(pa0, 2, src_a0, src_a1); + src_b[0] = pb0[0]; + src_b[1] = pb0[0]; + + res0 += src_a0 * src_b; + res1 += src_a1 * src_b; + + pb0 += 1; + + LD_DP2_INC(pa0, 2, src_a0, src_a1); + src_b[0] = pb0[0]; + src_b[1] = pb0[0]; + + res0 += src_a0 * src_b; + res1 += src_a1 * src_b; + + pb0 += 1; + } + + if ((temp - 1) & 1) + { + LD_DP2_INC(pa0, 2, src_a0, src_a1); + src_b[0] = pb0[0]; + src_b[1] = pb0[0]; + + res0 += src_a0 * src_b; + res1 += src_a1 * src_b; + + pb0 += 1; + } + +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; + dst1 = res1 * v_alpha; +#else + LD_DP2(pc0, 2, dst0, dst1); + + dst0 += res0 * v_alpha; + dst1 += res1 * v_alpha; +#endif + ST_DP2_INC(dst0, dst1, pc0, 2); + +#if defined(TRMMKERNEL) 
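+ /* TRMMKERNEL only: move pa0/pb0 past the remaining temp iterations of the packed A/B blocks and, for LEFT, bump the off counter by 4. */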
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 4; // number of values in A +#else + temp -= 1; // number of values in B +#endif + pa0 += temp * 4; + pb0 += temp * 1; +#endif + +#ifdef LEFT + off += 4; // number of values in A +#endif +#endif + } + + if (m & 2) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2; + pb0 = B + off * 1; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 2; // number of values in A +#else + temp = off + 1; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + + src_a0 = LD_DP(pa0); + src_b[0] = pb0[0]; + src_b[1] = pb0[0]; + + res0 = src_a0 * src_b; + + pa0 += 2; + pb0 += 1; + + for (l = ((temp - 1) >> 1); l--;) + { + src_a0 = LD_DP(pa0); + src_b[0] = pb0[0]; + src_b[1] = pb0[0]; + + res0 += src_a0 * src_b; + + pa0 += 2; + pb0 += 1; + + src_a0 = LD_DP(pa0); + src_b[0] = pb0[0]; + src_b[1] = pb0[0]; + + res0 += src_a0 * src_b; + + pa0 += 2; + pb0 += 1; + } + + if ((temp - 1) & 1) + { + src_a0 = LD_DP(pa0); + src_b[0] = pb0[0]; + src_b[1] = pb0[0]; + + res0 += src_a0 * src_b; + + pa0 += 2; + pb0 += 1; + } + +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; +#else + dst0 = LD_DP(pc0); + + dst0 += res0 * v_alpha; +#endif + ST_DP(dst0, pc0); + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 2; // number of values in A +#else + temp -= 1; // number of values in B +#endif + pa0 += temp * 2; + pb0 += temp * 1; +#endif + +#ifdef LEFT + off += 2; // number of values in A +#endif +#endif + pc0 += 2; + } + + if (m & 1) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 1; + pb0 = B + off * 1; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 1; // number of values in A +#else + temp = off + 1; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 = a0 * b0; + + pa0 += 1; + pb0 += 1; + + for (l = ((temp - 1) >> 1); l--;) + { + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 += a0 * b0; + + pa0 += 1; + pb0 += 1; + + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 += a0 * b0; + + pa0 += 1; + pb0 += 1; + } + + if ((temp - 1) & 1) + { + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 += a0 * b0; + + pa0 += 1; + pb0 += 1; + } + +#if defined(TRMMKERNEL) + pc0[0] = alpha * tmp0; +#else + pc0[0] += alpha * tmp0; +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 1; // number of values in A +#else + temp -= 1; // number of values in B +#endif + pa0 += temp * 1; + pb0 += temp * 1; +#endif + +#ifdef LEFT + off += 1; // number of values in A +#endif +#endif + + pc0 += 1; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 1; // number of values in A +#endif + + l = (k << 0); + B = B + l; + i = (ldc << 0); + C = C + i; + } + + return 0; +} diff --git a/kernel/mips/dgemm_ncopy_4_msa.c b/kernel/mips/dgemm_ncopy_4_msa.c new file mode 100644 index 0000000000..a61b2e8062 --- /dev/null +++ b/kernel/mips/dgemm_ncopy_4_msa.c @@ -0,0 +1,118 @@ 
+/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, + FLOAT * __restrict dst) +{ + BLASLONG i, j; + FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *pdst; + v2f64 src0, src1, src2, src3, src4, src5, src6, src7; + v2f64 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + + psrc0 = src; + pdst = dst; + + for (j = (n >> 2); j--;) + { + psrc1 = psrc0; + psrc2 = psrc1 + lda; + psrc3 = psrc2 + lda; + psrc4 = psrc3 + lda; + psrc0 += 4 * lda; + + for (i = (m >> 2); i--;) + { + LD_DP2_INC(psrc1, 2, src0, src1); + LD_DP2_INC(psrc2, 2, src2, src3); + LD_DP2_INC(psrc3, 2, src4, src5); + LD_DP2_INC(psrc4, 2, src6, src7); + + ILVRL_D2_DP(src2, src0, dst0, dst4); + ILVRL_D2_DP(src6, src4, dst1, dst5); + ILVRL_D2_DP(src3, src1, dst2, dst6); + ILVRL_D2_DP(src7, src5, dst3, dst7); + + ST_DP8_INC(dst0, dst1, dst4, dst5, dst2, dst3, dst6, dst7, pdst, 2); + } + + for (i = (m & 3); i--;) + { + *pdst++ = *psrc1++; + *pdst++ = *psrc2++; + *pdst++ = *psrc3++; + *pdst++ = *psrc4++; + } + } + + if (n & 2) + { + psrc1 = psrc0; + psrc2 = psrc1 + lda; + psrc0 += 2 * lda; + + for (i = (m >> 2); i--;) + { + LD_DP2_INC(psrc1, 2, src0, src1); + LD_DP2_INC(psrc2, 2, src2, src3); + + ILVRL_D2_DP(src2, src0, dst0, dst4); + ILVRL_D2_DP(src3, src1, dst1, dst5); + + ST_DP4_INC(dst0, dst4, dst1, dst5, pdst, 2); + } + + for (i = (m & 3); i--;) + { + *pdst++ = *psrc1++; + *pdst++ = *psrc2++; + } + } + + if (n & 1) + { + psrc1 = psrc0; + + for (i = (m >> 2); i--;) + { + LD_DP2(psrc1, 2, src0, src1); + psrc1 += 4; + + ST_DP2(src0, src1, pdst, 2); + pdst += 4; + } + + for (i = (m & 3); i--;) + { + *pdst++ = *psrc1++; + } + } + + return 0; +} diff --git a/kernel/mips/dgemm_ncopy_8_msa.c b/kernel/mips/dgemm_ncopy_8_msa.c new file mode 100644 index 0000000000..86d019c4f5 --- /dev/null +++ b/kernel/mips/dgemm_ncopy_8_msa.c @@ -0,0 
+1,186 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, + FLOAT * __restrict dst) +{ + BLASLONG i, j; + FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *psrc5, *psrc6, *psrc7; + FLOAT *psrc8, *pdst; + v2f64 src0, src1, src2, src3, src4, src5, src6, src7; + v2f64 src8, src9, src10, src11, src12, src13, src14, src15; + v2f64 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + + psrc0 = src; + pdst = dst; + + for (j = (n >> 3); j--;) + { + psrc1 = psrc0; + psrc2 = psrc1 + lda; + psrc3 = psrc2 + lda; + psrc4 = psrc3 + lda; + psrc5 = psrc4 + lda; + psrc6 = psrc5 + lda; + psrc7 = psrc6 + lda; + psrc8 = psrc7 + lda; + psrc0 += 8 * lda; + + for (i = (m >> 3); i--;) + { + LD_DP2_INC(psrc1, 2, src0, src1); + LD_DP2_INC(psrc2, 2, src2, src3); + LD_DP2_INC(psrc3, 2, src4, src5); + LD_DP2_INC(psrc4, 2, src6, src7); + LD_DP2_INC(psrc5, 2, src8, src9); + LD_DP2_INC(psrc6, 2, src10, src11); + LD_DP2_INC(psrc7, 2, src12, src13); + LD_DP2_INC(psrc8, 2, src14, src15); + + ILVRL_D2_DP(src2, src0, dst0, dst4); + ILVRL_D2_DP(src6, src4, dst1, dst5); + ILVRL_D2_DP(src10, src8, dst2, dst6); + ILVRL_D2_DP(src14, src12, dst3, dst7); + + ST_DP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 2); + + ILVRL_D2_DP(src3, src1, dst0, dst4); + ILVRL_D2_DP(src7, src5, dst1, dst5); + ILVRL_D2_DP(src11, src9, dst2, dst6); + ILVRL_D2_DP(src15, src13, dst3, dst7); + + ST_DP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 2); + + LD_DP2_INC(psrc1, 2, src0, src1); + LD_DP2_INC(psrc2, 2, src2, src3); + LD_DP2_INC(psrc3, 2, src4, src5); + LD_DP2_INC(psrc4, 2, src6, src7); + LD_DP2_INC(psrc5, 2, src8, src9); + LD_DP2_INC(psrc6, 2, src10, src11); + LD_DP2_INC(psrc7, 2, src12, src13); + LD_DP2_INC(psrc8, 2, src14, src15); + + ILVRL_D2_DP(src2, src0, dst0, dst4); + ILVRL_D2_DP(src6, src4, dst1, dst5); 
+ ILVRL_D2_DP(src10, src8, dst2, dst6); + ILVRL_D2_DP(src14, src12, dst3, dst7); + + ST_DP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 2); + + ILVRL_D2_DP(src3, src1, dst0, dst4); + ILVRL_D2_DP(src7, src5, dst1, dst5); + ILVRL_D2_DP(src11, src9, dst2, dst6); + ILVRL_D2_DP(src15, src13, dst3, dst7); + + ST_DP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 2); + } + + for (i = (m & 7); i--;) + { + *pdst++ = *psrc1++; + *pdst++ = *psrc2++; + *pdst++ = *psrc3++; + *pdst++ = *psrc4++; + *pdst++ = *psrc5++; + *pdst++ = *psrc6++; + *pdst++ = *psrc7++; + *pdst++ = *psrc8++; + } + } + + if (n & 4) + { + psrc1 = psrc0; + psrc2 = psrc1 + lda; + psrc3 = psrc2 + lda; + psrc4 = psrc3 + lda; + psrc0 += 4 * lda; + + for (i = (m >> 2); i--;) + { + LD_DP2_INC(psrc1, 2, src0, src1); + LD_DP2_INC(psrc2, 2, src2, src3); + LD_DP2_INC(psrc3, 2, src4, src5); + LD_DP2_INC(psrc4, 2, src6, src7); + + ILVRL_D2_DP(src2, src0, dst0, dst4); + ILVRL_D2_DP(src6, src4, dst1, dst5); + ILVRL_D2_DP(src3, src1, dst2, dst6); + ILVRL_D2_DP(src7, src5, dst3, dst7); + + ST_DP8_INC(dst0, dst1, dst4, dst5, dst2, dst3, dst6, dst7, pdst, 2); + } + + for (i = (m & 3); i--;) + { + *pdst++ = *psrc1++; + *pdst++ = *psrc2++; + *pdst++ = *psrc3++; + *pdst++ = *psrc4++; + } + } + + if (n & 2) + { + psrc1 = psrc0; + psrc2 = psrc1 + lda; + psrc0 += 2 * lda; + + for (i = (m >> 1); i--;) + { + src0 = LD_DP(psrc1); + src1 = LD_DP(psrc2); + psrc1 += 2; + psrc2 += 2; + + ILVRL_D2_DP(src1, src0, dst0, dst1); + + ST_DP2_INC(dst0, dst1, pdst, 2); + } + + if (m & 1) + { + *pdst++ = *psrc1++; + *pdst++ = *psrc2++; + } + } + + if (n & 1) + { + psrc1 = psrc0; + + for (i = m; i--;) + { + *pdst++ = *psrc1++; + } + } + + return 0; +} diff --git a/kernel/mips/dgemm_tcopy_4_msa.c b/kernel/mips/dgemm_tcopy_4_msa.c new file mode 100644 index 0000000000..a51c474297 --- /dev/null +++ b/kernel/mips/dgemm_tcopy_4_msa.c @@ -0,0 +1,153 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, + FLOAT * __restrict dst) +{ + BLASLONG i, j; + FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4; + FLOAT *pdst0, *pdst1, *pdst2, *pdst3; + v2f64 src0, src1, src2, src3, src4, src5, src6, src7; + + psrc0 = src; + pdst0 = dst; + + pdst2 = dst + m * (n & ~3); + pdst3 = dst + m * (n & ~1); + + for (j = (m >> 2); j--;) + { + psrc1 = psrc0; + psrc2 = psrc1 + lda; + psrc3 = psrc2 + lda; + psrc4 = psrc3 + lda; + psrc0 += 4 * lda; + + pdst1 = pdst0; + pdst0 += 16; + + for (i = (n >> 2); i--;) + { + LD_DP2_INC(psrc1, 2, src0, src1); + LD_DP2_INC(psrc2, 2, src2, src3); + LD_DP2_INC(psrc3, 2, src4, src5); + LD_DP2_INC(psrc4, 2, src6, src7); + + ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2); + pdst1 += m * 4; + } + + if (n & 2) + { + src0 = LD_DP(psrc1); + src1 = LD_DP(psrc2); + src2 = LD_DP(psrc3); + src3 = LD_DP(psrc4); + psrc1 += 2; + psrc2 += 2; + psrc3 += 2; + psrc4 += 2; + + ST_DP4_INC(src0, src1, src2, src3, pdst2, 2); + } + + if (n & 1) + { + *pdst3++ = *psrc1++; + *pdst3++ = *psrc2++; + *pdst3++ = *psrc3++; + *pdst3++ = *psrc4++; + } + } + + if (m & 2) + { + psrc1 = psrc0; + psrc2 = psrc1 + lda; + psrc0 += 2 * lda; + + pdst1 = pdst0; + pdst0 += 8; + + for (i = (n >> 2); i--;) + { + LD_DP2_INC(psrc1, 2, src0, src1); + LD_DP2_INC(psrc2, 2, src2, src3); + + ST_DP4(src0, src1, src2, src3, pdst1, 2); + pdst1 += m * 4; + } + + if (n & 2) + { + src0 = LD_DP(psrc1); + src1 = LD_DP(psrc2); + psrc1 += 2; + psrc2 += 2; + + ST_DP2_INC(src0, src1, pdst2, 2); + } + + if (n & 1) + { + *pdst3++ = *psrc1++; + *pdst3++ = *psrc2++; + } + } + + if (m & 1) + { + psrc1 = psrc0; + pdst1 = pdst0; + + for (i = (n >> 2); i--;) + { + LD_DP2_INC(psrc1, 2, src0, src1); + + ST_DP2(src0, src1, pdst1, 2); + pdst1 += 4 * m; + } + + if (n & 2) + { + src0 = LD_DP(psrc1); + psrc1 += 2; + + ST_DP(src0, pdst2); + } + + if (n & 1) + { + *pdst3 = *psrc1; + } + } + + return 0; +} diff --git a/kernel/mips/dgemm_tcopy_8_msa.c b/kernel/mips/dgemm_tcopy_8_msa.c new file mode 100644 index 0000000000..350ecb3595 --- /dev/null +++ b/kernel/mips/dgemm_tcopy_8_msa.c @@ -0,0 +1,276 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, + FLOAT * __restrict dst) +{ + BLASLONG i, j; + FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4; + FLOAT *psrc5, *psrc6, *psrc7, *psrc8; + FLOAT *pdst0, *pdst1, *pdst2, *pdst3, *pdst4; + v2f64 src0, src1, src2, src3, src4, src5, src6, src7; + v2f64 src8, src9, src10, src11, src12, src13, src14, src15; + + psrc0 = src; + pdst0 = dst; + + pdst2 = dst + m * (n & ~7); + pdst3 = dst + m * (n & ~3); + pdst4 = dst + m * (n & ~1); + + for (j = (m >> 3); j--;) + { + psrc1 = psrc0; + psrc2 = psrc1 + lda; + psrc3 = psrc2 + lda; + psrc4 = psrc3 + lda; + psrc5 = psrc4 + lda; + psrc6 = psrc5 + lda; + psrc7 = psrc6 + lda; + psrc8 = psrc7 + lda; + psrc0 += 8 * lda; + + pdst1 = pdst0; + pdst0 += 64; + + for (i = (n >> 3); i--;) + { + LD_DP4_INC(psrc1, 2, src0, src1, src2, src3); + LD_DP4_INC(psrc2, 2, src4, src5, src6, src7); + LD_DP4_INC(psrc3, 2, src8, src9, src10, src11); + LD_DP4_INC(psrc4, 2, src12, src13, src14, src15); + + ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2); + ST_DP8(src8, src9, src10, src11, src12, src13, src14, src15, + pdst1 + 16, 2); + + LD_DP4_INC(psrc5, 2, src0, src1, src2, src3); + LD_DP4_INC(psrc6, 2, src4, src5, src6, src7); + LD_DP4_INC(psrc7, 2, src8, src9, src10, src11); + LD_DP4_INC(psrc8, 2, src12, src13, src14, src15); + + ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1 + 32, + 2); + ST_DP8(src8, src9, src10, src11, src12, src13, src14, src15, + pdst1 + 48, 2); + pdst1 += m * 8; + } + + if (n & 4) + { + LD_DP2_INC(psrc1, 2, src0, src1); + LD_DP2_INC(psrc2, 2, src2, src3); + LD_DP2_INC(psrc3, 2, src4, src5); + LD_DP2_INC(psrc4, 2, src6, src7); + LD_DP2_INC(psrc5, 2, src8, src9); + LD_DP2_INC(psrc6, 2, src10, src11); + LD_DP2_INC(psrc7, 2, src12, src13); + LD_DP2_INC(psrc8, 2, src14, src15); + + ST_DP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 2); + ST_DP8_INC(src8, src9, src10, src11, src12, src13, src14, src15, + pdst2, 2); + } + + if (n & 2) + { + src0 = LD_DP(psrc1); + src1 = LD_DP(psrc2); + src2 = LD_DP(psrc3); + src3 = LD_DP(psrc4); + src4 = LD_DP(psrc5); + src5 = LD_DP(psrc6); + src6 = LD_DP(psrc7); + src7 = LD_DP(psrc8); + psrc1 += 2; + psrc2 += 2; + psrc3 += 2; + psrc4 += 2; + psrc5 += 2; + psrc6 += 2; + psrc7 += 2; + psrc8 += 2; + + ST_DP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst3, 2); + } + + if (n & 1) + { + *pdst4++ = *psrc1++; + *pdst4++ = *psrc2++; + *pdst4++ = *psrc3++; + *pdst4++ = *psrc4++; + *pdst4++ = *psrc5++; + *pdst4++ = *psrc6++; + *pdst4++ = *psrc7++; + *pdst4++ = *psrc8++; + } + } + + if (m & 4) + { + psrc1 = psrc0; + psrc2 = psrc1 + lda; + psrc3 = psrc2 + lda; + psrc4 = psrc3 + lda; + psrc0 += 4 * lda; + + pdst1 = pdst0; + pdst0 += 32; + + for (i = (n >> 3); i--;) + { + LD_DP4_INC(psrc1, 2, src0, src1, src2, src3); + LD_DP4_INC(psrc2, 2, src4, src5, src6, src7); + 
LD_DP4_INC(psrc3, 2, src8, src9, src10, src11); + LD_DP4_INC(psrc4, 2, src12, src13, src14, src15); + + ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2); + ST_DP8(src8, src9, src10, src11, src12, src13, src14, src15, + pdst1 + 16, 2); + pdst1 += 8 * m; + } + + if (n & 4) + { + LD_DP2_INC(psrc1, 2, src0, src1); + LD_DP2_INC(psrc2, 2, src2, src3); + LD_DP2_INC(psrc3, 2, src4, src5); + LD_DP2_INC(psrc4, 2, src6, src7); + + ST_DP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 2); + } + + if (n & 2) + { + src0 = LD_DP(psrc1); + src1 = LD_DP(psrc2); + src2 = LD_DP(psrc3); + src3 = LD_DP(psrc4); + psrc1 += 2; + psrc2 += 2; + psrc3 += 2; + psrc4 += 2; + + ST_DP4_INC(src0, src1, src2, src3, pdst3, 2); + } + + if (n & 1) + { + *pdst4++ = *psrc1++; + *pdst4++ = *psrc2++; + *pdst4++ = *psrc3++; + *pdst4++ = *psrc4++; + } + } + + if (m & 2) + { + psrc1 = psrc0; + psrc2 = psrc1 + lda; + psrc0 += 2 * lda; + + pdst1 = pdst0; + pdst0 += 16; + + for (i = (n >> 3); i--;) + { + LD_DP4_INC(psrc1, 2, src0, src1, src2, src3); + LD_DP4_INC(psrc2, 2, src4, src5, src6, src7); + + ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2); + pdst1 += 8 * m; + } + + if (n & 4) + { + LD_DP2_INC(psrc1, 2, src0, src1); + LD_DP2_INC(psrc2, 2, src2, src3); + + ST_DP4_INC(src0, src1, src2, src3, pdst2, 2); + } + + if (n & 2) + { + src0 = LD_DP(psrc1); + src1 = LD_DP(psrc2); + psrc1 += 2; + psrc2 += 2; + + ST_DP2_INC(src0, src1, pdst3, 2); + } + + if (n & 1) + { + *pdst4++ = *psrc1++; + *pdst4++ = *psrc2++; + } + } + + if (m & 1) + { + psrc1 = psrc0; + psrc0 += lda; + + pdst1 = pdst0; + pdst0 += 8; + + for (i = (n >> 3); i--;) + { + LD_DP4_INC(psrc1, 2, src0, src1, src2, src3); + + ST_DP4(src0, src1, src2, src3, pdst1, 2); + pdst1 += 8 * m; + } + + if (n & 4) + { + LD_DP2_INC(psrc1, 2, src0, src1); + + ST_DP2_INC(src0, src1, pdst2, 2); + } + + if (n & 2) + { + src0 = LD_DP(psrc1); + psrc1 += 2; + + ST_DP(src0, pdst3); + pdst3 += 2; + } + + if (n & 1) + { + *pdst4++ = *psrc1++; + } + } + + return 0; +} diff --git a/kernel/mips/dgemv_n_msa.c b/kernel/mips/dgemv_n_msa.c new file mode 100644 index 0000000000..09bb063ff2 --- /dev/null +++ b/kernel/mips/dgemv_n_msa.c @@ -0,0 +1,577 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +#define DGEMV_N_8x8() \ +{ \ + LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \ + LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \ + LD_DP4(pa2 + k, 2, t8, t9, t10, t11); \ + LD_DP4(pa3 + k, 2, t12, t13, t14, t15); \ + LD_DP4(pa4 + k, 2, t16, t17, t18, t19); \ + LD_DP4(pa5 + k, 2, t20, t21, t22, t23); \ + LD_DP4(pa6 + k, 2, t24, t25, t26, t27); \ + LD_DP4(pa7 + k, 2, t28, t29, t30, t31); \ + \ + y0 += tp0 * t0; \ + y1 += tp0 * t1; \ + y2 += tp0 * t2; \ + y3 += tp0 * t3; \ + \ + y0 += tp1 * t4; \ + y1 += tp1 * t5; \ + y2 += tp1 * t6; \ + y3 += tp1 * t7; \ + \ + y0 += tp2 * t8; \ + y1 += tp2 * t9; \ + y2 += tp2 * t10; \ + y3 += tp2 * t11; \ + \ + y0 += tp3 * t12; \ + y1 += tp3 * t13; \ + y2 += tp3 * t14; \ + y3 += tp3 * t15; \ + \ + y0 += tp4 * t16; \ + y1 += tp4 * t17; \ + y2 += tp4 * t18; \ + y3 += tp4 * t19; \ + \ + y0 += tp5 * t20; \ + y1 += tp5 * t21; \ + y2 += tp5 * t22; \ + y3 += tp5 * t23; \ + \ + y0 += tp6 * t24; \ + y1 += tp6 * t25; \ + y2 += tp6 * t26; \ + y3 += tp6 * t27; \ + \ + y0 += tp7 * t28; \ + y1 += tp7 * t29; \ + y2 += tp7 * t30; \ + y3 += tp7 * t31; \ +} + +#define DGEMV_N_4x8() \ +{ \ + LD_DP2(pa0 + k, 2, t0, t1); \ + LD_DP2(pa1 + k, 2, t4, t5); \ + LD_DP2(pa2 + k, 2, t8, t9); \ + LD_DP2(pa3 + k, 2, t12, t13); \ + LD_DP2(pa4 + k, 2, t16, t17); \ + LD_DP2(pa5 + k, 2, t20, t21); \ + LD_DP2(pa6 + k, 2, t24, t25); \ + LD_DP2(pa7 + k, 2, t28, t29); \ + \ + y0 += tp0 * t0; \ + y1 += tp0 * t1; \ + \ + y0 += tp1 * t4; \ + y1 += tp1 * t5; \ + \ + y0 += tp2 * t8; \ + y1 += tp2 * t9; \ + \ + y0 += tp3 * t12; \ + y1 += tp3 * t13; \ + \ + y0 += tp4 * t16; \ + y1 += tp4 * t17; \ + \ + y0 += tp5 * t20; \ + y1 += tp5 * t21; \ + \ + y0 += tp6 * t24; \ + y1 += tp6 * t25; \ + \ + y0 += tp7 * t28; \ + y1 += tp7 * t29; \ +} + +#define DGEMV_N_8x4() \ +{ \ + LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \ + LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \ + LD_DP4(pa2 + k, 2, t8, t9, t10, t11); \ + LD_DP4(pa3 + k, 2, t12, t13, t14, t15); \ + \ + y0 += tp0 * t0; \ + y1 += tp0 * t1; \ + y2 += tp0 * t2; \ + y3 += tp0 * t3; \ + \ + y0 += tp1 * t4; \ + y1 += tp1 * t5; \ + y2 += tp1 * t6; \ + y3 += tp1 * t7; \ + \ + y0 += tp2 * t8; \ + y1 += tp2 * t9; \ + y2 += tp2 * t10; \ + y3 += tp2 * t11; \ + \ + y0 += tp3 * t12; \ + y1 += tp3 * t13; \ + y2 += tp3 * t14; \ + y3 += tp3 * t15; \ +} + +#define DGEMV_N_4x4() \ +{ \ + LD_DP2(pa0 + k, 2, t0, t1); \ + LD_DP2(pa1 + k, 2, t4, t5); \ + LD_DP2(pa2 + k, 2, t8, t9); \ + LD_DP2(pa3 + k, 2, t12, t13); \ + \ + y0 += tp0 * t0; \ + y1 += tp0 * t1; \ + \ + y0 += tp1 * t4; \ + y1 += tp1 * t5; \ + \ + y0 += tp2 * t8; \ + y1 += tp2 * t9; \ + \ + y0 += tp3 * t12; \ + y1 += tp3 * t13; \ +} + +#define DGEMV_N_8x2() \ +{ \ + LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \ + LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \ + \ + y0 += tp0 * t0; \ + y1 += tp0 * t1; \ + y2 += tp0 * t2; \ + y3 += tp0 * t3; \ + \ + y0 += tp1 * t4; \ + y1 += tp1 * t5; \ + y2 += tp1 * t6; \ + y3 += tp1 * t7; \ +} 
+ +#define DGEMV_N_4x2() \ +{ \ + LD_DP2(pa0 + k, 2, t0, t1); \ + LD_DP2(pa1 + k, 2, t4, t5); \ + \ + y0 += tp0 * t0; \ + y1 += tp0 * t1; \ + \ + y0 += tp1 * t4; \ + y1 += tp1 * t5; \ +} + +#define DLOAD_X8_SCALE_GP() \ + temp0 = alpha * x[0 * inc_x]; \ + temp1 = alpha * x[1 * inc_x]; \ + temp2 = alpha * x[2 * inc_x]; \ + temp3 = alpha * x[3 * inc_x]; \ + temp4 = alpha * x[4 * inc_x]; \ + temp5 = alpha * x[5 * inc_x]; \ + temp6 = alpha * x[6 * inc_x]; \ + temp7 = alpha * x[7 * inc_x]; \ + \ + tp0 = COPY_DOUBLE_TO_VECTOR(temp0); \ + tp1 = COPY_DOUBLE_TO_VECTOR(temp1); \ + tp2 = COPY_DOUBLE_TO_VECTOR(temp2); \ + tp3 = COPY_DOUBLE_TO_VECTOR(temp3); \ + tp4 = COPY_DOUBLE_TO_VECTOR(temp4); \ + tp5 = COPY_DOUBLE_TO_VECTOR(temp5); \ + tp6 = COPY_DOUBLE_TO_VECTOR(temp6); \ + tp7 = COPY_DOUBLE_TO_VECTOR(temp7); \ + +#define DLOAD_X4_SCALE_GP() \ + temp0 = alpha * x[0 * inc_x]; \ + temp1 = alpha * x[1 * inc_x]; \ + temp2 = alpha * x[2 * inc_x]; \ + temp3 = alpha * x[3 * inc_x]; \ + \ + tp0 = COPY_DOUBLE_TO_VECTOR(temp0); \ + tp1 = COPY_DOUBLE_TO_VECTOR(temp1); \ + tp2 = COPY_DOUBLE_TO_VECTOR(temp2); \ + tp3 = COPY_DOUBLE_TO_VECTOR(temp3); \ + +#define DLOAD_X8_SCALE_VECTOR() \ + LD_DP4(x, 2, x0, x1, x2, x3); \ + \ + x0 = x0 * v_alpha; \ + x1 = x1 * v_alpha; \ + x2 = x2 * v_alpha; \ + x3 = x3 * v_alpha; \ + \ + SPLATI_D2_DP(x0, tp0, tp1); \ + SPLATI_D2_DP(x1, tp2, tp3); \ + SPLATI_D2_DP(x2, tp4, tp5); \ + SPLATI_D2_DP(x3, tp6, tp7); \ + +#define DLOAD_X4_SCALE_VECTOR() \ + LD_DP2(x, 2, x0, x1); \ + \ + x0 = x0 * v_alpha; \ + x1 = x1 * v_alpha; \ + \ + SPLATI_D2_DP(x0, tp0, tp1); \ + SPLATI_D2_DP(x1, tp2, tp3); \ + +#define DLOAD_Y8_GP() \ + y0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 0 * inc_y))); \ + y0 = (v2f64) __msa_insert_d((v2i64) y0, 1, *((long long *)(y + 1 * inc_y))); \ + y1 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 2 * inc_y))); \ + y1 = (v2f64) __msa_insert_d((v2i64) y1, 1, *((long long *)(y + 3 * inc_y))); \ + y2 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 4 * inc_y))); \ + y2 = (v2f64) __msa_insert_d((v2i64) y2, 1, *((long long *)(y + 5 * inc_y))); \ + y3 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 6 * inc_y))); \ + y3 = (v2f64) __msa_insert_d((v2i64) y3, 1, *((long long *)(y + 7 * inc_y))); \ + +#define DLOAD_Y4_GP() \ + y0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 0 * inc_y))); \ + y0 = (v2f64) __msa_insert_d((v2i64) y0, 1, *((long long *)(y + 1 * inc_y))); \ + y1 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 2 * inc_y))); \ + y1 = (v2f64) __msa_insert_d((v2i64) y1, 1, *((long long *)(y + 3 * inc_y))); \ + +#define DLOAD_Y8_VECTOR() LD_DP4(y, 2, y0, y1, y2, y3); +#define DLOAD_Y4_VECTOR() LD_DP2(y, 2, y0, y1); + +#define DSTORE_Y8_GP() \ + *((long long *)(y + 0 * inc_y)) = __msa_copy_s_d((v2i64) y0, 0); \ + *((long long *)(y + 1 * inc_y)) = __msa_copy_s_d((v2i64) y0, 1); \ + *((long long *)(y + 2 * inc_y)) = __msa_copy_s_d((v2i64) y1, 0); \ + *((long long *)(y + 3 * inc_y)) = __msa_copy_s_d((v2i64) y1, 1); \ + *((long long *)(y + 4 * inc_y)) = __msa_copy_s_d((v2i64) y2, 0); \ + *((long long *)(y + 5 * inc_y)) = __msa_copy_s_d((v2i64) y2, 1); \ + *((long long *)(y + 6 * inc_y)) = __msa_copy_s_d((v2i64) y3, 0); \ + *((long long *)(y + 7 * inc_y)) = __msa_copy_s_d((v2i64) y3, 1); \ + +#define DSTORE_Y4_GP() \ + *((long long *)(y + 0 * inc_y)) = __msa_copy_s_d((v2i64) y0, 0); \ + *((long long *)(y + 1 * inc_y)) = __msa_copy_s_d((v2i64) y0, 1); \ + *((long long *)(y + 2 * inc_y)) = 
__msa_copy_s_d((v2i64) y1, 0); \ + *((long long *)(y + 3 * inc_y)) = __msa_copy_s_d((v2i64) y1, 1); \ + +#define DSTORE_Y8_VECTOR() ST_DP4(y0, y1, y2, y3, y, 2); +#define DSTORE_Y4_VECTOR() ST_DP2(y0, y1, y, 2); + +#define DGEMV_N_MSA() \ + for (j = (n >> 3); j--;) \ + { \ + DLOAD_X8_SCALE(); \ + \ + k = 0; \ + y = y_org; \ + \ + for (i = (m >> 3); i--;) \ + { \ + DLOAD_Y8(); \ + DGEMV_N_8x8(); \ + DSTORE_Y8(); \ + \ + y += 8 * inc_y; \ + k += 8; \ + } \ + \ + if (m & 4) \ + { \ + DLOAD_Y4(); \ + DGEMV_N_4x8(); \ + DSTORE_Y4(); \ + \ + y += 4 * inc_y; \ + k += 4; \ + } \ + \ + if (m & 3) \ + { \ + temp0 = alpha * x[0 * inc_x]; \ + temp1 = alpha * x[1 * inc_x]; \ + temp2 = alpha * x[2 * inc_x]; \ + temp3 = alpha * x[3 * inc_x]; \ + temp4 = alpha * x[4 * inc_x]; \ + temp5 = alpha * x[5 * inc_x]; \ + temp6 = alpha * x[6 * inc_x]; \ + temp7 = alpha * x[7 * inc_x]; \ + \ + for (i = (m & 3); i--;) \ + { \ + temp = y[0]; \ + temp += temp0 * pa0[k]; \ + temp += temp1 * pa1[k]; \ + temp += temp2 * pa2[k]; \ + temp += temp3 * pa3[k]; \ + temp += temp4 * pa4[k]; \ + temp += temp5 * pa5[k]; \ + temp += temp6 * pa6[k]; \ + temp += temp7 * pa7[k]; \ + y[0] = temp; \ + \ + y += inc_y; \ + k++; \ + } \ + } \ + pa0 += 8 * lda; \ + pa1 += 8 * lda; \ + pa2 += 8 * lda; \ + pa3 += 8 * lda; \ + pa4 += 8 * lda; \ + pa5 += 8 * lda; \ + pa6 += 8 * lda; \ + pa7 += 8 * lda; \ + \ + x += 8 * inc_x; \ + } \ + \ + if (n & 4) \ + { \ + DLOAD_X4_SCALE(); \ + \ + k = 0; \ + y = y_org; \ + \ + for (i = (m >> 3); i--;) \ + { \ + DLOAD_Y8(); \ + DGEMV_N_8x4(); \ + DSTORE_Y8(); \ + \ + y += 8 * inc_y; \ + k += 8; \ + } \ + \ + if (m & 4) \ + { \ + DLOAD_Y4(); \ + DGEMV_N_4x4(); \ + DSTORE_Y4(); \ + \ + y += 4 * inc_y; \ + k += 4; \ + } \ + \ + if (m & 3) \ + { \ + temp0 = alpha * x[0 * inc_x]; \ + temp1 = alpha * x[1 * inc_x]; \ + temp2 = alpha * x[2 * inc_x]; \ + temp3 = alpha * x[3 * inc_x]; \ + \ + for (i = (m & 3); i--;) \ + { \ + temp = y[0]; \ + temp += temp0 * pa0[k]; \ + temp += temp1 * pa1[k]; \ + temp += temp2 * pa2[k]; \ + temp += temp3 * pa3[k]; \ + y[0] = temp; \ + \ + y += inc_y; \ + k++; \ + } \ + } \ + \ + pa0 += 4 * lda; \ + pa1 += 4 * lda; \ + pa2 += 4 * lda; \ + pa3 += 4 * lda; \ + \ + x += 4 * inc_x; \ + } \ + \ + if (n & 2) \ + { \ + temp0 = alpha * x[0 * inc_x]; \ + temp1 = alpha * x[1 * inc_x]; \ + \ + tp0 = COPY_DOUBLE_TO_VECTOR(temp0); \ + tp1 = COPY_DOUBLE_TO_VECTOR(temp1); \ + \ + k = 0; \ + y = y_org; \ + \ + for (i = (m >> 3); i--;) \ + { \ + DLOAD_Y8(); \ + DGEMV_N_8x2(); \ + DSTORE_Y8(); \ + \ + y += 8 * inc_y; \ + k += 8; \ + } \ + \ + if (m & 4) \ + { \ + DLOAD_Y4(); \ + DGEMV_N_4x2(); \ + DSTORE_Y4(); \ + \ + y += 4 * inc_y; \ + k += 4; \ + } \ + \ + if (m & 3) \ + { \ + temp0 = alpha * x[0 * inc_x]; \ + temp1 = alpha * x[1 * inc_x]; \ + \ + for (i = (m & 3); i--;) \ + { \ + temp = y[0]; \ + temp += temp0 * pa0[k]; \ + temp += temp1 * pa1[k]; \ + y[0] = temp; \ + \ + y += inc_y; \ + k++; \ + } \ + } \ + \ + pa0 += 2 * lda; \ + pa1 += 2 * lda; \ + \ + x += 2 * inc_x; \ + } \ + \ + if (n & 1) \ + { \ + temp = alpha * x[0]; \ + \ + k = 0; \ + y = y_org; \ + \ + for (i = m; i--;) \ + { \ + y[0] += temp * pa0[k]; \ + y += inc_y; \ + k++; \ + } \ + } \ + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A, + BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT *buffer) +{ + BLASLONG i, j, k; + FLOAT *y_org = y; + FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7; + FLOAT temp, temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; + v2f64 v_alpha; + 
v2f64 x0, x1, x2, x3, y0, y1, y2, y3; + v2f64 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15; + v2f64 t16, t17, t18, t19, t20, t21, t22, t23, t24, t25, t26, t27, t28, t29; + v2f64 t30, t31, tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7; + + v_alpha = COPY_DOUBLE_TO_VECTOR(alpha); + + pa0 = A; + pa1 = A + lda; + pa2 = A + 2 * lda; + pa3 = A + 3 * lda; + pa4 = A + 4 * lda; + pa5 = A + 5 * lda; + pa6 = A + 6 * lda; + pa7 = A + 7 * lda; + + if ((1 == inc_x) && (1 == inc_y)) + { + #define DLOAD_X8_SCALE DLOAD_X8_SCALE_VECTOR + #define DLOAD_X4_SCALE DLOAD_X4_SCALE_VECTOR + #define DLOAD_Y8 DLOAD_Y8_VECTOR + #define DLOAD_Y4 DLOAD_Y4_VECTOR + #define DSTORE_Y8 DSTORE_Y8_VECTOR + #define DSTORE_Y4 DSTORE_Y4_VECTOR + + DGEMV_N_MSA(); + + #undef DLOAD_X8_SCALE + #undef DLOAD_X4_SCALE + #undef DLOAD_Y8 + #undef DLOAD_Y4 + #undef DSTORE_Y8 + #undef DSTORE_Y4 + } + else if (1 == inc_y) + { + #define DLOAD_X8_SCALE DLOAD_X8_SCALE_GP + #define DLOAD_X4_SCALE DLOAD_X4_SCALE_GP + #define DLOAD_Y8 DLOAD_Y8_VECTOR + #define DLOAD_Y4 DLOAD_Y4_VECTOR + #define DSTORE_Y8 DSTORE_Y8_VECTOR + #define DSTORE_Y4 DSTORE_Y4_VECTOR + + DGEMV_N_MSA(); + + #undef DLOAD_X8_SCALE + #undef DLOAD_X4_SCALE + #undef DLOAD_Y8 + #undef DLOAD_Y4 + #undef DSTORE_Y8 + #undef DSTORE_Y4 + } + else if (1 == inc_x) + { + #define DLOAD_X8_SCALE DLOAD_X8_SCALE_VECTOR + #define DLOAD_X4_SCALE DLOAD_X4_SCALE_VECTOR + #define DLOAD_Y8 DLOAD_Y8_GP + #define DLOAD_Y4 DLOAD_Y4_GP + #define DSTORE_Y8 DSTORE_Y8_GP + #define DSTORE_Y4 DSTORE_Y4_GP + + DGEMV_N_MSA(); + + #undef DLOAD_X8_SCALE + #undef DLOAD_X4_SCALE + #undef DLOAD_Y8 + #undef DLOAD_Y4 + #undef DSTORE_Y8 + #undef DSTORE_Y4 + } + else + { + #define DLOAD_X8_SCALE DLOAD_X8_SCALE_GP + #define DLOAD_X4_SCALE DLOAD_X4_SCALE_GP + #define DLOAD_Y8 DLOAD_Y8_GP + #define DLOAD_Y4 DLOAD_Y4_GP + #define DSTORE_Y8 DSTORE_Y8_GP + #define DSTORE_Y4 DSTORE_Y4_GP + + DGEMV_N_MSA(); + + #undef DLOAD_X8_SCALE + #undef DLOAD_X4_SCALE + #undef DLOAD_Y8 + #undef DLOAD_Y4 + #undef DSTORE_Y8 + #undef DSTORE_Y4 + } + + return(0); +} diff --git a/kernel/mips/dgemv_t_msa.c b/kernel/mips/dgemv_t_msa.c new file mode 100644 index 0000000000..f74cb2e668 --- /dev/null +++ b/kernel/mips/dgemv_t_msa.c @@ -0,0 +1,589 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +#define DGEMV_T_8x8() \ +{ \ + LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \ + LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \ + LD_DP4(pa2 + k, 2, t8, t9, t10, t11); \ + LD_DP4(pa3 + k, 2, t12, t13, t14, t15); \ + LD_DP4(pa4 + k, 2, t16, t17, t18, t19); \ + LD_DP4(pa5 + k, 2, t20, t21, t22, t23); \ + LD_DP4(pa6 + k, 2, t24, t25, t26, t27); \ + LD_DP4(pa7 + k, 2, t28, t29, t30, t31); \ + \ + tp0 += x0 * t0; \ + tp0 += x1 * t1; \ + tp0 += x2 * t2; \ + tp0 += x3 * t3; \ + \ + tp1 += x0 * t4; \ + tp1 += x1 * t5; \ + tp1 += x2 * t6; \ + tp1 += x3 * t7; \ + \ + tp2 += x0 * t8; \ + tp2 += x1 * t9; \ + tp2 += x2 * t10; \ + tp2 += x3 * t11; \ + \ + tp3 += x0 * t12; \ + tp3 += x1 * t13; \ + tp3 += x2 * t14; \ + tp3 += x3 * t15; \ + \ + tp4 += x0 * t16; \ + tp4 += x1 * t17; \ + tp4 += x2 * t18; \ + tp4 += x3 * t19; \ + \ + tp5 += x0 * t20; \ + tp5 += x1 * t21; \ + tp5 += x2 * t22; \ + tp5 += x3 * t23; \ + \ + tp6 += x0 * t24; \ + tp6 += x1 * t25; \ + tp6 += x2 * t26; \ + tp6 += x3 * t27; \ + \ + tp7 += x0 * t28; \ + tp7 += x1 * t29; \ + tp7 += x2 * t30; \ + tp7 += x3 * t31; \ +} + +#define DGEMV_T_8x4() \ +{ \ + LD_DP2(pa0 + k, 2, t0, t1); \ + LD_DP2(pa1 + k, 2, t4, t5); \ + LD_DP2(pa2 + k, 2, t8, t9); \ + LD_DP2(pa3 + k, 2, t12, t13); \ + LD_DP2(pa4 + k, 2, t16, t17); \ + LD_DP2(pa5 + k, 2, t20, t21); \ + LD_DP2(pa6 + k, 2, t24, t25); \ + LD_DP2(pa7 + k, 2, t28, t29); \ + \ + tp0 += x0 * t0; \ + tp0 += x1 * t1; \ + \ + tp1 += x0 * t4; \ + tp1 += x1 * t5; \ + \ + tp2 += x0 * t8; \ + tp2 += x1 * t9; \ + \ + tp3 += x0 * t12; \ + tp3 += x1 * t13; \ + \ + tp4 += x0 * t16; \ + tp4 += x1 * t17; \ + \ + tp5 += x0 * t20; \ + tp5 += x1 * t21; \ + \ + tp6 += x0 * t24; \ + tp6 += x1 * t25; \ + \ + tp7 += x0 * t28; \ + tp7 += x1 * t29; \ +} + +#define DGEMV_T_8x2() \ +{ \ + t0 = LD_DP(pa0 + k); \ + t4 = LD_DP(pa1 + k); \ + t8 = LD_DP(pa2 + k); \ + t12 = LD_DP(pa3 + k); \ + t16 = LD_DP(pa4 + k); \ + t20 = LD_DP(pa5 + k); \ + t24 = LD_DP(pa6 + k); \ + t28 = LD_DP(pa7 + k); \ + \ + tp0 += x0 * t0; \ + tp1 += x0 * t4; \ + tp2 += x0 * t8; \ + tp3 += x0 * t12; \ + tp4 += x0 * t16; \ + tp5 += x0 * t20; \ + tp6 += x0 * t24; \ + tp7 += x0 * t28; \ +} + +#define DGEMV_T_4x8() \ +{ \ + LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \ + LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \ + LD_DP4(pa2 + k, 2, t8, t9, t10, t11); \ + LD_DP4(pa3 + k, 2, t12, t13, t14, t15); \ + \ + tp0 += x0 * t0; \ + tp0 += x1 * t1; \ + tp0 += x2 * t2; \ + tp0 += x3 * t3; \ + \ + tp1 += x0 * t4; \ + tp1 += x1 * t5; \ + tp1 += x2 * t6; \ + tp1 += x3 * t7; \ + \ + tp2 += x0 * t8; \ + tp2 += x1 * t9; \ + tp2 += x2 * t10; \ + tp2 += x3 * t11; \ + \ + tp3 += x0 * t12; \ + tp3 += x1 * t13; \ + tp3 += x2 * t14; \ + tp3 += x3 * t15; \ +} + +#define DGEMV_T_4x4() \ +{ \ + LD_DP2(pa0 + k, 2, t0, t1); \ + LD_DP2(pa1 + k, 2, t4, t5); \ + LD_DP2(pa2 + k, 2, t8, t9); \ + LD_DP2(pa3 + k, 2, t12, t13); \ + \ + tp0 += x0 * t0; \ + tp0 += x1 * t1; \ + \ + tp1 
+= x0 * t4; \ + tp1 += x1 * t5; \ + \ + tp2 += x0 * t8; \ + tp2 += x1 * t9; \ + \ + tp3 += x0 * t12; \ + tp3 += x1 * t13; \ +} + +#define DGEMV_T_4x2() \ +{ \ + t0 = LD_DP(pa0 + k); \ + t4 = LD_DP(pa1 + k); \ + t8 = LD_DP(pa2 + k); \ + t12 = LD_DP(pa3 + k); \ + \ + tp0 += x0 * t0; \ + tp1 += x0 * t4; \ + tp2 += x0 * t8; \ + tp3 += x0 * t12; \ +} + +#define DGEMV_T_2x8() \ +{ \ + LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \ + LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \ + \ + tp0 += x0 * t0; \ + tp0 += x1 * t1; \ + tp0 += x2 * t2; \ + tp0 += x3 * t3; \ + \ + tp1 += x0 * t4; \ + tp1 += x1 * t5; \ + tp1 += x2 * t6; \ + tp1 += x3 * t7; \ +} + +#define DGEMV_T_2x4() \ +{ \ + LD_DP2(pa0 + k, 2, t0, t1); \ + LD_DP2(pa1 + k, 2, t4, t5); \ + \ + tp0 += x0 * t0; \ + tp0 += x1 * t1; \ + \ + tp1 += x0 * t4; \ + tp1 += x1 * t5; \ +} + +#define DGEMV_T_2x2() \ +{ \ + t0 = LD_DP(pa0 + k); \ + t4 = LD_DP(pa1 + k); \ + \ + tp0 += x0 * t0; \ + tp1 += x0 * t4; \ +} + +#define DLOAD_X8_GP() \ + x0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 0 * inc_x))); \ + x0 = (v2f64) __msa_insert_d((v2i64) x0, 1, *((long long *)(x + 1 * inc_x))); \ + x1 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 2 * inc_x))); \ + x1 = (v2f64) __msa_insert_d((v2i64) x1, 1, *((long long *)(x + 3 * inc_x))); \ + x2 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 4 * inc_x))); \ + x2 = (v2f64) __msa_insert_d((v2i64) x2, 1, *((long long *)(x + 5 * inc_x))); \ + x3 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 6 * inc_x))); \ + x3 = (v2f64) __msa_insert_d((v2i64) x3, 1, *((long long *)(x + 7 * inc_x))); \ + +#define DLOAD_X4_GP() \ + x0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 0 * inc_x))); \ + x0 = (v2f64) __msa_insert_d((v2i64) x0, 1, *((long long *)(x + 1 * inc_x))); \ + x1 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 2 * inc_x))); \ + x1 = (v2f64) __msa_insert_d((v2i64) x1, 1, *((long long *)(x + 3 * inc_x))); \ + +#define DLOAD_X2_GP() \ + x0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 0 * inc_x))); \ + x0 = (v2f64) __msa_insert_d((v2i64) x0, 1, *((long long *)(x + 1 * inc_x))); \ + +#define DLOAD_X8_VECTOR() LD_DP4(x, 2, x0, x1, x2, x3); +#define DLOAD_X4_VECTOR() LD_DP2(x, 2, x0, x1); +#define DLOAD_X2_VECTOR() x0 = LD_DP(x); + +#define DGEMV_T_MSA() \ + for (j = (n >> 3); j--;) \ + { \ + tp0 = zero; \ + tp1 = zero; \ + tp2 = zero; \ + tp3 = zero; \ + tp4 = zero; \ + tp5 = zero; \ + tp6 = zero; \ + tp7 = zero; \ + \ + k = 0; \ + x = srcx_org; \ + \ + for (i = (m >> 3); i--;) \ + { \ + DLOAD_X8(); \ + DGEMV_T_8x8(); \ + \ + x += 8 * inc_x; \ + k += 8; \ + } \ + \ + if (m & 4) \ + { \ + DLOAD_X4(); \ + DGEMV_T_8x4(); \ + \ + x += 4 * inc_x; \ + k += 4; \ + } \ + \ + if (m & 2) \ + { \ + DLOAD_X2(); \ + DGEMV_T_8x2(); \ + \ + x += 2 * inc_x; \ + k += 2; \ + } \ + \ + ILVRL_D2_DP(tp1, tp0, t0, t4); \ + ILVRL_D2_DP(tp3, tp2, t1, t5); \ + ILVRL_D2_DP(tp5, tp4, t2, t6); \ + ILVRL_D2_DP(tp7, tp6, t3, t7); \ + ADD2(t0, t4, t1, t5, t0, t1); \ + ADD2(t2, t6, t3, t7, t2, t3); \ + \ + temp0 = t0[0]; \ + temp1 = t0[1]; \ + temp2 = t1[0]; \ + temp3 = t1[1]; \ + temp4 = t2[0]; \ + temp5 = t2[1]; \ + temp6 = t3[0]; \ + temp7 = t3[1]; \ + \ + if (m & 1) \ + { \ + temp0 += pa0[k] * x[0]; \ + temp1 += pa1[k] * x[0]; \ + temp2 += pa2[k] * x[0]; \ + temp3 += pa3[k] * x[0]; \ + temp4 += pa4[k] * x[0]; \ + temp5 += pa5[k] * x[0]; \ + temp6 += pa6[k] * x[0]; \ + temp7 += pa7[k] * x[0]; \ + \ + x += inc_x; \ + k++; \ + } \ + \ + res0 = y[0 * inc_y]; \ + res1 = 
y[1 * inc_y]; \ + res2 = y[2 * inc_y]; \ + res3 = y[3 * inc_y]; \ + res4 = y[4 * inc_y]; \ + res5 = y[5 * inc_y]; \ + res6 = y[6 * inc_y]; \ + res7 = y[7 * inc_y]; \ + \ + res0 += alpha * temp0; \ + res1 += alpha * temp1; \ + res2 += alpha * temp2; \ + res3 += alpha * temp3; \ + res4 += alpha * temp4; \ + res5 += alpha * temp5; \ + res6 += alpha * temp6; \ + res7 += alpha * temp7; \ + \ + y[0 * inc_y] = res0; \ + y[1 * inc_y] = res1; \ + y[2 * inc_y] = res2; \ + y[3 * inc_y] = res3; \ + y[4 * inc_y] = res4; \ + y[5 * inc_y] = res5; \ + y[6 * inc_y] = res6; \ + y[7 * inc_y] = res7; \ + \ + y += 8 * inc_y; \ + \ + pa0 += 8 * lda; \ + pa1 += 8 * lda; \ + pa2 += 8 * lda; \ + pa3 += 8 * lda; \ + pa4 += 8 * lda; \ + pa5 += 8 * lda; \ + pa6 += 8 * lda; \ + pa7 += 8 * lda; \ + } \ + \ + if (n & 4) \ + { \ + tp0 = zero; \ + tp1 = zero; \ + tp2 = zero; \ + tp3 = zero; \ + \ + k = 0; \ + x = srcx_org; \ + \ + for (i = (m >> 3); i--;) \ + { \ + DLOAD_X8(); \ + DGEMV_T_4x8(); \ + \ + x += 8 * inc_x; \ + k += 8; \ + } \ + \ + if (m & 4) \ + { \ + DLOAD_X4(); \ + DGEMV_T_4x4(); \ + \ + x += 4 * inc_x; \ + k += 4; \ + } \ + \ + if (m & 2) \ + { \ + DLOAD_X2(); \ + DGEMV_T_4x2(); \ + \ + x += 2 * inc_x; \ + k += 2; \ + } \ + \ + ILVRL_D2_DP(tp1, tp0, t0, t4); \ + ILVRL_D2_DP(tp3, tp2, t1, t5); \ + ADD2(t0, t4, t1, t5, t0, t1); \ + \ + temp0 = t0[0]; \ + temp1 = t0[1]; \ + temp2 = t1[0]; \ + temp3 = t1[1]; \ + \ + if (m & 1) \ + { \ + temp0 += pa0[k] * x[0]; \ + temp1 += pa1[k] * x[0]; \ + temp2 += pa2[k] * x[0]; \ + temp3 += pa3[k] * x[0]; \ + \ + x += inc_x; \ + k++; \ + } \ + \ + res0 = y[0 * inc_y]; \ + res1 = y[1 * inc_y]; \ + res2 = y[2 * inc_y]; \ + res3 = y[3 * inc_y]; \ + \ + res0 += alpha * temp0; \ + res1 += alpha * temp1; \ + res2 += alpha * temp2; \ + res3 += alpha * temp3; \ + \ + y[0 * inc_y] = res0; \ + y[1 * inc_y] = res1; \ + y[2 * inc_y] = res2; \ + y[3 * inc_y] = res3; \ + \ + y += 4 * inc_y; \ + \ + pa0 += 4 * lda; \ + pa1 += 4 * lda; \ + pa2 += 4 * lda; \ + pa3 += 4 * lda; \ + } \ + \ + if (n & 2) \ + { \ + tp0 = zero; \ + tp1 = zero; \ + \ + k = 0; \ + x = srcx_org; \ + \ + for (i = (m >> 3); i--;) \ + { \ + DLOAD_X8(); \ + DGEMV_T_2x8(); \ + \ + x += 8 * inc_x; \ + k += 8; \ + } \ + \ + if (m & 4) \ + { \ + DLOAD_X4(); \ + DGEMV_T_2x4(); \ + \ + x += 4 * inc_x; \ + k += 4; \ + } \ + \ + if (m & 2) \ + { \ + DLOAD_X2(); \ + DGEMV_T_2x2(); \ + \ + x += 2 * inc_x; \ + k += 2; \ + } \ + \ + ILVRL_D2_DP(tp1, tp0, t0, t4); \ + \ + t0 += t4; \ + \ + temp0 = t0[0]; \ + temp1 = t0[1]; \ + \ + if (m & 1) \ + { \ + temp0 += pa0[k] * x[0]; \ + temp1 += pa1[k] * x[0]; \ + x += inc_x; \ + k++; \ + } \ + \ + res0 = y[0 * inc_y]; \ + res1 = y[1 * inc_y]; \ + \ + res0 += alpha * temp0; \ + res1 += alpha * temp1; \ + \ + y[0 * inc_y] = res0; \ + y[1 * inc_y] = res1; \ + \ + y += 2 * inc_y; \ + \ + pa0 += 2 * lda; \ + pa1 += 2 * lda; \ + } \ + \ + if (n & 1) \ + { \ + temp0 = 0.0; \ + \ + k = 0; \ + x = srcx_org; \ + \ + for (i = m; i--;) \ + { \ + temp0 += pa0[k] * x[0]; \ + x += inc_x; \ + k++; \ + } \ + \ + y[0] += alpha * temp0; \ + y += inc_y; \ + pa0 += lda; \ + } + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A, + BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT *buffer) +{ + BLASLONG i, j, k; + FLOAT *srcx_org = x; + FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7; + FLOAT temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; + FLOAT res0, res1, res2, res3, res4, res5, res6, res7; + v2f64 x0, x1, x2, x3; + v2f64 t0, t1, t2, t3, t4, 
t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15; + v2f64 t16, t17, t18, t19, t20, t21, t22, t23, t24, t25, t26, t27, t28, t29; + v2f64 t30, t31, tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7; + v2f64 zero = {0}; + + pa0 = A + 0 * lda; + pa1 = A + 1 * lda; + pa2 = A + 2 * lda; + pa3 = A + 3 * lda; + pa4 = A + 4 * lda; + pa5 = A + 5 * lda; + pa6 = A + 6 * lda; + pa7 = A + 7 * lda; + + if (1 == inc_x) + { + #define DLOAD_X8 DLOAD_X8_VECTOR + #define DLOAD_X4 DLOAD_X4_VECTOR + #define DLOAD_X2 DLOAD_X2_VECTOR + + DGEMV_T_MSA(); + + #undef DLOAD_X8 + #undef DLOAD_X4 + #undef DLOAD_X2 + } + else + { + #define DLOAD_X8 DLOAD_X8_GP + #define DLOAD_X4 DLOAD_X4_GP + #define DLOAD_X2 DLOAD_X2_GP + + DGEMV_T_MSA(); + + #undef DLOAD_X8 + #undef DLOAD_X4 + #undef DLOAD_X2 + } + + return(0); +} diff --git a/kernel/mips/dot.c b/kernel/mips/dot.c new file mode 100644 index 0000000000..de7f7167f8 --- /dev/null +++ b/kernel/mips/dot.c @@ -0,0 +1,55 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if defined(DSDOT) +double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +#else +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +#endif +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + double dot = 0.0 ; + + if ( n < 0 ) return(dot); + + while(i < n) + { + + dot += y[iy] * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(dot); + +} + + diff --git a/kernel/mips/dtrsm_kernel_LN_8x4_msa.c b/kernel/mips/dtrsm_kernel_LN_8x4_msa.c new file mode 100644 index 0000000000..dc21dab456 --- /dev/null +++ b/kernel/mips/dtrsm_kernel_LN_8x4_msa.c @@ -0,0 +1,1349 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +static void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v2f64 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; + v2f64 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15; + v2f64 res_c8, res_c9, res_c10, res_c11, res_c12, res_c13, res_c14, res_c15; + v2f64 src_a0, src_a1, src_a2, src_a3, src_a8, src_a9, src_a16, src_a17; + v2f64 src_a18, src_a24, src_a25, src_a26, src_a27, src_a32, src_a33; + v2f64 src_a34, src_a35, src_a36, src_a40, src_a41, src_a42, src_a43; + v2f64 src_a44, src_a45, src_a48, src_a49, src_a50, src_a51, src_a52; + v2f64 src_a53, src_a54, src_a56, src_a57, src_a58, src_a59, src_a60; + v2f64 src_a61, src_a62, src_a63; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * ldc; + + LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3); + LD_DP4(c_nxt1line, 2, src_c4, src_c5, src_c6, src_c7); + LD_DP4(c_nxt2line, 2, src_c8, src_c9, src_c10, src_c11); + LD_DP4(c_nxt3line, 2, src_c12, src_c13, src_c14, src_c15); + + if (bk > 0) + { + BLASLONG i; + FLOAT *pba = a, *pbb = b; + v2f64 src_b, src_b0, src_b1, src_b2, src_b3; + + LD_DP4(pba, 2, src_a0, src_a1, src_a2, src_a3); + LD_DP2(pbb, 2, src_b0, src_b1); + + for (i = (bk - 1); i--;) + { + pba += 8; + pbb += 4; + + LD_DP4(pba, 2, src_a8, src_a9, src_a16, src_a17); + LD_DP2(pbb, 2, src_b2, src_b3); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + src_c2 -= src_a2 * src_b; + src_c3 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + src_c6 -= src_a2 * src_b; + src_c7 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + src_c8 -= src_a0 * src_b; + src_c9 -= src_a1 * src_b; + src_c10 -= src_a2 * src_b; + src_c11 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + src_c12 -= src_a0 * src_b; + src_c13 -= src_a1 * src_b; + src_c14 -= 
src_a2 * src_b; + src_c15 -= src_a3 * src_b; + + src_a0 = src_a8; + src_a1 = src_a9; + src_a2 = src_a16; + src_a3 = src_a17; + src_b0 = src_b2; + src_b1 = src_b3; + } + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + src_c2 -= src_a2 * src_b; + src_c3 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + src_c6 -= src_a2 * src_b; + src_c7 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + src_c8 -= src_a0 * src_b; + src_c9 -= src_a1 * src_b; + src_c10 -= src_a2 * src_b; + src_c11 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + src_c12 -= src_a0 * src_b; + src_c13 -= src_a1 * src_b; + src_c14 -= src_a2 * src_b; + src_c15 -= src_a3 * src_b; + } + + a -= 64; + b -= 32; + + ILVRL_D2_DP(src_c4, src_c0, res_c0, res_c1); + ILVRL_D2_DP(src_c5, src_c1, res_c2, res_c3); + ILVRL_D2_DP(src_c6, src_c2, res_c4, res_c5); + ILVRL_D2_DP(src_c7, src_c3, res_c6, res_c7); + ILVRL_D2_DP(src_c12, src_c8, res_c8, res_c9); + ILVRL_D2_DP(src_c13, src_c9, res_c10, res_c11); + ILVRL_D2_DP(src_c14, src_c10, res_c12, res_c13); + ILVRL_D2_DP(src_c15, src_c11, res_c14, res_c15); + + src_a54 = __msa_cast_to_vector_double(*(a + 54)); + src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); + src_a62 = LD_DP(a + 62); + src_a63 = (v2f64) __msa_splati_d((v2i64) src_a62, 1); + src_a62 = (v2f64) __msa_splati_d((v2i64) src_a62, 0); + src_a60 = LD_DP(a + 60); + src_a61 = (v2f64) __msa_splati_d((v2i64) src_a60, 1); + src_a60 = (v2f64) __msa_splati_d((v2i64) src_a60, 0); + src_a52 = LD_DP(a + 52); + src_a53 = (v2f64) __msa_splati_d((v2i64) src_a52, 1); + src_a52 = (v2f64) __msa_splati_d((v2i64) src_a52, 0); + src_a44 = LD_DP(a + 44); + src_a45 = (v2f64) __msa_splati_d((v2i64) src_a44, 1); + src_a44 = (v2f64) __msa_splati_d((v2i64) src_a44, 0); + src_a36 = __msa_cast_to_vector_double(*(a + 36)); + src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0); + + res_c7 *= src_a63; + res_c6 -= res_c7 * src_a62; + res_c6 *= src_a54; + + res_c15 *= src_a63; + res_c14 -= res_c15 * src_a62; + res_c14 *= src_a54; + + ST_DP(res_c7, b + 28); + ST_DP(res_c6, b + 24); + ST_DP(res_c15, b + 30); + ST_DP(res_c14, b + 26); + ILVRL_D2_DP(res_c7, res_c6, src_c3, src_c7); + ILVRL_D2_DP(res_c15, res_c14, src_c11, src_c15); + ST_DP(src_c3, c + 6); + ST_DP(src_c7, c_nxt1line + 6); + ST_DP(src_c11, c_nxt2line + 6); + ST_DP(src_c15, c_nxt3line + 6); + + res_c5 -= res_c7 * src_a61; + res_c5 -= res_c6 * src_a53; + res_c5 *= src_a45; + + res_c4 -= res_c7 * src_a60; + res_c4 -= res_c6 * src_a52; + res_c4 -= res_c5 * src_a44; + res_c4 *= src_a36; + + res_c13 -= res_c15 * src_a61; + res_c13 -= res_c14 * src_a53; + res_c13 *= src_a45; + + res_c12 -= res_c15 * src_a60; + res_c12 -= res_c14 * src_a52; + res_c12 -= res_c13 * src_a44; + res_c12 *= src_a36; + + src_a56 = LD_DP(a + 56); + src_a57 = (v2f64) __msa_splati_d((v2i64) src_a56, 1); + src_a56 = (v2f64) __msa_splati_d((v2i64) src_a56, 0); + src_a58 = LD_DP(a + 58); + src_a59 = (v2f64) __msa_splati_d((v2i64) src_a58, 1); + src_a58 = (v2f64) __msa_splati_d((v2i64) src_a58, 0); + + ST_DP(res_c4, b + 16); + ST_DP(res_c5, b + 20); + ST_DP(res_c12, b + 18); + ST_DP(res_c13, b + 22); + + ILVRL_D2_DP(res_c5, res_c4, src_c2, src_c6); + ILVRL_D2_DP(res_c13, res_c12, src_c10, src_c14); + ST_DP(src_c2, c + 4); + ST_DP(src_c6, c_nxt1line + 4); + ST_DP(src_c10, c_nxt2line + 4); + ST_DP(src_c14, 
c_nxt3line + 4); + + src_a50 = LD_DP(a + 50); + src_a51 = (v2f64) __msa_splati_d((v2i64) src_a50, 1); + src_a50 = (v2f64) __msa_splati_d((v2i64) src_a50, 0); + src_a42 = LD_DP(a + 42); + src_a43 = (v2f64) __msa_splati_d((v2i64) src_a42, 1); + src_a42 = (v2f64) __msa_splati_d((v2i64) src_a42, 0); + src_a34 = LD_DP(a + 34); + src_a35 = (v2f64) __msa_splati_d((v2i64) src_a34, 1); + src_a34 = (v2f64) __msa_splati_d((v2i64) src_a34, 0); + src_a26 = LD_DP(a + 26); + src_a27 = (v2f64) __msa_splati_d((v2i64) src_a26, 1); + src_a26 = (v2f64) __msa_splati_d((v2i64) src_a26, 0); + src_a18 = __msa_cast_to_vector_double(*(a + 18)); + src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0); + + res_c3 -= res_c7 * src_a59; + res_c2 -= res_c7 * src_a58; + res_c1 -= res_c7 * src_a57; + res_c0 -= res_c7 * src_a56; + + res_c11 -= res_c15 * src_a59; + res_c10 -= res_c15 * src_a58; + res_c9 -= res_c15 * src_a57; + res_c8 -= res_c15 * src_a56; + + res_c3 -= res_c6 * src_a51; + res_c3 -= res_c5 * src_a43; + res_c3 -= res_c4 * src_a35; + res_c3 *= src_a27; + + res_c2 -= res_c6 * src_a50; + res_c2 -= res_c5 * src_a42; + res_c2 -= res_c4 * src_a34; + res_c2 -= res_c3 * src_a26; + res_c2 *= src_a18; + + res_c11 -= res_c14 * src_a51; + res_c11 -= res_c13 * src_a43; + res_c11 -= res_c12 * src_a35; + res_c11 *= src_a27; + + res_c10 -= res_c14 * src_a50; + res_c10 -= res_c13 * src_a42; + res_c10 -= res_c12 * src_a34; + res_c10 -= res_c11 * src_a26; + res_c10 *= src_a18; + + src_a48 = LD_DP(a + 48); + src_a49 = (v2f64) __msa_splati_d((v2i64) src_a48, 1); + src_a48 = (v2f64) __msa_splati_d((v2i64) src_a48, 0); + src_a40 = LD_DP(a + 40); + src_a41 = (v2f64) __msa_splati_d((v2i64) src_a40, 1); + src_a40 = (v2f64) __msa_splati_d((v2i64) src_a40, 0); + + ST_DP(res_c2, b + 8); + ST_DP(res_c3, b + 12); + ST_DP(res_c10, b + 10); + ST_DP(res_c11, b + 14); + + src_a32 = LD_DP(a + 32); + src_a33 = (v2f64) __msa_splati_d((v2i64) src_a32, 1); + src_a32 = (v2f64) __msa_splati_d((v2i64) src_a32, 0); + src_a24 = LD_DP(a + 24); + src_a25 = (v2f64) __msa_splati_d((v2i64) src_a24, 1); + src_a24 = (v2f64) __msa_splati_d((v2i64) src_a24, 0); + + ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c5); + ILVRL_D2_DP(res_c11, res_c10, src_c9, src_c13); + ST_DP(src_c1, c + 2); + ST_DP(src_c5, c_nxt1line + 2); + ST_DP(src_c9, c_nxt2line + 2); + ST_DP(src_c13, c_nxt3line + 2); + + res_c1 -= res_c6 * src_a49; + res_c1 -= res_c5 * src_a41; + res_c1 -= res_c4 * src_a33; + res_c1 -= res_c3 * src_a25; + + res_c0 -= res_c6 * src_a48; + res_c0 -= res_c5 * src_a40; + res_c0 -= res_c4 * src_a32; + res_c0 -= res_c3 * src_a24; + + res_c9 -= res_c14 * src_a49; + res_c9 -= res_c13 * src_a41; + res_c9 -= res_c12 * src_a33; + res_c9 -= res_c11 * src_a25; + + res_c8 -= res_c14 * src_a48; + res_c8 -= res_c13 * src_a40; + res_c8 -= res_c12 * src_a32; + res_c8 -= res_c11 * src_a24; + + src_a16 = LD_DP(a + 16); + src_a17 = (v2f64) __msa_splati_d((v2i64) src_a16, 1); + src_a16 = (v2f64) __msa_splati_d((v2i64) src_a16, 0); + src_a8 = LD_DP(a + 8); + src_a9 = (v2f64) __msa_splati_d((v2i64) src_a8, 1); + src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); + src_a0 = __msa_cast_to_vector_double(*(a + 0)); + src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); + + res_c1 -= res_c2 * src_a17; + res_c1 *= src_a9; + + res_c9 -= res_c10 * src_a17; + res_c9 *= src_a9; + + res_c0 -= res_c2 * src_a16; + res_c0 -= res_c1 * src_a8; + res_c0 *= src_a0; + + res_c8 -= res_c10 * src_a16; + res_c8 -= res_c9 * src_a8; + res_c8 *= src_a0; + + ST_DP(res_c0, b + 0); + ST_DP(res_c8, b + 2); + ST_DP(res_c1, 
b + 4); + ST_DP(res_c9, b + 6); + + ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c4); + ILVRL_D2_DP(res_c9, res_c8, src_c8, src_c12); + + ST_DP(src_c0, c); + ST_DP(src_c4, c_nxt1line); + ST_DP(src_c8, c_nxt2line); + ST_DP(src_c12, c_nxt3line); +} + +static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v2f64 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; + v2f64 src_a0, src_a1, src_a2, src_a3, src_a8, src_a9, src_a16, src_a17; + v2f64 src_a18, src_a24, src_a25, src_a26, src_a27, src_a32, src_a33; + v2f64 src_a34, src_a35, src_a36, src_a40, src_a41, src_a42, src_a43; + v2f64 src_a44, src_a45, src_a48, src_a49, src_a50, src_a51, src_a52; + v2f64 src_a53, src_a54, src_a56, src_a57, src_a58, src_a59, src_a60; + v2f64 src_a61, src_a62, src_a63; + + LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3); + LD_DP4(c + ldc, 2, src_c4, src_c5, src_c6, src_c7); + + if (bk > 0) + { + BLASLONG i; + FLOAT *pba = a, *pbb = b; + v2f64 src_b, src_b0, src_b1; + + LD_DP4(pba, 2, src_a0, src_a1, src_a2, src_a3); + src_b0 = LD_DP(pbb); + + for (i = bk - 1; i--;) + { + pba += 8; + pbb += 2; + + LD_DP4(pba, 2, src_a8, src_a9, src_a16, src_a17); + src_b1 = LD_DP(pbb); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + src_c2 -= src_a2 * src_b; + src_c3 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + src_c6 -= src_a2 * src_b; + src_c7 -= src_a3 * src_b; + + src_a0 = src_a8; + src_a1 = src_a9; + src_a2 = src_a16; + src_a3 = src_a17; + src_b0 = src_b1; + } + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + src_c2 -= src_a2 * src_b; + src_c3 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + src_c6 -= src_a2 * src_b; + src_c7 -= src_a3 * src_b; + } + + ILVRL_D2_DP(src_c4, src_c0, res_c0, res_c1); + ILVRL_D2_DP(src_c5, src_c1, res_c2, res_c3); + ILVRL_D2_DP(src_c6, src_c2, res_c4, res_c5); + ILVRL_D2_DP(src_c7, src_c3, res_c6, res_c7); + + src_a56 = LD_DP(a - 8); + src_a57 = (v2f64) __msa_splati_d((v2i64) src_a56, 1); + src_a56 = (v2f64) __msa_splati_d((v2i64) src_a56, 0); + src_a58 = LD_DP(a - 6); + src_a59 = (v2f64) __msa_splati_d((v2i64) src_a58, 1); + src_a58 = (v2f64) __msa_splati_d((v2i64) src_a58, 0); + src_a60 = LD_DP(a - 4); + src_a61 = (v2f64) __msa_splati_d((v2i64) src_a60, 1); + src_a60 = (v2f64) __msa_splati_d((v2i64) src_a60, 0); + src_a62 = LD_DP(a - 2); + src_a63 = (v2f64) __msa_splati_d((v2i64) src_a62, 1); + src_a62 = (v2f64) __msa_splati_d((v2i64) src_a62, 0); + + res_c7 *= src_a63; + res_c6 -= res_c7 * src_a62; + res_c5 -= res_c7 * src_a61; + res_c4 -= res_c7 * src_a60; + res_c3 -= res_c7 * src_a59; + res_c2 -= res_c7 * src_a58; + res_c1 -= res_c7 * src_a57; + res_c0 -= res_c7 * src_a56; + + src_a48 = LD_DP(a - 16); + src_a49 = (v2f64) __msa_splati_d((v2i64) src_a48, 1); + src_a48 = (v2f64) __msa_splati_d((v2i64) src_a48, 0); + src_a50 = LD_DP(a - 14); + src_a51 = (v2f64) __msa_splati_d((v2i64) src_a50, 1); + src_a50 = (v2f64) __msa_splati_d((v2i64) src_a50, 0); + src_a52 = LD_DP(a - 12); + src_a53 = (v2f64) __msa_splati_d((v2i64) src_a52, 1); + src_a52 = (v2f64) __msa_splati_d((v2i64) src_a52, 0); + src_a54 = __msa_cast_to_vector_double(*(a - 10)); + src_a54 = 
(v2f64) __msa_splati_d((v2i64) src_a54, 0); + + src_a40 = LD_DP(a - 24); + src_a41 = (v2f64) __msa_splati_d((v2i64) src_a40, 1); + src_a40 = (v2f64) __msa_splati_d((v2i64) src_a40, 0); + src_a42 = LD_DP(a - 22); + src_a43 = (v2f64) __msa_splati_d((v2i64) src_a42, 1); + src_a42 = (v2f64) __msa_splati_d((v2i64) src_a42, 0); + src_a44 = LD_DP(a - 20); + src_a45 = (v2f64) __msa_splati_d((v2i64) src_a44, 1); + src_a44 = (v2f64) __msa_splati_d((v2i64) src_a44, 0); + + res_c6 *= src_a54; + res_c5 -= res_c6 * src_a53; + res_c4 -= res_c6 * src_a52; + res_c3 -= res_c6 * src_a51; + res_c2 -= res_c6 * src_a50; + res_c1 -= res_c6 * src_a49; + res_c0 -= res_c6 * src_a48; + + res_c5 *= src_a45; + res_c4 -= res_c5 * src_a44; + res_c3 -= res_c5 * src_a43; + res_c2 -= res_c5 * src_a42; + res_c1 -= res_c5 * src_a41; + res_c0 -= res_c5 * src_a40; + + ST_DP(res_c7, b - 2); + ST_DP(res_c6, b - 4); + ST_DP(res_c5, b - 6); + + src_a32 = LD_DP(a - 32); + src_a33 = (v2f64) __msa_splati_d((v2i64) src_a32, 1); + src_a32 = (v2f64) __msa_splati_d((v2i64) src_a32, 0); + src_a34 = LD_DP(a - 30); + src_a35 = (v2f64) __msa_splati_d((v2i64) src_a34, 1); + src_a34 = (v2f64) __msa_splati_d((v2i64) src_a34, 0); + src_a36 = __msa_cast_to_vector_double(*(a - 28)); + src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0); + + res_c4 *= src_a36; + res_c3 -= res_c4 * src_a35; + res_c2 -= res_c4 * src_a34; + res_c1 -= res_c4 * src_a33; + res_c0 -= res_c4 * src_a32; + + src_a24 = LD_DP(a - 40); + src_a25 = (v2f64) __msa_splati_d((v2i64) src_a24, 1); + src_a24 = (v2f64) __msa_splati_d((v2i64) src_a24, 0); + src_a26 = LD_DP(a - 38); + src_a27 = (v2f64) __msa_splati_d((v2i64) src_a26, 1); + src_a26 = (v2f64) __msa_splati_d((v2i64) src_a26, 0); + src_a16 = LD_DP(a - 48); + src_a17 = (v2f64) __msa_splati_d((v2i64) src_a16, 1); + src_a16 = (v2f64) __msa_splati_d((v2i64) src_a16, 0); + src_a18 = __msa_cast_to_vector_double(*(a - 46)); + src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0); + src_a0 = __msa_cast_to_vector_double(*(a - 64)); + src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); + src_a8 = LD_DP(a - 56); + src_a9 = (v2f64) __msa_splati_d((v2i64) src_a8, 1); + src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); + + res_c3 *= src_a27; + res_c2 -= res_c3 * src_a26; + res_c1 -= res_c3 * src_a25; + res_c0 -= res_c3 * src_a24; + + res_c2 *= src_a18; + res_c1 -= res_c2 * src_a17; + res_c0 -= res_c2 * src_a16; + + res_c1 *= src_a9; + res_c0 -= res_c1 * src_a8; + + res_c0 *= src_a0; + + ST_DP(res_c4, b - 8); + ST_DP(res_c3, b - 10); + ST_DP(res_c2, b - 12); + ST_DP(res_c1, b - 14); + ST_DP(res_c0, b - 16); + + ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c4); + ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c5); + ILVRL_D2_DP(res_c5, res_c4, src_c2, src_c6); + ILVRL_D2_DP(res_c7, res_c6, src_c3, src_c7); + + ST_DP4(src_c0, src_c1, src_c2, src_c3, c, 2); + ST_DP4(src_c4, src_c5, src_c6, src_c7, c + ldc, 2); +} + +static void dsolve_8x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + FLOAT a0, a8, a9, a16, a17, a18, a24, a25, a26, a27, a32, a33, a34, a35; + FLOAT a36, a40, a41, a42, a43, a44, a45, a48, a49, a50, a51, a52, a53; + FLOAT a54, a56, a57, a58, a59, a60, a61, a62, a63; + FLOAT c0, c1, c2, c3, c4, c5, c6, c7; + + c0 = *(c + 0); + c1 = *(c + 1); + c2 = *(c + 2); + c3 = *(c + 3); + c4 = *(c + 4); + c5 = *(c + 5); + c6 = *(c + 6); + c7 = *(c + 7); + + if (bk > 0) + { + BLASLONG i; + FLOAT *aa = a, *bb = b; + + for (i = bk; i--; ) + { + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; + c2 -= aa[2] * bb[0]; + c3 -= aa[3] * bb[0]; + c4 -= aa[4] 
* bb[0]; + c5 -= aa[5] * bb[0]; + c6 -= aa[6] * bb[0]; + c7 -= aa[7] * bb[0]; + + aa += 8; + bb += 1; + } + } + + a -= 64; + b -= 8; + + a0 = *(a + 0); + a8 = *(a + 8); + a9 = *(a + 9); + a16 = *(a + 16); + a17 = *(a + 17); + a18 = *(a + 18); + a24 = *(a + 24); + a25 = *(a + 25); + a26 = *(a + 26); + a27 = *(a + 27); + a32 = *(a + 32); + a33 = *(a + 33); + a34 = *(a + 34); + a35 = *(a + 35); + a36 = *(a + 36); + a40 = *(a + 40); + a41 = *(a + 41); + a42 = *(a + 42); + a43 = *(a + 43); + a44 = *(a + 44); + a45 = *(a + 45); + a48 = *(a + 48); + a49 = *(a + 49); + a50 = *(a + 50); + a51 = *(a + 51); + a52 = *(a + 52); + a53 = *(a + 53); + a54 = *(a + 54); + a56 = *(a + 56); + a57 = *(a + 57); + a58 = *(a + 58); + a59 = *(a + 59); + a60 = *(a + 60); + a61 = *(a + 61); + a62 = *(a + 62); + a63 = *(a + 63); + + c7 *= a63; + + c6 -= c7 * a62; + c6 *= a54; + + c5 -= c7 * a61; + c5 -= c6 * a53; + c5 *= a45; + + c4 -= c7 * a60; + c4 -= c6 * a52; + c4 -= c5 * a44; + c4 *= a36; + + c3 -= c7 * a59; + c3 -= c6 * a51; + c3 -= c5 * a43; + c3 -= c4 * a35; + c3 *= a27; + + c2 -= c7 * a58; + c2 -= c6 * a50; + c2 -= c5 * a42; + c2 -= c4 * a34; + c2 -= c3 * a26; + c2 *= a18; + + c1 -= c7 * a57; + c1 -= c6 * a49; + c1 -= c5 * a41; + c1 -= c4 * a33; + c1 -= c3 * a25; + c1 -= c2 * a17; + c1 *= a9; + + c0 -= c7 * a56; + c0 -= c6 * a48; + c0 -= c5 * a40; + c0 -= c4 * a32; + c0 -= c3 * a24; + c0 -= c2 * a16; + c0 -= c1 * a8; + c0 *= a0; + + *(b + 7) = c7; + *(b + 6) = c6; + *(b + 5) = c5; + *(b + 4) = c4; + *(b + 3) = c3; + *(b + 2) = c2; + *(b + 1) = c1; + *(b + 0) = c0; + + *(c + 7) = c7; + *(c + 6) = c6; + *(c + 5) = c5; + *(c + 4) = c4; + *(c + 3) = c3; + *(c + 2) = c2; + *(c + 1) = c1; + *(c + 0) = c0; +} + +static void dsolve_4x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v2f64 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; + v2f64 src_a0, src_a4, src_a5, src_a8, src_a9, src_a10, src_a12, src_a13; + v2f64 src_a14, src_a15; + + LD_DP2(c, 2, src_c0, src_c1); + LD_DP2(c + ldc, 2, src_c2, src_c3); + LD_DP2(c + 2 * ldc, 2, src_c4, src_c5); + LD_DP2(c + 3 * ldc, 2, src_c6, src_c7); + + if (bk > 0) + { + BLASLONG i; + FLOAT *aa = a, *bb = b; + v2f64 src_a0, src_a1, src_b, src_b0, src_b1; + + for (i = bk; i--;) + { + LD_DP2(aa, 2, src_a0, src_a1); + LD_DP2(bb, 2, src_b0, src_b1); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c2 -= src_a0 * src_b; + src_c3 -= src_a1 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + src_c6 -= src_a0 * src_b; + src_c7 -= src_a1 * src_b; + + aa += 4; + bb += 4; + } + } + + a -= 16; + b -= 16; + + ILVRL_D2_DP(src_c2, src_c0, res_c0, res_c1); + ILVRL_D2_DP(src_c3, src_c1, res_c2, res_c3); + ILVRL_D2_DP(src_c6, src_c4, res_c4, res_c5); + ILVRL_D2_DP(src_c7, src_c5, res_c6, res_c7); + + src_a14 = LD_DP(a + 14); + src_a15 = (v2f64) __msa_splati_d((v2i64) src_a14, 1); + src_a14 = (v2f64) __msa_splati_d((v2i64) src_a14, 0); + + src_a12 = LD_DP(a + 12); + src_a13 = (v2f64) __msa_splati_d((v2i64) src_a12, 1); + src_a12 = (v2f64) __msa_splati_d((v2i64) src_a12, 0); + + src_a9 = LD_DP(a + 9); + src_a10 = (v2f64) __msa_splati_d((v2i64) src_a9, 1); + src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 
0); + + src_a8 = __msa_cast_to_vector_double(*(a + 8)); + src_a0 = __msa_cast_to_vector_double(*(a + 0)); + + src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); + src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); + + src_a4 = LD_DP(a + 4); + src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1); + src_a4 = (v2f64) __msa_splati_d((v2i64) src_a4, 0); + + res_c3 *= src_a15; + res_c7 *= src_a15; + + res_c2 -= res_c3 * src_a14; + res_c6 -= res_c7 * src_a14; + res_c2 *= src_a10; + res_c6 *= src_a10; + + res_c1 -= res_c3 * src_a13; + res_c5 -= res_c7 * src_a13; + res_c1 -= res_c2 * src_a9; + res_c5 -= res_c6 * src_a9; + res_c1 *= src_a5; + res_c5 *= src_a5; + + res_c0 -= res_c3 * src_a12; + res_c4 -= res_c7 * src_a12; + res_c0 -= res_c2 * src_a8; + res_c4 -= res_c6 * src_a8; + res_c0 -= res_c1 * src_a4; + res_c4 -= res_c5 * src_a4; + res_c0 *= src_a0; + res_c4 *= src_a0; + + ST_DP(res_c7, b + 14); + ST_DP(res_c3, b + 12); + ST_DP(res_c6, b + 10); + ST_DP(res_c2, b + 8); + ST_DP(res_c5, b + 6); + ST_DP(res_c1, b + 4); + ST_DP(res_c4, b + 2); + ST_DP(res_c0, b + 0); + + ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c2); + ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c3); + ILVRL_D2_DP(res_c5, res_c4, src_c4, src_c6); + ILVRL_D2_DP(res_c7, res_c6, src_c5, src_c7); + + ST_DP2(src_c0, src_c1, c, 2); + ST_DP2(src_c2, src_c3, c + ldc, 2); + ST_DP2(src_c4, src_c5, c + 2 * ldc, 2); + ST_DP2(src_c6, src_c7, c + 3 * ldc, 2); +} + +static void dsolve_4x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v2f64 src_c0, src_c1, src_c2, src_c3, res_c0, res_c1, res_c2, res_c3; + v2f64 src_a0, src_a4, src_a5, src_a8, src_a9, src_a10, src_a12, src_a13; + v2f64 src_a14, src_a15; + + LD_DP2(c, 2, src_c0, src_c1); + LD_DP2(c + ldc, 2, src_c2, src_c3); + + if (bk > 0) + { + BLASLONG i; + FLOAT *aa = a, *bb = b; + v2f64 src_a0, src_a1, src_b, src_b0; + + for (i = bk; i--;) + { + LD_DP2(aa, 2, src_a0, src_a1); + src_b0 = LD_DP(bb); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c2 -= src_a0 * src_b; + src_c3 -= src_a1 * src_b; + + aa += 4; + bb += 2; + } + } + + a -= 16; + b -= 8; + + ILVRL_D2_DP(src_c2, src_c0, res_c0, res_c1); + ILVRL_D2_DP(src_c3, src_c1, res_c2, res_c3); + + src_a14 = LD_DP(a + 14); + src_a15 = (v2f64) __msa_splati_d((v2i64) src_a14, 1); + src_a14 = (v2f64) __msa_splati_d((v2i64) src_a14, 0); + + src_a12 = LD_DP(a + 12); + src_a13 = (v2f64) __msa_splati_d((v2i64) src_a12, 1); + src_a12 = (v2f64) __msa_splati_d((v2i64) src_a12, 0); + + src_a9 = LD_DP(a + 9); + src_a10 = (v2f64) __msa_splati_d((v2i64) src_a9, 1); + src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); + + src_a8 = __msa_cast_to_vector_double(*(a + 8)); + src_a0 = __msa_cast_to_vector_double(*(a + 0)); + + src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); + src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); + + src_a4 = LD_DP(a + 4); + src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1); + src_a4 = (v2f64) __msa_splati_d((v2i64) src_a4, 0); + + res_c3 *= src_a15; + + res_c2 -= res_c3 * src_a14; + res_c2 *= src_a10; + + res_c1 -= res_c3 * src_a13; + res_c1 -= res_c2 * src_a9; + res_c1 *= src_a5; + + res_c0 -= res_c3 * src_a12; + res_c0 -= res_c2 * src_a8; + res_c0 -= res_c1 * src_a4; + res_c0 *= src_a0; + + ST_DP(res_c3, b + 6); + ST_DP(res_c2, b + 4); + ST_DP(res_c1, b + 2); + ST_DP(res_c0, b + 0); + + ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c2); + ILVRL_D2_DP(res_c3, res_c2, src_c1, 
src_c3); + + ST_DP2(src_c0, src_c1, c, 2); + ST_DP2(src_c2, src_c3, c + ldc, 2); +} + +static void dsolve_4x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + FLOAT a0, a4, a5, a8, a9, a10, a12, a13, a14, a15, c0, c1, c2, c3; + + c0 = *(c + 0); + c1 = *(c + 1); + c2 = *(c + 2); + c3 = *(c + 3); + + if (bk > 0) + { + BLASLONG i; + FLOAT *aa = a, *bb = b; + + for (i = bk; i--;) + { + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; + c2 -= aa[2] * bb[0]; + c3 -= aa[3] * bb[0]; + + aa += 4; + bb += 1; + } + } + + a -= 16; + b -= 4; + + a0 = *(a + 0); + a4 = *(a + 4); + a5 = *(a + 5); + a8 = *(a + 8); + a9 = *(a + 9); + a10 = *(a + 10); + a12 = *(a + 12); + a13 = *(a + 13); + a14 = *(a + 14); + a15 = *(a + 15); + + c3 *= a15; + + c2 -= c3 * a14; + c2 *= a10; + + c1 -= c3 * a13; + c1 -= c2 * a9; + c1 *= a5; + + c0 -= c3 * a12; + c0 -= c2 * a8; + c0 -= c1 * a4; + c0 *= a0; + + *(b + 0) = c0; + *(b + 1) = c1; + *(b + 2) = c2; + *(b + 3) = c3; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 2) = c2; + *(c + 3) = c3; +} + +static void dsolve_2x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT a0, a2, a3, c0, c1, c0_nxt1, c1_nxt1; + FLOAT c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3; + + c0 = *(c + 0); + c1 = *(c + 1); + c0_nxt1 = *(c + 0 + ldc); + c1_nxt1 = *(c + 1 + ldc); + c0_nxt2 = *(c + 0 + 2 * ldc); + c1_nxt2 = *(c + 1 + 2 * ldc); + c0_nxt3 = *(c + 0 + 3 * ldc); + c1_nxt3 = *(c + 1 + 3 * ldc); + + if (bk > 0) + { + BLASLONG i; + FLOAT *aa = a, *bb = b; + + for (i = bk; i--;) + { + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; + c0_nxt1 -= aa[0] * bb[1]; + c1_nxt1 -= aa[1] * bb[1]; + c0_nxt2 -= aa[0] * bb[2]; + c1_nxt2 -= aa[1] * bb[2]; + c0_nxt3 -= aa[0] * bb[3]; + c1_nxt3 -= aa[1] * bb[3]; + + aa += 2; + bb += 4; + } + } + + a -= 4; + b -= 8; + + a0 = *(a + 0); + a2 = *(a + 2); + a3 = *(a + 3); + + c1 *= a3; + c0 -= c1 * a2; + c0 *= a0; + + c1_nxt1 *= a3; + c0_nxt1 -= c1_nxt1 * a2; + c0_nxt1 *= a0; + + c1_nxt2 *= a3; + c0_nxt2 -= c1_nxt2 * a2; + c0_nxt2 *= a0; + + c1_nxt3 *= a3; + c0_nxt3 -= c1_nxt3 * a2; + c0_nxt3 *= a0; + + *(b + 0) = c0; + *(b + 1) = c0_nxt1; + *(b + 2) = c0_nxt2; + *(b + 3) = c0_nxt3; + *(b + 4) = c1; + *(b + 5) = c1_nxt1; + *(b + 6) = c1_nxt2; + *(b + 7) = c1_nxt3; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 0 + ldc) = c0_nxt1; + *(c + 1 + ldc) = c1_nxt1; + *(c + 0 + 2 * ldc) = c0_nxt2; + *(c + 1 + 2 * ldc) = c1_nxt2; + *(c + 0 + 3 * ldc) = c0_nxt3; + *(c + 1 + 3 * ldc) = c1_nxt3; +} + +static void dsolve_2x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT a0, a2, a3, c0, c1, c0_nxt, c1_nxt; + + c0 = *(c + 0); + c1 = *(c + 1); + + c0_nxt = *(c + 0 + ldc); + c1_nxt = *(c + 1 + ldc); + + if (bk > 0) + { + BLASLONG i; + FLOAT *aa = a, *bb = b; + + for (i = bk; i--;) + { + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; + + c0_nxt -= aa[0] * bb[1]; + c1_nxt -= aa[1] * bb[1]; + + aa += 2; + bb += 2; + } + } + + a -= 4; + b -= 4; + + a0 = *(a + 0); + a2 = *(a + 2); + a3 = *(a + 3); + + c1 *= a3; + + c0 -= c1 * a2; + c0 *= a0; + + c1_nxt *= a3; + + c0_nxt -= c1_nxt * a2; + c0_nxt *= a0; + + *(b + 0) = c0; + *(b + 1) = c0_nxt; + *(b + 2) = c1; + *(b + 3) = c1_nxt; + + *(c + 0) = c0; + *(c + 1) = c1; + + *(c + 0 + ldc) = c0_nxt; + *(c + 1 + ldc) = c1_nxt; +} + +static void dsolve_2x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + FLOAT a0, a2, a3, c0, c1; + + c0 = *(c + 0); + c1 = *(c + 1); + + if (bk > 0) + { + BLASLONG i; + FLOAT *aa = a, *bb = b; + + for (i = bk; i--;) + { + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; + + aa += 2; + 
bb += 1; + } + } + + a0 = *(a - 4); + a2 = *(a - 2); + a3 = *(a - 1); + + c1 *= a3; + c0 -= c1 * a2; + c0 *= a0; + + *(b - 2) = c0; + *(b - 1) = c1; + + *(c + 0) = c0; + *(c + 1) = c1; +} + +static void dsolve_1x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT c0, c1, c2, c3; + + c0 = *(c + 0); + c1 = *(c + 1 * ldc); + c2 = *(c + 2 * ldc); + c3 = *(c + 3 * ldc); + + if (bk > 0) + { + BLASLONG i; + FLOAT *aa = a, *bb = b; + + for (i = bk; i--;) + { + c0 -= aa[0] * bb[0]; + c1 -= aa[0] * bb[1]; + c2 -= aa[0] * bb[2]; + c3 -= aa[0] * bb[3]; + + aa += 1; + bb += 4; + } + } + + c0 *= *(a - 1); + c1 *= *(a - 1); + c2 *= *(a - 1); + c3 *= *(a - 1); + + *(c + 0 * ldc) = c0; + *(c + 1 * ldc) = c1; + *(c + 2 * ldc) = c2; + *(c + 3 * ldc) = c3; + + *(b - 4) = c0; + *(b - 3) = c1; + *(b - 2) = c2; + *(b - 1) = c3; +} + +static void dsolve_1x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) +{ + *c *= *a; + *(c + ldc) = *a * *(c + ldc); + + *b = *c; + *(b + 1) = *(c + ldc); +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, + FLOAT *c, BLASLONG ldc, BLASLONG offset) +{ + BLASLONG kk, i, j; + FLOAT *aa, *bb, *cc; + + for (j = (n >> 2); j--;) + { + kk = m + offset; + + if (m & 7) + { + if (m & 1) + { + aa = a + (m - 1) * k + kk; + bb = b + 4 * kk; + cc = c + (m - 1); + + dsolve_1x4_ln_msa(aa, bb, cc, ldc, k - kk); + + kk -= 1; + } + + if (m & 2) + { + aa = a + ((m & -2) - 2) * k + 2 * kk; + bb = b + 4 * kk; + cc = c + ((m & -2) - 2); + + dsolve_2x4_ln_msa(aa, bb, cc, ldc, k - kk); + + kk -= 2; + } + + if (m & 4) + { + aa = a + ((m & -4) - 4) * k + 4 * kk; + bb = b + 4 * kk; + cc = c + ((m & -4) - 4); + + dsolve_4x4_ln_msa(aa, bb, cc, ldc, k - kk); + + kk -= 4; + } + } + + i = (m >> 3); + if (i > 0) + { + aa = a + ((m & -8) - 8) * k; + cc = c + ((m & -8) - 8); + + do + { + dsolve_8x4_ln_msa(aa + 8 * kk, b + 4 * kk, cc, ldc, k - kk); + + aa -= 8 * k; + cc -= 8; + kk -= 8; + i --; + } while (i > 0); + } + + b += 4 * k; + c += 4 * ldc; + } + + if (n & 3) + { + if (n & 2) + { + kk = m + offset; + + if (m & 7) + { + if (m & 1) + { + aa = a + ((m & -1) - 1) * k; + cc = c + ((m & -1) - 1); + + dsolve_1x2_ln_msa(aa + kk - 1, b + kk * 2 - 2, cc, ldc); + + kk -= 1; + } + + if (m & 2) + { + aa = a + ((m & -2) - 2) * k; + cc = c + ((m & -2) - 2); + + dsolve_2x2_ln_msa(aa + kk * 2, b + kk * 2, cc, ldc, k - kk); + + kk -= 2; + } + + if (m & 4) + { + aa = a + ((m & -4) - 4) * k; + cc = c + ((m & -4) - 4); + + dsolve_4x2_ln_msa(aa + kk * 4, b + kk * 2, cc, ldc, k - kk); + + kk -= 4; + } + } + + i = (m >> 3); + if (i > 0) + { + aa = a + ((m & -8) - 8) * k; + cc = c + ((m & -8) - 8); + + do + { + dsolve_8x2_ln_msa(aa + kk * 8, b + kk * 2, cc, ldc, k - kk); + + aa -= 8 * k; + cc -= 8; + kk -= 8; + i --; + } while (i > 0); + } + + b += 2 * k; + c += 2 * ldc; + } + + if (n & 1) + { + kk = m + offset; + + if (m & 7) + { + if (m & 1) + { + kk -= 1; + aa = a + ((m & -1) - 1) * k + kk; + cc = c + ((m & -1) - 1); + + *cc *= *aa; + *(b + kk) = *cc; + } + + if (m & 2) + { + aa = a + ((m & -2) - 2) * k + kk * 2; + cc = c + ((m & -2) - 2); + + dsolve_2x1_ln_msa(aa, b + kk, cc, k - kk); + + kk -= 2; + } + + if (m & 4) + { + aa = a + ((m & -4) - 4) * k; + cc = c + ((m & -4) - 4); + + dsolve_4x1_ln_msa(aa + 4 * kk, b + kk, cc, k - kk); + + kk -= 4; + } + } + + i = (m >> 3); + if (i > 0) + { + aa = a + ((m & -8) - 8) * k; + cc = c + ((m & -8) - 8); + + do + { + dsolve_8x1_ln_msa(aa + 8 * kk, b + kk, cc, k - kk); + + aa -= 8 * k; + cc -= 8; + kk -= 8; + i --; + } while (i > 
0);
+            }
+        }
+    }
+
+    return 0;
+}
diff --git a/kernel/mips/dtrsm_kernel_LT_8x4_msa.c b/kernel/mips/dtrsm_kernel_LT_8x4_msa.c
new file mode 100644
index 0000000000..897fd313b4
--- /dev/null
+++ b/kernel/mips/dtrsm_kernel_LT_8x4_msa.c
@@ -0,0 +1,1334 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +static void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v2f64 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15; + v2f64 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; + v2f64 res_c8, res_c9, res_c10, res_c11, res_c12, res_c13, res_c14, res_c15; + v2f64 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7; + v2f64 src_a9, src_a10, src_a11, src_a12, src_a13, src_a14, src_a15, src_a18; + v2f64 src_a19, src_a20, src_a21, src_a22, src_a23, src_a27, src_a28; + v2f64 src_a29, src_a30, src_a31, src_a36, src_a37, src_a38, src_a39; + v2f64 src_a45, src_a46, src_a47, src_a54, src_a55, src_a63; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * ldc; + + LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3); + LD_DP4(c_nxt1line, 2, src_c4, src_c5, src_c6, src_c7); + LD_DP4(c_nxt2line, 2, src_c8, src_c9, src_c10, src_c11); + LD_DP4(c_nxt3line, 2, src_c12, src_c13, src_c14, src_c15); + + if (bk) + { + BLASLONG i; + v2f64 src_b, src_b0, src_b1, src_b2, src_b3; + + LD_DP4(a, 2, src_a0, src_a1, src_a2, src_a3); + LD_DP2(b, 2, src_b0, src_b1); + + for (i = (bk - 1); i--;) + { + a += 8; + b += 4; + + LD_DP4(a, 2, src_a4, src_a5, src_a6, src_a7); + LD_DP2(b, 2, src_b2, src_b3); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + src_c2 -= src_a2 * src_b; + src_c3 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + src_c6 -= src_a2 * src_b; + src_c7 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + src_c8 -= src_a0 * src_b; + src_c9 -= src_a1 * src_b; + src_c10 -= src_a2 * src_b; + src_c11 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + src_c12 -= src_a0 * src_b; + src_c13 -= src_a1 * src_b; + src_c14 -= src_a2 * src_b; + src_c15 -= src_a3 * src_b; + + src_a0 = src_a4; + src_a1 = src_a5; + src_a2 = src_a6; + src_a3 = src_a7; + src_b0 = src_b2; + src_b1 = src_b3; + } + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + src_c2 -= src_a2 * src_b; + src_c3 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + src_c6 -= src_a2 * src_b; + src_c7 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + src_c8 -= src_a0 * src_b; + src_c9 -= src_a1 * src_b; + src_c10 -= src_a2 * src_b; + src_c11 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + src_c12 -= src_a0 * src_b; + src_c13 -= src_a1 * src_b; + src_c14 -= src_a2 * src_b; + src_c15 -= src_a3 * src_b; + + a += 8; + b += 4; + } + + ILVRL_D2_DP(src_c4, src_c0, res_c0, res_c1); + ILVRL_D2_DP(src_c5, src_c1, res_c2, res_c3); + ILVRL_D2_DP(src_c6, src_c2, res_c4, res_c5); + ILVRL_D2_DP(src_c7, src_c3, res_c6, res_c7); + ILVRL_D2_DP(src_c12, src_c8, res_c8, res_c9); + ILVRL_D2_DP(src_c13, src_c9, res_c10, res_c11); + ILVRL_D2_DP(src_c14, src_c10, res_c12, res_c13); + ILVRL_D2_DP(src_c15, src_c11, res_c14, res_c15); + + src_a0 = LD_DP(a + 0); + src_a1 = (v2f64) __msa_splati_d((v2i64) src_a0, 1); + 
src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); + src_a2 = LD_DP(a + 2); + src_a3 = (v2f64) __msa_splati_d((v2i64) src_a2, 1); + src_a2 = (v2f64) __msa_splati_d((v2i64) src_a2, 0); + src_a4 = LD_DP(a + 4); + src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1); + src_a4 = (v2f64) __msa_splati_d((v2i64) src_a4, 0); + src_a6 = LD_DP(a + 6); + src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1); + src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0); + + res_c0 *= src_a0; + res_c1 -= res_c0 * src_a1; + res_c2 -= res_c0 * src_a2; + res_c3 -= res_c0 * src_a3; + res_c4 -= res_c0 * src_a4; + res_c5 -= res_c0 * src_a5; + res_c6 -= res_c0 * src_a6; + res_c7 -= res_c0 * src_a7; + + res_c8 *= src_a0; + res_c9 -= res_c8 * src_a1; + res_c10 -= res_c8 * src_a2; + res_c11 -= res_c8 * src_a3; + res_c12 -= res_c8 * src_a4; + res_c13 -= res_c8 * src_a5; + res_c14 -= res_c8 * src_a6; + res_c15 -= res_c8 * src_a7; + + src_a9 = __msa_cast_to_vector_double(*(a + 9)); + src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); + src_a10 = LD_DP(a + 10); + src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); + src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); + src_a12 = LD_DP(a + 12); + src_a13 = (v2f64) __msa_splati_d((v2i64) src_a12, 1); + src_a12 = (v2f64) __msa_splati_d((v2i64) src_a12, 0); + src_a14 = LD_DP(a + 14); + src_a15 = (v2f64) __msa_splati_d((v2i64) src_a14, 1); + src_a14 = (v2f64) __msa_splati_d((v2i64) src_a14, 0); + + res_c1 *= src_a9; + res_c2 -= res_c1 * src_a10; + res_c3 -= res_c1 * src_a11; + res_c4 -= res_c1 * src_a12; + res_c5 -= res_c1 * src_a13; + res_c6 -= res_c1 * src_a14; + res_c7 -= res_c1 * src_a15; + + res_c9 *= src_a9; + res_c10 -= res_c9 * src_a10; + res_c11 -= res_c9 * src_a11; + res_c12 -= res_c9 * src_a12; + res_c13 -= res_c9 * src_a13; + res_c14 -= res_c9 * src_a14; + res_c15 -= res_c9 * src_a15; + + ST_DP(res_c0, b + 0); + ST_DP(res_c8, b + 2); + ST_DP(res_c1, b + 4); + ST_DP(res_c9, b + 6); + + ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c4); + ILVRL_D2_DP(res_c9, res_c8, src_c8, src_c12); + + ST_DP(src_c0, c); + ST_DP(src_c4, c_nxt1line); + ST_DP(src_c8, c_nxt2line); + ST_DP(src_c12, c_nxt3line); + + src_a18 = LD_DP(a + 18); + src_a19 = (v2f64) __msa_splati_d((v2i64) src_a18, 1); + src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0); + src_a20 = LD_DP(a + 20); + src_a21 = (v2f64) __msa_splati_d((v2i64) src_a20, 1); + src_a20 = (v2f64) __msa_splati_d((v2i64) src_a20, 0); + src_a22 = LD_DP(a + 22); + src_a23 = (v2f64) __msa_splati_d((v2i64) src_a22, 1); + src_a22 = (v2f64) __msa_splati_d((v2i64) src_a22, 0); + + res_c2 *= src_a18; + res_c3 -= res_c2 * src_a19; + res_c4 -= res_c2 * src_a20; + res_c5 -= res_c2 * src_a21; + res_c6 -= res_c2 * src_a22; + res_c7 -= res_c2 * src_a23; + + res_c10 *= src_a18; + res_c11 -= res_c10 * src_a19; + res_c12 -= res_c10 * src_a20; + res_c13 -= res_c10 * src_a21; + res_c14 -= res_c10 * src_a22; + res_c15 -= res_c10 * src_a23; + + src_a27 = __msa_cast_to_vector_double(*(a + 27)); + src_a27 = (v2f64) __msa_splati_d((v2i64) src_a27, 0); + src_a28 = LD_DP(a + 28); + src_a29 = (v2f64) __msa_splati_d((v2i64) src_a28, 1); + src_a28 = (v2f64) __msa_splati_d((v2i64) src_a28, 0); + src_a30 = LD_DP(a + 30); + src_a31 = (v2f64) __msa_splati_d((v2i64) src_a30, 1); + src_a30 = (v2f64) __msa_splati_d((v2i64) src_a30, 0); + + res_c3 *= src_a27; + res_c4 -= res_c3 * src_a28; + res_c5 -= res_c3 * src_a29; + res_c6 -= res_c3 * src_a30; + res_c7 -= res_c3 * src_a31; + + res_c11 *= src_a27; + res_c12 -= res_c11 * src_a28; + res_c13 -= res_c11 * src_a29; + res_c14 -= 
res_c11 * src_a30; + res_c15 -= res_c11 * src_a31; + + ST_DP(res_c2, b + 8); + ST_DP(res_c10, b + 10); + ST_DP(res_c3, b + 12); + ST_DP(res_c11, b + 14); + + ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c5); + ILVRL_D2_DP(res_c11, res_c10, src_c9, src_c13); + + src_a36 = LD_DP(a + 36); + src_a37 = (v2f64) __msa_splati_d((v2i64) src_a36, 1); + src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0); + src_a38 = LD_DP(a + 38); + src_a39 = (v2f64) __msa_splati_d((v2i64) src_a38, 1); + src_a38 = (v2f64) __msa_splati_d((v2i64) src_a38, 0); + + res_c4 *= src_a36; + res_c5 -= res_c4 * src_a37; + res_c6 -= res_c4 * src_a38; + res_c7 -= res_c4 * src_a39; + + res_c12 *= src_a36; + res_c13 -= res_c12 * src_a37; + res_c14 -= res_c12 * src_a38; + res_c15 -= res_c12 * src_a39; + + src_a45 = __msa_cast_to_vector_double(*(a + 45)); + src_a45 = (v2f64) __msa_splati_d((v2i64) src_a45, 0); + src_a46 = LD_DP(a + 46); + src_a47 = (v2f64) __msa_splati_d((v2i64) src_a46, 1); + src_a46 = (v2f64) __msa_splati_d((v2i64) src_a46, 0); + + res_c5 *= src_a45; + res_c6 -= res_c5 * src_a46; + res_c7 -= res_c5 * src_a47; + + res_c13 *= src_a45; + res_c14 -= res_c13 * src_a46; + res_c15 -= res_c13 * src_a47; + + ST_DP(src_c1, c + 2); + ST_DP(src_c5, c_nxt1line + 2); + ST_DP(src_c9, c_nxt2line + 2); + ST_DP(src_c13, c_nxt3line + 2); + + ST_DP(res_c4, b + 16); + ST_DP(res_c12, b + 18); + ST_DP(res_c5, b + 20); + ST_DP(res_c13, b + 22); + + ILVRL_D2_DP(res_c5, res_c4, src_c2, src_c6); + ILVRL_D2_DP(res_c13, res_c12, src_c10, src_c14); + + src_a63 = __msa_cast_to_vector_double(*(a + 63)); + src_a63 = (v2f64) __msa_splati_d((v2i64) src_a63, 0); + src_a54 = LD_DP(a + 54); + src_a55 = (v2f64) __msa_splati_d((v2i64) src_a54, 1); + src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); + + res_c6 *= src_a54; + res_c7 -= res_c6 * src_a55; + + res_c14 *= src_a54; + res_c15 -= res_c14 * src_a55; + + res_c7 *= src_a63; + res_c15 *= src_a63; + + ST_DP(src_c2, c + 4); + ST_DP(src_c6, c_nxt1line + 4); + ST_DP(src_c10, c_nxt2line + 4); + ST_DP(src_c14, c_nxt3line + 4); + + ST_DP(res_c6, b + 24); + ST_DP(res_c14, b + 26); + ST_DP(res_c7, b + 28); + ST_DP(res_c15, b + 30); + + ILVRL_D2_DP(res_c7, res_c6, src_c3, src_c7); + ILVRL_D2_DP(res_c15, res_c14, src_c11, src_c15); + + ST_DP(src_c3, c + 6); + ST_DP(src_c7, c_nxt1line + 6); + ST_DP(src_c11, c_nxt2line + 6); + ST_DP(src_c15, c_nxt3line + 6); +} + +static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v2f64 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; + v2f64 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7; + v2f64 src_a9, src_a10, src_a11, src_a12, src_a13, src_a14, src_a15, src_a18; + v2f64 src_a19, src_a20, src_a21, src_a22, src_a23, src_a27, src_a28; + v2f64 src_a29, src_a30, src_a31, src_a36, src_a37, src_a38, src_a39; + v2f64 src_a45, src_a46, src_a47, src_a54, src_a55, src_a63; + + LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3); + LD_DP4(c + ldc, 2, src_c4, src_c5, src_c6, src_c7); + + if (bk) + { + BLASLONG i; + v2f64 src_b, src_b0, src_b1; + + LD_DP4(a, 2, src_a0, src_a1, src_a2, src_a3); + src_b0 = LD_DP(b); + + a += 8; + b += 2; + + for (i = (bk - 1); i--;) + { + LD_DP4(a, 2, src_a4, src_a5, src_a6, src_a7); + src_b1 = LD_DP(b); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + src_c2 -= src_a2 * src_b; + src_c3 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) 
src_b0, (v2i64) src_b0); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + src_c6 -= src_a2 * src_b; + src_c7 -= src_a3 * src_b; + + src_a0 = src_a4; + src_a1 = src_a5; + src_a2 = src_a6; + src_a3 = src_a7; + src_b0 = src_b1; + + a += 8; + b += 2; + } + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + src_c2 -= src_a2 * src_b; + src_c3 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + src_c6 -= src_a2 * src_b; + src_c7 -= src_a3 * src_b; + } + + ILVRL_D2_DP(src_c4, src_c0, res_c0, res_c1); + ILVRL_D2_DP(src_c5, src_c1, res_c2, res_c3); + ILVRL_D2_DP(src_c6, src_c2, res_c4, res_c5); + ILVRL_D2_DP(src_c7, src_c3, res_c6, res_c7); + + src_a0 = LD_DP(a + 0); + src_a1 = (v2f64) __msa_splati_d((v2i64) src_a0, 1); + src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); + src_a2 = LD_DP(a + 2); + src_a3 = (v2f64) __msa_splati_d((v2i64) src_a2, 1); + src_a2 = (v2f64) __msa_splati_d((v2i64) src_a2, 0); + src_a4 = LD_DP(a + 4); + src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1); + src_a4 = (v2f64) __msa_splati_d((v2i64) src_a4, 0); + src_a6 = LD_DP(a + 6); + src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1); + src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0); + + res_c0 *= src_a0; + res_c1 -= res_c0 * src_a1; + res_c2 -= res_c0 * src_a2; + res_c3 -= res_c0 * src_a3; + res_c4 -= res_c0 * src_a4; + res_c5 -= res_c0 * src_a5; + res_c6 -= res_c0 * src_a6; + res_c7 -= res_c0 * src_a7; + + src_a9 = __msa_cast_to_vector_double(*(a + 9)); + src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); + src_a10 = LD_DP(a + 10); + src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); + src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); + src_a12 = LD_DP(a + 12); + src_a13 = (v2f64) __msa_splati_d((v2i64) src_a12, 1); + src_a12 = (v2f64) __msa_splati_d((v2i64) src_a12, 0); + src_a14 = LD_DP(a + 14); + src_a15 = (v2f64) __msa_splati_d((v2i64) src_a14, 1); + src_a14 = (v2f64) __msa_splati_d((v2i64) src_a14, 0); + + res_c1 *= src_a9; + res_c2 -= res_c1 * src_a10; + res_c3 -= res_c1 * src_a11; + res_c4 -= res_c1 * src_a12; + res_c5 -= res_c1 * src_a13; + res_c6 -= res_c1 * src_a14; + res_c7 -= res_c1 * src_a15; + + src_a18 = LD_DP(a + 18); + src_a19 = (v2f64) __msa_splati_d((v2i64) src_a18, 1); + src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0); + src_a20 = LD_DP(a + 20); + src_a21 = (v2f64) __msa_splati_d((v2i64) src_a20, 1); + src_a20 = (v2f64) __msa_splati_d((v2i64) src_a20, 0); + src_a22 = LD_DP(a + 22); + src_a23 = (v2f64) __msa_splati_d((v2i64) src_a22, 1); + src_a22 = (v2f64) __msa_splati_d((v2i64) src_a22, 0); + + res_c2 *= src_a18; + res_c3 -= res_c2 * src_a19; + res_c4 -= res_c2 * src_a20; + res_c5 -= res_c2 * src_a21; + res_c6 -= res_c2 * src_a22; + res_c7 -= res_c2 * src_a23; + + src_a27 = __msa_cast_to_vector_double(*(a + 27)); + src_a27 = (v2f64) __msa_splati_d((v2i64) src_a27, 0); + src_a28 = LD_DP(a + 28); + src_a29 = (v2f64) __msa_splati_d((v2i64) src_a28, 1); + src_a28 = (v2f64) __msa_splati_d((v2i64) src_a28, 0); + src_a30 = LD_DP(a + 30); + src_a31 = (v2f64) __msa_splati_d((v2i64) src_a30, 1); + src_a30 = (v2f64) __msa_splati_d((v2i64) src_a30, 0); + + res_c3 *= src_a27; + res_c4 -= res_c3 * src_a28; + res_c5 -= res_c3 * src_a29; + res_c6 -= res_c3 * src_a30; + res_c7 -= res_c3 * src_a31; + + ST_DP(res_c0, b + 0); + ST_DP(res_c1, b + 2); + ST_DP(res_c2, b + 4); + ST_DP(res_c3, b + 6); + + ILVRL_D2_DP(res_c1, res_c0, src_c0, 
src_c4); + ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c5); + + ST_DP2(src_c0, src_c1, c, 2); + ST_DP2(src_c4, src_c5, c + ldc, 2); + + src_a36 = LD_DP(a + 36); + src_a37 = (v2f64) __msa_splati_d((v2i64) src_a36, 1); + src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0); + src_a38 = LD_DP(a + 38); + src_a39 = (v2f64) __msa_splati_d((v2i64) src_a38, 1); + src_a38 = (v2f64) __msa_splati_d((v2i64) src_a38, 0); + + res_c4 *= src_a36; + res_c5 -= res_c4 * src_a37; + res_c6 -= res_c4 * src_a38; + res_c7 -= res_c4 * src_a39; + + src_a45 = __msa_cast_to_vector_double(*(a + 45)); + src_a45 = (v2f64) __msa_splati_d((v2i64) src_a45, 0); + src_a46 = LD_DP(a + 46); + src_a47 = (v2f64) __msa_splati_d((v2i64) src_a46, 1); + src_a46 = (v2f64) __msa_splati_d((v2i64) src_a46, 0); + + res_c5 *= src_a45; + res_c6 -= res_c5 * src_a46; + res_c7 -= res_c5 * src_a47; + + src_a63 = __msa_cast_to_vector_double(*(a + 63)); + src_a63 = (v2f64) __msa_splati_d((v2i64) src_a63, 0); + src_a54 = LD_DP(a + 54); + src_a55 = (v2f64) __msa_splati_d((v2i64) src_a54, 1); + src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); + + res_c6 *= src_a54; + res_c7 -= res_c6 * src_a55; + + res_c7 *= src_a63; + + ST_DP(res_c4, b + 8); + ST_DP(res_c5, b + 10); + ST_DP(res_c6, b + 12); + ST_DP(res_c7, b + 14); + + ILVRL_D2_DP(res_c5, res_c4, src_c2, src_c6); + ILVRL_D2_DP(res_c7, res_c6, src_c3, src_c7); + + ST_DP2(src_c2, src_c3, c + 4, 2); + ST_DP2(src_c6, src_c7, c + 4 + ldc, 2); +} + +static void dsolve_8x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + FLOAT a0, a1, a2, a3, a4, a5, a6, a7, a9, a10, a11, a12, a13, a14, a15, a18; + FLOAT a19, a20, a21, a22, a23, a27, a28, a29, a30, a31, a36, a37, a38, a39; + FLOAT a45, a46, a47, a54, a55, a63, c0, c1, c2, c3, c4, c5, c6, c7; + + c0 = *(c + 0); + c1 = *(c + 1); + c2 = *(c + 2); + c3 = *(c + 3); + c4 = *(c + 4); + c5 = *(c + 5); + c6 = *(c + 6); + c7 = *(c + 7); + + if (bk) + { + BLASLONG i; + + for (i = bk; i--; ) + { + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; + c2 -= a[2] * b[0]; + c3 -= a[3] * b[0]; + c4 -= a[4] * b[0]; + c5 -= a[5] * b[0]; + c6 -= a[6] * b[0]; + c7 -= a[7] * b[0]; + + a += 8; + b += 1; + } + } + + a0 = *(a + 0); + a1 = *(a + 1); + a2 = *(a + 2); + a3 = *(a + 3); + a4 = *(a + 4); + a5 = *(a + 5); + a6 = *(a + 6); + a7 = *(a + 7); + a9 = *(a + 9); + a10 = *(a + 10); + a11 = *(a + 11); + a12 = *(a + 12); + a13 = *(a + 13); + a14 = *(a + 14); + a15 = *(a + 15); + a18 = *(a + 18); + a19 = *(a + 19); + a20 = *(a + 20); + a21 = *(a + 21); + a22 = *(a + 22); + a23 = *(a + 23); + a27 = *(a + 27); + a28 = *(a + 28); + a29 = *(a + 29); + a30 = *(a + 30); + a31 = *(a + 31); + a36 = *(a + 36); + a37 = *(a + 37); + a38 = *(a + 38); + a39 = *(a + 39); + a45 = *(a + 45); + a46 = *(a + 46); + a47 = *(a + 47); + a54 = *(a + 54); + a55 = *(a + 55); + a63 = *(a + 63); + + c0 *= a0; + + c1 -= c0 * a1; + c1 *= a9; + + c2 -= c0 * a2; + c2 -= c1 * a10; + c2 *= a18; + + c3 -= c0 * a3; + c3 -= c1 * a11; + c3 -= c2 * a19; + c3 *= a27; + + c4 -= c0 * a4; + c4 -= c1 * a12; + c4 -= c2 * a20; + c4 -= c3 * a28; + c4 *= a36; + + c5 -= c0 * a5; + c5 -= c1 * a13; + c5 -= c2 * a21; + c5 -= c3 * a29; + c5 -= c4 * a37; + c5 *= a45; + + c6 -= c0 * a6; + c6 -= c1 * a14; + c6 -= c2 * a22; + c6 -= c3 * a30; + c6 -= c4 * a38; + c6 -= c5 * a46; + c6 *= a54; + + c7 -= c0 * a7; + c7 -= c1 * a15; + c7 -= c2 * a23; + c7 -= c3 * a31; + c7 -= c4 * a39; + c7 -= c5 * a47; + c7 -= c6 * a55; + c7 *= a63; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 2) = c2; + *(c + 3) = c3; + *(c + 4) = c4; + *(c + 5) = c5; + *(c + 
6) = c6; + *(c + 7) = c7; + + *(b + 0) = c0; + *(b + 1) = c1; + *(b + 2) = c2; + *(b + 3) = c3; + *(b + 4) = c4; + *(b + 5) = c5; + *(b + 6) = c6; + *(b + 7) = c7; +} + +static void dsolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v2f64 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; + v2f64 src_a0, src_a1, src_a2, src_a3, src_a5, src_a6, src_a7; + v2f64 src_a10, src_a11, src_a15; + + LD_DP2(c, 2, src_c0, src_c1); + LD_DP2(c + ldc, 2, src_c2, src_c3); + LD_DP2(c + 2 * ldc, 2, src_c4, src_c5); + LD_DP2(c + 3 * ldc, 2, src_c6, src_c7); + + if (bk) + { + BLASLONG i; + v2f64 src_a0, src_a1, src_b, src_b0, src_b1; + + for (i = bk; i--;) + { + LD_DP2(a, 2, src_a0, src_a1); + LD_DP2(b, 2, src_b0, src_b1); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c2 -= src_a0 * src_b; + src_c3 -= src_a1 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + src_c6 -= src_a0 * src_b; + src_c7 -= src_a1 * src_b; + + a += 4; + b += 4; + } + } + + ILVRL_D2_DP(src_c2, src_c0, res_c0, res_c1); + ILVRL_D2_DP(src_c3, src_c1, res_c2, res_c3); + ILVRL_D2_DP(src_c6, src_c4, res_c4, res_c5); + ILVRL_D2_DP(src_c7, src_c5, res_c6, res_c7); + + src_a0 = LD_DP(a + 0); + src_a1 = (v2f64) __msa_splati_d((v2i64) src_a0, 1); + src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); + src_a2 = LD_DP(a + 2); + src_a3 = (v2f64) __msa_splati_d((v2i64) src_a2, 1); + src_a2 = (v2f64) __msa_splati_d((v2i64) src_a2, 0); + + res_c0 *= src_a0; + res_c1 -= res_c0 * src_a1; + res_c2 -= res_c0 * src_a2; + res_c3 -= res_c0 * src_a3; + + res_c4 *= src_a0; + res_c5 -= res_c4 * src_a1; + res_c6 -= res_c4 * src_a2; + res_c7 -= res_c4 * src_a3; + + src_a5 = __msa_cast_to_vector_double(*(a + 5)); + src_a5 = (v2f64) __msa_splati_d((v2i64) src_a5, 0); + src_a6 = LD_DP(a + 6); + src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1); + src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0); + + res_c1 *= src_a5; + res_c2 -= res_c1 * src_a6; + res_c3 -= res_c1 * src_a7; + + res_c5 *= src_a5; + res_c6 -= res_c5 * src_a6; + res_c7 -= res_c5 * src_a7; + + src_a10 = LD_DP(a + 10); + src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); + src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); + src_a15 = __msa_cast_to_vector_double(*(a + 15)); + src_a15 = (v2f64) __msa_splati_d((v2i64) src_a15, 0); + + res_c2 *= src_a10; + res_c3 -= res_c2 * src_a11; + res_c3 *= src_a15; + + res_c6 *= src_a10; + res_c7 -= res_c6 * src_a11; + res_c7 *= src_a15; + + ST_DP(res_c0, b + 0); + ST_DP(res_c4, b + 2); + ST_DP(res_c1, b + 4); + ST_DP(res_c5, b + 6); + ST_DP(res_c2, b + 8); + ST_DP(res_c6, b + 10); + ST_DP(res_c3, b + 12); + ST_DP(res_c7, b + 14); + + ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c2); + ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c3); + ILVRL_D2_DP(res_c5, res_c4, src_c4, src_c6); + ILVRL_D2_DP(res_c7, res_c6, src_c5, src_c7); + + ST_DP2(src_c0, src_c1, c, 2); + ST_DP2(src_c2, src_c3, c + ldc, 2); + ST_DP2(src_c4, src_c5, c + 2 * ldc, 2); + ST_DP2(src_c6, src_c7, c + 3 * ldc, 2); +} + +static void dsolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v2f64 src_c0, src_c1, src_c2, src_c3, res_c0, res_c1, res_c2, res_c3; + v2f64 src_a0, src_a1, 
src_a2, src_a3, src_a5, src_a6, src_a7; + v2f64 src_a10, src_a11, src_a15; + + LD_DP2(c, 2, src_c0, src_c1); + LD_DP2(c + ldc, 2, src_c2, src_c3); + + if (bk) + { + BLASLONG i; + v2f64 src_a0, src_a1, src_b, src_b0; + + for (i = bk; i--;) + { + LD_DP2(a, 2, src_a0, src_a1); + src_b0 = LD_DP(b); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c2 -= src_a0 * src_b; + src_c3 -= src_a1 * src_b; + + a += 4; + b += 2; + } + } + + ILVRL_D2_DP(src_c2, src_c0, res_c0, res_c1); + ILVRL_D2_DP(src_c3, src_c1, res_c2, res_c3); + + src_a0 = LD_DP(a + 0); + src_a1 = (v2f64) __msa_splati_d((v2i64) src_a0, 1); + src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); + src_a2 = LD_DP(a + 2); + src_a3 = (v2f64) __msa_splati_d((v2i64) src_a2, 1); + src_a2 = (v2f64) __msa_splati_d((v2i64) src_a2, 0); + + res_c0 *= src_a0; + res_c1 -= res_c0 * src_a1; + res_c2 -= res_c0 * src_a2; + res_c3 -= res_c0 * src_a3; + + src_a5 = __msa_cast_to_vector_double(*(a + 5)); + src_a5 = (v2f64) __msa_splati_d((v2i64) src_a5, 0); + src_a6 = LD_DP(a + 6); + src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1); + src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0); + + res_c1 *= src_a5; + res_c2 -= res_c1 * src_a6; + res_c3 -= res_c1 * src_a7; + + src_a10 = LD_DP(a + 10); + src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); + src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); + src_a15 = __msa_cast_to_vector_double(*(a + 15)); + src_a15 = (v2f64) __msa_splati_d((v2i64) src_a15, 0); + + res_c2 *= src_a10; + res_c3 -= res_c2 * src_a11; + res_c3 *= src_a15; + + ST_DP(res_c0, b + 0); + ST_DP(res_c1, b + 2); + ST_DP(res_c2, b + 4); + ST_DP(res_c3, b + 6); + + ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c2); + ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c3); + + ST_DP2(src_c0, src_c1, c, 2); + ST_DP2(src_c2, src_c3, c + ldc, 2); +} + +static void dsolve_4x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + FLOAT a0, a1, a2, a3, a5, a6, a7, a10, a11, a15, c0, c1, c2, c3; + + c0 = *(c + 0); + c1 = *(c + 1); + c2 = *(c + 2); + c3 = *(c + 3); + + if (bk) + { + BLASLONG i; + + for (i = bk; i--;) + { + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; + c2 -= a[2] * b[0]; + c3 -= a[3] * b[0]; + + a += 4; + b += 1; + } + } + + a0 = *(a + 0); + a1 = *(a + 1); + a2 = *(a + 2); + a3 = *(a + 3); + a5 = *(a + 5); + a6 = *(a + 6); + a7 = *(a + 7); + a10 = *(a + 10); + a11 = *(a + 11); + a15 = *(a + 15); + + c0 *= a0; + + c1 -= c0 * a1; + c1 *= a5; + + c2 -= c0 * a2; + c2 -= c1 * a6; + c2 *= a10; + + c3 -= c0 * a3; + c3 -= c1 * a7; + c3 -= c2 * a11; + c3 *= a15; + + *(b + 0) = c0; + *(b + 1) = c1; + *(b + 2) = c2; + *(b + 3) = c3; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 2) = c2; + *(c + 3) = c3; +} + +static void dsolve_2x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT a0, a1, a3, c0, c1, c0_nxt1, c1_nxt1; + FLOAT c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3; + + c0 = *(c + 0); + c1 = *(c + 1); + c0_nxt1 = *(c + ldc); + c1_nxt1 = *(c + 1 + ldc); + c0_nxt2 = *(c + 2 * ldc); + c1_nxt2 = *(c + 1 + 2 * ldc); + c0_nxt3 = *(c + 3 * ldc); + c1_nxt3 = *(c + 1 + 3 * ldc); + + if (bk) + { + BLASLONG i; + + for (i = bk; i--;) + { + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; + c0_nxt1 -= a[0] * b[1]; + c1_nxt1 -= a[1] * b[1]; + c0_nxt2 -= a[0] * b[2]; + c1_nxt2 -= a[1] * b[2]; + c0_nxt3 -= a[0] * b[3]; + c1_nxt3 -= a[1] * b[3]; + + a += 2; + b += 4; + } + } + + a0 = *a; + a1 = *(a + 1); + a3 = *(a + 3); + + c0 
*= a0; + c1 -= c0 * a1; + c1 *= a3; + + c0_nxt1 *= a0; + c1_nxt1 -= c0_nxt1 * a1; + c1_nxt1 *= a3; + + c0_nxt2 *= a0; + c1_nxt2 -= c0_nxt2 * a1; + c1_nxt2 *= a3; + + c0_nxt3 *= a0; + c1_nxt3 -= c0_nxt3 * a1; + c1_nxt3 *= a3; + + *(b + 0) = c0; + *(b + 1) = c0_nxt1; + *(b + 2) = c0_nxt2; + *(b + 3) = c0_nxt3; + *(b + 4) = c1; + *(b + 5) = c1_nxt1; + *(b + 6) = c1_nxt2; + *(b + 7) = c1_nxt3; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 0 + ldc) = c0_nxt1; + *(c + 1 + ldc) = c1_nxt1; + *(c + 0 + 2 * ldc) = c0_nxt2; + *(c + 1 + 2 * ldc) = c1_nxt2; + *(c + 0 + 3 * ldc) = c0_nxt3; + *(c + 1 + 3 * ldc) = c1_nxt3; +} + +static void dsolve_2x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT a0, a1, a3, c0, c1, c0_nxt, c1_nxt; + + c0 = *(c + 0); + c1 = *(c + 1); + + c0_nxt = *(c + ldc); + c1_nxt = *(c + 1 + ldc); + + if (bk) + { + BLASLONG i; + + for (i = bk; i--;) + { + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; + + c0_nxt -= a[0] * b[1]; + c1_nxt -= a[1] * b[1]; + + a += 2; + b += 2; + } + } + + a0 = *a; + a1 = *(a + 1); + a3 = *(a + 3); + + c0 *= a0; + c1 -= c0 * a1; + c1 *= a3; + + c0_nxt *= a0; + c1_nxt -= c0_nxt * a1; + c1_nxt *= a3; + + *(b + 0) = c0; + *(b + 1) = c0_nxt; + *(b + 2) = c1; + *(b + 3) = c1_nxt; + + *(c + 0) = c0; + *(c + 1) = c1; + + *(c + 0 + ldc) = c0_nxt; + *(c + 1 + ldc) = c1_nxt; +} + +static void dsolve_2x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + FLOAT a0, a1, a3, c0, c1; + + c0 = *(c + 0); + c1 = *(c + 1); + + if (bk) + { + BLASLONG i; + + for (i = bk; i--;) + { + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; + + a += 2; + b += 1; + } + } + + a0 = *(a + 0); + a1 = *(a + 1); + a3 = *(a + 3); + + c0 *= a0; + c1 -= c0 * a1; + c1 *= a3; + + *(b + 0) = c0; + *(b + 1) = c1; + + *(c + 0) = c0; + *(c + 1) = c1; +} + +static void dsolve_1x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT c0, c1, c2, c3; + + c0 = *(c + 0); + c1 = *(c + 1 * ldc); + c2 = *(c + 2 * ldc); + c3 = *(c + 3 * ldc); + + if (bk) + { + BLASLONG i; + + for (i = bk; i--;) + { + c0 -= a[0] * b[0]; + c1 -= a[0] * b[1]; + c2 -= a[0] * b[2]; + c3 -= a[0] * b[3]; + + a += 1; + b += 4; + } + } + + c0 *= *a; + c1 *= *a; + c2 *= *a; + c3 *= *a; + + *(c + 0 * ldc) = c0; + *(c + 1 * ldc) = c1; + *(c + 2 * ldc) = c2; + *(c + 3 * ldc) = c3; + + *(b + 0) = c0; + *(b + 1) = c1; + *(b + 2) = c2; + *(b + 3) = c3; +} + +static void dsolve_1x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT c0, c1; + + c0 = *c; + c1 = *(c + ldc); + + if (bk) + { + BLASLONG i; + + for (i = bk; i--;) + { + c0 -= *a * b[0]; + c1 -= *a * b[1]; + + a += 1; + b += 2; + } + } + + c0 *= *a; + c1 *= *a; + + *(b + 0) = c0; + *(b + 1) = c1; + + *(c + 0) = c0; + *(c + ldc) = c1; +} + +static void dgmm_dsolve_1x1_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + if (bk) + { + BLASLONG i; + + for (i = bk; i--;) + { + *c -= *a * *b; + + a += 1; + b += 1; + } + } + + *c *= *a; + *b = *c; +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, + FLOAT *c, BLASLONG ldc, BLASLONG offset) +{ + BLASLONG i, j, kk; + FLOAT *aa, *cc; + + for (j = (n >> 2); j--;) + { + kk = offset; + aa = a; + cc = c; + + for (i = (m >> 3); i--;) + { + dsolve_8x4_lt_msa(aa, b, cc, ldc, kk); + + aa += 8 * k; + cc += 8; + kk += 8; + } + + if (m & 7) + { + if (m & 4) + { + dsolve_4x4_lt_msa(aa, b, cc, ldc, kk); + + aa += 4 * k; + cc += 4; + kk += 4; + } + + if (m & 2) + { + dsolve_2x4_lt_msa(aa, b, cc, ldc, kk); + + aa += 2 * k; + cc += 2; + kk += 2; + } + + if (m & 1) + { 
+                    dsolve_1x4_lt_msa(aa, b, cc, ldc, kk);
+
+                    aa += k;
+                    cc += 1;
+                    kk += 1;
+                }
+            }
+
+            b += 4 * k;
+            c += 4 * ldc;
+        }
+
+    if (n & 3)
+    {
+        if (n & 2)
+        {
+            kk = offset;
+            aa = a;
+            cc = c;
+
+            for (i = (m >> 3); i--;)
+            {
+                dsolve_8x2_lt_msa(aa, b, cc, ldc, kk);
+
+                aa += 8 * k;
+                cc += 8;
+                kk += 8;
+            }
+
+            if (m & 7)
+            {
+                if (m & 4)
+                {
+                    dsolve_4x2_lt_msa(aa, b, cc, ldc, kk);
+
+                    aa += 4 * k;
+                    cc += 4;
+                    kk += 4;
+                }
+
+                if (m & 2)
+                {
+                    dsolve_2x2_lt_msa(aa, b, cc, ldc, kk);
+
+                    aa += 2 * k;
+                    cc += 2;
+                    kk += 2;
+                }
+
+                if (m & 1)
+                {
+                    dsolve_1x2_lt_msa(aa, b, cc, ldc, kk);
+
+                    aa += k;
+                    cc += 1;
+                    kk += 1;
+                }
+            }
+
+            b += 2 * k;
+            c += 2 * ldc;
+        }
+
+        if (n & 1)
+        {
+            kk = offset;
+            aa = a;
+            cc = c;
+
+            for (i = (m >> 3); i--;)
+            {
+                dsolve_8x1_lt_msa(aa, b, cc, kk);
+
+                aa += 8 * k;
+                cc += 8;
+                kk += 8;
+            }
+
+            if (m & 7)
+            {
+                if (m & 4)
+                {
+                    dsolve_4x1_lt_msa(aa, b, cc, kk);
+
+                    aa += 4 * k;
+                    cc += 4;
+                    kk += 4;
+                }
+
+                if (m & 2)
+                {
+                    dsolve_2x1_lt_msa(aa, b, cc, kk);
+
+                    aa += 2 * k;
+                    cc += 2;
+                    kk += 2;
+                }
+
+                if (m & 1)
+                {
+                    dgmm_dsolve_1x1_msa(aa, b, cc, kk);
+
+                    aa += k;
+                    cc += 1;
+                    kk += 1;
+                }
+            }
+
+            b += k;
+            c += ldc;
+        }
+    }
+
+    return 0;
+}
diff --git a/kernel/mips/dtrsm_kernel_RN_8x4_msa.c b/kernel/mips/dtrsm_kernel_RN_8x4_msa.c
new file mode 100644
index 0000000000..44313241e9
--- /dev/null
+++ b/kernel/mips/dtrsm_kernel_RN_8x4_msa.c
@@ -0,0 +1,953 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +static void dsolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v2f64 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15; + v2f64 src_b0, src_b1, src_b2, src_b3, src_b5, src_b6, src_b7; + v2f64 src_b10, src_b11, src_b15; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * ldc; + + LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3); + LD_DP4(c_nxt1line, 2, src_c4, src_c5, src_c6, src_c7); + LD_DP4(c_nxt2line, 2, src_c8, src_c9, src_c10, src_c11); + LD_DP4(c_nxt3line, 2, src_c12, src_c13, src_c14, src_c15); + + if (bk) + { + BLASLONG i; + v2f64 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7; + v2f64 src_b; + + LD_DP4(a, 2, src_a0, src_a1, src_a2, src_a3); + LD_DP2(b, 2, src_b0, src_b1); + + for (i = (bk - 1); i--;) + { + a += 8; + b += 4; + + LD_DP4(a, 2, src_a4, src_a5, src_a6, src_a7); + LD_DP2(b, 2, src_b2, src_b3); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + src_c2 -= src_a2 * src_b; + src_c3 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + src_c6 -= src_a2 * src_b; + src_c7 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + src_c8 -= src_a0 * src_b; + src_c9 -= src_a1 * src_b; + src_c10 -= src_a2 * src_b; + src_c11 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + src_c12 -= src_a0 * src_b; + src_c13 -= src_a1 * src_b; + src_c14 -= src_a2 * src_b; + src_c15 -= src_a3 * src_b; + + src_a0 = src_a4; + src_a1 = src_a5; + src_a2 = src_a6; + src_a3 = src_a7; + src_b0 = src_b2; + src_b1 = src_b3; + } + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + src_c2 -= src_a2 * src_b; + src_c3 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + src_c6 -= src_a2 * src_b; + src_c7 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + src_c8 -= src_a0 * src_b; + src_c9 -= src_a1 * src_b; + src_c10 -= src_a2 * src_b; + src_c11 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + src_c12 -= src_a0 * src_b; + src_c13 -= src_a1 * src_b; + src_c14 -= src_a2 * src_b; + src_c15 -= src_a3 * src_b; + + a += 8; + b += 4; + } + + src_b0 = LD_DP(b + 0); + src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1); + src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b2 = LD_DP(b + 2); + src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); + src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); + src_b5 = __msa_cast_to_vector_double(*(b + 5)); + src_b5 = (v2f64) __msa_splati_d((v2i64) src_b5, 0); + src_b6 = LD_DP(b + 6); + src_b7 = (v2f64) __msa_splati_d((v2i64) src_b6, 1); + src_b6 = (v2f64) __msa_splati_d((v2i64) src_b6, 0); + src_b10 = LD_DP(b + 10); + src_b11 = (v2f64) __msa_splati_d((v2i64) src_b10, 1); + src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); + src_b15 = __msa_cast_to_vector_double(*(b + 15)); + src_b15 = (v2f64) __msa_splati_d((v2i64) src_b15, 0); + + src_c0 *= src_b0; + src_c1 *= src_b0; + src_c2 *= src_b0; + src_c3 *= src_b0; + + 
src_c4 -= src_c0 * src_b1; + src_c5 -= src_c1 * src_b1; + src_c6 -= src_c2 * src_b1; + src_c7 -= src_c3 * src_b1; + + src_c4 *= src_b5; + src_c5 *= src_b5; + src_c6 *= src_b5; + src_c7 *= src_b5; + + src_c8 -= src_c0 * src_b2; + src_c9 -= src_c1 * src_b2; + src_c10 -= src_c2 * src_b2; + src_c11 -= src_c3 * src_b2; + + src_c8 -= src_c4 * src_b6; + src_c9 -= src_c5 * src_b6; + src_c10 -= src_c6 * src_b6; + src_c11 -= src_c7 * src_b6; + + src_c8 *= src_b10; + src_c9 *= src_b10; + src_c10 *= src_b10; + src_c11 *= src_b10; + + src_c12 -= src_c0 * src_b3; + src_c13 -= src_c1 * src_b3; + src_c14 -= src_c2 * src_b3; + src_c15 -= src_c3 * src_b3; + + src_c12 -= src_c4 * src_b7; + src_c13 -= src_c5 * src_b7; + src_c14 -= src_c6 * src_b7; + src_c15 -= src_c7 * src_b7; + + src_c12 -= src_c8 * src_b11; + src_c13 -= src_c9 * src_b11; + src_c14 -= src_c10 * src_b11; + src_c15 -= src_c11 * src_b11; + + src_c12 *= src_b15; + src_c13 *= src_b15; + src_c14 *= src_b15; + src_c15 *= src_b15; + + ST_DP4(src_c0, src_c1, src_c2, src_c3, c, 2); + ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2); + ST_DP4(src_c4, src_c5, src_c6, src_c7, c_nxt1line, 2); + ST_DP4(src_c4, src_c5, src_c6, src_c7, a + 8, 2); + ST_DP4(src_c8, src_c9, src_c10, src_c11, c_nxt2line, 2); + ST_DP4(src_c8, src_c9, src_c10, src_c11, a + 16, 2); + ST_DP4(src_c12, src_c13, src_c14, src_c15, c_nxt3line, 2); + ST_DP4(src_c12, src_c13, src_c14, src_c15, a + 24, 2); +} + +static void dsolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v2f64 src_b0, src_b1, src_b3, src_b; + + LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3); + LD_DP4(c + ldc, 2, src_c4, src_c5, src_c6, src_c7); + + if (bk) + { + BLASLONG i; + v2f64 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7; + + LD_DP4(a, 2, src_a0, src_a1, src_a2, src_a3); + src_b0 = LD_DP(b); + + a += 8; + b += 2; + + for (i = (bk - 1); i--;) + { + LD_DP4(a, 2, src_a4, src_a5, src_a6, src_a7); + src_b1 = LD_DP(b); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + src_c2 -= src_a2 * src_b; + src_c3 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + src_c6 -= src_a2 * src_b; + src_c7 -= src_a3 * src_b; + + src_a0 = src_a4; + src_a1 = src_a5; + src_a2 = src_a6; + src_a3 = src_a7; + src_b0 = src_b1; + + a += 8; + b += 2; + } + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + src_c2 -= src_a2 * src_b; + src_c3 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + src_c6 -= src_a2 * src_b; + src_c7 -= src_a3 * src_b; + } + + src_b0 = LD_DP(b + 0); + src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1); + src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b3 = __msa_cast_to_vector_double(*(b + 3)); + src_b3 = (v2f64) __msa_splati_d((v2i64) src_b3, 0); + + src_c0 *= src_b0; + src_c1 *= src_b0; + src_c2 *= src_b0; + src_c3 *= src_b0; + + src_c4 -= src_c0 * src_b1; + src_c5 -= src_c1 * src_b1; + src_c6 -= src_c2 * src_b1; + src_c7 -= src_c3 * src_b1; + + src_c4 *= src_b3; + src_c5 *= src_b3; + src_c6 *= src_b3; + src_c7 *= src_b3; + + ST_DP4(src_c0, src_c1, src_c2, src_c3, c, 2); + ST_DP4(src_c4, src_c5, src_c6, src_c7, c + ldc, 2); + + ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2); + 
ST_DP4(src_c4, src_c5, src_c6, src_c7, a + 8, 2); +} + +static void dsolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + v2f64 src_c0, src_c1, src_c2, src_c3; + v2f64 src_b0; + + LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3); + + if (bk) + { + BLASLONG i; + v2f64 src_a0, src_a1, src_a2, src_a3, src_b; + + for (i = bk; i--;) + { + LD_DP4(a, 2, src_a0, src_a1, src_a2, src_a3); + src_b = LD_DP(b); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b, (v2i64) src_b); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + src_c2 -= src_a2 * src_b; + src_c3 -= src_a3 * src_b; + + a += 8; + b += 1; + } + } + + src_b0 = __msa_cast_to_vector_double(*b); + src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + + src_c0 *= src_b0; + src_c1 *= src_b0; + src_c2 *= src_b0; + src_c3 *= src_b0; + + ST_DP4(src_c0, src_c1, src_c2, src_c3, c, 2); + ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2); +} + +static void dsolve_4x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v2f64 src_b0, src_b1, src_b2, src_b3, src_b5, src_b6, src_b7; + v2f64 src_b10, src_b11, src_b15; + + LD_DP2(c, 2, src_c0, src_c1); + LD_DP2(c + ldc, 2, src_c2, src_c3); + LD_DP2(c + 2 * ldc, 2, src_c4, src_c5); + LD_DP2(c + 3 * ldc, 2, src_c6, src_c7); + + if (bk) + { + BLASLONG i; + v2f64 src_a0, src_a1, src_b, src_b0, src_b1; + + for (i = bk; i--;) + { + LD_DP2(a, 2, src_a0, src_a1); + LD_DP2(b, 2, src_b0, src_b1); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c2 -= src_a0 * src_b; + src_c3 -= src_a1 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + src_c6 -= src_a0 * src_b; + src_c7 -= src_a1 * src_b; + + a += 4; + b += 4; + } + } + + src_b0 = LD_DP(b + 0); + src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1); + src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b2 = LD_DP(b + 2); + src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); + src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); + src_b5 = __msa_cast_to_vector_double(*(b + 5)); + src_b5 = (v2f64) __msa_splati_d((v2i64) src_b5, 0); + src_b6 = LD_DP(b + 6); + src_b7 = (v2f64) __msa_splati_d((v2i64) src_b6, 1); + src_b6 = (v2f64) __msa_splati_d((v2i64) src_b6, 0); + src_b10 = LD_DP(b + 10); + src_b11 = (v2f64) __msa_splati_d((v2i64) src_b10, 1); + src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); + src_b15 = __msa_cast_to_vector_double(*(b + 15)); + src_b15 = (v2f64) __msa_splati_d((v2i64) src_b15, 0); + + src_c0 *= src_b0; + src_c1 *= src_b0; + + src_c2 -= src_c0 * src_b1; + src_c3 -= src_c1 * src_b1; + + src_c2 *= src_b5; + src_c3 *= src_b5; + + src_c4 -= src_c0 * src_b2; + src_c5 -= src_c1 * src_b2; + + src_c4 -= src_c2 * src_b6; + src_c5 -= src_c3 * src_b6; + + src_c4 *= src_b10; + src_c5 *= src_b10; + + src_c6 -= src_c0 * src_b3; + src_c7 -= src_c1 * src_b3; + + src_c6 -= src_c2 * src_b7; + src_c7 -= src_c3 * src_b7; + + src_c6 -= src_c4 * src_b11; + src_c7 -= src_c5 * src_b11; + + src_c6 *= src_b15; + src_c7 *= src_b15; + + ST_DP2(src_c0, src_c1, c, 2); + ST_DP2(src_c2, src_c3, c + ldc, 2); + ST_DP2(src_c4, src_c5, c + 2 * ldc, 2); + ST_DP2(src_c6, src_c7, c + 3 * ldc, 2); + + ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2); + ST_DP4(src_c4, src_c5, src_c6, src_c7, a + 8, 2); +} + 
+static void dsolve_4x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v2f64 src_c0, src_c1, src_c2, src_c3, src_b0, src_b1, src_b3; + + LD_DP2(c, 2, src_c0, src_c1); + LD_DP2(c + ldc, 2, src_c2, src_c3); + + if (bk) + { + BLASLONG i; + v2f64 src_a0, src_a1, src_b, src_b0; + + for (i = bk; i--;) + { + LD_DP2(a, 2, src_a0, src_a1); + src_b0 = LD_DP(b); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c2 -= src_a0 * src_b; + src_c3 -= src_a1 * src_b; + + a += 4; + b += 2; + } + } + + src_b0 = LD_DP(b + 0); + src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1); + src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b3 = __msa_cast_to_vector_double(*(b + 3)); + src_b3 = (v2f64) __msa_splati_d((v2i64) src_b3, 0); + + src_c0 *= src_b0; + src_c1 *= src_b0; + + src_c2 -= src_c0 * src_b1; + src_c3 -= src_c1 * src_b1; + + src_c2 *= src_b3; + src_c3 *= src_b3; + + ST_DP2(src_c0, src_c1, c, 2); + ST_DP2(src_c2, src_c3, c + ldc, 2); + + ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2); +} + +static void dsolve_4x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + FLOAT c0, c1, c2, c3; + + c0 = *(c + 0); + c1 = *(c + 1); + c2 = *(c + 2); + c3 = *(c + 3); + + if (bk) + { + BLASLONG i; + + for (i = bk; i--;) + { + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; + c2 -= a[2] * b[0]; + c3 -= a[3] * b[0]; + + a += 4; + b += 1; + } + } + + c0 *= *b; + c1 *= *b; + c2 *= *b; + c3 *= *b; + + *(a + 0) = c0; + *(a + 1) = c1; + *(a + 2) = c2; + *(a + 3) = c3; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 2) = c2; + *(c + 3) = c3; +} + +static void dsolve_2x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT b0, b1, b2, b3, b5, b6, b7, b10, b11, b15; + FLOAT c0, c0_nxt1, c0_nxt2, c0_nxt3; + FLOAT c1, c1_nxt1, c1_nxt2, c1_nxt3; + + c0 = *(c + 0); + c1 = *(c + 1); + c0_nxt1 = *(c + 0 + 1 * ldc); + c1_nxt1 = *(c + 1 + 1 * ldc); + c0_nxt2 = *(c + 0 + 2 * ldc); + c1_nxt2 = *(c + 1 + 2 * ldc); + c0_nxt3 = *(c + 0 + 3 * ldc); + c1_nxt3 = *(c + 1 + 3 * ldc); + + if (bk) + { + BLASLONG i; + + for (i = bk; i--;) + { + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; + c0_nxt1 -= a[0] * b[1]; + c1_nxt1 -= a[1] * b[1]; + c0_nxt2 -= a[0] * b[2]; + c1_nxt2 -= a[1] * b[2]; + c0_nxt3 -= a[0] * b[3]; + c1_nxt3 -= a[1] * b[3]; + + a += 2; + b += 4; + } + } + + b0 = *(b + 0); + b1 = *(b + 1); + b2 = *(b + 2); + b3 = *(b + 3); + b5 = *(b + 5); + b6 = *(b + 6); + b7 = *(b + 7); + b10 = *(b + 10); + b11 = *(b + 11); + b15 = *(b + 15); + + c0 *= b0; + c1 *= b0; + + c0_nxt1 -= c0 * b1; + c1_nxt1 -= c1 * b1; + c0_nxt1 *= b5; + c1_nxt1 *= b5; + + c0_nxt2 -= c0 * b2; + c1_nxt2 -= c1 * b2; + c0_nxt2 -= c0_nxt1 * b6; + c1_nxt2 -= c1_nxt1 * b6; + c0_nxt2 *= b10; + c1_nxt2 *= b10; + + c0_nxt3 -= c0 * b3; + c1_nxt3 -= c1 * b3; + c0_nxt3 -= c0_nxt1 * b7; + c1_nxt3 -= c1_nxt1 * b7; + c0_nxt3 -= c0_nxt2 * b11; + c1_nxt3 -= c1_nxt2 * b11; + c0_nxt3 *= b15; + c1_nxt3 *= b15; + + *(a + 0) = c0; + *(a + 1) = c1; + *(a + 2) = c0_nxt1; + *(a + 3) = c1_nxt1; + *(a + 4) = c0_nxt2; + *(a + 5) = c1_nxt2; + *(a + 6) = c0_nxt3; + *(a + 7) = c1_nxt3; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 1 * ldc) = c0_nxt1; + *(c + 1 + 1 * ldc) = c1_nxt1; + *(c + 2 * ldc) = c0_nxt2; + *(c + 1 + 2 * ldc) = c1_nxt2; + *(c + 3 * ldc) = c0_nxt3; + *(c + 1 + 3 * ldc) = c1_nxt3; +} + +static void dsolve_2x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT b0, b1, b3, c0, c0_nxt, 
c1, c1_nxt; + + c0 = *(c + 0); + c1 = *(c + 1); + c0_nxt = *(c + 0 + ldc); + c1_nxt = *(c + 1 + ldc); + + if (bk) + { + BLASLONG i; + + for (i = bk; i--;) + { + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; + + c0_nxt -= a[0] * b[1]; + c1_nxt -= a[1] * b[1]; + + a += 2; + b += 2; + } + } + + b0 = *(b + 0); + b1 = *(b + 1); + b3 = *(b + 3); + + c0 *= b0; + c1 *= b0; + + c0_nxt -= c0 * b1; + c1_nxt -= c1 * b1; + + c0_nxt *= b3; + c1_nxt *= b3; + + *(a + 0) = c0; + *(a + 1) = c1; + *(a + 2) = c0_nxt; + *(a + 3) = c1_nxt; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 0 + ldc) = c0_nxt; + *(c + 1 + ldc) = c1_nxt; +} + +static void dsolve_2x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + FLOAT b0, c0, c1; + + c0 = *(c + 0); + c1 = *(c + 1); + + if (bk) + { + BLASLONG i; + + for (i = bk; i--;) + { + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; + + a += 2; + b += 1; + } + } + + b0 = *b; + + c0 *= b0; + c1 *= b0; + + *(a + 0) = c0; + *(a + 1) = c1; + + *(c + 0) = c0; + *(c + 1) = c1; +} + +static void dsolve_1x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT b0, b1, b2, b3, b5, b6, b7, b10, b11, b15, c0, c1, c2, c3; + + c0 = *(c + 0); + c1 = *(c + 1 * ldc); + c2 = *(c + 2 * ldc); + c3 = *(c + 3 * ldc); + + if (bk) + { + BLASLONG i; + + for (i = bk; i--;) + { + c0 -= a[0] * b[0]; + c1 -= a[0] * b[1]; + c2 -= a[0] * b[2]; + c3 -= a[0] * b[3]; + + a += 1; + b += 4; + } + } + + b0 = *(b + 0); + b1 = *(b + 1); + b2 = *(b + 2); + b3 = *(b + 3); + b5 = *(b + 5); + b6 = *(b + 6); + b7 = *(b + 7); + b10 = *(b + 10); + b11 = *(b + 11); + b15 = *(b + 15); + + c0 *= b0; + + c1 -= c0 * b1; + c1 *= b5; + + c2 -= c0 * b2; + c2 -= c1 * b6; + c2 *= b10; + + c3 -= c0 * b3; + c3 -= c1 * b7; + c3 -= c2 * b11; + c3 *= b15; + + *(a + 0) = c0; + *(a + 1) = c1; + *(a + 2) = c2; + *(a + 3) = c3; + + *(c + 0) = c0; + *(c + 1 * ldc) = c1; + *(c + 2 * ldc) = c2; + *(c + 3 * ldc) = c3; +} + +static void dsolve_1x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT b0, b1, b3, c0, c1; + + c0 = *c; + c1 = *(c + ldc); + + if (bk) + { + BLASLONG i; + + for (i = bk; i--;) + { + c0 -= *a * b[0]; + c1 -= *a * b[1]; + + a += 1; + b += 2; + } + } + + b0 = *(b + 0); + b1 = *(b + 1); + b3 = *(b + 3); + + c0 *= b0; + + c1 -= c0 * b1; + c1 *= b3; + + *(a + 0) = c0; + *(a + 1) = c1; + + *(c + 0) = c0; + *(c + ldc) = c1; +} + +static void dgmm_dsolve_1x1_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + if (bk) + { + BLASLONG i; + + for (i = bk; i--;) + { + *c -= *a * *b; + + a += 1; + b += 1; + } + } + + *c *= *a; + *b = *c; +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, + FLOAT *c, BLASLONG ldc, BLASLONG offset) +{ + BLASLONG i, j, kk; + FLOAT *aa, *cc; + + kk = -offset; + + for (j = (n >> 2); j--;) + { + aa = a; + cc = c; + + for (i = (m >> 3); i--;) + { + dsolve_8x4_rn_msa(aa, b, cc, ldc, kk); + + aa += 8 * k; + cc += 8; + } + + if (m & 7) + { + if (m & 4) + { + dsolve_4x4_rn_msa(aa, b, cc, ldc, kk); + + aa += 4 * k; + cc += 4; + } + + if (m & 2) + { + dsolve_2x4_rn_msa(aa, b, cc, ldc, kk); + + aa += 2 * k; + cc += 2; + } + + if (m & 1) + { + dsolve_1x4_rn_msa(aa, b, cc, ldc, kk); + + aa += k; + cc += 1; + } + } + + kk += 4; + b += 4 * k; + c += 4 * ldc; + } + + if (n & 3) + { + if (n & 2) + { + aa = a; + cc = c; + + for (i = (m >> 3); i--;) + { + dsolve_8x2_rn_msa(aa, b, cc, ldc, kk); + + aa += 8 * k; + cc += 8; + } + + if (m & 7) + { + if (m & 4) + { + dsolve_4x2_rn_msa(aa, b, cc, ldc, kk); + + aa += 4 * k; + cc += 4; + } + + if (m & 2) + { + 
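+                    /* remaining 2-row strip against the current 2-column panel of B */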
dsolve_2x2_rn_msa(aa, b, cc, ldc, kk); + + aa += 2 * k; + cc += 2; + } + + if (m & 1) + { + dsolve_1x2_rn_msa(aa, b, cc, ldc, kk); + + aa += k; + cc += 1; + } + } + + b += 2 * k; + c += 2 * ldc; + kk += 2; + } + + if (n & 1) + { + aa = a; + cc = c; + + for (i = (m >> 3); i--;) + { + dsolve_8x1_rn_msa(aa, b, cc, kk); + + aa += 8 * k; + cc += 8; + } + + if (m & 7) + { + if (m & 4) + { + dsolve_4x1_rn_msa(aa, b, cc, kk); + + aa += 4 * k; + cc += 4; + } + + if (m & 2) + { + dsolve_2x1_rn_msa(aa, b, cc, kk); + + aa += 2 * k; + cc += 2; + } + + if (m & 1) + { + dgmm_dsolve_1x1_msa(b, aa, cc, kk); + + aa += k; + cc += 1; + } + } + + b += k; + c += ldc; + kk += 1; + } + } + + return 0; +} diff --git a/kernel/mips/dtrsm_kernel_RT_8x4_msa.c b/kernel/mips/dtrsm_kernel_RT_8x4_msa.c new file mode 100644 index 0000000000..49274e5bc6 --- /dev/null +++ b/kernel/mips/dtrsm_kernel_RT_8x4_msa.c @@ -0,0 +1,1015 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +static void dsolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v2f64 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15; + v2f64 src_b0, src_b4, src_b5, src_b8, src_b9, src_b10, src_b12, src_b13; + v2f64 src_b14, src_b15; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * ldc; + + LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3); + LD_DP4(c_nxt1line, 2, src_c4, src_c5, src_c6, src_c7); + LD_DP4(c_nxt2line, 2, src_c8, src_c9, src_c10, src_c11); + LD_DP4(c_nxt3line, 2, src_c12, src_c13, src_c14, src_c15); + + if (bk > 0) + { + BLASLONG i; + FLOAT *pba = a, *pbb = b; + v2f64 src_b, src_b0, src_b1, src_b2, src_b3; + v2f64 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7; + + LD_DP4(pba, 2, src_a0, src_a1, src_a2, src_a3); + LD_DP2(pbb, 2, src_b0, src_b1); + + for (i = (bk - 1); i--;) + { + pba += 8; + pbb += 4; + + LD_DP4(pba, 2, src_a4, src_a5, src_a6, src_a7); + LD_DP2(pbb, 2, src_b2, src_b3); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + src_c2 -= src_a2 * src_b; + src_c3 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + src_c6 -= src_a2 * src_b; + src_c7 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + src_c8 -= src_a0 * src_b; + src_c9 -= src_a1 * src_b; + src_c10 -= src_a2 * src_b; + src_c11 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + src_c12 -= src_a0 * src_b; + src_c13 -= src_a1 * src_b; + src_c14 -= src_a2 * src_b; + src_c15 -= src_a3 * src_b; + + src_a0 = src_a4; + src_a1 = src_a5; + src_a2 = src_a6; + src_a3 = src_a7; + src_b0 = src_b2; + src_b1 = src_b3; + } + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + src_c2 -= src_a2 * src_b; + src_c3 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + src_c6 -= src_a2 * src_b; + src_c7 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + src_c8 -= src_a0 * src_b; + src_c9 -= src_a1 * src_b; + src_c10 -= src_a2 * src_b; + src_c11 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + src_c12 -= src_a0 * src_b; + src_c13 -= src_a1 * src_b; + src_c14 -= src_a2 * src_b; + src_c15 -= src_a3 * src_b; + } + + a -= 32; + b -= 16; + + src_b12 = LD_DP(b + 12); + src_b13 = (v2f64) __msa_splati_d((v2i64) src_b12, 1); + src_b12 = (v2f64) __msa_splati_d((v2i64) src_b12, 0); + src_b14 = LD_DP(b + 14); + src_b15 = (v2f64) __msa_splati_d((v2i64) src_b14, 1); + src_b14 = (v2f64) __msa_splati_d((v2i64) src_b14, 0); + + src_b8 = LD_DP(b + 8); + src_b9 = (v2f64) __msa_splati_d((v2i64) src_b8, 1); + src_b8 = (v2f64) __msa_splati_d((v2i64) src_b8, 0); + src_b10 = __msa_cast_to_vector_double(*(b + 10)); + src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); + + src_b0 = __msa_cast_to_vector_double(*(b + 0)); + src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b4 = LD_DP(b + 4); + src_b5 = (v2f64) __msa_splati_d((v2i64) src_b4, 1); + src_b4 = (v2f64) __msa_splati_d((v2i64) src_b4, 
0); + + src_c12 *= src_b15; + src_c13 *= src_b15; + src_c14 *= src_b15; + src_c15 *= src_b15; + + src_c8 -= src_c12 * src_b14; + src_c9 -= src_c13 * src_b14; + src_c10 -= src_c14 * src_b14; + src_c11 -= src_c15 * src_b14; + + src_c8 *= src_b10; + src_c9 *= src_b10; + src_c10 *= src_b10; + src_c11 *= src_b10; + + src_c4 -= src_c12 * src_b13; + src_c5 -= src_c13 * src_b13; + src_c6 -= src_c14 * src_b13; + src_c7 -= src_c15 * src_b13; + + src_c4 -= src_c8 * src_b9; + src_c5 -= src_c9 * src_b9; + src_c6 -= src_c10 * src_b9; + src_c7 -= src_c11 * src_b9; + + src_c4 *= src_b5; + src_c5 *= src_b5; + src_c6 *= src_b5; + src_c7 *= src_b5; + + src_c0 -= src_c12 * src_b12; + src_c1 -= src_c13 * src_b12; + src_c2 -= src_c14 * src_b12; + src_c3 -= src_c15 * src_b12; + + src_c0 -= src_c8 * src_b8; + src_c1 -= src_c9 * src_b8; + src_c2 -= src_c10 * src_b8; + src_c3 -= src_c11 * src_b8; + + src_c0 -= src_c4 * src_b4; + src_c1 -= src_c5 * src_b4; + src_c2 -= src_c6 * src_b4; + src_c3 -= src_c7 * src_b4; + + src_c0 *= src_b0; + src_c1 *= src_b0; + src_c2 *= src_b0; + src_c3 *= src_b0; + + ST_DP4(src_c12, src_c13, src_c14, src_c15, c_nxt3line, 2); + ST_DP4(src_c12, src_c13, src_c14, src_c15, a + 24, 2); + ST_DP4(src_c8, src_c9, src_c10, src_c11, c_nxt2line, 2); + ST_DP4(src_c8, src_c9, src_c10, src_c11, a + 16, 2); + ST_DP4(src_c4, src_c5, src_c6, src_c7, c_nxt1line, 2); + ST_DP4(src_c4, src_c5, src_c6, src_c7, a + 8, 2); + ST_DP4(src_c0, src_c1, src_c2, src_c3, c, 2); + ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2); +} + +static void dsolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v2f64 src_b0, src_b2, src_b3; + + LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3); + LD_DP4(c + ldc, 2, src_c4, src_c5, src_c6, src_c7); + + if (bk > 0) + { + BLASLONG i; + FLOAT *pba = a, *pbb = b; + v2f64 src_b, src_b1, src_a0, src_a1, src_a2, src_a3; + v2f64 src_a4, src_a5, src_a6, src_a7; + + LD_DP4(pba, 2, src_a0, src_a1, src_a2, src_a3); + src_b0 = LD_DP(pbb); + + for (i = bk - 1; i--;) + { + pba += 8; + pbb += 2; + + LD_DP4(pba, 2, src_a4, src_a5, src_a6, src_a7); + src_b1 = LD_DP(pbb); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + src_c2 -= src_a2 * src_b; + src_c3 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + src_c6 -= src_a2 * src_b; + src_c7 -= src_a3 * src_b; + + src_a0 = src_a4; + src_a1 = src_a5; + src_a2 = src_a6; + src_a3 = src_a7; + src_b0 = src_b1; + } + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + src_c2 -= src_a2 * src_b; + src_c3 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + src_c6 -= src_a2 * src_b; + src_c7 -= src_a3 * src_b; + } + + a -= 16; + b -= 4; + + src_b0 = __msa_cast_to_vector_double(*(b + 0)); + src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b2 = LD_DP(b + 2); + src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); + src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); + + src_c4 *= src_b3; + src_c5 *= src_b3; + src_c6 *= src_b3; + src_c7 *= src_b3; + + src_c0 -= src_c4 * src_b2; + src_c1 -= src_c5 * src_b2; + src_c2 -= src_c6 * src_b2; + src_c3 -= src_c7 * src_b2; + + src_c0 *= src_b0; + src_c1 *= src_b0; + src_c2 *= src_b0; + src_c3 *= src_b0; + + 
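+    /* write the solved 8x2 block back to both columns of C and to the packed A buffer */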
ST_DP4(src_c0, src_c1, src_c2, src_c3, c, 2); + ST_DP4(src_c4, src_c5, src_c6, src_c7, c + ldc, 2); + + ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2); + ST_DP4(src_c4, src_c5, src_c6, src_c7, a + 8, 2); +} + +static void dsolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + v2f64 src_c0, src_c1, src_c2, src_c3; + v2f64 src_b0; + + LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3); + + if (bk > 0) + { + BLASLONG i; + FLOAT *aa = a, *bb = b; + v2f64 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7; + v2f64 src_b1; + + LD_DP4(aa, 2, src_a0, src_a1, src_a2, src_a3); + src_b0 = LD_DP(bb); + + aa += 8; + bb += 1; + + for (i = (bk - 1); i--;) + { + LD_DP4(aa, 2, src_a4, src_a5, src_a6, src_a7); + src_b1 = LD_DP(bb); + + src_b0 = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a2 * src_b0; + src_c3 -= src_a3 * src_b0; + + src_a0 = src_a4; + src_a1 = src_a5; + src_a2 = src_a6; + src_a3 = src_a7; + src_b0 = src_b1; + + aa += 8; + bb += 1; + } + + src_b0 = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a2 * src_b0; + src_c3 -= src_a3 * src_b0; + } + + a -= 8; + b -= 1; + + src_b0 = __msa_cast_to_vector_double(*b); + src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + + src_c0 *= src_b0; + src_c1 *= src_b0; + src_c2 *= src_b0; + src_c3 *= src_b0; + + ST_DP4(src_c0, src_c1, src_c2, src_c3, c, 2); + ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2); +} + +static void dsolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v2f64 src_b0, src_b4, src_b5, src_b8, src_b9, src_b10, src_b12, src_b13; + v2f64 src_b14, src_b15; + + LD_DP2(c, 2, src_c0, src_c1); + LD_DP2(c + ldc, 2, src_c2, src_c3); + LD_DP2(c + 2 * ldc, 2, src_c4, src_c5); + LD_DP2(c + 3 * ldc, 2, src_c6, src_c7); + + if (bk > 0) + { + BLASLONG i; + FLOAT *aa = a, *bb = b; + v2f64 src_a0, src_a1, src_b, src_b0, src_b1; + + for (i = bk; i--;) + { + LD_DP2(aa, 2, src_a0, src_a1); + LD_DP2(bb, 2, src_b0, src_b1); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c2 -= src_a0 * src_b; + src_c3 -= src_a1 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + src_c6 -= src_a0 * src_b; + src_c7 -= src_a1 * src_b; + + aa += 4; + bb += 4; + } + } + + a -= 16; + b -= 16; + + src_b12 = LD_DP(b + 12); + src_b13 = (v2f64) __msa_splati_d((v2i64) src_b12, 1); + src_b12 = (v2f64) __msa_splati_d((v2i64) src_b12, 0); + src_b14 = LD_DP(b + 14); + src_b15 = (v2f64) __msa_splati_d((v2i64) src_b14, 1); + src_b14 = (v2f64) __msa_splati_d((v2i64) src_b14, 0); + + src_b8 = LD_DP(b + 8); + src_b9 = (v2f64) __msa_splati_d((v2i64) src_b8, 1); + src_b8 = (v2f64) __msa_splati_d((v2i64) src_b8, 0); + src_b10 = __msa_cast_to_vector_double(*(b + 10)); + src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); + + src_b0 = __msa_cast_to_vector_double(*(b + 0)); + src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b4 = LD_DP(b + 4); + src_b5 = (v2f64) __msa_splati_d((v2i64) src_b4, 1); + src_b4 = (v2f64) __msa_splati_d((v2i64) src_b4, 0); + + src_c6 *= src_b15; + src_c7 *= src_b15; + + src_c4 -= src_c6 * src_b14; + src_c5 -= 
src_c7 * src_b14; + + src_c4 *= src_b10; + src_c5 *= src_b10; + + src_c2 -= src_c6 * src_b13; + src_c3 -= src_c7 * src_b13; + + src_c2 -= src_c4 * src_b9; + src_c3 -= src_c5 * src_b9; + + src_c2 *= src_b5; + src_c3 *= src_b5; + + src_c0 -= src_c6 * src_b12; + src_c1 -= src_c7 * src_b12; + + src_c0 -= src_c4 * src_b8; + src_c1 -= src_c5 * src_b8; + + src_c0 -= src_c2 * src_b4; + src_c1 -= src_c3 * src_b4; + + src_c0 *= src_b0; + src_c1 *= src_b0; + + ST_DP2(src_c6, src_c7, c + 3 * ldc, 2); + ST_DP2(src_c4, src_c5, c + 2 * ldc, 2); + ST_DP2(src_c2, src_c3, c + ldc, 2); + ST_DP2(src_c0, src_c1, c, 2); + + ST_DP4(src_c4, src_c5, src_c6, src_c7, a + 8, 2); + ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2); +} + +static void dsolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v2f64 src_c0, src_c1, src_c2, src_c3, src_b0, src_b2, src_b3; + + LD_DP2(c, 2, src_c0, src_c1); + LD_DP2(c + ldc, 2, src_c2, src_c3); + + if (bk > 0) + { + BLASLONG i; + FLOAT *aa = a, *bb = b; + v2f64 src_a0, src_a1, src_b, src_b0; + + for (i = bk; i--;) + { + LD_DP2(aa, 2, src_a0, src_a1); + src_b0 = LD_DP(bb); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c2 -= src_a0 * src_b; + src_c3 -= src_a1 * src_b; + + aa += 4; + bb += 2; + } + } + + a -= 8; + b -= 4; + + src_b0 = __msa_cast_to_vector_double(*(b + 0)); + src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b2 = LD_DP(b + 2); + src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); + src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); + + src_c2 *= src_b3; + src_c3 *= src_b3; + + src_c0 -= src_c2 * src_b2; + src_c1 -= src_c3 * src_b2; + + src_c0 *= src_b0; + src_c1 *= src_b0; + + ST_DP2(src_c0, src_c1, c, 2); + ST_DP2(src_c2, src_c3, c + ldc, 2); + + ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2); +} + +static void dsolve_4x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + FLOAT b0, c0, c1, c2, c3; + + c0 = *(c + 0); + c1 = *(c + 1); + c2 = *(c + 2); + c3 = *(c + 3); + + if (bk > 0) + { + BLASLONG i; + FLOAT *aa = a, *bb = b; + + for (i = bk; i--;) + { + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; + c2 -= aa[2] * bb[0]; + c3 -= aa[3] * bb[0]; + + aa += 4; + bb += 1; + } + } + + a -= 4; + + b0 = *(b - 1); + + c0 *= b0; + c1 *= b0; + c2 *= b0; + c3 *= b0; + + *(a + 0) = c0; + *(a + 1) = c1; + *(a + 2) = c2; + *(a + 3) = c3; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 2) = c2; + *(c + 3) = c3; +} + +static void dsolve_2x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT b0, b4, b5, b8, b9, b10, b12, b13, b14, b15; + FLOAT c0, c1, c0_nxt1, c1_nxt1, c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3; + + c0 = *(c + 0); + c1 = *(c + 1); + c0_nxt1 = *(c + 0 + 1 * ldc); + c1_nxt1 = *(c + 1 + 1 * ldc); + c0_nxt2 = *(c + 0 + 2 * ldc); + c1_nxt2 = *(c + 1 + 2 * ldc); + c0_nxt3 = *(c + 0 + 3 * ldc); + c1_nxt3 = *(c + 1 + 3 * ldc); + + if (bk > 0) + { + BLASLONG i; + FLOAT *aa = a, *bb = b; + + for (i = bk; i--;) + { + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; + c0_nxt1 -= aa[0] * bb[1]; + c1_nxt1 -= aa[1] * bb[1]; + c0_nxt2 -= aa[0] * bb[2]; + c1_nxt2 -= aa[1] * bb[2]; + c0_nxt3 -= aa[0] * bb[3]; + c1_nxt3 -= aa[1] * bb[3]; + + aa += 2; + bb += 4; + } + } + + a -= 8; + b -= 16; + + b0 = *b; + b4 = *(b + 4); + b5 = *(b + 5); + b8 = *(b + 8); + b9 = *(b + 9); + b10 = *(b + 10); + b12 = *(b + 12); + b13 = *(b + 13); + b14 = *(b + 14); + b15 = *(b + 15); + + c0_nxt3 *= b15; + c1_nxt3 *= b15; + 
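+    /* back-substitute the solved fourth column into the remaining three columns */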
+ c0_nxt2 -= c0_nxt3 * b14; + c1_nxt2 -= c1_nxt3 * b14; + c0_nxt2 *= b10; + c1_nxt2 *= b10; + + c0_nxt1 -= c0_nxt3 * b13; + c1_nxt1 -= c1_nxt3 * b13; + c0_nxt1 -= c0_nxt2 * b9; + c1_nxt1 -= c1_nxt2 * b9; + c0_nxt1 *= b5; + c1_nxt1 *= b5; + + c0 -= c0_nxt3 * b12; + c1 -= c1_nxt3 * b12; + c0 -= c0_nxt2 * b8; + c1 -= c1_nxt2 * b8; + c0 -= c0_nxt1 * b4; + c1 -= c1_nxt1 * b4; + c0 *= b0; + c1 *= b0; + + *(a + 0) = c0; + *(a + 1) = c1; + *(a + 2) = c0_nxt1; + *(a + 3) = c1_nxt1; + *(a + 4) = c0_nxt2; + *(a + 5) = c1_nxt2; + *(a + 6) = c0_nxt3; + *(a + 7) = c1_nxt3; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 0 + 1 * ldc) = c0_nxt1; + *(c + 1 + 1 * ldc) = c1_nxt1; + *(c + 0 + 2 * ldc) = c0_nxt2; + *(c + 1 + 2 * ldc) = c1_nxt2; + *(c + 0 + 3 * ldc) = c0_nxt3; + *(c + 1 + 3 * ldc) = c1_nxt3; +} + +static void dsolve_2x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT b0, b2, b3, c0, c1, c0_nxt, c1_nxt; + + c0 = *(c + 0); + c1 = *(c + 1); + c0_nxt = *(c + 0 + ldc); + c1_nxt = *(c + 1 + ldc); + + if (bk > 0) + { + BLASLONG i; + FLOAT *aa = a, *bb = b; + + for (i = bk; i--;) + { + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; + + c0_nxt -= aa[0] * bb[1]; + c1_nxt -= aa[1] * bb[1]; + + aa += 2; + bb += 2; + } + } + + a -= 4; + b -= 4; + + b3 = *(b + 3); + b2 = *(b + 2); + b0 = *b; + + c0_nxt *= b3; + c1_nxt *= b3; + + c0 -= c0_nxt * b2; + c0 *= b0; + + c1 -= c1_nxt * b2; + c1 *= b0; + + *(a + 0) = c0; + *(a + 1) = c1; + *(a + 2) = c0_nxt; + *(a + 3) = c1_nxt; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 0 + ldc) = c0_nxt; + *(c + 1 + ldc) = c1_nxt; +} + +static void dsolve_2x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + FLOAT b0, c0, c1; + + c0 = *(c + 0); + c1 = *(c + 1); + + if (bk > 0) + { + BLASLONG i; + FLOAT *aa = a, *bb = b; + + for (i = bk; i--;) + { + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; + + aa += 2; + bb += 1; + } + } + + b0 = *(b - 1); + + c0 *= b0; + c1 *= b0; + + *(a - 2) = c0; + *(a - 1) = c1; + + *(c + 0) = c0; + *(c + 1) = c1; +} + +static void dsolve_1x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT b0, b4, b5, b8, b9, b10, b12, b13, b14, b15, c0, c1, c2, c3; + + c0 = *(c + 0); + c1 = *(c + 1 * ldc); + c2 = *(c + 2 * ldc); + c3 = *(c + 3 * ldc); + + if (bk > 0) + { + BLASLONG i; + FLOAT *aa = a, *bb = b; + + for (i = bk; i--;) + { + c0 -= aa[0] * bb[0]; + c1 -= aa[0] * bb[1]; + c2 -= aa[0] * bb[2]; + c3 -= aa[0] * bb[3]; + + aa += 1; + bb += 4; + } + } + + a -= 4; + b -= 16; + + b0 = *b; + b4 = *(b + 4); + b5 = *(b + 5); + b8 = *(b + 8); + b9 = *(b + 9); + b10 = *(b + 10); + b12 = *(b + 12); + b13 = *(b + 13); + b14 = *(b + 14); + b15 = *(b + 15); + + c3 *= b15; + + c2 -= c3 * b14; + c2 *= b10; + + c1 -= c3 * b13; + c1 -= c2 * b9; + c1 *= b5; + + c0 -= c3 * b12; + c0 -= c2 * b8; + c0 -= c1 * b4; + c0 *= b0; + + *(a + 0) = c0; + *(a + 1) = c1; + *(a + 2) = c2; + *(a + 3) = c3; + + *(c + 0 * ldc) = c0; + *(c + 1 * ldc) = c1; + *(c + 2 * ldc) = c2; + *(c + 3 * ldc) = c3; +} + +static void dsolve_1x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT b0, b2, b3, c0, c1; + + c0 = *(c + 0); + c1 = *(c + ldc); + + if (bk > 0) + { + BLASLONG i; + FLOAT *aa = a, *bb = b; + + for (i = bk; i--;) + { + c0 -= *aa * bb[0]; + c1 -= *aa * bb[1]; + + aa += 1; + bb += 2; + } + } + + a -= 2; + b -= 4; + + b3 = *(b + 3); + b2 = *(b + 2); + b0 = *b; + + c1 *= b3; + + c0 -= c1 * b2; + c0 *= b0; + + *(a + 0) = c0; + *(a + 1) = c1; + + *(c + 0) = c0; + *(c + ldc) = c1; +} + +static void dsolve_1x1_rt_msa(FLOAT *a, 
FLOAT *b, FLOAT *c, BLASLONG bk) +{ + if (bk > 0) + { + BLASLONG i; + + for (i = 0; i < bk; i++) + { + *c -= a[i] * b[i]; + } + } + + *c *= *(b - 1); + *(a - 1) = *c; +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, + FLOAT *c, BLASLONG ldc, BLASLONG offset) +{ + BLASLONG i, j, kk; + FLOAT *aa, *cc, *bb; + + kk = n - offset; + c += n * ldc; + b += n * k; + + if (n & 3) + { + if (n & 1) + { + aa = a; + c -= ldc; + b -= k; + bb = b + kk; + cc = c; + + for (i = (m >> 3); i--;) + { + dsolve_8x1_rt_msa(aa + 8 * kk, bb, cc, k - kk); + + aa += 8 * k; + cc += 8; + } + + if (m & 7) + { + if (m & 4) + { + dsolve_4x1_rt_msa(aa + 4 * kk, bb, cc, k - kk); + + aa += 4 * k; + cc += 4; + } + + if (m & 2) + { + dsolve_2x1_rt_msa(aa + 2 * kk, bb, cc, k - kk); + + aa += 2 * k; + cc += 2; + } + + if (m & 1) + { + dsolve_1x1_rt_msa(aa + kk, bb, cc, k - kk); + + aa += k; + cc += 1; + } + + } + + kk -= 1; + } + + if (n & 2) + { + aa = a; + c -= 2 * ldc; + b -= 2 * k; + bb = b + 2 * kk; + cc = c; + + for (i = (m >> 3); i--;) + { + dsolve_8x2_rt_msa(aa + 8 * kk, bb, cc, ldc, k - kk); + + aa += 8 * k; + cc += 8; + } + + if (m & 7) + { + if (m & 4) + { + dsolve_4x2_rt_msa(aa + 4 * kk, bb, cc, ldc, k - kk); + + aa += 4 * k; + cc += 4; + } + + if (m & 2) + { + dsolve_2x2_rt_msa(aa + 2 * kk, bb, cc, ldc, k - kk); + + aa += 2 * k; + cc += 2; + } + + if (m & 1) + { + dsolve_1x2_rt_msa(aa + kk, bb, cc, ldc, k - kk); + + aa += k; + cc += 1; + } + } + + kk -= 2; + } + } + + for (j = (n >> 2); j--;) + { + aa = a; + b -= 4 * k; + bb = b + 4 * kk; + c -= 4 * ldc; + cc = c; + + for (i = (m >> 3); i--;) + { + dsolve_8x4_rt_msa(aa + kk * 8, bb, cc, ldc, k - kk); + + aa += 8 * k; + cc += 8; + } + + if (m & 7) + { + if (m & 4) + { + dsolve_4x4_rt_msa(aa + kk * 4, bb, cc, ldc, k - kk); + + aa += 4 * k; + cc += 4; + } + + if (m & 2) + { + dsolve_2x4_rt_msa(aa + kk * 2, bb, cc, ldc, k - kk); + + aa += 2 * k; + cc += 2; + } + + if (m & 1) + { + dsolve_1x4_rt_msa(aa + kk, bb, cc, ldc, k - kk); + + aa += k; + cc += 1; + } + } + + kk -= 4; + } + + return 0; +} diff --git a/kernel/mips/gemv_n.c b/kernel/mips/gemv_n.c new file mode 100644 index 0000000000..4cc1772097 --- /dev/null +++ b/kernel/mips/gemv_n.c @@ -0,0 +1,56 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i; + BLASLONG ix,iy; + BLASLONG j; + FLOAT *a_ptr; + FLOAT temp; + + ix = 0; + a_ptr = a; + + for (j=0; j + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + BLASLONG max=0; + + if (n <= 0 || inc_x <= 0) return(max); + + maxf=ABS(x[0]); + ix += inc_x; + i++; + + while(i < n) + { + if( ABS(x[ix]) > maxf ) + { + max = i; + maxf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(max+1); +} + + diff --git a/kernel/mips/iamin.c b/kernel/mips/iamin.c new file mode 100644 index 0000000000..7f1c4d9057 --- /dev/null +++ b/kernel/mips/iamin.c @@ -0,0 +1,68 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + BLASLONG min=0; + + if (n <= 0 || inc_x <= 0) return(min); + + minf=ABS(x[0]); + ix += inc_x; + i++; + + while(i < n) + { + if( ABS(x[ix]) < ABS(minf) ) + { + min = i; + minf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(min+1); +} + + diff --git a/kernel/mips/imax.c b/kernel/mips/imax.c new file mode 100644 index 0000000000..744bfc0d9b --- /dev/null +++ b/kernel/mips/imax.c @@ -0,0 +1,59 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + BLASLONG max=0; + + if (n <= 0 || inc_x <= 0) return(max); + + maxf=x[0]; + ix += inc_x; + i++; + + while(i < n) + { + if( x[ix] > maxf ) + { + max = i; + maxf = x[ix]; + } + ix += inc_x; + i++; + } + return(max+1); +} + + diff --git a/kernel/mips/imin.c b/kernel/mips/imin.c new file mode 100644 index 0000000000..d9b283d2d9 --- /dev/null +++ b/kernel/mips/imin.c @@ -0,0 +1,59 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + BLASLONG min=0; + + if (n <= 0 || inc_x <= 0) return(min); + + minf=x[0]; + ix += inc_x; + i++; + + while(i < n) + { + if( x[ix] > minf ) + { + min = i; + minf = x[ix]; + } + ix += inc_x; + i++; + } + return(min+1); +} + + diff --git a/kernel/mips/izamax.c b/kernel/mips/izamax.c new file mode 100644 index 0000000000..708ee921d3 --- /dev/null +++ b/kernel/mips/izamax.c @@ -0,0 +1,72 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf; + BLASLONG max=0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(max); + + inc_x2 = 2 * inc_x; + + maxf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + max = i; + maxf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return(max+1); +} + + diff --git a/kernel/mips/izamin.c b/kernel/mips/izamin.c new file mode 100644 index 0000000000..523605ef49 --- /dev/null +++ b/kernel/mips/izamin.c @@ -0,0 +1,72 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf; + BLASLONG min=0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(min); + + inc_x2 = 2 * inc_x; + + minf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + min = i; + minf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return(min+1); +} + + diff --git a/kernel/mips/macros_msa.h b/kernel/mips/macros_msa.h new file mode 100644 index 0000000000..dbc1853028 --- /dev/null +++ b/kernel/mips/macros_msa.h @@ -0,0 +1,747 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#ifndef __MACROS_MSA_H__ +#define __MACROS_MSA_H__ + +#include + +#define LD_W(RTYPE, psrc) *((RTYPE *)(psrc)) +#define LD_SP(...) LD_W(v4f32, __VA_ARGS__) + +#define LD_D(RTYPE, psrc) *((RTYPE *)(psrc)) +#define LD_DP(...) LD_D(v2f64, __VA_ARGS__) + +#define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) +#define ST_SP(...) ST_W(v4f32, __VA_ARGS__) + +#define ST_D(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) +#define ST_DP(...) 
ST_D(v2f64, __VA_ARGS__) + +#define COPY_FLOAT_TO_VECTOR(a) ( { \ + v4f32 out; \ + out = __msa_cast_to_vector_float(a); \ + out = (v4f32) __msa_splati_w((v4i32) out, 0); \ + out; \ +} ) + +#define COPY_DOUBLE_TO_VECTOR(a) ( { \ + v2f64 out; \ + out = __msa_cast_to_vector_double(a); \ + out = (v2f64) __msa_splati_d((v2i64) out, 0); \ + out; \ +} ) + +/* Description : Load 2 variables with stride + Arguments : Inputs - psrc, stride + Outputs - out0, out1 +*/ +#define LD_GP2_INC(psrc, stride, out0, out1) \ +{ \ + out0 = *(psrc); \ + (psrc) += stride; \ + out1 = *(psrc); \ + (psrc) += stride; \ +} + +#define LD_GP3_INC(psrc, stride, out0, \ + out1, out2) \ +{ \ + LD_GP2_INC(psrc, stride, out0, out1); \ + out2 = *(psrc); \ + (psrc) += stride; \ +} + +#define LD_GP4_INC(psrc, stride, out0, \ + out1, out2, out3) \ +{ \ + LD_GP2_INC(psrc, stride, out0, out1); \ + LD_GP2_INC(psrc, stride, out2, out3); \ +} + +#define LD_GP5_INC(psrc, stride, out0, \ + out1, out2, out3, out4) \ +{ \ + LD_GP2_INC(psrc, stride, out0, out1); \ + LD_GP2_INC(psrc, stride, out2, out3); \ + out4 = *(psrc); \ + (psrc) += stride; \ +} + +#define LD_GP6_INC(psrc, stride, out0, \ + out1, out2, out3, \ + out4, out5) \ +{ \ + LD_GP2_INC(psrc, stride, out0, out1); \ + LD_GP2_INC(psrc, stride, out2, out3); \ + LD_GP2_INC(psrc, stride, out4, out5); \ +} + +#define LD_GP7_INC(psrc, stride, out0, \ + out1, out2, out3, \ + out4, out5, out6) \ +{ \ + LD_GP2_INC(psrc, stride, out0, out1); \ + LD_GP2_INC(psrc, stride, out2, out3); \ + LD_GP2_INC(psrc, stride, out4, out5); \ + out6 = *(psrc); \ + (psrc) += stride; \ +} + +#define LD_GP8_INC(psrc, stride, out0, out1, out2, \ + out3, out4, out5, out6, out7) \ +{ \ + LD_GP4_INC(psrc, stride, out0, out1, out2, out3); \ + LD_GP4_INC(psrc, stride, out4, out5, out6, out7); \ +} + +/* Description : Load 2 vectors of single precision floating point elements with stride + Arguments : Inputs - psrc, stride + Outputs - out0, out1 + Return Type - single precision floating point +*/ +#define LD_SP2(psrc, stride, out0, out1) \ +{ \ + out0 = LD_SP((psrc)); \ + out1 = LD_SP((psrc) + stride); \ +} + +#define LD_SP4(psrc, stride, out0, out1, out2, out3) \ +{ \ + LD_SP2(psrc, stride, out0, out1) \ + LD_SP2(psrc + 2 * stride, stride, out2, out3) \ +} + +#define LD_SP2_INC(psrc, stride, out0, out1) \ +{ \ + out0 = LD_SP((psrc)); \ + (psrc) += stride; \ + out1 = LD_SP((psrc)); \ + (psrc) += stride; \ +} + +#define LD_SP3_INC(psrc, stride, out0, \ + out1, out2) \ +{ \ + LD_SP2_INC(psrc, stride, out0, out1); \ + out2 = LD_SP((psrc)); \ + (psrc) += stride; \ +} + +#define LD_SP4_INC(psrc, stride, out0, \ + out1, out2, out3) \ +{ \ + LD_SP2_INC(psrc, stride, out0, out1); \ + LD_SP2_INC(psrc, stride, out2, out3); \ +} + +#define LD_SP5_INC(psrc, stride, out0, \ + out1, out2, out3, out4) \ +{ \ + LD_SP2_INC(psrc, stride, out0, out1); \ + LD_SP2_INC(psrc, stride, out2, out3); \ + out4 = LD_SP((psrc)); \ + (psrc) += stride; \ +} + +#define LD_SP6_INC(psrc, stride, out0, \ + out1, out2, out3, \ + out4, out5) \ +{ \ + LD_SP2_INC(psrc, stride, out0, out1); \ + LD_SP2_INC(psrc, stride, out2, out3); \ + LD_SP2_INC(psrc, stride, out4, out5); \ +} + +#define LD_SP7_INC(psrc, stride, out0, \ + out1, out2, out3, \ + out4, out5, out6) \ +{ \ + LD_SP2_INC(psrc, stride, out0, out1); \ + LD_SP2_INC(psrc, stride, out2, out3); \ + LD_SP2_INC(psrc, stride, out4, out5); \ + out6 = LD_SP((psrc)); \ + (psrc) += stride; \ +} + +#define LD_SP8_INC(psrc, stride, out0, out1, out2, \ + out3, out4, out5, out6, out7) \ +{ \ + LD_SP4_INC(psrc, 
stride, out0, out1, out2, out3); \ + LD_SP4_INC(psrc, stride, out4, out5, out6, out7); \ +} + +#define LD_SP16_INC(psrc, stride, out0, out1, out2, \ + out3, out4, out5, out6, out7, out8, \ + out9, out10, out11, out12, out13, \ + out14, out15) \ +{ \ + LD_SP8_INC(psrc, stride, out0, out1, out2, \ + out3, out4, out5, out6, out7); \ + LD_SP8_INC(psrc, stride, out8, out9, out10, \ + out11, out12, out13, out14, out15); \ +} + +/* Description : Load 2 vectors of double precision floating point elements with stride + Arguments : Inputs - psrc, stride + Outputs - out0, out1 + Return Type - double precision floating point +*/ +#define LD_DP2(psrc, stride, out0, out1) \ +{ \ + out0 = LD_DP((psrc)); \ + out1 = LD_DP((psrc) + stride); \ +} + +#define LD_DP4(psrc, stride, out0, out1, out2, out3) \ +{ \ + LD_DP2(psrc, stride, out0, out1) \ + LD_DP2(psrc + 2 * stride, stride, out2, out3) \ +} + +#define LD_DP2_INC(psrc, stride, out0, out1) \ +{ \ + out0 = LD_DP(psrc); \ + (psrc) += stride; \ + out1 = LD_DP(psrc); \ + (psrc) += stride; \ +} + +#define LD_DP3_INC(psrc, stride, out0, \ + out1, out2) \ +{ \ + LD_DP2_INC(psrc, stride, out0, out1); \ + out2 = LD_DP((psrc)); \ + (psrc) += stride; \ +} + +#define LD_DP4_INC(psrc, stride, out0, \ + out1, out2, out3) \ +{ \ + LD_DP2_INC(psrc, stride, out0, out1); \ + LD_DP2_INC(psrc, stride, out2, out3); \ +} + +#define LD_DP5_INC(psrc, stride, out0, \ + out1, out2, out3, out4) \ +{ \ + LD_DP2_INC(psrc, stride, out0, out1); \ + LD_DP2_INC(psrc, stride, out2, out3); \ + out4 = LD_DP((psrc)); \ + (psrc) += stride; \ +} + +#define LD_DP6_INC(psrc, stride, out0, \ + out1, out2, out3, \ + out4, out5) \ +{ \ + LD_DP2_INC(psrc, stride, out0, out1); \ + LD_DP2_INC(psrc, stride, out2, out3); \ + LD_DP2_INC(psrc, stride, out4, out5); \ +} + +#define LD_DP7_INC(psrc, stride, out0, \ + out1, out2, out3, \ + out4, out5, out6) \ +{ \ + LD_DP2_INC(psrc, stride, out0, out1); \ + LD_DP2_INC(psrc, stride, out2, out3); \ + LD_DP2_INC(psrc, stride, out4, out5); \ + out6 = LD_DP((psrc)); \ + (psrc) += stride; \ +} + +#define LD_DP8_INC(psrc, stride, out0, out1, out2, \ + out3, out4, out5, out6, out7) \ +{ \ + LD_DP4_INC(psrc, stride, out0, out1, out2, out3); \ + LD_DP4_INC(psrc, stride, out4, out5, out6, out7); \ +} + +#define LD_DP16_INC(psrc, stride, out0, out1, out2, \ + out3, out4, out5, out6, out7, out8, \ + out9, out10, out11, out12, out13, \ + out14, out15) \ +{ \ + LD_DP8_INC(psrc, stride, out0, out1, out2, \ + out3, out4, out5, out6, out7); \ + LD_DP8_INC(psrc, stride, out8, out9, out10, \ + out11, out12, out13, out14, out15); \ +} + +/* Description : Store GP variable with stride + Arguments : Inputs - in0, in1, pdst, stride + Details : Store 4 single precision floating point elements from 'in0' to (pdst) + Store 4 single precision floating point elements from 'in1' to (pdst + stride) +*/ +#define ST_GP2_INC(in0, in1, \ + pdst, stride) \ +{ \ + *(pdst) = in0; \ + (pdst) += stride; \ + *(pdst) = in1; \ + (pdst) += stride; \ +} + +#define ST_GP3_INC(in0, in1, in2, \ + pdst, stride) \ +{ \ + ST_GP2_INC(in0, in1, pdst, stride); \ + *(pdst) = in2; \ + (pdst) += stride; \ +} + +#define ST_GP4_INC(in0, in1, in2, in3, \ + pdst, stride) \ +{ \ + ST_GP2_INC(in0, in1, pdst, stride); \ + ST_GP2_INC(in2, in3, pdst, stride); \ +} + +#define ST_GP5_INC(in0, in1, in2, in3, \ + in4, pdst, stride) \ +{ \ + ST_GP2_INC(in0, in1, pdst, stride); \ + ST_GP2_INC(in2, in3, pdst, stride); \ + *(pdst) = in4; \ + (pdst) += stride; \ +} + +#define ST_GP6_INC(in0, in1, in2, in3, \ + in4, in5, pdst, 
stride) \ +{ \ + ST_GP2_INC(in0, in1, pdst, stride); \ + ST_GP2_INC(in2, in3, pdst, stride); \ + ST_GP2_INC(in4, in5, pdst, stride); \ +} + +#define ST_GP7_INC(in0, in1, in2, in3, in4, \ + in5, in6, pdst, stride) \ +{ \ + ST_GP2_INC(in0, in1, pdst, stride); \ + ST_GP2_INC(in2, in3, pdst, stride); \ + ST_GP2_INC(in4, in5, pdst, stride); \ + *(pdst) = in6; \ + (pdst) += stride; \ +} + +#define ST_GP8_INC(in0, in1, in2, in3, in4, in5, \ + in6, in7, pdst, stride) \ +{ \ + ST_GP4_INC(in0, in1, in2, in3, pdst, stride); \ + ST_GP4_INC(in4, in5, in6, in7, pdst, stride); \ +} + +/* Description : Store vectors of single precision floating point elements with stride + Arguments : Inputs - in0, in1, pdst, stride + Details : Store 4 single precision floating point elements from 'in0' to (pdst) + Store 4 single precision floating point elements from 'in1' to (pdst + stride) +*/ +#define ST_SP2(in0, in1, pdst, stride) \ +{ \ + ST_SP(in0, (pdst)); \ + ST_SP(in1, (pdst) + stride); \ +} + +#define ST_SP4(in0, in1, in2, in3, pdst, stride) \ +{ \ + ST_SP2(in0, in1, (pdst), stride); \ + ST_SP2(in2, in3, (pdst + 2 * stride), stride); \ +} + +#define ST_SP8(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ +{ \ + ST_SP4(in0, in1, in2, in3, (pdst), stride); \ + ST_SP4(in4, in5, in6, in7, (pdst + 4 * stride), stride); \ +} + +#define ST_SP2_INC(in0, in1, pdst, stride) \ +{ \ + ST_SP(in0, (pdst)); \ + (pdst) += stride; \ + ST_SP(in1, (pdst)); \ + (pdst) += stride; \ +} + +#define ST_SP3_INC(in0, in1, in2, \ + pdst, stride) \ +{ \ + ST_SP2_INC(in0, in1, pdst, stride); \ + ST_SP(in2, (pdst)); \ + (pdst) += stride; \ +} + +#define ST_SP4_INC(in0, in1, in2, in3, \ + pdst, stride) \ +{ \ + ST_SP2_INC(in0, in1, pdst, stride); \ + ST_SP2_INC(in2, in3, pdst, stride); \ +} + +#define ST_SP5_INC(in0, in1, in2, in3, \ + in4, pdst, stride) \ +{ \ + ST_SP2_INC(in0, in1, pdst, stride); \ + ST_SP2_INC(in2, in3, pdst, stride); \ + ST_SP(in4, (pdst)); \ + (pdst) += stride; \ +} + +#define ST_SP6_INC(in0, in1, in2, in3, \ + in4, in5, pdst, stride) \ +{ \ + ST_SP2_INC(in0, in1, pdst, stride); \ + ST_SP2_INC(in2, in3, pdst, stride); \ + ST_SP2_INC(in4, in5, pdst, stride); \ +} + +#define ST_SP7_INC(in0, in1, in2, in3, in4, \ + in5, in6, pdst, stride) \ +{ \ + ST_SP2_INC(in0, in1, pdst, stride); \ + ST_SP2_INC(in2, in3, pdst, stride); \ + ST_SP2_INC(in4, in5, pdst, stride); \ + ST_SP(in6, (pdst)); \ + (pdst) += stride; \ +} + +#define ST_SP8_INC(in0, in1, in2, in3, in4, in5, \ + in6, in7, pdst, stride) \ +{ \ + ST_SP4_INC(in0, in1, in2, in3, pdst, stride); \ + ST_SP4_INC(in4, in5, in6, in7, pdst, stride); \ +} + +#define ST_SP16_INC(in0, in1, in2, in3, in4, in5, in6, \ + in7, in8, in9, in10, in11, in12, \ + in13, in14, in15, pdst, stride) \ +{ \ + ST_SP8_INC(in0, in1, in2, in3, in4, in5, in6, \ + in7, pdst, stride); \ + ST_SP8_INC(in8, in9, in10, in11, in12, in13, in14, \ + in15, pdst, stride); \ +} + +/* Description : Store vectors of double precision floating point elements with stride + Arguments : Inputs - in0, in1, pdst, stride + Details : Store 2 double precision floating point elements from 'in0' to (pdst) + Store 2 double precision floating point elements from 'in1' to (pdst + stride) +*/ +#define ST_DP2(in0, in1, pdst, stride) \ +{ \ + ST_DP(in0, (pdst)); \ + ST_DP(in1, (pdst) + stride); \ +} + +#define ST_DP4(in0, in1, in2, in3, pdst, stride) \ +{ \ + ST_DP2(in0, in1, (pdst), stride); \ + ST_DP2(in2, in3, (pdst) + 2 * stride, stride); \ +} + +#define ST_DP8(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ +{ \ + 
ST_DP4(in0, in1, in2, in3, (pdst), stride); \ + ST_DP4(in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ +} + +#define ST_DP2_INC(in0, in1, pdst, stride) \ +{ \ + ST_DP(in0, (pdst)); \ + (pdst) += stride; \ + ST_DP(in1, (pdst)); \ + (pdst) += stride; \ +} + +#define ST_DP3_INC(in0, in1, in2, \ + pdst, stride) \ +{ \ + ST_DP2_INC(in0, in1, pdst, stride); \ + ST_DP(in2, (pdst)); \ + (pdst) += stride; \ +} + +#define ST_DP4_INC(in0, in1, in2, in3, \ + pdst, stride) \ +{ \ + ST_DP2_INC(in0, in1, pdst, stride); \ + ST_DP2_INC(in2, in3, pdst, stride); \ +} + +#define ST_DP5_INC(in0, in1, in2, in3, \ + in4, pdst, stride) \ +{ \ + ST_DP2_INC(in0, in1, pdst, stride); \ + ST_DP2_INC(in2, in3, pdst, stride); \ + ST_DP(in4, (pdst)); \ + (pdst) += stride; \ +} + +#define ST_DP6_INC(in0, in1, in2, in3, \ + in4, in5, pdst, stride) \ +{ \ + ST_DP2_INC(in0, in1, pdst, stride); \ + ST_DP2_INC(in2, in3, pdst, stride); \ + ST_DP2_INC(in4, in5, pdst, stride); \ +} + +#define ST_DP7_INC(in0, in1, in2, in3, in4, \ + in5, in6, pdst, stride) \ +{ \ + ST_DP2_INC(in0, in1, pdst, stride); \ + ST_DP2_INC(in2, in3, pdst, stride); \ + ST_DP2_INC(in4, in5, pdst, stride); \ + ST_DP(in6, (pdst)); \ + (pdst) += stride; \ +} + +#define ST_DP8_INC(in0, in1, in2, in3, in4, in5, \ + in6, in7, pdst, stride) \ +{ \ + ST_DP4_INC(in0, in1, in2, in3, pdst, stride); \ + ST_DP4_INC(in4, in5, in6, in7, pdst, stride); \ +} + +#define ST_DP16_INC(in0, in1, in2, in3, in4, in5, in6, \ + in7, in8, in9, in10, in11, in12, \ + in13, in14, in15, pdst, stride) \ +{ \ + ST_DP8_INC(in0, in1, in2, in3, in4, in5, in6, \ + in7, pdst, stride); \ + ST_DP8_INC(in8, in9, in10, in11, in12, in13, in14, \ + in15, pdst, stride); \ +} + +/* Description : shuffle elements in vector as shf_val + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE +*/ +#define SHF_W2(RTYPE, in0, in1, out0, out1, shf_val) \ +{ \ + out0 = (RTYPE) __msa_shf_w((v4i32) in0, shf_val); \ + out1 = (RTYPE) __msa_shf_w((v4i32) in1, shf_val); \ +} +#define SHF_W2_SP(...) SHF_W2(v4f32, __VA_ARGS__) +#define SHF_W2_DP(...) SHF_W2(v2f64, __VA_ARGS__) + +#define SHF_W3(RTYPE, in0, in1, in2, out0, out1, out2, \ + shf_val) \ +{ \ + out0 = (RTYPE) __msa_shf_w((v4i32) in0, shf_val); \ + out1 = (RTYPE) __msa_shf_w((v4i32) in1, shf_val); \ + out2 = (RTYPE) __msa_shf_w((v4i32) in2, shf_val); \ +} +#define SHF_W3_SP(...) SHF_W3(v4f32, __VA_ARGS__) + +#define SHF_W4(RTYPE, in0, in1, in2, in3, \ + out0, out1, out2, out3, shf_val) \ +{ \ + SHF_W2(RTYPE, in0, in1, out0, out1, shf_val); \ + SHF_W2(RTYPE, in2, in3, out2, out3, shf_val); \ +} +#define SHF_W4_SP(...) SHF_W4(v4f32, __VA_ARGS__) +#define SHF_W4_DP(...) SHF_W4(v2f64, __VA_ARGS__) + +/* Description : Interleave both left and right half of input vectors + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Right half of byte elements from 'in0' and 'in1' are + interleaved and written to 'out0' +*/ +#define ILVRL_W2(RTYPE, in0, in1, out0, out1) \ +{ \ + out0 = (RTYPE) __msa_ilvr_w((v4i32) in0, (v4i32) in1); \ + out1 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1); \ +} +#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__) +#define ILVRL_W2_SP(...) ILVRL_W2(v4f32, __VA_ARGS__) + +#define ILVRL_D2(RTYPE, in0, in1, out0, out1) \ +{ \ + out0 = (RTYPE) __msa_ilvr_d((v2i64) in0, (v2i64) in1); \ + out1 = (RTYPE) __msa_ilvl_d((v2i64) in0, (v2i64) in1); \ +} +#define ILVRL_D2_SP(...) ILVRL_D2(v4f32, __VA_ARGS__) +#define ILVRL_D2_DP(...) 
ILVRL_D2(v2f64, __VA_ARGS__) + +/* Description : Indexed word element values are replicated to all + elements in output vector + Arguments : Inputs - in, stidx + Outputs - out0, out1 + Return Type - as per RTYPE + Details : 'stidx' element value from 'in' vector is replicated to all + elements in 'out0' vector + 'stidx + 1' element value from 'in' vector is replicated to all + elements in 'out1' vector + Valid index range for word operation is 0-3 +*/ +#define SPLATI_W2(RTYPE, in, stidx, out0, out1) \ +{ \ + out0 = (RTYPE) __msa_splati_w((v4i32) in, stidx); \ + out1 = (RTYPE) __msa_splati_w((v4i32) in, (stidx+1)); \ +} +#define SPLATI_W2_SP(...) SPLATI_W2(v4f32, __VA_ARGS__) + +#define SPLATI_W4(RTYPE, in, out0, out1, out2, out3) \ +{ \ + SPLATI_W2(RTYPE, in, 0, out0, out1); \ + SPLATI_W2(RTYPE, in, 2, out2, out3); \ +} +#define SPLATI_W4_SP(...) SPLATI_W4(v4f32, __VA_ARGS__) + +#define SPLATI_D2(RTYPE, in, out0, out1) \ +{ \ + out0 = (RTYPE) __msa_splati_d((v2i64) in, 0); \ + out1 = (RTYPE) __msa_splati_d((v2i64) in, 1); \ +} +#define SPLATI_D2_DP(...) SPLATI_D2(v2f64, __VA_ARGS__) + +/* Description : Pack even double word elements of vector pairs + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even double word elements of 'in0' are copied to the left half + of 'out0' & even double word elements of 'in1' are copied to + the right half of 'out0'. +*/ +#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ +{ \ + out0 = (RTYPE) __msa_pckev_d((v2i64) in0, (v2i64) in1); \ + out1 = (RTYPE) __msa_pckev_d((v2i64) in2, (v2i64) in3); \ +} +#define PCKEV_D2_SP(...) PCKEV_D2(v4f32, __VA_ARGS__) +#define PCKEV_D2_SD(...) PCKEV_D2(v2f64, __VA_ARGS__) + +#define PCKEV_D3(RTYPE, in0, in1, in2, in3, in4, in5, \ + out0, out1, out2) \ +{ \ + out0 = (RTYPE) __msa_pckev_d((v2i64) in0, (v2i64) in1); \ + out1 = (RTYPE) __msa_pckev_d((v2i64) in2, (v2i64) in3); \ + out2 = (RTYPE) __msa_pckev_d((v2i64) in4, (v2i64) in5); \ +} +#define PCKEV_D3_SP(...) PCKEV_D3(v4f32, __VA_ARGS__) + +#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) \ +{ \ + PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ + PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3); \ +} +#define PCKEV_D4_SP(...) PCKEV_D4(v4f32, __VA_ARGS__) + +/* Description : pack both even and odd half of input vectors + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even double word elements of 'in0' and 'in1' are copied to the + 'out0' & odd double word elements of 'in0' and 'in1' are + copied to the 'out1'. +*/ +#define PCKEVOD_W2(RTYPE, in0, in1, out0, out1) \ +{ \ + out0 = (RTYPE) __msa_pckev_w((v4i32) in0, (v4i32) in1); \ + out1 = (RTYPE) __msa_pckod_w((v4i32) in0, (v4i32) in1); \ +} +#define PCKEVOD_W2_SP(...) PCKEVOD_W2(v4f32, __VA_ARGS__) + +#define PCKEVOD_D2(RTYPE, in0, in1, out0, out1) \ +{ \ + out0 = (RTYPE) __msa_pckev_d((v2i64) in0, (v2i64) in1); \ + out1 = (RTYPE) __msa_pckod_d((v2i64) in0, (v2i64) in1); \ +} +#define PCKEVOD_D2_DP(...) 
PCKEVOD_D2(v2f64, __VA_ARGS__) + +/* Description : Multiplication of pairs of vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Details : Each element from 'in0' is multiplied with elements from 'in1' + and the result is written to 'out0' +*/ +#define MUL2(in0, in1, in2, in3, out0, out1) \ +{ \ + out0 = in0 * in1; \ + out1 = in2 * in3; \ +} +#define MUL3(in0, in1, in2, in3, in4, in5, \ + out0, out1, out2) \ +{ \ + out0 = in0 * in1; \ + out1 = in2 * in3; \ + out2 = in4 * in5; \ +} +#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) \ +{ \ + MUL2(in0, in1, in2, in3, out0, out1); \ + MUL2(in4, in5, in6, in7, out2, out3); \ +} + +/* Description : Addition of 2 pairs of variables + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Details : Each element in 'in0' is added to 'in1' and result is written + to 'out0'. +*/ +#define ADD2(in0, in1, in2, in3, out0, out1) \ +{ \ + out0 = in0 + in1; \ + out1 = in2 + in3; \ +} +#define ADD3(in0, in1, in2, in3, in4, in5, \ + out0, out1, out2) \ +{ \ + out0 = in0 + in1; \ + out1 = in2 + in3; \ + out2 = in4 + in5; \ +} +#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) \ +{ \ + ADD2(in0, in1, in2, in3, out0, out1); \ + ADD2(in4, in5, in6, in7, out2, out3); \ +} + +/* Description : Transpose 4x4 block with word elements in vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1, out2, out3 + Return Type - as per RTYPE +*/ +#define TRANSPOSE4x4_W(RTYPE, in0, in1, in2, in3, \ + out0, out1, out2, out3) \ +{ \ + v4i32 s0_m, s1_m, s2_m, s3_m; \ + \ + ILVRL_W2_SW(in1, in0, s0_m, s1_m); \ + ILVRL_W2_SW(in3, in2, s2_m, s3_m); \ + ILVRL_D2(RTYPE, s2_m, s0_m, out0, out1); \ + ILVRL_D2(RTYPE, s3_m, s1_m, out2, out3); \ +} +#define TRANSPOSE4x4_SP_SP(...) TRANSPOSE4x4_W(v4f32, __VA_ARGS__) + +#endif /* __MACROS_MSA_H__ */ diff --git a/kernel/mips/max.c b/kernel/mips/max.c new file mode 100644 index 0000000000..2ad956bc01 --- /dev/null +++ b/kernel/mips/max.c @@ -0,0 +1,65 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include <math.h> + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + + if (n <= 0 || inc_x <= 0) return(maxf); + + maxf=x[0]; + ix += inc_x; + i++; + + while(i < n) + { + if( x[ix] > maxf ) + { + maxf = x[ix]; + } + ix += inc_x; + i++; + } + return(maxf); +} + + diff --git a/kernel/mips/min.c b/kernel/mips/min.c new file mode 100644 index 0000000000..2812fe3978 --- /dev/null +++ b/kernel/mips/min.c @@ -0,0 +1,65 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include <math.h> + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + + if (n <= 0 || inc_x <= 0) return(minf); + + minf=x[0]; + ix += inc_x; + i++; + + while(i < n) + { + if( x[ix] < minf ) + { + minf = x[ix]; + } + ix += inc_x; + i++; + } + return(minf); +} + + diff --git a/kernel/mips/nrm2.c b/kernel/mips/nrm2.c new file mode 100644 index 0000000000..fcff093375 --- /dev/null +++ b/kernel/mips/nrm2.c @@ -0,0 +1,88 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/13 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#include "common.h" +#include <math.h> + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT scale = 0.0; + FLOAT ssq = 1.0; + FLOAT absxi = 0.0; + + + if (n <= 0 || inc_x <= 0) return(0.0); + if ( n == 1 ) return( ABS(x[0]) ); + + n *= inc_x; + while(i < n) + { + + if ( x[i] != 0.0 ) + { + absxi = ABS( x[i] ); + if ( scale < absxi ) + { + ssq = 1 + ssq * ( scale / absxi ) * ( scale / absxi ); + scale = absxi ; + } + else + { + ssq += ( absxi/scale ) * ( absxi/scale ); + } + + } + i += inc_x; + } + scale = scale * sqrt( ssq ); + return(scale); + +} + + diff --git a/kernel/mips/omatcopy_cn.c b/kernel/mips/omatcopy_cn.c new file mode 100644 index 0000000000..11357ec933 --- /dev/null +++ b/kernel/mips/omatcopy_cn.c @@ -0,0 +1,82 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb) +{ + BLASLONG i,j; + FLOAT *aptr,*bptr; + + if ( rows <= 0 ) return(0); + if ( cols <= 0 ) return(0); + + aptr = a; + bptr = b; + + if ( alpha == 0.0 ) + { + for ( i=0; i +#include "macros_msa.h" + +#define AND_VEC_W(in) ((v4f32) ((v4i32) in & and_vec)) + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i = 0; + FLOAT data0, data1, data2, sumf = 0.0; + v4f32 src0, src1, src2, src3, src4, src5, src6, src7; + v4f32 sum_abs0, sum_abs1, sum_abs2, sum_abs3; + v4f32 zero_v = {0}; + v4i32 and_vec = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF}; + + if (n <= 0 || inc_x <= 0) return (sumf); + + if (1 == inc_x) + { + if (n > 31) + { + n -= 32; + + LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7); + + sum_abs0 = AND_VEC_W(src0); + sum_abs1 = AND_VEC_W(src1); + sum_abs2 = AND_VEC_W(src2); + sum_abs3 = AND_VEC_W(src3); + sum_abs0 += AND_VEC_W(src4); + sum_abs1 += AND_VEC_W(src5); + sum_abs2 += AND_VEC_W(src6); + sum_abs3 += AND_VEC_W(src7); + } + else + { + sum_abs0 = zero_v; + sum_abs1 = zero_v; + sum_abs2 = zero_v; + sum_abs3 = zero_v; + } + + for (i = 0; i < (n >> 5); i++) + { + LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7); + + sum_abs0 += AND_VEC_W(src0); + sum_abs1 += AND_VEC_W(src1); + sum_abs2 += AND_VEC_W(src2); + sum_abs3 += AND_VEC_W(src3); + sum_abs0 += AND_VEC_W(src4); + sum_abs1 += AND_VEC_W(src5); + sum_abs2 += AND_VEC_W(src6); + sum_abs3 += AND_VEC_W(src7); + } + + if (n & 31) + { + if ((n & 16) && (n & 8) && (n & 4)) + { + LD_SP7_INC(x, 4, src0, src1, src2, src3, src4, src5, src6); + + sum_abs0 += AND_VEC_W(src0); + sum_abs1 += AND_VEC_W(src1); + sum_abs2 += AND_VEC_W(src2); + sum_abs3 += AND_VEC_W(src3); + sum_abs0 += AND_VEC_W(src4); + sum_abs1 += AND_VEC_W(src5); + sum_abs2 += AND_VEC_W(src6); + + sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; + + sumf += sum_abs0[0]; + sumf += sum_abs0[1]; + sumf += sum_abs0[2]; + sumf += sum_abs0[3]; + } + else if ((n & 16) && (n & 8)) + { + LD_SP6_INC(x, 4, src0, src1, src2, src3, src4, src5); + + sum_abs0 += AND_VEC_W(src0); + sum_abs1 += AND_VEC_W(src1); + sum_abs2 += AND_VEC_W(src2); + sum_abs3 += AND_VEC_W(src3); + sum_abs0 += AND_VEC_W(src4); + sum_abs1 += AND_VEC_W(src5); + + sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; + + sumf += sum_abs0[0]; + sumf += sum_abs0[1]; + sumf += sum_abs0[2]; + sumf += sum_abs0[3]; + } + else if ((n & 16) && (n & 4)) + { + LD_SP5_INC(x, 4, src0, src1, src2, src3, src4); + + sum_abs0 += AND_VEC_W(src0); + sum_abs1 += AND_VEC_W(src1); + sum_abs2 += AND_VEC_W(src2); + sum_abs3 += AND_VEC_W(src3); + sum_abs0 += AND_VEC_W(src4); + + sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; + + sumf += sum_abs0[0]; + sumf += sum_abs0[1]; + sumf += sum_abs0[2]; + sumf += sum_abs0[3]; + } + else if ((n & 8) && (n & 4)) + { + LD_SP3_INC(x, 4, src0, src1, src2); + + sum_abs0 += AND_VEC_W(src0); + sum_abs1 += AND_VEC_W(src1); + sum_abs2 += AND_VEC_W(src2); + + sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; + + sumf += sum_abs0[0]; + sumf += sum_abs0[1]; + sumf += sum_abs0[2]; + sumf += sum_abs0[3]; + } + else if (n & 16) + { + LD_SP4_INC(x, 4, src0, src1, src2, src3); + + sum_abs0 += AND_VEC_W(src0); + sum_abs1 += AND_VEC_W(src1); + sum_abs2 += AND_VEC_W(src2); + sum_abs3 += AND_VEC_W(src3); + + sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + 
sum_abs3; + + sumf += sum_abs0[0]; + sumf += sum_abs0[1]; + sumf += sum_abs0[2]; + sumf += sum_abs0[3]; + } + else if (n & 8) + { + LD_SP2_INC(x, 4, src0, src1); + + sum_abs0 += AND_VEC_W(src0); + sum_abs1 += AND_VEC_W(src1); + + sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; + + sumf += sum_abs0[0]; + sumf += sum_abs0[1]; + sumf += sum_abs0[2]; + sumf += sum_abs0[3]; + } + else if (n & 4) + { + src0 = LD_SP(x); x += 4; + + sum_abs0 += AND_VEC_W(src0); + + sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; + + sumf += sum_abs0[0]; + sumf += sum_abs0[1]; + sumf += sum_abs0[2]; + sumf += sum_abs0[3]; + } + else + { + sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; + + sumf += sum_abs0[0]; + sumf += sum_abs0[1]; + sumf += sum_abs0[2]; + sumf += sum_abs0[3]; + } + + if (n & 2) + { + sumf += fabsf(*(x + 0)); + sumf += fabsf(*(x + 1)); + x += 2; + } + + if (n & 1) + { + sumf += fabsf(*(x + 0)); + } + } + else + { + sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; + + sumf += sum_abs0[0]; + sumf += sum_abs0[1]; + sumf += sum_abs0[2]; + sumf += sum_abs0[3]; + } + } + else + { + if (n > 8) + { + n -= 8; + + src0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); + x += inc_x; + src0 = (v4f32) __msa_insert_w((v4i32) src0, 1, *((int *) x)); + x += inc_x; + src0 = (v4f32) __msa_insert_w((v4i32) src0, 2, *((int *) x)); + x += inc_x; + src0 = (v4f32) __msa_insert_w((v4i32) src0, 3, *((int *) x)); + x += inc_x; + src4 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); + x += inc_x; + src4 = (v4f32) __msa_insert_w((v4i32) src4, 1, *((int *) x)); + x += inc_x; + src4 = (v4f32) __msa_insert_w((v4i32) src4, 2, *((int *) x)); + x += inc_x; + src4 = (v4f32) __msa_insert_w((v4i32) src4, 3, *((int *) x)); + x += inc_x; + + sum_abs0 = AND_VEC_W(src0); + sum_abs1 = AND_VEC_W(src4); + } + else + { + sum_abs0 = zero_v; + sum_abs1 = zero_v; + } + + for (i = (n >> 3); i--;) + { + src0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); + x += inc_x; + src0 = (v4f32) __msa_insert_w((v4i32) src0, 1, *((int *) x)); + x += inc_x; + src0 = (v4f32) __msa_insert_w((v4i32) src0, 2, *((int *) x)); + x += inc_x; + src0 = (v4f32) __msa_insert_w((v4i32) src0, 3, *((int *) x)); + x += inc_x; + src4 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); + x += inc_x; + src4 = (v4f32) __msa_insert_w((v4i32) src4, 1, *((int *) x)); + x += inc_x; + src4 = (v4f32) __msa_insert_w((v4i32) src4, 2, *((int *) x)); + x += inc_x; + src4 = (v4f32) __msa_insert_w((v4i32) src4, 3, *((int *) x)); + x += inc_x; + + sum_abs0 += AND_VEC_W(src0); + sum_abs1 += AND_VEC_W(src4); + } + + if (n & 4) + { + src0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); + x += inc_x; + src0 = (v4f32) __msa_insert_w((v4i32) src0, 1, *((int *) x)); + x += inc_x; + src0 = (v4f32) __msa_insert_w((v4i32) src0, 2, *((int *) x)); + x += inc_x; + src0 = (v4f32) __msa_insert_w((v4i32) src0, 3, *((int *) x)); + x += inc_x; + + sum_abs0 += AND_VEC_W(src0); + } + + sum_abs0 += sum_abs1; + + sumf += sum_abs0[0]; + sumf += sum_abs0[1]; + sumf += sum_abs0[2]; + sumf += sum_abs0[3]; + + if ((n & 2) && (n & 1)) + { + data0 = fabsf(*x); x += inc_x; + data1 = fabsf(*x); x += inc_x; + data2 = fabsf(*x); + + sumf += data0; + sumf += data1; + sumf += data2; + } + else if (n & 2) + { + data0 = fabsf(*x); x += inc_x; + data1 = fabsf(*x); + + sumf += data0; + sumf += data1; + } + else if (n & 1) + { + data0 = fabsf(*x); + + sumf += data0; + } + } + + return (sumf); +} diff --git a/kernel/mips/scal.c b/kernel/mips/scal.c new file 
mode 100644 index 0000000000..01f708b1d9 --- /dev/null +++ b/kernel/mips/scal.c @@ -0,0 +1,50 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0,j=0; + + while(j < n) + { + + if ( da == 0.0 ) + x[i]=0.0; + else + x[i] = da * x[i] ; + + i += inc_x ; + j++; + + } + return 0; + +} + + diff --git a/kernel/mips/sdot_msa.c b/kernel/mips/sdot_msa.c new file mode 100644 index 0000000000..1997ec5a09 --- /dev/null +++ b/kernel/mips/sdot_msa.c @@ -0,0 +1,208 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +/* return float, x,y float */ +#if defined(DSDOT) +double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +#else +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +#endif +{ + BLASLONG i = 0; + double dot = 0.0; + float x0, x1, x2, x3, y0, y1, y2, y3; + v4f32 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7; + v4f32 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; + v4f32 dot0 = {0, 0, 0, 0}; + + if (n < 0) return (dot); + + if ((1 == inc_x) && (1 == inc_y)) + { + for (i = (n >> 5); i--;) + { + LD_SP8_INC(x, 4, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7); + LD_SP8_INC(y, 4, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7); + + dot0 += (vy0 * vx0); + dot0 += (vy1 * vx1); + dot0 += (vy2 * vx2); + dot0 += (vy3 * vx3); + dot0 += (vy4 * vx4); + dot0 += (vy5 * vx5); + dot0 += (vy6 * vx6); + dot0 += (vy7 * vx7); + } + + if (n & 31) + { + if ((n & 16) && (n & 8) && (n & 4)) + { + LD_SP7_INC(x, 4, vx0, vx1, vx2, vx3, vx4, vx5, vx6); + LD_SP7_INC(y, 4, vy0, vy1, vy2, vy3, vy4, vy5, vy6); + + dot0 += (vy0 * vx0); + dot0 += (vy1 * vx1); + dot0 += (vy2 * vx2); + dot0 += (vy3 * vx3); + dot0 += (vy4 * vx4); + dot0 += (vy5 * vx5); + dot0 += (vy6 * vx6); + } + else if ((n & 16) && (n & 8)) + { + LD_SP6_INC(x, 4, vx0, vx1, vx2, vx3, vx4, vx5); + LD_SP6_INC(y, 4, vy0, vy1, vy2, vy3, vy4, vy5); + + dot0 += (vy0 * vx0); + dot0 += (vy1 * vx1); + dot0 += (vy2 * vx2); + dot0 += (vy3 * vx3); + dot0 += (vy4 * vx4); + dot0 += (vy5 * vx5); + } + else if ((n & 16) && (n & 4)) + { + LD_SP5_INC(x, 4, vx0, vx1, vx2, vx3, vx4); + LD_SP5_INC(y, 4, vy0, vy1, vy2, vy3, vy4); + + dot0 += (vy0 * vx0); + dot0 += (vy1 * vx1); + dot0 += (vy2 * vx2); + dot0 += (vy3 * vx3); + dot0 += (vy4 * vx4); + } + else if ((n & 8) && (n & 4)) + { + LD_SP3_INC(x, 4, vx0, vx1, vx2); + LD_SP3_INC(y, 4, vy0, vy1, vy2); + + dot0 += (vy0 * vx0); + dot0 += (vy1 * vx1); + dot0 += (vy2 * vx2); + } + else if (n & 16) + { + LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3); + LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3); + + dot0 += (vy0 * vx0); + dot0 += (vy1 * vx1); + dot0 += (vy2 * vx2); + dot0 += (vy3 * vx3); + } + else if (n & 8) + { + LD_SP2_INC(x, 4, vx0, vx1); + LD_SP2_INC(y, 4, vy0, vy1); + + dot0 += (vy0 * vx0); + dot0 += (vy1 * vx1); + } + else if (n & 4) + { + vx0 = LD_SP(x); x += 4; + vy0 = LD_SP(y); y += 4; + + dot0 += (vy0 * vx0); + } + + if ((n & 2) && (n & 1)) + { + LD_GP3_INC(x, 1, x0, x1, x2); + LD_GP3_INC(y, 1, y0, y1, y2); + + dot += (y0 * x0); + dot += (y1 * x1); + dot += (y2 * x2); + } + else if (n & 2) + { + LD_GP2_INC(x, 1, x0, x1); + LD_GP2_INC(y, 1, y0, y1); + + dot += (y0 * x0); + dot += (y1 * x1); + } + else if (n & 1) + { + x0 = *x; + y0 = *y; + + dot += (y0 * x0); + } + } + + dot += dot0[0]; + dot += dot0[1]; + dot += dot0[2]; + dot += dot0[3]; + } + else + { + for (i = (n >> 2); i--;) + { + LD_GP4_INC(x, inc_x, x0, x1, x2, x3); + LD_GP4_INC(y, inc_y, y0, y1, y2, y3); + + dot += (y0 * 
x0); + dot += (y1 * x1); + dot += (y2 * x2); + dot += (y3 * x3); + } + + if ((n & 2) && (n & 1)) + { + LD_GP3_INC(x, inc_x, x0, x1, x2); + LD_GP3_INC(y, inc_y, y0, y1, y2); + + dot += (y0 * x0); + dot += (y1 * x1); + dot += (y2 * x2); + } + else if (n & 2) + { + LD_GP2_INC(x, inc_x, x0, x1); + LD_GP2_INC(y, inc_y, y0, y1); + + dot += (y0 * x0); + dot += (y1 * x1); + } + else if (n & 1) + { + x0 = *x; + y0 = *y; + + dot += (y0 * x0); + } + } + + return (dot); +} diff --git a/kernel/mips/sgemm_kernel_8x8_msa.c b/kernel/mips/sgemm_kernel_8x8_msa.c new file mode 100644 index 0000000000..1695471add --- /dev/null +++ b/kernel/mips/sgemm_kernel_8x8_msa.c @@ -0,0 +1,2482 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, + FLOAT *C, BLASLONG ldc +#ifdef TRMMKERNEL + , BLASLONG offset +#endif + ) +{ + BLASLONG i, j, l, temp; +#if defined(TRMMKERNEL) + BLASLONG off; +#endif + FLOAT *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7; + FLOAT *pa0, *pb0; + FLOAT tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + FLOAT tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; + FLOAT a0, a1, b0, b1, b2, b3, b4, b5, b6, b7; + v4f32 v_alpha = {alpha, alpha, alpha, alpha}; + v4f32 src_a0, src_a1, src_b, src_b0, src_b1; + v4f32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v4f32 res0, res1, res2, res3, res4, res5, res6, res7; + v4f32 res8, res9, res10, res11, res12, res13, res14, res15; + +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#endif + + for (j = (n >> 3); j--;) + { + pc0 = C; + pc1 = pc0 + ldc; + pc2 = pc1 + ldc; + pc3 = pc2 + ldc; + pc4 = pc3 + ldc; + pc5 = pc4 + ldc; + pc6 = pc5 + ldc; + pc7 = pc6 + ldc; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + pa0 = A; + for (i = (m >> 3); i--;) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 8; + pb0 = B + off * 8; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 8; // number of values in A +#else + temp = off + 8; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + + LD_SP2_INC(pa0, 4, src_a0, src_a1); + LD_SP2_INC(pb0, 4, src_b0, src_b1); + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); + res0 = src_a0 * src_b; + res1 = src_a1 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); + res2 = src_a0 * src_b; + res3 = src_a1 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA); + res4 = src_a0 * src_b; + res5 = src_a1 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF); + res6 = src_a0 * src_b; + res7 = src_a1 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0); + res8 = src_a0 * src_b; + res9 = src_a1 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55); + res10 = src_a0 * src_b; + res11 = src_a1 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA); + res12 = src_a0 * src_b; + res13 = src_a1 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF); + res14 = src_a0 * src_b; + res15 = src_a1 * src_b; + + for (l = ((temp - 1) >> 1); l--;) + { + LD_SP2_INC(pa0, 4, src_a0, src_a1); + LD_SP2_INC(pb0, 4, src_b0, src_b1); + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); + res0 += src_a0 * src_b; + res1 += src_a1 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); + res2 += src_a0 * src_b; + res3 += src_a1 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA); + res4 += src_a0 * src_b; + res5 += src_a1 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF); + res6 += src_a0 * src_b; + res7 += src_a1 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0); + res8 += src_a0 * src_b; + res9 += src_a1 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55); + res10 += src_a0 * src_b; + res11 += src_a1 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA); + res12 += src_a0 * src_b; + res13 += src_a1 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF); + res14 += src_a0 * src_b; + res15 += src_a1 * 
src_b; + + LD_SP2_INC(pa0, 4, src_a0, src_a1); + LD_SP2_INC(pb0, 4, src_b0, src_b1); + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); + res0 += src_a0 * src_b; + res1 += src_a1 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); + res2 += src_a0 * src_b; + res3 += src_a1 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA); + res4 += src_a0 * src_b; + res5 += src_a1 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF); + res6 += src_a0 * src_b; + res7 += src_a1 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0); + res8 += src_a0 * src_b; + res9 += src_a1 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55); + res10 += src_a0 * src_b; + res11 += src_a1 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA); + res12 += src_a0 * src_b; + res13 += src_a1 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF); + res14 += src_a0 * src_b; + res15 += src_a1 * src_b; + } + + if ((temp - 1) & 1) + { + LD_SP2_INC(pa0, 4, src_a0, src_a1); + LD_SP2_INC(pb0, 4, src_b0, src_b1); + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); + res0 += src_a0 * src_b; + res1 += src_a1 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); + res2 += src_a0 * src_b; + res3 += src_a1 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA); + res4 += src_a0 * src_b; + res5 += src_a1 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF); + res6 += src_a0 * src_b; + res7 += src_a1 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0); + res8 += src_a0 * src_b; + res9 += src_a1 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55); + res10 += src_a0 * src_b; + res11 += src_a1 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA); + res12 += src_a0 * src_b; + res13 += src_a1 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF); + res14 += src_a0 * src_b; + res15 += src_a1 * src_b; + } + +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; + dst1 = res1 * v_alpha; + dst2 = res2 * v_alpha; + dst3 = res3 * v_alpha; + dst4 = res4 * v_alpha; + dst5 = res5 * v_alpha; + dst6 = res6 * v_alpha; + dst7 = res7 * v_alpha; +#else + LD_SP2(pc0, 4, dst0, dst1); + LD_SP2(pc1, 4, dst2, dst3); + LD_SP2(pc2, 4, dst4, dst5); + LD_SP2(pc3, 4, dst6, dst7); + + dst0 += res0 * v_alpha; + dst1 += res1 * v_alpha; + dst2 += res2 * v_alpha; + dst3 += res3 * v_alpha; + dst4 += res4 * v_alpha; + dst5 += res5 * v_alpha; + dst6 += res6 * v_alpha; + dst7 += res7 * v_alpha; +#endif + ST_SP2_INC(dst0, dst1, pc0, 4); + ST_SP2_INC(dst2, dst3, pc1, 4); + ST_SP2_INC(dst4, dst5, pc2, 4); + ST_SP2_INC(dst6, dst7, pc3, 4); + +#if defined(TRMMKERNEL) + dst0 = res8 * v_alpha; + dst1 = res9 * v_alpha; + dst2 = res10 * v_alpha; + dst3 = res11 * v_alpha; + dst4 = res12 * v_alpha; + dst5 = res13 * v_alpha; + dst6 = res14 * v_alpha; + dst7 = res15 * v_alpha; +#else + LD_SP2(pc4, 4, dst0, dst1); + LD_SP2(pc5, 4, dst2, dst3); + LD_SP2(pc6, 4, dst4, dst5); + LD_SP2(pc7, 4, dst6, dst7); + + dst0 += res8 * v_alpha; + dst1 += res9 * v_alpha; + dst2 += res10 * v_alpha; + dst3 += res11 * v_alpha; + dst4 += res12 * v_alpha; + dst5 += res13 * v_alpha; + dst6 += res14 * v_alpha; + dst7 += res15 * v_alpha; +#endif + ST_SP2_INC(dst0, dst1, pc4, 4); + ST_SP2_INC(dst2, dst3, pc5, 4); + ST_SP2_INC(dst4, dst5, pc6, 4); + ST_SP2_INC(dst6, dst7, pc7, 4); + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 8; // number of values in A +#else + temp -= 8; // number of values 
in B +#endif + pa0 += temp * 8; + pb0 += temp * 8; +#endif + +#ifdef LEFT + off += 8; // number of values in A +#endif +#endif + } + + if (m & 4) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 4; + pb0 = B + off * 8; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 4; // number of values in A +#else + temp = off + 8; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + + src_a0 = LD_SP(pa0); + LD_SP2_INC(pb0, 4, src_b0, src_b1); + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); + res0 = src_a0 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); + res1 = src_a0 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA); + res2 = src_a0 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF); + res3 = src_a0 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0); + res4 = src_a0 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55); + res5 = src_a0 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA); + res6 = src_a0 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF); + res7 = src_a0 * src_b; + + pa0 += 4; + + for (l = ((temp - 1) >> 1); l--;) + { + src_a0 = LD_SP(pa0); + LD_SP2_INC(pb0, 4, src_b0, src_b1); + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); + res0 += src_a0 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); + res1 += src_a0 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA); + res2 += src_a0 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF); + res3 += src_a0 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0); + res4 += src_a0 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55); + res5 += src_a0 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA); + res6 += src_a0 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF); + res7 += src_a0 * src_b; + + pa0 += 4; + + src_a0 = LD_SP(pa0); + LD_SP2_INC(pb0, 4, src_b0, src_b1); + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); + res0 += src_a0 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); + res1 += src_a0 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA); + res2 += src_a0 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF); + res3 += src_a0 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0); + res4 += src_a0 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55); + res5 += src_a0 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA); + res6 += src_a0 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF); + res7 += src_a0 * src_b; + + pa0 += 4; + } + + if ((temp - 1) & 1) + { + src_a0 = LD_SP(pa0); + LD_SP2_INC(pb0, 4, src_b0, src_b1); + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); + res0 += src_a0 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); + res1 += src_a0 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA); + res2 += src_a0 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF); + res3 += src_a0 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0); + res4 += src_a0 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55); + res5 += src_a0 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA); + res6 += src_a0 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF); + res7 += src_a0 * src_b; + + pa0 += 4; + } + +#if defined(TRMMKERNEL) + 
dst0 = res0 * v_alpha; + dst1 = res1 * v_alpha; + dst2 = res2 * v_alpha; + dst3 = res3 * v_alpha; +#else + dst0 = LD_SP(pc0); + dst1 = LD_SP(pc1); + dst2 = LD_SP(pc2); + dst3 = LD_SP(pc3); + + dst0 += res0 * v_alpha; + dst1 += res1 * v_alpha; + dst2 += res2 * v_alpha; + dst3 += res3 * v_alpha; +#endif + ST_SP(dst0, pc0); + ST_SP(dst1, pc1); + ST_SP(dst2, pc2); + ST_SP(dst3, pc3); + +#if defined(TRMMKERNEL) + dst0 = res4 * v_alpha; + dst1 = res5 * v_alpha; + dst2 = res6 * v_alpha; + dst3 = res7 * v_alpha; +#else + dst0 = LD_SP(pc4); + dst1 = LD_SP(pc5); + dst2 = LD_SP(pc6); + dst3 = LD_SP(pc7); + + dst0 += res4 * v_alpha; + dst1 += res5 * v_alpha; + dst2 += res6 * v_alpha; + dst3 += res7 * v_alpha; +#endif + ST_SP(dst0, pc4); + ST_SP(dst1, pc5); + ST_SP(dst2, pc6); + ST_SP(dst3, pc7); + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 4; // number of values in A +#else + temp -= 8; // number of values in B +#endif + pa0 += temp * 4; + pb0 += temp * 8; +#endif + +#ifdef LEFT + off += 4; // number of values in A +#endif +#endif + + pc0 += 4; + pc1 += 4; + pc2 += 4; + pc3 += 4; + pc4 += 4; + pc5 += 4; + pc6 += 4; + pc7 += 4; + } + + if (m & 2) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2; + pb0 = B + off * 8; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 2; // number of values in A +#else + temp = off + 8; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 = a0 * b0; + + a1 = pa0[1]; + tmp1 = a1 * b0; + + b1 = pb0[1]; + tmp2 = a0 * b1; + tmp3 = a1 * b1; + + b2 = pb0[2]; + tmp4 = a0 * b2; + tmp5 = a1 * b2; + + b3 = pb0[3]; + tmp6 = a0 * b3; + tmp7 = a1 * b3; + + b4 = pb0[4]; + tmp8 = a0 * b4; + tmp9 = a1 * b4; + + b5 = pb0[5]; + tmp10 = a0 * b5; + tmp11 = a1 * b5; + + b6 = pb0[6]; + tmp12 = a0 * b6; + tmp13 = a1 * b6; + + b7 = pb0[7]; + tmp14 = a0 * b7; + tmp15 = a1 * b7; + + pa0 += 2; + pb0 += 8; + + for (l = ((temp - 1) >> 1); l--;) + { + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 += a0 * b0; + + a1 = pa0[1]; + tmp1 += a1 * b0; + + b1 = pb0[1]; + tmp2 += a0 * b1; + tmp3 += a1 * b1; + + b2 = pb0[2]; + tmp4 += a0 * b2; + tmp5 += a1 * b2; + + b3 = pb0[3]; + tmp6 += a0 * b3; + tmp7 += a1 * b3; + + b4 = pb0[4]; + tmp8 += a0 * b4; + tmp9 += a1 * b4; + + b5 = pb0[5]; + tmp10 += a0 * b5; + tmp11 += a1 * b5; + + b6 = pb0[6]; + tmp12 += a0 * b6; + tmp13 += a1 * b6; + + b7 = pb0[7]; + tmp14 += a0 * b7; + tmp15 += a1 * b7; + + pa0 += 2; + pb0 += 8; + + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 += a0 * b0; + + a1 = pa0[1]; + tmp1 += a1 * b0; + + b1 = pb0[1]; + tmp2 += a0 * b1; + tmp3 += a1 * b1; + + b2 = pb0[2]; + tmp4 += a0 * b2; + tmp5 += a1 * b2; + + b3 = pb0[3]; + tmp6 += a0 * b3; + tmp7 += a1 * b3; + + b4 = pb0[4]; + tmp8 += a0 * b4; + tmp9 += a1 * b4; + + b5 = pb0[5]; + tmp10 += a0 * b5; + tmp11 += a1 * b5; + + b6 = pb0[6]; + tmp12 += a0 * b6; + tmp13 += a1 * b6; + + b7 = pb0[7]; + tmp14 += a0 * b7; + tmp15 += a1 * b7; + + pa0 += 2; + pb0 += 8; + } + + if ((temp - 1) & 1) + { + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 += a0 * b0; + + a1 = pa0[1]; + tmp1 += a1 * b0; + + b1 = pb0[1]; + tmp2 += a0 * b1; + tmp3 += a1 * b1; + + b2 = pb0[2]; + tmp4 += a0 * b2; + tmp5 += a1 * b2; + + b3 = pb0[3]; + tmp6 += a0 * b3; + tmp7 += a1 * b3; + + b4 = pb0[4]; + tmp8 += a0 * b4; + tmp9 += 
a1 * b4; + + b5 = pb0[5]; + tmp10 += a0 * b5; + tmp11 += a1 * b5; + + b6 = pb0[6]; + tmp12 += a0 * b6; + tmp13 += a1 * b6; + + b7 = pb0[7]; + tmp14 += a0 * b7; + tmp15 += a1 * b7; + + pa0 += 2; + pb0 += 8; + } + + tmp0 = alpha * tmp0; + tmp2 = alpha * tmp2; + tmp4 = alpha * tmp4; + tmp6 = alpha * tmp6; + tmp8 = alpha * tmp8; + tmp10 = alpha * tmp10; + tmp12 = alpha * tmp12; + tmp14 = alpha * tmp14; + +#if defined(TRMMKERNEL) + pc0[0] = tmp0; + pc1[0] = tmp2; + pc2[0] = tmp4; + pc3[0] = tmp6; + pc4[0] = tmp8; + pc5[0] = tmp10; + pc6[0] = tmp12; + pc7[0] = tmp14; +#else + pc0[0] += tmp0; + pc1[0] += tmp2; + pc2[0] += tmp4; + pc3[0] += tmp6; + pc4[0] += tmp8; + pc5[0] += tmp10; + pc6[0] += tmp12; + pc7[0] += tmp14; +#endif + tmp1 = alpha * tmp1; + tmp3 = alpha * tmp3; + tmp5 = alpha * tmp5; + tmp7 = alpha * tmp7; + tmp9 = alpha * tmp9; + tmp11 = alpha * tmp11; + tmp13 = alpha * tmp13; + tmp15 = alpha * tmp15; + +#if defined(TRMMKERNEL) + pc0[1] = tmp1; + pc1[1] = tmp3; + pc2[1] = tmp5; + pc3[1] = tmp7; + pc4[1] = tmp9; + pc5[1] = tmp11; + pc6[1] = tmp13; + pc7[1] = tmp15; +#else + pc0[1] += tmp1; + pc1[1] += tmp3; + pc2[1] += tmp5; + pc3[1] += tmp7; + pc4[1] += tmp9; + pc5[1] += tmp11; + pc6[1] += tmp13; + pc7[1] += tmp15; +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 2; // number of values in A +#else + temp -= 8; // number of values in B +#endif + pa0 += temp * 2; + pb0 += temp * 8; +#endif + +#ifdef LEFT + off += 2; // number of values in A +#endif +#endif + + pc0 += 2; + pc1 += 2; + pc2 += 2; + pc3 += 2; + pc4 += 2; + pc5 += 2; + pc6 += 2; + pc7 += 2; + } + + if (m & 1) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 1; + pb0 = B + off * 8; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 1; // number of values in A +#else + temp = off + 8; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 = a0 * b0; + + b1 = pb0[1]; + tmp1 = a0 * b1; + + b2 = pb0[2]; + tmp2 = a0 * b2; + + b3 = pb0[3]; + tmp3 = a0 * b3; + + b4 = pb0[4]; + tmp4 = a0 * b4; + + b5 = pb0[5]; + tmp5 = a0 * b5; + + b6 = pb0[6]; + tmp6 = a0 * b6; + + b7 = pb0[7]; + tmp7 = a0 * b7; + + pa0 += 1; + pb0 += 8; + + for (l = ((temp - 1) >> 1); l--;) + { + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 += a0 * b0; + + b1 = pb0[1]; + tmp1 += a0 * b1; + + b2 = pb0[2]; + tmp2 += a0 * b2; + + b3 = pb0[3]; + tmp3 += a0 * b3; + + b4 = pb0[4]; + tmp4 += a0 * b4; + + b5 = pb0[5]; + tmp5 += a0 * b5; + + b6 = pb0[6]; + tmp6 += a0 * b6; + + b7 = pb0[7]; + tmp7 += a0 * b7; + + pa0 += 1; + pb0 += 8; + + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 += a0 * b0; + + b1 = pb0[1]; + tmp1 += a0 * b1; + + b2 = pb0[2]; + tmp2 += a0 * b2; + + b3 = pb0[3]; + tmp3 += a0 * b3; + + b4 = pb0[4]; + tmp4 += a0 * b4; + + b5 = pb0[5]; + tmp5 += a0 * b5; + + b6 = pb0[6]; + tmp6 += a0 * b6; + + b7 = pb0[7]; + tmp7 += a0 * b7; + + pa0 += 1; + pb0 += 8; + } + + if ((temp - 1) & 1) + { + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 += a0 * b0; + + b1 = pb0[1]; + tmp1 += a0 * b1; + + b2 = pb0[2]; + tmp2 += a0 * b2; + + b3 = pb0[3]; + tmp3 += a0 * b3; + + b4 = pb0[4]; + tmp4 += a0 * b4; + + b5 = pb0[5]; + tmp5 += a0 * b5; + + b6 = pb0[6]; + tmp6 += a0 * b6; + + b7 = pb0[7]; + tmp7 += a0 * b7; + + pa0 += 1; + pb0 += 8; + } + + tmp0 = alpha * tmp0; + 
tmp1 = alpha * tmp1; + tmp2 = alpha * tmp2; + tmp3 = alpha * tmp3; + tmp4 = alpha * tmp4; + tmp5 = alpha * tmp5; + tmp6 = alpha * tmp6; + tmp7 = alpha * tmp7; + +#if defined(TRMMKERNEL) + pc0[0] = tmp0; + pc1[0] = tmp1; + pc2[0] = tmp2; + pc3[0] = tmp3; + pc4[0] = tmp4; + pc5[0] = tmp5; + pc6[0] = tmp6; + pc7[0] = tmp7; +#else + pc0[0] += tmp0; + pc1[0] += tmp1; + pc2[0] += tmp2; + pc3[0] += tmp3; + pc4[0] += tmp4; + pc5[0] += tmp5; + pc6[0] += tmp6; + pc7[0] += tmp7; +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 1; // number of values in A +#else + temp -= 8; // number of values in B +#endif + pa0 += temp * 1; + pb0 += temp * 8; +#endif + +#ifdef LEFT + off += 1; // number of values in A +#endif +#endif + pc0 += 1; + pc1 += 1; + pc2 += 1; + pc3 += 1; + pc4 += 1; + pc5 += 1; + pc6 += 1; + pc7 += 1; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 8; // number of values in A +#endif + + l = (k << 3); + B = B + l; + i = (ldc << 3); + C = C + i; + } + + if (n & 4) + { + pc0 = C; + pc1 = pc0 + ldc; + pc2 = pc1 + ldc; + pc3 = pc2 + ldc; + + pa0 = A; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + for (i = (m >> 3); i--;) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 8; + pb0 = B + off * 4; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 8; // number of values in A +#else + temp = off + 4; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + + LD_SP2_INC(pa0, 4, src_a0, src_a1); + src_b0 = LD_SP(pb0); + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); + res0 = src_a0 * src_b; + res1 = src_a1 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); + res2 = src_a0 * src_b; + res3 = src_a1 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA); + res4 = src_a0 * src_b; + res5 = src_a1 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF); + res6 = src_a0 * src_b; + res7 = src_a1 * src_b; + + pb0 += 4; + + for (l = ((temp - 1) >> 1); l--;) + { + LD_SP2_INC(pa0, 4, src_a0, src_a1); + src_b0 = LD_SP(pb0); + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); + res0 += src_a0 * src_b; + res1 += src_a1 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); + res2 += src_a0 * src_b; + res3 += src_a1 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA); + res4 += src_a0 * src_b; + res5 += src_a1 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF); + res6 += src_a0 * src_b; + res7 += src_a1 * src_b; + + pb0 += 4; + + LD_SP2_INC(pa0, 4, src_a0, src_a1); + src_b0 = LD_SP(pb0); + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); + res0 += src_a0 * src_b; + res1 += src_a1 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); + res2 += src_a0 * src_b; + res3 += src_a1 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA); + res4 += src_a0 * src_b; + res5 += src_a1 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF); + res6 += src_a0 * src_b; + res7 += src_a1 * src_b; + + pb0 += 4; + } + + if ((temp - 1) & 1) + { + LD_SP2_INC(pa0, 4, src_a0, src_a1); + src_b0 = LD_SP(pb0); + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); + res0 += src_a0 * src_b; + res1 += src_a1 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); + res2 += src_a0 * src_b; + res3 += src_a1 * src_b; + 
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA); + res4 += src_a0 * src_b; + res5 += src_a1 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF); + res6 += src_a0 * src_b; + res7 += src_a1 * src_b; + + pb0 += 4; + } + +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; + dst1 = res1 * v_alpha; + dst2 = res2 * v_alpha; + dst3 = res3 * v_alpha; + dst4 = res4 * v_alpha; + dst5 = res5 * v_alpha; + dst6 = res6 * v_alpha; + dst7 = res7 * v_alpha; +#else + LD_SP2(pc0, 4, dst0, dst1); + LD_SP2(pc1, 4, dst2, dst3); + LD_SP2(pc2, 4, dst4, dst5); + LD_SP2(pc3, 4, dst6, dst7); + + dst0 += res0 * v_alpha; + dst1 += res1 * v_alpha; + dst2 += res2 * v_alpha; + dst3 += res3 * v_alpha; + dst4 += res4 * v_alpha; + dst5 += res5 * v_alpha; + dst6 += res6 * v_alpha; + dst7 += res7 * v_alpha; +#endif + + ST_SP2_INC(dst0, dst1, pc0, 4); + ST_SP2_INC(dst2, dst3, pc1, 4); + ST_SP2_INC(dst4, dst5, pc2, 4); + ST_SP2_INC(dst6, dst7, pc3, 4); + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 8; // number of values in A +#else + temp -= 4; // number of values in B +#endif + pa0 += temp * 8; + pb0 += temp * 4; +#endif + +#ifdef LEFT + off += 8; // number of values in A +#endif +#endif + } + + if (m & 4) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 4; + pb0 = B + off * 4; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 4; // number of values in A +#else + temp = off + 4; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + + src_a0 = LD_SP(pa0); + src_b0 = LD_SP(pb0); + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); + res0 = src_a0 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); + res1 = src_a0 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA); + res2 = src_a0 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF); + res3 = src_a0 * src_b; + + pa0 += 4; + pb0 += 4; + + for (l = ((temp - 1) >> 1); l--;) + { + src_a0 = LD_SP(pa0); + src_b0 = LD_SP(pb0); + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); + res0 += src_a0 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); + res1 += src_a0 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA); + res2 += src_a0 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF); + res3 += src_a0 * src_b; + + pa0 += 4; + pb0 += 4; + + src_a0 = LD_SP(pa0); + src_b0 = LD_SP(pb0); + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); + res0 += src_a0 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); + res1 += src_a0 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA); + res2 += src_a0 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF); + res3 += src_a0 * src_b; + + pa0 += 4; + pb0 += 4; + } + + if ((temp - 1) & 1) + { + src_a0 = LD_SP(pa0); + src_b0 = LD_SP(pb0); + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); + res0 += src_a0 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); + res1 += src_a0 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA); + res2 += src_a0 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF); + res3 += src_a0 * src_b; + + pa0 += 4; + pb0 += 4; + } +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; + dst1 = res1 * v_alpha; + dst2 = res2 * v_alpha; + dst3 = res3 * v_alpha; +#else + dst0 = LD_SP(pc0); + dst1 = 
LD_SP(pc1); + dst2 = LD_SP(pc2); + dst3 = LD_SP(pc3); + + dst0 += res0 * v_alpha; + dst1 += res1 * v_alpha; + dst2 += res2 * v_alpha; + dst3 += res3 * v_alpha; +#endif + ST_SP(dst0, pc0); + ST_SP(dst1, pc1); + ST_SP(dst2, pc2); + ST_SP(dst3, pc3); + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 4; // number of values in A +#else + temp -= 4; // number of values in B +#endif + pa0 += temp * 4; + pb0 += temp * 4; +#endif + +#ifdef LEFT + off += 4; // number of values in A +#endif +#endif + pc0 += 4; + pc1 += 4; + pc2 += 4; + pc3 += 4; + } + + if (m & 2) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2; + pb0 = B + off * 4; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 2; // number of values in A +#else + temp = off + 4; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 = a0 * b0; + + a1 = pa0[1]; + tmp1 = a1 * b0; + + b1 = pb0[1]; + tmp2 = a0 * b1; + tmp3 = a1 * b1; + + b2 = pb0[2]; + tmp4 = a0 * b2; + tmp5 = a1 * b2; + + b3 = pb0[3]; + tmp6 = a0 * b3; + tmp7 = a1 * b3; + + pa0 += 2; + pb0 += 4; + + for (l = ((temp - 1) >> 1); l--;) + { + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 += a0 * b0; + + a1 = pa0[1]; + tmp1 += a1 * b0; + + b1 = pb0[1]; + tmp2 += a0 * b1; + tmp3 += a1 * b1; + + b2 = pb0[2]; + tmp4 += a0 * b2; + tmp5 += a1 * b2; + + b3 = pb0[3]; + tmp6 += a0 * b3; + tmp7 += a1 * b3; + + pa0 += 2; + pb0 += 4; + + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 += a0 * b0; + + a1 = pa0[1]; + tmp1 += a1 * b0; + + b1 = pb0[1]; + tmp2 += a0 * b1; + tmp3 += a1 * b1; + + b2 = pb0[2]; + tmp4 += a0 * b2; + tmp5 += a1 * b2; + + b3 = pb0[3]; + tmp6 += a0 * b3; + tmp7 += a1 * b3; + + pa0 += 2; + pb0 += 4; + } + + if ((temp - 1) & 1) + { + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 += a0 * b0; + + a1 = pa0[1]; + tmp1 += a1 * b0; + + b1 = pb0[1]; + tmp2 += a0 * b1; + tmp3 += a1 * b1; + + b2 = pb0[2]; + tmp4 += a0 * b2; + tmp5 += a1 * b2; + + b3 = pb0[3]; + tmp6 += a0 * b3; + tmp7 += a1 * b3; + + pa0 += 2; + pb0 += 4; + } + + tmp0 = alpha * tmp0; + tmp2 = alpha * tmp2; + tmp4 = alpha * tmp4; + tmp6 = alpha * tmp6; + +#if defined(TRMMKERNEL) + pc0[0] = tmp0; + pc1[0] = tmp2; + pc2[0] = tmp4; + pc3[0] = tmp6; +#else + pc0[0] += tmp0; + pc1[0] += tmp2; + pc2[0] += tmp4; + pc3[0] += tmp6; +#endif + tmp1 = alpha * tmp1; + tmp3 = alpha * tmp3; + tmp5 = alpha * tmp5; + tmp7 = alpha * tmp7; + +#if defined(TRMMKERNEL) + pc0[1] = tmp1; + pc1[1] = tmp3; + pc2[1] = tmp5; + pc3[1] = tmp7; +#else + pc0[1] += tmp1; + pc1[1] += tmp3; + pc2[1] += tmp5; + pc3[1] += tmp7; +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 2; // number of values in A +#else + temp -= 4; // number of values in B +#endif + pa0 += temp * 2; + pb0 += temp * 4; +#endif + +#ifdef LEFT + off += 2; // number of values in A +#endif +#endif + + pc0 += 2; + pc1 += 2; + pc2 += 2; + pc3 += 2; + } + + if (m & 1) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 1; + pb0 = B + off * 4; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 1; // number 
of values in A +#else + temp = off + 4; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 = a0 * b0; + + b1 = pb0[1]; + tmp1 = a0 * b1; + + b2 = pb0[2]; + tmp2 = a0 * b2; + + b3 = pb0[3]; + tmp3 = a0 * b3; + + pa0 += 1; + pb0 += 4; + + for (l = ((temp - 1) >> 1); l--;) + { + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 += a0 * b0; + + b1 = pb0[1]; + tmp1 += a0 * b1; + + b2 = pb0[2]; + tmp2 += a0 * b2; + + b3 = pb0[3]; + tmp3 += a0 * b3; + + pa0 += 1; + pb0 += 4; + + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 += a0 * b0; + + b1 = pb0[1]; + tmp1 += a0 * b1; + + b2 = pb0[2]; + tmp2 += a0 * b2; + + b3 = pb0[3]; + tmp3 += a0 * b3; + + pa0 += 1; + pb0 += 4; + } + + if ((temp - 1) & 1) + { + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 += a0 * b0; + + b1 = pb0[1]; + tmp1 += a0 * b1; + + b2 = pb0[2]; + tmp2 += a0 * b2; + + b3 = pb0[3]; + tmp3 += a0 * b3; + + pa0 += 1; + pb0 += 4; + } + + tmp0 = alpha * tmp0; + tmp1 = alpha * tmp1; + tmp2 = alpha * tmp2; + tmp3 = alpha * tmp3; + +#if defined(TRMMKERNEL) + pc0[0] = tmp0; + pc1[0] = tmp1; + pc2[0] = tmp2; + pc3[0] = tmp3; +#else + pc0[0] += tmp0; + pc1[0] += tmp1; + pc2[0] += tmp2; + pc3[0] += tmp3; +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 1; // number of values in A +#else + temp -= 4; // number of values in B +#endif + pa0 += temp * 1; + pb0 += temp * 4; +#endif + +#ifdef LEFT + off += 1; // number of values in A +#endif +#endif + pc0 += 1; + pc1 += 1; + pc2 += 1; + pc3 += 1; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 4; // number of values in A +#endif + + l = (k << 2); + B = B + l; + i = (ldc << 2); + C = C + i; + } + + if (n & 2) + { + pc0 = C; + pc1 = pc0 + ldc; + + pa0 = A; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + for (i = (m >> 3); i--;) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 8; + pb0 = B + off * 2; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 8; // number of values in A +#else + temp = off + 2; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + + LD_SP2_INC(pa0, 4, src_a0, src_a1); + src_b0[0] = pb0[0]; + src_b0[1] = pb0[1]; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); + res0 = src_a0 * src_b; + res1 = src_a1 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); + res2 = src_a0 * src_b; + res3 = src_a1 * src_b; + + pb0 += 2; + + for (l = ((temp - 1) >> 1); l--;) + { + LD_SP2_INC(pa0, 4, src_a0, src_a1); + src_b0[0] = pb0[0]; + src_b0[1] = pb0[1]; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); + res0 += src_a0 * src_b; + res1 += src_a1 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); + res2 += src_a0 * src_b; + res3 += src_a1 * src_b; + + pb0 += 2; + + LD_SP2_INC(pa0, 4, src_a0, src_a1); + src_b0[0] = pb0[0]; + src_b0[1] = pb0[1]; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); + res0 += src_a0 * src_b; + res1 += src_a1 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); + res2 += src_a0 * src_b; + res3 += src_a1 * src_b; + + pb0 += 2; + } + + if ((temp - 1) & 1) + { + LD_SP2_INC(pa0, 4, src_a0, src_a1); + src_b0[0] = pb0[0]; + src_b0[1] = pb0[1]; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); + res0 += src_a0 * src_b; + res1 += src_a1 * src_b; + + src_b = (v4f32) 
__msa_shf_w((v4i32) src_b0, 0x55); + res2 += src_a0 * src_b; + res3 += src_a1 * src_b; + + pb0 += 2; + } + +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; + dst1 = res1 * v_alpha; + dst2 = res2 * v_alpha; + dst3 = res3 * v_alpha; +#else + LD_SP2(pc0, 4, dst0, dst1); + LD_SP2(pc1, 4, dst2, dst3); + + dst0 += res0 * v_alpha; + dst1 += res1 * v_alpha; + dst2 += res2 * v_alpha; + dst3 += res3 * v_alpha; +#endif + ST_SP2_INC(dst0, dst1, pc0, 4); + ST_SP2_INC(dst2, dst3, pc1, 4); + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 8; // number of values in A +#else + temp -= 2; // number of values in B +#endif + pa0 += temp * 8; + pb0 += temp * 2; +#endif + +#ifdef LEFT + off += 8; // number of values in A +#endif +#endif + } + + if (m & 4) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 4; + pb0 = B + off * 2; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 4; // number of values in A +#else + temp = off + 2; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + + src_a0 = LD_SP(pa0); + src_b0[0] = pb0[0]; + src_b0[1] = pb0[1]; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); + res0 = src_a0 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); + res1 = src_a0 * src_b; + + pa0 += 4; + pb0 += 2; + + for (l = ((temp - 1) >> 1); l--;) + { + src_a0 = LD_SP(pa0); + src_b0[0] = pb0[0]; + src_b0[1] = pb0[1]; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); + res0 += src_a0 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); + res1 += src_a0 * src_b; + + pa0 += 4; + pb0 += 2; + + src_a0 = LD_SP(pa0); + src_b0[0] = pb0[0]; + src_b0[1] = pb0[1]; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); + res0 += src_a0 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); + res1 += src_a0 * src_b; + + pa0 += 4; + pb0 += 2; + } + + if ((temp - 1) & 1) + { + src_a0 = LD_SP(pa0); + src_b0[0] = pb0[0]; + src_b0[1] = pb0[1]; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); + res0 += src_a0 * src_b; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); + res1 += src_a0 * src_b; + + pa0 += 4; + pb0 += 2; + } + +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; + dst1 = res1 * v_alpha; +#else + dst0 = LD_SP(pc0); + dst1 = LD_SP(pc1); + + dst0 += res0 * v_alpha; + dst1 += res1 * v_alpha; +#endif + ST_SP(dst0, pc0); + ST_SP(dst1, pc1); + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 4; // number of values in A +#else + temp -= 2; // number of values in B +#endif + pa0 += temp * 4; + pb0 += temp * 2; +#endif + +#ifdef LEFT + off += 4; // number of values in A +#endif +#endif + pc0 += 4; + pc1 += 4; + } + + if (m & 2) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2; + pb0 = B + off * 2; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 2; // number of values in A +#else + temp = off + 2; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 = a0 * b0; + + a1 = pa0[1]; + tmp1 = a1 * b0; + + b1 = pb0[1]; + tmp2 = a0 * b1; + tmp3 = a1 * b1; + + 
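Every k-loop in these kernels follows the same schedule: the first iteration is peeled to seed the accumulators, the remaining temp - 1 iterations run unrolled by two, and a final conditional iteration absorbs an odd remainder. A minimal standalone sketch of that schedule, assuming temp >= 1 and using a hypothetical body() callback to stand in for one multiply-accumulate step:

    /* Loop schedule used by the hunks above: peel one iteration, unroll the
       rest by two, then run one cleanup iteration for an odd remainder.
       body() is a hypothetical stand-in for a single k-step of the kernel. */
    static void unroll_by_two(long temp, void (*body)(void *), void *state)
    {
        body(state);                            /* peeled first iteration */
        for (long l = (temp - 1) >> 1; l--; )   /* main loop, two steps per trip */
        {
            body(state);
            body(state);
        }
        if ((temp - 1) & 1)                     /* odd remainder */
            body(state);
    }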
pa0 += 2; + pb0 += 2; + + for (l = ((temp - 1) >> 1); l--;) + { + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 += a0 * b0; + + a1 = pa0[1]; + tmp1 += a1 * b0; + + b1 = pb0[1]; + tmp2 += a0 * b1; + tmp3 += a1 * b1; + + pa0 += 2; + pb0 += 2; + + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 += a0 * b0; + + a1 = pa0[1]; + tmp1 += a1 * b0; + + b1 = pb0[1]; + tmp2 += a0 * b1; + tmp3 += a1 * b1; + + pa0 += 2; + pb0 += 2; + } + + if ((temp - 1) & 1) + { + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 += a0 * b0; + + a1 = pa0[1]; + tmp1 += a1 * b0; + + b1 = pb0[1]; + tmp2 += a0 * b1; + tmp3 += a1 * b1; + + pa0 += 2; + pb0 += 2; + } + + tmp0 = alpha * tmp0; + tmp1 = alpha * tmp1; + tmp2 = alpha * tmp2; + tmp3 = alpha * tmp3; + +#if defined(TRMMKERNEL) + pc0[0] = tmp0; + pc1[0] = tmp2; + pc0[1] = tmp1; + pc1[1] = tmp3; +#else + pc0[0] += tmp0; + pc1[0] += tmp2; + pc0[1] += tmp1; + pc1[1] += tmp3; +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 2; // number of values in A +#else + temp -= 2; // number of values in B +#endif + pa0 += temp * 2; + pb0 += temp * 2; +#endif + +#ifdef LEFT + off += 2; // number of values in A +#endif +#endif + + pc0 += 2; + pc1 += 2; + } + + if (m & 1) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 1; + pb0 = B + off * 2; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 1; // number of values in A +#else + temp = off + 2; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 = a0 * b0; + + b1 = pb0[1]; + tmp1 = a0 * b1; + + pa0 += 1; + pb0 += 2; + + for (l = ((temp - 1) >> 1); l--;) + { + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 += a0 * b0; + + b1 = pb0[1]; + tmp1 += a0 * b1; + + pa0 += 1; + pb0 += 2; + + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 += a0 * b0; + + b1 = pb0[1]; + tmp1 += a0 * b1; + + pa0 += 1; + pb0 += 2; + } + + if ((temp - 1) & 1) + { + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 += a0 * b0; + + b1 = pb0[1]; + tmp1 += a0 * b1; + + pa0 += 1; + pb0 += 2; + } + + tmp0 = alpha * tmp0; + tmp1 = alpha * tmp1; + +#if defined(TRMMKERNEL) + pc0[0] = tmp0; + pc1[0] = tmp1; +#else + pc0[0] += tmp0; + pc1[0] += tmp1; +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 1; // number of values in A +#else + temp -= 2; // number of values in B +#endif + pa0 += temp * 1; + pb0 += temp * 2; +#endif + +#ifdef LEFT + off += 1; // number of values in A +#endif +#endif + pc0 += 1; + pc1 += 1; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; // number of values in A +#endif + l = (k << 1); + B = B + l; + i = (ldc << 1); + C = C + i; + } + + if (n & 1) + { + pc0 = C; + pa0 = A; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + for (i = (m >> 3); i--;) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 8; + pb0 = B + off * 1; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 8; // number of values in A +#else + temp = off + 1; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + + LD_SP2_INC(pa0, 4, src_a0, src_a1); + src_b0[0] = pb0[0]; + + src_b = 
(v4f32) __msa_shf_w((v4i32) src_b0, 0); + res0 = src_a0 * src_b; + res1 = src_a1 * src_b; + + pb0 += 1; + + for (l = ((temp - 1) >> 1); l--;) + { + LD_SP2_INC(pa0, 4, src_a0, src_a1); + src_b0[0] = pb0[0]; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); + res0 += src_a0 * src_b; + res1 += src_a1 * src_b; + + pb0 += 1; + + LD_SP2_INC(pa0, 4, src_a0, src_a1); + src_b0[0] = pb0[0]; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); + res0 += src_a0 * src_b; + res1 += src_a1 * src_b; + + pb0 += 1; + } + + if ((temp - 1) & 1) + { + LD_SP2_INC(pa0, 4, src_a0, src_a1); + src_b0[0] = pb0[0]; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); + res0 += src_a0 * src_b; + res1 += src_a1 * src_b; + + pb0 += 1; + } + +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; + dst1 = res1 * v_alpha; +#else + LD_SP2(pc0, 4, dst0, dst1); + + dst0 += res0 * v_alpha; + dst1 += res1 * v_alpha; +#endif + ST_SP2_INC(dst0, dst1, pc0, 4); + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 8; // number of values in A +#else + temp -= 1; // number of values in B +#endif + pa0 += temp * 8; + pb0 += temp * 1; +#endif + +#ifdef LEFT + off += 8; // number of values in A +#endif +#endif + } + + if (m & 4) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 4; + pb0 = B + off * 1; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 4; // number of values in A +#else + temp = off + 1; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + + src_a0 = LD_SP(pa0); + src_b0[0] = pb0[0]; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); + res0 = src_a0 * src_b; + + pa0 += 4; + pb0 += 1; + + for (l = ((temp - 1) >> 1); l--;) + { + src_a0 = LD_SP(pa0); + src_b0[0] = pb0[0]; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); + res0 += src_a0 * src_b; + + pa0 += 4; + pb0 += 1; + + src_a0 = LD_SP(pa0); + src_b0[0] = pb0[0]; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); + res0 += src_a0 * src_b; + + pa0 += 4; + pb0 += 1; + } + + if ((temp - 1) & 1) + { + src_a0 = LD_SP(pa0); + src_b0[0] = pb0[0]; + + src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); + res0 += src_a0 * src_b; + + pa0 += 4; + pb0 += 1; + } + +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; +#else + dst0 = LD_SP(pc0); + + dst0 += res0 * v_alpha; +#endif + ST_SP(dst0, pc0); + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 4; // number of values in A +#else + temp -= 1; // number of values in B +#endif + pa0 += temp * 4; + pb0 += temp * 1; +#endif + +#ifdef LEFT + off += 4; // number of values in A +#endif +#endif + pc0 += 4; + } + + if (m & 2) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2; + pb0 = B + off * 1; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 2; // number of values in A +#else + temp = off + 1; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 = a0 * b0; + + a1 = pa0[1]; + tmp1 = a1 * b0; + + pa0 += 2; + pb0 += 1; + + for (l = ((temp - 1) >> 1); l--;) + { + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 += a0 * b0; + + a1 
= pa0[1]; + tmp1 += a1 * b0; + + pa0 += 2; + pb0 += 1; + + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 += a0 * b0; + + a1 = pa0[1]; + tmp1 += a1 * b0; + + pa0 += 2; + pb0 += 1; + } + + if ((temp - 1) & 1) + { + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 += a0 * b0; + + a1 = pa0[1]; + tmp1 += a1 * b0; + + pa0 += 2; + pb0 += 1; + } + +#if defined(TRMMKERNEL) + pc0[0] = tmp0; + pc0[1] = tmp1; +#else + pc0[0] += tmp0; + pc0[1] += tmp1; +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 2; // number of values in A +#else + temp -= 1; // number of values in B +#endif + pa0 += temp * 2; + pb0 += temp * 1; +#endif + +#ifdef LEFT + off += 2; // number of values in A +#endif +#endif + + pc0 += 2; + } + + if (m & 1) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 1; + pb0 = B + off * 1; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 1; // number of values in A +#else + temp = off + 1; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 = a0 * b0; + + pa0 += 1; + pb0 += 1; + + for (l = ((temp - 1) >> 1); l--;) + { + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 += a0 * b0; + + pa0 += 1; + pb0 += 1; + + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 += a0 * b0; + + pa0 += 1; + pb0 += 1; + } + + if ((temp - 1) & 1) + { + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 += a0 * b0; + + pa0 += 1; + pb0 += 1; + } + +#if defined(TRMMKERNEL) + pc0[0] = alpha * tmp0; +#else + pc0[0] += alpha * tmp0; +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 1; // number of values in A +#else + temp -= 1; // number of values in B +#endif + pa0 += temp * 1; + pb0 += temp * 1; +#endif + +#ifdef LEFT + off += 1; // number of values in A +#endif +#endif + + pc0 += 1; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 1; // number of values in A +#endif + l = (k << 0); + B = B + l; + i = (ldc << 0); + C = C + i; + } + + return 0; +} diff --git a/kernel/mips/sgemm_ncopy_8_msa.c b/kernel/mips/sgemm_ncopy_8_msa.c new file mode 100644 index 0000000000..8618c44351 --- /dev/null +++ b/kernel/mips/sgemm_ncopy_8_msa.c @@ -0,0 +1,164 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
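The TRMMKERNEL preprocessor guards that recur in every block of the kernel above select the effective depth of the inner product from the triangular offset. A hedged restatement as a plain function, with mr and nr standing for the micro-tile sizes (8/4/2/1 in this file) and left/transa standing in for the LEFT and TRANSA macros:

    /* Hypothetical helper restating the depth selection done by the
       TRMMKERNEL guards: how many k-iterations an mr x nr micro-tile runs. */
    static long trmm_depth(long k, long off, long mr, long nr, int left, int transa)
    {
        if ((left && !transa) || (!left && transa))
            return k - off;        /* remaining panel depth */
        return left ? off + mr     /* number of values in A */
                    : off + nr;    /* number of values in B */
    }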
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst) +{ + BLASLONG i, j; + FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *psrc5, *psrc6, *psrc7; + FLOAT *psrc8, *pdst; + v4f32 src0, src1, src2, src3, src4, src5, src6, src7; + v4f32 src8, src9, src10, src11, src12, src13, src14, src15; + v4f32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v4f32 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15; + + psrc0 = src; + pdst = dst; + + for (j = (n >> 3); j--;) + { + psrc1 = psrc0; + psrc2 = psrc1 + lda; + psrc3 = psrc2 + lda; + psrc4 = psrc3 + lda; + psrc5 = psrc4 + lda; + psrc6 = psrc5 + lda; + psrc7 = psrc6 + lda; + psrc8 = psrc7 + lda; + psrc0 += 8 * lda; + + for (i = (m >> 3); i--;) + { + LD_SP2_INC(psrc1, 4, src0, src1); + LD_SP2_INC(psrc2, 4, src2, src3); + LD_SP2_INC(psrc3, 4, src4, src5); + LD_SP2_INC(psrc4, 4, src6, src7); + LD_SP2_INC(psrc5, 4, src8, src9); + LD_SP2_INC(psrc6, 4, src10, src11); + LD_SP2_INC(psrc7, 4, src12, src13); + LD_SP2_INC(psrc8, 4, src14, src15); + + TRANSPOSE4x4_SP_SP(src0, src2, src4, src6, dst0, dst2, dst4, dst6); + TRANSPOSE4x4_SP_SP(src8, src10, src12, src14, dst1, dst3, dst5, + dst7); + TRANSPOSE4x4_SP_SP(src1, src3, src5, src7, dst8, dst10, dst12, + dst14); + TRANSPOSE4x4_SP_SP(src9, src11, src13, src15, dst9, dst11, dst13, + dst15); + + ST_SP2_INC(dst0, dst1, pdst, 4); + ST_SP2_INC(dst2, dst3, pdst, 4); + ST_SP2_INC(dst4, dst5, pdst, 4); + ST_SP2_INC(dst6, dst7, pdst, 4); + ST_SP2_INC(dst8, dst9, pdst, 4); + ST_SP2_INC(dst10, dst11, pdst, 4); + ST_SP2_INC(dst12, dst13, pdst, 4); + ST_SP2_INC(dst14, dst15, pdst, 4); + } + + for (i = (m & 7); i--;) + { + *pdst++ = *psrc1++; + *pdst++ = *psrc2++; + *pdst++ = *psrc3++; + *pdst++ = *psrc4++; + *pdst++ = *psrc5++; + *pdst++ = *psrc6++; + *pdst++ = *psrc7++; + *pdst++ = *psrc8++; + } + } + + if (n & 4) + { + psrc1 = psrc0; + psrc2 = psrc1 + lda; + psrc3 = psrc2 + lda; + psrc4 = psrc3 + lda; + psrc0 += 4 * lda; + + for (i = (m >> 2); i--;) + { + src0 = LD_SP(psrc1); + src1 = LD_SP(psrc2); + src2 = LD_SP(psrc3); + src3 = LD_SP(psrc4); + psrc1 += 4; + psrc2 += 4; + psrc3 += 4; + psrc4 += 4; + + TRANSPOSE4x4_SP_SP(src0, src1, src2, src3, dst0, dst1, dst2, dst3); + + ST_SP2_INC(dst0, dst1, pdst, 4); + ST_SP2_INC(dst2, dst3, pdst, 4); + } + + for (i = (m & 3); i--;) + { + *pdst++ = *psrc1++; + *pdst++ = *psrc2++; + *pdst++ = *psrc3++; + *pdst++ = *psrc4++; + } + } + + if (n & 2) + { + psrc1 = psrc0; + psrc2 = psrc1 + lda; + psrc0 += 2 * lda; + + for (i = (m >> 1); i--;) + { + *pdst++ = *psrc1++; + *pdst++ = *psrc2++; + *pdst++ = *psrc1++; + *pdst++ = *psrc2++; + } + + if (m & 1) + { + *pdst++ = *psrc1++; + *pdst++ = *psrc2++; + } + } + + if (n & 1) + { + psrc1 = psrc0; + + for (i = m; i--;) + { + *pdst++ = *psrc1++; + } + } + + return 0; +} diff --git a/kernel/mips/sgemm_tcopy_8_msa.c b/kernel/mips/sgemm_tcopy_8_msa.c new file mode 100644 index 
0000000000..3542eca211 --- /dev/null +++ b/kernel/mips/sgemm_tcopy_8_msa.c @@ -0,0 +1,271 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst) +{ + BLASLONG i, j; + FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *psrc5, *psrc6, *psrc7; + FLOAT *psrc8, *pdst0, *pdst1, *pdst2, *pdst3, *pdst4; + v4f32 src0, src1, src2, src3, src4, src5, src6, src7; + v4f32 src8, src9, src10, src11, src12, src13, src14, src15; + + psrc0 = src; + pdst0 = dst; + + pdst2 = dst + m * (n & ~7); + pdst3 = dst + m * (n & ~3); + pdst4 = dst + m * (n & ~1); + + for (j = (m >> 3); j--;) + { + psrc1 = psrc0; + psrc2 = psrc1 + lda; + psrc3 = psrc2 + lda; + psrc4 = psrc3 + lda; + psrc5 = psrc4 + lda; + psrc6 = psrc5 + lda; + psrc7 = psrc6 + lda; + psrc8 = psrc7 + lda; + psrc0 += 8 * lda; + + pdst1 = pdst0; + pdst0 += 64; + + for (i = (n >> 3); i--;) + { + LD_SP2_INC(psrc1, 4, src0, src1); + LD_SP2_INC(psrc2, 4, src2, src3); + LD_SP2_INC(psrc3, 4, src4, src5); + LD_SP2_INC(psrc4, 4, src6, src7); + LD_SP2_INC(psrc5, 4, src8, src9); + LD_SP2_INC(psrc6, 4, src10, src11); + LD_SP2_INC(psrc7, 4, src12, src13); + LD_SP2_INC(psrc8, 4, src14, src15); + + ST_SP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 4); + ST_SP8(src8, src9, src10, src11, src12, src13, src14, src15, + pdst1 + 32, 4); + pdst1 += m * 8; + } + + if (n & 4) + { + src0 = LD_SP(psrc1); + src1 = LD_SP(psrc2); + src2 = LD_SP(psrc3); + src3 = LD_SP(psrc4); + src4 = LD_SP(psrc5); + src5 = LD_SP(psrc6); + src6 = LD_SP(psrc7); + src7 = LD_SP(psrc8); + psrc1 += 4; + psrc2 += 4; + psrc3 += 4; + psrc4 += 4; + psrc5 += 4; + psrc6 += 4; + psrc7 += 4; + psrc8 += 4; + + ST_SP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 4); + } + + if (n & 2) + { + *pdst3++ = *psrc1++; + *pdst3++ = *psrc1++; + *pdst3++ = *psrc2++; + *pdst3++ = *psrc2++; + 
*pdst3++ = *psrc3++; + *pdst3++ = *psrc3++; + *pdst3++ = *psrc4++; + *pdst3++ = *psrc4++; + *pdst3++ = *psrc5++; + *pdst3++ = *psrc5++; + *pdst3++ = *psrc6++; + *pdst3++ = *psrc6++; + *pdst3++ = *psrc7++; + *pdst3++ = *psrc7++; + *pdst3++ = *psrc8++; + *pdst3++ = *psrc8++; + } + + if (n & 1) + { + *pdst4++ = *psrc1++; + *pdst4++ = *psrc2++; + *pdst4++ = *psrc3++; + *pdst4++ = *psrc4++; + *pdst4++ = *psrc5++; + *pdst4++ = *psrc6++; + *pdst4++ = *psrc7++; + *pdst4++ = *psrc8++; + } + } + + if (m & 4) + { + psrc1 = psrc0; + psrc2 = psrc1 + lda; + psrc3 = psrc2 + lda; + psrc4 = psrc3 + lda; + psrc0 += 4 * lda; + + pdst1 = pdst0; + pdst0 += 32; + + for (i = (n >> 3); i--;) + { + LD_SP2_INC(psrc1, 4, src0, src1); + LD_SP2_INC(psrc2, 4, src2, src3); + LD_SP2_INC(psrc3, 4, src4, src5); + LD_SP2_INC(psrc4, 4, src6, src7); + + ST_SP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 4); + pdst1 += 8 * m; + } + + if (n & 4) + { + src0 = LD_SP(psrc1); + src1 = LD_SP(psrc2); + src2 = LD_SP(psrc3); + src3 = LD_SP(psrc4); + psrc1 += 4; + psrc2 += 4; + psrc3 += 4; + psrc4 += 4; + + ST_SP4_INC(src0, src1, src2, src3, pdst2, 4); + } + + if (n & 2) + { + *pdst3++ = *psrc1++; + *pdst3++ = *psrc1++; + *pdst3++ = *psrc2++; + *pdst3++ = *psrc2++; + *pdst3++ = *psrc3++; + *pdst3++ = *psrc3++; + *pdst3++ = *psrc4++; + *pdst3++ = *psrc4++; + } + + if (n & 1) + { + *pdst4++ = *psrc1++; + *pdst4++ = *psrc2++; + *pdst4++ = *psrc3++; + *pdst4++ = *psrc4++; + } + } + + if (m & 2) + { + psrc1 = psrc0; + psrc2 = psrc1 + lda; + psrc0 += 2 * lda; + + pdst1 = pdst0; + pdst0 += 16; + + for (i = (n >> 3); i--;) + { + LD_SP2_INC(psrc1, 4, src0, src1); + LD_SP2_INC(psrc2, 4, src2, src3); + + ST_SP4(src0, src1, src2, src3, pdst1, 4); + pdst1 += 8 * m; + } + + if (n & 4) + { + src0 = LD_SP(psrc1); + src1 = LD_SP(psrc2); + psrc1 += 4; + psrc2 += 4; + + ST_SP2_INC(src0, src1, pdst2, 4); + } + + if (n & 2) + { + *pdst3++ = *psrc1++; + *pdst3++ = *psrc1++; + *pdst3++ = *psrc2++; + *pdst3++ = *psrc2++; + } + + if (n & 1) + { + *pdst4++ = *psrc1++; + *pdst4++ = *psrc2++; + } + } + + if (m & 1) + { + psrc1 = psrc0; + psrc0 += lda; + + pdst1 = pdst0; + pdst0 += 8; + + for (i = (n >> 3); i--;) + { + LD_SP2_INC(psrc1, 4, src0, src1); + + ST_SP2(src0, src1, pdst1, 4); + pdst1 += 8 * m; + } + + if (n & 4) + { + src0 = LD_SP(psrc1); + psrc1 += 4; + + ST_SP(src0, pdst2); + pdst2 += 4; + } + + if (n & 2) + { + *pdst3++ = *psrc1++; + *pdst3++ = *psrc1++; + } + + if (n & 1) + { + *pdst4++ = *psrc1++; + } + } + + return 0; +} diff --git a/kernel/mips/sgemv_n_msa.c b/kernel/mips/sgemv_n_msa.c new file mode 100644 index 0000000000..ae6e6558fb --- /dev/null +++ b/kernel/mips/sgemv_n_msa.c @@ -0,0 +1,515 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
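The two copy kernels above pack panels for the 8-wide micro-kernel: sgemm_ncopy_8_msa.c emits, for every row, one element from each column of an 8/4/2/1-column group (the 4x4 transposes produce exactly that row-interleaved order), while sgemm_tcopy_8_msa.c stores contiguous 8/4/2/1-wide row chunks. A hypothetical scalar reference for the ncopy layout, under those assumptions:

    /* Scalar reference for the ncopy packing: columns in groups of 8, 4, 2, 1;
       within a group each row contributes one element per column. */
    static void ref_sgemm_ncopy(long m, long n, const float *src, long lda, float *dst)
    {
        long j = 0;
        for (long grp = 8; grp >= 1; grp >>= 1)
        {
            long reps = (grp == 8) ? (n >> 3) : ((n & grp) ? 1 : 0);
            for (long r = 0; r < reps; r++, j += grp)
                for (long i = 0; i < m; i++)
                    for (long jj = 0; jj < grp; jj++)
                        *dst++ = src[i + (j + jj) * lda];
        }
    }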
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +#define SGEMV_N_8x8() \ +{ \ + LD_SP2(pa0 + k, 4, t0, t1); \ + LD_SP2(pa1 + k, 4, t2, t3); \ + LD_SP2(pa2 + k, 4, t4, t5); \ + LD_SP2(pa3 + k, 4, t6, t7); \ + LD_SP2(pa4 + k, 4, t8, t9); \ + LD_SP2(pa5 + k, 4, t10, t11); \ + LD_SP2(pa6 + k, 4, t12, t13); \ + LD_SP2(pa7 + k, 4, t14, t15); \ + \ + y0 += tp0 * t0; \ + y1 += tp0 * t1; \ + \ + y0 += tp1 * t2; \ + y1 += tp1 * t3; \ + \ + y0 += tp2 * t4; \ + y1 += tp2 * t5; \ + \ + y0 += tp3 * t6; \ + y1 += tp3 * t7; \ + \ + y0 += tp4 * t8; \ + y1 += tp4 * t9; \ + \ + y0 += tp5 * t10; \ + y1 += tp5 * t11; \ + \ + y0 += tp6 * t12; \ + y1 += tp6 * t13; \ + \ + y0 += tp7 * t14; \ + y1 += tp7 * t15; \ +} + +#define SGEMV_N_4x8() \ +{ \ + t0 = LD_SP(pa0 + k); \ + t2 = LD_SP(pa1 + k); \ + t4 = LD_SP(pa2 + k); \ + t6 = LD_SP(pa3 + k); \ + t8 = LD_SP(pa4 + k); \ + t10 = LD_SP(pa5 + k); \ + t12 = LD_SP(pa6 + k); \ + t14 = LD_SP(pa7 + k); \ + \ + y0 += tp0 * t0; \ + y0 += tp1 * t2; \ + y0 += tp2 * t4; \ + y0 += tp3 * t6; \ + y0 += tp4 * t8; \ + y0 += tp5 * t10; \ + y0 += tp6 * t12; \ + y0 += tp7 * t14; \ +} + +#define SGEMV_N_8x4() \ +{ \ + LD_SP2(pa0 + k, 4, t0, t1); \ + LD_SP2(pa1 + k, 4, t2, t3); \ + LD_SP2(pa2 + k, 4, t4, t5); \ + LD_SP2(pa3 + k, 4, t6, t7); \ + \ + y0 += tp0 * t0; \ + y1 += tp0 * t1; \ + \ + y0 += tp1 * t2; \ + y1 += tp1 * t3; \ + \ + y0 += tp2 * t4; \ + y1 += tp2 * t5; \ + \ + y0 += tp3 * t6; \ + y1 += tp3 * t7; \ +} + +#define SGEMV_N_4x4() \ +{ \ + t0 = LD_SP(pa0 + k); \ + t2 = LD_SP(pa1 + k); \ + t4 = LD_SP(pa2 + k); \ + t6 = LD_SP(pa3 + k); \ + \ + y0 += tp0 * t0; \ + y0 += tp1 * t2; \ + y0 += tp2 * t4; \ + y0 += tp3 * t6; \ +} + +#define SGEMV_N_8x2() \ +{ \ + LD_SP2(pa0 + k, 4, t0, t1); \ + LD_SP2(pa1 + k, 4, t2, t3); \ + \ + y0 += tp0 * t0; \ + y1 += tp0 * t1; \ + \ + y0 += tp1 * t2; \ + y1 += tp1 * t3; \ +} + +#define SGEMV_N_4x2() \ +{ \ + t0 = LD_SP(pa0 + k); \ + t2 = LD_SP(pa1 + k); \ + \ + y0 += tp0 * t0; \ + y0 += tp1 * t2; \ +} + +#define SLOAD_X8_SCALE_GP() \ + temp0 = alpha * x[0 * inc_x]; \ + temp1 = alpha * x[1 * inc_x]; \ + temp2 = alpha * x[2 * inc_x]; \ + temp3 = alpha * x[3 * inc_x]; \ + temp4 = alpha * x[4 * inc_x]; \ + temp5 = alpha * x[5 * inc_x]; \ + temp6 = alpha * x[6 * inc_x]; \ + temp7 = alpha * x[7 * inc_x]; \ + \ + tp0 = COPY_FLOAT_TO_VECTOR(temp0); \ + tp1 = COPY_FLOAT_TO_VECTOR(temp1); \ + tp2 = COPY_FLOAT_TO_VECTOR(temp2); \ + tp3 = COPY_FLOAT_TO_VECTOR(temp3); \ + tp4 = COPY_FLOAT_TO_VECTOR(temp4); \ + tp5 = COPY_FLOAT_TO_VECTOR(temp5); \ + tp6 = COPY_FLOAT_TO_VECTOR(temp6); \ + tp7 = COPY_FLOAT_TO_VECTOR(temp7); \ + +#define SLOAD_X4_SCALE_GP() \ + temp0 = alpha * x[0 * inc_x]; \ + temp1 = alpha * x[1 * inc_x]; \ + 
temp2 = alpha * x[2 * inc_x]; \ + temp3 = alpha * x[3 * inc_x]; \ + \ + tp0 = COPY_FLOAT_TO_VECTOR(temp0); \ + tp1 = COPY_FLOAT_TO_VECTOR(temp1); \ + tp2 = COPY_FLOAT_TO_VECTOR(temp2); \ + tp3 = COPY_FLOAT_TO_VECTOR(temp3); \ + +#define SLOAD_X8_SCALE_VECTOR() \ + LD_SP2(x, 4, x0, x1); \ + \ + x0 = x0 * v_alpha; \ + x1 = x1 * v_alpha; \ + \ + SPLATI_W4_SP(x0, tp0, tp1, tp2, tp3); \ + SPLATI_W4_SP(x1, tp4, tp5, tp6, tp7); \ + +#define SLOAD_X4_SCALE_VECTOR() \ + x0 = LD_SP(x); \ + x0 = x0 * v_alpha; \ + SPLATI_W4_SP(x0, tp0, tp1, tp2, tp3); \ + +#define SLOAD_Y8_GP() \ + y0 = (v4f32) __msa_insert_w((v4i32) tp0, 0, *((int *)(y + 0 * inc_y))); \ + y0 = (v4f32) __msa_insert_w((v4i32) y0, 1, *((int *)(y + 1 * inc_y))); \ + y0 = (v4f32) __msa_insert_w((v4i32) y0, 2, *((int *)(y + 2 * inc_y))); \ + y0 = (v4f32) __msa_insert_w((v4i32) y0, 3, *((int *)(y + 3 * inc_y))); \ + y1 = (v4f32) __msa_insert_w((v4i32) tp0, 0, *((int *)(y + 4 * inc_y))); \ + y1 = (v4f32) __msa_insert_w((v4i32) y1, 1, *((int *)(y + 5 * inc_y))); \ + y1 = (v4f32) __msa_insert_w((v4i32) y1, 2, *((int *)(y + 6 * inc_y))); \ + y1 = (v4f32) __msa_insert_w((v4i32) y1, 3, *((int *)(y + 7 * inc_y))); \ + +#define SLOAD_Y4_GP() \ + y0 = (v4f32) __msa_insert_w((v4i32) tp0, 0, *((int *)(y + 0 * inc_y))); \ + y0 = (v4f32) __msa_insert_w((v4i32) y0, 1, *((int *)(y + 1 * inc_y))); \ + y0 = (v4f32) __msa_insert_w((v4i32) y0, 2, *((int *)(y + 2 * inc_y))); \ + y0 = (v4f32) __msa_insert_w((v4i32) y0, 3, *((int *)(y + 3 * inc_y))); \ + +#define SLOAD_Y8_VECTOR() LD_SP2(y, 4, y0, y1); +#define SLOAD_Y4_VECTOR() y0 = LD_SP(y); + +#define SSTORE_Y8_GP() \ + *((int *)(y + 0 * inc_y)) = __msa_copy_s_w((v4i32) y0, 0); \ + *((int *)(y + 1 * inc_y)) = __msa_copy_s_w((v4i32) y0, 1); \ + *((int *)(y + 2 * inc_y)) = __msa_copy_s_w((v4i32) y0, 2); \ + *((int *)(y + 3 * inc_y)) = __msa_copy_s_w((v4i32) y0, 3); \ + *((int *)(y + 4 * inc_y)) = __msa_copy_s_w((v4i32) y1, 0); \ + *((int *)(y + 5 * inc_y)) = __msa_copy_s_w((v4i32) y1, 1); \ + *((int *)(y + 6 * inc_y)) = __msa_copy_s_w((v4i32) y1, 2); \ + *((int *)(y + 7 * inc_y)) = __msa_copy_s_w((v4i32) y1, 3); \ + +#define SSTORE_Y4_GP() \ + *((int *)(y + 0 * inc_y)) = __msa_copy_s_w((v4i32) y0, 0); \ + *((int *)(y + 1 * inc_y)) = __msa_copy_s_w((v4i32) y0, 1); \ + *((int *)(y + 2 * inc_y)) = __msa_copy_s_w((v4i32) y0, 2); \ + *((int *)(y + 3 * inc_y)) = __msa_copy_s_w((v4i32) y0, 3); \ + +#define SSTORE_Y8_VECTOR() ST_SP2(y0, y1, y, 4); +#define SSTORE_Y4_VECTOR() ST_SP(y0, y); + +#define SGEMV_N_MSA() \ + for (j = (n >> 3); j--;) \ + { \ + SLOAD_X8_SCALE(); \ + \ + k = 0; \ + y = y_org; \ + \ + for (i = (m >> 3); i--;) \ + { \ + SLOAD_Y8(); \ + SGEMV_N_8x8(); \ + SSTORE_Y8(); \ + \ + y += 8 * inc_y; \ + k += 8; \ + } \ + \ + if (m & 4) \ + { \ + SLOAD_Y4(); \ + SGEMV_N_4x8(); \ + SSTORE_Y4(); \ + \ + y += 4 * inc_y; \ + k += 4; \ + } \ + \ + if (m & 3) \ + { \ + temp0 = alpha * x[0 * inc_x]; \ + temp1 = alpha * x[1 * inc_x]; \ + temp2 = alpha * x[2 * inc_x]; \ + temp3 = alpha * x[3 * inc_x]; \ + temp4 = alpha * x[4 * inc_x]; \ + temp5 = alpha * x[5 * inc_x]; \ + temp6 = alpha * x[6 * inc_x]; \ + temp7 = alpha * x[7 * inc_x]; \ + \ + for (i = (m & 3); i--;) \ + { \ + temp = y[0]; \ + temp += temp0 * pa0[k]; \ + temp += temp1 * pa1[k]; \ + temp += temp2 * pa2[k]; \ + temp += temp3 * pa3[k]; \ + temp += temp4 * pa4[k]; \ + temp += temp5 * pa5[k]; \ + temp += temp6 * pa6[k]; \ + temp += temp7 * pa7[k]; \ + y[0] = temp; \ + \ + y += inc_y; \ + k++; \ + } \ + } \ + pa0 += 8 * lda; \ + pa1 += 8 * lda; \ + pa2 += 8 
* lda; \ + pa3 += 8 * lda; \ + pa4 += 8 * lda; \ + pa5 += 8 * lda; \ + pa6 += 8 * lda; \ + pa7 += 8 * lda; \ + \ + x += 8 * inc_x; \ + } \ + \ + if (n & 4) \ + { \ + SLOAD_X4_SCALE(); \ + \ + k = 0; \ + y = y_org; \ + \ + for (i = (m >> 3); i--;) \ + { \ + SLOAD_Y8(); \ + SGEMV_N_8x4(); \ + SSTORE_Y8(); \ + \ + y += 8 * inc_y; \ + k += 8; \ + } \ + \ + if (m & 4) \ + { \ + SLOAD_Y4(); \ + SGEMV_N_4x4(); \ + SSTORE_Y4(); \ + \ + y += 4 * inc_y; \ + k += 4; \ + } \ + \ + if (m & 3) \ + { \ + temp0 = alpha * x[0 * inc_x]; \ + temp1 = alpha * x[1 * inc_x]; \ + temp2 = alpha * x[2 * inc_x]; \ + temp3 = alpha * x[3 * inc_x]; \ + \ + for (i = (m & 3); i--;) \ + { \ + temp = y[0]; \ + temp += temp0 * pa0[k]; \ + temp += temp1 * pa1[k]; \ + temp += temp2 * pa2[k]; \ + temp += temp3 * pa3[k]; \ + y[0] = temp; \ + \ + y += inc_y; \ + k++; \ + } \ + } \ + \ + pa0 += 4 * lda; \ + pa1 += 4 * lda; \ + pa2 += 4 * lda; \ + pa3 += 4 * lda; \ + \ + x += 4 * inc_x; \ + } \ + \ + if (n & 2) \ + { \ + temp0 = alpha * x[0 * inc_x]; \ + temp1 = alpha * x[1 * inc_x]; \ + \ + tp0 = COPY_FLOAT_TO_VECTOR(temp0); \ + tp1 = COPY_FLOAT_TO_VECTOR(temp1); \ + \ + k = 0; \ + y = y_org; \ + \ + for (i = (m >> 3); i--;) \ + { \ + SLOAD_Y8(); \ + SGEMV_N_8x2(); \ + SSTORE_Y8(); \ + \ + y += 8 * inc_y; \ + k += 8; \ + } \ + \ + if (m & 4) \ + { \ + SLOAD_Y4(); \ + SGEMV_N_4x2(); \ + SSTORE_Y4(); \ + \ + y += 4 * inc_y; \ + k += 4; \ + } \ + \ + if (m & 3) \ + { \ + temp0 = alpha * x[0 * inc_x]; \ + temp1 = alpha * x[1 * inc_x]; \ + \ + for (i = (m & 3); i--;) \ + { \ + temp = y[0]; \ + temp += temp0 * pa0[k]; \ + temp += temp1 * pa1[k]; \ + y[0] = temp; \ + \ + y += inc_y; \ + k++; \ + } \ + } \ + \ + pa0 += 2 * lda; \ + pa1 += 2 * lda; \ + \ + x += 2 * inc_x; \ + } \ + \ + if (n & 1) \ + { \ + temp = alpha * x[0]; \ + \ + k = 0; \ + y = y_org; \ + \ + for (i = m; i--;) \ + { \ + y[0] += temp * pa0[k]; \ + \ + y += inc_y; \ + k++; \ + } \ + } \ + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A, + BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT *buffer) +{ + BLASLONG i, j, k; + FLOAT *y_org = y; + FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7; + FLOAT temp, temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; + v4f32 v_alpha, x0, x1, y0, y1; + v4f32 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15; + v4f32 tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7; + + v_alpha = COPY_FLOAT_TO_VECTOR(alpha); + + pa0 = A; + pa1 = A + lda; + pa2 = A + 2 * lda; + pa3 = A + 3 * lda; + pa4 = A + 4 * lda; + pa5 = A + 5 * lda; + pa6 = A + 6 * lda; + pa7 = A + 7 * lda; + + if ((1 == inc_x) && (1 == inc_y)) + { + #define SLOAD_X8_SCALE SLOAD_X8_SCALE_VECTOR + #define SLOAD_X4_SCALE SLOAD_X4_SCALE_VECTOR + #define SLOAD_Y8 SLOAD_Y8_VECTOR + #define SLOAD_Y4 SLOAD_Y4_VECTOR + #define SSTORE_Y8 SSTORE_Y8_VECTOR + #define SSTORE_Y4 SSTORE_Y4_VECTOR + + SGEMV_N_MSA(); + + #undef SLOAD_X8_SCALE + #undef SLOAD_X4_SCALE + #undef SLOAD_Y8 + #undef SLOAD_Y4 + #undef SSTORE_Y8 + #undef SSTORE_Y4 + } + else if (1 == inc_y) + { + #define SLOAD_X8_SCALE SLOAD_X8_SCALE_GP + #define SLOAD_X4_SCALE SLOAD_X4_SCALE_GP + #define SLOAD_Y8 SLOAD_Y8_VECTOR + #define SLOAD_Y4 SLOAD_Y4_VECTOR + #define SSTORE_Y8 SSTORE_Y8_VECTOR + #define SSTORE_Y4 SSTORE_Y4_VECTOR + + SGEMV_N_MSA(); + + #undef SLOAD_X8_SCALE + #undef SLOAD_X4_SCALE + #undef SLOAD_Y8 + #undef SLOAD_Y4 + #undef SSTORE_Y8 + #undef SSTORE_Y4 + } + else if (1 == inc_x) + { + #define SLOAD_X8_SCALE SLOAD_X8_SCALE_VECTOR + #define 
SLOAD_X4_SCALE SLOAD_X4_SCALE_VECTOR + #define SLOAD_Y8 SLOAD_Y8_GP + #define SLOAD_Y4 SLOAD_Y4_GP + #define SSTORE_Y8 SSTORE_Y8_GP + #define SSTORE_Y4 SSTORE_Y4_GP + + SGEMV_N_MSA(); + + #undef SLOAD_X8_SCALE + #undef SLOAD_X4_SCALE + #undef SLOAD_Y8 + #undef SLOAD_Y4 + #undef SSTORE_Y8 + #undef SSTORE_Y4 + } + else + { + #define SLOAD_X8_SCALE SLOAD_X8_SCALE_GP + #define SLOAD_X4_SCALE SLOAD_X4_SCALE_GP + #define SLOAD_Y8 SLOAD_Y8_GP + #define SLOAD_Y4 SLOAD_Y4_GP + #define SSTORE_Y8 SSTORE_Y8_GP + #define SSTORE_Y4 SSTORE_Y4_GP + + SGEMV_N_MSA(); + + #undef SLOAD_X8_SCALE + #undef SLOAD_X4_SCALE + #undef SLOAD_Y8 + #undef SLOAD_Y4 + #undef SSTORE_Y8 + #undef SSTORE_Y4 + } + + return(0); +} diff --git a/kernel/mips/sgemv_t_msa.c b/kernel/mips/sgemv_t_msa.c new file mode 100644 index 0000000000..1c7f2998f6 --- /dev/null +++ b/kernel/mips/sgemv_t_msa.c @@ -0,0 +1,463 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
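Across all four of its stride-dispatch paths, the sgemv_n kernel above computes the standard y <- y + alpha * A * x with column-major A: each x element is scaled by alpha, broadcast, and multiplied into 8- or 4-row slices of the corresponding column. A hypothetical scalar reference of that result:

    /* Scalar reference for the non-transposed SGEMV result the MSA kernel produces. */
    static void ref_sgemv_n(long m, long n, float alpha, const float *a, long lda,
                            const float *x, long inc_x, float *y, long inc_y)
    {
        for (long j = 0; j < n; j++)
        {
            float t = alpha * x[j * inc_x];   /* scaled, then broadcast in the kernel */
            for (long i = 0; i < m; i++)
                y[i * inc_y] += t * a[i + j * lda];
        }
    }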
+*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +#define SGEMV_T_8x8() \ +{ \ + LD_SP2(pa0 + k, 4, t0, t1); \ + LD_SP2(pa1 + k, 4, t2, t3); \ + LD_SP2(pa2 + k, 4, t4, t5); \ + LD_SP2(pa3 + k, 4, t6, t7); \ + LD_SP2(pa4 + k, 4, t8, t9); \ + LD_SP2(pa5 + k, 4, t10, t11); \ + LD_SP2(pa6 + k, 4, t12, t13); \ + LD_SP2(pa7 + k, 4, t14, t15); \ + \ + tp0 += x0 * t0; \ + tp0 += x1 * t1; \ + \ + tp1 += x0 * t2; \ + tp1 += x1 * t3; \ + \ + tp2 += x0 * t4; \ + tp2 += x1 * t5; \ + \ + tp3 += x0 * t6; \ + tp3 += x1 * t7; \ + \ + tp4 += x0 * t8; \ + tp4 += x1 * t9; \ + \ + tp5 += x0 * t10; \ + tp5 += x1 * t11; \ + \ + tp6 += x0 * t12; \ + tp6 += x1 * t13; \ + \ + tp7 += x0 * t14; \ + tp7 += x1 * t15; \ +} + +#define SGEMV_T_8x4() \ +{ \ + t0 = LD_SP(pa0 + k); \ + t2 = LD_SP(pa1 + k); \ + t4 = LD_SP(pa2 + k); \ + t6 = LD_SP(pa3 + k); \ + t8 = LD_SP(pa4 + k); \ + t10 = LD_SP(pa5 + k); \ + t12 = LD_SP(pa6 + k); \ + t14 = LD_SP(pa7 + k); \ + \ + tp0 += x0 * t0; \ + tp1 += x0 * t2; \ + tp2 += x0 * t4; \ + tp3 += x0 * t6; \ + tp4 += x0 * t8; \ + tp5 += x0 * t10; \ + tp6 += x0 * t12; \ + tp7 += x0 * t14; \ +} + +#define SGEMV_T_4x8() \ +{ \ + LD_SP2(pa0 + k, 4, t0, t1); \ + LD_SP2(pa1 + k, 4, t2, t3); \ + LD_SP2(pa2 + k, 4, t4, t5); \ + LD_SP2(pa3 + k, 4, t6, t7); \ + \ + tp0 += x0 * t0; \ + tp0 += x1 * t1; \ + \ + tp1 += x0 * t2; \ + tp1 += x1 * t3; \ + \ + tp2 += x0 * t4; \ + tp2 += x1 * t5; \ + \ + tp3 += x0 * t6; \ + tp3 += x1 * t7; \ +} + +#define SGEMV_T_4x4() \ +{ \ + t0 = LD_SP(pa0 + k); \ + t2 = LD_SP(pa1 + k); \ + t4 = LD_SP(pa2 + k); \ + t6 = LD_SP(pa3 + k); \ + \ + tp0 += x0 * t0; \ + tp1 += x0 * t2; \ + tp2 += x0 * t4; \ + tp3 += x0 * t6; \ +} + +#define SGEMV_T_2x8() \ +{ \ + LD_SP2(pa0 + k, 4, t0, t1); \ + LD_SP2(pa1 + k, 4, t2, t3); \ + \ + tp0 += x0 * t0; \ + tp0 += x1 * t1; \ + \ + tp1 += x0 * t2; \ + tp1 += x1 * t3; \ +} + +#define SGEMV_T_2x4() \ +{ \ + t0 = LD_SP(pa0 + k); \ + t2 = LD_SP(pa1 + k); \ + \ + tp0 += x0 * t0; \ + tp1 += x0 * t2; \ +} + +#define SLOAD_X8_GP() \ + x0 = (v4f32) __msa_insert_w((v4i32) tp0, 0, *((int *)(x + 0 * inc_x))); \ + x0 = (v4f32) __msa_insert_w((v4i32) x0, 1, *((int *)(x + 1 * inc_x))); \ + x0 = (v4f32) __msa_insert_w((v4i32) x0, 2, *((int *)(x + 2 * inc_x))); \ + x0 = (v4f32) __msa_insert_w((v4i32) x0, 3, *((int *)(x + 3 * inc_x))); \ + x1 = (v4f32) __msa_insert_w((v4i32) tp0, 0, *((int *)(x + 4 * inc_x))); \ + x1 = (v4f32) __msa_insert_w((v4i32) x1, 1, *((int *)(x + 5 * inc_x))); \ + x1 = (v4f32) __msa_insert_w((v4i32) x1, 2, *((int *)(x + 6 * inc_x))); \ + x1 = (v4f32) __msa_insert_w((v4i32) x1, 3, *((int *)(x + 7 * inc_x))); \ + +#define SLOAD_X4_GP() \ + x0 = (v4f32) __msa_insert_w((v4i32) tp0, 0, *((int *)(x + 0 * inc_x))); \ + x0 = (v4f32) __msa_insert_w((v4i32) x0, 1, *((int *)(x + 1 * inc_x))); \ + x0 = (v4f32) __msa_insert_w((v4i32) x0, 2, *((int *)(x + 2 * inc_x))); \ + x0 = (v4f32) __msa_insert_w((v4i32) x0, 3, *((int *)(x + 3 * inc_x))); \ + +#define SLOAD_X8_VECTOR() LD_SP2(x, 4, x0, x1); +#define SLOAD_X4_VECTOR() x0 = LD_SP(x); + +#define SGEMV_T_MSA() \ + for (j = (n >> 3); j--;) \ + { \ + tp0 = zero; \ + tp1 = zero; \ + tp2 = zero; \ + tp3 = zero; \ + tp4 = zero; \ + tp5 = zero; \ + tp6 = zero; \ + tp7 = zero; \ + \ + k = 0; \ + x = srcx_org; \ + \ + for (i = (m >> 3); i--;) \ + { \ + SLOAD_X8(); \ + SGEMV_T_8x8(); \ + \ + x += 8 * inc_x; \ + k += 8; \ + } \ + \ + if (m & 4) \ + { \ + SLOAD_X4(); \ + SGEMV_T_8x4(); \ + \ + x += 4 * inc_x; \ + k += 4; \ + } \ 
+ \ + TRANSPOSE4x4_SP_SP(tp0, tp1, tp2, tp3, \ + tp0, tp1, tp2, tp3); \ + TRANSPOSE4x4_SP_SP(tp4, tp5, tp6, tp7, \ + tp4, tp5, tp6, tp7); \ + tp0 += tp1; \ + tp0 += tp2; \ + tp0 += tp3; \ + tp4 += tp5; \ + tp4 += tp6; \ + tp4 += tp7; \ + \ + temp0 = tp0[0]; \ + temp1 = tp0[1]; \ + temp2 = tp0[2]; \ + temp3 = tp0[3]; \ + temp4 = tp4[0]; \ + temp5 = tp4[1]; \ + temp6 = tp4[2]; \ + temp7 = tp4[3]; \ + \ + for (i = (m & 3); i--;) \ + { \ + temp0 += pa0[k] * x[0]; \ + temp1 += pa1[k] * x[0]; \ + temp2 += pa2[k] * x[0]; \ + temp3 += pa3[k] * x[0]; \ + temp4 += pa4[k] * x[0]; \ + temp5 += pa5[k] * x[0]; \ + temp6 += pa6[k] * x[0]; \ + temp7 += pa7[k] * x[0]; \ + \ + x += inc_x; \ + k++; \ + } \ + \ + res0 = y[0 * inc_y]; \ + res1 = y[1 * inc_y]; \ + res2 = y[2 * inc_y]; \ + res3 = y[3 * inc_y]; \ + res4 = y[4 * inc_y]; \ + res5 = y[5 * inc_y]; \ + res6 = y[6 * inc_y]; \ + res7 = y[7 * inc_y]; \ + \ + res0 += alpha * temp0; \ + res1 += alpha * temp1; \ + res2 += alpha * temp2; \ + res3 += alpha * temp3; \ + res4 += alpha * temp4; \ + res5 += alpha * temp5; \ + res6 += alpha * temp6; \ + res7 += alpha * temp7; \ + \ + y[0 * inc_y] = res0; \ + y[1 * inc_y] = res1; \ + y[2 * inc_y] = res2; \ + y[3 * inc_y] = res3; \ + y[4 * inc_y] = res4; \ + y[5 * inc_y] = res5; \ + y[6 * inc_y] = res6; \ + y[7 * inc_y] = res7; \ + \ + y += 8 * inc_y; \ + \ + pa0 += 8 * lda; \ + pa1 += 8 * lda; \ + pa2 += 8 * lda; \ + pa3 += 8 * lda; \ + pa4 += 8 * lda; \ + pa5 += 8 * lda; \ + pa6 += 8 * lda; \ + pa7 += 8 * lda; \ + } \ + \ + if (n & 4) \ + { \ + tp0 = zero; \ + tp1 = zero; \ + tp2 = zero; \ + tp3 = zero; \ + \ + k = 0; \ + x = srcx_org; \ + \ + for (i = (m >> 3); i--;) \ + { \ + SLOAD_X8(); \ + SGEMV_T_4x8(); \ + \ + x += 8 * inc_x; \ + k += 8; \ + } \ + \ + if (m & 4) \ + { \ + SLOAD_X4(); \ + SGEMV_T_4x4(); \ + \ + x += 4 * inc_x; \ + k += 4; \ + } \ + \ + TRANSPOSE4x4_SP_SP(tp0, tp1, tp2, tp3, \ + tp0, tp1, tp2, tp3); \ + tp0 += tp1; \ + tp0 += tp2; \ + tp0 += tp3; \ + \ + temp0 = tp0[0]; \ + temp1 = tp0[1]; \ + temp2 = tp0[2]; \ + temp3 = tp0[3]; \ + \ + for (i = (m & 3); i--;) \ + { \ + temp0 += pa0[k] * x[0]; \ + temp1 += pa1[k] * x[0]; \ + temp2 += pa2[k] * x[0]; \ + temp3 += pa3[k] * x[0]; \ + \ + x += inc_x; \ + k++; \ + } \ + \ + res0 = y[0 * inc_y]; \ + res1 = y[1 * inc_y]; \ + res2 = y[2 * inc_y]; \ + res3 = y[3 * inc_y]; \ + \ + res0 += alpha * temp0; \ + res1 += alpha * temp1; \ + res2 += alpha * temp2; \ + res3 += alpha * temp3; \ + \ + y[0 * inc_y] = res0; \ + y[1 * inc_y] = res1; \ + y[2 * inc_y] = res2; \ + y[3 * inc_y] = res3; \ + \ + y += 4 * inc_y; \ + \ + pa0 += 4 * lda; \ + pa1 += 4 * lda; \ + pa2 += 4 * lda; \ + pa3 += 4 * lda; \ + } \ + \ + if (n & 2) \ + { \ + tp0 = zero; \ + tp1 = zero; \ + \ + k = 0; \ + x = srcx_org; \ + \ + for (i = (m >> 3); i--;) \ + { \ + SLOAD_X8(); \ + SGEMV_T_2x8(); \ + \ + x += 8 * inc_x; \ + k += 8; \ + } \ + \ + if (m & 4) \ + { \ + SLOAD_X4(); \ + SGEMV_T_2x4(); \ + \ + x += 4 * inc_x; \ + k += 4; \ + } \ + \ + ILVRL_W2_SP(tp1, tp0, tp2, tp3); \ + \ + tp2 += tp3; \ + \ + temp0 = tp2[0] + tp2[2]; \ + temp1 = tp2[1] + tp2[3]; \ + \ + for (i = (m & 3); i--;) \ + { \ + temp0 += pa0[k] * x[0]; \ + temp1 += pa1[k] * x[0]; \ + \ + x += inc_x; \ + k++; \ + } \ + \ + res0 = y[0 * inc_y]; \ + res1 = y[1 * inc_y]; \ + \ + res0 += alpha * temp0; \ + res1 += alpha * temp1; \ + \ + y[0 * inc_y] = res0; \ + y[1 * inc_y] = res1; \ + \ + y += 2 * inc_y; \ + \ + pa0 += 2 * lda; \ + pa1 += 2 * lda; \ + } \ + \ + if (n & 1) \ + { \ + temp0 = 0.0; \ + \ + k = 0; \ + x = srcx_org; 
\ + \ + for (i = m; i--;) \ + { \ + temp0 += pa0[k] * x[0]; \ + \ + x += inc_x; \ + k++; \ + } \ + \ + y[0] += alpha * temp0; \ + y += inc_y; \ + pa0 += lda; \ + } + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A, + BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT *buffer) +{ + BLASLONG i, j, k; + FLOAT *srcx_org = x; + FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7; + FLOAT temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; + FLOAT res0, res1, res2, res3, res4, res5, res6, res7; + v4f32 x0, x1; + v4f32 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15; + v4f32 tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7; + v4f32 zero = {0}; + + pa0 = A + 0 * lda; + pa1 = A + 1 * lda; + pa2 = A + 2 * lda; + pa3 = A + 3 * lda; + pa4 = A + 4 * lda; + pa5 = A + 5 * lda; + pa6 = A + 6 * lda; + pa7 = A + 7 * lda; + + if (1 == inc_x) + { + #define SLOAD_X8 SLOAD_X8_VECTOR + #define SLOAD_X4 SLOAD_X4_VECTOR + + SGEMV_T_MSA(); + + #undef SLOAD_X8 + #undef SLOAD_X4 + } + else + { + #define SLOAD_X8 SLOAD_X8_GP + #define SLOAD_X4 SLOAD_X4_GP + + SGEMV_T_MSA(); + + #undef SLOAD_X8 + #undef SLOAD_X4 + } + + return(0); +} diff --git a/kernel/mips/strsm_kernel_LN_8x8_msa.c b/kernel/mips/strsm_kernel_LN_8x8_msa.c new file mode 100644 index 0000000000..53891e64ff --- /dev/null +++ b/kernel/mips/strsm_kernel_LN_8x8_msa.c @@ -0,0 +1,1786 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
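The transposed counterpart just above accumulates per-column dot products in vector registers and reduces them with 4x4 transposes (or an interleave in the two-column case) before applying alpha, which is equivalent to y <- y + alpha * A^T * x. A hypothetical scalar reference:

    /* Scalar reference for the transposed SGEMV result the MSA kernel produces. */
    static void ref_sgemv_t(long m, long n, float alpha, const float *a, long lda,
                            const float *x, long inc_x, float *y, long inc_y)
    {
        for (long j = 0; j < n; j++)
        {
            float t = 0.0f;
            for (long i = 0; i < m; i++)
                t += a[i + j * lda] * x[i * inc_x];
            y[j * inc_y] += alpha * t;
        }
    }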
+*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +static void ssolve_8x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + BLASLONG k; + FLOAT *aa = a, *bb = b; + v4f32 src_b, src_b0, src_b1, src_b2, src_b3, src_a1; + v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v4f32 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15; + v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; + v4f32 res_c8, res_c9, res_c10, res_c11, res_c12, res_c13, res_c14, res_c15; + v4f32 src_a, src_a0, src_a8, src_a9, src_a16, src_a17, src_a18, src_a24; + v4f32 src_a25, src_a26, src_a27, src_a32, src_a33, src_a34, src_a35, src_a36; + v4f32 src_a40, src_a41, src_a42, src_a43, src_a44, src_a45; + v4f32 src_a48, src_a49, src_a50, src_a51, src_a52, src_a53, src_a54; + v4f32 src_a56, src_a57, src_a58, src_a59, src_a60, src_a61, src_a62, src_a63; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * ldc; + FLOAT *c_nxt4line = c + 4 * ldc; + FLOAT *c_nxt5line = c + 5 * ldc; + FLOAT *c_nxt6line = c + 6 * ldc; + FLOAT *c_nxt7line = c + 7 * ldc; + + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(c_nxt2line, 4, src_c4, src_c5); + LD_SP2(c_nxt3line, 4, src_c6, src_c7); + LD_SP2(c_nxt4line, 4, src_c8, src_c9); + LD_SP2(c_nxt5line, 4, src_c10, src_c11); + LD_SP2(c_nxt6line, 4, src_c12, src_c13); + LD_SP2(c_nxt7line, 4, src_c14, src_c15); + + for (k = 0; k < bk; k++) + { + LD_SP2(aa, 4, src_a0, src_a1); + + src_b = LD_SP(bb + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; + src_c4 -= src_a0 * src_b2; + src_c5 -= src_a1 * src_b2; + src_c6 -= src_a0 * src_b3; + src_c7 -= src_a1 * src_b3; + + src_b = LD_SP(bb + 4); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c8 -= src_a0 * src_b0; + src_c9 -= src_a1 * src_b0; + src_c10 -= src_a0 * src_b1; + src_c11 -= src_a1 * src_b1; + src_c12 -= src_a0 * src_b2; + src_c13 -= src_a1 * src_b2; + src_c14 -= src_a0 * src_b3; + src_c15 -= src_a1 * src_b3; + + aa += 8; + bb += 8; + } + + a -= 64; + b -= 64; + + TRANSPOSE4x4_SP_SP(src_c1, src_c3, src_c5, src_c7, + res_c4, res_c5, res_c6, res_c7); + TRANSPOSE4x4_SP_SP(src_c9, src_c11, src_c13, src_c15, + res_c12, res_c13, res_c14, res_c15); + TRANSPOSE4x4_SP_SP(src_c0, src_c2, src_c4, src_c6, + res_c0, res_c1, res_c2, res_c3); + TRANSPOSE4x4_SP_SP(src_c8, src_c10, src_c12, src_c14, + res_c8, res_c9, res_c10, res_c11); + + src_a = LD_SP(a + 60); + SPLATI_W4_SP(src_a, src_a60, src_a61, src_a62, src_a63); + src_a = LD_SP(a + 56); + SPLATI_W4_SP(src_a, src_a56, src_a57, src_a58, src_a59); + + res_c7 *= src_a63; + res_c15 *= src_a63; + res_c6 -= res_c7 * src_a62; + res_c14 -= res_c15 * src_a62; + res_c5 -= res_c7 * src_a61; + res_c13 -= res_c15 * src_a61; + res_c4 -= res_c7 * src_a60; + res_c12 -= res_c15 * src_a60; + res_c3 -= res_c7 * src_a59; + res_c11 -= res_c15 * src_a59; + res_c2 -= res_c7 * src_a58; + res_c10 -= res_c15 * src_a58; + res_c1 -= res_c7 * src_a57; + res_c9 -= res_c15 * src_a57; + res_c0 -= res_c7 * src_a56; + res_c8 -= res_c15 * src_a56; + + src_a = LD_SP(a + 48); + SPLATI_W4_SP(src_a, src_a48, src_a49, src_a50, src_a51); + src_a52 = LD_SP(a + 52); + src_a54 = (v4f32) __msa_splati_w((v4i32) src_a52, 2); + src_a53 = (v4f32) __msa_splati_w((v4i32) src_a52, 1); + src_a52 
= (v4f32) __msa_splati_w((v4i32) src_a52, 0); + + res_c6 *= src_a54; + res_c14 *= src_a54; + res_c5 -= res_c6 * src_a53; + res_c13 -= res_c14 * src_a53; + res_c4 -= res_c6 * src_a52; + res_c12 -= res_c14 * src_a52; + res_c3 -= res_c6 * src_a51; + res_c11 -= res_c14 * src_a51; + res_c2 -= res_c6 * src_a50; + res_c10 -= res_c14 * src_a50; + res_c1 -= res_c6 * src_a49; + res_c9 -= res_c14 * src_a49; + res_c0 -= res_c6 * src_a48; + res_c8 -= res_c14 * src_a48; + + src_a = LD_SP(a + 40); + SPLATI_W4_SP(src_a, src_a40, src_a41, src_a42, src_a43); + src_a44 = LD_SP(a + 44); + src_a45 = (v4f32) __msa_splati_w((v4i32) src_a44, 1); + src_a44 = (v4f32) __msa_splati_w((v4i32) src_a44, 0); + + res_c5 *= src_a45; + res_c13 *= src_a45; + res_c4 -= res_c5 * src_a44; + res_c12 -= res_c13 * src_a44; + res_c3 -= res_c5 * src_a43; + res_c11 -= res_c13 * src_a43; + res_c2 -= res_c5 * src_a42; + res_c10 -= res_c13 * src_a42; + res_c1 -= res_c5 * src_a41; + res_c9 -= res_c13 * src_a41; + res_c0 -= res_c5 * src_a40; + res_c8 -= res_c13 * src_a40; + + src_a = LD_SP(a + 32); + SPLATI_W4_SP(src_a, src_a32, src_a33, src_a34, src_a35); + src_a36 = COPY_FLOAT_TO_VECTOR(*(a + 36)); + + res_c4 *= src_a36; + res_c12 *= src_a36; + res_c3 -= res_c4 * src_a35; + res_c11 -= res_c12 * src_a35; + res_c2 -= res_c4 * src_a34; + res_c10 -= res_c12 * src_a34; + res_c1 -= res_c4 * src_a33; + res_c9 -= res_c12 * src_a33; + res_c0 -= res_c4 * src_a32; + res_c8 -= res_c12 * src_a32; + + ST_SP4(res_c4, res_c12, res_c5, res_c13, b + 32, 4); + ST_SP4(res_c6, res_c14, res_c7, res_c15, b + 48, 4); + + TRANSPOSE4x4_SP_SP(res_c4, res_c5, res_c6, res_c7, + src_c1, src_c3, src_c5, src_c7); + TRANSPOSE4x4_SP_SP(res_c12, res_c13, res_c14, res_c15, + src_c9, src_c11, src_c13, src_c15); + + ST_SP(src_c1, c + 4); + ST_SP(src_c3, c_nxt1line + 4); + ST_SP(src_c5, c_nxt2line + 4); + ST_SP(src_c7, c_nxt3line + 4); + ST_SP(src_c9, c_nxt4line + 4); + ST_SP(src_c11, c_nxt5line + 4); + ST_SP(src_c13, c_nxt6line + 4); + ST_SP(src_c15, c_nxt7line + 4); + + src_a = LD_SP(a + 24); + SPLATI_W4_SP(src_a, src_a24, src_a25, src_a26, src_a27); + + res_c3 *= src_a27; + res_c11 *= src_a27; + res_c2 -= res_c3 * src_a26; + res_c10 -= res_c11 * src_a26; + res_c1 -= res_c3 * src_a25; + res_c9 -= res_c11 * src_a25; + res_c0 -= res_c3 * src_a24; + res_c8 -= res_c11 * src_a24; + + src_a16 = LD_SP(a + 16); + src_a18 = (v4f32) __msa_splati_w((v4i32) src_a16, 2); + src_a17 = (v4f32) __msa_splati_w((v4i32) src_a16, 1); + src_a16 = (v4f32) __msa_splati_w((v4i32) src_a16, 0); + + res_c2 *= src_a18; + res_c10 *= src_a18; + res_c1 -= res_c2 * src_a17; + res_c9 -= res_c10 * src_a17; + res_c0 -= res_c2 * src_a16; + res_c8 -= res_c10 * src_a16; + + src_a9 = COPY_FLOAT_TO_VECTOR(*(a + 9)); + src_a8 = COPY_FLOAT_TO_VECTOR(*(a + 8)); + src_a0 = COPY_FLOAT_TO_VECTOR(*(a + 0)); + + res_c1 *= src_a9; + res_c9 *= src_a9; + res_c0 -= res_c1 * src_a8; + res_c8 -= res_c9 * src_a8; + + res_c0 *= src_a0; + res_c8 *= src_a0; + + ST_SP4(res_c0, res_c8, res_c1, res_c9, b, 4); + ST_SP4(res_c2, res_c10, res_c3, res_c11, b + 16, 4); + + TRANSPOSE4x4_SP_SP(res_c0, res_c1, res_c2, res_c3, + src_c0, src_c2, src_c4, src_c6); + TRANSPOSE4x4_SP_SP(res_c8, res_c9, res_c10, res_c11, + src_c8, src_c10, src_c12, src_c14); + + ST_SP(src_c0, c); + ST_SP(src_c2, c_nxt1line); + ST_SP(src_c4, c_nxt2line); + ST_SP(src_c6, c_nxt3line); + ST_SP(src_c8, c_nxt4line); + ST_SP(src_c10, c_nxt5line); + ST_SP(src_c12, c_nxt6line); + ST_SP(src_c14, c_nxt7line); +} + +static void ssolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, 
BLASLONG ldc, BLASLONG bk) +{ + BLASLONG k; + FLOAT *aa = a, *bb = b; + v4f32 src_b, src_b0, src_b1, src_b2, src_b3, src_a1; + v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; + v4f32 src_a, src_a0, src_a8, src_a9, src_a16, src_a17, src_a18, src_a24; + v4f32 src_a25, src_a26, src_a27, src_a32, src_a33, src_a34, src_a35; + v4f32 src_a36, src_a40, src_a41, src_a42, src_a43, src_a44, src_a45; + v4f32 src_a48, src_a49, src_a50, src_a51, src_a52, src_a53, src_a54; + v4f32 src_a56, src_a57, src_a58, src_a59, src_a60, src_a61, src_a62, src_a63; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * ldc; + + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(c_nxt2line, 4, src_c4, src_c5); + LD_SP2(c_nxt3line, 4, src_c6, src_c7); + + for (k = 0; k < (bk >> 1); k++) + { + LD_SP2(aa, 4, src_a0, src_a1); + + src_b = LD_SP(bb + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; + src_c4 -= src_a0 * src_b2; + src_c5 -= src_a1 * src_b2; + src_c6 -= src_a0 * src_b3; + src_c7 -= src_a1 * src_b3; + + aa += 8; + bb += 4; + + LD_SP2(aa, 4, src_a0, src_a1); + + src_b = LD_SP(bb + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; + src_c4 -= src_a0 * src_b2; + src_c5 -= src_a1 * src_b2; + src_c6 -= src_a0 * src_b3; + src_c7 -= src_a1 * src_b3; + + aa += 8; + bb += 4; + } + + if ((bk & 1) && (bk > 0)) + { + LD_SP2(aa, 4, src_a0, src_a1); + + src_b = LD_SP(bb + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; + src_c4 -= src_a0 * src_b2; + src_c5 -= src_a1 * src_b2; + src_c6 -= src_a0 * src_b3; + src_c7 -= src_a1 * src_b3; + } + + a -= 64; + b -= 32; + + TRANSPOSE4x4_SP_SP(src_c0, src_c2, src_c4, src_c6, + res_c0, res_c1, res_c2, res_c3); + TRANSPOSE4x4_SP_SP(src_c1, src_c3, src_c5, src_c7, + res_c4, res_c5, res_c6, res_c7); + + src_a = LD_SP(a + 60); + SPLATI_W4_SP(src_a, src_a60, src_a61, src_a62, src_a63); + src_a = LD_SP(a + 56); + SPLATI_W4_SP(src_a, src_a56, src_a57, src_a58, src_a59); + + src_a = LD_SP(a + 48); + SPLATI_W4_SP(src_a, src_a48, src_a49, src_a50, src_a51); + src_a52 = LD_SP(a + 52); + src_a54 = (v4f32) __msa_splati_w((v4i32) src_a52, 2); + src_a53 = (v4f32) __msa_splati_w((v4i32) src_a52, 1); + src_a52 = (v4f32) __msa_splati_w((v4i32) src_a52, 0); + + res_c7 *= src_a63; + res_c6 -= res_c7 * src_a62; + res_c5 -= res_c7 * src_a61; + res_c4 -= res_c7 * src_a60; + res_c3 -= res_c7 * src_a59; + res_c2 -= res_c7 * src_a58; + res_c1 -= res_c7 * src_a57; + res_c0 -= res_c7 * src_a56; + + res_c6 *= src_a54; + res_c5 -= res_c6 * src_a53; + res_c4 -= res_c6 * src_a52; + res_c3 -= res_c6 * src_a51; + res_c2 -= res_c6 * src_a50; + res_c1 -= res_c6 * src_a49; + res_c0 -= res_c6 * src_a48; + + src_a = LD_SP(a + 40); + SPLATI_W4_SP(src_a, src_a40, src_a41, src_a42, src_a43); + src_a44 = LD_SP(a + 44); + src_a45 = (v4f32) __msa_splati_w((v4i32) src_a44, 1); + src_a44 = (v4f32) __msa_splati_w((v4i32) src_a44, 0); + + res_c5 *= src_a45; + res_c4 -= res_c5 * src_a44; + res_c3 -= res_c5 * src_a43; + res_c2 -= res_c5 * src_a42; + res_c1 -= res_c5 * src_a41; + res_c0 -= res_c5 * src_a40; 
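The ssolve_*_ln helpers follow one pattern: the bk loop subtracts the already-computed GEMM contribution from the C tile, the tile is transposed into per-row registers, and the unknowns are then resolved from the last row upward; the diagonal entries appear to be stored as reciprocals in the packed panel, so each division becomes a multiply. A hedged structural sketch using a hypothetical dense coefficient layout (the kernel reads a packed 8x8 panel instead):

    /* Structural sketch of the substitution order in the LN solves.
       coef[i*n + j] is the (hypothetical, dense) factor that eliminates x[j]
       once x[i] is known; inv_diag[i] is assumed to hold 1/a(i,i). */
    static void ref_solve_last_to_first(int n, const float *inv_diag,
                                        const float *coef, float *x)
    {
        for (int i = n - 1; i >= 0; i--)
        {
            x[i] *= inv_diag[i];                 /* e.g. res_c7 *= src_a63 */
            for (int j = 0; j < i; j++)
                x[j] -= x[i] * coef[i * n + j];  /* e.g. res_c6 -= res_c7 * src_a62 */
        }
    }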
+ + src_a = LD_SP(a + 32); + SPLATI_W4_SP(src_a, src_a32, src_a33, src_a34, src_a35); + src_a36 = COPY_FLOAT_TO_VECTOR(*(a + 36)); + + res_c4 *= src_a36; + res_c3 -= res_c4 * src_a35; + res_c2 -= res_c4 * src_a34; + res_c1 -= res_c4 * src_a33; + res_c0 -= res_c4 * src_a32; + + src_a = LD_SP(a + 24); + SPLATI_W4_SP(src_a, src_a24, src_a25, src_a26, src_a27); + + res_c3 *= src_a27; + res_c2 -= res_c3 * src_a26; + res_c1 -= res_c3 * src_a25; + res_c0 -= res_c3 * src_a24; + + src_a16 = LD_SP(a + 16); + src_a18 = (v4f32) __msa_splati_w((v4i32) src_a16, 2); + src_a17 = (v4f32) __msa_splati_w((v4i32) src_a16, 1); + src_a16 = (v4f32) __msa_splati_w((v4i32) src_a16, 0); + + res_c2 *= src_a18; + res_c1 -= res_c2 * src_a17; + res_c0 -= res_c2 * src_a16; + + src_a9 = COPY_FLOAT_TO_VECTOR(*(a + 9)); + src_a8 = COPY_FLOAT_TO_VECTOR(*(a + 8)); + src_a0 = COPY_FLOAT_TO_VECTOR(*(a + 0)); + + res_c1 *= src_a9; + res_c0 -= res_c1 * src_a8; + + res_c0 *= src_a0; + + ST_SP4(res_c0, res_c1, res_c2, res_c3, b, 4); + ST_SP4(res_c4, res_c5, res_c6, res_c7, b + 16, 4); + + TRANSPOSE4x4_SP_SP(res_c0, res_c1, res_c2, res_c3, + src_c0, src_c2, src_c4, src_c6); + TRANSPOSE4x4_SP_SP(res_c4, res_c5, res_c6, res_c7, + src_c1, src_c3, src_c5, src_c7); + + ST_SP2(src_c0, src_c1, c, 4); + ST_SP2(src_c2, src_c3, c_nxt1line, 4); + ST_SP2(src_c4, src_c5, c_nxt2line, 4); + ST_SP2(src_c6, src_c7, c_nxt3line, 4); +} + +static void ssolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + BLASLONG k; + FLOAT *aa = a, *bb = b; + FLOAT a0, a8, a9, a16, a17, a18, a24, a25, a26, a27, a32, a33, a34, a35; + FLOAT a36, a40, a41, a42, a43, a44, a45, a48, a49, a50, a51, a52, a53; + FLOAT a54, a56, a57, a58, a59, a60, a61, a62, a63; + FLOAT c0, c1, c2, c3, c4, c5, c6, c7; + FLOAT c0_nxt, c1_nxt, c2_nxt, c3_nxt, c4_nxt, c5_nxt, c6_nxt, c7_nxt; + + c0 = *(c + 0); + c1 = *(c + 1); + c2 = *(c + 2); + c3 = *(c + 3); + c4 = *(c + 4); + c5 = *(c + 5); + c6 = *(c + 6); + c7 = *(c + 7); + c0_nxt = *(c + 0 + ldc); + c1_nxt = *(c + 1 + ldc); + c2_nxt = *(c + 2 + ldc); + c3_nxt = *(c + 3 + ldc); + c4_nxt = *(c + 4 + ldc); + c5_nxt = *(c + 5 + ldc); + c6_nxt = *(c + 6 + ldc); + c7_nxt = *(c + 7 + ldc); + + for (k = 0; k < bk; k++) + { + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; + c2 -= aa[2] * bb[0]; + c3 -= aa[3] * bb[0]; + c4 -= aa[4] * bb[0]; + c5 -= aa[5] * bb[0]; + c6 -= aa[6] * bb[0]; + c7 -= aa[7] * bb[0]; + c0_nxt -= aa[0] * bb[1]; + c1_nxt -= aa[1] * bb[1]; + c2_nxt -= aa[2] * bb[1]; + c3_nxt -= aa[3] * bb[1]; + c4_nxt -= aa[4] * bb[1]; + c5_nxt -= aa[5] * bb[1]; + c6_nxt -= aa[6] * bb[1]; + c7_nxt -= aa[7] * bb[1]; + + aa += 8; + bb += 2; + } + + a -= 64; + b -= 16; + + a0 = *(a + 0); + a8 = *(a + 8); + a9 = *(a + 9); + a16 = *(a + 16); + a17 = *(a + 17); + a18 = *(a + 18); + a24 = *(a + 24); + a25 = *(a + 25); + a26 = *(a + 26); + a27 = *(a + 27); + a32 = *(a + 32); + a33 = *(a + 33); + a34 = *(a + 34); + a35 = *(a + 35); + a36 = *(a + 36); + a40 = *(a + 40); + a41 = *(a + 41); + a42 = *(a + 42); + a43 = *(a + 43); + a44 = *(a + 44); + a45 = *(a + 45); + a48 = *(a + 48); + a49 = *(a + 49); + a50 = *(a + 50); + a51 = *(a + 51); + a52 = *(a + 52); + a53 = *(a + 53); + a54 = *(a + 54); + a56 = *(a + 56); + a57 = *(a + 57); + a58 = *(a + 58); + a59 = *(a + 59); + a60 = *(a + 60); + a61 = *(a + 61); + a62 = *(a + 62); + a63 = *(a + 63); + + c7 *= a63; + c7_nxt *= a63; + c6 -= c7 * a62; + c6_nxt -= c7_nxt * a62; + c5 -= c7 * a61; + c5_nxt -= c7_nxt * a61; + c4 -= c7 * a60; + c4_nxt -= c7_nxt * a60; + c3 -= c7 * a59; + c3_nxt 
-= c7_nxt * a59; + c2 -= c7 * a58; + c2_nxt -= c7_nxt * a58; + c1 -= c7 * a57; + c1_nxt -= c7_nxt * a57; + c0 -= c7 * a56; + c0_nxt -= c7_nxt * a56; + + c6 *= a54; + c6_nxt *= a54; + c5 -= c6 * a53; + c5_nxt -= c6_nxt * a53; + c4 -= c6 * a52; + c4_nxt -= c6_nxt * a52; + c3 -= c6 * a51; + c3_nxt -= c6_nxt * a51; + c2 -= c6 * a50; + c2_nxt -= c6_nxt * a50; + c1 -= c6 * a49; + c1_nxt -= c6_nxt * a49; + c0 -= c6 * a48; + c0_nxt -= c6_nxt * a48; + + c5 *= a45; + c5_nxt *= a45; + c4 -= c5 * a44; + c4_nxt -= c5_nxt * a44; + c3 -= c5 * a43; + c3_nxt -= c5_nxt * a43; + c2 -= c5 * a42; + c2_nxt -= c5_nxt * a42; + c1 -= c5 * a41; + c1_nxt -= c5_nxt * a41; + c0 -= c5 * a40; + c0_nxt -= c5_nxt * a40; + + c4 *= a36; + c4_nxt *= a36; + c3 -= c4 * a35; + c3_nxt -= c4_nxt * a35; + c2 -= c4 * a34; + c2_nxt -= c4_nxt * a34; + c1 -= c4 * a33; + c1_nxt -= c4_nxt * a33; + c0 -= c4 * a32; + c0_nxt -= c4_nxt * a32; + + c3 *= a27; + c3_nxt *= a27; + c2 -= c3 * a26; + c2_nxt -= c3_nxt * a26; + c1 -= c3 * a25; + c1_nxt -= c3_nxt * a25; + c0 -= c3 * a24; + c0_nxt -= c3_nxt * a24; + + c2 *= a18; + c2_nxt *= a18; + c1 -= c2 * a17; + c1_nxt -= c2_nxt * a17; + c0 -= c2 * a16; + c0_nxt -= c2_nxt * a16; + + c1 *= a9; + c1_nxt *= a9; + c0 -= c1 * a8; + c0_nxt -= c1_nxt * a8; + + c0 *= a0; + c0_nxt *= a0; + + *(b + 0) = c0; + *(b + 1) = c0_nxt; + *(b + 2) = c1; + *(b + 3) = c1_nxt; + *(b + 4) = c2; + *(b + 5) = c2_nxt; + *(b + 6) = c3; + *(b + 7) = c3_nxt; + *(b + 8) = c4; + *(b + 9) = c4_nxt; + *(b + 10) = c5; + *(b + 11) = c5_nxt; + *(b + 12) = c6; + *(b + 13) = c6_nxt; + *(b + 14) = c7; + *(b + 15) = c7_nxt; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 2) = c2; + *(c + 3) = c3; + *(c + 4) = c4; + *(c + 5) = c5; + *(c + 6) = c6; + *(c + 7) = c7; + *(c + 0 + ldc) = c0_nxt; + *(c + 1 + ldc) = c1_nxt; + *(c + 2 + ldc) = c2_nxt; + *(c + 3 + ldc) = c3_nxt; + *(c + 4 + ldc) = c4_nxt; + *(c + 5 + ldc) = c5_nxt; + *(c + 6 + ldc) = c6_nxt; + *(c + 7 + ldc) = c7_nxt; +} + +static void ssolve_8x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + BLASLONG k; + FLOAT *aa = a, *bb = b; + FLOAT a0, a8, a9, a16, a17, a18, a24, a25, a26, a27, a32, a33, a34, a35; + FLOAT a36, a40, a41, a42, a43, a44, a45, a48, a49, a50, a51, a52, a53; + FLOAT a54, a56, a57, a58, a59, a60, a61, a62, a63; + FLOAT c0, c1, c2, c3, c4, c5, c6, c7; + + c0 = *(c + 0); + c1 = *(c + 1); + c2 = *(c + 2); + c3 = *(c + 3); + c4 = *(c + 4); + c5 = *(c + 5); + c6 = *(c + 6); + c7 = *(c + 7); + + for (k = 0; k < bk; k++) + { + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; + c2 -= aa[2] * bb[0]; + c3 -= aa[3] * bb[0]; + c4 -= aa[4] * bb[0]; + c5 -= aa[5] * bb[0]; + c6 -= aa[6] * bb[0]; + c7 -= aa[7] * bb[0]; + + aa += 8; + bb += 1; + } + + a -= 64; + b -= 8; + + a0 = *(a + 0); + a8 = *(a + 8); + a9 = *(a + 9); + a16 = *(a + 16); + a17 = *(a + 17); + a18 = *(a + 18); + a24 = *(a + 24); + a25 = *(a + 25); + a26 = *(a + 26); + a27 = *(a + 27); + a32 = *(a + 32); + a33 = *(a + 33); + a34 = *(a + 34); + a35 = *(a + 35); + a36 = *(a + 36); + a40 = *(a + 40); + a41 = *(a + 41); + a42 = *(a + 42); + a43 = *(a + 43); + a44 = *(a + 44); + a45 = *(a + 45); + a48 = *(a + 48); + a49 = *(a + 49); + a50 = *(a + 50); + a51 = *(a + 51); + a52 = *(a + 52); + a53 = *(a + 53); + a54 = *(a + 54); + a56 = *(a + 56); + a57 = *(a + 57); + a58 = *(a + 58); + a59 = *(a + 59); + a60 = *(a + 60); + a61 = *(a + 61); + a62 = *(a + 62); + a63 = *(a + 63); + + c7 *= a63; + + c6 -= c7 * a62; + c6 *= a54; + + c5 -= c7 * a61; + c5 -= c6 * a53; + c5 *= a45; + + c4 -= c7 * a60; + c4 -= c6 * a52; + c4 -= c5 
* a44; + c4 *= a36; + + c3 -= c7 * a59; + c3 -= c6 * a51; + c3 -= c5 * a43; + c3 -= c4 * a35; + c3 *= a27; + + c2 -= c7 * a58; + c2 -= c6 * a50; + c2 -= c5 * a42; + c2 -= c4 * a34; + c2 -= c3 * a26; + c2 *= a18; + + c1 -= c7 * a57; + c1 -= c6 * a49; + c1 -= c5 * a41; + c1 -= c4 * a33; + c1 -= c3 * a25; + c1 -= c2 * a17; + c1 *= a9; + + c0 -= c7 * a56; + c0 -= c6 * a48; + c0 -= c5 * a40; + c0 -= c4 * a32; + c0 -= c3 * a24; + c0 -= c2 * a16; + c0 -= c1 * a8; + c0 *= a0; + + *(b + 0) = c0; + *(b + 1) = c1; + *(b + 2) = c2; + *(b + 3) = c3; + *(b + 4) = c4; + *(b + 5) = c5; + *(b + 6) = c6; + *(b + 7) = c7; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 2) = c2; + *(c + 3) = c3; + *(c + 4) = c4; + *(c + 5) = c5; + *(c + 6) = c6; + *(c + 7) = c7; +} + +static void ssolve_4x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + BLASLONG k; + FLOAT *aa = a, *bb = b; + v4f32 src_b, src_b0, src_b1, src_b2, src_b3; + v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; + v4f32 src_a, src_a0, src_a4, src_a5, src_a8, src_a9, src_a10, src_a12; + v4f32 src_a13, src_a14, src_a15; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * ldc; + FLOAT *c_nxt4line = c + 4 * ldc; + FLOAT *c_nxt5line = c + 5 * ldc; + FLOAT *c_nxt6line = c + 6 * ldc; + FLOAT *c_nxt7line = c + 7 * ldc; + + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + src_c2 = LD_SP(c_nxt2line); + src_c3 = LD_SP(c_nxt3line); + src_c4 = LD_SP(c_nxt4line); + src_c5 = LD_SP(c_nxt5line); + src_c6 = LD_SP(c_nxt6line); + src_c7 = LD_SP(c_nxt7line); + + for (k = 0; k < bk; k++) + { + src_a0 = LD_SP(aa); + + src_b = LD_SP(bb); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a0 * src_b1; + src_c2 -= src_a0 * src_b2; + src_c3 -= src_a0 * src_b3; + + src_b = LD_SP(bb + 4); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c4 -= src_a0 * src_b0; + src_c5 -= src_a0 * src_b1; + src_c6 -= src_a0 * src_b2; + src_c7 -= src_a0 * src_b3; + + aa += 4; + bb += 8; + } + + a -= 16; + b -= 32; + + TRANSPOSE4x4_SP_SP(src_c0, src_c1, src_c2, src_c3, + res_c0, res_c1, res_c2, res_c3); + TRANSPOSE4x4_SP_SP(src_c4, src_c5, src_c6, src_c7, + res_c4, res_c5, res_c6, res_c7); + + src_a = LD_SP(a + 12); + SPLATI_W4_SP(src_a, src_a12, src_a13, src_a14, src_a15); + src_a8 = LD_SP(a + 8); + src_a10 = (v4f32) __msa_splati_w((v4i32) src_a8, 2); + src_a9 = (v4f32) __msa_splati_w((v4i32) src_a8, 1); + src_a8 = (v4f32) __msa_splati_w((v4i32) src_a8, 0); + + src_a5 = COPY_FLOAT_TO_VECTOR(*(a + 5)); + src_a4 = COPY_FLOAT_TO_VECTOR(*(a + 4)); + src_a0 = COPY_FLOAT_TO_VECTOR(*(a + 0)); + + res_c3 *= src_a15; + res_c7 *= src_a15; + res_c2 -= res_c3 * src_a14; + res_c6 -= res_c7 * src_a14; + res_c1 -= res_c3 * src_a13; + res_c5 -= res_c7 * src_a13; + res_c0 -= res_c3 * src_a12; + res_c4 -= res_c7 * src_a12; + + res_c2 *= src_a10; + res_c6 *= src_a10; + res_c1 -= res_c2 * src_a9; + res_c5 -= res_c6 * src_a9; + res_c0 -= res_c2 * src_a8; + res_c4 -= res_c6 * src_a8; + + res_c1 *= src_a5; + res_c5 *= src_a5; + res_c0 -= res_c1 * src_a4; + res_c4 -= res_c5 * src_a4; + + res_c0 *= src_a0; + res_c4 *= src_a0; + + ST_SP4(res_c0, res_c4, res_c1, res_c5, b, 4); + ST_SP4(res_c2, res_c6, res_c3, res_c7, b + 16, 4); + + TRANSPOSE4x4_SP_SP(res_c0, res_c1, res_c2, res_c3, + src_c0, src_c1, src_c2, src_c3); + TRANSPOSE4x4_SP_SP(res_c4, res_c5, res_c6, res_c7, + src_c4, src_c5, src_c6, src_c7); + + 
ST_SP(src_c0, c); + ST_SP(src_c1, c_nxt1line); + ST_SP(src_c2, c_nxt2line); + ST_SP(src_c3, c_nxt3line); + ST_SP(src_c4, c_nxt4line); + ST_SP(src_c5, c_nxt5line); + ST_SP(src_c6, c_nxt6line); + ST_SP(src_c7, c_nxt7line); +} + +static void ssolve_4x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + BLASLONG k; + FLOAT *aa = a, *bb = b; + v4f32 src_b, src_b0, src_b1, src_b2, src_b3; + v4f32 src_c0, src_c1, src_c2, src_c3, res_c0, res_c1, res_c2, res_c3; + v4f32 src_a, src_a0, src_a4, src_a5, src_a8, src_a9, src_a10, src_a12; + v4f32 src_a13, src_a14, src_a15; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * ldc; + + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + src_c2 = LD_SP(c_nxt2line); + src_c3 = LD_SP(c_nxt3line); + + for (k = 0; k < (bk >> 1); k++) + { + src_a0 = LD_SP(aa); + + src_b = LD_SP(bb); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a0 * src_b1; + src_c2 -= src_a0 * src_b2; + src_c3 -= src_a0 * src_b3; + + aa += 4; + bb += 4; + + src_a0 = LD_SP(aa); + + src_b = LD_SP(bb); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a0 * src_b1; + src_c2 -= src_a0 * src_b2; + src_c3 -= src_a0 * src_b3; + + aa += 4; + bb += 4; + } + + if ((bk & 1) && (bk > 0)) + { + src_a0 = LD_SP(aa); + + src_b = LD_SP(bb); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a0 * src_b1; + src_c2 -= src_a0 * src_b2; + src_c3 -= src_a0 * src_b3; + } + + a -= 16; + b -= 16; + + TRANSPOSE4x4_SP_SP(src_c0, src_c1, src_c2, src_c3, + res_c0, res_c1, res_c2, res_c3); + + src_a = LD_SP(a + 12); + SPLATI_W4_SP(src_a, src_a12, src_a13, src_a14, src_a15); + src_a8 = LD_SP(a + 8); + src_a10 = (v4f32) __msa_splati_w((v4i32) src_a8, 2); + src_a9 = (v4f32) __msa_splati_w((v4i32) src_a8, 1); + src_a8 = (v4f32) __msa_splati_w((v4i32) src_a8, 0); + src_a5 = COPY_FLOAT_TO_VECTOR(*(a + 5)); + src_a4 = COPY_FLOAT_TO_VECTOR(*(a + 4)); + src_a0 = COPY_FLOAT_TO_VECTOR(*(a + 0)); + + res_c3 *= src_a15; + res_c2 -= res_c3 * src_a14; + res_c1 -= res_c3 * src_a13; + res_c0 -= res_c3 * src_a12; + + res_c2 *= src_a10; + res_c1 -= res_c2 * src_a9; + res_c0 -= res_c2 * src_a8; + + res_c1 *= src_a5; + res_c0 -= res_c1 * src_a4; + + res_c0 *= src_a0; + + ST_SP4(res_c0, res_c1, res_c2, res_c3, b, 4); + + TRANSPOSE4x4_SP_SP(res_c0, res_c1, res_c2, res_c3, + src_c0, src_c1, src_c2, src_c3); + + ST_SP(src_c0, c); + ST_SP(src_c1, c_nxt1line); + ST_SP(src_c2, c_nxt2line); + ST_SP(src_c3, c_nxt3line); +} + +static void ssolve_4x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + BLASLONG k; + FLOAT *aa = a, *bb = b; + FLOAT a0, a4, a5, a8, a9, a10, a12, a13, a14, a15; + FLOAT c0, c1, c2, c3, c0_nxt, c1_nxt, c2_nxt, c3_nxt; + + c0 = *(c + 0); + c1 = *(c + 1); + c2 = *(c + 2); + c3 = *(c + 3); + c0_nxt = *(c + 0 + ldc); + c1_nxt = *(c + 1 + ldc); + c2_nxt = *(c + 2 + ldc); + c3_nxt = *(c + 3 + ldc); + + for (k = 0; k < bk; k++) + { + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; + c2 -= aa[2] * bb[0]; + c3 -= aa[3] * bb[0]; + c0_nxt -= aa[0] * bb[1]; + c1_nxt -= aa[1] * bb[1]; + c2_nxt -= aa[2] * bb[1]; + c3_nxt -= aa[3] * bb[1]; + + aa += 4; + bb += 2; + } + + a -= 16; + b -= 8; + + a0 = *(a + 0); + a4 = *(a + 4); + a5 = *(a + 5); + a8 = *(a + 8); + a9 = *(a + 9); + a10 = *(a + 10); + a12 = *(a + 12); + a13 = *(a + 13); + a14 = *(a + 14); + a15 = *(a + 15); + + c3 *= a15; + c3_nxt *= a15; + + c2 -= c3 * a14; + 
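/* Note on the structure shared by all of these kernels: the leading loop over
 * bk subtracts the panel product accumulated so far (presumably the part of
 * the panel that earlier tiles have already handled) from the C tile, and only
 * then is the triangular block applied.  A scalar sketch of that first phase
 * for a 4-row tile and a single right-hand side (illustrative names only):
 *
 *     for (BLASLONG p = 0; p < bk; p++) {
 *         for (int i = 0; i < 4; i++)
 *             x[i] -= aa[i] * bb[0];          // rank-1 update per processed row
 *         aa += 4;
 *         bb += 1;
 *     }
 *     // ...followed by the 4x4 triangular solve on x[]
 *
 * The vector kernels above unroll this loop by two and mop up an odd bk with
 * the (bk & 1) tail.
 */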
c2_nxt -= c3_nxt * a14; + + c2 *= a10; + c2_nxt *= a10; + + c1 -= c3 * a13; + c1_nxt -= c3_nxt * a13; + + c1 -= c2 * a9; + c1_nxt -= c2_nxt * a9; + + c1 *= a5; + c1_nxt *= a5; + + c0 -= c3 * a12; + c0_nxt -= c3_nxt * a12; + + c0 -= c2 * a8; + c0_nxt -= c2_nxt * a8; + + c0 -= c1 * a4; + c0_nxt -= c1_nxt * a4; + + c0 *= a0; + c0_nxt *= a0; + + *(b + 0) = c0; + *(b + 1) = c0_nxt; + *(b + 2) = c1; + *(b + 3) = c1_nxt; + *(b + 4) = c2; + *(b + 5) = c2_nxt; + *(b + 6) = c3; + *(b + 7) = c3_nxt; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 2) = c2; + *(c + 3) = c3; + *(c + 0 + ldc) = c0_nxt; + *(c + 1 + ldc) = c1_nxt; + *(c + 2 + ldc) = c2_nxt; + *(c + 3 + ldc) = c3_nxt; +} + +static void ssolve_4x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + BLASLONG k; + FLOAT *aa = a, *bb = b; + FLOAT a0, a4, a5, a8, a9, a10, a12, a13, a14, a15, c0, c1, c2, c3; + + c0 = *(c + 0); + c1 = *(c + 1); + c2 = *(c + 2); + c3 = *(c + 3); + + for (k = 0; k < bk; k++) + { + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; + c2 -= aa[2] * bb[0]; + c3 -= aa[3] * bb[0]; + + aa += 4; + bb += 1; + } + + a -= 16; + b -= 4; + + a0 = *(a + 0); + a4 = *(a + 4); + a5 = *(a + 5); + a8 = *(a + 8); + a9 = *(a + 9); + a10 = *(a + 10); + a12 = *(a + 12); + a13 = *(a + 13); + a14 = *(a + 14); + a15 = *(a + 15); + + c3 *= a15; + + c2 -= c3 * a14; + c2 *= a10; + + c1 -= c3 * a13; + c1 -= c2 * a9; + c1 *= a5; + + c0 -= c3 * a12; + c0 -= c2 * a8; + c0 -= c1 * a4; + c0 *= a0; + + *(b + 0) = c0; + *(b + 1) = c1; + *(b + 2) = c2; + *(b + 3) = c3; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 2) = c2; + *(c + 3) = c3; +} + +static void ssolve_2x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + BLASLONG k; + FLOAT *aa = a, *bb = b; + FLOAT a0, a2, a3, c0, c1, c0_nxt1, c1_nxt1, c0_nxt2, c1_nxt2, c0_nxt3; + FLOAT c1_nxt3, c0_nxt4, c1_nxt4, c0_nxt5, c1_nxt5, c0_nxt6, c1_nxt6; + FLOAT c0_nxt7, c1_nxt7; + + c0 = *(c + 0); + c1 = *(c + 1); + c0_nxt1 = *(c + 0 + 1 * ldc); + c1_nxt1 = *(c + 1 + 1 * ldc); + c0_nxt2 = *(c + 0 + 2 * ldc); + c1_nxt2 = *(c + 1 + 2 * ldc); + c0_nxt3 = *(c + 0 + 3 * ldc); + c1_nxt3 = *(c + 1 + 3 * ldc); + c0_nxt4 = *(c + 0 + 4 * ldc); + c1_nxt4 = *(c + 1 + 4 * ldc); + c0_nxt5 = *(c + 0 + 5 * ldc); + c1_nxt5 = *(c + 1 + 5 * ldc); + c0_nxt6 = *(c + 0 + 6 * ldc); + c1_nxt6 = *(c + 1 + 6 * ldc); + c0_nxt7 = *(c + 0 + 7 * ldc); + c1_nxt7 = *(c + 1 + 7 * ldc); + + for (k = 0; k < bk; k++) + { + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; + c0_nxt1 -= aa[0] * bb[1]; + c1_nxt1 -= aa[1] * bb[1]; + c0_nxt2 -= aa[0] * bb[2]; + c1_nxt2 -= aa[1] * bb[2]; + c0_nxt3 -= aa[0] * bb[3]; + c1_nxt3 -= aa[1] * bb[3]; + c0_nxt4 -= aa[0] * bb[4]; + c1_nxt4 -= aa[1] * bb[4]; + c0_nxt5 -= aa[0] * bb[5]; + c1_nxt5 -= aa[1] * bb[5]; + c0_nxt6 -= aa[0] * bb[6]; + c1_nxt6 -= aa[1] * bb[6]; + c0_nxt7 -= aa[0] * bb[7]; + c1_nxt7 -= aa[1] * bb[7]; + + aa += 2; + bb += 8; + } + + a -= 4; + b -= 16; + + a0 = *(a + 0); + a2 = *(a + 2); + a3 = *(a + 3); + + c1 *= a3; + c1_nxt1 *= a3; + c1_nxt2 *= a3; + c1_nxt3 *= a3; + c1_nxt4 *= a3; + c1_nxt5 *= a3; + c1_nxt6 *= a3; + c1_nxt7 *= a3; + + c0 -= c1 * a2; + c0_nxt1 -= c1_nxt1 * a2; + c0_nxt2 -= c1_nxt2 * a2; + c0_nxt3 -= c1_nxt3 * a2; + c0_nxt4 -= c1_nxt4 * a2; + c0_nxt5 -= c1_nxt5 * a2; + c0_nxt6 -= c1_nxt6 * a2; + c0_nxt7 -= c1_nxt7 * a2; + + c0 *= a0; + c0_nxt1 *= a0; + c0_nxt2 *= a0; + c0_nxt3 *= a0; + c0_nxt4 *= a0; + c0_nxt5 *= a0; + c0_nxt6 *= a0; + c0_nxt7 *= a0; + + *(b + 0) = c0; + *(b + 1) = c0_nxt1; + *(b + 2) = c0_nxt2; + *(b + 3) = c0_nxt3; + *(b + 4) = c0_nxt4; + *(b + 5) = 
c0_nxt5; + *(b + 6) = c0_nxt6; + *(b + 7) = c0_nxt7; + *(b + 8) = c1; + *(b + 9) = c1_nxt1; + *(b + 10) = c1_nxt2; + *(b + 11) = c1_nxt3; + *(b + 12) = c1_nxt4; + *(b + 13) = c1_nxt5; + *(b + 14) = c1_nxt6; + *(b + 15) = c1_nxt7; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 0 + 1 * ldc) = c0_nxt1; + *(c + 1 + 1 * ldc) = c1_nxt1; + *(c + 0 + 2 * ldc) = c0_nxt2; + *(c + 1 + 2 * ldc) = c1_nxt2; + *(c + 0 + 3 * ldc) = c0_nxt3; + *(c + 1 + 3 * ldc) = c1_nxt3; + *(c + 0 + 4 * ldc) = c0_nxt4; + *(c + 1 + 4 * ldc) = c1_nxt4; + *(c + 0 + 5 * ldc) = c0_nxt5; + *(c + 1 + 5 * ldc) = c1_nxt5; + *(c + 0 + 6 * ldc) = c0_nxt6; + *(c + 1 + 6 * ldc) = c1_nxt6; + *(c + 0 + 7 * ldc) = c0_nxt7; + *(c + 1 + 7 * ldc) = c1_nxt7; +} + +static void ssolve_2x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + BLASLONG k; + FLOAT *aa = a, *bb = b; + FLOAT a0, a2, a3, c0, c1, c0_nxt1, c1_nxt1; + FLOAT c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3; + + c0 = *(c + 0); + c1 = *(c + 1); + c0_nxt1 = *(c + 0 + ldc); + c1_nxt1 = *(c + 1 + ldc); + c0_nxt2 = *(c + 0 + 2 * ldc); + c1_nxt2 = *(c + 1 + 2 * ldc); + c0_nxt3 = *(c + 0 + 3 * ldc); + c1_nxt3 = *(c + 1 + 3 * ldc); + + for (k = 0; k < bk; k++) + { + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; + c0_nxt1 -= aa[0] * bb[1]; + c1_nxt1 -= aa[1] * bb[1]; + c0_nxt2 -= aa[0] * bb[2]; + c1_nxt2 -= aa[1] * bb[2]; + c0_nxt3 -= aa[0] * bb[3]; + c1_nxt3 -= aa[1] * bb[3]; + + aa += 2; + bb += 4; + } + + a -= 4; + b -= 8; + + a0 = *(a + 0); + a2 = *(a + 2); + a3 = *(a + 3); + + c1 *= a3; + c1_nxt1 *= a3; + c1_nxt2 *= a3; + c1_nxt3 *= a3; + + c0 -= c1 * a2; + c0_nxt1 -= c1_nxt1 * a2; + c0_nxt2 -= c1_nxt2 * a2; + c0_nxt3 -= c1_nxt3 * a2; + + c0 *= a0; + c0_nxt1 *= a0; + c0_nxt2 *= a0; + c0_nxt3 *= a0; + + *(b + 0) = c0; + *(b + 1) = c0_nxt1; + *(b + 2) = c0_nxt2; + *(b + 3) = c0_nxt3; + *(b + 4) = c1; + *(b + 5) = c1_nxt1; + *(b + 6) = c1_nxt2; + *(b + 7) = c1_nxt3; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 0 + ldc) = c0_nxt1; + *(c + 1 + ldc) = c1_nxt1; + *(c + 0 + 2 * ldc) = c0_nxt2; + *(c + 1 + 2 * ldc) = c1_nxt2; + *(c + 0 + 3 * ldc) = c0_nxt3; + *(c + 1 + 3 * ldc) = c1_nxt3; +} + +static void ssolve_2x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + BLASLONG k; + FLOAT *aa = a, *bb = b; + FLOAT a0, a2, a3, c0, c1, c0_nxt, c1_nxt; + + c0 = *(c + 0); + c1 = *(c + 1); + c0_nxt = *(c + 0 + ldc); + c1_nxt = *(c + 1 + ldc); + + for (k = 0; k < bk; k++) + { + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; + c0_nxt -= aa[0] * bb[1]; + c1_nxt -= aa[1] * bb[1]; + + aa += 2; + bb += 2; + } + + a -= 4; + b -= 4; + + a0 = *(a + 0); + a2 = *(a + 2); + a3 = *(a + 3); + + c1 *= a3; + c1_nxt *= a3; + + c0 -= c1 * a2; + c0_nxt -= c1_nxt * a2; + + c0 *= a0; + c0_nxt *= a0; + + *(b + 0) = c0; + *(b + 1) = c0_nxt; + *(b + 2) = c1; + *(b + 3) = c1_nxt; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 0 + ldc) = c0_nxt; + *(c + 1 + ldc) = c1_nxt; +} + +static void ssolve_2x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + BLASLONG k; + FLOAT *aa = a, *bb = b; + FLOAT a0, a2, a3, c0, c1; + + c0 = *(c + 0); + c1 = *(c + 1); + + for (k = 0; k < bk; k++) + { + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; + + aa += 2; + bb += 1; + } + + a -= 4; + b -= 2; + + a0 = *(a + 0); + a2 = *(a + 2); + a3 = *(a + 3); + + c1 *= a3; + + c0 -= c1 * a2; + c0 *= a0; + + *(b + 0) = c0; + *(b + 1) = c1; + + *(c + 0) = c0; + *(c + 1) = c1; +} + +static void ssolve_1x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + BLASLONG k; + FLOAT *aa = a, *bb = b; + FLOAT a0, c0, c1, c2, c3, 
c4, c5, c6, c7; + + c0 = *(c + 0); + c1 = *(c + 1 * ldc); + c2 = *(c + 2 * ldc); + c3 = *(c + 3 * ldc); + c4 = *(c + 4 * ldc); + c5 = *(c + 5 * ldc); + c6 = *(c + 6 * ldc); + c7 = *(c + 7 * ldc); + + for (k = 0; k < bk; k++) + { + c0 -= aa[0] * bb[0]; + c1 -= aa[0] * bb[1]; + c2 -= aa[0] * bb[2]; + c3 -= aa[0] * bb[3]; + c4 -= aa[0] * bb[4]; + c5 -= aa[0] * bb[5]; + c6 -= aa[0] * bb[6]; + c7 -= aa[0] * bb[7]; + + aa += 1; + bb += 8; + } + + a0 = *(a - 1); + + c0 *= a0; + c1 *= a0; + c2 *= a0; + c3 *= a0; + c4 *= a0; + c5 *= a0; + c6 *= a0; + c7 *= a0; + + *(b - 8) = c0; + *(b - 7) = c1; + *(b - 6) = c2; + *(b - 5) = c3; + *(b - 4) = c4; + *(b - 3) = c5; + *(b - 2) = c6; + *(b - 1) = c7; + + *(c + 0 * ldc) = c0; + *(c + 1 * ldc) = c1; + *(c + 2 * ldc) = c2; + *(c + 3 * ldc) = c3; + *(c + 4 * ldc) = c4; + *(c + 5 * ldc) = c5; + *(c + 6 * ldc) = c6; + *(c + 7 * ldc) = c7; +} + +static void ssolve_1x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + BLASLONG k; + FLOAT *aa = a, *bb = b; + FLOAT a0, c0, c1, c2, c3; + + c0 = *(c + 0 * ldc); + c1 = *(c + 1 * ldc); + c2 = *(c + 2 * ldc); + c3 = *(c + 3 * ldc); + + for (k = 0; k < bk; k++) + { + c0 -= aa[0] * bb[0]; + c1 -= aa[0] * bb[1]; + c2 -= aa[0] * bb[2]; + c3 -= aa[0] * bb[3]; + + aa += 1; + bb += 4; + } + + a0 = *(a - 1); + + c0 *= a0; + c1 *= a0; + c2 *= a0; + c3 *= a0; + + *(b - 4) = c0; + *(b - 3) = c1; + *(b - 2) = c2; + *(b - 1) = c3; + + *(c + 0 * ldc) = c0; + *(c + 1 * ldc) = c1; + *(c + 2 * ldc) = c2; + *(c + 3 * ldc) = c3; +} + +static void ssolve_1x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + BLASLONG k; + FLOAT *aa = a, *bb = b; + FLOAT a0, c0, c1; + + c0 = *c; + c1 = *(c + ldc); + + for (k = 0; k < bk; k++) + { + c0 -= aa[0] * bb[0]; + c1 -= aa[0] * bb[1]; + + aa += 1; + bb += 2; + } + + a0 = *(a - 1); + + c0 *= a0; + c1 *= a0; + + *(b - 2) = c0; + *(b - 1) = c1; + + *(c + 0 * ldc) = c0; + *(c + 1 * ldc) = c1; +} + +static void ssolve_1x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + BLASLONG k; + + for (k = 0; k < bk; k++) + { + *c -= a[k] * b[k]; + } + + *c *= *(a - 1); + *(b - 1) = *c; +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, + FLOAT *c, BLASLONG ldc, BLASLONG offset) +{ + FLOAT *aa, *cc; + BLASLONG i, j, kk; + + for (j = (n >> 3); j--;) + { + kk = m + offset; + if (m & 7) + { + if (m & 1) + { + aa = a + (m - 1) * k + kk; + cc = c + (m - 1); + + ssolve_1x8_ln_msa(aa, b + 8 * kk, cc, ldc, (k - kk)); + + kk -= 1; + } + + if (m & 2) + { + aa = a + ((m & ~1) - 2) * k + 2 * kk; + cc = c + ((m & ~1) - 2); + + ssolve_2x8_ln_msa(aa, b + 8 * kk, cc, ldc, (k - kk)); + + kk -= 2; + } + + if (m & 4) + { + aa = a + ((m & ~3) - 4) * k + 4 * kk; + cc = c + ((m & ~3) - 4); + + ssolve_4x8_ln_msa(aa, b + 8 * kk, cc, ldc, (k - kk)); + + kk -= 4; + } + } + + i = (m >> 3); + if (i > 0) + { + aa = a + ((m & ~7) - 8) * k; + cc = c + ((m & ~7) - 8); + + do + { + ssolve_8x8_ln_msa(aa + 8 * kk, b + 8 * kk, cc, ldc, (k - kk)); + + aa -= 8 * k; + cc -= 8; + kk -= 8; + i --; + } while (i > 0); + } + + b += 8 * k; + c += 8 * ldc; + } + + if (n & 7) + { + if (n & 4) + { + kk = m + offset; + + if (m & 7) + { + if (m & 1) + { + aa = a + (m - 1) * k + kk; + cc = c + (m - 1); + + ssolve_1x4_ln_msa(aa, b + 4 * kk, cc, ldc, (k - kk)); + + kk -= 1; + } + + if (m & 2) + { + aa = a + ((m & ~1) - 2) * k + 2 * kk; + cc = c + ((m & ~1) - 2); + + ssolve_2x4_ln_msa(aa, b + 4 * kk, cc, ldc, (k - kk)); + + kk -= 2; + } + + if (m & 4) + { + aa = a + ((m & ~3) - 4) * k + 4 * 
kk; + cc = c + ((m & ~3) - 4); + + ssolve_4x4_ln_msa(aa, b + 4 * kk, cc, ldc, (k - kk)); + + kk -= 4; + } + } + + i = (m >> 3); + if (i > 0) + { + aa = a + ((m & ~7) - 8) * k; + cc = c + ((m & ~7) - 8); + + do + { + ssolve_8x4_ln_msa(aa + 8 * kk, b + 4 * kk, cc, ldc, (k - kk)); + + aa -= 8 * k; + cc -= 8; + kk -= 8; + i --; + } while (i > 0); + } + + b += 4 * k; + c += 4 * ldc; + } + + if (n & 2) + { + kk = m + offset; + + if (m & 7) + { + if (m & 1) + { + aa = a + (m - 1) * k + kk; + cc = c + (m - 1); + + ssolve_1x2_ln_msa(aa, b + 2 * kk, cc, ldc, (k - kk)); + + kk -= 1; + } + + if (m & 2) + { + aa = a + ((m & ~1) - 2) * k + 2 * kk; + cc = c + ((m & ~1) - 2); + + ssolve_2x2_ln_msa(aa, b + 2 * kk, cc, ldc, (k - kk)); + + kk -= 2; + } + + if (m & 4) + { + aa = a + ((m & ~3) - 4) * k + 4 * kk; + cc = c + ((m & ~3) - 4); + + ssolve_4x2_ln_msa(aa, b + 2 * kk, cc, ldc, (k - kk)); + + kk -= 4; + } + } + + i = (m >> 3); + if (i > 0) + { + aa = a + ((m & ~7) - 8) * k; + cc = c + ((m & ~7) - 8); + + do + { + ssolve_8x2_ln_msa(aa + 8 * kk, b + 2 * kk, cc, ldc, (k - kk)); + + aa -= 8 * k; + cc -= 8; + kk -= 8; + i --; + } while (i > 0); + } + + b += 2 * k; + c += 2 * ldc; + } + + if (n & 1) + { + kk = m + offset; + + if (m & 7) + { + if (m & 1) + { + aa = a + (m - 1) * k + kk; + cc = c + (m - 1); + + ssolve_1x1_ln_msa(aa, b + kk, cc, (k - kk)); + + kk -= 1; + } + + if (m & 2) + { + aa = a + ((m & ~1) - 2) * k + 2 * kk; + cc = c + ((m & ~1) - 2); + + ssolve_2x1_ln_msa(aa, b + kk, cc, (k - kk)); + + kk -= 2; + } + + if (m & 4) + { + aa = a + ((m & ~3) - 4) * k + 4 * kk; + cc = c + ((m & ~3) - 4); + + ssolve_4x1_ln_msa(aa, b + kk, cc, (k - kk)); + + kk -= 4; + } + } + + i = (m >> 3); + if (i > 0) + { + aa = a + ((m & ~7) - 8) * k; + cc = c + ((m & ~7) - 8); + + do + { + ssolve_8x1_ln_msa(aa + 8 * kk, b + kk, cc, (k - kk)); + + aa -= 8 * k; + cc -= 8; + kk -= 8; + i --; + } while (i > 0); + } + + b += k; + c += ldc; + } + } + + return 0; +} diff --git a/kernel/mips/strsm_kernel_LT_8x8_msa.c b/kernel/mips/strsm_kernel_LT_8x8_msa.c new file mode 100644 index 0000000000..5834d77b24 --- /dev/null +++ b/kernel/mips/strsm_kernel_LT_8x8_msa.c @@ -0,0 +1,1694 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +static void ssolve_8x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + BLASLONG k; + v4f32 src_b, src_b0, src_b1, src_b2, src_b3; + v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v4f32 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15; + v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; + v4f32 res_c8, res_c9, res_c10, res_c11, res_c12, res_c13, res_c14, res_c15; + v4f32 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7; + v4f32 src_a9, src_a10, src_a11, src_a12, src_a13, src_a14, src_a15, src_a18; + v4f32 src_a19, src_a20, src_a21, src_a22, src_a23, src_a27, src_a28; + v4f32 src_a29, src_a30, src_a31, src_a36, src_a37, src_a38, src_a39; + v4f32 src_a45, src_a46, src_a47, src_a54, src_a55, src_a63, src_a; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * ldc; + FLOAT *c_nxt4line = c + 4 * ldc; + FLOAT *c_nxt5line = c + 5 * ldc; + FLOAT *c_nxt6line = c + 6 * ldc; + FLOAT *c_nxt7line = c + 7 * ldc; + + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(c_nxt2line, 4, src_c4, src_c5); + LD_SP2(c_nxt3line, 4, src_c6, src_c7); + LD_SP2(c_nxt4line, 4, src_c8, src_c9); + LD_SP2(c_nxt5line, 4, src_c10, src_c11); + LD_SP2(c_nxt6line, 4, src_c12, src_c13); + LD_SP2(c_nxt7line, 4, src_c14, src_c15); + + for (k = 0; k < bk; k++) + { + LD_SP2(a, 4, src_a0, src_a1); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; + src_c4 -= src_a0 * src_b2; + src_c5 -= src_a1 * src_b2; + src_c6 -= src_a0 * src_b3; + src_c7 -= src_a1 * src_b3; + + src_b = LD_SP(b + 4); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c8 -= src_a0 * src_b0; + src_c9 -= src_a1 * src_b0; + src_c10 -= src_a0 * src_b1; + src_c11 -= src_a1 * src_b1; + src_c12 -= src_a0 * src_b2; + src_c13 -= src_a1 * src_b2; + src_c14 -= src_a0 * src_b3; + src_c15 -= src_a1 * src_b3; + + a += 8; + b += 8; + } + + TRANSPOSE4x4_SP_SP(src_c0, src_c2, src_c4, src_c6, + res_c0, res_c1, res_c2, res_c3); + TRANSPOSE4x4_SP_SP(src_c8, src_c10, src_c12, src_c14, + res_c8, res_c9, res_c10, res_c11); + TRANSPOSE4x4_SP_SP(src_c1, src_c3, src_c5, src_c7, + res_c4, res_c5, res_c6, res_c7); + TRANSPOSE4x4_SP_SP(src_c9, src_c11, src_c13, src_c15, + res_c12, res_c13, res_c14, res_c15); + + src_a = LD_SP(a + 0); + SPLATI_W4_SP(src_a, src_a0, src_a1, src_a2, src_a3); + src_a = LD_SP(a + 4); + SPLATI_W4_SP(src_a, src_a4, src_a5, src_a6, src_a7); + + res_c0 *= src_a0; + res_c8 *= src_a0; + res_c1 -= res_c0 * src_a1; + res_c9 -= res_c8 * src_a1; + res_c2 -= res_c0 * src_a2; + res_c10 -= res_c8 * src_a2; + res_c3 -= res_c0 * src_a3; + res_c11 -= res_c8 * src_a3; + res_c4 -= res_c0 * src_a4; + res_c12 -= res_c8 * 
src_a4; + res_c5 -= res_c0 * src_a5; + res_c13 -= res_c8 * src_a5; + res_c6 -= res_c0 * src_a6; + res_c14 -= res_c8 * src_a6; + res_c7 -= res_c0 * src_a7; + res_c15 -= res_c8 * src_a7; + + src_a = LD_SP(a + 9); + SPLATI_W4_SP(src_a, src_a9, src_a10, src_a11, src_a12); + src_a13 = LD_SP(a + 13); + src_a15 = (v4f32) __msa_splati_w((v4i32) src_a13, 2); + src_a14 = (v4f32) __msa_splati_w((v4i32) src_a13, 1); + src_a13 = (v4f32) __msa_splati_w((v4i32) src_a13, 0); + + res_c1 *= src_a9; + res_c9 *= src_a9; + res_c2 -= res_c1 * src_a10; + res_c10 -= res_c9 * src_a10; + res_c3 -= res_c1 * src_a11; + res_c11 -= res_c9 * src_a11; + res_c4 -= res_c1 * src_a12; + res_c12 -= res_c9 * src_a12; + res_c5 -= res_c1 * src_a13; + res_c13 -= res_c9 * src_a13; + res_c6 -= res_c1 * src_a14; + res_c14 -= res_c9 * src_a14; + res_c7 -= res_c1 * src_a15; + res_c15 -= res_c9 * src_a15; + + src_a = LD_SP(a + 18); + SPLATI_W4_SP(src_a, src_a18, src_a19, src_a20, src_a21); + src_a22 = LD_SP(a + 22); + src_a23 = (v4f32) __msa_splati_w((v4i32) src_a22, 1); + src_a22 = (v4f32) __msa_splati_w((v4i32) src_a22, 0); + + res_c2 *= src_a18; + res_c10 *= src_a18; + res_c3 -= res_c2 * src_a19; + res_c11 -= res_c10 * src_a19; + res_c4 -= res_c2 * src_a20; + res_c12 -= res_c10 * src_a20; + res_c5 -= res_c2 * src_a21; + res_c13 -= res_c10 * src_a21; + res_c6 -= res_c2 * src_a22; + res_c14 -= res_c10 * src_a22; + res_c7 -= res_c2 * src_a23; + res_c15 -= res_c10 * src_a23; + + src_a = LD_SP(a + 27); + SPLATI_W4_SP(src_a, src_a27, src_a28, src_a29, src_a30); + src_a31 = COPY_FLOAT_TO_VECTOR(*(a + 31)); + + res_c3 *= src_a27; + res_c11 *= src_a27; + res_c4 -= res_c3 * src_a28; + res_c12 -= res_c11 * src_a28; + res_c5 -= res_c3 * src_a29; + res_c13 -= res_c11 * src_a29; + res_c6 -= res_c3 * src_a30; + res_c14 -= res_c11 * src_a30; + res_c7 -= res_c3 * src_a31; + res_c15 -= res_c11 * src_a31; + + ST_SP4(res_c0, res_c8, res_c1, res_c9, b, 4); + ST_SP4(res_c2, res_c10, res_c3, res_c11, b + 16, 4); + + TRANSPOSE4x4_SP_SP(res_c0, res_c1, res_c2, res_c3, + src_c0, src_c2, src_c4, src_c6); + TRANSPOSE4x4_SP_SP(res_c8, res_c9, res_c10, res_c11, + src_c8, src_c10, src_c12, src_c14); + + ST_SP(src_c0, c); + ST_SP(src_c2, c_nxt1line); + ST_SP(src_c4, c_nxt2line); + ST_SP(src_c6, c_nxt3line); + ST_SP(src_c8, c_nxt4line); + ST_SP(src_c10, c_nxt5line); + ST_SP(src_c12, c_nxt6line); + ST_SP(src_c14, c_nxt7line); + + src_a = LD_SP(a + 36); + SPLATI_W4_SP(src_a, src_a36, src_a37, src_a38, src_a39); + + res_c4 *= src_a36; + res_c12 *= src_a36; + res_c5 -= res_c4 * src_a37; + res_c13 -= res_c12 * src_a37; + res_c6 -= res_c4 * src_a38; + res_c14 -= res_c12 * src_a38; + res_c7 -= res_c4 * src_a39; + res_c15 -= res_c12 * src_a39; + + src_a45 = LD_SP(a + 45); + src_a47 = (v4f32) __msa_splati_w((v4i32) src_a45, 2); + src_a46 = (v4f32) __msa_splati_w((v4i32) src_a45, 1); + src_a45 = (v4f32) __msa_splati_w((v4i32) src_a45, 0); + + res_c5 *= src_a45; + res_c13 *= src_a45; + res_c6 -= res_c5 * src_a46; + res_c14 -= res_c13 * src_a46; + res_c7 -= res_c5 * src_a47; + res_c15 -= res_c13 * src_a47; + + src_a54 = COPY_FLOAT_TO_VECTOR(*(a + 54)); + src_a55 = COPY_FLOAT_TO_VECTOR(*(a + 55)); + src_a63 = COPY_FLOAT_TO_VECTOR(*(a + 63)); + + res_c6 *= src_a54; + res_c14 *= src_a54; + res_c7 -= res_c6 * src_a55; + res_c15 -= res_c14 * src_a55; + + res_c7 *= src_a63; + res_c15 *= src_a63; + + ST_SP4(res_c4, res_c12, res_c5, res_c13, b + 32, 4); + ST_SP4(res_c6, res_c14, res_c7, res_c15, b + 48, 4); + + TRANSPOSE4x4_SP_SP(res_c4, res_c5, res_c6, res_c7, + src_c1, src_c3, 
src_c5, src_c7); + TRANSPOSE4x4_SP_SP(res_c12, res_c13, res_c14, res_c15, + src_c9, src_c11, src_c13, src_c15); + + ST_SP(src_c1, c + 4); + ST_SP(src_c3, c_nxt1line + 4); + ST_SP(src_c5, c_nxt2line + 4); + ST_SP(src_c7, c_nxt3line + 4); + ST_SP(src_c9, c_nxt4line + 4); + ST_SP(src_c11, c_nxt5line + 4); + ST_SP(src_c13, c_nxt6line + 4); + ST_SP(src_c15, c_nxt7line + 4); +} + +static void ssolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + BLASLONG k; + v4f32 src_b, src_b0, src_b1, src_b2, src_b3; + v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; + v4f32 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7; + v4f32 src_a9, src_a10, src_a11, src_a12, src_a13, src_a14, src_a15, src_a18; + v4f32 src_a19, src_a20, src_a21, src_a22, src_a23, src_a27, src_a28; + v4f32 src_a29, src_a30, src_a31, src_a36, src_a37, src_a38, src_a39; + v4f32 src_a45, src_a46, src_a47, src_a54, src_a55, src_a63, src_a; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * ldc; + + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(c_nxt2line, 4, src_c4, src_c5); + LD_SP2(c_nxt3line, 4, src_c6, src_c7); + + for (k = 0; k < bk; k++) + { + LD_SP2(a, 4, src_a0, src_a1); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; + src_c4 -= src_a0 * src_b2; + src_c5 -= src_a1 * src_b2; + src_c6 -= src_a0 * src_b3; + src_c7 -= src_a1 * src_b3; + + a += 8; + b += 4; + } + + TRANSPOSE4x4_SP_SP(src_c0, src_c2, src_c4, src_c6, + res_c0, res_c1, res_c2, res_c3); + TRANSPOSE4x4_SP_SP(src_c1, src_c3, src_c5, src_c7, + res_c4, res_c5, res_c6, res_c7); + + src_a = LD_SP(a + 0); + SPLATI_W4_SP(src_a, src_a0, src_a1, src_a2, src_a3); + src_a = LD_SP(a + 4); + SPLATI_W4_SP(src_a, src_a4, src_a5, src_a6, src_a7); + + res_c0 *= src_a0; + res_c1 -= res_c0 * src_a1; + res_c2 -= res_c0 * src_a2; + res_c3 -= res_c0 * src_a3; + res_c4 -= res_c0 * src_a4; + res_c5 -= res_c0 * src_a5; + res_c6 -= res_c0 * src_a6; + res_c7 -= res_c0 * src_a7; + + src_a = LD_SP(a + 9); + SPLATI_W4_SP(src_a, src_a9, src_a10, src_a11, src_a12); + src_a13 = LD_SP(a + 13); + src_a15 = (v4f32) __msa_splati_w((v4i32) src_a13, 2); + src_a14 = (v4f32) __msa_splati_w((v4i32) src_a13, 1); + src_a13 = (v4f32) __msa_splati_w((v4i32) src_a13, 0); + + res_c1 *= src_a9; + res_c2 -= res_c1 * src_a10; + res_c3 -= res_c1 * src_a11; + res_c4 -= res_c1 * src_a12; + res_c5 -= res_c1 * src_a13; + res_c6 -= res_c1 * src_a14; + res_c7 -= res_c1 * src_a15; + + src_a = LD_SP(a + 18); + SPLATI_W4_SP(src_a, src_a18, src_a19, src_a20, src_a21); + src_a22 = LD_SP(a + 22); + src_a23 = (v4f32) __msa_splati_w((v4i32) src_a22, 1); + src_a22 = (v4f32) __msa_splati_w((v4i32) src_a22, 0); + + res_c2 *= src_a18; + res_c3 -= res_c2 * src_a19; + res_c4 -= res_c2 * src_a20; + res_c5 -= res_c2 * src_a21; + res_c6 -= res_c2 * src_a22; + res_c7 -= res_c2 * src_a23; + + src_a = LD_SP(a + 27); + SPLATI_W4_SP(src_a, src_a27, src_a28, src_a29, src_a30); + src_a31 = COPY_FLOAT_TO_VECTOR(*(a + 31)); + + res_c3 *= src_a27; + res_c4 -= res_c3 * src_a28; + res_c5 -= res_c3 * src_a29; + res_c6 -= res_c3 * src_a30; + res_c7 -= res_c3 * src_a31; + + src_a = LD_SP(a + 36); + SPLATI_W4_SP(src_a, src_a36, src_a37, src_a38, src_a39); + + res_c4 *= src_a36; + res_c5 -= res_c4 * src_a37; + 
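/* In contrast to the LN file, the LT kernels sweep the packed 8x8 block from
 * its first row downward: each unknown is scaled by the (apparently
 * pre-inverted) diagonal entry and then propagated into the unknowns that
 * follow it.  Scalar sketch, assuming a row-major block t[] and one column
 * x[] (illustrative names, not part of this kernel):
 *
 *     for (int i = 0; i < 8; i++) {
 *         x[i] *= t[8 * i + i];              // diagonal held as its reciprocal
 *         for (int j = i + 1; j < 8; j++)
 *             x[j] -= x[i] * t[8 * i + j];   // forward propagation
 *     }
 */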
res_c6 -= res_c4 * src_a38; + res_c7 -= res_c4 * src_a39; + + src_a45 = LD_SP(a + 45); + src_a47 = (v4f32) __msa_splati_w((v4i32) src_a45, 2); + src_a46 = (v4f32) __msa_splati_w((v4i32) src_a45, 1); + src_a45 = (v4f32) __msa_splati_w((v4i32) src_a45, 0); + + res_c5 *= src_a45; + res_c6 -= res_c5 * src_a46; + res_c7 -= res_c5 * src_a47; + + src_a54 = COPY_FLOAT_TO_VECTOR(*(a + 54)); + src_a55 = COPY_FLOAT_TO_VECTOR(*(a + 55)); + src_a63 = COPY_FLOAT_TO_VECTOR(*(a + 63)); + + res_c6 *= src_a54; + res_c7 -= res_c6 * src_a55; + res_c7 *= src_a63; + + ST_SP4(res_c0, res_c1, res_c2, res_c3, b, 4); + b += 16; + ST_SP4(res_c4, res_c5, res_c6, res_c7, b, 4); + + TRANSPOSE4x4_SP_SP(res_c0, res_c1, res_c2, res_c3, + src_c0, src_c2, src_c4, src_c6); + TRANSPOSE4x4_SP_SP(res_c4, res_c5, res_c6, res_c7, + src_c1, src_c3, src_c5, src_c7); + + ST_SP2(src_c0, src_c1, c, 4); + ST_SP2(src_c2, src_c3, c_nxt1line, 4); + ST_SP2(src_c4, src_c5, c_nxt2line, 4); + ST_SP2(src_c6, src_c7, c_nxt3line, 4); +} + +static void ssolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + BLASLONG k; + FLOAT a0, a1, a2, a3, a4, a5, a6, a7, a9, a10, a11, a12, a13, a14, a15, a18; + FLOAT a19, a20, a21, a22, a23, a27, a28, a29, a30, a31, a36, a37, a38, a39; + FLOAT a45, a46, a47, a54, a55, a63; + FLOAT c0, c1, c2, c3, c4, c5, c6, c7; + FLOAT c0_nxt, c1_nxt, c2_nxt, c3_nxt, c4_nxt, c5_nxt, c6_nxt, c7_nxt; + + c0 = *(c + 0); + c1 = *(c + 1); + c2 = *(c + 2); + c3 = *(c + 3); + c4 = *(c + 4); + c5 = *(c + 5); + c6 = *(c + 6); + c7 = *(c + 7); + c0_nxt = *(c + 0 + ldc); + c1_nxt = *(c + 1 + ldc); + c2_nxt = *(c + 2 + ldc); + c3_nxt = *(c + 3 + ldc); + c4_nxt = *(c + 4 + ldc); + c5_nxt = *(c + 5 + ldc); + c6_nxt = *(c + 6 + ldc); + c7_nxt = *(c + 7 + ldc); + + for (k = 0; k < bk; k++) + { + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; + c2 -= a[2] * b[0]; + c3 -= a[3] * b[0]; + c4 -= a[4] * b[0]; + c5 -= a[5] * b[0]; + c6 -= a[6] * b[0]; + c7 -= a[7] * b[0]; + c0_nxt -= a[0] * b[1]; + c1_nxt -= a[1] * b[1]; + c2_nxt -= a[2] * b[1]; + c3_nxt -= a[3] * b[1]; + c4_nxt -= a[4] * b[1]; + c5_nxt -= a[5] * b[1]; + c6_nxt -= a[6] * b[1]; + c7_nxt -= a[7] * b[1]; + + a += 8; + b += 2; + } + + a0 = *(a + 0); + a1 = *(a + 1); + a2 = *(a + 2); + a3 = *(a + 3); + a4 = *(a + 4); + a5 = *(a + 5); + a6 = *(a + 6); + a7 = *(a + 7); + a9 = *(a + 9); + a10 = *(a + 10); + a11 = *(a + 11); + a12 = *(a + 12); + a13 = *(a + 13); + a14 = *(a + 14); + a15 = *(a + 15); + a18 = *(a + 18); + a19 = *(a + 19); + a20 = *(a + 20); + a21 = *(a + 21); + a22 = *(a + 22); + a23 = *(a + 23); + a27 = *(a + 27); + a28 = *(a + 28); + a29 = *(a + 29); + a30 = *(a + 30); + a31 = *(a + 31); + a36 = *(a + 36); + a37 = *(a + 37); + a38 = *(a + 38); + a39 = *(a + 39); + a45 = *(a + 45); + a46 = *(a + 46); + a47 = *(a + 47); + a54 = *(a + 54); + a55 = *(a + 55); + a63 = *(a + 63); + + c0 *= a0; + c0_nxt *= a0; + + c1 -= c0 * a1; + c1_nxt -= c0_nxt * a1; + c1 *= a9; + c1_nxt *= a9; + + c2 -= c0 * a2; + c2_nxt -= c0_nxt * a2; + c2 -= c1 * a10; + c2_nxt -= c1_nxt * a10; + c2 *= a18; + c2_nxt *= a18; + + c3 -= c0 * a3; + c3_nxt -= c0_nxt * a3; + c3 -= c1 * a11; + c3_nxt -= c1_nxt * a11; + c3 -= c2 * a19; + c3_nxt -= c2_nxt * a19; + c3 *= a27; + c3_nxt *= a27; + + c4 -= c0 * a4; + c4_nxt -= c0_nxt * a4; + c4 -= c1 * a12; + c4_nxt -= c1_nxt * a12; + c4 -= c2 * a20; + c4_nxt -= c2_nxt * a20; + c4 -= c3 * a28; + c4_nxt -= c3_nxt * a28; + c4 *= a36; + c4_nxt *= a36; + + c5 -= c0 * a5; + c5_nxt -= c0_nxt * a5; + c5 -= c1 * a13; + c5_nxt -= c1_nxt * a13; + c5 -= c2 * a21; + 
c5_nxt -= c2_nxt * a21; + c5 -= c3 * a29; + c5_nxt -= c3_nxt * a29; + c5 -= c4 * a37; + c5_nxt -= c4_nxt * a37; + c5 *= a45; + c5_nxt *= a45; + + c6 -= c0 * a6; + c6_nxt -= c0_nxt * a6; + c6 -= c1 * a14; + c6_nxt -= c1_nxt * a14; + c6 -= c2 * a22; + c6_nxt -= c2_nxt * a22; + c6 -= c3 * a30; + c6_nxt -= c3_nxt * a30; + c6 -= c4 * a38; + c6_nxt -= c4_nxt * a38; + c6 -= c5 * a46; + c6_nxt -= c5_nxt * a46; + c6 *= a54; + c6_nxt *= a54; + + c7 -= c0 * a7; + c7_nxt -= c0_nxt * a7; + c7 -= c1 * a15; + c7_nxt -= c1_nxt * a15; + c7 -= c2 * a23; + c7_nxt -= c2_nxt * a23; + c7 -= c3 * a31; + c7_nxt -= c3_nxt * a31; + c7 -= c4 * a39; + c7_nxt -= c4_nxt * a39; + c7 -= c5 * a47; + c7_nxt -= c5_nxt * a47; + c7 -= c6 * a55; + c7_nxt -= c6_nxt * a55; + c7 *= a63; + c7_nxt *= a63; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 2) = c2; + *(c + 3) = c3; + *(c + 4) = c4; + *(c + 5) = c5; + *(c + 6) = c6; + *(c + 7) = c7; + *(c + 0 + ldc) = c0_nxt; + *(c + 1 + ldc) = c1_nxt; + *(c + 2 + ldc) = c2_nxt; + *(c + 3 + ldc) = c3_nxt; + *(c + 4 + ldc) = c4_nxt; + *(c + 5 + ldc) = c5_nxt; + *(c + 6 + ldc) = c6_nxt; + *(c + 7 + ldc) = c7_nxt; + + *(b + 0) = c0; + *(b + 1) = c0_nxt; + *(b + 2) = c1; + *(b + 3) = c1_nxt; + *(b + 4) = c2; + *(b + 5) = c2_nxt; + *(b + 6) = c3; + *(b + 7) = c3_nxt; + *(b + 8) = c4; + *(b + 9) = c4_nxt; + *(b + 10) = c5; + *(b + 11) = c5_nxt; + *(b + 12) = c6; + *(b + 13) = c6_nxt; + *(b + 14) = c7; + *(b + 15) = c7_nxt; +} + +static void ssolve_8x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + BLASLONG k; + FLOAT a0, a1, a2, a3, a4, a5, a6, a7, a9, a10, a11, a12, a13, a14, a15, a18; + FLOAT a19, a20, a21, a22, a23, a27, a28, a29, a30, a31, a36, a37, a38, a39; + FLOAT a45, a46, a47, a54, a55, a63, c0, c1, c2, c3, c4, c5, c6, c7; + + c0 = *(c + 0); + c1 = *(c + 1); + c2 = *(c + 2); + c3 = *(c + 3); + c4 = *(c + 4); + c5 = *(c + 5); + c6 = *(c + 6); + c7 = *(c + 7); + + for (k = 0; k < bk; k++) + { + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; + c2 -= a[2] * b[0]; + c3 -= a[3] * b[0]; + c4 -= a[4] * b[0]; + c5 -= a[5] * b[0]; + c6 -= a[6] * b[0]; + c7 -= a[7] * b[0]; + + a += 8; + b += 1; + } + + a0 = *(a + 0); + a1 = *(a + 1); + a2 = *(a + 2); + a3 = *(a + 3); + a4 = *(a + 4); + a5 = *(a + 5); + a6 = *(a + 6); + a7 = *(a + 7); + a9 = *(a + 9); + a10 = *(a + 10); + a11 = *(a + 11); + a12 = *(a + 12); + a13 = *(a + 13); + a14 = *(a + 14); + a15 = *(a + 15); + a18 = *(a + 18); + a19 = *(a + 19); + a20 = *(a + 20); + a21 = *(a + 21); + a22 = *(a + 22); + a23 = *(a + 23); + a27 = *(a + 27); + a28 = *(a + 28); + a29 = *(a + 29); + a30 = *(a + 30); + a31 = *(a + 31); + a36 = *(a + 36); + a37 = *(a + 37); + a38 = *(a + 38); + a39 = *(a + 39); + a45 = *(a + 45); + a46 = *(a + 46); + a47 = *(a + 47); + a54 = *(a + 54); + a55 = *(a + 55); + a63 = *(a + 63); + + c0 *= a0; + + c1 -= c0 * a1; + c1 *= a9; + + c2 -= c0 * a2; + c2 -= c1 * a10; + c2 *= a18; + + c3 -= c0 * a3; + c3 -= c1 * a11; + c3 -= c2 * a19; + c3 *= a27; + + c4 -= c0 * a4; + c4 -= c1 * a12; + c4 -= c2 * a20; + c4 -= c3 * a28; + c4 *= a36; + + c5 -= c0 * a5; + c5 -= c1 * a13; + c5 -= c2 * a21; + c5 -= c3 * a29; + c5 -= c4 * a37; + c5 *= a45; + + c6 -= c0 * a6; + c6 -= c1 * a14; + c6 -= c2 * a22; + c6 -= c3 * a30; + c6 -= c4 * a38; + c6 -= c5 * a46; + c6 *= a54; + + c7 -= c0 * a7; + c7 -= c1 * a15; + c7 -= c2 * a23; + c7 -= c3 * a31; + c7 -= c4 * a39; + c7 -= c5 * a47; + c7 -= c6 * a55; + c7 *= a63; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 2) = c2; + *(c + 3) = c3; + *(c + 4) = c4; + *(c + 5) = c5; + *(c + 6) = c6; + *(c + 7) = c7; + + 
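/* The solved values are stored twice on purpose: above into the C tile for the
 * caller, and just below into the packed panel b, presumably so the bk-loops
 * of the tiles still to come read already-solved values.  Scalar sketch of the
 * pattern (illustrative names):
 *
 *     for (int i = 0; i < 8; i++) {
 *         c[i] = x[i];    // caller-visible result
 *         b[i] = x[i];    // packed copy consumed by later update loops
 *     }
 */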
*(b + 0) = c0; + *(b + 1) = c1; + *(b + 2) = c2; + *(b + 3) = c3; + *(b + 4) = c4; + *(b + 5) = c5; + *(b + 6) = c6; + *(b + 7) = c7; +} + +static void ssolve_4x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + BLASLONG k; + v4f32 src_b, src_b0, src_b1, src_b2, src_b3; + v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; + v4f32 src_a0, src_a1, src_a2, src_a3, src_a5, src_a6, src_a7; + v4f32 src_a10, src_a11, src_a15, src_a; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * ldc; + FLOAT *c_nxt4line = c + 4 * ldc; + FLOAT *c_nxt5line = c + 5 * ldc; + FLOAT *c_nxt6line = c + 6 * ldc; + FLOAT *c_nxt7line = c + 7 * ldc; + + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + src_c2 = LD_SP(c_nxt2line); + src_c3 = LD_SP(c_nxt3line); + src_c4 = LD_SP(c_nxt4line); + src_c5 = LD_SP(c_nxt5line); + src_c6 = LD_SP(c_nxt6line); + src_c7 = LD_SP(c_nxt7line); + + for (k = 0; k < (bk >> 1); k++) + { + src_a0 = LD_SP(a); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a0 * src_b1; + src_c2 -= src_a0 * src_b2; + src_c3 -= src_a0 * src_b3; + + src_b = LD_SP(b + 4); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c4 -= src_a0 * src_b0; + src_c5 -= src_a0 * src_b1; + src_c6 -= src_a0 * src_b2; + src_c7 -= src_a0 * src_b3; + + a += 4; + b += 8; + + src_a0 = LD_SP(a); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a0 * src_b1; + src_c2 -= src_a0 * src_b2; + src_c3 -= src_a0 * src_b3; + + src_b = LD_SP(b + 4); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c4 -= src_a0 * src_b0; + src_c5 -= src_a0 * src_b1; + src_c6 -= src_a0 * src_b2; + src_c7 -= src_a0 * src_b3; + + a += 4; + b += 8; + } + + if ((bk & 1) && (bk > 0)) + { + src_a0 = LD_SP(a); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a0 * src_b1; + src_c2 -= src_a0 * src_b2; + src_c3 -= src_a0 * src_b3; + + src_b = LD_SP(b + 4); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c4 -= src_a0 * src_b0; + src_c5 -= src_a0 * src_b1; + src_c6 -= src_a0 * src_b2; + src_c7 -= src_a0 * src_b3; + + a += 4; + b += 8; + } + + TRANSPOSE4x4_SP_SP(src_c0, src_c1, src_c2, src_c3, + res_c0, res_c1, res_c2, res_c3); + TRANSPOSE4x4_SP_SP(src_c4, src_c5, src_c6, src_c7, + res_c4, res_c5, res_c6, res_c7); + + src_a = LD_SP(a + 0); + SPLATI_W4_SP(src_a, src_a0, src_a1, src_a2, src_a3); + src_a5 = LD_SP(a + 5); + src_a7 = (v4f32) __msa_splati_w((v4i32) src_a5, 2); + src_a6 = (v4f32) __msa_splati_w((v4i32) src_a5, 1); + src_a5 = (v4f32) __msa_splati_w((v4i32) src_a5, 0); + src_a10 = COPY_FLOAT_TO_VECTOR(*(a + 10)); + src_a11 = COPY_FLOAT_TO_VECTOR(*(a + 11)); + src_a15 = COPY_FLOAT_TO_VECTOR(*(a + 15)); + + res_c0 *= src_a0; + res_c4 *= src_a0; + res_c1 -= res_c0 * src_a1; + res_c5 -= res_c4 * src_a1; + res_c2 -= res_c0 * src_a2; + res_c6 -= res_c4 * src_a2; + res_c3 -= res_c0 * src_a3; + res_c7 -= res_c4 * src_a3; + + res_c1 *= src_a5; + res_c5 *= src_a5; + res_c2 -= res_c1 * src_a6; + res_c6 -= res_c5 * src_a6; + res_c3 -= res_c1 * src_a7; + res_c7 -= res_c5 * src_a7; + + res_c2 *= src_a10; + res_c6 *= src_a10; + res_c3 -= res_c2 * src_a11; + res_c7 -= res_c6 * src_a11; + + res_c3 *= src_a15; + res_c7 *= src_a15; + + ST_SP4(res_c0, res_c4, res_c1, 
res_c5, b, 4); + ST_SP4(res_c2, res_c6, res_c3, res_c7, b + 16, 4); + + TRANSPOSE4x4_SP_SP(res_c0, res_c1, res_c2, res_c3, + src_c0, src_c1, src_c2, src_c3); + TRANSPOSE4x4_SP_SP(res_c4, res_c5, res_c6, res_c7, + src_c4, src_c5, src_c6, src_c7); + + ST_SP(src_c0, c); + ST_SP(src_c1, c_nxt1line); + ST_SP(src_c2, c_nxt2line); + ST_SP(src_c3, c_nxt3line); + ST_SP(src_c4, c_nxt4line); + ST_SP(src_c5, c_nxt5line); + ST_SP(src_c6, c_nxt6line); + ST_SP(src_c7, c_nxt7line); +} + +static void ssolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + BLASLONG k; + v4f32 src_b, src_b0, src_b1, src_b2, src_b3; + v4f32 src_c0, src_c1, src_c2, src_c3, res_c0, res_c1, res_c2, res_c3; + v4f32 src_a0, src_a1, src_a2, src_a3, src_a5, src_a6, src_a7; + v4f32 src_a10, src_a11, src_a15, src_a; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * ldc; + + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + src_c2 = LD_SP(c_nxt2line); + src_c3 = LD_SP(c_nxt3line); + + for (k = 0; k < (bk >> 1); k++) + { + src_a0 = LD_SP(a); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a0 * src_b1; + src_c2 -= src_a0 * src_b2; + src_c3 -= src_a0 * src_b3; + + a += 4; + b += 4; + + src_a0 = LD_SP(a); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a0 * src_b1; + src_c2 -= src_a0 * src_b2; + src_c3 -= src_a0 * src_b3; + + a += 4; + b += 4; + } + + if ((bk & 1) && (bk > 0)) + { + src_a0 = LD_SP(a); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a0 * src_b1; + src_c2 -= src_a0 * src_b2; + src_c3 -= src_a0 * src_b3; + + a += 4; + b += 4; + } + + TRANSPOSE4x4_SP_SP(src_c0, src_c1, src_c2, src_c3, + res_c0, res_c1, res_c2, res_c3); + + src_a = LD_SP(a + 0); + SPLATI_W4_SP(src_a, src_a0, src_a1, src_a2, src_a3); + src_a5 = LD_SP(a + 5); + src_a7 = (v4f32) __msa_splati_w((v4i32) src_a5, 2); + src_a6 = (v4f32) __msa_splati_w((v4i32) src_a5, 1); + src_a5 = (v4f32) __msa_splati_w((v4i32) src_a5, 0); + src_a10 = COPY_FLOAT_TO_VECTOR(*(a + 10)); + src_a11 = COPY_FLOAT_TO_VECTOR(*(a + 11)); + src_a15 = COPY_FLOAT_TO_VECTOR(*(a + 15)); + + res_c0 *= src_a0; + res_c1 -= res_c0 * src_a1; + res_c2 -= res_c0 * src_a2; + res_c3 -= res_c0 * src_a3; + + res_c1 *= src_a5; + res_c2 -= res_c1 * src_a6; + res_c3 -= res_c1 * src_a7; + + res_c2 *= src_a10; + res_c3 -= res_c2 * src_a11; + + res_c3 *= src_a15; + + ST_SP4(res_c0, res_c1, res_c2, res_c3, b, 4); + + TRANSPOSE4x4_SP_SP(res_c0, res_c1, res_c2, res_c3, + src_c0, src_c1, src_c2, src_c3); + + ST_SP(src_c0, c); + ST_SP(src_c1, c_nxt1line); + ST_SP(src_c2, c_nxt2line); + ST_SP(src_c3, c_nxt3line); +} + +static void ssolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + BLASLONG k; + FLOAT c0, c1, c2, c3, c0_nxt, c1_nxt, c2_nxt, c3_nxt; + FLOAT a0, a1, a2, a3, a5, a6, a7, a10, a11, a15; + + c0 = *(c + 0); + c1 = *(c + 1); + c2 = *(c + 2); + c3 = *(c + 3); + c0_nxt = *(c + 0 + ldc); + c1_nxt = *(c + 1 + ldc); + c2_nxt = *(c + 2 + ldc); + c3_nxt = *(c + 3 + ldc); + + for (k = 0; k < bk; k++) + { + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; + c2 -= a[2] * b[0]; + c3 -= a[3] * b[0]; + c0_nxt -= a[0] * b[1]; + c1_nxt -= a[1] * b[1]; + c2_nxt -= a[2] * b[1]; + c3_nxt -= a[3] * b[1]; + + a += 4; + b += 2; + } + + a0 = *(a + 0); + a1 = *(a + 1); + a2 = *(a + 2); + a3 = *(a + 3); + a5 = *(a + 
5); + a6 = *(a + 6); + a7 = *(a + 7); + a10 = *(a + 10); + a11 = *(a + 11); + a15 = *(a + 15); + + c0 *= a0; + c0_nxt *= a0; + + c1 -= c0 * a1; + c1_nxt -= c0_nxt * a1; + + c1 *= a5; + c1_nxt *= a5; + + c2 -= c0 * a2; + c2_nxt -= c0_nxt * a2; + + c2 -= c1 * a6; + c2_nxt -= c1_nxt * a6; + + c2 *= a10; + c2_nxt *= a10; + + c3 -= c0 * a3; + c3_nxt -= c0_nxt * a3; + + c3 -= c1 * a7; + c3_nxt -= c1_nxt * a7; + + c3 -= c2 * a11; + c3_nxt -= c2_nxt * a11; + + c3 *= a15; + c3_nxt *= a15; + + *(b + 0) = c0; + *(b + 1) = c0_nxt; + *(b + 2) = c1; + *(b + 3) = c1_nxt; + *(b + 4) = c2; + *(b + 5) = c2_nxt; + *(b + 6) = c3; + *(b + 7) = c3_nxt; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 2) = c2; + *(c + 3) = c3; + *(c + 0 + ldc) = c0_nxt; + *(c + 1 + ldc) = c1_nxt; + *(c + 2 + ldc) = c2_nxt; + *(c + 3 + ldc) = c3_nxt; +} + +static void ssolve_4x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + BLASLONG k; + FLOAT a0, a1, a2, a3, a5, a6, a7, a10, a11, a15, c0, c1, c2, c3; + + c0 = *(c + 0); + c1 = *(c + 1); + c2 = *(c + 2); + c3 = *(c + 3); + + for (k = 0; k < bk; k++) + { + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; + c2 -= a[2] * b[0]; + c3 -= a[3] * b[0]; + + a += 4; + b += 1; + } + + a0 = *(a + 0); + a1 = *(a + 1); + a2 = *(a + 2); + a3 = *(a + 3); + a5 = *(a + 5); + a6 = *(a + 6); + a7 = *(a + 7); + a10 = *(a + 10); + a11 = *(a + 11); + a15 = *(a + 15); + + c0 *= a0; + + c1 -= c0 * a1; + c1 *= a5; + + c2 -= c0 * a2; + c2 -= c1 * a6; + c2 *= a10; + + c3 -= c0 * a3; + c3 -= c1 * a7; + c3 -= c2 * a11; + c3 *= a15; + + *(b + 0) = c0; + *(b + 1) = c1; + *(b + 2) = c2; + *(b + 3) = c3; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 2) = c2; + *(c + 3) = c3; +} + +static void ssolve_2x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + BLASLONG k; + FLOAT a0, a1, a3, c0, c1, c0_nxt1, c1_nxt1, c0_nxt2, c1_nxt2; + FLOAT c0_nxt3, c1_nxt3, c0_nxt4, c1_nxt4, c0_nxt5, c1_nxt5; + FLOAT c0_nxt6, c1_nxt6, c0_nxt7, c1_nxt7; + + c0 = *(c + 0); + c1 = *(c + 1); + c0_nxt1 = *(c + ldc); + c1_nxt1 = *(c + 1 + ldc); + c0_nxt2 = *(c + 2 * ldc); + c1_nxt2 = *(c + 1 + 2 * ldc); + c0_nxt3 = *(c + 3 * ldc); + c1_nxt3 = *(c + 1 + 3 * ldc); + c0_nxt4 = *(c + 4 * ldc); + c1_nxt4 = *(c + 1 + 4 * ldc); + c0_nxt5 = *(c + 5 * ldc); + c1_nxt5 = *(c + 1 + 5 * ldc); + c0_nxt6 = *(c + 6 * ldc); + c1_nxt6 = *(c + 1 + 6 * ldc); + c0_nxt7 = *(c + 7 * ldc); + c1_nxt7 = *(c + 1 + 7 * ldc); + + for (k = 0; k < bk; k++) + { + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; + c0_nxt1 -= a[0] * b[1]; + c1_nxt1 -= a[1] * b[1]; + c0_nxt2 -= a[0] * b[2]; + c1_nxt2 -= a[1] * b[2]; + c0_nxt3 -= a[0] * b[3]; + c1_nxt3 -= a[1] * b[3]; + c0_nxt4 -= a[0] * b[4]; + c1_nxt4 -= a[1] * b[4]; + c0_nxt5 -= a[0] * b[5]; + c1_nxt5 -= a[1] * b[5]; + c0_nxt6 -= a[0] * b[6]; + c1_nxt6 -= a[1] * b[6]; + c0_nxt7 -= a[0] * b[7]; + c1_nxt7 -= a[1] * b[7]; + + a += 2; + b += 8; + } + + a0 = *a; + a1 = *(a + 1); + a3 = *(a + 3); + + c0 = c0 * a0; + c1 = (c1 - c0 * a1) * a3; + + c0_nxt1 = c0_nxt1 * a0; + c1_nxt1 = (c1_nxt1 - c0_nxt1 * a1) * a3; + + c0_nxt2 = c0_nxt2 * a0; + c1_nxt2 = (c1_nxt2 - c0_nxt2 * a1) * a3; + + c0_nxt3 = c0_nxt3 * a0; + c1_nxt3 = (c1_nxt3 - c0_nxt3 * a1) * a3; + + c0_nxt4 = c0_nxt4 * a0; + c1_nxt4 = (c1_nxt4 - c0_nxt4 * a1) * a3; + + c0_nxt5 = c0_nxt5 * a0; + c1_nxt5 = (c1_nxt5 - c0_nxt5 * a1) * a3; + + c0_nxt6 = c0_nxt6 * a0; + c1_nxt6 = (c1_nxt6 - c0_nxt6 * a1) * a3; + + c0_nxt7 = c0_nxt7 * a0; + c1_nxt7 = (c1_nxt7 - c0_nxt7 * a1) * a3; + + *(b + 0) = c0; + *(b + 1) = c0_nxt1; + *(b + 2) = c0_nxt2; + *(b + 3) = c0_nxt3; + *(b + 4) = 
c0_nxt4; + *(b + 5) = c0_nxt5; + *(b + 6) = c0_nxt6; + *(b + 7) = c0_nxt7; + *(b + 8) = c1; + *(b + 9) = c1_nxt1; + *(b + 10) = c1_nxt2; + *(b + 11) = c1_nxt3; + *(b + 12) = c1_nxt4; + *(b + 13) = c1_nxt5; + *(b + 14) = c1_nxt6; + *(b + 15) = c1_nxt7; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 0 + ldc) = c0_nxt1; + *(c + 1 + ldc) = c1_nxt1; + *(c + 0 + 2 * ldc) = c0_nxt2; + *(c + 1 + 2 * ldc) = c1_nxt2; + *(c + 0 + 3 * ldc) = c0_nxt3; + *(c + 1 + 3 * ldc) = c1_nxt3; + *(c + 0 + 4 * ldc) = c0_nxt4; + *(c + 1 + 4 * ldc) = c1_nxt4; + *(c + 0 + 5 * ldc) = c0_nxt5; + *(c + 1 + 5 * ldc) = c1_nxt5; + *(c + 0 + 6 * ldc) = c0_nxt6; + *(c + 1 + 6 * ldc) = c1_nxt6; + *(c + 0 + 7 * ldc) = c0_nxt7; + *(c + 1 + 7 * ldc) = c1_nxt7; +} + +static void ssolve_2x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + BLASLONG k; + FLOAT a0, a1, a3, c0, c1, c0_nxt1, c1_nxt1; + FLOAT c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3; + + c0 = *(c + 0); + c1 = *(c + 1); + c0_nxt1 = *(c + ldc); + c1_nxt1 = *(c + 1 + ldc); + c0_nxt2 = *(c + 2 * ldc); + c1_nxt2 = *(c + 1 + 2 * ldc); + c0_nxt3 = *(c + 3 * ldc); + c1_nxt3 = *(c + 1 + 3 * ldc); + + for (k = 0; k < bk; k++) + { + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; + c0_nxt1 -= a[0] * b[1]; + c1_nxt1 -= a[1] * b[1]; + c0_nxt2 -= a[0] * b[2]; + c1_nxt2 -= a[1] * b[2]; + c0_nxt3 -= a[0] * b[3]; + c1_nxt3 -= a[1] * b[3]; + + a += 2; + b += 4; + } + + a0 = *a; + a1 = *(a + 1); + a3 = *(a + 3); + + c0 *= a0; + c0_nxt1 *= a0; + c0_nxt2 *= a0; + c0_nxt3 *= a0; + + c1 -= c0 * a1; + c1_nxt1 -= c0_nxt1 * a1; + c1_nxt2 -= c0_nxt2 * a1; + c1_nxt3 -= c0_nxt3 * a1; + c1 *= a3; + c1_nxt1 *= a3; + c1_nxt2 *= a3; + c1_nxt3 *= a3; + + *(b + 0) = c0; + *(b + 1) = c0_nxt1; + *(b + 2) = c0_nxt2; + *(b + 3) = c0_nxt3; + *(b + 4) = c1; + *(b + 5) = c1_nxt1; + *(b + 6) = c1_nxt2; + *(b + 7) = c1_nxt3; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 0 + ldc) = c0_nxt1; + *(c + 1 + ldc) = c1_nxt1; + *(c + 0 + 2 * ldc) = c0_nxt2; + *(c + 1 + 2 * ldc) = c1_nxt2; + *(c + 0 + 3 * ldc) = c0_nxt3; + *(c + 1 + 3 * ldc) = c1_nxt3; +} + +static void ssolve_2x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + BLASLONG k; + FLOAT a0, a1, a3, c0, c1, c0_nxt, c1_nxt; + + c0 = *(c + 0); + c1 = *(c + 1); + c0_nxt = *(c + ldc); + c1_nxt = *(c + 1 + ldc); + + for (k = 0; k < bk; k++) + { + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; + c0_nxt -= a[0] * b[1]; + c1_nxt -= a[1] * b[1]; + + a += 2; + b += 2; + } + + a0 = *a; + a1 = *(a + 1); + a3 = *(a + 3); + + c0 *= a0; + c0_nxt *= a0; + c1 -= c0 * a1; + c1_nxt -= c0_nxt * a1; + c1 *= a3; + c1_nxt *= a3; + + *(b + 0) = c0; + *(b + 1) = c0_nxt; + *(b + 2) = c1; + *(b + 3) = c1_nxt; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 0 + ldc) = c0_nxt; + *(c + 1 + ldc) = c1_nxt; +} + +static void ssolve_2x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + BLASLONG k; + FLOAT c0, c1; + + c0 = *(c + 0); + c1 = *(c + 1); + + for (k = 0; k < bk; k++) + { + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; + + a += 2; + b += 1; + } + + c0 *= *(a + 0); + + c1 -= c0 * *(a + 1); + c1 *= *(a + 3); + + *(b + 0) = c0; + *(b + 1) = c1; + + *(c + 0) = c0; + *(c + 1) = c1; +} + +static void ssolve_1x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + BLASLONG k; + FLOAT c0, c1, c2, c3, c4, c5, c6, c7; + + c0 = *(c + 0); + c1 = *(c + 1 * ldc); + c2 = *(c + 2 * ldc); + c3 = *(c + 3 * ldc); + c4 = *(c + 4 * ldc); + c5 = *(c + 5 * ldc); + c6 = *(c + 6 * ldc); + c7 = *(c + 7 * ldc); + + for (k = 0; k < bk; k++) + { + c0 -= a[0] * b[0]; + c1 -= a[0] * b[1]; + 
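/* For the single-row kernels the triangular block is 1x1, so once the update
 * loop has run, the whole solve collapses to one scale per column of C by the
 * stored (apparently reciprocal) diagonal.  Scalar sketch (illustrative names):
 *
 *     for (int j = 0; j < 8; j++)
 *         x[j] = rhs[j] * diag_inv;
 */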
c2 -= a[0] * b[2]; + c3 -= a[0] * b[3]; + c4 -= a[0] * b[4]; + c5 -= a[0] * b[5]; + c6 -= a[0] * b[6]; + c7 -= a[0] * b[7]; + + a += 1; + b += 8; + } + + c0 *= *a; + c1 *= *a; + c2 *= *a; + c3 *= *a; + c4 *= *a; + c5 *= *a; + c6 *= *a; + c7 *= *a; + + *(b + 0) = c0; + *(b + 1) = c1; + *(b + 2) = c2; + *(b + 3) = c3; + *(b + 4) = c4; + *(b + 5) = c5; + *(b + 6) = c6; + *(b + 7) = c7; + + *(c + 0) = c0; + *(c + 1 * ldc) = c1; + *(c + 2 * ldc) = c2; + *(c + 3 * ldc) = c3; + *(c + 4 * ldc) = c4; + *(c + 5 * ldc) = c5; + *(c + 6 * ldc) = c6; + *(c + 7 * ldc) = c7; +} + +static void ssolve_1x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + BLASLONG k; + FLOAT c0, c1, c2, c3; + + c0 = *(c + 0 * ldc); + c1 = *(c + 1 * ldc); + c2 = *(c + 2 * ldc); + c3 = *(c + 3 * ldc); + + for (k = 0; k < bk; k++) + { + c0 -= a[0] * b[0]; + c1 -= a[0] * b[1]; + c2 -= a[0] * b[2]; + c3 -= a[0] * b[3]; + + a += 1; + b += 4; + } + + c0 *= *a; + c1 *= *a; + c2 *= *a; + c3 *= *a; + + *c = c0; + *(c + ldc) = c1; + *(c + 2 * ldc) = c2; + *(c + 3 * ldc) = c3; + + *b = *c; + *(b + 1) = *(c + ldc); + *(b + 2) = *(c + 2 * ldc); + *(b + 3) = *(c + 3 * ldc); +} + +static void ssolve_1x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + BLASLONG k; + FLOAT c0, c1; + + c0 = *c; + c1 = *(c + ldc); + + for (k = 0; k < bk; k++) + { + c0 -= a[0] * b[0]; + c1 -= a[0] * b[1]; + + a += 1; + b += 2; + } + + *c = c0 * *a; + *(c + ldc) = c1 * *a; + + *b = *c; + *(b + 1) = *(c + ldc); +} + +static void ssolve_1x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + BLASLONG k; + + for (k = 0; k < bk; k++) + { + *c -= a[0] * b[0]; + + a++; + b++; + } + + *c *= *a; + *b = *c; +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, + FLOAT *c, BLASLONG ldc, BLASLONG offset) +{ + FLOAT *aa, *cc; + BLASLONG i, j, kk; + + for (j = (n >> 3); j--;) + { + kk = offset; + aa = a; + cc = c; + + for (i = (m >> 3); i--;) + { + ssolve_8x8_lt_msa(aa, b, cc, ldc, kk); + + aa += 8 * k; + cc += 8; + kk += 8; + } + + if (m & 7) + { + if (m & 4) + { + ssolve_4x8_lt_msa(aa, b, cc, ldc, kk); + + aa += 4 * k; + cc += 4; + kk += 4; + } + + if (m & 2) + { + ssolve_2x8_lt_msa(aa, b, cc, ldc, kk); + + aa += 2 * k; + cc += 2; + kk += 2; + } + + if (m & 1) + { + ssolve_1x8_lt_msa(aa, b, cc, ldc, kk); + + aa += k; + cc += 1; + kk += 1; + } + } + + b += 8 * k; + c += 8 * ldc; + } + + if (n & 7) + { + if (n & 4) + { + kk = offset; + aa = a; + cc = c; + + for (i = (m >> 3); i--;) + { + ssolve_8x4_lt_msa(aa, b, cc, ldc, kk); + + aa += 8 * k; + cc += 8; + kk += 8; + } + + if (m & 7) + { + if (m & 4) + { + ssolve_4x4_lt_msa(aa, b, cc, ldc, kk); + + aa += 4 * k; + cc += 4; + kk += 4; + } + + if (m & 2) + { + ssolve_2x4_lt_msa(aa, b, cc, ldc, kk); + + aa += 2 * k; + cc += 2; + kk += 2; + } + + if (m & 1) + { + ssolve_1x4_lt_msa(aa, b, cc, ldc, kk); + + aa += k; + cc += 1; + kk += 1; + } + } + + b += 4 * k; + c += 4 * ldc; + } + + if (n & 2) + { + kk = offset; + aa = a; + cc = c; + + for (i = (m >> 3); i--;) + { + ssolve_8x2_lt_msa(aa, b, cc, ldc, kk); + + aa += 8 * k; + cc += 8; + kk += 8; + } + + if (m & 7) + { + if (m & 4) + { + ssolve_4x2_lt_msa(aa, b, cc, ldc, kk); + + aa += 4 * k; + cc += 4; + kk += 4; + } + + if (m & 2) + { + ssolve_2x2_lt_msa(aa, b, cc, ldc, kk); + + aa += 2 * k; + cc += 2; + kk += 2; + } + + if (m & 1) + { + ssolve_1x2_lt_msa(aa, b, cc, ldc, kk); + + aa += k; + cc += 1; + kk += 1; + } + } + + b += 2 * k; + c += 2 * ldc; + } + + if (n & 1) + { + kk = offset; + aa = a; + cc = c; + + 
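/* last single column of B/C: walk the rows of C in 8/4/2/1 blocks; kk gives the depth of the rank-k update each block applies before its triangular solve */ +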
for (i = (m >> 3); i--;) + { + ssolve_8x1_lt_msa(aa, b, cc, kk); + + aa += 8 * k; + cc += 8; + kk += 8; + } + + if (m & 7) + { + if (m & 4) + { + ssolve_4x1_lt_msa(aa, b, cc, kk); + + aa += 4 * k; + cc += 4; + kk += 4; + } + + if (m & 2) + { + ssolve_2x1_lt_msa(aa, b, cc, kk); + + aa += 2 * k; + cc += 2; + kk += 2; + } + + if (m & 1) + { + ssolve_1x1_lt_msa(aa, b, cc, kk); + + aa += k; + cc += 1; + kk += 1; + } + } + + b += k; + c += ldc; + } + } + + return 0; +} diff --git a/kernel/mips/strsm_kernel_RN_8x8_msa.c b/kernel/mips/strsm_kernel_RN_8x8_msa.c new file mode 100644 index 0000000000..642ee37572 --- /dev/null +++ b/kernel/mips/strsm_kernel_RN_8x8_msa.c @@ -0,0 +1,1704 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +static void ssolve_8x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + BLASLONG k; + v4f32 src_a0, src_a1; + v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v4f32 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15; + v4f32 src_b0, src_b1, src_b2, src_b3, src_b4, src_b5, src_b6, src_b7; + v4f32 src_b9, src_b10, src_b11, src_b12, src_b13, src_b14, src_b15, src_b18; + v4f32 src_b19, src_b20, src_b21, src_b22, src_b23, src_b27, src_b28; + v4f32 src_b29, src_b30, src_b31, src_b36, src_b37, src_b38, src_b39; + v4f32 src_b45, src_b46, src_b47, src_b54, src_b55, src_b63, src_b; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * ldc; + FLOAT *c_nxt4line = c + 4 * ldc; + FLOAT *c_nxt5line = c + 5 * ldc; + FLOAT *c_nxt6line = c + 6 * ldc; + FLOAT *c_nxt7line = c + 7 * ldc; + + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(c_nxt2line, 4, src_c4, src_c5); + LD_SP2(c_nxt3line, 4, src_c6, src_c7); + LD_SP2(c_nxt4line, 4, src_c8, src_c9); + LD_SP2(c_nxt5line, 4, src_c10, src_c11); + LD_SP2(c_nxt6line, 4, src_c12, src_c13); + LD_SP2(c_nxt7line, 4, src_c14, src_c15); + + for (k = 0; k < bk; k++) + { + LD_SP2(a, 4, src_a0, src_a1); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; + src_c4 -= src_a0 * src_b2; + src_c5 -= src_a1 * src_b2; + src_c6 -= src_a0 * src_b3; + src_c7 -= src_a1 * src_b3; + + src_b = LD_SP(b + 4); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c8 -= src_a0 * src_b0; + src_c9 -= src_a1 * src_b0; + src_c10 -= src_a0 * src_b1; + src_c11 -= src_a1 * src_b1; + src_c12 -= src_a0 * src_b2; + src_c13 -= src_a1 * src_b2; + src_c14 -= src_a0 * src_b3; + src_c15 -= src_a1 * src_b3; + + a += 8; + b += 8; + } + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_b = LD_SP(b + 4); + SPLATI_W4_SP(src_b, src_b4, src_b5, src_b6, src_b7); + + src_b = LD_SP(b + 9); + SPLATI_W4_SP(src_b, src_b9, src_b10, src_b11, src_b12); + src_b13 = LD_SP(b + 13); + src_b15 = (v4f32) __msa_splati_w((v4i32) src_b13, 2); + src_b14 = (v4f32) __msa_splati_w((v4i32) src_b13, 1); + src_b13 = (v4f32) __msa_splati_w((v4i32) src_b13, 0); + + src_c0 *= src_b0; + src_c1 *= src_b0; + src_c2 -= src_c0 * src_b1; + src_c3 -= src_c1 * src_b1; + src_c4 -= src_c0 * src_b2; + src_c5 -= src_c1 * src_b2; + src_c6 -= src_c0 * src_b3; + src_c7 -= src_c1 * src_b3; + src_c8 -= src_c0 * src_b4; + src_c9 -= src_c1 * src_b4; + src_c10 -= src_c0 * src_b5; + src_c11 -= src_c1 * src_b5; + src_c12 -= src_c0 * src_b6; + src_c13 -= src_c1 * src_b6; + src_c14 -= src_c0 * src_b7; + src_c15 -= src_c1 * src_b7; + + ST_SP2(src_c0, src_c1, a, 4); + ST_SP2(src_c0, src_c1, c, 4); + + src_c2 *= src_b9; + src_c3 *= src_b9; + src_c4 -= src_c2 * src_b10; + src_c5 -= src_c3 * src_b10; + src_c6 -= src_c2 * src_b11; + src_c7 -= src_c3 * src_b11; + src_c8 -= src_c2 * src_b12; + src_c9 -= src_c3 * src_b12; + src_c10 -= src_c2 * src_b13; + src_c11 -= src_c3 * src_b13; + src_c12 -= src_c2 * src_b14; + src_c13 -= src_c3 * src_b14; + src_c14 -= src_c2 * src_b15; + src_c15 -= src_c3 * src_b15; + + ST_SP2(src_c2, src_c3, a + 8, 4); + ST_SP2(src_c2, src_c3, c_nxt1line, 4); + + src_b = LD_SP(b + 18); + 
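/* next two rows of the packed triangular block of B: the diagonal entries at b[18] and b[27] are assumed pre-inverted by the packing step, hence the multiplies below */ +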
SPLATI_W4_SP(src_b, src_b18, src_b19, src_b20, src_b21); + src_b22 = LD_SP(b + 22); + src_b23 = (v4f32) __msa_splati_w((v4i32) src_b22, 1); + src_b22 = (v4f32) __msa_splati_w((v4i32) src_b22, 0); + + src_b = LD_SP(b + 27); + SPLATI_W4_SP(src_b, src_b27, src_b28, src_b29, src_b30); + src_b31 = COPY_FLOAT_TO_VECTOR(*(b + 31)); + + src_c4 *= src_b18; + src_c5 *= src_b18; + src_c6 -= src_c4 * src_b19; + src_c7 -= src_c5 * src_b19; + src_c8 -= src_c4 * src_b20; + src_c9 -= src_c5 * src_b20; + src_c10 -= src_c4 * src_b21; + src_c11 -= src_c5 * src_b21; + src_c12 -= src_c4 * src_b22; + src_c13 -= src_c5 * src_b22; + src_c14 -= src_c4 * src_b23; + src_c15 -= src_c5 * src_b23; + + ST_SP2(src_c4, src_c5, a + 16, 4); + ST_SP2(src_c4, src_c5, c_nxt2line, 4); + + src_c6 *= src_b27; + src_c7 *= src_b27; + src_c8 -= src_c6 * src_b28; + src_c9 -= src_c7 * src_b28; + src_c10 -= src_c6 * src_b29; + src_c11 -= src_c7 * src_b29; + src_c12 -= src_c6 * src_b30; + src_c13 -= src_c7 * src_b30; + src_c14 -= src_c6 * src_b31; + src_c15 -= src_c7 * src_b31; + + ST_SP2(src_c6, src_c7, a + 24, 4); + ST_SP2(src_c6, src_c7, c_nxt3line, 4); + + src_b = LD_SP(b + 36); + SPLATI_W4_SP(src_b, src_b36, src_b37, src_b38, src_b39); + + src_b45 = LD_SP(b + 45); + src_b47 = (v4f32) __msa_splati_w((v4i32) src_b45, 2); + src_b46 = (v4f32) __msa_splati_w((v4i32) src_b45, 1); + src_b45 = (v4f32) __msa_splati_w((v4i32) src_b45, 0); + + src_b54 = COPY_FLOAT_TO_VECTOR(*(b + 54)); + src_b55 = COPY_FLOAT_TO_VECTOR(*(b + 55)); + src_b63 = COPY_FLOAT_TO_VECTOR(*(b + 63)); + + src_c8 *= src_b36; + src_c9 *= src_b36; + src_c10 -= src_c8 * src_b37; + src_c11 -= src_c9 * src_b37; + src_c12 -= src_c8 * src_b38; + src_c13 -= src_c9 * src_b38; + src_c14 -= src_c8 * src_b39; + src_c15 -= src_c9 * src_b39; + + ST_SP2(src_c8, src_c9, a + 32, 4); + ST_SP2(src_c8, src_c9, c_nxt4line, 4); + + src_c10 *= src_b45; + src_c11 *= src_b45; + src_c12 -= src_c10 * src_b46; + src_c13 -= src_c11 * src_b46; + src_c14 -= src_c10 * src_b47; + src_c15 -= src_c11 * src_b47; + + ST_SP2(src_c10, src_c11, a + 40, 4); + ST_SP2(src_c10, src_c11, c_nxt5line, 4); + + src_c12 *= src_b54; + src_c13 *= src_b54; + src_c14 -= src_c12 * src_b55; + src_c15 -= src_c13 * src_b55; + + ST_SP2(src_c12, src_c13, a + 48, 4); + ST_SP2(src_c12, src_c13, c_nxt6line, 4); + + src_c14 *= src_b63; + src_c15 *= src_b63; + + ST_SP2(src_c14, src_c15, a + 56, 4); + ST_SP2(src_c14, src_c15, c_nxt7line, 4); +} + +static void ssolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + BLASLONG k; + v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v4f32 src_b0, src_b1, src_b2, src_b3, src_b5, src_b6, src_b7; + v4f32 src_b10, src_b11, src_b15, src_b, src_a0, src_a1; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * ldc; + + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(c_nxt2line, 4, src_c4, src_c5); + LD_SP2(c_nxt3line, 4, src_c6, src_c7); + + for (k = 0; k < (bk >> 1); k++) + { + LD_SP2(a, 4, src_a0, src_a1); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; + src_c4 -= src_a0 * src_b2; + src_c5 -= src_a1 * src_b2; + src_c6 -= src_a0 * src_b3; + src_c7 -= src_a1 * src_b3; + + a += 8; + b += 4; + + LD_SP2(a, 4, src_a0, src_a1); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + 
src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; + src_c4 -= src_a0 * src_b2; + src_c5 -= src_a1 * src_b2; + src_c6 -= src_a0 * src_b3; + src_c7 -= src_a1 * src_b3; + + a += 8; + b += 4; + } + + if ((bk & 1) && (bk > 0)) + { + LD_SP2(a, 4, src_a0, src_a1); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; + src_c4 -= src_a0 * src_b2; + src_c5 -= src_a1 * src_b2; + src_c6 -= src_a0 * src_b3; + src_c7 -= src_a1 * src_b3; + + a += 8; + b += 4; + } + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_b5 = LD_SP(b + 5); + src_b7 = (v4f32) __msa_splati_w((v4i32) src_b5, 2); + src_b6 = (v4f32) __msa_splati_w((v4i32) src_b5, 1); + src_b5 = (v4f32) __msa_splati_w((v4i32) src_b5, 0); + src_b10 = COPY_FLOAT_TO_VECTOR(*(b + 10)); + src_b11 = COPY_FLOAT_TO_VECTOR(*(b + 11)); + src_b15 = COPY_FLOAT_TO_VECTOR(*(b + 15)); + + src_c0 *= src_b0; + src_c1 *= src_b0; + src_c2 -= src_c0 * src_b1; + src_c3 -= src_c1 * src_b1; + src_c4 -= src_c0 * src_b2; + src_c5 -= src_c1 * src_b2; + src_c6 -= src_c0 * src_b3; + src_c7 -= src_c1 * src_b3; + + src_c2 *= src_b5; + src_c3 *= src_b5; + src_c4 -= src_c2 * src_b6; + src_c5 -= src_c3 * src_b6; + src_c6 -= src_c2 * src_b7; + src_c7 -= src_c3 * src_b7; + + src_c4 *= src_b10; + src_c5 *= src_b10; + src_c6 -= src_c4 * src_b11; + src_c7 -= src_c5 * src_b11; + + src_c6 *= src_b15; + src_c7 *= src_b15; + + ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4); + ST_SP4(src_c4, src_c5, src_c6, src_c7, a + 16, 4); + + ST_SP2(src_c0, src_c1, c, 4); + ST_SP2(src_c2, src_c3, c_nxt1line, 4); + ST_SP2(src_c4, src_c5, c_nxt2line, 4); + ST_SP2(src_c6, src_c7, c_nxt3line, 4); +} + +static void ssolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + BLASLONG k; + v4f32 src_a0, src_a1; + v4f32 src_c0, src_c1, src_c2, src_c3, src_b0, src_b1, src_b3; + FLOAT *c_nxt1line = c + ldc; + + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + + for (k = 0; k < (bk >> 1); k++) + { + LD_SP2(a, 4, src_a0, src_a1); + + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); + src_b1 = COPY_FLOAT_TO_VECTOR(*(b + 1)); + + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; + + a += 8; + b += 2; + + LD_SP2(a, 4, src_a0, src_a1); + + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); + src_b1 = COPY_FLOAT_TO_VECTOR(*(b + 1)); + + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; + + a += 8; + b += 2; + } + + if ((bk & 1) && (bk > 0)) + { + LD_SP2(a, 4, src_a0, src_a1); + + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); + src_b1 = COPY_FLOAT_TO_VECTOR(*(b + 1)); + + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; + + a += 8; + b += 2; + } + + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); + src_b1 = COPY_FLOAT_TO_VECTOR(*(b + 1)); + src_b3 = COPY_FLOAT_TO_VECTOR(*(b + 3)); + + src_c0 *= src_b0; + src_c1 *= src_b0; + src_c2 -= src_c0 * src_b1; + src_c3 -= src_c1 * src_b1; + src_c2 *= src_b3; + src_c3 *= src_b3; + + ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4); + ST_SP2(src_c0, src_c1, c, 4); + ST_SP2(src_c2, src_c3, c_nxt1line, 4); +} + +static void ssolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + BLASLONG k; + v4f32 src_a0, src_a1, src_c0, src_c1, src_b0; + + LD_SP2(c, 4, 
src_c0, src_c1); + + for (k = 0; k < (bk >> 2); k++) + { + LD_SP2(a, 4, src_a0, src_a1); + + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); + + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + + a += 8; + b += 1; + + LD_SP2(a, 4, src_a0, src_a1); + + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); + + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + + a += 8; + b += 1; + + LD_SP2(a, 4, src_a0, src_a1); + + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); + + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + + a += 8; + b += 1; + + LD_SP2(a, 4, src_a0, src_a1); + + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); + + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + + a += 8; + b += 1; + } + + if ((bk & 3) && (bk > 0)) + { + if (bk & 2) + { + LD_SP2(a, 4, src_a0, src_a1); + + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); + + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + + a += 8; + b += 1; + + LD_SP2(a, 4, src_a0, src_a1); + + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); + + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + + a += 8; + b += 1; + } + + if (bk & 1) + { + LD_SP2(a, 4, src_a0, src_a1); + + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); + + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + + a += 8; + b += 1; + } + } + + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); + + src_c0 *= src_b0; + src_c1 *= src_b0; + + ST_SP2(src_c0, src_c1, a, 4); + ST_SP2(src_c0, src_c1, c, 4); +} + +static void ssolve_4x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + BLASLONG k; + v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v4f32 src_b0, src_b1, src_b2, src_b3, src_b4, src_b5, src_b6, src_b7; + v4f32 src_b9, src_b10, src_b11, src_b12, src_b13, src_b14, src_b15, src_b18; + v4f32 src_b19, src_b20, src_b21, src_b22, src_b23, src_b27, src_b28; + v4f32 src_b29, src_b30, src_b31, src_b36, src_b37, src_b38, src_b39; + v4f32 src_b45, src_b46, src_b47, src_b54, src_b55, src_b63, src_b, src_a0; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * ldc; + FLOAT *c_nxt4line = c + 4 * ldc; + FLOAT *c_nxt5line = c + 5 * ldc; + FLOAT *c_nxt6line = c + 6 * ldc; + FLOAT *c_nxt7line = c + 7 * ldc; + + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + src_c2 = LD_SP(c_nxt2line); + src_c3 = LD_SP(c_nxt3line); + src_c4 = LD_SP(c_nxt4line); + src_c5 = LD_SP(c_nxt5line); + src_c6 = LD_SP(c_nxt6line); + src_c7 = LD_SP(c_nxt7line); + + for (k = 0; k < bk; k++) + { + src_a0 = LD_SP(a); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a0 * src_b1; + src_c2 -= src_a0 * src_b2; + src_c3 -= src_a0 * src_b3; + + src_b = LD_SP(b + 4); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c4 -= src_a0 * src_b0; + src_c5 -= src_a0 * src_b1; + src_c6 -= src_a0 * src_b2; + src_c7 -= src_a0 * src_b3; + + a += 4; + b += 8; + } + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_b = LD_SP(b + 4); + SPLATI_W4_SP(src_b, src_b4, src_b5, src_b6, src_b7); + + src_b = LD_SP(b + 9); + SPLATI_W4_SP(src_b, src_b9, src_b10, src_b11, src_b12); + src_b13 = LD_SP(b + 13); + src_b15 = (v4f32) __msa_splati_w((v4i32) src_b13, 2); + src_b14 = (v4f32) __msa_splati_w((v4i32) src_b13, 1); + src_b13 = (v4f32) __msa_splati_w((v4i32) src_b13, 0); + + src_b = LD_SP(b + 18); + SPLATI_W4_SP(src_b, src_b18, src_b19, src_b20, src_b21); + src_b22 = LD_SP(b + 22); + src_b23 = (v4f32) __msa_splati_w((v4i32) src_b22, 1); + src_b22 = (v4f32) 
__msa_splati_w((v4i32) src_b22, 0); + + src_b = LD_SP(b + 27); + SPLATI_W4_SP(src_b, src_b27, src_b28, src_b29, src_b30); + src_b31 = COPY_FLOAT_TO_VECTOR(*(b + 31)); + + src_b = LD_SP(b + 36); + SPLATI_W4_SP(src_b, src_b36, src_b37, src_b38, src_b39); + + src_b45 = LD_SP(b + 45); + src_b47 = (v4f32) __msa_splati_w((v4i32) src_b45, 2); + src_b46 = (v4f32) __msa_splati_w((v4i32) src_b45, 1); + src_b45 = (v4f32) __msa_splati_w((v4i32) src_b45, 0); + + src_b54 = COPY_FLOAT_TO_VECTOR(*(b + 54)); + src_b55 = COPY_FLOAT_TO_VECTOR(*(b + 55)); + src_b63 = COPY_FLOAT_TO_VECTOR(*(b + 63)); + + src_c0 *= src_b0; + src_c1 -= src_c0 * src_b1; + src_c2 -= src_c0 * src_b2; + src_c3 -= src_c0 * src_b3; + src_c4 -= src_c0 * src_b4; + src_c5 -= src_c0 * src_b5; + src_c6 -= src_c0 * src_b6; + src_c7 -= src_c0 * src_b7; + + src_c1 *= src_b9; + src_c2 -= src_c1 * src_b10; + src_c3 -= src_c1 * src_b11; + src_c4 -= src_c1 * src_b12; + src_c5 -= src_c1 * src_b13; + src_c6 -= src_c1 * src_b14; + src_c7 -= src_c1 * src_b15; + + src_c2 *= src_b18; + src_c3 -= src_c2 * src_b19; + src_c4 -= src_c2 * src_b20; + src_c5 -= src_c2 * src_b21; + src_c6 -= src_c2 * src_b22; + src_c7 -= src_c2 * src_b23; + + src_c3 *= src_b27; + src_c4 -= src_c3 * src_b28; + src_c5 -= src_c3 * src_b29; + src_c6 -= src_c3 * src_b30; + src_c7 -= src_c3 * src_b31; + + src_c4 *= src_b36; + src_c5 -= src_c4 * src_b37; + src_c6 -= src_c4 * src_b38; + src_c7 -= src_c4 * src_b39; + + src_c5 *= src_b45; + src_c6 -= src_c5 * src_b46; + src_c7 -= src_c5 * src_b47; + + src_c6 *= src_b54; + src_c7 -= src_c6 * src_b55; + + src_c7 *= src_b63; + + ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4); + ST_SP4(src_c4, src_c5, src_c6, src_c7, a + 16, 4); + + ST_SP(src_c0, c); + ST_SP(src_c1, c_nxt1line); + ST_SP(src_c2, c_nxt2line); + ST_SP(src_c3, c_nxt3line); + ST_SP(src_c4, c_nxt4line); + ST_SP(src_c5, c_nxt5line); + ST_SP(src_c6, c_nxt6line); + ST_SP(src_c7, c_nxt7line); +} + +static void ssolve_4x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + BLASLONG k; + v4f32 src_c0, src_c1, src_c2, src_c3, src_b0, src_b1, src_b2, src_b3; + v4f32 src_b5, src_b6, src_b7, src_b10, src_b11, src_b15, src_b, src_a0; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * ldc; + + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + src_c2 = LD_SP(c_nxt2line); + src_c3 = LD_SP(c_nxt3line); + + for (k = 0; k < (bk >> 1); k++) + { + src_a0 = LD_SP(a); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a0 * src_b1; + src_c2 -= src_a0 * src_b2; + src_c3 -= src_a0 * src_b3; + + a += 4; + b += 4; + + src_a0 = LD_SP(a); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a0 * src_b1; + src_c2 -= src_a0 * src_b2; + src_c3 -= src_a0 * src_b3; + + a += 4; + b += 4; + } + + if ((bk & 1) && (bk > 0)) + { + src_a0 = LD_SP(a); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a0 * src_b1; + src_c2 -= src_a0 * src_b2; + src_c3 -= src_a0 * src_b3; + + a += 4; + b += 4; + } + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_b5 = LD_SP(b + 5); + src_b7 = (v4f32) __msa_splati_w((v4i32) src_b5, 2); + src_b6 = (v4f32) __msa_splati_w((v4i32) src_b5, 1); + src_b5 = (v4f32) __msa_splati_w((v4i32) src_b5, 0); + src_b10 = COPY_FLOAT_TO_VECTOR(*(b + 10)); + src_b11 = COPY_FLOAT_TO_VECTOR(*(b 
+ 11)); + src_b15 = COPY_FLOAT_TO_VECTOR(*(b + 15)); + + src_c0 *= src_b0; + src_c1 -= src_c0 * src_b1; + src_c2 -= src_c0 * src_b2; + src_c3 -= src_c0 * src_b3; + + src_c1 *= src_b5; + src_c2 -= src_c1 * src_b6; + src_c3 -= src_c1 * src_b7; + + src_c2 *= src_b10; + src_c3 -= src_c2 * src_b11; + + src_c3 *= src_b15; + + ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4); + + ST_SP(src_c0, c); + ST_SP(src_c1, c_nxt1line); + ST_SP(src_c2, c_nxt2line); + ST_SP(src_c3, c_nxt3line); +} + +static void ssolve_4x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + BLASLONG k; + v4f32 src_a, src_c0, src_c1, src_b0, src_b1, src_b3; + FLOAT *c_nxt1line = c + ldc; + + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + + for (k = 0; k < (bk >> 2); k++) + { + src_a = LD_SP(a); + src_b0 = LD_SP(b); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + src_c0 -= src_a * src_b0; + src_c1 -= src_a * src_b1; + + a += 4; + b += 2; + + src_a = LD_SP(a); + src_b0 = LD_SP(b); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + src_c0 -= src_a * src_b0; + src_c1 -= src_a * src_b1; + + a += 4; + b += 2; + + src_a = LD_SP(a); + src_b0 = LD_SP(b); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + src_c0 -= src_a * src_b0; + src_c1 -= src_a * src_b1; + + a += 4; + b += 2; + + src_a = LD_SP(a); + src_b0 = LD_SP(b); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + src_c0 -= src_a * src_b0; + src_c1 -= src_a * src_b1; + + a += 4; + b += 2; + } + + if ((bk & 3) && (bk > 0)) + { + if (bk & 2) + { + src_a = LD_SP(a); + src_b0 = LD_SP(b); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + src_c0 -= src_a * src_b0; + src_c1 -= src_a * src_b1; + + a += 4; + b += 2; + + src_a = LD_SP(a); + src_b0 = LD_SP(b); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + src_c0 -= src_a * src_b0; + src_c1 -= src_a * src_b1; + + a += 4; + b += 2; + } + + if (bk & 1) + { + src_a = LD_SP(a); + src_b0 = LD_SP(b); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + src_c0 -= src_a * src_b0; + src_c1 -= src_a * src_b1; + + a += 4; + b += 2; + } + } + + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); + src_b1 = COPY_FLOAT_TO_VECTOR(*(b + 1)); + src_b3 = COPY_FLOAT_TO_VECTOR(*(b + 3)); + + src_c0 *= src_b0; + src_c1 -= src_c0 * src_b1; + src_c1 *= src_b3; + + ST_SP2(src_c0, src_c1, a, 4); + + ST_SP(src_c0, c); + ST_SP(src_c1, c_nxt1line); +} + +static void ssolve_4x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + BLASLONG k; + FLOAT b0, c0, c1, c2, c3; + + c0 = *(c + 0); + c1 = *(c + 1); + c2 = *(c + 2); + c3 = *(c + 3); + + for (k = 0; k < bk; k++) + { + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; + c2 -= a[2] * b[0]; + c3 -= a[3] * b[0]; + + a += 4; + b += 1; + } + + b0 = *(b + 0); + + c0 *= b0; + c1 *= b0; + c2 *= b0; + c3 *= b0; + + *(a + 0) = c0; + *(a + 1) = c1; + *(a + 2) = c2; + *(a + 3) = c3; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 2) = c2; + *(c + 3) = c3; +} + +static void ssolve_2x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + BLASLONG k; + FLOAT b0, b1, b2, b3, b4, b5, b6, b7, b9, b10, b11, b12, b13, b14, b15; + FLOAT b18, b19, b20, b21, b22, b23, b27, b28, b29, b30, 
b31; + FLOAT b36, b37, b38, b39, b45, b46, b47, b54, b55, b63; + FLOAT c0, c1, c0_nxt1, c1_nxt1, c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3; + FLOAT c0_nxt4, c1_nxt4, c0_nxt5, c1_nxt5, c0_nxt6, c1_nxt6; + FLOAT c0_nxt7, c1_nxt7; + + c0 = *(c + 0); + c1 = *(c + 1); + c0_nxt1 = *(c + 0 + 1 * ldc); + c1_nxt1 = *(c + 1 + 1 * ldc); + c0_nxt2 = *(c + 0 + 2 * ldc); + c1_nxt2 = *(c + 1 + 2 * ldc); + c0_nxt3 = *(c + 0 + 3 * ldc); + c1_nxt3 = *(c + 1 + 3 * ldc); + c0_nxt4 = *(c + 0 + 4 * ldc); + c1_nxt4 = *(c + 1 + 4 * ldc); + c0_nxt5 = *(c + 0 + 5 * ldc); + c1_nxt5 = *(c + 1 + 5 * ldc); + c0_nxt6 = *(c + 0 + 6 * ldc); + c1_nxt6 = *(c + 1 + 6 * ldc); + c0_nxt7 = *(c + 0 + 7 * ldc); + c1_nxt7 = *(c + 1 + 7 * ldc); + + for (k = 0; k < bk; k++) + { + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; + c0_nxt1 -= a[0] * b[1]; + c1_nxt1 -= a[1] * b[1]; + c0_nxt2 -= a[0] * b[2]; + c1_nxt2 -= a[1] * b[2]; + c0_nxt3 -= a[0] * b[3]; + c1_nxt3 -= a[1] * b[3]; + c0_nxt4 -= a[0] * b[4]; + c1_nxt4 -= a[1] * b[4]; + c0_nxt5 -= a[0] * b[5]; + c1_nxt5 -= a[1] * b[5]; + c0_nxt6 -= a[0] * b[6]; + c1_nxt6 -= a[1] * b[6]; + c0_nxt7 -= a[0] * b[7]; + c1_nxt7 -= a[1] * b[7]; + + a += 2; + b += 8; + } + + b0 = *(b + 0); + b1 = *(b + 1); + b2 = *(b + 2); + b3 = *(b + 3); + b4 = *(b + 4); + b5 = *(b + 5); + b6 = *(b + 6); + b7 = *(b + 7); + b9 = *(b + 9); + b10 = *(b + 10); + b11 = *(b + 11); + b12 = *(b + 12); + b13 = *(b + 13); + b14 = *(b + 14); + b15 = *(b + 15); + b18 = *(b + 18); + b19 = *(b + 19); + b20 = *(b + 20); + b21 = *(b + 21); + b22 = *(b + 22); + b23 = *(b + 23); + b27 = *(b + 27); + b28 = *(b + 28); + b29 = *(b + 29); + b30 = *(b + 30); + b31 = *(b + 31); + b36 = *(b + 36); + b37 = *(b + 37); + b38 = *(b + 38); + b39 = *(b + 39); + b45 = *(b + 45); + b46 = *(b + 46); + b47 = *(b + 47); + b54 = *(b + 54); + b55 = *(b + 55); + b63 = *(b + 63); + + c0 *= b0; + c1 *= b0; + + c0_nxt1 -= c0 * b1; + c1_nxt1 -= c1 * b1; + + c0_nxt2 -= c0 * b2; + c1_nxt2 -= c1 * b2; + + c0_nxt3 -= c0 * b3; + c1_nxt3 -= c1 * b3; + + c0_nxt4 -= c0 * b4; + c1_nxt4 -= c1 * b4; + + c0_nxt5 -= c0 * b5; + c1_nxt5 -= c1 * b5; + + c0_nxt6 -= c0 * b6; + c1_nxt6 -= c1 * b6; + + c0_nxt7 -= c0 * b7; + c1_nxt7 -= c1 * b7; + + c0_nxt1 *= b9; + c1_nxt1 *= b9; + + c0_nxt2 -= c0_nxt1 * b10; + c1_nxt2 -= c1_nxt1 * b10; + + c0_nxt3 -= c0_nxt1 * b11; + c1_nxt3 -= c1_nxt1 * b11; + + c0_nxt4 -= c0_nxt1 * b12; + c1_nxt4 -= c1_nxt1 * b12; + + c0_nxt5 -= c0_nxt1 * b13; + c1_nxt5 -= c1_nxt1 * b13; + + c0_nxt6 -= c0_nxt1 * b14; + c1_nxt6 -= c1_nxt1 * b14; + + c0_nxt7 -= c0_nxt1 * b15; + c1_nxt7 -= c1_nxt1 * b15; + + c0_nxt2 *= b18; + c1_nxt2 *= b18; + + c0_nxt3 -= c0_nxt2 * b19; + c1_nxt3 -= c1_nxt2 * b19; + + c0_nxt4 -= c0_nxt2 * b20; + c1_nxt4 -= c1_nxt2 * b20; + + c0_nxt5 -= c0_nxt2 * b21; + c1_nxt5 -= c1_nxt2 * b21; + + c0_nxt6 -= c0_nxt2 * b22; + c1_nxt6 -= c1_nxt2 * b22; + + c0_nxt7 -= c0_nxt2 * b23; + c1_nxt7 -= c1_nxt2 * b23; + + c0_nxt3 *= b27; + c1_nxt3 *= b27; + + c0_nxt4 -= c0_nxt3 * b28; + c1_nxt4 -= c1_nxt3 * b28; + + c0_nxt5 -= c0_nxt3 * b29; + c1_nxt5 -= c1_nxt3 * b29; + + c0_nxt6 -= c0_nxt3 * b30; + c1_nxt6 -= c1_nxt3 * b30; + + c0_nxt7 -= c0_nxt3 * b31; + c1_nxt7 -= c1_nxt3 * b31; + + c0_nxt4 *= b36; + c1_nxt4 *= b36; + + c0_nxt5 -= c0_nxt4 * b37; + c1_nxt5 -= c1_nxt4 * b37; + + c0_nxt6 -= c0_nxt4 * b38; + c1_nxt6 -= c1_nxt4 * b38; + + c0_nxt7 -= c0_nxt4 * b39; + c1_nxt7 -= c1_nxt4 * b39; + + c0_nxt5 *= b45; + c1_nxt5 *= b45; + + c0_nxt6 -= c0_nxt5 * b46; + c1_nxt6 -= c1_nxt5 * b46; + + c0_nxt7 -= c0_nxt5 * b47; + c1_nxt7 -= c1_nxt5 * b47; + + c0_nxt6 *= b54; 
+ c1_nxt6 *= b54; + + c0_nxt7 -= c0_nxt6 * b55; + c1_nxt7 -= c1_nxt6 * b55; + + c0_nxt7 *= b63; + c1_nxt7 *= b63; + + *(a + 0) = c0; + *(a + 1) = c1; + *(a + 2) = c0_nxt1; + *(a + 3) = c1_nxt1; + *(a + 4) = c0_nxt2; + *(a + 5) = c1_nxt2; + *(a + 6) = c0_nxt3; + *(a + 7) = c1_nxt3; + *(a + 8) = c0_nxt4; + *(a + 9) = c1_nxt4; + *(a + 10) = c0_nxt5; + *(a + 11) = c1_nxt5; + *(a + 12) = c0_nxt6; + *(a + 13) = c1_nxt6; + *(a + 14) = c0_nxt7; + *(a + 15) = c1_nxt7; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 0 + 1 * ldc) = c0_nxt1; + *(c + 1 + 1 * ldc) = c1_nxt1; + *(c + 0 + 2 * ldc) = c0_nxt2; + *(c + 1 + 2 * ldc) = c1_nxt2; + *(c + 0 + 3 * ldc) = c0_nxt3; + *(c + 1 + 3 * ldc) = c1_nxt3; + *(c + 0 + 4 * ldc) = c0_nxt4; + *(c + 1 + 4 * ldc) = c1_nxt4; + *(c + 0 + 5 * ldc) = c0_nxt5; + *(c + 1 + 5 * ldc) = c1_nxt5; + *(c + 0 + 6 * ldc) = c0_nxt6; + *(c + 1 + 6 * ldc) = c1_nxt6; + *(c + 0 + 7 * ldc) = c0_nxt7; + *(c + 1 + 7 * ldc) = c1_nxt7; +} + +static void ssolve_2x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + BLASLONG k; + FLOAT b0, b1, b2, b3, b5, b6, b7, b10, b11, b15, c0, c1; + FLOAT c0_nxt1, c0_nxt2, c0_nxt3, c1_nxt1, c1_nxt2, c1_nxt3; + + c0 = *(c + 0); + c1 = *(c + 1); + c0_nxt1 = *(c + 0 + 1 * ldc); + c1_nxt1 = *(c + 1 + 1 * ldc); + c0_nxt2 = *(c + 0 + 2 * ldc); + c1_nxt2 = *(c + 1 + 2 * ldc); + c0_nxt3 = *(c + 0 + 3 * ldc); + c1_nxt3 = *(c + 1 + 3 * ldc); + + for (k = 0; k < bk; k++) + { + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; + c0_nxt1 -= a[0] * b[1]; + c1_nxt1 -= a[1] * b[1]; + c0_nxt2 -= a[0] * b[2]; + c1_nxt2 -= a[1] * b[2]; + c0_nxt3 -= a[0] * b[3]; + c1_nxt3 -= a[1] * b[3]; + + a += 2; + b += 4; + } + + b0 = *(b + 0); + b1 = *(b + 1); + b2 = *(b + 2); + b3 = *(b + 3); + b5 = *(b + 5); + b6 = *(b + 6); + b7 = *(b + 7); + b10 = *(b + 10); + b11 = *(b + 11); + b15 = *(b + 15); + + c0 *= b0; + c1 *= b0; + + c0_nxt1 -= c0 * b1; + c1_nxt1 -= c1 * b1; + c0_nxt1 *= b5; + c1_nxt1 *= b5; + + c0_nxt2 -= c0 * b2; + c1_nxt2 -= c1 * b2; + c0_nxt2 -= c0_nxt1 * b6; + c1_nxt2 -= c1_nxt1 * b6; + c0_nxt2 *= b10; + c1_nxt2 *= b10; + + c0_nxt3 -= c0 * b3; + c1_nxt3 -= c1 * b3; + c0_nxt3 -= c0_nxt1 * b7; + c1_nxt3 -= c1_nxt1 * b7; + c0_nxt3 -= c0_nxt2 * b11; + c1_nxt3 -= c1_nxt2 * b11; + c0_nxt3 *= b15; + c1_nxt3 *= b15; + + *(a + 0) = c0; + *(a + 1) = c1; + *(a + 2) = c0_nxt1; + *(a + 3) = c1_nxt1; + *(a + 4) = c0_nxt2; + *(a + 5) = c1_nxt2; + *(a + 6) = c0_nxt3; + *(a + 7) = c1_nxt3; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 1 * ldc) = c0_nxt1; + *(c + 1 + 1 * ldc) = c1_nxt1; + *(c + 2 * ldc) = c0_nxt2; + *(c + 1 + 2 * ldc) = c1_nxt2; + *(c + 3 * ldc) = c0_nxt3; + *(c + 1 + 3 * ldc) = c1_nxt3; +} + +static void ssolve_2x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + BLASLONG k; + FLOAT b0, b1, b3, c0, c0_nxt, c1, c1_nxt; + + c0 = *(c + 0); + c1 = *(c + 1); + c0_nxt = *(c + 0 + ldc); + c1_nxt = *(c + 1 + ldc); + + for (k = 0; k < bk; k++) + { + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; + c0_nxt -= a[0] * b[1]; + c1_nxt -= a[1] * b[1]; + + a += 2; + b += 2; + } + + b0 = *(b + 0); + b1 = *(b + 1); + b3 = *(b + 3); + + c0 *= b0; + c1 *= b0; + + c0_nxt -= c0 * b1; + c1_nxt -= c1 * b1; + + c0_nxt *= b3; + c1_nxt *= b3; + + *(a + 0) = c0; + *(a + 1) = c1; + *(a + 2) = c0_nxt; + *(a + 3) = c1_nxt; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + ldc) = c0_nxt; + *(c + 1 + ldc) = c1_nxt; +} + +static void ssolve_2x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + BLASLONG k; + FLOAT b0, c0, c1; + + c0 = *(c + 0); + c1 = *(c + 1); + + for (k = 
0; k < bk; k++) + { + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; + + a += 2; + b += 1; + } + + b0 = *(b + 0); + + c0 *= b0; + c1 *= b0; + + *(a + 0) = c0; + *(a + 1) = c1; + + *(c + 0) = c0; + *(c + 1) = c1; +} + +static void ssolve_1x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + BLASLONG k; + FLOAT b0, b1, b2, b3, b4, b5, b6, b7, b9, b10, b11, b12, b13, b14, b15; + FLOAT b18, b19, b20, b21, b22, b23, b27, b28, b29, b30, b31, b36, b37, b38; + FLOAT b39, b45, b46, b47, b54, b55, b63, c0, c1, c2, c3, c4, c5, c6, c7; + + c0 = *(c + 0); + c1 = *(c + 1 * ldc); + c2 = *(c + 2 * ldc); + c3 = *(c + 3 * ldc); + c4 = *(c + 4 * ldc); + c5 = *(c + 5 * ldc); + c6 = *(c + 6 * ldc); + c7 = *(c + 7 * ldc); + + for (k = 0; k < bk; k++) + { + c0 -= a[0] * b[0]; + c1 -= a[0] * b[1]; + c2 -= a[0] * b[2]; + c3 -= a[0] * b[3]; + c4 -= a[0] * b[4]; + c5 -= a[0] * b[5]; + c6 -= a[0] * b[6]; + c7 -= a[0] * b[7]; + + a += 1; + b += 8; + } + + b0 = *(b + 0); + b1 = *(b + 1); + b2 = *(b + 2); + b3 = *(b + 3); + b4 = *(b + 4); + b5 = *(b + 5); + b6 = *(b + 6); + b7 = *(b + 7); + b9 = *(b + 9); + b10 = *(b + 10); + b11 = *(b + 11); + b12 = *(b + 12); + b13 = *(b + 13); + b14 = *(b + 14); + b15 = *(b + 15); + b18 = *(b + 18); + b19 = *(b + 19); + b20 = *(b + 20); + b21 = *(b + 21); + b22 = *(b + 22); + b23 = *(b + 23); + b27 = *(b + 27); + b28 = *(b + 28); + b29 = *(b + 29); + b30 = *(b + 30); + b31 = *(b + 31); + b36 = *(b + 36); + b37 = *(b + 37); + b38 = *(b + 38); + b39 = *(b + 39); + b45 = *(b + 45); + b46 = *(b + 46); + b47 = *(b + 47); + b54 = *(b + 54); + b55 = *(b + 55); + b63 = *(b + 63); + + c0 *= b0; + + c1 -= c0 * b1; + c1 *= b9; + + c2 -= c0 * b2; + c2 -= c1 * b10; + c2 *= b18; + + c3 -= c0 * b3; + c3 -= c1 * b11; + c3 -= c2 * b19; + c3 *= b27; + + c4 -= c0 * b4; + c4 -= c1 * b12; + c4 -= c2 * b20; + c4 -= c3 * b28; + c4 *= b36; + + c5 -= c0 * b5; + c5 -= c1 * b13; + c5 -= c2 * b21; + c5 -= c3 * b29; + c5 -= c4 * b37; + c5 *= b45; + + c6 -= c0 * b6; + c6 -= c1 * b14; + c6 -= c2 * b22; + c6 -= c3 * b30; + c6 -= c4 * b38; + c6 -= c5 * b46; + c6 *= b54; + + c7 -= c0 * b7; + c7 -= c1 * b15; + c7 -= c2 * b23; + c7 -= c3 * b31; + c7 -= c4 * b39; + c7 -= c5 * b47; + c7 -= c6 * b55; + c7 *= b63; + + *(a + 0) = c0; + *(a + 1) = c1; + *(a + 2) = c2; + *(a + 3) = c3; + *(a + 4) = c4; + *(a + 5) = c5; + *(a + 6) = c6; + *(a + 7) = c7; + + *(c + 0) = c0; + *(c + 1 * ldc) = c1; + *(c + 2 * ldc) = c2; + *(c + 3 * ldc) = c3; + *(c + 4 * ldc) = c4; + *(c + 5 * ldc) = c5; + *(c + 6 * ldc) = c6; + *(c + 7 * ldc) = c7; +} + +static void ssolve_1x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + BLASLONG k; + FLOAT b0, b1, b2, b3, b5, b6, b7, b10, b11, b15, c0, c1, c2, c3; + + c0 = *(c + 0); + c1 = *(c + 1 * ldc); + c2 = *(c + 2 * ldc); + c3 = *(c + 3 * ldc); + + for (k = 0; k < bk; k++) + { + c0 -= a[0] * b[0]; + c1 -= a[0] * b[1]; + c2 -= a[0] * b[2]; + c3 -= a[0] * b[3]; + + a += 1; + b += 4; + } + + b0 = *(b + 0); + b1 = *(b + 1); + b2 = *(b + 2); + b3 = *(b + 3); + b5 = *(b + 5); + b6 = *(b + 6); + b7 = *(b + 7); + b10 = *(b + 10); + b11 = *(b + 11); + b15 = *(b + 15); + + c0 *= b0; + + c1 -= c0 * b1; + c1 *= b5; + + c2 -= c0 * b2; + c2 -= c1 * b6; + c2 *= b10; + + c3 -= c0 * b3; + c3 -= c1 * b7; + c3 -= c2 * b11; + c3 *= b15; + + *(a + 0) = c0; + *(a + 1) = c1; + *(a + 2) = c2; + *(a + 3) = c3; + + *(c + 0) = c0; + *(c + 1 * ldc) = c1; + *(c + 2 * ldc) = c2; + *(c + 3 * ldc) = c3; +} + +static void ssolve_1x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + 
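/* scalar 1x2 tail of the RN solve: after the update loop, substitute through the 2x2 triangular block of B (b[0] and b[3] are assumed pre-inverted diagonals, so they are multiplied rather than divided) and write the results back to the packed A buffer as well as C */ +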
BLASLONG k; + FLOAT b0, b1, b3, c0, c1; + + c0 = *c; + c1 = *(c + ldc); + + for (k = 0; k < bk; k++) + { + c0 -= a[0] * b[0]; + c1 -= a[0] * b[1]; + + a += 1; + b += 2; + } + + b0 = *(b + 0); + b1 = *(b + 1); + b3 = *(b + 3); + + c0 *= b0; + + c1 -= c0 * b1; + c1 *= b3; + + *(a + 0) = c0; + *(a + 1) = c1; + + *(c + 0) = c0; + *(c + ldc) = c1; +} + +static void ssolve_1x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + BLASLONG k; + + for (k = 0; k < bk; k++) + { + *c -= a[0] * b[0]; + + a++; + b++; + } + + *c *= *b; + *a = *c; +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, + FLOAT *c, BLASLONG ldc, BLASLONG offset) +{ + FLOAT *aa, *cc; + BLASLONG i, j, kk; + + kk = -offset; + + for (j = (n >> 3); j--;) + { + aa = a; + cc = c; + + for (i = (m >> 3); i--;) + { + ssolve_8x8_rn_msa(aa, b, cc, ldc, kk); + + aa += 8 * k; + cc += 8; + } + + if (m & 7) + { + if (m & 4) + { + ssolve_4x8_rn_msa(aa, b, cc, ldc, kk); + + aa += 4 * k; + cc += 4; + } + + if (m & 2) + { + ssolve_2x8_rn_msa(aa, b, cc, ldc, kk); + + aa += 2 * k; + cc += 2; + } + + if (m & 1) + { + ssolve_1x8_rn_msa(aa, b, cc, ldc, kk); + + aa += k; + cc += 1; + } + } + + kk += 8; + b += 8 * k; + c += 8 * ldc; + } + + if (n & 7) + { + if (n & 4) + { + aa = a; + cc = c; + + for (i = (m >> 3); i--;) + { + ssolve_8x4_rn_msa(aa, b, cc, ldc, kk); + + aa += 8 * k; + cc += 8; + } + + if (m & 7) + { + if (m & 4) + { + ssolve_4x4_rn_msa(aa, b, cc, ldc, kk); + + aa += 4 * k; + cc += 4; + } + + if (m & 2) + { + ssolve_2x4_rn_msa(aa, b, cc, ldc, kk); + + aa += 2 * k; + cc += 2; + } + + if (m & 1) + { + ssolve_1x4_rn_msa(aa, b, cc, ldc, kk); + + aa += k; + cc += 1; + } + } + + b += 4 * k; + c += 4 * ldc; + kk += 4; + } + + if (n & 2) + { + aa = a; + cc = c; + + for (i = (m >> 3); i--;) + { + ssolve_8x2_rn_msa(aa, b, cc, ldc, kk); + + aa += 8 * k; + cc += 8; + } + + if (m & 7) + { + if (m & 4) + { + ssolve_4x2_rn_msa(aa, b, cc, ldc, kk); + + aa += 4 * k; + cc += 4; + } + + if (m & 2) + { + ssolve_2x2_rn_msa(aa, b, cc, ldc, kk); + + aa += 2 * k; + cc += 2; + } + + if (m & 1) + { + ssolve_1x2_rn_msa(aa, b, cc, ldc, kk); + + aa += k; + cc += 1; + } + } + + b += 2 * k; + c += 2 * ldc; + kk += 2; + } + + if (n & 1) + { + aa = a; + cc = c; + + for (i = (m >> 3); i--;) + { + ssolve_8x1_rn_msa(aa, b, cc, ldc, kk); + + aa += 8 * k; + cc += 8; + } + + if (m & 7) + { + if (m & 4) + { + ssolve_4x1_rn_msa(aa, b, cc, ldc, kk); + + aa += 4 * k; + cc += 4; + } + + if (m & 2) + { + ssolve_2x1_rn_msa(aa, b, cc, ldc, kk); + + aa += 2 * k; + cc += 2; + } + + if (m & 1) + { + ssolve_1x1_rn_msa(aa, b, cc, kk); + + aa += k; + cc += 1; + } + } + + b += k; + c += ldc; + kk += 1; + } + } + + return 0; +} diff --git a/kernel/mips/strsm_kernel_RT_8x8_msa.c b/kernel/mips/strsm_kernel_RT_8x8_msa.c new file mode 100644 index 0000000000..21e41c8fbd --- /dev/null +++ b/kernel/mips/strsm_kernel_RT_8x8_msa.c @@ -0,0 +1,1726 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +static void ssolve_8x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + BLASLONG k; + FLOAT *aa = a, *bb = b; + v4f32 src_a0, src_a1, src_b1, src_b2, src_b3; + v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v4f32 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15; + v4f32 src_b, src_b0, src_b8, src_b9, src_b16, src_b17, src_b18, src_b24; + v4f32 src_b25, src_b26, src_b27, src_b32, src_b33, src_b34, src_b35; + v4f32 src_b36, src_b40, src_b41, src_b42, src_b43, src_b44, src_b45; + v4f32 src_b48, src_b49, src_b50, src_b51, src_b52, src_b53, src_b54; + v4f32 src_b56, src_b57, src_b58, src_b59, src_b60, src_b61, src_b62, src_b63; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * ldc; + FLOAT *c_nxt4line = c + 4 * ldc; + FLOAT *c_nxt5line = c + 5 * ldc; + FLOAT *c_nxt6line = c + 6 * ldc; + FLOAT *c_nxt7line = c + 7 * ldc; + + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(c_nxt2line, 4, src_c4, src_c5); + LD_SP2(c_nxt3line, 4, src_c6, src_c7); + LD_SP2(c_nxt4line, 4, src_c8, src_c9); + LD_SP2(c_nxt5line, 4, src_c10, src_c11); + LD_SP2(c_nxt6line, 4, src_c12, src_c13); + LD_SP2(c_nxt7line, 4, src_c14, src_c15); + + for (k = 0; k < bk; k++) + { + LD_SP2(aa, 4, src_a0, src_a1); + + src_b = LD_SP(bb + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; + src_c4 -= src_a0 * src_b2; + src_c5 -= src_a1 * src_b2; + src_c6 -= src_a0 * src_b3; + src_c7 -= src_a1 * src_b3; + + src_b = LD_SP(bb + 4); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c8 -= src_a0 * src_b0; + src_c9 -= src_a1 * src_b0; + src_c10 -= src_a0 * src_b1; + src_c11 -= src_a1 * src_b1; + src_c12 -= src_a0 * src_b2; + src_c13 -= src_a1 * src_b2; + src_c14 -= src_a0 * src_b3; + src_c15 -= src_a1 * src_b3; + + aa += 8; + bb += 8; + } + + b -= 64; + + src_b = LD_SP(b + 60); + SPLATI_W4_SP(src_b, src_b60, src_b61, src_b62, src_b63); + src_b = LD_SP(b + 56); + SPLATI_W4_SP(src_b, src_b56, src_b57, src_b58, src_b59); + + src_c15 *= src_b63; + src_c14 *= src_b63; + src_c13 -= src_c15 * src_b62; + 
src_c12 -= src_c14 * src_b62; + src_c11 -= src_c15 * src_b61; + src_c10 -= src_c14 * src_b61; + src_c9 -= src_c15 * src_b60; + src_c8 -= src_c14 * src_b60; + src_c7 -= src_c15 * src_b59; + src_c6 -= src_c14 * src_b59; + src_c5 -= src_c15 * src_b58; + src_c4 -= src_c14 * src_b58; + src_c3 -= src_c15 * src_b57; + src_c2 -= src_c14 * src_b57; + src_c1 -= src_c15 * src_b56; + src_c0 -= src_c14 * src_b56; + + src_b = LD_SP(b + 48); + SPLATI_W4_SP(src_b, src_b48, src_b49, src_b50, src_b51); + src_b52 = LD_SP(b + 52); + src_b54 = (v4f32) __msa_splati_w((v4i32) src_b52, 2); + src_b53 = (v4f32) __msa_splati_w((v4i32) src_b52, 1); + src_b52 = (v4f32) __msa_splati_w((v4i32) src_b52, 0); + + src_c12 *= src_b54; + src_c13 *= src_b54; + src_c10 -= src_c12 * src_b53; + src_c11 -= src_c13 * src_b53; + src_c8 -= src_c12 * src_b52; + src_c9 -= src_c13 * src_b52; + src_c6 -= src_c12 * src_b51; + src_c7 -= src_c13 * src_b51; + src_c4 -= src_c12 * src_b50; + src_c5 -= src_c13 * src_b50; + src_c2 -= src_c12 * src_b49; + src_c3 -= src_c13 * src_b49; + src_c0 -= src_c12 * src_b48; + src_c1 -= src_c13 * src_b48; + + ST_SP4(src_c12, src_c13, src_c14, src_c15, a - 16, 4); + ST_SP2(src_c12, src_c13, c_nxt6line, 4); + ST_SP2(src_c14, src_c15, c_nxt7line, 4); + + src_b = LD_SP(b + 40); + SPLATI_W4_SP(src_b, src_b40, src_b41, src_b42, src_b43); + src_b44 = LD_SP(b + 44); + src_b45 = (v4f32) __msa_splati_w((v4i32) src_b44, 1); + src_b44 = (v4f32) __msa_splati_w((v4i32) src_b44, 0); + + src_c10 *= src_b45; + src_c11 *= src_b45; + src_c8 -= src_c10 * src_b44; + src_c9 -= src_c11 * src_b44; + src_c6 -= src_c10 * src_b43; + src_c7 -= src_c11 * src_b43; + src_c4 -= src_c10 * src_b42; + src_c5 -= src_c11 * src_b42; + src_c2 -= src_c10 * src_b41; + src_c3 -= src_c11 * src_b41; + src_c0 -= src_c10 * src_b40; + src_c1 -= src_c11 * src_b40; + + src_b = LD_SP(b + 32); + SPLATI_W4_SP(src_b, src_b32, src_b33, src_b34, src_b35); + src_b36 = COPY_FLOAT_TO_VECTOR(*(b + 36)); + + src_c8 *= src_b36; + src_c9 *= src_b36; + src_c6 -= src_c8 * src_b35; + src_c7 -= src_c9 * src_b35; + src_c4 -= src_c8 * src_b34; + src_c5 -= src_c9 * src_b34; + src_c2 -= src_c8 * src_b33; + src_c3 -= src_c9 * src_b33; + src_c0 -= src_c8 * src_b32; + src_c1 -= src_c9 * src_b32; + + ST_SP4(src_c8, src_c9, src_c10, src_c11, a - 32, 4); + ST_SP2(src_c8, src_c9, c_nxt4line, 4); + ST_SP2(src_c10, src_c11, c_nxt5line, 4); + + src_b = LD_SP(b + 24); + SPLATI_W4_SP(src_b, src_b24, src_b25, src_b26, src_b27); + + src_c6 *= src_b27; + src_c7 *= src_b27; + src_c4 -= src_c6 * src_b26; + src_c5 -= src_c7 * src_b26; + src_c2 -= src_c6 * src_b25; + src_c3 -= src_c7 * src_b25; + src_c0 -= src_c6 * src_b24; + src_c1 -= src_c7 * src_b24; + + src_b16 = LD_SP(b + 16); + src_b18 = (v4f32) __msa_splati_w((v4i32) src_b16, 2); + src_b17 = (v4f32) __msa_splati_w((v4i32) src_b16, 1); + src_b16 = (v4f32) __msa_splati_w((v4i32) src_b16, 0); + + src_c4 *= src_b18; + src_c5 *= src_b18; + src_c2 -= src_c4 * src_b17; + src_c3 -= src_c5 * src_b17; + src_c0 -= src_c4 * src_b16; + src_c1 -= src_c5 * src_b16; + + ST_SP4(src_c4, src_c5, src_c6, src_c7, a - 48, 4); + ST_SP2(src_c4, src_c5, c_nxt2line, 4); + ST_SP2(src_c6, src_c7, c_nxt3line, 4); + + src_b9 = COPY_FLOAT_TO_VECTOR(*(b + 9)); + src_b8 = COPY_FLOAT_TO_VECTOR(*(b + 8)); + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); + + src_c2 *= src_b9; + src_c3 *= src_b9; + src_c0 -= src_c2 * src_b8; + src_c1 -= src_c3 * src_b8; + + src_c0 *= src_b0; + src_c1 *= src_b0; + + ST_SP4(src_c0, src_c1, src_c2, src_c3, a - 64, 4); + + ST_SP2(src_c0, src_c1, c, 
4); + ST_SP2(src_c2, src_c3, c_nxt1line, 4); +} + +static void ssolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + BLASLONG k; + FLOAT *aa = a, *bb = b; + v4f32 src_a0, src_a1, src_b1, src_b2, src_b3; + v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v4f32 src_b, src_b0, src_b4, src_b5, src_b8, src_b9, src_b10, src_b12; + v4f32 src_b13, src_b14, src_b15; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * ldc; + + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(c_nxt2line, 4, src_c4, src_c5); + LD_SP2(c_nxt3line, 4, src_c6, src_c7); + + for (k = 0; k < (bk >> 1); k++) + { + LD_SP2(aa, 4, src_a0, src_a1); + + src_b = LD_SP(bb + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; + src_c4 -= src_a0 * src_b2; + src_c5 -= src_a1 * src_b2; + src_c6 -= src_a0 * src_b3; + src_c7 -= src_a1 * src_b3; + + aa += 8; + bb += 4; + + LD_SP2(aa, 4, src_a0, src_a1); + + src_b = LD_SP(bb + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; + src_c4 -= src_a0 * src_b2; + src_c5 -= src_a1 * src_b2; + src_c6 -= src_a0 * src_b3; + src_c7 -= src_a1 * src_b3; + + aa += 8; + bb += 4; + } + + if ((bk & 1) && (bk > 0)) + { + LD_SP2(aa, 4, src_a0, src_a1); + + src_b = LD_SP(bb + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; + src_c4 -= src_a0 * src_b2; + src_c5 -= src_a1 * src_b2; + src_c6 -= src_a0 * src_b3; + src_c7 -= src_a1 * src_b3; + } + + a -= 32; + b -= 16; + + src_b = LD_SP(b + 12); + SPLATI_W4_SP(src_b, src_b12, src_b13, src_b14, src_b15); + src_b8 = LD_SP(b + 8); + src_b10 = (v4f32) __msa_splati_w((v4i32) src_b8, 2); + src_b9 = (v4f32) __msa_splati_w((v4i32) src_b8, 1); + src_b8 = (v4f32) __msa_splati_w((v4i32) src_b8, 0); + src_b5 = COPY_FLOAT_TO_VECTOR(*(b + 5)); + src_b4 = COPY_FLOAT_TO_VECTOR(*(b + 4)); + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); + + src_c7 *= src_b15; + src_c6 *= src_b15; + src_c5 -= src_c7 * src_b14; + src_c4 -= src_c6 * src_b14; + src_c3 -= src_c7 * src_b13; + src_c2 -= src_c6 * src_b13; + src_c1 -= src_c7 * src_b12; + src_c0 -= src_c6 * src_b12; + + src_c5 *= src_b10; + src_c4 *= src_b10; + src_c3 -= src_c5 * src_b9; + src_c2 -= src_c4 * src_b9; + src_c1 -= src_c5 * src_b8; + src_c0 -= src_c4 * src_b8; + + src_c3 *= src_b5; + src_c2 *= src_b5; + src_c1 -= src_c3 * src_b4; + src_c0 -= src_c2 * src_b4; + + src_c1 *= src_b0; + src_c0 *= src_b0; + + ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4); + ST_SP4(src_c4, src_c5, src_c6, src_c7, a + 16, 4); + + ST_SP2(src_c0, src_c1, c, 4); + ST_SP2(src_c2, src_c3, c_nxt1line, 4); + ST_SP2(src_c4, src_c5, c_nxt2line, 4); + ST_SP2(src_c6, src_c7, c_nxt3line, 4); +} + +static void ssolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + BLASLONG k; + FLOAT *aa = a, *bb = b; + v4f32 src_a0, src_a1, src_b1; + v4f32 src_c0, src_c1, src_c2, src_c3, src_b0, src_b2, src_b3; + FLOAT *c_nxt1line = c + ldc; + + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + + for (k = 0; k < (bk >> 1); k++) + { + LD_SP2(aa, 4, src_a0, src_a1); + + src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0)); + src_b1 = COPY_FLOAT_TO_VECTOR(*(bb + 1)); 
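+ /* each k step broadcasts the two B values and updates the 8-row C panel held as two 4-float vectors */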
+ + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; + + aa += 8; + bb += 2; + + LD_SP2(aa, 4, src_a0, src_a1); + + src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0)); + src_b1 = COPY_FLOAT_TO_VECTOR(*(bb + 1)); + + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; + + aa += 8; + bb += 2; + } + + if ((bk & 1) && (bk > 0)) + { + LD_SP2(aa, 4, src_a0, src_a1); + + src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0)); + src_b1 = COPY_FLOAT_TO_VECTOR(*(bb + 1)); + + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; + } + + a -= 16; + b -= 4; + + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); + src_b2 = COPY_FLOAT_TO_VECTOR(*(b + 2)); + src_b3 = COPY_FLOAT_TO_VECTOR(*(b + 3)); + + src_c2 *= src_b3; + src_c3 *= src_b3; + src_c0 -= src_c2 * src_b2; + src_c1 -= src_c3 * src_b2; + src_c0 *= src_b0; + src_c1 *= src_b0; + + ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4); + ST_SP2(src_c0, src_c1, c, 4); + ST_SP2(src_c2, src_c3, c_nxt1line, 4); +} + +static void ssolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + BLASLONG k; + FLOAT *aa = a, *bb = b; + v4f32 src_a0, src_a1, src_c0, src_c1, src_b0; + + LD_SP2(c, 4, src_c0, src_c1); + + for (k = 0; k < (bk >> 2); k++) + { + LD_SP2(aa, 4, src_a0, src_a1); + + src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0)); + + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + + aa += 8; + bb += 1; + + LD_SP2(aa, 4, src_a0, src_a1); + + src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0)); + + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + + aa += 8; + bb += 1; + + LD_SP2(aa, 4, src_a0, src_a1); + + src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0)); + + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + + aa += 8; + bb += 1; + + LD_SP2(aa, 4, src_a0, src_a1); + + src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0)); + + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + + aa += 8; + bb += 1; + } + + if ((bk & 3) && (bk > 0)) + { + if (bk & 2) + { + LD_SP2(aa, 4, src_a0, src_a1); + + src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0)); + + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + + aa += 8; + bb += 1; + + LD_SP2(aa, 4, src_a0, src_a1); + + src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0)); + + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + + aa += 8; + bb += 1; + } + + if (bk & 1) + { + LD_SP2(aa, 4, src_a0, src_a1); + + src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0)); + + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + } + } + + a -= 8; + b -= 1; + + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); + + src_c0 *= src_b0; + src_c1 *= src_b0; + + ST_SP2(src_c0, src_c1, a, 4); + ST_SP2(src_c0, src_c1, c, 4); +} + +static void ssolve_4x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + BLASLONG k; + FLOAT *aa = a, *bb = b; + v4f32 src_a0, src_b1, src_b2, src_b3; + v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v4f32 src_b, src_b0, src_b8, src_b9, src_b16, src_b17, src_b18, src_b24; + v4f32 src_b25, src_b26, src_b27, src_b32, src_b33, src_b34, src_b35; + v4f32 src_b36, src_b40, src_b41, src_b42, src_b43, src_b44, src_b45; + v4f32 src_b48, src_b49, src_b50, src_b51, src_b52, src_b53, src_b54; + v4f32 src_b56, src_b57, src_b58, src_b59, src_b60, src_b61, src_b62, src_b63; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * ldc; + FLOAT *c_nxt4line = c + 4 * ldc; + FLOAT *c_nxt5line = c + 5 * ldc; + FLOAT *c_nxt6line = c + 
6 * ldc; + FLOAT *c_nxt7line = c + 7 * ldc; + + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + src_c2 = LD_SP(c_nxt2line); + src_c3 = LD_SP(c_nxt3line); + src_c4 = LD_SP(c_nxt4line); + src_c5 = LD_SP(c_nxt5line); + src_c6 = LD_SP(c_nxt6line); + src_c7 = LD_SP(c_nxt7line); + + for (k = 0; k < bk; k++) + { + src_a0 = LD_SP(aa); + + src_b = LD_SP(bb + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a0 * src_b1; + src_c2 -= src_a0 * src_b2; + src_c3 -= src_a0 * src_b3; + + src_b = LD_SP(bb + 4); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c4 -= src_a0 * src_b0; + src_c5 -= src_a0 * src_b1; + src_c6 -= src_a0 * src_b2; + src_c7 -= src_a0 * src_b3; + + aa += 4; + bb += 8; + } + + a -= 32; + b -= 64; + + src_b = LD_SP(b + 60); + SPLATI_W4_SP(src_b, src_b60, src_b61, src_b62, src_b63); + src_b = LD_SP(b + 56); + SPLATI_W4_SP(src_b, src_b56, src_b57, src_b58, src_b59); + + src_b = LD_SP(b + 48); + SPLATI_W4_SP(src_b, src_b48, src_b49, src_b50, src_b51); + src_b52 = LD_SP(b + 52); + src_b54 = (v4f32) __msa_splati_w((v4i32) src_b52, 2); + src_b53 = (v4f32) __msa_splati_w((v4i32) src_b52, 1); + src_b52 = (v4f32) __msa_splati_w((v4i32) src_b52, 0); + + src_b = LD_SP(b + 40); + SPLATI_W4_SP(src_b, src_b40, src_b41, src_b42, src_b43); + src_b44 = LD_SP(b + 44); + src_b45 = (v4f32) __msa_splati_w((v4i32) src_b44, 1); + src_b44 = (v4f32) __msa_splati_w((v4i32) src_b44, 0); + + src_b = LD_SP(b + 32); + SPLATI_W4_SP(src_b, src_b32, src_b33, src_b34, src_b35); + src_b36 = COPY_FLOAT_TO_VECTOR(*(b + 36)); + + src_b = LD_SP(b + 24); + SPLATI_W4_SP(src_b, src_b24, src_b25, src_b26, src_b27); + + src_b16 = LD_SP(b + 16); + src_b18 = (v4f32) __msa_splati_w((v4i32) src_b16, 2); + src_b17 = (v4f32) __msa_splati_w((v4i32) src_b16, 1); + src_b16 = (v4f32) __msa_splati_w((v4i32) src_b16, 0); + + src_b9 = COPY_FLOAT_TO_VECTOR(*(b + 9)); + src_b8 = COPY_FLOAT_TO_VECTOR(*(b + 8)); + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); + + src_c7 *= src_b63; + src_c6 -= src_c7 * src_b62; + src_c5 -= src_c7 * src_b61; + src_c4 -= src_c7 * src_b60; + src_c3 -= src_c7 * src_b59; + src_c2 -= src_c7 * src_b58; + src_c1 -= src_c7 * src_b57; + src_c0 -= src_c7 * src_b56; + + src_c6 *= src_b54; + src_c5 -= src_c6 * src_b53; + src_c4 -= src_c6 * src_b52; + src_c3 -= src_c6 * src_b51; + src_c2 -= src_c6 * src_b50; + src_c1 -= src_c6 * src_b49; + src_c0 -= src_c6 * src_b48; + + src_c5 *= src_b45; + src_c4 -= src_c5 * src_b44; + src_c3 -= src_c5 * src_b43; + src_c2 -= src_c5 * src_b42; + src_c1 -= src_c5 * src_b41; + src_c0 -= src_c5 * src_b40; + + src_c4 *= src_b36; + src_c3 -= src_c4 * src_b35; + src_c2 -= src_c4 * src_b34; + src_c1 -= src_c4 * src_b33; + src_c0 -= src_c4 * src_b32; + + src_c3 *= src_b27; + src_c2 -= src_c3 * src_b26; + src_c1 -= src_c3 * src_b25; + src_c0 -= src_c3 * src_b24; + + src_c2 *= src_b18; + src_c1 -= src_c2 * src_b17; + src_c0 -= src_c2 * src_b16; + + src_c1 *= src_b9; + src_c0 -= src_c1 * src_b8; + + src_c0 *= src_b0; + + ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4); + ST_SP4(src_c4, src_c5, src_c6, src_c7, a + 16, 4); + + ST_SP(src_c0, c); + ST_SP(src_c1, c_nxt1line); + ST_SP(src_c2, c_nxt2line); + ST_SP(src_c3, c_nxt3line); + ST_SP(src_c4, c_nxt4line); + ST_SP(src_c5, c_nxt5line); + ST_SP(src_c6, c_nxt6line); + ST_SP(src_c7, c_nxt7line); +} + +static void ssolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + BLASLONG k; + FLOAT *aa = a, *bb = b; + v4f32 src_c0, src_c1, src_c2, src_c3, src_b; + v4f32 src_b0, 
src_b4, src_b5, src_b8, src_b9, src_b10, src_b12, src_b13; + v4f32 src_b14, src_b15, src_a, src_b1, src_b2, src_b3; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * ldc; + + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + src_c2 = LD_SP(c_nxt2line); + src_c3 = LD_SP(c_nxt3line); + + for (k = 0; k < (bk >> 1); k++) + { + src_a = LD_SP(aa); + + src_b = LD_SP(bb); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + + src_c0 -= src_a * src_b0; + src_c1 -= src_a * src_b1; + src_c2 -= src_a * src_b2; + src_c3 -= src_a * src_b3; + + aa += 4; + bb += 4; + + src_a = LD_SP(aa); + + src_b = LD_SP(bb); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + + src_c0 -= src_a * src_b0; + src_c1 -= src_a * src_b1; + src_c2 -= src_a * src_b2; + src_c3 -= src_a * src_b3; + + aa += 4; + bb += 4; + } + + if ((bk & 1) && (bk > 0)) + { + src_a = LD_SP(aa); + + src_b = LD_SP(bb); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + + src_c0 -= src_a * src_b0; + src_c1 -= src_a * src_b1; + src_c2 -= src_a * src_b2; + src_c3 -= src_a * src_b3; + } + + a -= 16; + b -= 16; + + src_b = LD_SP(b + 12); + SPLATI_W4_SP(src_b, src_b12, src_b13, src_b14, src_b15); + src_b8 = LD_SP(b + 8); + src_b10 = (v4f32) __msa_splati_w((v4i32) src_b8, 2); + src_b9 = (v4f32) __msa_splati_w((v4i32) src_b8, 1); + src_b8 = (v4f32) __msa_splati_w((v4i32) src_b8, 0); + src_b5 = COPY_FLOAT_TO_VECTOR(*(b + 5)); + src_b4 = COPY_FLOAT_TO_VECTOR(*(b + 4)); + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); + + src_c3 *= src_b15; + src_c2 -= src_c3 * src_b14; + src_c1 -= src_c3 * src_b13; + src_c0 -= src_c3 * src_b12; + + src_c2 *= src_b10; + src_c1 -= src_c2 * src_b9; + src_c0 -= src_c2 * src_b8; + + src_c1 *= src_b5; + src_c0 -= src_c1 * src_b4; + + src_c0 *= src_b0; + + ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4); + + ST_SP(src_c0, c); + ST_SP(src_c1, c_nxt1line); + ST_SP(src_c2, c_nxt2line); + ST_SP(src_c3, c_nxt3line); +} + +static void ssolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + BLASLONG k; + FLOAT *aa = a, *bb = b; + v4f32 src_a, src_b1, src_c0, src_c1, src_b0, src_b2, src_b3; + FLOAT *c_nxt1line = c + ldc; + + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + + for (k = 0; k < (bk >> 2); k++) + { + src_a = LD_SP(aa); + src_b0 = LD_SP(bb); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + src_c0 -= src_a * src_b0; + src_c1 -= src_a * src_b1; + + aa += 4; + bb += 2; + + src_a = LD_SP(aa); + src_b0 = LD_SP(bb); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + src_c0 -= src_a * src_b0; + src_c1 -= src_a * src_b1; + + aa += 4; + bb += 2; + + src_a = LD_SP(aa); + src_b0 = LD_SP(bb); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + src_c0 -= src_a * src_b0; + src_c1 -= src_a * src_b1; + + aa += 4; + bb += 2; + + src_a = LD_SP(aa); + src_b0 = LD_SP(bb); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + src_c0 -= src_a * src_b0; + src_c1 -= src_a * src_b1; + + aa += 4; + bb += 2; + } + + if ((bk & 3) && (bk > 0)) + { + if (bk & 2) + { + src_a = LD_SP(aa); + src_b0 = LD_SP(bb); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + src_c0 -= src_a * src_b0; + src_c1 -= src_a * src_b1; + + aa += 4; + bb += 2; + + src_a = LD_SP(aa); + src_b0 = LD_SP(bb); + 
src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + src_c0 -= src_a * src_b0; + src_c1 -= src_a * src_b1; + + aa += 4; + bb += 2; + } + + if (bk & 1) + { + src_a = LD_SP(aa); + src_b0 = LD_SP(bb); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + src_c0 -= src_a * src_b0; + src_c1 -= src_a * src_b1; + } + } + + a -= 8; + b -= 4; + + src_b3 = COPY_FLOAT_TO_VECTOR(*(b + 3)); + src_b2 = COPY_FLOAT_TO_VECTOR(*(b + 2)); + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); + + src_c1 *= src_b3; + src_c0 -= src_c1 * src_b2; + src_c0 *= src_b0; + + ST_SP2(src_c0, src_c1, a, 4); + + ST_SP(src_c0, c); + ST_SP(src_c1, c_nxt1line); +} + +static void ssolve_4x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + BLASLONG k; + FLOAT *aa = a, *bb = b; + FLOAT b0, c0, c1, c2, c3; + + c0 = *(c + 0); + c1 = *(c + 1); + c2 = *(c + 2); + c3 = *(c + 3); + + for (k = 0; k < bk; k++) + { + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; + c2 -= aa[2] * bb[0]; + c3 -= aa[3] * bb[0]; + + aa += 4; + bb += 1; + } + + a -= 4; + b -= 1; + + b0 = *b; + + c0 *= b0; + c1 *= b0; + c2 *= b0; + c3 *= b0; + + *(a + 0) = c0; + *(a + 1) = c1; + *(a + 2) = c2; + *(a + 3) = c3; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 2) = c2; + *(c + 3) = c3; +} + +static void ssolve_2x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + BLASLONG k; + FLOAT *aa = a, *bb = b; + FLOAT b0, b8, b9, b16, b17, b18, b24, b25, b26, b27, b32, b33, b34, b35; + FLOAT b36, b40, b41, b42, b43, b44, b45, b48, b49, b50, b51, b52, b53, b54; + FLOAT b56, b57, b58, b59, b60, b61, b62, b63, c0_nxt7, c1_nxt7; + FLOAT c0, c1, c0_nxt1, c1_nxt1, c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3; + FLOAT c0_nxt4, c1_nxt4, c0_nxt5, c1_nxt5, c0_nxt6, c1_nxt6; + + c0 = *(c + 0); + c1 = *(c + 1); + c0_nxt1 = *(c + 0 + 1 * ldc); + c1_nxt1 = *(c + 1 + 1 * ldc); + c0_nxt2 = *(c + 0 + 2 * ldc); + c1_nxt2 = *(c + 1 + 2 * ldc); + c0_nxt3 = *(c + 0 + 3 * ldc); + c1_nxt3 = *(c + 1 + 3 * ldc); + c0_nxt4 = *(c + 0 + 4 * ldc); + c1_nxt4 = *(c + 1 + 4 * ldc); + c0_nxt5 = *(c + 0 + 5 * ldc); + c1_nxt5 = *(c + 1 + 5 * ldc); + c0_nxt6 = *(c + 0 + 6 * ldc); + c1_nxt6 = *(c + 1 + 6 * ldc); + c0_nxt7 = *(c + 0 + 7 * ldc); + c1_nxt7 = *(c + 1 + 7 * ldc); + + for (k = 0; k < bk; k++) + { + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; + c0_nxt1 -= aa[0] * bb[1]; + c1_nxt1 -= aa[1] * bb[1]; + c0_nxt2 -= aa[0] * bb[2]; + c1_nxt2 -= aa[1] * bb[2]; + c0_nxt3 -= aa[0] * bb[3]; + c1_nxt3 -= aa[1] * bb[3]; + c0_nxt4 -= aa[0] * bb[4]; + c1_nxt4 -= aa[1] * bb[4]; + c0_nxt5 -= aa[0] * bb[5]; + c1_nxt5 -= aa[1] * bb[5]; + c0_nxt6 -= aa[0] * bb[6]; + c1_nxt6 -= aa[1] * bb[6]; + c0_nxt7 -= aa[0] * bb[7]; + c1_nxt7 -= aa[1] * bb[7]; + + aa += 2; + bb += 8; + } + + a -= 16; + b -= 64; + + b0 = *(b + 0); + b8 = *(b + 8); + b9 = *(b + 9); + b16 = *(b + 16); + b17 = *(b + 17); + b18 = *(b + 18); + b24 = *(b + 24); + b25 = *(b + 25); + b26 = *(b + 26); + b27 = *(b + 27); + b32 = *(b + 32); + b33 = *(b + 33); + b34 = *(b + 34); + b35 = *(b + 35); + b36 = *(b + 36); + b40 = *(b + 40); + b41 = *(b + 41); + b42 = *(b + 42); + b43 = *(b + 43); + b44 = *(b + 44); + b45 = *(b + 45); + b48 = *(b + 48); + b49 = *(b + 49); + b50 = *(b + 50); + b51 = *(b + 51); + b52 = *(b + 52); + b53 = *(b + 53); + b54 = *(b + 54); + b56 = *(b + 56); + b57 = *(b + 57); + b58 = *(b + 58); + b59 = *(b + 59); + b60 = *(b + 60); + b61 = *(b + 61); + b62 = *(b + 62); + b63 = *(b + 63); + + c0_nxt7 *= b63; + c1_nxt7 *= b63; + + 
c0_nxt6 -= c0_nxt7 * b62; + c1_nxt6 -= c1_nxt7 * b62; + + c0_nxt6 *= b54; + c1_nxt6 *= b54; + + c0_nxt5 -= c0_nxt7 * b61; + c1_nxt5 -= c1_nxt7 * b61; + + c0_nxt5 -= c0_nxt6 * b53; + c1_nxt5 -= c1_nxt6 * b53; + + c0_nxt5 *= b45; + c1_nxt5 *= b45; + + c0_nxt4 -= c0_nxt7 * b60; + c1_nxt4 -= c1_nxt7 * b60; + + c0_nxt4 -= c0_nxt6 * b52; + c1_nxt4 -= c1_nxt6 * b52; + + c0_nxt4 -= c0_nxt5 * b44; + c1_nxt4 -= c1_nxt5 * b44; + + c0_nxt4 *= b36; + c1_nxt4 *= b36; + + c0_nxt3 -= c0_nxt7 * b59; + c1_nxt3 -= c1_nxt7 * b59; + + c0_nxt3 -= c0_nxt6 * b51; + c1_nxt3 -= c1_nxt6 * b51; + + c0_nxt3 -= c0_nxt5 * b43; + c1_nxt3 -= c1_nxt5 * b43; + + c0_nxt3 -= c0_nxt4 * b35; + c1_nxt3 -= c1_nxt4 * b35; + + c0_nxt3 *= b27; + c1_nxt3 *= b27; + + c0_nxt2 -= c0_nxt7 * b58; + c1_nxt2 -= c1_nxt7 * b58; + + c0_nxt2 -= c0_nxt6 * b50; + c1_nxt2 -= c1_nxt6 * b50; + + c0_nxt2 -= c0_nxt5 * b42; + c1_nxt2 -= c1_nxt5 * b42; + + c0_nxt2 -= c0_nxt4 * b34; + c1_nxt2 -= c1_nxt4 * b34; + + c0_nxt2 -= c0_nxt3 * b26; + c1_nxt2 -= c1_nxt3 * b26; + + c0_nxt2 *= b18; + c1_nxt2 *= b18; + + c0_nxt1 -= c0_nxt7 * b57; + c1_nxt1 -= c1_nxt7 * b57; + + c0_nxt1 -= c0_nxt6 * b49; + c1_nxt1 -= c1_nxt6 * b49; + + c0_nxt1 -= c0_nxt5 * b41; + c1_nxt1 -= c1_nxt5 * b41; + + c0_nxt1 -= c0_nxt4 * b33; + c1_nxt1 -= c1_nxt4 * b33; + + c0_nxt1 -= c0_nxt3 * b25; + c1_nxt1 -= c1_nxt3 * b25; + + c0_nxt1 -= c0_nxt2 * b17; + c1_nxt1 -= c1_nxt2 * b17; + + c0_nxt1 *= b9; + c1_nxt1 *= b9; + + c0 -= c0_nxt7 * b56; + c1 -= c1_nxt7 * b56; + + c0 -= c0_nxt6 * b48; + c1 -= c1_nxt6 * b48; + + c0 -= c0_nxt5 * b40; + c1 -= c1_nxt5 * b40; + + c0 -= c0_nxt4 * b32; + c1 -= c1_nxt4 * b32; + + c0 -= c0_nxt3 * b24; + c1 -= c1_nxt3 * b24; + + c0 -= c0_nxt2 * b16; + c1 -= c1_nxt2 * b16; + + c0 -= c0_nxt1 * b8; + c1 -= c1_nxt1 * b8; + + c0 *= b0; + c1 *= b0; + + *(a + 0) = c0; + *(a + 1) = c1; + *(a + 2) = c0_nxt1; + *(a + 3) = c1_nxt1; + *(a + 4) = c0_nxt2; + *(a + 5) = c1_nxt2; + *(a + 6) = c0_nxt3; + *(a + 7) = c1_nxt3; + *(a + 8) = c0_nxt4; + *(a + 9) = c1_nxt4; + *(a + 10) = c0_nxt5; + *(a + 11) = c1_nxt5; + *(a + 12) = c0_nxt6; + *(a + 13) = c1_nxt6; + *(a + 14) = c0_nxt7; + *(a + 15) = c1_nxt7; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 0 + 1 * ldc) = c0_nxt1; + *(c + 1 + 1 * ldc) = c1_nxt1; + *(c + 0 + 2 * ldc) = c0_nxt2; + *(c + 1 + 2 * ldc) = c1_nxt2; + *(c + 0 + 3 * ldc) = c0_nxt3; + *(c + 1 + 3 * ldc) = c1_nxt3; + *(c + 0 + 4 * ldc) = c0_nxt4; + *(c + 1 + 4 * ldc) = c1_nxt4; + *(c + 0 + 5 * ldc) = c0_nxt5; + *(c + 1 + 5 * ldc) = c1_nxt5; + *(c + 0 + 6 * ldc) = c0_nxt6; + *(c + 1 + 6 * ldc) = c1_nxt6; + *(c + 0 + 7 * ldc) = c0_nxt7; + *(c + 1 + 7 * ldc) = c1_nxt7; +} + +static void ssolve_2x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + BLASLONG k; + FLOAT *aa = a, *bb = b; + FLOAT b0, b4, b5, b8, b9, b10, b12, b13, b14, b15; + FLOAT c0, c1, c0_nxt1, c1_nxt1, c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3; + + c0 = *(c + 0); + c1 = *(c + 1); + c0_nxt1 = *(c + 0 + 1 * ldc); + c1_nxt1 = *(c + 1 + 1 * ldc); + c0_nxt2 = *(c + 0 + 2 * ldc); + c1_nxt2 = *(c + 1 + 2 * ldc); + c0_nxt3 = *(c + 0 + 3 * ldc); + c1_nxt3 = *(c + 1 + 3 * ldc); + + for (k = 0; k < bk; k++) + { + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; + c0_nxt1 -= aa[0] * bb[1]; + c1_nxt1 -= aa[1] * bb[1]; + c0_nxt2 -= aa[0] * bb[2]; + c1_nxt2 -= aa[1] * bb[2]; + c0_nxt3 -= aa[0] * bb[3]; + c1_nxt3 -= aa[1] * bb[3]; + + aa += 2; + bb += 4; + } + + a -= 8; + b -= 16; + + b0 = *b; + b4 = *(b + 4); + b5 = *(b + 5); + b8 = *(b + 8); + b9 = *(b + 9); + b10 = *(b + 10); + b12 = *(b + 12); + b13 = *(b + 
13); + b14 = *(b + 14); + b15 = *(b + 15); + + c0_nxt3 *= b15; + c1_nxt3 *= b15; + + c0_nxt2 = (c0_nxt2 - c0_nxt3 * b14) * b10; + c1_nxt2 = (c1_nxt2 - c1_nxt3 * b14) * b10; + + c0_nxt1 = ((c0_nxt1 - c0_nxt3 * b13) - c0_nxt2 * b9) * b5; + c1_nxt1 = ((c1_nxt1 - c1_nxt3 * b13) - c1_nxt2 * b9) * b5; + + c0 = (((c0 - c0_nxt3 * b12) - c0_nxt2 * b8) - c0_nxt1 * b4) * b0; + c1 = (((c1 - c1_nxt3 * b12) - c1_nxt2 * b8) - c1_nxt1 * b4) * b0; + + *(a + 0) = c0; + *(a + 1) = c1; + *(a + 2) = c0_nxt1; + *(a + 3) = c1_nxt1; + *(a + 4) = c0_nxt2; + *(a + 5) = c1_nxt2; + *(a + 6) = c0_nxt3; + *(a + 7) = c1_nxt3; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 0 + 1 * ldc) = c0_nxt1; + *(c + 1 + 1 * ldc) = c1_nxt1; + *(c + 0 + 2 * ldc) = c0_nxt2; + *(c + 1 + 2 * ldc) = c1_nxt2; + *(c + 0 + 3 * ldc) = c0_nxt3; + *(c + 1 + 3 * ldc) = c1_nxt3; +} + +static void ssolve_2x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + BLASLONG k; + FLOAT *aa = a, *bb = b; + FLOAT b0, b2, b3, c0, c1, c0_nxt, c1_nxt; + + c0 = *(c + 0); + c1 = *(c + 1); + c0_nxt = *(c + 0 + ldc); + c1_nxt = *(c + 1 + ldc); + + for (k = 0; k < bk; k++) + { + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; + c0_nxt -= aa[0] * bb[1]; + c1_nxt -= aa[1] * bb[1]; + + aa += 2; + bb += 2; + } + + a -= 4; + b -= 4; + + b3 = *(b + 3); + b2 = *(b + 2); + b0 = *b; + + c0_nxt *= b3; + c1_nxt *= b3; + + c0 -= c0_nxt * b2; + c1 -= c1_nxt * b2; + + c0 *= b0; + c1 *= b0; + + *(a + 0) = c0; + *(a + 1) = c1; + *(a + 2) = c0_nxt; + *(a + 3) = c1_nxt; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 0 + ldc) = c0_nxt; + *(c + 1 + ldc) = c1_nxt; +} + +static void ssolve_2x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + BLASLONG k; + FLOAT *aa = a, *bb = b; + FLOAT b0, c0, c1; + + c0 = *(c + 0); + c1 = *(c + 1); + + for (k = 0; k < bk; k++) + { + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; + + aa += 2; + bb += 1; + } + + a -= 2; + b -= 1; + + b0 = *b; + + c0 *= b0; + c1 *= b0; + + *(a + 0) = c0; + *(a + 1) = c1; + + *(c + 0) = c0; + *(c + 1) = c1; +} + +static void ssolve_1x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + BLASLONG k; + FLOAT *aa = a, *bb = b; + FLOAT b0, b8, b9, b16, b17, b18, b24, b25, b26, b27, b32, b33, b34, b35; + FLOAT b36, b40, b41, b42, b43, b44, b45, b48, b49, b50, b51, b52, b53, b54; + FLOAT b56, b57, b58, b59, b60, b61, b62, b63; + FLOAT c0, c1, c2, c3, c4, c5, c6, c7; + + c0 = *(c + 0); + c1 = *(c + 1 * ldc); + c2 = *(c + 2 * ldc); + c3 = *(c + 3 * ldc); + c4 = *(c + 4 * ldc); + c5 = *(c + 5 * ldc); + c6 = *(c + 6 * ldc); + c7 = *(c + 7 * ldc); + + for (k = 0; k < bk; k++) + { + c0 -= aa[0] * bb[0]; + c1 -= aa[0] * bb[1]; + c2 -= aa[0] * bb[2]; + c3 -= aa[0] * bb[3]; + c4 -= aa[0] * bb[4]; + c5 -= aa[0] * bb[5]; + c6 -= aa[0] * bb[6]; + c7 -= aa[0] * bb[7]; + + aa += 1; + bb += 8; + } + + a -= 8; + b -= 64; + + b0 = *(b + 0); + b8 = *(b + 8); + b9 = *(b + 9); + b16 = *(b + 16); + b17 = *(b + 17); + b18 = *(b + 18); + b24 = *(b + 24); + b25 = *(b + 25); + b26 = *(b + 26); + b27 = *(b + 27); + b32 = *(b + 32); + b33 = *(b + 33); + b34 = *(b + 34); + b35 = *(b + 35); + b36 = *(b + 36); + b40 = *(b + 40); + b41 = *(b + 41); + b42 = *(b + 42); + b43 = *(b + 43); + b44 = *(b + 44); + b45 = *(b + 45); + b48 = *(b + 48); + b49 = *(b + 49); + b50 = *(b + 50); + b51 = *(b + 51); + b52 = *(b + 52); + b53 = *(b + 53); + b54 = *(b + 54); + b56 = *(b + 56); + b57 = *(b + 57); + b58 = *(b + 58); + b59 = *(b + 59); + b60 = *(b + 60); + b61 = *(b + 61); + b62 = *(b + 62); + b63 = *(b + 63); + + c7 *= b63; + + c6 -= c7 
* b62; + c6 *= b54; + + c5 -= c7 * b61; + c5 -= c6 * b53; + c5 *= b45; + + c4 -= c7 * b60; + c4 -= c6 * b52; + c4 -= c5 * b44; + c4 *= b36; + + c3 -= c7 * b59; + c3 -= c6 * b51; + c3 -= c5 * b43; + c3 -= c4 * b35; + c3 *= b27; + + c2 -= c7 * b58; + c2 -= c6 * b50; + c2 -= c5 * b42; + c2 -= c4 * b34; + c2 -= c3 * b26; + c2 *= b18; + + c1 -= c7 * b57; + c1 -= c6 * b49; + c1 -= c5 * b41; + c1 -= c4 * b33; + c1 -= c3 * b25; + c1 -= c2 * b17; + c1 *= b9; + + c0 -= c7 * b56; + c0 -= c6 * b48; + c0 -= c5 * b40; + c0 -= c4 * b32; + c0 -= c3 * b24; + c0 -= c2 * b16; + c0 -= c1 * b8; + c0 *= b0; + + *(a + 0) = c0; + *(a + 1) = c1; + *(a + 2) = c2; + *(a + 3) = c3; + *(a + 4) = c4; + *(a + 5) = c5; + *(a + 6) = c6; + *(a + 7) = c7; + + *(c + 0) = c0; + *(c + 1 * ldc) = c1; + *(c + 2 * ldc) = c2; + *(c + 3 * ldc) = c3; + *(c + 4 * ldc) = c4; + *(c + 5 * ldc) = c5; + *(c + 6 * ldc) = c6; + *(c + 7 * ldc) = c7; +} + +static void ssolve_1x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + BLASLONG k; + FLOAT *aa = a, *bb = b; + FLOAT b0, b4, b5, b8, b9, b10, b12, b13, b14, b15; + FLOAT c0, c1, c2, c3; + + c0 = *(c + 0); + c1 = *(c + 1 * ldc); + c2 = *(c + 2 * ldc); + c3 = *(c + 3 * ldc); + + for (k = 0; k < bk; k++) + { + c0 -= aa[0] * bb[0]; + c1 -= aa[0] * bb[1]; + c2 -= aa[0] * bb[2]; + c3 -= aa[0] * bb[3]; + + aa += 1; + bb += 4; + } + + a -= 4; + b -= 16; + + b0 = *b; + b4 = *(b + 4); + b5 = *(b + 5); + b8 = *(b + 8); + b9 = *(b + 9); + b10 = *(b + 10); + b12 = *(b + 12); + b13 = *(b + 13); + b14 = *(b + 14); + b15 = *(b + 15); + + c3 *= b15; + c2 = (c2 - c3 * b14) * b10; + c1 = ((c1 - c3 * b13) - c2 * b9) * b5; + c0 = (((c0 - c3 * b12) - c2 * b8) - c1 * b4) * b0; + + *(a + 0) = c0; + *(a + 1) = c1; + *(a + 2) = c2; + *(a + 3) = c3; + + *(c) = c0; + *(c + 1 * ldc) = c1; + *(c + 2 * ldc) = c2; + *(c + 3 * ldc) = c3; +} + +static void ssolve_1x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + BLASLONG k; + FLOAT *aa = a, *bb = b; + FLOAT b0, b2, b3, c0, c1; + + c0 = *(c + 0); + c1 = *(c + ldc); + + for (k = 0; k < bk; k++) + { + c0 -= aa[0] * bb[0]; + c1 -= aa[0] * bb[1]; + + aa += 1; + bb += 2; + } + + a -= 2; + b -= 4; + + b3 = *(b + 3); + b2 = *(b + 2); + b0 = *b; + + c1 *= b3; + + c0 -= c1 * b2; + c0 *= b0; + + *(a + 0) = c0; + *(a + 1) = c1; + + *(c + 0) = c0; + *(c + ldc) = c1; +} + +static void ssolve_1x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + BLASLONG k; + + for (k = 0; k < bk; k++) + { + *c -= a[k] * b[k]; + } + + *c *= *(a - 1); + *(b - 1) = *c; +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, + FLOAT *c, BLASLONG ldc, BLASLONG offset) +{ + FLOAT *aa, *cc; + BLASLONG i, j, kk; + + kk = n - offset; + c += n * ldc; + b += n * k; + + if (n & 7) + { + if (n & 1) + { + aa = a; + b -= k; + c -= ldc; + cc = c; + + for (i = (m >> 3); i--;) + { + ssolve_8x1_rt_msa(aa + 8 * kk, b + kk, cc, (k - kk)); + + aa += 8 * k; + cc += 8; + } + + if (m & 7) + { + if (m & 4) + { + ssolve_4x1_rt_msa(aa + 4 * kk, b + kk, cc, (k - kk)); + + aa += 4 * k; + cc += 4; + } + + if (m & 2) + { + ssolve_2x1_rt_msa(aa + 2 * kk, b + kk, cc, (k - kk)); + + aa += 2 * k; + cc += 2; + } + + if (m & 1) + { + ssolve_1x1_rt_msa(b + kk, aa + kk, cc, (k - kk)); + + aa += k; + cc += 1; + } + } + + kk -= 1; + } + + if (n & 2) + { + aa = a; + b -= 2 * k; + c -= 2 * ldc; + cc = c; + + for (i = (m >> 3); i--;) + { + ssolve_8x2_rt_msa(aa + 8 * kk, b + 2 * kk, cc, ldc, (k - kk)); + + aa += 8 * k; + cc += 8; + } + + if (m & 7) + { + if (m & 4) + { + 
ssolve_4x2_rt_msa(aa + 4 * kk, b + 2 * kk, cc, ldc, (k - kk)); + + aa += 4 * k; + cc += 4; + } + + if (m & 2) + { + ssolve_2x2_rt_msa(aa + 2 * kk, b + 2 * kk, cc, ldc, (k - kk)); + + aa += 2 * k; + cc += 2; + } + + if (m & 1) + { + ssolve_1x2_rt_msa(aa + kk, b + 2 * kk, cc, ldc, (k - kk)); + + aa += k; + cc += 1; + } + } + + kk -= 2; + } + + if (n & 4) + { + aa = a; + b -= 4 * k; + c -= 4 * ldc; + cc = c; + + for (i = (m >> 3); i--;) + { + ssolve_8x4_rt_msa(aa + 8 * kk, b + 4 * kk, cc, ldc, (k - kk)); + + aa += 8 * k; + cc += 8; + } + + if (m & 7) + { + if (m & 4) + { + ssolve_4x4_rt_msa(aa + 4 * kk, b + 4 * kk, cc, ldc, (k - kk)); + + aa += 4 * k; + cc += 4; + } + + if (m & 2) + { + ssolve_2x4_rt_msa(aa + 2 * kk, b + 4 * kk, cc, ldc, (k - kk)); + + aa += 2 * k; + cc += 2; + } + + if (m & 1) + { + ssolve_1x4_rt_msa(aa + kk, b + 4 * kk, cc, ldc, (k - kk)); + + aa += k; + cc += 1; + } + } + + kk -= 4; + } + } + + for (j = (n >> 3); j--;) + { + aa = a; + b -= 8 * k; + c -= 8 * ldc; + cc = c; + + for (i = (m >> 3); i--;) + { + ssolve_8x8_rt_msa(aa + 8 * kk, b + 8 * kk, cc, ldc, (k - kk)); + + aa += 8 * k; + cc += 8; + } + + if (m & 7) + { + if (m & 4) + { + ssolve_4x8_rt_msa(aa + 4 * kk, b + 8 * kk, cc, ldc, (k - kk)); + + aa += 4 * k; + cc += 4; + } + + if (m & 2) + { + ssolve_2x8_rt_msa(aa + 2 * kk, b + 8 * kk, cc, ldc, (k - kk)); + + aa += 2 * k; + cc += 2; + } + + if (m & 1) + { + ssolve_1x8_rt_msa(aa + kk, b + 8 * kk, cc, ldc, (k - kk)); + + aa += k; + cc += 1; + } + } + + kk -= 8; + } + + return 0; +} diff --git a/kernel/mips/swap.c b/kernel/mips/swap.c new file mode 100644 index 0000000000..23f7a35802 --- /dev/null +++ b/kernel/mips/swap.c @@ -0,0 +1,55 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp; + + if ( n < 0 ) return(0); + + while(i < n) + { + + temp = x[ix] ; + x[ix] = y[iy] ; + y[iy] = temp ; + + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/mips/symv_L.c b/kernel/mips/symv_L.c new file mode 100644 index 0000000000..6a83d73f9d --- /dev/null +++ b/kernel/mips/symv_L.c @@ -0,0 +1,70 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i; + BLASLONG ix,iy; + BLASLONG jx,jy; + BLASLONG j; + FLOAT temp1; + FLOAT temp2; + +#if 0 + if ( m != offset ) + printf("Symv_L: m=%d offset=%d\n",m,offset); +#endif + + jx = 0; + jy = 0; + + for (j=0; j + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(0.0); + + inc_x2 = 2 * inc_x; + + maxf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + maxf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return(maxf); +} + + diff --git a/kernel/mips/zamin.c b/kernel/mips/zamin.c new file mode 100644 index 0000000000..97c07da818 --- /dev/null +++ b/kernel/mips/zamin.c @@ -0,0 +1,70 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include <math.h> + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(0.0); + + inc_x2 = 2 * inc_x; + + minf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + minf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return(minf); +} + + diff --git a/kernel/mips/zasum.c b/kernel/mips/zasum.c new file mode 100644 index 0000000000..77a2ed6855 --- /dev/null +++ b/kernel/mips/zasum.c @@ -0,0 +1,62 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED.
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include <math.h> + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT sumf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(sumf); + + inc_x2 = 2 * inc_x; + + n *= inc_x2; + while(i < n) + { + sumf += CABS1(x,i); + i += inc_x2; + } + return(sumf); +} + + diff --git a/kernel/mips/zasum_msa.c b/kernel/mips/zasum_msa.c new file mode 100644 index 0000000000..c84d48ecbc --- /dev/null +++ b/kernel/mips/zasum_msa.c @@ -0,0 +1,170 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/ + +#include "common.h" +#include +#include "macros_msa.h" + +#define AND_VEC_D(in) ((v2f64) ((v2i64) in & and_vec)) + +#define PROCESS_ZD(inc_val) \ + if (n > 8) \ + { \ + n -= 8; \ + \ + LD_DP8_INC(x, inc_val, src0, src1, src2, \ + src3, src4, src5, src6, src7); \ + \ + sum_abs0 = AND_VEC_D(src0); \ + sum_abs1 = AND_VEC_D(src1); \ + sum_abs2 = AND_VEC_D(src2); \ + sum_abs3 = AND_VEC_D(src3); \ + sum_abs0 += AND_VEC_D(src4); \ + sum_abs1 += AND_VEC_D(src5); \ + sum_abs2 += AND_VEC_D(src6); \ + sum_abs3 += AND_VEC_D(src7); \ + } \ + else \ + { \ + sum_abs0 = zero_v; \ + sum_abs1 = zero_v; \ + sum_abs2 = zero_v; \ + sum_abs3 = zero_v; \ + } \ + \ + for (i = (n >> 3); i--;) \ + { \ + LD_DP8_INC(x, inc_val, src0, src1, src2, \ + src3, src4, src5, src6, src7); \ + \ + sum_abs0 += AND_VEC_D(src0); \ + sum_abs1 += AND_VEC_D(src1); \ + sum_abs2 += AND_VEC_D(src2); \ + sum_abs3 += AND_VEC_D(src3); \ + sum_abs0 += AND_VEC_D(src4); \ + sum_abs1 += AND_VEC_D(src5); \ + sum_abs2 += AND_VEC_D(src6); \ + sum_abs3 += AND_VEC_D(src7); \ + } \ + \ + if (n & 7) \ + { \ + if ((n & 4) && (n & 2) && (n & 1)) \ + { \ + LD_DP7_INC(x, inc_val, src0, src1, src2, \ + src3, src4, src5, src6); \ + \ + sum_abs0 += AND_VEC_D(src0); \ + sum_abs1 += AND_VEC_D(src1); \ + sum_abs2 += AND_VEC_D(src2); \ + sum_abs3 += AND_VEC_D(src3); \ + sum_abs0 += AND_VEC_D(src4); \ + sum_abs1 += AND_VEC_D(src5); \ + sum_abs2 += AND_VEC_D(src6); \ + } \ + else if ((n & 4) && (n & 2)) \ + { \ + LD_DP6_INC(x, inc_val, src0, src1, src2, \ + src3, src4, src5); \ + \ + sum_abs0 += AND_VEC_D(src0); \ + sum_abs1 += AND_VEC_D(src1); \ + sum_abs2 += AND_VEC_D(src2); \ + sum_abs3 += AND_VEC_D(src3); \ + sum_abs0 += AND_VEC_D(src4); \ + sum_abs1 += AND_VEC_D(src5); \ + } \ + else if ((n & 4) && (n & 1)) \ + { \ + LD_DP5_INC(x, inc_val, src0, src1, src2, \ + src3, src4); \ + \ + sum_abs0 += AND_VEC_D(src0); \ + sum_abs1 += AND_VEC_D(src1); \ + sum_abs2 += AND_VEC_D(src2); \ + sum_abs3 += AND_VEC_D(src3); \ + sum_abs0 += AND_VEC_D(src4); \ + } \ + else if ((n & 2) && (n & 1)) \ + { \ + LD_DP3_INC(x, inc_val, src0, src1, src2); \ + \ + sum_abs0 += AND_VEC_D(src0); \ + sum_abs1 += AND_VEC_D(src1); \ + sum_abs2 += AND_VEC_D(src2); \ + } \ + else if (n & 4) \ + { \ + LD_DP4_INC(x, inc_val, src0, src1, src2, \ + src3); \ + \ + sum_abs0 += AND_VEC_D(src0); \ + sum_abs1 += AND_VEC_D(src1); \ + sum_abs2 += AND_VEC_D(src2); \ + sum_abs3 += AND_VEC_D(src3); \ + } \ + else if (n & 2) \ + { \ + LD_DP2_INC(x, inc_val, src0, src1); \ + \ + sum_abs0 += AND_VEC_D(src0); \ + sum_abs1 += AND_VEC_D(src1); \ + } \ + else if (n & 1) \ + { \ + src0 = LD_DP(x); \ + \ + sum_abs0 += AND_VEC_D(src0); \ + } \ + } \ + \ + sum_abs0 += sum_abs1 + sum_abs2 + sum_abs3; \ + sumf = sum_abs0[0] + sum_abs0[1]; + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i; + FLOAT sumf = 0.0; + v2f64 src0, src1, src2, src3, src4, src5, src6, src7; + v2f64 sum_abs0, sum_abs1, sum_abs2, sum_abs3; + v2f64 zero_v = {0}; + v2i64 and_vec = {0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF}; + + if (n <= 0 || inc_x <= 0) return (sumf); + + if (1 == inc_x) + { + PROCESS_ZD(2); + } + else + { + inc_x *= 2; + PROCESS_ZD(inc_x); + } + + return (sumf); +} diff --git a/kernel/mips/zaxpby.c b/kernel/mips/zaxpby.c new file mode 100644 index 0000000000..97452e942e --- /dev/null +++ b/kernel/mips/zaxpby.c @@ -0,0 +1,113 @@ +/*************************************************************************** +Copyright (c) 2016, The 
OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FLOAT beta_r, FLOAT beta_i,FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix,iy; + FLOAT temp; + BLASLONG inc_x2, inc_y2; + + if ( n <= 0 ) return(0); + + ix = 0; + iy = 0; + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + if ( beta_r == 0.0 && beta_i == 0.0) + { + if ( alpha_r == 0.0 && alpha_i == 0.0 ) + { + + while(i < n) + { + y[iy] = 0.0 ; + y[iy+1] = 0.0 ; + iy += inc_y2 ; + i++ ; + } + + } + else + { + + while(i < n) + { + y[iy] = ( alpha_r * x[ix] - alpha_i * x[ix+1] ) ; + y[iy+1] = ( alpha_r * x[ix+1] + alpha_i * x[ix] ) ; + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + } + + + } + + } + else + { + if ( alpha_r == 0.0 && alpha_i == 0.0 ) + { + + while(i < n) + { + temp = ( beta_r * y[iy] - beta_i * y[iy+1] ) ; + y[iy+1] = ( beta_r * y[iy+1] + beta_i * y[iy] ) ; + y[iy] = temp; + iy += inc_y2 ; + i++ ; + } + + } + else + { + + while(i < n) + { + temp = ( alpha_r * x[ix] - alpha_i * x[ix+1] ) + ( beta_r * y[iy] - beta_i * y[iy+1] ) ; + y[iy+1] = ( alpha_r * x[ix+1] + alpha_i * x[ix] ) + ( beta_r * y[iy+1] + beta_i * y[iy] ) ; + y[iy] = temp; + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + } + + + } + + + + } + return(0); + +} + + diff --git a/kernel/mips/zaxpy.c b/kernel/mips/zaxpy.c new file mode 100644 index 0000000000..f0fbab4a26 --- /dev/null +++ b/kernel/mips/zaxpy.c @@ -0,0 +1,64 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix,iy; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if ( n < 0 ) return(0); + if ( da_r == 0.0 && da_i == 0.0 ) return(0); + + ix = 0; + iy = 0; + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + while(i < n) + { +#if !defined(CONJ) + y[iy] += ( da_r * x[ix] - da_i * x[ix+1] ) ; + y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; +#else + y[iy] += ( da_r * x[ix] + da_i * x[ix+1] ) ; + y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; +#endif + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/mips/zcopy.c b/kernel/mips/zcopy.c new file mode 100644 index 0000000000..6bb6e33b62 --- /dev/null +++ b/kernel/mips/zcopy.c @@ -0,0 +1,56 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if ( n < 0 ) return(0); + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + while(i < n) + { + + y[iy] = x[ix] ; + y[iy+1] = x[ix+1] ; + ix += inc_x2; + iy += inc_y2; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/mips/zdot.c b/kernel/mips/zdot.c new file mode 100644 index 0000000000..da9ec70767 --- /dev/null +++ b/kernel/mips/zdot.c @@ -0,0 +1,75 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#ifndef _MSC_VER +#include <complex.h> +FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +#else +OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +#endif +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT dot[2]; + OPENBLAS_COMPLEX_FLOAT result; + BLASLONG inc_x2; + BLASLONG inc_y2; + + dot[0]=0.0; + dot[1]=0.0; + + CREAL(result) = 0.0 ; + CIMAG(result) = 0.0 ; + + if ( n < 1 ) return(result); + + inc_x2 = 2 * inc_x ; + inc_y2 = 2 * inc_y ; + + while(i < n) + { +#if !defined(CONJ) + dot[0] += ( x[ix] * y[iy] - x[ix+1] * y[iy+1] ) ; + dot[1] += ( x[ix+1] * y[iy] + x[ix] * y[iy+1] ) ; +#else + dot[0] += ( x[ix] * y[iy] + x[ix+1] * y[iy+1] ) ; + dot[1] -= ( x[ix+1] * y[iy] - x[ix] * y[iy+1] ) ; +#endif + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + CREAL(result) = dot[0]; + CIMAG(result) = dot[1]; + return(result); + +} + + diff --git a/kernel/mips/zdot_msa.c b/kernel/mips/zdot_msa.c new file mode 100644 index 0000000000..b94509392b --- /dev/null +++ b/kernel/mips/zdot_msa.c @@ -0,0 +1,227 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +#if !defined(CONJ) + #define OP2 += + #define OP3 - + #define OP4 + +#else + #define OP2 -= + #define OP3 + + #define OP4 - +#endif + +#define DOT16_KERNEL(OPR0, OPR1) \ + dot0 += (vx0r * vy0r); \ + dot0 OPR0## = (vx0i * vy0i); \ + dot1 OPR1## = (vx0i * vy0r); \ + dot1 += (vx0r * vy0i); \ + \ + dot0 += (vx1r * vy1r); \ + dot0 OPR0## = (vx1i * vy1i); \ + dot1 OPR1## = (vx1i * vy1r); \ + dot1 += (vx1r * vy1i); \ + \ + dot0 += (vx2r * vy2r); \ + dot0 OPR0## = (vx2i * vy2i); \ + dot1 OPR1## = (vx2i * vy2r); \ + dot1 += (vx2r * vy2i); \ + \ + dot0 += (vx3r * vy3r); \ + dot0 OPR0## = (vx3i * vy3i); \ + dot1 OPR1## = (vx3i * vy3r); \ + dot1 += (vx3r * vy3i); + +#define DOT12_KERNEL(OPR0, OPR1) \ + dot0 += (vx0r * vy0r); \ + dot0 OPR0## = (vx0i * vy0i); \ + dot1 OPR1## = (vx0i * vy0r); \ + dot1 += (vx0r * vy0i); \ + \ + dot0 += (vx1r * vy1r); \ + dot0 OPR0## = (vx1i * vy1i); \ + dot1 OPR1## = (vx1i * vy1r); \ + dot1 += (vx1r * vy1i); \ + \ + dot0 += (vx2r * vy2r); \ + dot0 OPR0## = (vx2i * vy2i); \ + dot1 OPR1## = (vx2i * vy2r); \ + dot1 += (vx2r * vy2i); + +#define DOT8_KERNEL(OPR0, OPR1) \ + dot0 += (vx0r * vy0r); \ + dot0 OPR0## = (vx0i * vy0i); \ + dot1 OPR1## = (vx0i * vy0r); \ + dot1 += (vx0r * vy0i); \ + \ + dot0 += (vx1r * vy1r); \ + dot0 OPR0## = (vx1i * vy1i); \ + dot1 OPR1## = (vx1i * vy1r); \ + dot1 += (vx1r * vy1i); + +#define DOT4_KERNEL(OPR0, OPR1) \ + dot0 += (vx0r * vy0r); \ + dot0 OPR0## = (vx0i * vy0i); \ + dot1 OPR1## = (vx0i * vy0r); \ + dot1 += (vx0r * vy0i); + +/* return double, x,y double */ +/* zdotc - CONJ */ +/* zdotu - !CONJ */ +#ifndef _MSC_VER +#include +FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +#else +OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +#endif +{ + BLASLONG i = 0; + FLOAT dot[2]; + BLASLONG inc_x2; + BLASLONG inc_y2; + v2f64 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7; + v2f64 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; + v2f64 vx0r, vx0i, vx1r, vx1i, vx2r, vx2i, vx3r, vx3i; + v2f64 vy0r, vy0i, vy1r, vy1i, vy2r, vy2i, vy3r, vy3i; + v2f64 dot0 = {0, 0}; + v2f64 dot1 = {0, 0}; + v2f64 zero = {0, 0}; + openblas_complex_double result; + + dot[0] = 0.0; + dot[1] = 0.0; + + __real__(result) = 0.0; + __imag__(result) = 0.0; + + if ( n < 1 ) return(result); + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + for (i = (n >> 3); i--;) + { + LD_DP8_INC(x, inc_x2, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7); + LD_DP8_INC(y, inc_y2, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7); + + PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i); + PCKEVOD_D2_DP(vx3, vx2, vx1r, vx1i); + PCKEVOD_D2_DP(vx5, vx4, vx2r, vx2i); + PCKEVOD_D2_DP(vx7, vx6, vx3r, vx3i); + + PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i); + PCKEVOD_D2_DP(vy3, vy2, vy1r, vy1i); + PCKEVOD_D2_DP(vy5, vy4, vy2r, vy2i); + PCKEVOD_D2_DP(vy7, vy6, vy3r, vy3i); + + #if !defined(CONJ) + DOT16_KERNEL(-, +); + #else + DOT16_KERNEL(+, -); + #endif + } + + if (n & 7) + { + if ((n & 4) && (n & 2)) + { + LD_DP4_INC(x, inc_x2, vx0, vx1, vx2, vx3); + LD_DP4_INC(y, inc_y2, vy0, vy1, vy2, vy3); + LD_DP2_INC(x, inc_x2, vx4, vx5); + LD_DP2_INC(y, inc_y2, vy4, vy5); + + PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i); + PCKEVOD_D2_DP(vx3, vx2, vx1r, vx1i); + PCKEVOD_D2_DP(vx5, vx4, vx2r, vx2i); + + PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i); + PCKEVOD_D2_DP(vy3, vy2, vy1r, vy1i); + PCKEVOD_D2_DP(vy5, vy4, vy2r, vy2i); + + #if !defined(CONJ) + DOT12_KERNEL(-, +); + #else + 
DOT12_KERNEL(+, -); + #endif + } + else if (n & 4) + { + LD_DP4_INC(x, inc_x2, vx0, vx1, vx2, vx3); + LD_DP4_INC(y, inc_y2, vy0, vy1, vy2, vy3); + + PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i); + PCKEVOD_D2_DP(vx3, vx2, vx1r, vx1i); + + PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i); + PCKEVOD_D2_DP(vy3, vy2, vy1r, vy1i); + + #if !defined(CONJ) + DOT8_KERNEL(-, +); + #else + DOT8_KERNEL(+, -); + #endif + } + else if (n & 2) + { + LD_DP2_INC(x, inc_x2, vx0, vx1); + LD_DP2_INC(y, inc_y2, vy0, vy1); + PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i); + PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i); + + #if !defined(CONJ) + DOT4_KERNEL(-, +); + #else + DOT4_KERNEL(+, -); + #endif + } + + if (n & 1) + { + vx0 = LD_DP(x); + vy0 = LD_DP(y); + PCKEVOD_D2_DP(zero, vx0, vx0r, vx0i); + PCKEVOD_D2_DP(zero, vy0, vy0r, vy0i); + + #if !defined(CONJ) + DOT4_KERNEL(-, +); + #else + DOT4_KERNEL(+, -); + #endif + } + } + + dot[0] += (dot0[0] + dot0[1]); + dot[1] += (dot1[0] + dot1[1]); + + __real__(result) = dot[0]; + __imag__(result) = dot[1]; + + return(result); +} diff --git a/kernel/mips/zgemm_kernel_4x4_msa.c b/kernel/mips/zgemm_kernel_4x4_msa.c new file mode 100644 index 0000000000..a185c69dd2 --- /dev/null +++ b/kernel/mips/zgemm_kernel_4x4_msa.c @@ -0,0 +1,1589 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +#define ZGEMM_KERNEL_4X4_MSA(OP0, OP1, OP2, OP3, OP4) \ +{ \ + LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); \ + LD_DP4_INC(pb0, 2, src_b0, src_b1, src_b2, src_b3); \ + \ + PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i); \ + PCKEVOD_D2_DP(src_a3, src_a2, src_a1r, src_a1i); \ + \ + /* 0th col */ \ + SPLATI_D2_DP(src_b0, src_br, src_bi); \ + res0_r OP0## = src_a0r * src_br; \ + res0_r OP1## = src_a0i * src_bi; \ + res0_i OP2## = OP4 src_a0r * src_bi; \ + res0_i OP3## = src_a0i * src_br; \ + \ + res1_r OP0## = src_a1r * src_br; \ + res1_r OP1## = src_a1i * src_bi; \ + res1_i OP2## = OP4 src_a1r * src_bi; \ + res1_i OP3## = src_a1i * src_br; \ + \ + /* 1st col */ \ + SPLATI_D2_DP(src_b1, src_br, src_bi); \ + res2_r OP0## = src_a0r * src_br; \ + res2_r OP1## = src_a0i * src_bi; \ + res2_i OP2## = OP4 src_a0r * src_bi; \ + res2_i OP3## = src_a0i * src_br; \ + \ + res3_r OP0## = src_a1r * src_br; \ + res3_r OP1## = src_a1i * src_bi; \ + res3_i OP2## = OP4 src_a1r * src_bi; \ + res3_i OP3## = src_a1i * src_br; \ + \ + /* 2nd col */ \ + SPLATI_D2_DP(src_b2, src_br, src_bi); \ + res4_r OP0## = src_a0r * src_br; \ + res4_r OP1## = src_a0i * src_bi; \ + res4_i OP2## = OP4 src_a0r * src_bi; \ + res4_i OP3## = src_a0i * src_br; \ + \ + res5_r OP0## = src_a1r * src_br; \ + res5_r OP1## = src_a1i * src_bi; \ + res5_i OP2## = OP4 src_a1r * src_bi; \ + res5_i OP3## = src_a1i * src_br; \ + \ + /* 3rd col */ \ + SPLATI_D2_DP(src_b3, src_br, src_bi); \ + res6_r OP0## = src_a0r * src_br; \ + res6_r OP1## = src_a0i * src_bi; \ + res6_i OP2## = OP4 src_a0r * src_bi; \ + res6_i OP3## = src_a0i * src_br; \ + \ + res7_r OP0## = src_a1r * src_br; \ + res7_r OP1## = src_a1i * src_bi; \ + res7_i OP2## = OP4 src_a1r * src_bi; \ + res7_i OP3## = src_a1i * src_br; \ +} + +#define ZGEMM_KERNEL_2X4_MSA(OP0, OP1, OP2, OP3, OP4) \ +{ \ + LD_DP2_INC(pa0, 2, src_a0, src_a1); \ + LD_DP4_INC(pb0, 2, src_b0, src_b1, src_b2, src_b3); \ + \ + PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i); \ + \ + /* 0th col */ \ + SPLATI_D2_DP(src_b0, src_br, src_bi); \ + res0_r OP0## = src_a0r * src_br; \ + res0_r OP1## = src_a0i * src_bi; \ + res0_i OP2## = OP4 src_a0r * src_bi; \ + res0_i OP3## = src_a0i * src_br; \ + \ + /* 1st col */ \ + SPLATI_D2_DP(src_b1, src_br, src_bi); \ + res2_r OP0## = src_a0r * src_br; \ + res2_r OP1## = src_a0i * src_bi; \ + res2_i OP2## = OP4 src_a0r * src_bi; \ + res2_i OP3## = src_a0i * src_br; \ + \ + /* 2nd col */ \ + SPLATI_D2_DP(src_b2, src_br, src_bi); \ + res4_r OP0## = src_a0r * src_br; \ + res4_r OP1## = src_a0i * src_bi; \ + res4_i OP2## = OP4 src_a0r * src_bi; \ + res4_i OP3## = src_a0i * src_br; \ + \ + /* 3rd col */ \ + SPLATI_D2_DP(src_b3, src_br, src_bi); \ + res6_r OP0## = src_a0r * src_br; \ + res6_r OP1## = src_a0i * src_bi; \ + res6_i OP2## = OP4 src_a0r * src_bi; \ + res6_i OP3## = src_a0i * src_br; \ +} + +#define ZGEMM_KERNEL_1X4_MSA(OP0, OP1, OP2, OP3, OP4) \ +{ \ + src_a0 = LD_DP(pa0); \ + LD_DP4_INC(pb0, 2, src_b0, src_b1, src_b2, src_b3); \ + \ + PCKEVOD_D2_DP(src_a0, src_a0, src_a0r, src_a0i); \ + \ + /* 0th and 1st col */ \ + PCKEVOD_D2_DP(src_b1, src_b0, src_br, src_bi); \ + res0_r OP0## = src_a0r * src_br; \ + res0_r OP1## = src_a0i * src_bi; \ + res0_i OP2## = OP4 src_a0r * src_bi; \ + res0_i OP3## = src_a0i * src_br; \ + \ + /* 2nd and 3rd col */ \ + PCKEVOD_D2_DP(src_b3, src_b2, src_br, src_bi); \ + res1_r OP0## = src_a0r * src_br; \ + res1_r 
OP1## = src_a0i * src_bi; \ + res1_i OP2## = OP4 src_a0r * src_bi; \ + res1_i OP3## = src_a0i * src_br; \ +} + +#define ZGEMM_KERNEL_4X2_MSA(OP0, OP1, OP2, OP3, OP4) \ +{ \ + LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); \ + LD_DP2_INC(pb0, 2, src_b0, src_b1); \ + \ + PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i); \ + PCKEVOD_D2_DP(src_a3, src_a2, src_a1r, src_a1i); \ + \ + /* 0th col */ \ + SPLATI_D2_DP(src_b0, src_br, src_bi); \ + res0_r OP0## = src_a0r * src_br; \ + res0_r OP1## = src_a0i * src_bi; \ + res0_i OP2## = OP4 src_a0r * src_bi; \ + res0_i OP3## = src_a0i * src_br; \ + \ + res1_r OP0## = src_a1r * src_br; \ + res1_r OP1## = src_a1i * src_bi; \ + res1_i OP2## = OP4 src_a1r * src_bi; \ + res1_i OP3## = src_a1i * src_br; \ + \ + /* 1st col */ \ + SPLATI_D2_DP(src_b1, src_br, src_bi); \ + res2_r OP0## = src_a0r * src_br; \ + res2_r OP1## = src_a0i * src_bi; \ + res2_i OP2## = OP4 src_a0r * src_bi; \ + res2_i OP3## = src_a0i * src_br; \ + \ + res3_r OP0## = src_a1r * src_br; \ + res3_r OP1## = src_a1i * src_bi; \ + res3_i OP2## = OP4 src_a1r * src_bi; \ + res3_i OP3## = src_a1i * src_br; \ +} + +#define ZGEMM_KERNEL_2X2_MSA(OP0, OP1, OP2, OP3, OP4) \ +{ \ + LD_DP2_INC(pa0, 2, src_a0, src_a1); \ + LD_DP2_INC(pb0, 2, src_b0, src_b1); \ + \ + PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i); \ + \ + /* 0th col */ \ + SPLATI_D2_DP(src_b0, src_br, src_bi); \ + res0_r OP0## = src_a0r * src_br; \ + res0_r OP1## = src_a0i * src_bi; \ + res0_i OP2## = OP4 src_a0r * src_bi; \ + res0_i OP3## = src_a0i * src_br; \ + \ + /* 1st col */ \ + SPLATI_D2_DP(src_b1, src_br, src_bi); \ + res2_r OP0## = src_a0r * src_br; \ + res2_r OP1## = src_a0i * src_bi; \ + res2_i OP2## = OP4 src_a0r * src_bi; \ + res2_i OP3## = src_a0i * src_br; \ +} + +#define ZGEMM_KERNEL_1X2_MSA(OP0, OP1, OP2, OP3, OP4) \ +{ \ + src_a0 = LD_DP(pa0); \ + LD_DP2_INC(pb0, 2, src_b0, src_b1); \ + \ + PCKEVOD_D2_DP(src_a0, src_a0, src_a0r, src_a0i); \ + \ + /* 0th and 1st col */ \ + PCKEVOD_D2_DP(src_b1, src_b0, src_br, src_bi); \ + res0_r OP0## = src_a0r * src_br; \ + res0_r OP1## = src_a0i * src_bi; \ + res0_i OP2## = OP4 src_a0r * src_bi; \ + res0_i OP3## = src_a0i * src_br; \ +} + +#define ZGEMM_KERNEL_4X1_MSA(OP0, OP1, OP2, OP3, OP4) \ +{ \ + LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); \ + src_b0 = LD_DP(pb0); \ + \ + PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i); \ + PCKEVOD_D2_DP(src_a3, src_a2, src_a1r, src_a1i); \ + \ + /* 0th col */ \ + SPLATI_D2_DP(src_b0, src_br, src_bi); \ + res0_r OP0## = src_a0r * src_br; \ + res0_r OP1## = src_a0i * src_bi; \ + res0_i OP2## = OP4 src_a0r * src_bi; \ + res0_i OP3## = src_a0i * src_br; \ + \ + res1_r OP0## = src_a1r * src_br; \ + res1_r OP1## = src_a1i * src_bi; \ + res1_i OP2## = OP4 src_a1r * src_bi; \ + res1_i OP3## = src_a1i * src_br; \ +} + +#define ZGEMM_KERNEL_2X1_MSA(OP0, OP1, OP2, OP3, OP4) \ +{ \ + LD_DP2_INC(pa0, 2, src_a0, src_a1); \ + src_b0 = LD_DP(pb0); \ + \ + PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i); \ + \ + /* 0th col */ \ + SPLATI_D2_DP(src_b0, src_br, src_bi); \ + res0_r OP0## = src_a0r * src_br; \ + res0_r OP1## = src_a0i * src_bi; \ + res0_i OP2## = OP4 src_a0r * src_bi; \ + res0_i OP3## = src_a0i * src_br; \ +} + +#define ZGEMM_KERNEL_1X1(OP0, OP1, OP2, OP3, OP4) \ +{ \ + /* 0th col */ \ + a0_r = pa0[0]; \ + a0_i = pa0[1]; \ + b0_r = pb0[0]; \ + b0_i = pb0[1]; \ + \ + res0 OP0## = a0_r * b0_r; \ + res0 OP1## = a0_i * b0_i; \ + res1 OP2## = OP4 a0_r * b0_i; \ + res1 OP3## = a0_i * b0_r; \ +} + +#define ZGEMM_SCALE_4X4_MSA \ +{ \ + 
LD_DP4(pc0, 2, dst0, dst1, dst2, dst3); \ + \ + PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \ + PCKEVOD_D2_DP(dst3, dst2, dst1_r, dst1_i); \ + \ + dst0_r += alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i += alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + dst1_r += alpha_r * res1_r; \ + dst1_r -= alpha_i * res1_i; \ + dst1_i += alpha_r * res1_i; \ + dst1_i += alpha_i * res1_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \ + \ + LD_DP4(pc1, 2, dst4, dst5, dst6, dst7); \ + \ + PCKEVOD_D2_DP(dst5, dst4, dst0_r, dst0_i); \ + PCKEVOD_D2_DP(dst7, dst6, dst1_r, dst1_i); \ + \ + dst0_r += alpha_r * res2_r; \ + dst0_r -= alpha_i * res2_i; \ + dst0_i += alpha_r * res2_i; \ + dst0_i += alpha_i * res2_r; \ + \ + dst1_r += alpha_r * res3_r; \ + dst1_r -= alpha_i * res3_i; \ + dst1_i += alpha_r * res3_i; \ + dst1_i += alpha_i * res3_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5); \ + ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7); \ + \ + ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); \ + ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2); \ + \ + LD_DP4(pc2, 2, dst0, dst1, dst2, dst3); \ + \ + PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \ + PCKEVOD_D2_DP(dst3, dst2, dst1_r, dst1_i); \ + \ + dst0_r += alpha_r * res4_r; \ + dst0_r -= alpha_i * res4_i; \ + dst0_i += alpha_r * res4_i; \ + dst0_i += alpha_i * res4_r; \ + \ + dst1_r += alpha_r * res5_r; \ + dst1_r -= alpha_i * res5_i; \ + dst1_i += alpha_r * res5_i; \ + dst1_i += alpha_i * res5_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \ + \ + LD_DP4(pc3, 2, dst4, dst5, dst6, dst7); \ + \ + PCKEVOD_D2_DP(dst5, dst4, dst0_r, dst0_i); \ + PCKEVOD_D2_DP(dst7, dst6, dst1_r, dst1_i); \ + \ + dst0_r += alpha_r * res6_r; \ + dst0_r -= alpha_i * res6_i; \ + dst0_i += alpha_r * res6_i; \ + dst0_i += alpha_i * res6_r; \ + \ + dst1_r += alpha_r * res7_r; \ + dst1_r -= alpha_i * res7_i; \ + dst1_i += alpha_r * res7_i; \ + dst1_i += alpha_i * res7_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5); \ + ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7); \ + \ + ST_DP4_INC(dst0, dst1, dst2, dst3, pc2, 2); \ + ST_DP4_INC(dst4, dst5, dst6, dst7, pc3, 2); \ +} + +#define ZGEMM_SCALE_2X4_MSA \ +{ \ + LD_DP2(pc0, 2, dst0, dst1); \ + \ + PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \ + \ + dst0_r += alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i += alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + \ + LD_DP2(pc1, 2, dst2, dst3); \ + \ + PCKEVOD_D2_DP(dst3, dst2, dst0_r, dst0_i); \ + \ + dst0_r += alpha_r * res2_r; \ + dst0_r -= alpha_i * res2_i; \ + dst0_i += alpha_r * res2_i; \ + dst0_i += alpha_i * res2_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \ + \ + ST_DP2_INC(dst0, dst1, pc0, 2); \ + ST_DP2_INC(dst2, dst3, pc1, 2); \ + \ + LD_DP2(pc2, 2, dst0, dst1); \ + \ + PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \ + \ + dst0_r += alpha_r * res4_r; \ + dst0_r -= alpha_i * res4_i; \ + dst0_i += alpha_r * res4_i; \ + dst0_i += alpha_i * res4_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + \ + LD_DP2(pc3, 2, dst2, dst3); \ + \ + PCKEVOD_D2_DP(dst3, dst2, dst0_r, dst0_i); \ + \ + dst0_r += alpha_r * res6_r; \ + dst0_r -= alpha_i * res6_i; \ + dst0_i += alpha_r * res6_i; \ + dst0_i += alpha_i * res6_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \ + \ + ST_DP2_INC(dst0, dst1, pc2, 2); \ + ST_DP2_INC(dst2, dst3, pc3, 2); \ +} + +#define ZGEMM_SCALE_1X4_MSA \ +{ \ + dst0 = LD_DP(pc0); \ + dst1 = 
LD_DP(pc1); \ + \ + PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \ + \ + dst0_r += alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i += alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + \ + dst2 = LD_DP(pc2); \ + dst3 = LD_DP(pc3); \ + \ + PCKEVOD_D2_DP(dst3, dst2, dst0_r, dst0_i); \ + \ + dst0_r += alpha_r * res1_r; \ + dst0_r -= alpha_i * res1_i; \ + dst0_i += alpha_r * res1_i; \ + dst0_i += alpha_i * res1_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \ + \ + ST_DP(dst0, pc0); \ + ST_DP(dst1, pc1); \ + ST_DP(dst2, pc2); \ + ST_DP(dst3, pc3); \ +} + +#define ZGEMM_SCALE_4X2_MSA \ +{ \ + LD_DP4(pc0, 2, dst0, dst1, dst2, dst3); \ + \ + PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \ + PCKEVOD_D2_DP(dst3, dst2, dst1_r, dst1_i); \ + \ + dst0_r += alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i += alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + dst1_r += alpha_r * res1_r; \ + dst1_r -= alpha_i * res1_i; \ + dst1_i += alpha_r * res1_i; \ + dst1_i += alpha_i * res1_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \ + \ + LD_DP4(pc1, 2, dst4, dst5, dst6, dst7); \ + \ + PCKEVOD_D2_DP(dst5, dst4, dst0_r, dst0_i); \ + PCKEVOD_D2_DP(dst7, dst6, dst1_r, dst1_i); \ + \ + dst0_r += alpha_r * res2_r; \ + dst0_r -= alpha_i * res2_i; \ + dst0_i += alpha_r * res2_i; \ + dst0_i += alpha_i * res2_r; \ + \ + dst1_r += alpha_r * res3_r; \ + dst1_r -= alpha_i * res3_i; \ + dst1_i += alpha_r * res3_i; \ + dst1_i += alpha_i * res3_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5); \ + ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7); \ + \ + ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); \ + ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2); \ +} + +#define ZGEMM_SCALE_2X2_MSA \ +{ \ + LD_DP2(pc0, 2, dst0, dst1); \ + \ + PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \ + \ + dst0_r += alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i += alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_DP2_INC(dst0, dst1, pc0, 2); \ + \ + LD_DP2(pc1, 2, dst2, dst3); \ + \ + PCKEVOD_D2_DP(dst3, dst2, dst0_r, dst0_i); \ + \ + dst0_r += alpha_r * res2_r; \ + dst0_r -= alpha_i * res2_i; \ + dst0_i += alpha_r * res2_i; \ + dst0_i += alpha_i * res2_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \ + \ + ST_DP2_INC(dst2, dst3, pc1, 2); \ +} + +#define ZGEMM_SCALE_1X2_MSA \ +{ \ + dst0 = LD_DP(pc0); \ + dst1 = LD_DP(pc1); \ + \ + PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \ + \ + dst0_r += alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i += alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_DP(dst0, pc0); \ + ST_DP(dst1, pc1); \ +} + +#define ZGEMM_SCALE_4X1_MSA \ +{ \ + LD_DP4(pc0, 2, dst0, dst1, dst2, dst3); \ + \ + PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \ + PCKEVOD_D2_DP(dst3, dst2, dst1_r, dst1_i); \ + \ + dst0_r += alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i += alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + dst1_r += alpha_r * res1_r; \ + dst1_r -= alpha_i * res1_i; \ + dst1_i += alpha_r * res1_i; \ + dst1_i += alpha_i * res1_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \ + \ + ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); \ +} + +#define ZGEMM_SCALE_2X1_MSA \ +{ \ + LD_DP2(pc0, 2, dst0, dst1); \ + \ + PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \ + \ + dst0_r += alpha_r * res0_r; \ + dst0_r -= 
alpha_i * res0_i; \ + dst0_i += alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_DP2_INC(dst0, dst1, pc0, 2); \ +} + +#define ZGEMM_SCALE_1X1 \ +{ \ + pc0[0] += alphar * res0; \ + pc0[0] -= alphai * res1; \ + pc0[1] += alphar * res1; \ + pc0[1] += alphai * res0; \ +} + +#define ZGEMM_TRMM_SCALE_4X4_MSA \ +{ \ + dst0_r = alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i = alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + dst1_r = alpha_r * res1_r; \ + dst1_r -= alpha_i * res1_i; \ + dst1_i = alpha_r * res1_i; \ + dst1_i += alpha_i * res1_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \ + \ + dst0_r = alpha_r * res2_r; \ + dst0_r -= alpha_i * res2_i; \ + dst0_i = alpha_r * res2_i; \ + dst0_i += alpha_i * res2_r; \ + \ + dst1_r = alpha_r * res3_r; \ + dst1_r -= alpha_i * res3_i; \ + dst1_i = alpha_r * res3_i; \ + dst1_i += alpha_i * res3_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5); \ + ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7); \ + \ + ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); \ + ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2); \ + \ + dst0_r = alpha_r * res4_r; \ + dst0_r -= alpha_i * res4_i; \ + dst0_i = alpha_r * res4_i; \ + dst0_i += alpha_i * res4_r; \ + \ + dst1_r = alpha_r * res5_r; \ + dst1_r -= alpha_i * res5_i; \ + dst1_i = alpha_r * res5_i; \ + dst1_i += alpha_i * res5_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \ + \ + dst0_r = alpha_r * res6_r; \ + dst0_r -= alpha_i * res6_i; \ + dst0_i = alpha_r * res6_i; \ + dst0_i += alpha_i * res6_r; \ + \ + dst1_r = alpha_r * res7_r; \ + dst1_r -= alpha_i * res7_i; \ + dst1_i = alpha_r * res7_i; \ + dst1_i += alpha_i * res7_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5); \ + ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7); \ + \ + ST_DP4_INC(dst0, dst1, dst2, dst3, pc2, 2); \ + ST_DP4_INC(dst4, dst5, dst6, dst7, pc3, 2); \ +} + +#define ZGEMM_TRMM_SCALE_2X4_MSA \ +{ \ + dst0_r = alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i = alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + \ + dst0_r = alpha_r * res2_r; \ + dst0_r -= alpha_i * res2_i; \ + dst0_i = alpha_r * res2_i; \ + dst0_i += alpha_i * res2_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \ + \ + ST_DP2_INC(dst0, dst1, pc0, 2); \ + ST_DP2_INC(dst2, dst3, pc1, 2); \ + \ + dst0_r = alpha_r * res4_r; \ + dst0_r -= alpha_i * res4_i; \ + dst0_i = alpha_r * res4_i; \ + dst0_i += alpha_i * res4_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + \ + dst0_r = alpha_r * res6_r; \ + dst0_r -= alpha_i * res6_i; \ + dst0_i = alpha_r * res6_i; \ + dst0_i += alpha_i * res6_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \ + \ + ST_DP2_INC(dst0, dst1, pc2, 2); \ + ST_DP2_INC(dst2, dst3, pc3, 2); \ +} + +#define ZGEMM_TRMM_SCALE_1X4_MSA \ +{ \ + dst0_r = alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i = alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + \ + dst0_r = alpha_r * res1_r; \ + dst0_r -= alpha_i * res1_i; \ + dst0_i = alpha_r * res1_i; \ + dst0_i += alpha_i * res1_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \ + \ + ST_DP(dst0, pc0); \ + ST_DP(dst1, pc1); \ + ST_DP(dst2, pc2); \ + ST_DP(dst3, pc3); \ +} + +#define ZGEMM_TRMM_SCALE_4X2_MSA \ +{ \ + dst0_r = alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i = alpha_r * res0_i; \ + dst0_i += alpha_i * 
res0_r; \ + \ + dst1_r = alpha_r * res1_r; \ + dst1_r -= alpha_i * res1_i; \ + dst1_i = alpha_r * res1_i; \ + dst1_i += alpha_i * res1_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \ + \ + dst0_r = alpha_r * res2_r; \ + dst0_r -= alpha_i * res2_i; \ + dst0_i = alpha_r * res2_i; \ + dst0_i += alpha_i * res2_r; \ + \ + dst1_r = alpha_r * res3_r; \ + dst1_r -= alpha_i * res3_i; \ + dst1_i = alpha_r * res3_i; \ + dst1_i += alpha_i * res3_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5); \ + ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7); \ + \ + ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); \ + ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2); \ +} + +#define ZGEMM_TRMM_SCALE_2X2_MSA \ +{ \ + dst0_r = alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i = alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_DP2_INC(dst0, dst1, pc0, 2); \ + \ + dst0_r = alpha_r * res2_r; \ + dst0_r -= alpha_i * res2_i; \ + dst0_i = alpha_r * res2_i; \ + dst0_i += alpha_i * res2_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \ + \ + ST_DP2_INC(dst2, dst3, pc1, 2); \ +} + +#define ZGEMM_TRMM_SCALE_1X2_MSA \ +{ \ + dst0_r = alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i = alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_DP(dst0, pc0); \ + ST_DP(dst1, pc1); \ +} + +#define ZGEMM_TRMM_SCALE_4X1_MSA \ +{ \ + dst0_r = alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i = alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + dst1_r = alpha_r * res1_r; \ + dst1_r -= alpha_i * res1_i; \ + dst1_i = alpha_r * res1_i; \ + dst1_i += alpha_i * res1_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \ + \ + ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); \ +} + +#define ZGEMM_TRMM_SCALE_2X1_MSA \ +{ \ + dst0_r = alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i = alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_DP2_INC(dst0, dst1, pc0, 2); \ +} + +#define ZGEMM_TRMM_SCALE_1X1 \ +{ \ + pc0[0] = alphar * res0; \ + pc0[0] -= alphai * res1; \ + pc0[1] = alphar * res1; \ + pc0[1] += alphai * res0; \ +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, + FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc +#ifdef TRMMKERNEL + , BLASLONG offset +#endif + ) +{ + BLASLONG i, j, l, temp; +#if defined(TRMMKERNEL) + BLASLONG off; +#endif + FLOAT *pc0, *pc1, *pc2, *pc3, *pa0, *pb0; + FLOAT res0, res1, a0_r, a0_i, b0_r, b0_i; + v2f64 src_a0, src_a1, src_a2, src_a3, src_b0, src_b1, src_b2, src_b3; + v2f64 src_a0r, src_a0i, src_a1r, src_a1i, src_br, src_bi; + v2f64 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v2f64 dst0_r, dst0_i, dst1_r, dst1_i, alpha_r, alpha_i; + v2f64 res0_r, res0_i, res1_r, res1_i, res2_r, res2_i, res3_r, res3_i; + v2f64 res4_r, res4_i, res5_r, res5_i, res6_r, res6_i, res7_r, res7_i; + + alpha_r = COPY_DOUBLE_TO_VECTOR(alphar); + alpha_i = COPY_DOUBLE_TO_VECTOR(alphai); + +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#endif + + for (j = (n >> 2); j--;) + { + pc0 = C; + pc1 = pc0 + 2 * ldc; + pc2 = pc1 + 2 * ldc; + pc3 = pc2 + 2 * ldc; + + pa0 = A; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + for (i = (m >> 2); i--;) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off 
* 2 * 4; + pb0 = B + off * 2 * 4; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 4; // number of values in A +#else + temp = off + 4; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ZGEMM_KERNEL_4X4_MSA(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + ZGEMM_KERNEL_4X4_MSA(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + ZGEMM_KERNEL_4X4_MSA(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + ZGEMM_KERNEL_4X4_MSA(, -, , -, -); +#endif + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ZGEMM_KERNEL_4X4_MSA(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + ZGEMM_KERNEL_4X4_MSA(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + ZGEMM_KERNEL_4X4_MSA(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + ZGEMM_KERNEL_4X4_MSA(+, -, -, -,); +#endif + } + +#if defined(TRMMKERNEL) + ZGEMM_TRMM_SCALE_4X4_MSA +#else + ZGEMM_SCALE_4X4_MSA +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 4; // number of values in A +#else + temp -= 4; // number of values in B +#endif + pa0 += temp * 2 * 4; + pb0 += temp * 2 * 4; +#endif + +#ifdef LEFT + off += 4; // number of values in A +#endif +#endif + } + + if (m & 2) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 2; + pb0 = B + off * 2 * 4; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 2; // number of values in A +#else + temp = off + 4; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ZGEMM_KERNEL_2X4_MSA(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + ZGEMM_KERNEL_2X4_MSA(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + ZGEMM_KERNEL_2X4_MSA(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + ZGEMM_KERNEL_2X4_MSA(, -, , -, -); +#endif + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ZGEMM_KERNEL_2X4_MSA(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + ZGEMM_KERNEL_2X4_MSA(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + ZGEMM_KERNEL_2X4_MSA(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + ZGEMM_KERNEL_2X4_MSA(+, -, -, -,); +#endif + } + +#if defined(TRMMKERNEL) + ZGEMM_TRMM_SCALE_2X4_MSA +#else + ZGEMM_SCALE_2X4_MSA +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 2; // number of values in A +#else + temp -= 4; // number of values in B +#endif + pa0 += temp * 2 * 2; + pb0 += temp * 2 * 4; +#endif + +#ifdef LEFT + off += 2; // number of values in A +#endif +#endif + } + + if (m & 1) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && 
!defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 1; + pb0 = B + off * 2 * 4; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 1; // number of values in A +#else + temp = off + 4; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ZGEMM_KERNEL_1X4_MSA(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + ZGEMM_KERNEL_1X4_MSA(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + ZGEMM_KERNEL_1X4_MSA(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + ZGEMM_KERNEL_1X4_MSA(, -, , -, -); +#endif + + pa0 += 2; + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ZGEMM_KERNEL_1X4_MSA(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + ZGEMM_KERNEL_1X4_MSA(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + ZGEMM_KERNEL_1X4_MSA(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + ZGEMM_KERNEL_1X4_MSA(+, -, -, -,); +#endif + + pa0 += 2; + } + +#if defined(TRMMKERNEL) + ZGEMM_TRMM_SCALE_1X4_MSA +#else + ZGEMM_SCALE_1X4_MSA +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 1; // number of values in A +#else + temp -= 4; // number of values in B +#endif + pa0 += temp * 2 * 1; + pb0 += temp * 2 * 4; +#endif + +#ifdef LEFT + off += 1; // number of values in A +#endif +#endif + + pc0 += 2; + pc1 += 2; + pc2 += 2; + pc3 += 2; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 4; // number of values in A +#endif + + l = k << 3; + B = B + l; + i = ldc << 3; + C = C + i; + } + + if (n & 2) + { + pc0 = C; + pc1 = pc0 + 2 * ldc; + + pa0 = A; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + for (i = (m >> 2); i--;) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 4; + pb0 = B + off * 2 * 2; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 4; // number of values in A +#else + temp = off + 2; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ZGEMM_KERNEL_4X2_MSA(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + ZGEMM_KERNEL_4X2_MSA(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + ZGEMM_KERNEL_4X2_MSA(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + ZGEMM_KERNEL_4X2_MSA(, -, , -, -); +#endif + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ZGEMM_KERNEL_4X2_MSA(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + ZGEMM_KERNEL_4X2_MSA(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + ZGEMM_KERNEL_4X2_MSA(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + ZGEMM_KERNEL_4X2_MSA(+, -, -, -,); +#endif + } + +#if defined(TRMMKERNEL) + ZGEMM_TRMM_SCALE_4X2_MSA +#else + ZGEMM_SCALE_4X2_MSA +#endif + +#if defined(TRMMKERNEL) +#if 
(defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 4; // number of values in A +#else + temp -= 2; // number of values in B +#endif + pa0 += temp * 2 * 4; + pb0 += temp * 2 * 2; +#endif + +#ifdef LEFT + off += 4; // number of values in A +#endif +#endif + } + + if (m & 2) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 2; + pb0 = B + off * 2 * 2; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 2; // number of values in A +#else + temp = off + 2; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ZGEMM_KERNEL_2X2_MSA(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + ZGEMM_KERNEL_2X2_MSA(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + ZGEMM_KERNEL_2X2_MSA(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + ZGEMM_KERNEL_2X2_MSA(, -, , -, -); +#endif + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ZGEMM_KERNEL_2X2_MSA(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + ZGEMM_KERNEL_2X2_MSA(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + ZGEMM_KERNEL_2X2_MSA(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + ZGEMM_KERNEL_2X2_MSA(+, -, -, -,); +#endif + } + +#if defined(TRMMKERNEL) + ZGEMM_TRMM_SCALE_2X2_MSA +#else + ZGEMM_SCALE_2X2_MSA +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 2; // number of values in A +#else + temp -= 2; // number of values in B +#endif + pa0 += temp * 2 * 2; + pb0 += temp * 2 * 2; +#endif + +#ifdef LEFT + off += 2; // number of values in A +#endif +#endif + } + + if (m & 1) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 1; + pb0 = B + off * 2 * 2; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 1; // number of values in A +#else + temp = off + 2; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ZGEMM_KERNEL_1X2_MSA(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + ZGEMM_KERNEL_1X2_MSA(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + ZGEMM_KERNEL_1X2_MSA(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + ZGEMM_KERNEL_1X2_MSA(, -, , -, -); +#endif + + pa0 += 2; + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ZGEMM_KERNEL_1X2_MSA(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + ZGEMM_KERNEL_1X2_MSA(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + ZGEMM_KERNEL_1X2_MSA(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + ZGEMM_KERNEL_1X2_MSA(+, -, -, -,); +#endif + + pa0 += 2; + } + +#if defined(TRMMKERNEL) + 
ZGEMM_TRMM_SCALE_1X2_MSA +#else + ZGEMM_SCALE_1X2_MSA +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 1; // number of values in A +#else + temp -= 2; // number of values in B +#endif + pa0 += temp * 2 * 1; + pb0 += temp * 2 * 2; +#endif + +#ifdef LEFT + off += 1; // number of values in A +#endif +#endif + + pc0 += 2; + pc1 += 2; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; // number of values in A +#endif + + l = k << 2; + B = B + l; + i = ldc << 2; + C = C + i; + } + + if (n & 1) + { + pc0 = C; + pa0 = A; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + for (i = (m >> 2); i--;) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 4; + pb0 = B + off * 2 * 1; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 4; // number of values in A +#else + temp = off + 1; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ZGEMM_KERNEL_4X1_MSA(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + ZGEMM_KERNEL_4X1_MSA(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + ZGEMM_KERNEL_4X1_MSA(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + ZGEMM_KERNEL_4X1_MSA(, -, , -, -); +#endif + + pb0 += 2; + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ZGEMM_KERNEL_4X1_MSA(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + ZGEMM_KERNEL_4X1_MSA(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + ZGEMM_KERNEL_4X1_MSA(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + ZGEMM_KERNEL_4X1_MSA(+, -, -, -,); +#endif + + pb0 += 2; + } + +#if defined(TRMMKERNEL) + ZGEMM_TRMM_SCALE_4X1_MSA +#else + ZGEMM_SCALE_4X1_MSA +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 4; // number of values in A +#else + temp -= 1; // number of values in B +#endif + pa0 += temp * 2 * 4; + pb0 += temp * 2 * 1; +#endif + +#ifdef LEFT + off += 4; // number of values in A +#endif +#endif + } + + if (m & 2) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 2; + pb0 = B + off * 2 * 1; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 2; // number of values in A +#else + temp = off + 1; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ZGEMM_KERNEL_2X1_MSA(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + ZGEMM_KERNEL_2X1_MSA(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + ZGEMM_KERNEL_2X1_MSA(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + ZGEMM_KERNEL_2X1_MSA(, -, , -, -); +#endif + + pb0 += 2; + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + 
ZGEMM_KERNEL_2X1_MSA(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + ZGEMM_KERNEL_2X1_MSA(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + ZGEMM_KERNEL_2X1_MSA(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + ZGEMM_KERNEL_2X1_MSA(+, -, -, -,); +#endif + + pb0 += 2; + } + +#if defined(TRMMKERNEL) + ZGEMM_TRMM_SCALE_2X1_MSA +#else + ZGEMM_SCALE_2X1_MSA +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 2; // number of values in A +#else + temp -= 1; // number of values in B +#endif + pa0 += temp * 2 * 2; + pb0 += temp * 2 * 1; +#endif + +#ifdef LEFT + off += 2; // number of values in A +#endif +#endif + } + + if (m & 1) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 1; + pb0 = B + off * 2 * 1; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 1; // number of values in A +#else + temp = off + 1; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ZGEMM_KERNEL_1X1(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + ZGEMM_KERNEL_1X1(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + ZGEMM_KERNEL_1X1(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + ZGEMM_KERNEL_1X1(, -, , -, -); +#endif + + pa0 += 2; + pb0 += 2; + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ZGEMM_KERNEL_1X1(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + ZGEMM_KERNEL_1X1(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + ZGEMM_KERNEL_1X1(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + ZGEMM_KERNEL_1X1(+, -, -, -,); +#endif + + pa0 += 2; + pb0 += 2; + } + +#if defined(TRMMKERNEL) + ZGEMM_TRMM_SCALE_1X1 +#else + ZGEMM_SCALE_1X1 +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 1; // number of values in A +#else + temp -= 1; // number of values in B +#endif + pa0 += temp * 2 * 1; + pb0 += temp * 2 * 1; +#endif + +#ifdef LEFT + off += 1; // number of values in A +#endif +#endif + + pc0 += 2; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 1; // number of values in A +#endif + + l = k << 1; + B = B + l; + i = ldc << 1; + C = C + i; + } + return 0; +} diff --git a/kernel/mips/zgemm_ncopy_4_msa.c b/kernel/mips/zgemm_ncopy_4_msa.c new file mode 100644 index 0000000000..3ef46a5719 --- /dev/null +++ b/kernel/mips/zgemm_ncopy_4_msa.c @@ -0,0 +1,144 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst) +{ + BLASLONG i, j; + FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *pdst; + v2f64 src0, src1, src2, src3, src4, src5, src6, src7; + v2f64 src8, src9, src10, src11, src12, src13, src14, src15; + + psrc0 = src; + pdst = dst; + lda *= 2; + + for (j = (n >> 2); j--;) + { + psrc1 = psrc0; + psrc2 = psrc1 + lda; + psrc3 = psrc2 + lda; + psrc4 = psrc3 + lda; + psrc0 += 4 * lda; + + for (i = (m >> 2); i--;) + { + LD_DP4_INC(psrc1, 2, src0, src1, src2, src3); + LD_DP4_INC(psrc2, 2, src4, src5, src6, src7); + LD_DP4_INC(psrc3, 2, src8, src9, src10, src11); + LD_DP4_INC(psrc4, 2, src12, src13, src14, src15); + + ST_DP8_INC(src0, src4, src8, src12, src1, src5, src9, src13, pdst, 2); + ST_DP8_INC(src2, src6, src10, src14, src3, src7, src11, src15, + pdst, 2); + } + + if (m & 2) + { + LD_DP2_INC(psrc1, 2, src0, src1); + LD_DP2_INC(psrc2, 2, src4, src5); + LD_DP2_INC(psrc3, 2, src8, src9); + LD_DP2_INC(psrc4, 2, src12, src13); + + ST_DP8_INC(src0, src4, src8, src12, src1, src5, src9, src13, pdst, 2); + } + + if (m & 1) + { + src0 = LD_DP(psrc1); + src4 = LD_DP(psrc2); + src8 = LD_DP(psrc3); + src12 = LD_DP(psrc4); + psrc1 += 2; + psrc2 += 2; + psrc3 += 2; + psrc4 += 2; + + ST_DP4_INC(src0, src4, src8, src12, pdst, 2); + } + } + + if (n & 2) + { + psrc1 = psrc0; + psrc2 = psrc1 + lda; + psrc0 += 2 * lda; + + for (i = (m >> 2); i--;) + { + LD_DP4_INC(psrc1, 2, src0, src1, src2, src3); + LD_DP4_INC(psrc2, 2, src4, src5, src6, src7); + + ST_DP8_INC(src0, src4, src1, src5, src2, src6, src3, src7, pdst, 2); + } + + if (m & 2) + { + LD_DP2_INC(psrc1, 2, src0, src1); + LD_DP2_INC(psrc2, 2, src4, src5); + + ST_DP4_INC(src0, src4, src1, src5, pdst, 2); + } + + if (m & 1) + { + src0 = LD_DP(psrc1); + src4 = LD_DP(psrc2); + psrc1 += 2; + psrc2 += 2; + + ST_DP2_INC(src0, src4, pdst, 2); + } + } + + if (n & 1) + { + psrc1 = psrc0; + + for (i = (m >> 2); i--;) + { + LD_DP4_INC(psrc1, 2, src0, src1, src2, src3); + ST_DP4_INC(src0, src1, src2, src3, pdst, 2); + } + + if (m & 2) + { + LD_DP2_INC(psrc1, 2, src0, src1); + ST_DP2_INC(src0, src1, pdst, 2); + } + + if (m & 1) + { + src0 = LD_DP(psrc1); + ST_DP(src0, pdst); + } + } + + return 0; +} diff --git 
a/kernel/mips/zgemm_tcopy_4_msa.c b/kernel/mips/zgemm_tcopy_4_msa.c new file mode 100644 index 0000000000..70314cb21b --- /dev/null +++ b/kernel/mips/zgemm_tcopy_4_msa.c @@ -0,0 +1,161 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst) +{ + BLASLONG i, j; + FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4; + FLOAT *pdst0, *pdst1, *pdst2, *pdst3; + v2f64 src0, src1, src2, src3, src4, src5, src6, src7; + v2f64 src8, src9, src10, src11, src12, src13, src14, src15; + + psrc0 = src; + pdst0 = dst; + lda *= 2; + + pdst2 = dst + 2 * m * (n & ~3); + pdst3 = dst + 2 * m * (n & ~1); + + for (j = (m >> 2); j--;) + { + psrc1 = psrc0; + psrc2 = psrc1 + lda; + psrc3 = psrc2 + lda; + psrc4 = psrc3 + lda; + psrc0 += 4 * lda; + + pdst1 = pdst0; + pdst0 += 32; + + for (i = (n >> 2); i--;) + { + LD_DP4_INC(psrc1, 2, src0, src1, src2, src3); + LD_DP4_INC(psrc2, 2, src4, src5, src6, src7); + LD_DP4_INC(psrc3, 2, src8, src9, src10, src11); + LD_DP4_INC(psrc4, 2, src12, src13, src14, src15); + + ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2); + ST_DP8(src8, src9, src10, src11, src12, src13, src14, src15, + pdst1 + 16, 2); + pdst1 += m * 8; + } + + if (n & 2) + { + LD_DP2_INC(psrc1, 2, src0, src1); + LD_DP2_INC(psrc2, 2, src2, src3); + LD_DP2_INC(psrc3, 2, src4, src5); + LD_DP2_INC(psrc4, 2, src6, src7); + + ST_DP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 2); + } + + if (n & 1) + { + src0 = LD_DP(psrc1); + src1 = LD_DP(psrc2); + src2 = LD_DP(psrc3); + src3 = LD_DP(psrc4); + psrc1 += 2; + psrc2 += 2; + psrc3 += 2; + psrc4 += 2; + + ST_DP4_INC(src0, src1, src2, src3, pdst3, 2); + } + } + + if (m & 2) + { + psrc1 = psrc0; + psrc2 = psrc1 + lda; + psrc0 += 2 * lda; + + pdst1 = pdst0; + pdst0 += 16; + + for (i = (n >> 2); i--;) + { + LD_DP4_INC(psrc1, 2, src0, src1, src2, src3); + LD_DP4_INC(psrc2, 2, src4, src5, src6, src7); + + ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2); + + pdst1 += m * 8; + } + + if (n & 2) + { + LD_DP2_INC(psrc1, 2, src0, src1); + LD_DP2_INC(psrc2, 2, src2, src3); + + ST_DP4_INC(src0, src1, src2, src3, pdst2, 2); + } + + if (n & 1) + { + src0 = LD_DP(psrc1); + src1 = LD_DP(psrc2); + + ST_DP2_INC(src0, src1, pdst3, 2); + + psrc1 += 2; + psrc2 += 2; + } + } + + if (m & 1) + { + psrc1 = psrc0; + pdst1 = pdst0; + + for (i = (n >> 2); i--;) + { + LD_DP4_INC(psrc1, 2, src0, src1, src2, src3); + ST_DP4(src0, src1, src2, src3, pdst1, 2); + + pdst1 += m * 8; + } + + if (n & 2) + { + LD_DP2_INC(psrc1, 2, src0, src1); + ST_DP2_INC(src0, src1, pdst2, 2); + } + + if (n & 1) + { + src0 = LD_DP(psrc1); + ST_DP(src0, pdst3); + } + } + + return 0; +} diff --git a/kernel/mips/zgemv_n.c b/kernel/mips/zgemv_n.c new file mode 100644 index 0000000000..9bf1f6b429 --- /dev/null +++ b/kernel/mips/zgemv_n.c @@ -0,0 +1,147 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i; + BLASLONG ix,iy; + BLASLONG j; + FLOAT *a_ptr; + FLOAT temp_r,temp_i; + BLASLONG inc_x2,inc_y2; + BLASLONG lda2; + BLASLONG i2; + + lda2 = 2*lda; + + ix = 0; + a_ptr = a; + + if ( inc_x == 1 && inc_y == 1 ) + { + + for (j=0; j> 2); j--;) \ + { \ + ZLOAD_X4_SCALE() \ + \ + k = 0; \ + y = y_org; \ + \ + for (i = (m >> 2); i--;) \ + { \ + ZLOAD_Y4() \ + ZGEMV_N_4x4() \ + ZSTORE_Y4() \ + \ + k += 2 * 4; \ + y += inc_y2 * 4; \ + } \ + \ + if (m & 2) \ + { \ + ZLOAD_Y2() \ + ZGEMV_N_2x4() \ + ZSTORE_Y2() \ + \ + k += 2 * 2; \ + y += inc_y2 * 2; \ + } \ + \ + if (m & 1) \ + { \ + temp0_r = tp4r[0]; \ + temp1_r = tp4r[1]; \ + temp2_r = tp5r[0]; \ + temp3_r = tp5r[1]; \ + \ + temp0_i = tp4i[0]; \ + temp1_i = tp4i[1]; \ + temp2_i = tp5i[0]; \ + temp3_i = tp5i[1]; \ + \ + ZGEMV_N_1x4() \ + k += 2; \ + y += inc_y2; \ + } \ + \ + pa0 += 4 * lda2; \ + pa1 += 4 * lda2; \ + pa2 += 4 * lda2; \ + pa3 += 4 * lda2; \ + \ + x += 4 * inc_x2; \ + } \ + \ + if (n & 2) \ + { \ + ZLOAD_X2_SCALE() \ + \ + k = 0; \ + y = y_org; \ + \ + for (i = (m >> 2); i--;) \ + { \ + ZLOAD_Y4() \ + ZGEMV_N_4x2() \ + ZSTORE_Y4() \ + \ + k += 2 * 4; \ + y += inc_y2 * 4; \ + } \ + \ + if (m & 2) \ + { \ + ZLOAD_Y2() \ + ZGEMV_N_2x2() \ + ZSTORE_Y2() \ + \ + k += 2 * 2; \ + y += inc_y2 * 2; \ + } \ + \ + if (m & 1) \ + { \ + temp0_r = tp4r[0]; \ + temp1_r = tp4r[1]; \ + \ + temp0_i = tp4i[0]; \ + temp1_i = tp4i[1]; \ + \ + ZGEMV_N_1x2() \ + \ + k += 2; \ + y += inc_y2; \ + } \ + \ + pa0 += 2 * lda2; \ + pa1 += 2 * lda2; \ + \ + x += 2 * inc_x2; \ + } \ + \ + if (n & 1) \ + { \ + ZLOAD_X1_SCALE() \ + \ + k = 0; \ + y = y_org; \ + \ + for (i = (m >> 2); i--;) \ + { \ + ZLOAD_Y4() \ + ZGEMV_N_4x1() \ + ZSTORE_Y4() \ + \ + k += 2 * 4; \ + y += inc_y2 * 4; \ + } \ + \ + if (m & 2) \ + { \ + ZLOAD_Y2() \ + ZGEMV_N_2x1() \ + ZSTORE_Y2() \ + \ + k += 2 * 2; \ + y += inc_y2 * 2; \ + } \ + \ + if (m & 1) \ + { \ + ZGEMV_N_1x1() \ + \ + k += 2; \ + y += inc_y2; \ + } \ + \ + pa0 += lda2; \ + x += inc_x2; \ + } \ + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, + FLOAT *A, BLASLONG lda2, FLOAT *x, BLASLONG inc_x2, FLOAT *y, + BLASLONG inc_y2, FLOAT *buffer) +{ + BLASLONG i, j, k; + FLOAT *y_org = y; + FLOAT *pa0, *pa1, *pa2, *pa3; + FLOAT temp0_r, temp1_r, temp2_r, temp3_r, temp0_i, 
temp1_i, temp2_i; + FLOAT temp3_i, res0, res1; + v2f64 alphar, alphai; + v2f64 x0, x1, x2, x3, y0, y1, y2, y3; + v2f64 x0r, x1r, x0i, x1i, y0r, y1r, y0i, y1i; + v2f64 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15; + v2f64 src0r, src1r, src2r, src3r, src4r, src5r, src6r, src7r; + v2f64 src0i, src1i, src2i, src3i, src4i, src5i, src6i, src7i; + v2f64 tp0r, tp1r, tp2r, tp3r, tp4r, tp5r, tp0i, tp1i, tp2i, tp3i, tp4i, tp5i; + + lda2 = 2 * lda2; + inc_x2 = 2 * inc_x2; + inc_y2 = 2 * inc_y2; + + pa0 = A; + pa1 = A + lda2; + pa2 = A + 2 * lda2; + pa3 = A + 3 * lda2; + + alphar = COPY_DOUBLE_TO_VECTOR(alpha_r); + alphai = COPY_DOUBLE_TO_VECTOR(alpha_i); + + if ((2 == inc_x2) && (2 == inc_y2)) + { + #define ZLOAD_X4_SCALE ZLOAD_X4_SCALE_VECTOR + #define ZLOAD_X2_SCALE ZLOAD_X2_SCALE_VECTOR + #define ZLOAD_X1_SCALE ZLOAD_X1_SCALE_GP + #define ZLOAD_Y4 ZLOAD_Y4_VECTOR + #define ZLOAD_Y2 ZLOAD_Y2_VECTOR + #define ZSTORE_Y4 ZSTORE_Y4_VECTOR + #define ZSTORE_Y2 ZSTORE_Y2_VECTOR + + ZGEMV_N_MSA(); + + #undef ZLOAD_X4_SCALE + #undef ZLOAD_X2_SCALE + #undef ZLOAD_X1_SCALE + #undef ZLOAD_Y4 + #undef ZLOAD_Y2 + #undef ZSTORE_Y4 + #undef ZSTORE_Y2 + } + else if (2 == inc_x2) + { + #define ZLOAD_X4_SCALE ZLOAD_X4_SCALE_VECTOR + #define ZLOAD_X2_SCALE ZLOAD_X2_SCALE_VECTOR + #define ZLOAD_X1_SCALE ZLOAD_X1_SCALE_GP + #define ZLOAD_Y4 ZLOAD_Y4_GP + #define ZLOAD_Y2 ZLOAD_Y2_GP + #define ZSTORE_Y4 ZSTORE_Y4_GP + #define ZSTORE_Y2 ZSTORE_Y2_GP + + ZGEMV_N_MSA(); + + #undef ZLOAD_X4_SCALE + #undef ZLOAD_X2_SCALE + #undef ZLOAD_X1_SCALE + #undef ZLOAD_Y4 + #undef ZLOAD_Y2 + #undef ZSTORE_Y4 + #undef ZSTORE_Y2 + } + else if (2 == inc_y2) + { + #define ZLOAD_X4_SCALE ZLOAD_X4_SCALE_GP + #define ZLOAD_X2_SCALE ZLOAD_X2_SCALE_GP + #define ZLOAD_X1_SCALE ZLOAD_X1_SCALE_GP + #define ZLOAD_Y4 ZLOAD_Y4_VECTOR + #define ZLOAD_Y2 ZLOAD_Y2_VECTOR + #define ZSTORE_Y4 ZSTORE_Y4_VECTOR + #define ZSTORE_Y2 ZSTORE_Y2_VECTOR + + ZGEMV_N_MSA(); + + #undef ZLOAD_X4_SCALE + #undef ZLOAD_X2_SCALE + #undef ZLOAD_X1_SCALE + #undef ZLOAD_Y4 + #undef ZLOAD_Y2 + #undef ZSTORE_Y4 + #undef ZSTORE_Y2 + } + else + { + #define ZLOAD_X4_SCALE ZLOAD_X4_SCALE_GP + #define ZLOAD_X2_SCALE ZLOAD_X2_SCALE_GP + #define ZLOAD_X1_SCALE ZLOAD_X1_SCALE_GP + #define ZLOAD_Y4 ZLOAD_Y4_GP + #define ZLOAD_Y2 ZLOAD_Y2_GP + #define ZSTORE_Y4 ZSTORE_Y4_GP + #define ZSTORE_Y2 ZSTORE_Y2_GP + + ZGEMV_N_MSA(); + + #undef ZLOAD_X4_SCALE + #undef ZLOAD_X2_SCALE + #undef ZLOAD_X1_SCALE + #undef ZLOAD_Y4 + #undef ZLOAD_Y2 + #undef ZSTORE_Y4 + #undef ZSTORE_Y2 + } + return(0); +} + +#undef OP0 +#undef OP1 +#undef OP2 +#undef OP3 +#undef OP4 diff --git a/kernel/mips/zgemv_t.c b/kernel/mips/zgemv_t.c new file mode 100644 index 0000000000..2dfb9d255a --- /dev/null +++ b/kernel/mips/zgemv_t.c @@ -0,0 +1,130 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i; + BLASLONG ix,iy; + BLASLONG j; + FLOAT *a_ptr; + FLOAT temp_r,temp_i; + BLASLONG inc_x2,inc_y2; + BLASLONG lda2; + BLASLONG i2; + + lda2 = 2*lda; + + iy = 0; + a_ptr = a; + + if ( inc_x == 1 && inc_y == 1 ) + { + + for (j=0; j> 2); j--;) \ + { \ + tp0r = tp1r = tp2r = tp3r = zero; \ + tp0i = tp1i = tp2i = tp3i = zero; \ + \ + k = 0; \ + x = srcx_org; \ + \ + for (i = (m >> 2); i--;) \ + { \ + ZLOAD_X4(); \ + ZGEMV_T_4x4(); \ + \ + k += 2 * 4; \ + x += inc_x2 * 4; \ + } \ + \ + if (m & 2) \ + { \ + ZLOAD_X2(); \ + ZGEMV_T_2x4(); \ + \ + k += 2 * 2; \ + x += inc_x2 * 2; \ + } \ + \ + temp0r = tp0r[0] + tp0r[1]; \ + temp1r = tp1r[0] + tp1r[1]; \ + temp2r = tp2r[0] + tp2r[1]; \ + temp3r = tp3r[0] + tp3r[1]; \ + temp0i = tp0i[0] + tp0i[1]; \ + temp1i = tp1i[0] + tp1i[1]; \ + temp2i = tp2i[0] + tp2i[1]; \ + temp3i = tp3i[0] + tp3i[1]; \ + \ + if (m & 1) \ + { \ + ZGEMV_T_1x4(); \ + \ + k += 2; \ + x += inc_x2; \ + } \ + \ + ZSCALE_STORE_Y4_GP(); \ + \ + pa0 += 4 * lda2; \ + pa1 += 4 * lda2; \ + pa2 += 4 * lda2; \ + pa3 += 4 * lda2; \ + y += 4 * inc_y2; \ + } \ + \ + if (n & 2) \ + { \ + tp0r = tp1r = zero; \ + tp0i = tp1i = zero; \ + \ + k = 0; \ + x = srcx_org; \ + \ + for (i = (m >> 2); i--;) \ + { \ + ZLOAD_X4(); \ + ZGEMV_T_4x2(); \ + \ + k += 2 * 4; \ + x += inc_x2 * 4; \ + } \ + \ + if (m & 2) \ + { \ + ZLOAD_X2(); \ + ZGEMV_T_2x2(); \ + \ + k += 2 * 2; \ + x += inc_x2 * 2; \ + } \ + \ + temp0r = tp0r[0] + tp0r[1]; \ + temp1r = tp1r[0] + tp1r[1]; \ + temp0i = tp0i[0] + tp0i[1]; \ + temp1i = tp1i[0] + tp1i[1]; \ + \ + if (m & 1) \ + { \ + ZGEMV_T_1x2(); \ + \ + k += 2; \ + x += inc_x2; \ + } \ + \ + ZSCALE_STORE_Y2_GP(); \ + \ + pa0 += 2 * lda2; \ + pa1 += 2 * lda2; \ + y += 2 * inc_y2; \ + } \ + \ + if (n & 1) \ + { \ + tp0r = zero; \ + tp0i = zero; \ + \ + k = 0; \ + x = srcx_org; \ + \ + for (i = (m >> 2); i--;) \ + { \ + ZLOAD_X4(); \ + ZGEMV_T_4x1(); \ + \ + k += 2 * 4; \ + x += inc_x2 * 4; \ + } \ + \ + if (m & 2) \ + { \ + ZLOAD_X2(); \ + ZGEMV_T_2x1(); \ + \ + k += 2 * 2; \ + x += inc_x2 * 2; \ + } \ + \ + temp0r = tp0r[0] + tp0r[1]; \ + temp0i = tp0i[0] + tp0i[1]; \ + \ + if (m & 1) \ + { \ + ZGEMV_T_1x1(); \ + \ + k += 2; \ + x += inc_x2; \ + } \ + \ + ZSCALE_STORE_Y1_GP(); \ + \ + pa0 += lda2; \ + y += inc_y2; \ + } \ + +int CNAME(BLASLONG m, BLASLONG 
n, BLASLONG dummy1, FLOAT alphar, FLOAT alphai, + FLOAT *A, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, + BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i, j, k; + BLASLONG inc_x2, inc_y2, lda2; + FLOAT *pa0, *pa1, *pa2, *pa3; + FLOAT *srcx_org = x; + FLOAT temp0r, temp0i, temp2r, temp2i, temp1r, temp1i, temp3r, temp3i; + FLOAT res0r, res0i, res2r, res2i, res1r, res1i, res3r, res3i; + v2f64 zero = {0}; + v2f64 x0, x1, x2, x3, x0r, x1r, x0i, x1i; + v2f64 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15; + v2f64 src0r, src1r, src2r, src3r, src4r, src5r, src6r, src7r; + v2f64 src0i, src1i, src2i, src3i, src4i, src5i, src6i, src7i; + v2f64 tp0r, tp1r, tp2r, tp3r, tp0i, tp1i, tp2i, tp3i; + + lda2 = 2 * lda; + + pa0 = A; + pa1 = A + lda2; + pa2 = A + 2 * lda2; + pa3 = A + 3 * lda2; + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + if (2 == inc_x2) + { + #define ZLOAD_X4 ZLOAD_X4_VECTOR + #define ZLOAD_X2 ZLOAD_X2_VECTOR + + ZGEMV_T_MSA(); + + #undef ZLOAD_X4 + #undef ZLOAD_X2 + } + else + { + #define ZLOAD_X4 ZLOAD_X4_GP + #define ZLOAD_X2 ZLOAD_X2_GP + + ZGEMV_T_MSA(); + + #undef ZLOAD_X4 + #undef ZLOAD_X2 + } + + return(0); +} + +#undef OP0 +#undef OP1 +#undef OP2 diff --git a/kernel/mips/znrm2.c b/kernel/mips/znrm2.c new file mode 100644 index 0000000000..85be39cd18 --- /dev/null +++ b/kernel/mips/znrm2.c @@ -0,0 +1,97 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include <math.h> + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT scale = 0.0; + FLOAT ssq = 1.0; + BLASLONG inc_x2; + FLOAT temp; + + if (n <= 0 || inc_x <= 0) return(0.0); + + inc_x2 = 2 * inc_x; + + n *= inc_x2; + while(i < n) + { + + if ( x[i] != 0.0 ) + { + temp = ABS( x[i] ); + if ( scale < temp ) + { + ssq = 1 + ssq * ( scale / temp ) * ( scale / temp ); + scale = temp ; + } + else + { + ssq += ( temp / scale ) * ( temp / scale ); + } + + } + + if ( x[i+1] != 0.0 ) + { + temp = ABS( x[i+1] ); + if ( scale < temp ) + { + ssq = 1 + ssq * ( scale / temp ) * ( scale / temp ); + scale = temp ; + } + else + { + ssq += ( temp / scale ) * ( temp / scale ); + } + + } + + + i += inc_x2; + } + scale = scale * sqrt( ssq ); + return(scale); + +} + + diff --git a/kernel/mips/zomatcopy_cn.c b/kernel/mips/zomatcopy_cn.c new file mode 100644 index 0000000000..bf6d3c70da --- /dev/null +++ b/kernel/mips/zomatcopy_cn.c @@ -0,0 +1,62 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb) +{ + BLASLONG i,j,ia; + FLOAT *aptr,*bptr; + + if ( rows <= 0 ) return(0); + if ( cols <= 0 ) return(0); + + aptr = a; + bptr = b; + + lda *= 2; + ldb *= 2; + + for ( i=0; i + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp[2]; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if ( n < 0 ) return(0); + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + while(i < n) + { + + temp[0] = x[ix] ; + temp[1] = x[ix+1] ; + x[ix] = y[iy] ; + x[ix+1] = y[iy+1] ; + y[iy] = temp[0] ; + y[iy+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/mips64/KERNEL.I6400 b/kernel/mips64/KERNEL.I6400 new file mode 100644 index 0000000000..abf44814ae --- /dev/null +++ b/kernel/mips64/KERNEL.I6400 @@ -0,0 +1 @@ +include $(KERNELDIR)/../mips/KERNEL.P5600 diff --git a/kernel/mips64/KERNEL.P6600 b/kernel/mips64/KERNEL.P6600 new file mode 100644 index 0000000000..abf44814ae --- /dev/null +++ b/kernel/mips64/KERNEL.P6600 @@ -0,0 +1 @@ +include $(KERNELDIR)/../mips/KERNEL.P5600 diff --git a/kernel/mips64/axpy.S b/kernel/mips64/axpy.S index 32694a99da..5d9728a481 100644 --- a/kernel/mips64/axpy.S +++ b/kernel/mips64/axpy.S @@ -225,7 +225,9 @@ .align 3 .L20: + beqz INCY, .L27 dsra I, N, 3 + move YY, Y blez I, .L25 @@ -405,5 +407,19 @@ j $31 NOP + .align 3 + +.L27: + LD b1, 0 * SIZE(Y) + +.L28: + daddiu N, N, -1 + LD a1, 0 * SIZE(X) + daddu X, X, INCX + bgtz N, .L28 + MADD b1, b1, ALPHA, a1 + + j .L999 + ST b1, 0 * SIZE(Y) EPILOGUE diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index b37a4213bc..b9f44db910 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -10,9 +10,9 @@ ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S SGEMMKERNEL = sgemm_kernel_16x8_power8.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c -SGEMMITCOPY = ../generic/gemm_tcopy_16.c +SGEMMITCOPY = sgemm_tcopy_16_power8.S SGEMMONCOPY = ../generic/gemm_ncopy_8.c -SGEMMOTCOPY = ../generic/gemm_tcopy_8.c +SGEMMOTCOPY = sgemm_tcopy_8_power8.S SGEMMINCOPYOBJ = sgemm_incopy.o SGEMMITCOPYOBJ = sgemm_itcopy.o SGEMMONCOPYOBJ = sgemm_oncopy.o @@ -20,17 +20,17 @@ SGEMMOTCOPYOBJ = sgemm_otcopy.o DGEMMKERNEL = dgemm_kernel_16x4_power8.S DGEMMINCOPY = ../generic/gemm_ncopy_16.c -DGEMMITCOPY = ../generic/gemm_tcopy_16.c -DGEMMONCOPY = gemm_ncopy_4.S -DGEMMOTCOPY = gemm_tcopy_4.S -DGEMMINCOPYOBJ = dgemm_incopy.o -DGEMMITCOPYOBJ = dgemm_itcopy.o -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o +DGEMMITCOPY = dgemm_tcopy_16_power8.S +DGEMMONCOPY = dgemm_ncopy_4_power8.S +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPYOBJ = dgemm_incopy.o +DGEMMITCOPYOBJ = dgemm_itcopy.o +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o CGEMMKERNEL = cgemm_kernel_8x4_power8.S CGEMMINCOPY = ../generic/zgemm_ncopy_8.c -CGEMMITCOPY = ../generic/zgemm_tcopy_8.c +CGEMMITCOPY = cgemm_tcopy_8_power8.S CGEMMONCOPY = ../generic/zgemm_ncopy_4.c CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c CGEMMONCOPYOBJ = cgemm_oncopy.o @@ -42,7 +42,7 @@ ZGEMMKERNEL = zgemm_kernel_8x2_power8.S ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c 
-ZGEMMITCOPY = ../generic/zgemm_tcopy_8.c +ZGEMMITCOPY = zgemm_tcopy_8_power8.S ZGEMMONCOPYOBJ = zgemm_oncopy.o ZGEMMOTCOPYOBJ = zgemm_otcopy.o ZGEMMINCOPYOBJ = zgemm_incopy.o @@ -54,7 +54,7 @@ STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c @@ -125,10 +125,10 @@ DDOTKERNEL = ddot.c #CDOTKERNEL = ../arm/zdot.c ZDOTKERNEL = zdot.c # -#SNRM2KERNEL = ../arm/nrm2.c -#DNRM2KERNEL = ../arm/nrm2.c -#CNRM2KERNEL = ../arm/znrm2.c -#ZNRM2KERNEL = ../arm/znrm2.c +SNRM2KERNEL = ../arm/nrm2.c +DNRM2KERNEL = ../arm/nrm2.c +CNRM2KERNEL = ../arm/znrm2.c +ZNRM2KERNEL = ../arm/znrm2.c # SROTKERNEL = srot.c DROTKERNEL = drot.c @@ -137,7 +137,7 @@ DROTKERNEL = drot.c # SSCALKERNEL = sscal.c DSCALKERNEL = dscal.c -#CSCALKERNEL = ../arm/zscal.c +CSCALKERNEL = zscal.c ZSCALKERNEL = zscal.c # SSWAPKERNEL = sswap.c diff --git a/kernel/power/cgemm_tcopy_8_power8.S b/kernel/power/cgemm_tcopy_8_power8.S new file mode 100644 index 0000000000..b1a7d2b277 --- /dev/null +++ b/kernel/power/cgemm_tcopy_8_power8.S @@ -0,0 +1,206 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/23 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#define M r3 +#define N r4 +#define A r5 +#define LDA r6 +#define B r7 + +#define A0 r8 +#define A1 r9 +#define A2 r10 +#define A3 r11 + +#define J r12 + +#define PREA r14 +#define PREB r15 +#define BO r16 +#define B8 r17 +#define B4 r18 +#define B2 r19 +#define B1 r20 +#define o4 r21 +#define T2 r22 +#define I r23 +#define o16 r24 +#define o32 r25 +#define o48 r26 +#define NOTUS2 r27 +#define M8 r30 +#define T1 r31 + +#define o0 0 + +#include "cgemm_tcopy_macros_8_power8.S" + +#define STACKSIZE 384 + + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + cmpwi cr0, M, 0 + ble- L999 + cmpwi cr0, N, 0 + ble- L999 + + slwi LDA, LDA, ZBASE_SHIFT + slwi M8, M, 3 + ZBASE_SHIFT + + li T2, -8 + li PREA, -4 + li PREB, -2 + + and B4, N, T2 + and B2, N, PREA + and B1, N, PREB + + mullw B4, B4, M + mullw B2, B2, M + mullw B1, B1, M + + slwi B4, B4, ZBASE_SHIFT + slwi B2, B2, ZBASE_SHIFT + slwi B1, B1, ZBASE_SHIFT + + add B4, B4, B + add B2, B2, B + add B1, B1, B + + li PREA, 384 + addi PREB, M8, 128 + + li o4, 4 + li o16, 16 + li o32, 32 + li o48, 48 + +#include "cgemm_tcopy_logic_8_power8.S" + +L999: + + li r3, 0 + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 
240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + addi SP, SP, STACKSIZE + + blr + EPILOGUE + + diff --git a/kernel/power/cgemm_tcopy_logic_8_power8.S b/kernel/power/cgemm_tcopy_logic_8_power8.S new file mode 100644 index 0000000000..9418908b76 --- /dev/null +++ b/kernel/power/cgemm_tcopy_logic_8_power8.S @@ -0,0 +1,247 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/23 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + + srawi. I, M, 2 + ble CCOPYT_L2_BEGIN + + +CCOPYT_L4_BEGIN: + + mr A0, A + add A1, A0, LDA + add A2, A1, LDA + add A3, A2, LDA + add A, A3, LDA + mr B8, B + addi B, B, 64*SIZE + + sradi. J, N, 3 + ble CCOPYT_L4x4_BEGIN + + mr BO, B8 + +CCOPYT_L4x8_LOOP: + + dcbt A0, PREA + dcbt A1, PREA + dcbt A2, PREA + dcbt A3, PREA + dcbtst BO, M8 + dcbtst BO, PREB + COPY_4x8 + + add BO, BO, M8 + + addic. J, J, -1 + ble CCOPYT_L4x4_BEGIN + + + COPY_4x8 + + add BO, BO, M8 + + addic. J, J, -1 + bgt CCOPYT_L4x8_LOOP + +CCOPYT_L4x4_BEGIN: + + andi. T1, N, 4 + ble CCOPYT_L4x2_BEGIN + + mr BO, B4 + + COPY_4x4 + + + addi B4, B4, 32*SIZE + +CCOPYT_L4x2_BEGIN: + + andi. T1, N, 2 + ble CCOPYT_L4x1_BEGIN + + mr BO, B2 + + COPY_4x2 + + + addi B2, B2, 16*SIZE + +CCOPYT_L4x1_BEGIN: + + andi. T1, N, 1 + ble CCOPYT_L4_END + + mr BO, B1 + + COPY_4x1 + + + addi B1, B1, 8*SIZE + +CCOPYT_L4_END: + + addic. I, I, -1 + bgt CCOPYT_L4_BEGIN + + + +CCOPYT_L2_BEGIN: + + andi. T1, M, 2 + ble CCOPYT_L1_BEGIN + + mr A0, A + add A1, A0, LDA + add A, A1, LDA + mr B8, B + addi B, B, 32*SIZE + + sradi. J, N, 3 + ble CCOPYT_L2x4_BEGIN + + mr BO, B8 + +CCOPYT_L2x8_LOOP: + + COPY_2x8 + + add BO, BO, M8 + + addic. 
J, J, -1 + bgt CCOPYT_L2x8_LOOP + +CCOPYT_L2x4_BEGIN: + + andi. T1, N, 4 + ble CCOPYT_L2x2_BEGIN + + mr BO, B4 + + COPY_2x4 + + + addi B4, B4, 16*SIZE + +CCOPYT_L2x2_BEGIN: + + andi. T1, N, 2 + ble CCOPYT_L2x1_BEGIN + + mr BO, B2 + + COPY_2x2 + + + addi B2, B2, 8*SIZE + +CCOPYT_L2x1_BEGIN: + + andi. T1, N, 1 + ble CCOPYT_L2_END + + mr BO, B1 + + COPY_2x1 + + + addi B1, B1, 4*SIZE + +CCOPYT_L2_END: + + +CCOPYT_L1_BEGIN: + + andi. T1, M, 1 + ble L999 + + mr A0, A + add A, A0, LDA + mr B8, B + addi B, B, 16*SIZE + + sradi. J, N, 3 + ble CCOPYT_L1x4_BEGIN + + mr BO, B8 + +CCOPYT_L1x8_LOOP: + + COPY_1x8 + + add BO, BO, M8 + + addic. J, J, -1 + bgt CCOPYT_L1x8_LOOP + +CCOPYT_L1x4_BEGIN: + + andi. T1, N, 4 + ble CCOPYT_L1x2_BEGIN + + mr BO, B4 + + COPY_1x4 + + + addi B4, B4, 8*SIZE + +CCOPYT_L1x2_BEGIN: + + andi. T1, N, 2 + ble CCOPYT_L1x1_BEGIN + + mr BO, B2 + + COPY_1x2 + + + addi B2, B2, 4*SIZE + +CCOPYT_L1x1_BEGIN: + + andi. T1, N, 1 + ble CCOPYT_L1_END + + mr BO, B1 + + COPY_1x1 + + + addi B1, B1, 2*SIZE + +CCOPYT_L1_END: + diff --git a/kernel/power/cgemm_tcopy_macros_8_power8.S b/kernel/power/cgemm_tcopy_macros_8_power8.S new file mode 100644 index 0000000000..03fda27663 --- /dev/null +++ b/kernel/power/cgemm_tcopy_macros_8_power8.S @@ -0,0 +1,385 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
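The copy-kernel prologue and traversal above partition the packed output buffer by panel width: complete 8-column panels are written starting at B (successive panels of the same row strip sit M*8 complex elements apart, the M8 stride), while the 4-, 2- and 1-column remainders go to the precomputed bases B4, B2 and B1. Below is a small C sketch of how those bases appear to be derived from the prologue's mask-and-multiply sequence; panel_bases and the cplx type are illustrative only.

typedef struct { float re, im; } cplx;    /* single-precision complex element */

/* Illustrative derivation of the remainder-panel bases computed in the
 * prologue: and with -8/-4/-2, multiply by M, scale to complex elements. */
static void panel_bases(long m, long n, cplx *b,
                        cplx **b8, cplx **b4, cplx **b2, cplx **b1)
{
    *b8 = b;                         /* full 8-column panels start at B       */
    *b4 = b + (n & ~7L) * m;         /* 4-column remainder panel, if N%8 >= 4 */
    *b2 = b + (n & ~3L) * m;         /* 2-column remainder panel, if N%4 >= 2 */
    *b1 = b + (n & ~1L) * m;         /* final single column, if N is odd      */
}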
+*****************************************************************************/ + +/************************************************************************************** +* 2016/04/23 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + +/********************************************************************************************** +* Macros for N=4 and M=8 +**********************************************************************************************/ + +.macro COPY_4x8 + + lxvw4x vs32, o0, A0 + lxvw4x vs33, o16, A0 + lxvw4x vs34, o32, A0 + lxvw4x vs35, o48, A0 + + lxvw4x vs36, o0, A1 + lxvw4x vs37, o16, A1 + lxvw4x vs38, o32, A1 + lxvw4x vs39, o48, A1 + + addi A0, A0, 64 + addi A1, A1, 64 + + lxvw4x vs40, o0, A2 + lxvw4x vs41, o16, A2 + lxvw4x vs42, o32, A2 + lxvw4x vs43, o48, A2 + + lxvw4x vs44, o0, A3 + lxvw4x vs45, o16, A3 + lxvw4x vs46, o32, A3 + lxvw4x vs47, o48, A3 + + mr T1, BO + addi A2, A2, 64 + addi A3, A3, 64 + + stxvw4x vs32, o0, T1 + stxvw4x vs33, o16, T1 + stxvw4x vs34, o32, T1 + stxvw4x vs35, o48, T1 + + addi T1, T1, 64 + + stxvw4x vs36, o0, T1 + stxvw4x vs37, o16, T1 + stxvw4x vs38, o32, T1 + stxvw4x vs39, o48, T1 + + addi T1, T1, 64 + + stxvw4x vs40, o0, T1 + stxvw4x vs41, o16, T1 + stxvw4x vs42, o32, T1 + stxvw4x vs43, o48, T1 + + addi T1, T1, 64 + + stxvw4x vs44, o0, T1 + stxvw4x vs45, o16, T1 + stxvw4x vs46, o32, T1 + stxvw4x vs47, o48, T1 + +.endm + +/********************************************************************************************** +* Macros for N=4 and M=4 +**********************************************************************************************/ + +.macro COPY_4x4 + + lxvw4x vs32, o0, A0 + lxvw4x vs33, o16, A0 + addi A0, A0, 32 + + lxvw4x vs34, o0, A1 + lxvw4x vs35, o16, A1 + addi A1, A1, 32 + + lxvw4x vs36, o0, A2 + lxvw4x vs37, o16, A2 + addi A2, A2, 32 + + lxvw4x vs38, o0, A3 + lxvw4x vs39, o16, A3 + addi A3, A3, 32 + + mr T1, BO + + stxvw4x vs32, o0, T1 + stxvw4x vs33, o16, T1 + + stxvw4x vs34, o32, T1 + stxvw4x vs35, o48, T1 + + addi T1, T1, 64 + + stxvw4x vs36, o0, T1 + stxvw4x vs37, o16, T1 + + stxvw4x vs38, o32, T1 + stxvw4x vs39, o48, T1 + +.endm + +/********************************************************************************************** +* Macros for N=4 and M=2 +**********************************************************************************************/ + +.macro COPY_4x2 + + lxvw4x vs32, o0, A0 + addi A0, A0, 16 + + lxvw4x vs33, o0, A1 + addi A1, A1, 16 + + lxvw4x vs34, o0, A2 + addi A2, A2, 16 + + lxvw4x vs35, o0, A3 + addi A3, A3, 16 + + mr T1, BO + + stxvw4x vs32, o0, T1 + + stxvw4x vs33, o16, T1 + + stxvw4x vs34, o32, T1 + + stxvw4x vs35, o48, T1 + +.endm + +/********************************************************************************************** +* Macros for N=4 and M=1 +**********************************************************************************************/ + +.macro COPY_4x1 + + lxsspx vs32, o0, A0 + lxsspx vs33, o4, A0 + addi A0, A0, 8 + + lxsspx vs34, o0, A1 + lxsspx vs35, o4, A1 + addi A1, A1, 8 + + lxsspx vs36, o0, A2 + lxsspx vs37, o4, A2 + addi A2, A2, 8 + + lxsspx vs38, o0, A3 + lxsspx vs39, o4, A3 + addi A3, A3, 8 + + mr T1, BO + + stxsspx vs32, o0, T1 + stxsspx vs33, o4, T1 + + addi T1, T1, 8 + + stxsspx vs34, o0, T1 + stxsspx vs35, o4, T1 + + addi T1, T1, 8 + + stxsspx vs36, o0, T1 + stxsspx vs37, o4, T1 + + addi T1, T1, 8 + + stxsspx vs38, o0, T1 + 
stxsspx vs39, o4, T1 + +.endm + +/********************************************************************************************** +* Macros for N=2 and M=8 +**********************************************************************************************/ + +.macro COPY_2x8 + + lxvw4x vs32, o0, A0 + lxvw4x vs33, o16, A0 + lxvw4x vs34, o32, A0 + lxvw4x vs35, o48, A0 + addi A0, A0, 64 + + lxvw4x vs36, o0, A1 + lxvw4x vs37, o16, A1 + lxvw4x vs38, o32, A1 + lxvw4x vs39, o48, A1 + addi A1, A1, 64 + + mr T1, BO + + stxvw4x vs32, o0, T1 + stxvw4x vs33, o16, T1 + stxvw4x vs34, o32, T1 + stxvw4x vs35, o48, T1 + + addi T1, T1, 64 + + stxvw4x vs36, o0, T1 + stxvw4x vs37, o16, T1 + stxvw4x vs38, o32, T1 + stxvw4x vs39, o48, T1 + +.endm + +/********************************************************************************************** +* Macros for N=2 and M=4 +**********************************************************************************************/ + +.macro COPY_2x4 + + lxvw4x vs32, o0, A0 + lxvw4x vs33, o16, A0 + addi A0, A0, 32 + + lxvw4x vs34, o0, A1 + lxvw4x vs35, o16, A1 + addi A1, A1, 32 + + mr T1, BO + + stxvw4x vs32, o0, T1 + stxvw4x vs33, o16, T1 + + stxvw4x vs34, o32, T1 + stxvw4x vs35, o48, T1 + +.endm + +/********************************************************************************************** +* Macros for N=2 and M=2 +**********************************************************************************************/ + +.macro COPY_2x2 + + lxvw4x vs32, o0, A0 + addi A0, A0, 16 + + lxvw4x vs33, o0, A1 + addi A1, A1, 16 + + mr T1, BO + + stxvw4x vs32, o0, T1 + + stxvw4x vs33, o16, T1 + +.endm + +/********************************************************************************************** +* Macros for N=2 and M=1 +**********************************************************************************************/ + +.macro COPY_2x1 + + lxsspx vs32, o0, A0 + lxsspx vs33, o4, A0 + addi A0, A0, 8 + + lxsspx vs34, o0, A1 + lxsspx vs35, o4, A1 + addi A1, A1, 8 + + mr T1, BO + + stxsspx vs32, o0, T1 + stxsspx vs33, o4, T1 + + addi T1, T1, 8 + + stxsspx vs34, o0, T1 + stxsspx vs35, o4, T1 + +.endm + +/********************************************************************************************** +* Macros for N=1 and M=8 +**********************************************************************************************/ + +.macro COPY_1x8 + + lxvw4x vs32, o0, A0 + lxvw4x vs33, o16, A0 + lxvw4x vs34, o32, A0 + lxvw4x vs35, o48, A0 + addi A0, A0, 64 + + mr T1, BO + + stxvw4x vs32, o0, T1 + stxvw4x vs33, o16, T1 + stxvw4x vs34, o32, T1 + stxvw4x vs35, o48, T1 + +.endm + +/********************************************************************************************** +* Macros for N=1 and M=4 +**********************************************************************************************/ + +.macro COPY_1x4 + + lxvw4x vs32, o0, A0 + lxvw4x vs33, o16, A0 + addi A0, A0, 32 + + mr T1, BO + + stxvw4x vs32, o0, T1 + stxvw4x vs33, o16, T1 + +.endm + +/********************************************************************************************** +* Macros for N=1 and M=2 +**********************************************************************************************/ + +.macro COPY_1x2 + + lxvw4x vs32, o0, A0 + addi A0, A0, 16 + + mr T1, BO + + stxvw4x vs32, o0, T1 + +.endm + +/********************************************************************************************** +* Macros for N=1 and M=1 +**********************************************************************************************/ + +.macro COPY_1x1 + + 
lxsspx vs32, o0, A0 + lxsspx vs33, o4, A0 + addi A0, A0, 8 + + mr T1, BO + + stxsspx vs32, o0, T1 + stxsspx vs33, o4, T1 + +.endm + diff --git a/kernel/power/dgemm_kernel_16x4_power8.S b/kernel/power/dgemm_kernel_16x4_power8.S index c67f311608..8af7fe3899 100644 --- a/kernel/power/dgemm_kernel_16x4_power8.S +++ b/kernel/power/dgemm_kernel_16x4_power8.S @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2016/03/05 Werner Saar (wernsaar@googlemail.com) +* 2016/04/21 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK @@ -131,13 +131,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define o0 0 +#define T4 r12 +#define T3 r11 + +#define o40 r12 +#define o56 r11 + +#define o112 r14 #define o8 r15 #define o24 r16 -#define ALPHA r17 +#define o64 r17 #define L r18 #define T1 r19 -#define KK r20 -#define BB r21 +#define o80 r20 +#define o96 r21 #define I r22 #define J r23 #define AO r24 @@ -202,6 +209,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. std r17, 256(SP) std r16, 264(SP) std r15, 272(SP) + std r14, 280(SP) #else stw r31, 144(SP) stw r30, 148(SP) @@ -220,6 +228,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stw r17, 200(SP) stw r16, 204(SP) stw r15, 208(SP) + stw r14, 212(SP) #endif stfd f1, ALPHA_SP @@ -260,19 +269,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ble .L999_H1 #ifdef __64BIT__ - addi ALPHA, SP, 296 + addi T1, SP, 296 #else - addi ALPHA, SP, 224 + addi T1, SP, 224 #endif - li PRE, 256 + li PRE, 384 li o8 , 8 li o16, 16 li o24, 24 li o32, 32 li o48, 48 + li o64, 64 + li o80, 80 + li o96, 96 + li o112, 112 - lxvdsx alpha_r, 0, ALPHA + lxvdsx alpha_r, 0, T1 #include "dgemm_logic_16x4_power8.S" @@ -320,6 +333,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld r17, 256(SP) ld r16, 264(SP) ld r15, 272(SP) + ld r14, 280(SP) #else lwz r31, 144(SP) lwz r30, 148(SP) @@ -338,6 +352,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lwz r17, 200(SP) lwz r16, 204(SP) lwz r15, 208(SP) + lwz r14, 212(SP) #endif addi SP, SP, STACKSIZE diff --git a/kernel/power/dgemm_logic_16x4_power8.S b/kernel/power/dgemm_logic_16x4_power8.S index 49c438f617..cacfab1f6f 100644 --- a/kernel/power/dgemm_logic_16x4_power8.S +++ b/kernel/power/dgemm_logic_16x4_power8.S @@ -33,195 +33,340 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * LAPACK-TEST : OK **************************************************************************************/ +#define MY_ALIGN .align 3 srawi. J, N, 2 - ble .LDGEMM_L4_END + ble LDGEMM_L4_END -.LDGEMM_L4_BEGIN: +LDGEMM_L4_BEGIN: - mr CO, C + li T1, 128 + li T2, 256 mr AO, A - slwi T1, LDC , 2 - add C, C, T1 + + mr CO, C + slwi T3, LDC , 2 + add C, C, T3 + + dcbt A, T1 + dcbt A, T2 + srawi. I, M, 4 - ble .LDGEMM_L4x16_END + ble LDGEMM_L4x16_END + + MY_ALIGN +LDGEMM_L4x16_BEGIN_FIRST: -.LDGEMM_L4x16_BEGIN: + li L, -128 + mr T1, CO + add T2, T1, LDC + add T3, T2, LDC + add T4, T3, LDC + + and T1, T1, L + and T2, T2, L + and T3, T3, L + and T4, T4, L + + dcbt T1, r0 + dcbt T2, r0 + dcbt T3, r0 + dcbt T4, r0 mr BO, B - srawi. L, K, 3 - ble .LDGEMM_L4x16_SUB0 + srawi. 
L, K, 2 + + addi T1, T1, 128 + addi T2, T2, 128 + addi T3, T3, 128 + addi T4, T4, 128 + + dcbt T1, r0 + dcbt T2, r0 + dcbt T3, r0 + dcbt T4, r0 + + ble LDGEMM_L4x16_SUB0_FIRST cmpwi cr0, L, 1 - ble .LDGEMM_L4x16_SUB4 + ble LDGEMM_L4x16_SUB4_FIRST -.LDGEMM_L4x16_LOOP_START: + MY_ALIGN +LDGEMM_L4x16_LOOP_START_FIRST: - dcbt AO, PRE + li T2, 512 + li o40, 40 + li o56, 56 + + dcbt AO, PRE + dcbt BO, T2 LOAD4x16_1 - dcbt AO, PRE + dcbt AO, PRE KERNEL4x16_I1 - dcbt AO, PRE - KERNEL4x16_2 - dcbt AO, PRE - KERNEL4x16_1 - dcbt AO, PRE - KERNEL4x16_2 + dcbt AO, PRE + addic. L, L, -2 + KERNEL4x16_L2 - dcbt AO, PRE - KERNEL4x16_1 - dcbt AO, PRE - KERNEL4x16_2 - dcbt AO, PRE - KERNEL4x16_1 - dcbt AO, PRE - KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_L1 + dcbt AO, PRE + dcbt BO, T2 + KERNEL4x16_L2 - addic. L, L, -2 - ble .LDGEMM_L4x16_LOOP_END + ble LDGEMM_L4x16_LOOP_END_FIRST + mtctr L - .align 5 + MY_ALIGN -.LDGEMM_L4x16_LOOP: +LDGEMM_L4x16_LOOP_FIRST: - dcbt AO, PRE - KERNEL4x16_1 - dcbt AO, PRE - KERNEL4x16_2 - dcbt AO, PRE - KERNEL4x16_1 - dcbt AO, PRE - KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_L1 + dcbt AO, PRE + KERNEL4x16_L2 - dcbt AO, PRE - KERNEL4x16_1 - dcbt AO, PRE - KERNEL4x16_2 - dcbt AO, PRE - KERNEL4x16_1 - dcbt AO, PRE - KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_L1 + dcbt AO, PRE + dcbt BO, T2 + KERNEL4x16_L2 - addic. L, L, -1 - bgt .LDGEMM_L4x16_LOOP + bdnz LDGEMM_L4x16_LOOP_FIRST -.LDGEMM_L4x16_LOOP_END: + MY_ALIGN - dcbt AO, PRE - KERNEL4x16_1 - dcbt AO, PRE - KERNEL4x16_2 - dcbt AO, PRE - KERNEL4x16_1 - dcbt AO, PRE - KERNEL4x16_2 +LDGEMM_L4x16_LOOP_END_FIRST: + + KERNEL4x16_L1 + KERNEL4x16_L2 - dcbt AO, PRE - KERNEL4x16_1 - dcbt AO, PRE - KERNEL4x16_2 - dcbt AO, PRE KERNEL4x16_1 KERNEL4x16_E2 - b .LDGEMM_L4x16_SUB1 + b LDGEMM_L4x16_SUB1_FIRST -.LDGEMM_L4x16_SUB4: +LDGEMM_L4x16_SUB4_FIRST: - dcbt AO, PRE KERNEL4x16_SUBI1 - dcbt AO, PRE KERNEL4x16_SUB1 - dcbt AO, PRE KERNEL4x16_SUB1 - dcbt AO, PRE KERNEL4x16_SUB1 + b LDGEMM_L4x16_SUB1_FIRST + +LDGEMM_L4x16_SUB0_FIRST: + + andi. L, K, 3 + + KERNEL4x16_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L4x16_SAVE_FIRST + b LDGEMM_L4x16_SUB2_FIRST + +LDGEMM_L4x16_SUB1_FIRST: + + andi. L, K, 3 + ble LDGEMM_L4x16_SAVE_FIRST + +LDGEMM_L4x16_SUB2_FIRST: + KERNEL4x16_SUB1 - KERNEL4x16_SUB1 - KERNEL4x16_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L4x16_SUB2_FIRST + + MY_ALIGN +LDGEMM_L4x16_SAVE_FIRST: + + SAVE4x16 + + addic. I, I, -1 + ble LDGEMM_L4x16_END + +LDGEMM_L4x16_END_FIRST: + + MY_ALIGN + +LDGEMM_L4x16_BEGIN: + + li L, -128 + + mr T1, CO + add T2, T1, LDC + add T3, T2, LDC + add T4, T3, LDC + + and T1, T1, L + and T2, T2, L + and T3, T3, L + and T4, T4, L + + dcbt T1, r0 + dcbt T2, r0 + dcbt T3, r0 + dcbt T4, r0 + + mr BO, B + srawi. L, K, 1 + + addi T1, T1, 128 + addi T2, T2, 128 + addi T3, T3, 128 + addi T4, T4, 128 + + dcbt T1, r0 + dcbt T2, r0 + dcbt T3, r0 + dcbt T4, r0 + + ble- LDGEMM_L4x16_SUB0 + cmpwi cr0, L, 1 + ble- LDGEMM_L4x16_SUB4 + + MY_ALIGN + +LDGEMM_L4x16_LOOP_START: + + li o40, 40 + li o56, 56 + + dcbt AO, PRE + LOAD4x16_1 + dcbt AO, PRE + KERNEL4x16_I1 + dcbt AO, PRE + addic. L, L, -2 + KERNEL4x16_L2 + + ble- LDGEMM_L4x16_LOOP_END + mtctr L + + MY_ALIGN + +LDGEMM_L4x16_LOOP: + + dcbt AO, PRE + KERNEL4x16_L1 + dcbt AO, PRE + KERNEL4x16_L2 + + bdnz+ LDGEMM_L4x16_LOOP + + + MY_ALIGN + +LDGEMM_L4x16_LOOP_END: + + KERNEL4x16_1 + KERNEL4x16_E2 + + b LDGEMM_L4x16_SUB1 + + MY_ALIGN + +LDGEMM_L4x16_SUB4: + + KERNEL4x16_SUBI1 KERNEL4x16_SUB1 - b .LDGEMM_L4x16_SUB1 + b LDGEMM_L4x16_SUB1 -.LDGEMM_L4x16_SUB0: + MY_ALIGN - andi. 
L, K, 7 +LDGEMM_L4x16_SUB0: + + andi. L, K, 1 KERNEL4x16_SUBI1 addic. L, L, -1 - ble .LDGEMM_L4x16_SAVE - b .LDGEMM_L4x16_SUB2 + ble LDGEMM_L4x16_SAVE + b LDGEMM_L4x16_SUB2 -.LDGEMM_L4x16_SUB1: + MY_ALIGN - andi. L, K, 7 - ble .LDGEMM_L4x16_SAVE +LDGEMM_L4x16_SUB1: -.LDGEMM_L4x16_SUB2: + andi. L, K, 1 + ble LDGEMM_L4x16_SAVE + + MY_ALIGN + +LDGEMM_L4x16_SUB2: KERNEL4x16_SUB1 addic. L, L, -1 - bgt .LDGEMM_L4x16_SUB2 + bgt LDGEMM_L4x16_SUB2 + + MY_ALIGN -.LDGEMM_L4x16_SAVE: +LDGEMM_L4x16_SAVE: SAVE4x16 addic. I, I, -1 - bgt .LDGEMM_L4x16_BEGIN + bgt+ LDGEMM_L4x16_BEGIN -.LDGEMM_L4x16_END: +LDGEMM_L4x16_END: -.LDGEMM_L4x8_BEGIN: +LDGEMM_L4x8_BEGIN: andi. T2, M, 15 - ble .LDGEMM_L4x1_END + ble LDGEMM_L4x1_END andi. T1, M, 8 - ble .LDGEMM_L4x8_END + ble LDGEMM_L4x8_END mr BO, B srawi. L, K, 3 - ble .LDGEMM_L4x8_SUB0 + ble LDGEMM_L4x8_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L4x8_SUB4 + ble LDGEMM_L4x8_SUB4 -.LDGEMM_L4x8_LOOP_START: +LDGEMM_L4x8_LOOP_START: + dcbt AO, PRE LOAD4x8_1 KERNEL4x8_I1 + dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 + dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 + dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 + dcbt AO, PRE KERNEL4x8_2 addic. L, L, -2 - ble .LDGEMM_L4x8_LOOP_END + ble LDGEMM_L4x8_LOOP_END - .align 5 + MY_ALIGN -.LDGEMM_L4x8_LOOP: +LDGEMM_L4x8_LOOP: KERNEL4x8_1 + dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 + dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 + dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 + dcbt AO, PRE KERNEL4x8_2 addic. L, L, -1 - bgt .LDGEMM_L4x8_LOOP + bgt LDGEMM_L4x8_LOOP -.LDGEMM_L4x8_LOOP_END: +LDGEMM_L4x8_LOOP_END: KERNEL4x8_1 KERNEL4x8_2 @@ -233,9 +378,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x8_1 KERNEL4x8_E2 - b .LDGEMM_L4x8_SUB1 + b LDGEMM_L4x8_SUB1 -.LDGEMM_L4x8_SUB4: +LDGEMM_L4x8_SUB4: KERNEL4x8_SUBI1 KERNEL4x8_SUB1 @@ -247,81 +392,86 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x8_SUB1 KERNEL4x8_SUB1 - b .LDGEMM_L4x8_SUB1 + b LDGEMM_L4x8_SUB1 -.LDGEMM_L4x8_SUB0: +LDGEMM_L4x8_SUB0: andi. L, K, 7 KERNEL4x8_SUBI1 addic. L, L, -1 - ble .LDGEMM_L4x8_SAVE - b .LDGEMM_L4x8_SUB2 + ble LDGEMM_L4x8_SAVE + b LDGEMM_L4x8_SUB2 -.LDGEMM_L4x8_SUB1: +LDGEMM_L4x8_SUB1: andi. L, K, 7 - ble .LDGEMM_L4x8_SAVE + ble LDGEMM_L4x8_SAVE -.LDGEMM_L4x8_SUB2: +LDGEMM_L4x8_SUB2: KERNEL4x8_SUB1 addic. L, L, -1 - bgt .LDGEMM_L4x8_SUB2 + bgt LDGEMM_L4x8_SUB2 -.LDGEMM_L4x8_SAVE: +LDGEMM_L4x8_SAVE: SAVE4x8 -.LDGEMM_L4x8_END: +LDGEMM_L4x8_END: -.LDGEMM_L4x4_BEGIN: +LDGEMM_L4x4_BEGIN: andi. T1, M, 4 - ble .LDGEMM_L4x4_END + ble LDGEMM_L4x4_END mr BO, B srawi. L, K, 3 - ble .LDGEMM_L4x4_SUB0 + ble LDGEMM_L4x4_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L4x4_SUB4 + ble LDGEMM_L4x4_SUB4 -.LDGEMM_L4x4_LOOP_START: +LDGEMM_L4x4_LOOP_START: + dcbt AO, PRE LOAD4x4_1 KERNEL4x4_I1 KERNEL4x4_2 KERNEL4x4_1 + dcbt AO, PRE KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 + dcbt AO, PRE KERNEL4x4_2 addic. L, L, -2 - ble .LDGEMM_L4x4_LOOP_END + ble LDGEMM_L4x4_LOOP_END - .align 5 + MY_ALIGN -.LDGEMM_L4x4_LOOP: +LDGEMM_L4x4_LOOP: KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 + dcbt AO, PRE KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 + dcbt AO, PRE KERNEL4x4_2 addic. L, L, -1 - bgt .LDGEMM_L4x4_LOOP + bgt LDGEMM_L4x4_LOOP -.LDGEMM_L4x4_LOOP_END: +LDGEMM_L4x4_LOOP_END: KERNEL4x4_1 KERNEL4x4_2 @@ -333,9 +483,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
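The new LDGEMM_L4x16 prologues earlier in this file take the four output pointers (CO, CO+LDC, ...), round each one down to its 128-byte cache line with an and against -128, and dcbt that line and the next one so the later stores in SAVE4x16 hit warm lines. A C rendering of that per-pointer touch, using GCC's __builtin_prefetch as a stand-in for the dcbt instruction (sketch only):

#include <stdint.h>

/* Round a C pointer down to its 128-byte line and prefetch that line and
 * the following one, mirroring the li/and/dcbt sequence in the prologue. */
static inline void touch_c_row(const double *c_ptr)
{
    uintptr_t p = (uintptr_t)c_ptr & ~(uintptr_t)127;   /* li L,-128 ; and T1,T1,L */
    __builtin_prefetch((const void *)p);                /* dcbt T1, r0             */
    __builtin_prefetch((const void *)(p + 128));        /* addi T1,T1,128 ; dcbt   */
}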
KERNEL4x4_1 KERNEL4x4_E2 - b .LDGEMM_L4x4_SUB1 + b LDGEMM_L4x4_SUB1 -.LDGEMM_L4x4_SUB4: +LDGEMM_L4x4_SUB4: KERNEL4x4_SUBI1 KERNEL4x4_SUB1 @@ -347,48 +497,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x4_SUB1 KERNEL4x4_SUB1 - b .LDGEMM_L4x4_SUB1 + b LDGEMM_L4x4_SUB1 -.LDGEMM_L4x4_SUB0: +LDGEMM_L4x4_SUB0: andi. L, K, 7 KERNEL4x4_SUBI1 addic. L, L, -1 - ble .LDGEMM_L4x4_SAVE - b .LDGEMM_L4x4_SUB2 + ble LDGEMM_L4x4_SAVE + b LDGEMM_L4x4_SUB2 -.LDGEMM_L4x4_SUB1: +LDGEMM_L4x4_SUB1: andi. L, K, 7 - ble .LDGEMM_L4x4_SAVE + ble LDGEMM_L4x4_SAVE -.LDGEMM_L4x4_SUB2: +LDGEMM_L4x4_SUB2: KERNEL4x4_SUB1 addic. L, L, -1 - bgt .LDGEMM_L4x4_SUB2 + bgt LDGEMM_L4x4_SUB2 -.LDGEMM_L4x4_SAVE: +LDGEMM_L4x4_SAVE: SAVE4x4 -.LDGEMM_L4x4_END: +LDGEMM_L4x4_END: -.LDGEMM_L4x2_BEGIN: +LDGEMM_L4x2_BEGIN: andi. T1, M, 2 - ble .LDGEMM_L4x2_END + ble LDGEMM_L4x2_END mr BO, B srawi. L, K, 3 - ble .LDGEMM_L4x2_SUB0 + ble LDGEMM_L4x2_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L4x2_SUB4 + ble LDGEMM_L4x2_SUB4 -.LDGEMM_L4x2_LOOP_START: +LDGEMM_L4x2_LOOP_START: LOAD4x2_1 KERNEL4x2_I1 @@ -402,11 +552,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x2_2 addic. L, L, -2 - ble .LDGEMM_L4x2_LOOP_END + ble LDGEMM_L4x2_LOOP_END - .align 5 + MY_ALIGN -.LDGEMM_L4x2_LOOP: +LDGEMM_L4x2_LOOP: KERNEL4x2_1 KERNEL4x2_2 @@ -419,9 +569,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x2_2 addic. L, L, -1 - bgt .LDGEMM_L4x2_LOOP + bgt LDGEMM_L4x2_LOOP -.LDGEMM_L4x2_LOOP_END: +LDGEMM_L4x2_LOOP_END: KERNEL4x2_1 KERNEL4x2_2 @@ -433,9 +583,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x2_1 KERNEL4x2_E2 - b .LDGEMM_L4x2_SUB1 + b LDGEMM_L4x2_SUB1 -.LDGEMM_L4x2_SUB4: +LDGEMM_L4x2_SUB4: KERNEL4x2_SUBI1 KERNEL4x2_SUB1 @@ -447,48 +597,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x2_SUB1 KERNEL4x2_SUB1 - b .LDGEMM_L4x2_SUB1 + b LDGEMM_L4x2_SUB1 -.LDGEMM_L4x2_SUB0: +LDGEMM_L4x2_SUB0: andi. L, K, 7 KERNEL4x2_SUBI1 addic. L, L, -1 - ble .LDGEMM_L4x2_SAVE - b .LDGEMM_L4x2_SUB2 + ble LDGEMM_L4x2_SAVE + b LDGEMM_L4x2_SUB2 -.LDGEMM_L4x2_SUB1: +LDGEMM_L4x2_SUB1: andi. L, K, 7 - ble .LDGEMM_L4x2_SAVE + ble LDGEMM_L4x2_SAVE -.LDGEMM_L4x2_SUB2: +LDGEMM_L4x2_SUB2: KERNEL4x2_SUB1 addic. L, L, -1 - bgt .LDGEMM_L4x2_SUB2 + bgt LDGEMM_L4x2_SUB2 -.LDGEMM_L4x2_SAVE: +LDGEMM_L4x2_SAVE: SAVE4x2 -.LDGEMM_L4x2_END: +LDGEMM_L4x2_END: -.LDGEMM_L4x1_BEGIN: +LDGEMM_L4x1_BEGIN: andi. T1, M, 1 - ble .LDGEMM_L4x1_END + ble LDGEMM_L4x1_END mr BO, B srawi. L, K, 3 - ble .LDGEMM_L4x1_SUB0 + ble LDGEMM_L4x1_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L4x1_SUB4 + ble LDGEMM_L4x1_SUB4 -.LDGEMM_L4x1_LOOP_START: +LDGEMM_L4x1_LOOP_START: LOAD4x1_1 KERNEL4x1_I1 @@ -502,11 +652,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x1_2 addic. L, L, -2 - ble .LDGEMM_L4x1_LOOP_END + ble LDGEMM_L4x1_LOOP_END - .align 5 + MY_ALIGN -.LDGEMM_L4x1_LOOP: +LDGEMM_L4x1_LOOP: KERNEL4x1_1 KERNEL4x1_2 @@ -519,9 +669,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x1_2 addic. L, L, -1 - bgt .LDGEMM_L4x1_LOOP + bgt LDGEMM_L4x1_LOOP -.LDGEMM_L4x1_LOOP_END: +LDGEMM_L4x1_LOOP_END: KERNEL4x1_1 KERNEL4x1_2 @@ -533,9 +683,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
KERNEL4x1_1 KERNEL4x1_E2 - b .LDGEMM_L4x1_SUB1 + b LDGEMM_L4x1_SUB1 -.LDGEMM_L4x1_SUB4: +LDGEMM_L4x1_SUB4: KERNEL4x1_SUBI1 KERNEL4x1_SUB1 @@ -547,74 +697,74 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x1_SUB1 KERNEL4x1_SUB1 - b .LDGEMM_L4x1_SUB1 + b LDGEMM_L4x1_SUB1 -.LDGEMM_L4x1_SUB0: +LDGEMM_L4x1_SUB0: andi. L, K, 7 KERNEL4x1_SUBI1 addic. L, L, -1 - ble .LDGEMM_L4x1_SAVE - b .LDGEMM_L4x1_SUB2 + ble LDGEMM_L4x1_SAVE + b LDGEMM_L4x1_SUB2 -.LDGEMM_L4x1_SUB1: +LDGEMM_L4x1_SUB1: andi. L, K, 7 - ble .LDGEMM_L4x1_SAVE + ble LDGEMM_L4x1_SAVE -.LDGEMM_L4x1_SUB2: +LDGEMM_L4x1_SUB2: KERNEL4x1_SUB1 addic. L, L, -1 - bgt .LDGEMM_L4x1_SUB2 + bgt LDGEMM_L4x1_SUB2 -.LDGEMM_L4x1_SAVE: +LDGEMM_L4x1_SAVE: SAVE4x1 -.LDGEMM_L4x1_END: +LDGEMM_L4x1_END: slwi T1, K, 5 add B, B, T1 addic. J, J, -1 - bgt .LDGEMM_L4_BEGIN + bgt LDGEMM_L4_BEGIN andi. T2, N, 3 ble .L999 -.LDGEMM_L4_END: +LDGEMM_L4_END: - b .LDGEMM_L2_BEGIN + b LDGEMM_L2_BEGIN .L999_H1: b .L999 -.LDGEMM_L2_BEGIN: +LDGEMM_L2_BEGIN: andi. T1, N, 2 - ble .LDGEMM_L2_END + ble LDGEMM_L2_END mr CO, C mr AO, A slwi T1, LDC , 1 add C, C, T1 srawi. I, M, 4 - ble .LDGEMM_L2x16_END + ble LDGEMM_L2x16_END -.LDGEMM_L2x16_BEGIN: +LDGEMM_L2x16_BEGIN: mr BO, B srawi. L, K, 3 - ble .LDGEMM_L2x16_SUB0 + ble LDGEMM_L2x16_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L2x16_SUB4 + ble LDGEMM_L2x16_SUB4 -.LDGEMM_L2x16_LOOP_START: +LDGEMM_L2x16_LOOP_START: dcbt AO, PRE LOAD2x16_1 @@ -637,11 +787,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x16_2 addic. L, L, -2 - ble .LDGEMM_L2x16_LOOP_END + ble LDGEMM_L2x16_LOOP_END - .align 5 + MY_ALIGN -.LDGEMM_L2x16_LOOP: +LDGEMM_L2x16_LOOP: dcbt AO, PRE KERNEL2x16_1 @@ -662,9 +812,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x16_2 addic. L, L, -1 - bgt .LDGEMM_L2x16_LOOP + bgt LDGEMM_L2x16_LOOP -.LDGEMM_L2x16_LOOP_END: +LDGEMM_L2x16_LOOP_END: dcbt AO, PRE KERNEL2x16_1 @@ -683,9 +833,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x16_1 KERNEL2x16_E2 - b .LDGEMM_L2x16_SUB1 + b LDGEMM_L2x16_SUB1 -.LDGEMM_L2x16_SUB4: +LDGEMM_L2x16_SUB4: dcbt AO, PRE KERNEL2x16_SUBI1 @@ -701,86 +851,95 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x16_SUB1 KERNEL2x16_SUB1 - b .LDGEMM_L2x16_SUB1 + b LDGEMM_L2x16_SUB1 -.LDGEMM_L2x16_SUB0: +LDGEMM_L2x16_SUB0: andi. L, K, 7 KERNEL2x16_SUBI1 addic. L, L, -1 - ble .LDGEMM_L2x16_SAVE - b .LDGEMM_L2x16_SUB2 + ble LDGEMM_L2x16_SAVE + b LDGEMM_L2x16_SUB2 -.LDGEMM_L2x16_SUB1: +LDGEMM_L2x16_SUB1: andi. L, K, 7 - ble .LDGEMM_L2x16_SAVE + ble LDGEMM_L2x16_SAVE -.LDGEMM_L2x16_SUB2: +LDGEMM_L2x16_SUB2: KERNEL2x16_SUB1 addic. L, L, -1 - bgt .LDGEMM_L2x16_SUB2 + bgt LDGEMM_L2x16_SUB2 -.LDGEMM_L2x16_SAVE: +LDGEMM_L2x16_SAVE: SAVE2x16 addic. I, I, -1 - bgt .LDGEMM_L2x16_BEGIN + bgt LDGEMM_L2x16_BEGIN -.LDGEMM_L2x16_END: +LDGEMM_L2x16_END: -.LDGEMM_L2x8_BEGIN: +LDGEMM_L2x8_BEGIN: andi. T2, M, 15 - ble .LDGEMM_L2x1_END + ble LDGEMM_L2x1_END andi. T1, M, 8 - ble .LDGEMM_L2x8_END + ble LDGEMM_L2x8_END mr BO, B srawi. L, K, 3 - ble .LDGEMM_L2x8_SUB0 + ble LDGEMM_L2x8_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L2x8_SUB4 + ble LDGEMM_L2x8_SUB4 -.LDGEMM_L2x8_LOOP_START: +LDGEMM_L2x8_LOOP_START: + dcbt AO, PRE LOAD2x8_1 KERNEL2x8_I1 + dcbt AO, PRE KERNEL2x8_2 KERNEL2x8_1 + dcbt AO, PRE KERNEL2x8_2 KERNEL2x8_1 + dcbt AO, PRE KERNEL2x8_2 KERNEL2x8_1 + dcbt AO, PRE KERNEL2x8_2 addic. 
L, L, -2 - ble .LDGEMM_L2x8_LOOP_END + ble LDGEMM_L2x8_LOOP_END - .align 5 + MY_ALIGN -.LDGEMM_L2x8_LOOP: +LDGEMM_L2x8_LOOP: KERNEL2x8_1 + dcbt AO, PRE KERNEL2x8_2 KERNEL2x8_1 + dcbt AO, PRE KERNEL2x8_2 KERNEL2x8_1 + dcbt AO, PRE KERNEL2x8_2 KERNEL2x8_1 + dcbt AO, PRE KERNEL2x8_2 addic. L, L, -1 - bgt .LDGEMM_L2x8_LOOP + bgt LDGEMM_L2x8_LOOP -.LDGEMM_L2x8_LOOP_END: +LDGEMM_L2x8_LOOP_END: KERNEL2x8_1 KERNEL2x8_2 @@ -792,9 +951,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x8_1 KERNEL2x8_E2 - b .LDGEMM_L2x8_SUB1 + b LDGEMM_L2x8_SUB1 -.LDGEMM_L2x8_SUB4: +LDGEMM_L2x8_SUB4: KERNEL2x8_SUBI1 KERNEL2x8_SUB1 @@ -806,48 +965,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x8_SUB1 KERNEL2x8_SUB1 - b .LDGEMM_L2x8_SUB1 + b LDGEMM_L2x8_SUB1 -.LDGEMM_L2x8_SUB0: +LDGEMM_L2x8_SUB0: andi. L, K, 7 KERNEL2x8_SUBI1 addic. L, L, -1 - ble .LDGEMM_L2x8_SAVE - b .LDGEMM_L2x8_SUB2 + ble LDGEMM_L2x8_SAVE + b LDGEMM_L2x8_SUB2 -.LDGEMM_L2x8_SUB1: +LDGEMM_L2x8_SUB1: andi. L, K, 7 - ble .LDGEMM_L2x8_SAVE + ble LDGEMM_L2x8_SAVE -.LDGEMM_L2x8_SUB2: +LDGEMM_L2x8_SUB2: KERNEL2x8_SUB1 addic. L, L, -1 - bgt .LDGEMM_L2x8_SUB2 + bgt LDGEMM_L2x8_SUB2 -.LDGEMM_L2x8_SAVE: +LDGEMM_L2x8_SAVE: SAVE2x8 -.LDGEMM_L2x8_END: +LDGEMM_L2x8_END: -.LDGEMM_L2x4_BEGIN: +LDGEMM_L2x4_BEGIN: andi. T1, M, 4 - ble .LDGEMM_L2x4_END + ble LDGEMM_L2x4_END mr BO, B srawi. L, K, 3 - ble .LDGEMM_L2x4_SUB0 + ble LDGEMM_L2x4_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L2x4_SUB4 + ble LDGEMM_L2x4_SUB4 -.LDGEMM_L2x4_LOOP_START: +LDGEMM_L2x4_LOOP_START: LOAD2x4_1 KERNEL2x4_I1 @@ -861,11 +1020,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x4_2 addic. L, L, -2 - ble .LDGEMM_L2x4_LOOP_END + ble LDGEMM_L2x4_LOOP_END - .align 5 + MY_ALIGN -.LDGEMM_L2x4_LOOP: +LDGEMM_L2x4_LOOP: KERNEL2x4_1 KERNEL2x4_2 @@ -878,9 +1037,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x4_2 addic. L, L, -1 - bgt .LDGEMM_L2x4_LOOP + bgt LDGEMM_L2x4_LOOP -.LDGEMM_L2x4_LOOP_END: +LDGEMM_L2x4_LOOP_END: KERNEL2x4_1 KERNEL2x4_2 @@ -892,9 +1051,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x4_1 KERNEL2x4_E2 - b .LDGEMM_L2x4_SUB1 + b LDGEMM_L2x4_SUB1 -.LDGEMM_L2x4_SUB4: +LDGEMM_L2x4_SUB4: KERNEL2x4_SUBI1 KERNEL2x4_SUB1 @@ -906,48 +1065,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x4_SUB1 KERNEL2x4_SUB1 - b .LDGEMM_L2x4_SUB1 + b LDGEMM_L2x4_SUB1 -.LDGEMM_L2x4_SUB0: +LDGEMM_L2x4_SUB0: andi. L, K, 7 KERNEL2x4_SUBI1 addic. L, L, -1 - ble .LDGEMM_L2x4_SAVE - b .LDGEMM_L2x4_SUB2 + ble LDGEMM_L2x4_SAVE + b LDGEMM_L2x4_SUB2 -.LDGEMM_L2x4_SUB1: +LDGEMM_L2x4_SUB1: andi. L, K, 7 - ble .LDGEMM_L2x4_SAVE + ble LDGEMM_L2x4_SAVE -.LDGEMM_L2x4_SUB2: +LDGEMM_L2x4_SUB2: KERNEL2x4_SUB1 addic. L, L, -1 - bgt .LDGEMM_L2x4_SUB2 + bgt LDGEMM_L2x4_SUB2 -.LDGEMM_L2x4_SAVE: +LDGEMM_L2x4_SAVE: SAVE2x4 -.LDGEMM_L2x4_END: +LDGEMM_L2x4_END: -.LDGEMM_L2x2_BEGIN: +LDGEMM_L2x2_BEGIN: andi. T1, M, 2 - ble .LDGEMM_L2x2_END + ble LDGEMM_L2x2_END mr BO, B srawi. L, K, 3 - ble .LDGEMM_L2x2_SUB0 + ble LDGEMM_L2x2_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L2x2_SUB4 + ble LDGEMM_L2x2_SUB4 -.LDGEMM_L2x2_LOOP_START: +LDGEMM_L2x2_LOOP_START: LOAD2x2_1 KERNEL2x2_I1 @@ -961,11 +1120,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x2_2 addic. 
L, L, -2 - ble .LDGEMM_L2x2_LOOP_END + ble LDGEMM_L2x2_LOOP_END - .align 5 + MY_ALIGN -.LDGEMM_L2x2_LOOP: +LDGEMM_L2x2_LOOP: KERNEL2x2_1 KERNEL2x2_2 @@ -978,9 +1137,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x2_2 addic. L, L, -1 - bgt .LDGEMM_L2x2_LOOP + bgt LDGEMM_L2x2_LOOP -.LDGEMM_L2x2_LOOP_END: +LDGEMM_L2x2_LOOP_END: KERNEL2x2_1 KERNEL2x2_2 @@ -992,9 +1151,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x2_1 KERNEL2x2_E2 - b .LDGEMM_L2x2_SUB1 + b LDGEMM_L2x2_SUB1 -.LDGEMM_L2x2_SUB4: +LDGEMM_L2x2_SUB4: KERNEL2x2_SUBI1 KERNEL2x2_SUB1 @@ -1006,48 +1165,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x2_SUB1 KERNEL2x2_SUB1 - b .LDGEMM_L2x2_SUB1 + b LDGEMM_L2x2_SUB1 -.LDGEMM_L2x2_SUB0: +LDGEMM_L2x2_SUB0: andi. L, K, 7 KERNEL2x2_SUBI1 addic. L, L, -1 - ble .LDGEMM_L2x2_SAVE - b .LDGEMM_L2x2_SUB2 + ble LDGEMM_L2x2_SAVE + b LDGEMM_L2x2_SUB2 -.LDGEMM_L2x2_SUB1: +LDGEMM_L2x2_SUB1: andi. L, K, 7 - ble .LDGEMM_L2x2_SAVE + ble LDGEMM_L2x2_SAVE -.LDGEMM_L2x2_SUB2: +LDGEMM_L2x2_SUB2: KERNEL2x2_SUB1 addic. L, L, -1 - bgt .LDGEMM_L2x2_SUB2 + bgt LDGEMM_L2x2_SUB2 -.LDGEMM_L2x2_SAVE: +LDGEMM_L2x2_SAVE: SAVE2x2 -.LDGEMM_L2x2_END: +LDGEMM_L2x2_END: -.LDGEMM_L2x1_BEGIN: +LDGEMM_L2x1_BEGIN: andi. T1, M, 1 - ble .LDGEMM_L2x1_END + ble LDGEMM_L2x1_END mr BO, B srawi. L, K, 3 - ble .LDGEMM_L2x1_SUB0 + ble LDGEMM_L2x1_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L2x1_SUB4 + ble LDGEMM_L2x1_SUB4 -.LDGEMM_L2x1_LOOP_START: +LDGEMM_L2x1_LOOP_START: LOAD2x1_1 KERNEL2x1_I1 @@ -1061,11 +1220,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x1_2 addic. L, L, -2 - ble .LDGEMM_L2x1_LOOP_END + ble LDGEMM_L2x1_LOOP_END - .align 5 + MY_ALIGN -.LDGEMM_L2x1_LOOP: +LDGEMM_L2x1_LOOP: KERNEL2x1_1 KERNEL2x1_2 @@ -1078,9 +1237,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x1_2 addic. L, L, -1 - bgt .LDGEMM_L2x1_LOOP + bgt LDGEMM_L2x1_LOOP -.LDGEMM_L2x1_LOOP_END: +LDGEMM_L2x1_LOOP_END: KERNEL2x1_1 KERNEL2x1_2 @@ -1092,9 +1251,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x1_1 KERNEL2x1_E2 - b .LDGEMM_L2x1_SUB1 + b LDGEMM_L2x1_SUB1 -.LDGEMM_L2x1_SUB4: +LDGEMM_L2x1_SUB4: KERNEL2x1_SUBI1 KERNEL2x1_SUB1 @@ -1106,59 +1265,59 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x1_SUB1 KERNEL2x1_SUB1 - b .LDGEMM_L2x1_SUB1 + b LDGEMM_L2x1_SUB1 -.LDGEMM_L2x1_SUB0: +LDGEMM_L2x1_SUB0: andi. L, K, 7 KERNEL2x1_SUBI1 addic. L, L, -1 - ble .LDGEMM_L2x1_SAVE - b .LDGEMM_L2x1_SUB2 + ble LDGEMM_L2x1_SAVE + b LDGEMM_L2x1_SUB2 -.LDGEMM_L2x1_SUB1: +LDGEMM_L2x1_SUB1: andi. L, K, 7 - ble .LDGEMM_L2x1_SAVE + ble LDGEMM_L2x1_SAVE -.LDGEMM_L2x1_SUB2: +LDGEMM_L2x1_SUB2: KERNEL2x1_SUB1 addic. L, L, -1 - bgt .LDGEMM_L2x1_SUB2 + bgt LDGEMM_L2x1_SUB2 -.LDGEMM_L2x1_SAVE: +LDGEMM_L2x1_SAVE: SAVE2x1 -.LDGEMM_L2x1_END: +LDGEMM_L2x1_END: slwi T1, K, 4 add B, B, T1 -.LDGEMM_L2_END: -.LDGEMM_L1_BEGIN: +LDGEMM_L2_END: +LDGEMM_L1_BEGIN: andi. T1, N, 1 - ble .LDGEMM_L1_END + ble LDGEMM_L1_END mr CO, C mr AO, A srawi. I, M, 4 - ble .LDGEMM_L1x16_END + ble LDGEMM_L1x16_END -.LDGEMM_L1x16_BEGIN: +LDGEMM_L1x16_BEGIN: mr BO, B srawi. L, K, 3 - ble .LDGEMM_L1x16_SUB0 + ble LDGEMM_L1x16_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L1x16_SUB4 + ble LDGEMM_L1x16_SUB4 -.LDGEMM_L1x16_LOOP_START: +LDGEMM_L1x16_LOOP_START: dcbt AO, PRE LOAD1x16_1 @@ -1181,11 +1340,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
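Throughout this logic file the K dimension is consumed with the usual unroll-plus-remainder shape: srawi derives the number of unrolled passes (K >> 3 for the 8-way paths shown here), the KERNEL*_1/_2 pairs run under that count, and andi. L, K, 7 leaves the 0-7 leftover updates to KERNEL*_SUB1. A compact C skeleton of that structure, with kernel8/kernel1 as stand-in stubs rather than real OpenBLAS symbols:

#include <stdio.h>

static long work_done;
static void kernel8(void) { work_done += 8; }   /* stands in for 8 unrolled updates */
static void kernel1(void) { work_done += 1; }   /* stands in for KERNEL..._SUB1     */

static void k_loop(long k)
{
    long main_iters = k >> 3;                   /* srawi. L, K, 3 */
    long remainder  = k & 7;                    /* andi.  L, K, 7 */

    for (long l = 0; l < main_iters; l++)
        kernel8();
    for (long l = 0; l < remainder; l++)
        kernel1();
}

int main(void)
{
    k_loop(29);                                 /* 3 unrolled passes + 5 leftovers */
    printf("%ld\n", work_done);                 /* prints 29                       */
    return 0;
}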
KERNEL1x16_2 addic. L, L, -2 - ble .LDGEMM_L1x16_LOOP_END + ble LDGEMM_L1x16_LOOP_END - .align 5 + MY_ALIGN -.LDGEMM_L1x16_LOOP: +LDGEMM_L1x16_LOOP: dcbt AO, PRE KERNEL1x16_1 @@ -1206,9 +1365,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x16_2 addic. L, L, -1 - bgt .LDGEMM_L1x16_LOOP + bgt LDGEMM_L1x16_LOOP -.LDGEMM_L1x16_LOOP_END: +LDGEMM_L1x16_LOOP_END: dcbt AO, PRE KERNEL1x16_1 @@ -1227,9 +1386,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x16_1 KERNEL1x16_E2 - b .LDGEMM_L1x16_SUB1 + b LDGEMM_L1x16_SUB1 -.LDGEMM_L1x16_SUB4: +LDGEMM_L1x16_SUB4: dcbt AO, PRE KERNEL1x16_SUBI1 @@ -1245,86 +1404,95 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x16_SUB1 KERNEL1x16_SUB1 - b .LDGEMM_L1x16_SUB1 + b LDGEMM_L1x16_SUB1 -.LDGEMM_L1x16_SUB0: +LDGEMM_L1x16_SUB0: andi. L, K, 7 KERNEL1x16_SUBI1 addic. L, L, -1 - ble .LDGEMM_L1x16_SAVE - b .LDGEMM_L1x16_SUB2 + ble LDGEMM_L1x16_SAVE + b LDGEMM_L1x16_SUB2 -.LDGEMM_L1x16_SUB1: +LDGEMM_L1x16_SUB1: andi. L, K, 7 - ble .LDGEMM_L1x16_SAVE + ble LDGEMM_L1x16_SAVE -.LDGEMM_L1x16_SUB2: +LDGEMM_L1x16_SUB2: KERNEL1x16_SUB1 addic. L, L, -1 - bgt .LDGEMM_L1x16_SUB2 + bgt LDGEMM_L1x16_SUB2 -.LDGEMM_L1x16_SAVE: +LDGEMM_L1x16_SAVE: SAVE1x16 addic. I, I, -1 - bgt .LDGEMM_L1x16_BEGIN + bgt LDGEMM_L1x16_BEGIN -.LDGEMM_L1x16_END: +LDGEMM_L1x16_END: -.LDGEMM_L1x8_BEGIN: +LDGEMM_L1x8_BEGIN: andi. T2, M, 15 - ble .LDGEMM_L1x1_END + ble LDGEMM_L1x1_END andi. T1, M, 8 - ble .LDGEMM_L1x8_END + ble LDGEMM_L1x8_END mr BO, B srawi. L, K, 3 - ble .LDGEMM_L1x8_SUB0 + ble LDGEMM_L1x8_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L1x8_SUB4 + ble LDGEMM_L1x8_SUB4 -.LDGEMM_L1x8_LOOP_START: +LDGEMM_L1x8_LOOP_START: + dcbt AO, PRE LOAD1x8_1 KERNEL1x8_I1 + dcbt AO, PRE KERNEL1x8_2 KERNEL1x8_1 + dcbt AO, PRE KERNEL1x8_2 KERNEL1x8_1 + dcbt AO, PRE KERNEL1x8_2 KERNEL1x8_1 + dcbt AO, PRE KERNEL1x8_2 addic. L, L, -2 - ble .LDGEMM_L1x8_LOOP_END + ble LDGEMM_L1x8_LOOP_END - .align 5 + MY_ALIGN -.LDGEMM_L1x8_LOOP: +LDGEMM_L1x8_LOOP: KERNEL1x8_1 + dcbt AO, PRE KERNEL1x8_2 KERNEL1x8_1 + dcbt AO, PRE KERNEL1x8_2 KERNEL1x8_1 + dcbt AO, PRE KERNEL1x8_2 KERNEL1x8_1 + dcbt AO, PRE KERNEL1x8_2 addic. L, L, -1 - bgt .LDGEMM_L1x8_LOOP + bgt LDGEMM_L1x8_LOOP -.LDGEMM_L1x8_LOOP_END: +LDGEMM_L1x8_LOOP_END: KERNEL1x8_1 KERNEL1x8_2 @@ -1336,9 +1504,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x8_1 KERNEL1x8_E2 - b .LDGEMM_L1x8_SUB1 + b LDGEMM_L1x8_SUB1 -.LDGEMM_L1x8_SUB4: +LDGEMM_L1x8_SUB4: KERNEL1x8_SUBI1 KERNEL1x8_SUB1 @@ -1350,48 +1518,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x8_SUB1 KERNEL1x8_SUB1 - b .LDGEMM_L1x8_SUB1 + b LDGEMM_L1x8_SUB1 -.LDGEMM_L1x8_SUB0: +LDGEMM_L1x8_SUB0: andi. L, K, 7 KERNEL1x8_SUBI1 addic. L, L, -1 - ble .LDGEMM_L1x8_SAVE - b .LDGEMM_L1x8_SUB2 + ble LDGEMM_L1x8_SAVE + b LDGEMM_L1x8_SUB2 -.LDGEMM_L1x8_SUB1: +LDGEMM_L1x8_SUB1: andi. L, K, 7 - ble .LDGEMM_L1x8_SAVE + ble LDGEMM_L1x8_SAVE -.LDGEMM_L1x8_SUB2: +LDGEMM_L1x8_SUB2: KERNEL1x8_SUB1 addic. L, L, -1 - bgt .LDGEMM_L1x8_SUB2 + bgt LDGEMM_L1x8_SUB2 -.LDGEMM_L1x8_SAVE: +LDGEMM_L1x8_SAVE: SAVE1x8 -.LDGEMM_L1x8_END: +LDGEMM_L1x8_END: -.LDGEMM_L1x4_BEGIN: +LDGEMM_L1x4_BEGIN: andi. T1, M, 4 - ble .LDGEMM_L1x4_END + ble LDGEMM_L1x4_END mr BO, B srawi. 
L, K, 3 - ble .LDGEMM_L1x4_SUB0 + ble LDGEMM_L1x4_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L1x4_SUB4 + ble LDGEMM_L1x4_SUB4 -.LDGEMM_L1x4_LOOP_START: +LDGEMM_L1x4_LOOP_START: LOAD1x4_1 KERNEL1x4_I1 @@ -1405,11 +1573,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x4_2 addic. L, L, -2 - ble .LDGEMM_L1x4_LOOP_END + ble LDGEMM_L1x4_LOOP_END - .align 5 + MY_ALIGN -.LDGEMM_L1x4_LOOP: +LDGEMM_L1x4_LOOP: KERNEL1x4_1 KERNEL1x4_2 @@ -1422,9 +1590,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x4_2 addic. L, L, -1 - bgt .LDGEMM_L1x4_LOOP + bgt LDGEMM_L1x4_LOOP -.LDGEMM_L1x4_LOOP_END: +LDGEMM_L1x4_LOOP_END: KERNEL1x4_1 KERNEL1x4_2 @@ -1436,9 +1604,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x4_1 KERNEL1x4_E2 - b .LDGEMM_L1x4_SUB1 + b LDGEMM_L1x4_SUB1 -.LDGEMM_L1x4_SUB4: +LDGEMM_L1x4_SUB4: KERNEL1x4_SUBI1 KERNEL1x4_SUB1 @@ -1450,48 +1618,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x4_SUB1 KERNEL1x4_SUB1 - b .LDGEMM_L1x4_SUB1 + b LDGEMM_L1x4_SUB1 -.LDGEMM_L1x4_SUB0: +LDGEMM_L1x4_SUB0: andi. L, K, 7 KERNEL1x4_SUBI1 addic. L, L, -1 - ble .LDGEMM_L1x4_SAVE - b .LDGEMM_L1x4_SUB2 + ble LDGEMM_L1x4_SAVE + b LDGEMM_L1x4_SUB2 -.LDGEMM_L1x4_SUB1: +LDGEMM_L1x4_SUB1: andi. L, K, 7 - ble .LDGEMM_L1x4_SAVE + ble LDGEMM_L1x4_SAVE -.LDGEMM_L1x4_SUB2: +LDGEMM_L1x4_SUB2: KERNEL1x4_SUB1 addic. L, L, -1 - bgt .LDGEMM_L1x4_SUB2 + bgt LDGEMM_L1x4_SUB2 -.LDGEMM_L1x4_SAVE: +LDGEMM_L1x4_SAVE: SAVE1x4 -.LDGEMM_L1x4_END: +LDGEMM_L1x4_END: -.LDGEMM_L1x2_BEGIN: +LDGEMM_L1x2_BEGIN: andi. T1, M, 2 - ble .LDGEMM_L1x2_END + ble LDGEMM_L1x2_END mr BO, B srawi. L, K, 3 - ble .LDGEMM_L1x2_SUB0 + ble LDGEMM_L1x2_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L1x2_SUB4 + ble LDGEMM_L1x2_SUB4 -.LDGEMM_L1x2_LOOP_START: +LDGEMM_L1x2_LOOP_START: LOAD1x2_1 KERNEL1x2_I1 @@ -1505,11 +1673,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x2_2 addic. L, L, -2 - ble .LDGEMM_L1x2_LOOP_END + ble LDGEMM_L1x2_LOOP_END - .align 5 + MY_ALIGN -.LDGEMM_L1x2_LOOP: +LDGEMM_L1x2_LOOP: KERNEL1x2_1 KERNEL1x2_2 @@ -1522,9 +1690,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x2_2 addic. L, L, -1 - bgt .LDGEMM_L1x2_LOOP + bgt LDGEMM_L1x2_LOOP -.LDGEMM_L1x2_LOOP_END: +LDGEMM_L1x2_LOOP_END: KERNEL1x2_1 KERNEL1x2_2 @@ -1536,9 +1704,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x2_1 KERNEL1x2_E2 - b .LDGEMM_L1x2_SUB1 + b LDGEMM_L1x2_SUB1 -.LDGEMM_L1x2_SUB4: +LDGEMM_L1x2_SUB4: KERNEL1x2_SUBI1 KERNEL1x2_SUB1 @@ -1550,48 +1718,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x2_SUB1 KERNEL1x2_SUB1 - b .LDGEMM_L1x2_SUB1 + b LDGEMM_L1x2_SUB1 -.LDGEMM_L1x2_SUB0: +LDGEMM_L1x2_SUB0: andi. L, K, 7 KERNEL1x2_SUBI1 addic. L, L, -1 - ble .LDGEMM_L1x2_SAVE - b .LDGEMM_L1x2_SUB2 + ble LDGEMM_L1x2_SAVE + b LDGEMM_L1x2_SUB2 -.LDGEMM_L1x2_SUB1: +LDGEMM_L1x2_SUB1: andi. L, K, 7 - ble .LDGEMM_L1x2_SAVE + ble LDGEMM_L1x2_SAVE -.LDGEMM_L1x2_SUB2: +LDGEMM_L1x2_SUB2: KERNEL1x2_SUB1 addic. L, L, -1 - bgt .LDGEMM_L1x2_SUB2 + bgt LDGEMM_L1x2_SUB2 -.LDGEMM_L1x2_SAVE: +LDGEMM_L1x2_SAVE: SAVE1x2 -.LDGEMM_L1x2_END: +LDGEMM_L1x2_END: -.LDGEMM_L1x1_BEGIN: +LDGEMM_L1x1_BEGIN: andi. T1, M, 1 - ble .LDGEMM_L1x1_END + ble LDGEMM_L1x1_END mr BO, B srawi. 
L, K, 3 - ble .LDGEMM_L1x1_SUB0 + ble LDGEMM_L1x1_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L1x1_SUB4 + ble LDGEMM_L1x1_SUB4 -.LDGEMM_L1x1_LOOP_START: +LDGEMM_L1x1_LOOP_START: LOAD1x1_1 KERNEL1x1_I1 @@ -1605,11 +1773,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x1_2 addic. L, L, -2 - ble .LDGEMM_L1x1_LOOP_END + ble LDGEMM_L1x1_LOOP_END - .align 5 + MY_ALIGN -.LDGEMM_L1x1_LOOP: +LDGEMM_L1x1_LOOP: KERNEL1x1_1 KERNEL1x1_2 @@ -1622,9 +1790,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x1_2 addic. L, L, -1 - bgt .LDGEMM_L1x1_LOOP + bgt LDGEMM_L1x1_LOOP -.LDGEMM_L1x1_LOOP_END: +LDGEMM_L1x1_LOOP_END: KERNEL1x1_1 KERNEL1x1_2 @@ -1636,9 +1804,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x1_1 KERNEL1x1_E2 - b .LDGEMM_L1x1_SUB1 + b LDGEMM_L1x1_SUB1 -.LDGEMM_L1x1_SUB4: +LDGEMM_L1x1_SUB4: KERNEL1x1_SUBI1 KERNEL1x1_SUB1 @@ -1650,34 +1818,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x1_SUB1 KERNEL1x1_SUB1 - b .LDGEMM_L1x1_SUB1 + b LDGEMM_L1x1_SUB1 -.LDGEMM_L1x1_SUB0: +LDGEMM_L1x1_SUB0: andi. L, K, 7 KERNEL1x1_SUBI1 addic. L, L, -1 - ble .LDGEMM_L1x1_SAVE - b .LDGEMM_L1x1_SUB2 + ble LDGEMM_L1x1_SAVE + b LDGEMM_L1x1_SUB2 -.LDGEMM_L1x1_SUB1: +LDGEMM_L1x1_SUB1: andi. L, K, 7 - ble .LDGEMM_L1x1_SAVE + ble LDGEMM_L1x1_SAVE -.LDGEMM_L1x1_SUB2: +LDGEMM_L1x1_SUB2: KERNEL1x1_SUB1 addic. L, L, -1 - bgt .LDGEMM_L1x1_SUB2 + bgt LDGEMM_L1x1_SUB2 -.LDGEMM_L1x1_SAVE: +LDGEMM_L1x1_SAVE: SAVE1x1 -.LDGEMM_L1x1_END: +LDGEMM_L1x1_END: -.LDGEMM_L1_END: +LDGEMM_L1_END: diff --git a/kernel/power/dgemm_macros_16x4_power8.S b/kernel/power/dgemm_macros_16x4_power8.S index 27c05e08e5..5be517f7c6 100644 --- a/kernel/power/dgemm_macros_16x4_power8.S +++ b/kernel/power/dgemm_macros_16x4_power8.S @@ -47,88 +47,88 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO - addi AO, AO, 64 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO + lxvd2x vs4, o64, AO + lxvd2x vs5, o80, AO + lxvd2x vs6, o96, AO + lxvd2x vs7, o112, AO lxvdsx vs26, o16, BO lxvdsx vs27, o24, BO - addi AO, AO, 64 + addi AO, AO, 128 addi BO, BO, 32 .endm + .macro KERNEL4x16_I1 - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 - lxvd2x vs8, 0, AO + lxvd2x vs8, o0, AO lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO - xvmuldp vs36, vs4, vs24 - xvmuldp vs37, vs5, vs24 - xvmuldp vs38, vs6, vs24 - xvmuldp vs39, vs7, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 lxvdsx vs28, 0, BO lxvdsx vs29, o8, BO - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - xvmuldp vs44, vs4, vs25 - xvmuldp vs45, vs5, vs25 - xvmuldp vs46, vs6, vs25 - xvmuldp vs47, vs7, vs25 + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 - addi AO, AO, 64 - xvmuldp vs48, vs0, vs26 - xvmuldp vs49, vs1, vs26 - xvmuldp vs50, vs2, vs26 - xvmuldp vs51, vs3, vs26 + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 - lxvd2x vs12, 0, AO - lxvd2x vs13, o16, AO + lxvd2x vs12, o64, AO + lxvd2x vs13, o80, AO - xvmuldp vs52, vs4, vs26 - xvmuldp vs53, vs5, vs26 - xvmuldp vs54, vs6, vs26 - xvmuldp vs55, vs7, vs26 + xvmuldp vs52, vs4, vs26 + xvmuldp vs53, vs5, vs26 + xvmuldp vs54, vs6, vs26 + xvmuldp vs55, vs7, vs26 - lxvd2x vs14, o32, AO - lxvd2x vs15, o48, AO + lxvd2x vs14, o96, AO + lxvd2x vs15, o112, AO + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 - xvmuldp vs56, vs0, vs27 - xvmuldp vs57, vs1, vs27 - xvmuldp vs58, vs2, vs27 - xvmuldp vs59, vs3, vs27 lxvdsx vs30, o16, BO lxvdsx vs31, o24, BO - xvmuldp vs60, vs4, vs27 - xvmuldp vs61, vs5, vs27 - xvmuldp vs62, vs6, vs27 - xvmuldp vs63, vs7, vs27 + xvmuldp vs60, vs4, vs27 + xvmuldp vs61, vs5, vs27 + xvmuldp vs62, vs6, vs27 + xvmuldp vs63, vs7, vs27 - addi AO, AO, 64 - addi BO, BO, 32 + addi AO, AO, 128 .endm + + .macro KERNEL4x16_1 xvmaddadp vs32, vs0, vs24 @@ -136,8 +136,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs34, vs2, vs24 xvmaddadp vs35, vs3, vs24 - lxvd2x vs8, 0, AO + lxvd2x vs8, o0, AO lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO xvmaddadp vs36, vs4, vs24 xvmaddadp vs37, vs5, vs24 @@ -152,31 +154,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
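The KERNEL4x16_I1/_L1/_L2 rewrite in this file software-pipelines the inner loop: while one register set (vs0-vs7 for A, vs24-vs27 for B) feeds the xvmaddadp chain, the loads for the next K step refill the other set (vs8-vs15, vs28-vs31), and the _L1/_L2 macros simply alternate the two roles. A scalar C caricature of that double-buffered pattern is sketched below; dot_pipelined is illustrative and not an OpenBLAS routine.

/* Scalar caricature of the vs0-7 / vs8-15 ping-pong: compute from the current
 * buffer while the next iteration's operands are loaded into the other one. */
static double dot_pipelined(const double *a, const double *b, long k)
{
    double buf[2][2];                            /* [phase][0]=a value, [1]=b value  */
    double acc = 0.0;
    long phase = 0;

    if (k <= 0) return 0.0;

    buf[0][0] = a[0]; buf[0][1] = b[0];          /* LOAD4x16_1: prime buffer 0       */
    for (long i = 0; i + 1 < k; i++) {
        buf[phase ^ 1][0] = a[i + 1];            /* load next step into other buffer */
        buf[phase ^ 1][1] = b[i + 1];
        acc += buf[phase][0] * buf[phase][1];    /* FMAs read the current buffer     */
        phase ^= 1;                              /* KERNEL4x16_L1 / _L2 alternation  */
    }
    acc += buf[phase][0] * buf[phase][1];        /* KERNEL4x16_E2: drain last buffer */
    return acc;
}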
xvmaddadp vs42, vs2, vs25 xvmaddadp vs43, vs3, vs25 - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO xvmaddadp vs44, vs4, vs25 xvmaddadp vs45, vs5, vs25 xvmaddadp vs46, vs6, vs25 xvmaddadp vs47, vs7, vs25 - addi AO, AO, 64 xvmaddadp vs48, vs0, vs26 xvmaddadp vs49, vs1, vs26 xvmaddadp vs50, vs2, vs26 xvmaddadp vs51, vs3, vs26 - lxvd2x vs12, 0, AO - lxvd2x vs13, o16, AO + lxvd2x vs12, o64, AO + lxvd2x vs13, o80, AO xvmaddadp vs52, vs4, vs26 xvmaddadp vs53, vs5, vs26 xvmaddadp vs54, vs6, vs26 xvmaddadp vs55, vs7, vs26 - lxvd2x vs14, o32, AO - lxvd2x vs15, o48, AO + lxvd2x vs14, o96, AO + lxvd2x vs15, o112, AO xvmaddadp vs56, vs0, vs27 xvmaddadp vs57, vs1, vs27 @@ -192,7 +191,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs62, vs6, vs27 xvmaddadp vs63, vs7, vs27 - addi AO, AO, 64 + addi AO, AO, 128 addi BO, BO, 32 .endm @@ -228,23 +227,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs46, vs14, vs29 xvmaddadp vs47, vs15, vs29 - addi AO, AO, 64 xvmaddadp vs48, vs8, vs30 xvmaddadp vs49, vs9, vs30 xvmaddadp vs50, vs10, vs30 xvmaddadp vs51, vs11, vs30 - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO + lxvd2x vs4, o64, AO + lxvd2x vs5, o80, AO xvmaddadp vs52, vs12, vs30 xvmaddadp vs53, vs13, vs30 xvmaddadp vs54, vs14, vs30 xvmaddadp vs55, vs15, vs30 - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO + lxvd2x vs6, o96, AO + lxvd2x vs7, o112, AO xvmaddadp vs56, vs8, vs31 xvmaddadp vs57, vs9, vs31 @@ -259,11 +257,144 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs62, vs14, vs31 xvmaddadp vs63, vs15, vs31 - addi AO, AO, 64 + addi AO, AO, 128 addi BO, BO, 32 .endm +.macro KERNEL4x16_L1 + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + lxvd2x vs8, o0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + + lxvd2x vs12, o64, AO + lxvd2x vs13, o80, AO + + xvmaddadp vs52, vs4, vs26 + xvmaddadp vs53, vs5, vs26 + xvmaddadp vs54, vs6, vs26 + xvmaddadp vs55, vs7, vs26 + + lxvd2x vs14, o96, AO + lxvd2x vs15, o112, AO + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + + + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + xvmaddadp vs60, vs4, vs27 + xvmaddadp vs61, vs5, vs27 + xvmaddadp vs62, vs6, vs27 + xvmaddadp vs63, vs7, vs27 + + addi AO, AO, 128 + +.endm + +.macro KERNEL4x16_L2 + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + + lxvdsx vs24, o32, BO + lxvdsx vs25, o40, BO + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, 
vs15, vs29 + + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + xvmaddadp vs50, vs10, vs30 + xvmaddadp vs51, vs11, vs30 + + lxvd2x vs4, o64, AO + lxvd2x vs5, o80, AO + + xvmaddadp vs52, vs12, vs30 + xvmaddadp vs53, vs13, vs30 + xvmaddadp vs54, vs14, vs30 + xvmaddadp vs55, vs15, vs30 + + lxvd2x vs6, o96, AO + lxvd2x vs7, o112, AO + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + xvmaddadp vs58, vs10, vs31 + xvmaddadp vs59, vs11, vs31 + + lxvdsx vs26, o48, BO + lxvdsx vs27, o56, BO + + xvmaddadp vs60, vs12, vs31 + addi AO, AO, 128 + xvmaddadp vs61, vs13, vs31 + xvmaddadp vs62, vs14, vs31 + addi BO, BO, 64 + xvmaddadp vs63, vs15, vs31 + + +.endm + + .macro KERNEL4x16_E2 @@ -378,15 +509,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvdsx vs26, o16, BO lxvdsx vs27, o24, BO - addi AO, AO, 64 - addi BO, BO, 32 - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO + lxvd2x vs4, o64, AO + lxvd2x vs5, o80, AO + lxvd2x vs6, o96, AO + lxvd2x vs7, o112, AO - addi AO, AO, 64 xvmaddadp vs32, vs0, vs24 @@ -402,6 +530,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs41, vs1, vs25 xvmaddadp vs42, vs2, vs25 xvmaddadp vs43, vs3, vs25 + addi BO, BO, 32 xvmaddadp vs44, vs4, vs25 xvmaddadp vs45, vs5, vs25 xvmaddadp vs46, vs6, vs25 @@ -411,6 +540,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs49, vs1, vs26 xvmaddadp vs50, vs2, vs26 xvmaddadp vs51, vs3, vs26 + addi AO, AO, 128 xvmaddadp vs52, vs4, vs26 xvmaddadp vs53, vs5, vs26 xvmaddadp vs54, vs6, vs26 @@ -429,195 +559,126 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE4x16 - mr T1, CO - addi T2, T1, 64 - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 - lxvd2x vs2, o32, T1 - lxvd2x vs3, o48, T1 - - lxvd2x vs4, 0, T2 - lxvd2x vs5, o16, T2 - lxvd2x vs6, o32, T2 - lxvd2x vs7, o48, T2 -#endif + add T2, CO, LDC + + lxvd2x vs0, 0, CO + lxvd2x vs1, o16, CO + lxvd2x vs2, o32, CO + lxvd2x vs3, o48, CO + lxvd2x vs4, o64, CO + lxvd2x vs5, o80, CO + add T3, T2, LDC + lxvd2x vs6, o96, CO + lxvd2x vs7, o112, CO + + lxvd2x vs8, 0, T2 + lxvd2x vs9, o16, T2 + lxvd2x vs10, o32, T2 + lxvd2x vs11, o48, T2 + lxvd2x vs12, o64, T2 + lxvd2x vs13, o80, T2 + add T4, T3, LDC + lxvd2x vs14, o96, T2 + lxvd2x vs15, o112, T2 + + lxvd2x vs24, 0, T3 + lxvd2x vs25, o16, T3 + lxvd2x vs26, o32, T3 + lxvd2x vs27, o48, T3 + lxvd2x vs28, o64, T3 + lxvd2x vs29, o80, T3 + lxvd2x vs30, o96, T3 + lxvd2x vs31, o112, T3 -#ifndef TRMMKERNEL xvmaddadp vs0, vs32, alpha_r + lxvd2x vs32, 0, T4 xvmaddadp vs1, vs33, alpha_r + lxvd2x vs33, o16, T4 xvmaddadp vs2, vs34, alpha_r + lxvd2x vs34, o32, T4 xvmaddadp vs3, vs35, alpha_r + lxvd2x vs35, o48, T4 xvmaddadp vs4, vs36, alpha_r + lxvd2x vs36, o64, T4 xvmaddadp vs5, vs37, alpha_r + lxvd2x vs37, o80, T4 xvmaddadp vs6, vs38, alpha_r + lxvd2x vs38, o96, T4 xvmaddadp vs7, vs39, alpha_r -#else - xvmuldp vs0, vs32, alpha_r - xvmuldp vs1, vs33, alpha_r - xvmuldp vs2, vs34, alpha_r - xvmuldp vs3, vs35, alpha_r - xvmuldp vs4, vs36, alpha_r - xvmuldp vs5, vs37, alpha_r - xvmuldp vs6, vs38, alpha_r - xvmuldp vs7, vs39, alpha_r -#endif - - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - stxvd2x vs2, o32, T1 - stxvd2x vs3, o48, T1 - - dcbt T1, PRE - - stxvd2x vs4, 0, T2 - stxvd2x vs5, o16, T2 - stxvd2x vs6, o32, T2 - stxvd2x vs7, o48, T2 - - add T1, T1, LDC - add T2, T2, LDC + lxvd2x vs39, o112, T4 -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 - lxvd2x vs9, o16, T1 - 
lxvd2x vs10, o32, T1 - lxvd2x vs11, o48, T1 - - lxvd2x vs12, 0, T2 - lxvd2x vs13, o16, T2 - lxvd2x vs14, o32, T2 - lxvd2x vs15, o48, T2 -#endif - -#ifndef TRMMKERNEL xvmaddadp vs8, vs40, alpha_r xvmaddadp vs9, vs41, alpha_r xvmaddadp vs10, vs42, alpha_r xvmaddadp vs11, vs43, alpha_r + xvmaddadp vs12, vs44, alpha_r xvmaddadp vs13, vs45, alpha_r xvmaddadp vs14, vs46, alpha_r xvmaddadp vs15, vs47, alpha_r -#else - xvmuldp vs8, vs40, alpha_r - xvmuldp vs9, vs41, alpha_r - xvmuldp vs10, vs42, alpha_r - xvmuldp vs11, vs43, alpha_r - xvmuldp vs12, vs44, alpha_r - xvmuldp vs13, vs45, alpha_r - xvmuldp vs14, vs46, alpha_r - xvmuldp vs15, vs47, alpha_r -#endif - stxvd2x vs8, 0, T1 - stxvd2x vs9, o16, T1 - stxvd2x vs10, o32, T1 - stxvd2x vs11, o48, T1 + xvmaddadp vs24, vs48, alpha_r + xvmaddadp vs25, vs49, alpha_r + xvmaddadp vs26, vs50, alpha_r + xvmaddadp vs27, vs51, alpha_r - dcbt T1, PRE + xvmaddadp vs28, vs52, alpha_r + xvmaddadp vs29, vs53, alpha_r + xvmaddadp vs30, vs54, alpha_r + xvmaddadp vs31, vs55, alpha_r - stxvd2x vs12, 0, T2 - stxvd2x vs13, o16, T2 - stxvd2x vs14, o32, T2 - stxvd2x vs15, o48, T2 - - add T1, T1, LDC - add T2, T2, LDC + stxvd2x vs0, 0, CO + stxvd2x vs1, o16, CO + stxvd2x vs2, o32, CO + stxvd2x vs3, o48, CO -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 - lxvd2x vs2, o32, T1 - lxvd2x vs3, o48, T1 + stxvd2x vs4, o64, CO + stxvd2x vs5, o80, CO + stxvd2x vs6, o96, CO + stxvd2x vs7, o112, CO - lxvd2x vs4, 0, T2 - lxvd2x vs5, o16, T2 - lxvd2x vs6, o32, T2 - lxvd2x vs7, o48, T2 -#endif + xvmaddadp vs32, vs56, alpha_r + xvmaddadp vs33, vs57, alpha_r + xvmaddadp vs34, vs58, alpha_r + xvmaddadp vs35, vs59, alpha_r -#ifndef TRMMKERNEL - xvmaddadp vs0, vs48, alpha_r - xvmaddadp vs1, vs49, alpha_r - xvmaddadp vs2, vs50, alpha_r - xvmaddadp vs3, vs51, alpha_r - xvmaddadp vs4, vs52, alpha_r - xvmaddadp vs5, vs53, alpha_r - xvmaddadp vs6, vs54, alpha_r - xvmaddadp vs7, vs55, alpha_r -#else - xvmuldp vs0, vs48, alpha_r - xvmuldp vs1, vs49, alpha_r - xvmuldp vs2, vs50, alpha_r - xvmuldp vs3, vs51, alpha_r - xvmuldp vs4, vs52, alpha_r - xvmuldp vs5, vs53, alpha_r - xvmuldp vs6, vs54, alpha_r - xvmuldp vs7, vs55, alpha_r -#endif + xvmaddadp vs36, vs60, alpha_r + xvmaddadp vs37, vs61, alpha_r + xvmaddadp vs38, vs62, alpha_r + xvmaddadp vs39, vs63, alpha_r - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - stxvd2x vs2, o32, T1 - stxvd2x vs3, o48, T1 + addi CO, CO, 128 - dcbt T1, PRE + stxvd2x vs8, o0, T2 + stxvd2x vs9, o16, T2 + stxvd2x vs10, o32, T2 + stxvd2x vs11, o48, T2 - stxvd2x vs4, 0, T2 - stxvd2x vs5, o16, T2 - stxvd2x vs6, o32, T2 - stxvd2x vs7, o48, T2 + stxvd2x vs12, o64, T2 + stxvd2x vs13, o80, T2 + stxvd2x vs14, o96, T2 + stxvd2x vs15, o112, T2 - add T1, T1, LDC - add T2, T2, LDC + stxvd2x vs24, 0, T3 + stxvd2x vs25, o16, T3 + stxvd2x vs28, o64, T3 + stxvd2x vs29, o80, T3 -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 - lxvd2x vs9, o16, T1 - lxvd2x vs10, o32, T1 - lxvd2x vs11, o48, T1 + stxvd2x vs26, o32, T3 + stxvd2x vs27, o48, T3 + stxvd2x vs30, o96, T3 + stxvd2x vs31, o112, T3 - lxvd2x vs12, 0, T2 - lxvd2x vs13, o16, T2 - lxvd2x vs14, o32, T2 - lxvd2x vs15, o48, T2 -#endif + stxvd2x vs32, o0, T4 + stxvd2x vs33, o16, T4 + stxvd2x vs34, o32, T4 + stxvd2x vs35, o48, T4 -#ifndef TRMMKERNEL - xvmaddadp vs8, vs56, alpha_r - xvmaddadp vs9, vs57, alpha_r - xvmaddadp vs10, vs58, alpha_r - xvmaddadp vs11, vs59, alpha_r - xvmaddadp vs12, vs60, alpha_r - xvmaddadp vs13, vs61, alpha_r - xvmaddadp vs14, vs62, alpha_r - xvmaddadp vs15, vs63, alpha_r -#else - xvmuldp vs8, vs56, alpha_r - 
xvmuldp vs9, vs57, alpha_r - xvmuldp vs10, vs58, alpha_r - xvmuldp vs11, vs59, alpha_r - xvmuldp vs12, vs60, alpha_r - xvmuldp vs13, vs61, alpha_r - xvmuldp vs14, vs62, alpha_r - xvmuldp vs15, vs63, alpha_r -#endif - - stxvd2x vs8, 0, T1 - stxvd2x vs9, o16, T1 - stxvd2x vs10, o32, T1 - stxvd2x vs11, o48, T1 - - dcbt T1, PRE - - stxvd2x vs12, 0, T2 - stxvd2x vs13, o16, T2 - stxvd2x vs14, o32, T2 - stxvd2x vs15, o48, T2 + stxvd2x vs36, o64, T4 + stxvd2x vs37, o80, T4 + stxvd2x vs38, o96, T4 + stxvd2x vs39, o112, T4 - addi CO, CO, 128 .endm diff --git a/kernel/power/dgemm_ncopy_4_power8.S b/kernel/power/dgemm_ncopy_4_power8.S new file mode 100644 index 0000000000..31966047f0 --- /dev/null +++ b/kernel/power/dgemm_ncopy_4_power8.S @@ -0,0 +1,228 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/28 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#define M r3 +#define N r4 +#define A r5 +#define LDA r6 +#define B r7 + +#define A0 r8 +#define A1 r9 +#define A2 r10 +#define A3 r11 + +#define J r12 + +#define PREA r14 +#define PREB r15 +#define BO r16 +#define o64 r17 +#define o80 r18 +#define o96 r19 +#define o112 r20 +#define o8 r21 +#define T2 r22 +#define I r23 +#define o16 r24 +#define o32 r25 +#define o48 r26 +#define NOTU1 r27 +#define NOTU2 r30 +#define T1 r31 + +#define o0 0 + +#include "dgemm_ncopy_macros_4_power8.S" + +#define STACKSIZE 384 + + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + stfd f30, 128(SP) + stfd f31, 136(SP) + + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + cmpwi cr0, M, 0 + ble- L999 + cmpwi cr0, N, 0 + ble- L999 + + slwi LDA, LDA, BASE_SHIFT + + li PREA, 384 + li PREB, 384 + + li o8, 8 + li o16, 16 + li o32, 32 + li o48, 48 + li o64, 64 + li o80, 80 + li o96, 96 + li o112, 112 + +#include "dgemm_ncopy_logic_4_power8.S" + +L999: + + li r3, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + lfd f30, 128(SP) + lfd f31, 136(SP) + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + addi SP, SP, STACKSIZE + + blr + EPILOGUE + + diff 
--git a/kernel/power/dgemm_ncopy_logic_4_power8.S b/kernel/power/dgemm_ncopy_logic_4_power8.S new file mode 100644 index 0000000000..6944a7818a --- /dev/null +++ b/kernel/power/dgemm_ncopy_logic_4_power8.S @@ -0,0 +1,237 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/28 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + + mr BO, B + srawi. I, N, 2 + ble DCOPYN_L2_BEGIN + + +DCOPYN_L4_BEGIN: + + +DCOPYN_L4_LOOP: + + mr A0, A + add A1, A0, LDA + add A2, A1, LDA + add A3, A2, LDA + add A, A3, LDA + +DCOPYN_L4x16_BEGIN: + + srawi. J, M, 4 + ble DCOPYN_L4x16_END + +DCOPYN_L4x16_LOOP: + + dcbt A0, PREA + dcbt A1, PREA + dcbt A2, PREA + dcbt A3, PREA + COPY_4x16 + addic. J, J, -1 + bgt DCOPYN_L4x16_LOOP + +DCOPYN_L4x16_END: + + +DCOPYN_L4x8_BEGIN: + + andi. J, M, 8 + ble DCOPYN_L4x8_END + COPY_4x8 + +DCOPYN_L4x8_END: + + +DCOPYN_L4x4_BEGIN: + + andi. J, M, 4 + ble DCOPYN_L4x4_END + COPY_4x4 + +DCOPYN_L4x4_END: + + +DCOPYN_L4x2_BEGIN: + + andi. J, M, 2 + ble DCOPYN_L4x2_END + COPY_4x2 + +DCOPYN_L4x2_END: + + +DCOPYN_L4x1_BEGIN: + + andi. J, M, 1 + ble DCOPYN_L4x1_END + COPY_4x1 + +DCOPYN_L4x1_END: + + +DCOPYN_L4_END: + + addic. I, I, -1 + bgt DCOPYN_L4_LOOP + +DCOPYN_L2_BEGIN: + + andi. T1, 4, 2 + ble DCOPYN_L2_END + +DCOPYN_L2_LOOP: + + mr A0, A + add A1, A0, LDA + add A, A1, LDA + +DCOPYN_L2x16_BEGIN: + + srawi. J, M, 4 + ble DCOPYN_L2x16_END + +DCOPYN_L2x16_LOOP: + + COPY_2x16 + addic. J, J, -1 + bgt DCOPYN_L2x16_LOOP + +DCOPYN_L2x16_END: + + +DCOPYN_L2x8_BEGIN: + + andi. J, M, 8 + ble DCOPYN_L2x8_END + COPY_2x8 + +DCOPYN_L2x8_END: + + +DCOPYN_L2x4_BEGIN: + + andi. 
J, M, 4 + ble DCOPYN_L2x4_END + COPY_2x4 + +DCOPYN_L2x4_END: + + +DCOPYN_L2x2_BEGIN: + + andi. J, M, 2 + ble DCOPYN_L2x2_END + COPY_2x2 + +DCOPYN_L2x2_END: + + +DCOPYN_L2x1_BEGIN: + + andi. J, M, 1 + ble DCOPYN_L2x1_END + COPY_2x1 + +DCOPYN_L2x1_END: + + +DCOPYN_L2_END: + + +DCOPYN_L1_BEGIN: + + andi. T1, 4, 1 + ble DCOPYN_L1_END + +DCOPYN_L1_LOOP: + + mr A0, A + add A, A0, LDA + +DCOPYN_L1x16_BEGIN: + + srawi. J, M, 4 + ble DCOPYN_L1x16_END + +DCOPYN_L1x16_LOOP: + + COPY_1x16 + addic. J, J, -1 + bgt DCOPYN_L1x16_LOOP + +DCOPYN_L1x16_END: + + +DCOPYN_L1x8_BEGIN: + + andi. J, M, 8 + ble DCOPYN_L1x8_END + COPY_1x8 + +DCOPYN_L1x8_END: + + +DCOPYN_L1x4_BEGIN: + + andi. J, M, 4 + ble DCOPYN_L1x4_END + COPY_1x4 + +DCOPYN_L1x4_END: + + +DCOPYN_L1x2_BEGIN: + + andi. J, M, 2 + ble DCOPYN_L1x2_END + COPY_1x2 + +DCOPYN_L1x2_END: + + +DCOPYN_L1x1_BEGIN: + + andi. J, M, 1 + ble DCOPYN_L1x1_END + COPY_1x1 + +DCOPYN_L1x1_END: + + +DCOPYN_L1_END: + diff --git a/kernel/power/dgemm_ncopy_macros_4_power8.S b/kernel/power/dgemm_ncopy_macros_4_power8.S new file mode 100644 index 0000000000..fafb09877e --- /dev/null +++ b/kernel/power/dgemm_ncopy_macros_4_power8.S @@ -0,0 +1,698 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/04/28 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + +/********************************************************************************************** +* Macros for N=4 and M=16 +**********************************************************************************************/ + +.macro COPY_4x16 + + lxvd2x vs0, o0, A0 + lxvd2x vs8, o0, A1 + lxvd2x vs24, o0, A3 + lxvd2x vs16, o0, A2 + + lxvd2x vs1, o16, A0 + lxvd2x vs9, o16, A1 + lxvd2x vs17, o16, A2 + lxvd2x vs25, o16, A3 + + lxvd2x vs2, o32, A0 + lxvd2x vs10, o32, A1 + lxvd2x vs18, o32, A2 + lxvd2x vs26, o32, A3 + + lxvd2x vs3, o48, A0 + lxvd2x vs11, o48, A1 + lxvd2x vs19, o48, A2 + lxvd2x vs27, o48, A3 + + lxvd2x vs4, o64, A0 + lxvd2x vs12, o64, A1 + lxvd2x vs20, o64, A2 + lxvd2x vs28, o64, A3 + + lxvd2x vs5, o80, A0 + lxvd2x vs13, o80, A1 + lxvd2x vs21, o80, A2 + lxvd2x vs29, o80, A3 + + lxvd2x vs6, o96, A0 + lxvd2x vs14, o96, A1 + lxvd2x vs22, o96, A2 + lxvd2x vs30, o96, A3 + + lxvd2x vs7, o112, A0 + lxvd2x vs15, o112, A1 + lxvd2x vs23, o112, A2 + lxvd2x vs31, o112, A3 + + + xxpermdi vs32, vs0, vs8, 0 + xxpermdi vs33, vs16, vs24, 0 + xxpermdi vs34, vs0, vs8, 3 + xxpermdi vs35, vs16, vs24, 3 + + xxpermdi vs36, vs1, vs9, 0 + xxpermdi vs37, vs17, vs25, 0 + xxpermdi vs38, vs1, vs9, 3 + xxpermdi vs39, vs17, vs25, 3 + + xxpermdi vs40, vs2, vs10, 0 + xxpermdi vs41, vs18, vs26, 0 + xxpermdi vs42, vs2, vs10, 3 + xxpermdi vs43, vs18, vs26, 3 + + xxpermdi vs44, vs3, vs11, 0 + xxpermdi vs45, vs19, vs27, 0 + xxpermdi vs46, vs3, vs11, 3 + xxpermdi vs47, vs19, vs27, 3 + + xxpermdi vs48, vs4, vs12, 0 + xxpermdi vs49, vs20, vs28, 0 + xxpermdi vs50, vs4, vs12, 3 + xxpermdi vs51, vs20, vs28, 3 + + xxpermdi vs52, vs5, vs13, 0 + xxpermdi vs53, vs21, vs29, 0 + xxpermdi vs54, vs5, vs13, 3 + xxpermdi vs55, vs21, vs29, 3 + + addi A0, A0, 128 + addi A1, A1, 128 + + xxpermdi vs56, vs6, vs14, 0 + xxpermdi vs57, vs22, vs30, 0 + xxpermdi vs58, vs6, vs14, 3 + xxpermdi vs59, vs22, vs30, 3 + + addi A3, A3, 128 + addi A2, A2, 128 + + xxpermdi vs60, vs7, vs15, 0 + xxpermdi vs61, vs23, vs31, 0 + xxpermdi vs62, vs7, vs15, 3 + xxpermdi vs63, vs23, vs31, 3 + + dcbt BO, PREB + + stxvd2x vs32, o0, BO + stxvd2x vs33, o16, BO + stxvd2x vs34, o32, BO + stxvd2x vs35, o48, BO + stxvd2x vs36, o64, BO + stxvd2x vs37, o80, BO + stxvd2x vs38, o96, BO + stxvd2x vs39, o112, BO + addi BO, BO, 128 + + dcbt BO, PREB + + stxvd2x vs40, o0, BO + stxvd2x vs41, o16, BO + stxvd2x vs42, o32, BO + stxvd2x vs43, o48, BO + stxvd2x vs44, o64, BO + stxvd2x vs45, o80, BO + stxvd2x vs46, o96, BO + stxvd2x vs47, o112, BO + addi BO, BO, 128 + + dcbt BO, PREB + + stxvd2x vs48, o0, BO + stxvd2x vs49, o16, BO + stxvd2x vs50, o32, BO + stxvd2x vs51, o48, BO + stxvd2x vs52, o64, BO + stxvd2x vs53, o80, BO + stxvd2x vs54, o96, BO + stxvd2x vs55, o112, BO + addi BO, BO, 128 + + dcbt BO, PREB + + stxvd2x vs56, o0, BO + stxvd2x vs57, o16, BO + stxvd2x vs58, o32, BO + stxvd2x vs59, o48, BO + stxvd2x vs60, o64, BO + stxvd2x vs61, o80, BO + stxvd2x vs62, o96, BO + stxvd2x vs63, o112, BO + addi BO, BO, 128 + + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=8 
+**********************************************************************************************/ + +.macro COPY_4x8 + + lxvd2x vs0, o0, A0 + lxvd2x vs1, o16, A0 + lxvd2x vs2, o32, A0 + lxvd2x vs3, o48, A0 + addi A0, A0, 64 + + + lxvd2x vs8, o0, A1 + lxvd2x vs9, o16, A1 + lxvd2x vs10, o32, A1 + lxvd2x vs11, o48, A1 + addi A1, A1, 64 + + + lxvd2x vs16, o0, A2 + lxvd2x vs17, o16, A2 + lxvd2x vs18, o32, A2 + lxvd2x vs19, o48, A2 + addi A2, A2, 64 + + + lxvd2x vs24, o0, A3 + lxvd2x vs25, o16, A3 + lxvd2x vs26, o32, A3 + lxvd2x vs27, o48, A3 + addi A3, A3, 64 + + + xxpermdi vs32, vs0, vs8, 0 + xxpermdi vs33, vs16, vs24, 0 + xxpermdi vs34, vs0, vs8, 3 + xxpermdi vs35, vs16, vs24, 3 + + xxpermdi vs36, vs1, vs9, 0 + xxpermdi vs37, vs17, vs25, 0 + xxpermdi vs38, vs1, vs9, 3 + xxpermdi vs39, vs17, vs25, 3 + + xxpermdi vs40, vs2, vs10, 0 + xxpermdi vs41, vs18, vs26, 0 + xxpermdi vs42, vs2, vs10, 3 + xxpermdi vs43, vs18, vs26, 3 + + xxpermdi vs44, vs3, vs11, 0 + xxpermdi vs45, vs19, vs27, 0 + xxpermdi vs46, vs3, vs11, 3 + xxpermdi vs47, vs19, vs27, 3 + + + stxvd2x vs32, o0, BO + stxvd2x vs33, o16, BO + stxvd2x vs34, o32, BO + stxvd2x vs35, o48, BO + stxvd2x vs36, o64, BO + stxvd2x vs37, o80, BO + stxvd2x vs38, o96, BO + stxvd2x vs39, o112, BO + addi BO, BO, 128 + + stxvd2x vs40, o0, BO + stxvd2x vs41, o16, BO + stxvd2x vs42, o32, BO + stxvd2x vs43, o48, BO + stxvd2x vs44, o64, BO + stxvd2x vs45, o80, BO + stxvd2x vs46, o96, BO + stxvd2x vs47, o112, BO + addi BO, BO, 128 + + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=4 +**********************************************************************************************/ + +.macro COPY_4x4 + + lxvd2x vs0, o0, A0 + lxvd2x vs1, o16, A0 + addi A0, A0, 32 + + + lxvd2x vs8, o0, A1 + lxvd2x vs9, o16, A1 + addi A1, A1, 32 + + + lxvd2x vs16, o0, A2 + lxvd2x vs17, o16, A2 + addi A2, A2, 32 + + + lxvd2x vs24, o0, A3 + lxvd2x vs25, o16, A3 + addi A3, A3, 32 + + + xxpermdi vs32, vs0, vs8, 0 + xxpermdi vs33, vs16, vs24, 0 + xxpermdi vs34, vs0, vs8, 3 + xxpermdi vs35, vs16, vs24, 3 + + xxpermdi vs36, vs1, vs9, 0 + xxpermdi vs37, vs17, vs25, 0 + xxpermdi vs38, vs1, vs9, 3 + xxpermdi vs39, vs17, vs25, 3 + + + stxvd2x vs32, o0, BO + stxvd2x vs33, o16, BO + stxvd2x vs34, o32, BO + stxvd2x vs35, o48, BO + stxvd2x vs36, o64, BO + stxvd2x vs37, o80, BO + stxvd2x vs38, o96, BO + stxvd2x vs39, o112, BO + addi BO, BO, 128 + + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=2 +**********************************************************************************************/ + +.macro COPY_4x2 + + lxvd2x vs0, o0, A0 + addi A0, A0, 16 + + + lxvd2x vs8, o0, A1 + addi A1, A1, 16 + + + lxvd2x vs16, o0, A2 + addi A2, A2, 16 + + + lxvd2x vs24, o0, A3 + addi A3, A3, 16 + + + xxpermdi vs32, vs0, vs8, 0 + xxpermdi vs33, vs16, vs24, 0 + xxpermdi vs34, vs0, vs8, 3 + xxpermdi vs35, vs16, vs24, 3 + + + stxvd2x vs32, o0, BO + stxvd2x vs33, o16, BO + stxvd2x vs34, o32, BO + stxvd2x vs35, o48, BO + addi BO, BO, 64 + + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=1 +**********************************************************************************************/ + +.macro COPY_4x1 + + lxsdx vs0, o0, A0 + addi A0, A0, 8 + + + lxsdx vs8, o0, A1 + addi A1, A1, 8 + + + lxsdx vs16, o0, A2 + addi A2, A2, 8 + + + lxsdx vs24, o0, A3 + addi A3, A3, 8 + + + 
xxpermdi vs32, vs0, vs8, 0 + xxpermdi vs33, vs16, vs24, 0 + + + stxvd2x vs32, o0, BO + stxvd2x vs33, o16, BO + addi BO, BO, 32 + + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=16 +**********************************************************************************************/ + +.macro COPY_2x16 + + lxvd2x vs0, o0, A0 + lxvd2x vs1, o16, A0 + lxvd2x vs2, o32, A0 + lxvd2x vs3, o48, A0 + lxvd2x vs4, o64, A0 + lxvd2x vs5, o80, A0 + lxvd2x vs6, o96, A0 + lxvd2x vs7, o112, A0 + addi A0, A0, 128 + + + lxvd2x vs8, o0, A1 + lxvd2x vs9, o16, A1 + lxvd2x vs10, o32, A1 + lxvd2x vs11, o48, A1 + lxvd2x vs12, o64, A1 + lxvd2x vs13, o80, A1 + lxvd2x vs14, o96, A1 + lxvd2x vs15, o112, A1 + addi A1, A1, 128 + + + xxpermdi vs32, vs0, vs8, 0 + xxpermdi vs33, vs0, vs8, 3 + + xxpermdi vs34, vs1, vs9, 0 + xxpermdi vs35, vs1, vs9, 3 + + xxpermdi vs36, vs2, vs10, 0 + xxpermdi vs37, vs2, vs10, 3 + + xxpermdi vs38, vs3, vs11, 0 + xxpermdi vs39, vs3, vs11, 3 + + xxpermdi vs40, vs4, vs12, 0 + xxpermdi vs41, vs4, vs12, 3 + + xxpermdi vs42, vs5, vs13, 0 + xxpermdi vs43, vs5, vs13, 3 + + xxpermdi vs44, vs6, vs14, 0 + xxpermdi vs45, vs6, vs14, 3 + + xxpermdi vs46, vs7, vs15, 0 + xxpermdi vs47, vs7, vs15, 3 + + + stxvd2x vs32, o0, BO + stxvd2x vs33, o16, BO + stxvd2x vs34, o32, BO + stxvd2x vs35, o48, BO + stxvd2x vs36, o64, BO + stxvd2x vs37, o80, BO + stxvd2x vs38, o96, BO + stxvd2x vs39, o112, BO + addi BO, BO, 128 + + stxvd2x vs40, o0, BO + stxvd2x vs41, o16, BO + stxvd2x vs42, o32, BO + stxvd2x vs43, o48, BO + stxvd2x vs44, o64, BO + stxvd2x vs45, o80, BO + stxvd2x vs46, o96, BO + stxvd2x vs47, o112, BO + addi BO, BO, 128 + + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=8 +**********************************************************************************************/ + +.macro COPY_2x8 + + lxvd2x vs0, o0, A0 + lxvd2x vs1, o16, A0 + lxvd2x vs2, o32, A0 + lxvd2x vs3, o48, A0 + addi A0, A0, 64 + + + lxvd2x vs8, o0, A1 + lxvd2x vs9, o16, A1 + lxvd2x vs10, o32, A1 + lxvd2x vs11, o48, A1 + addi A1, A1, 64 + + + xxpermdi vs32, vs0, vs8, 0 + xxpermdi vs33, vs0, vs8, 3 + + xxpermdi vs34, vs1, vs9, 0 + xxpermdi vs35, vs1, vs9, 3 + + xxpermdi vs36, vs2, vs10, 0 + xxpermdi vs37, vs2, vs10, 3 + + xxpermdi vs38, vs3, vs11, 0 + xxpermdi vs39, vs3, vs11, 3 + + + stxvd2x vs32, o0, BO + stxvd2x vs33, o16, BO + stxvd2x vs34, o32, BO + stxvd2x vs35, o48, BO + stxvd2x vs36, o64, BO + stxvd2x vs37, o80, BO + stxvd2x vs38, o96, BO + stxvd2x vs39, o112, BO + addi BO, BO, 128 + + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=4 +**********************************************************************************************/ + +.macro COPY_2x4 + + lxvd2x vs0, o0, A0 + lxvd2x vs1, o16, A0 + addi A0, A0, 32 + + + lxvd2x vs8, o0, A1 + lxvd2x vs9, o16, A1 + addi A1, A1, 32 + + + xxpermdi vs32, vs0, vs8, 0 + xxpermdi vs33, vs0, vs8, 3 + + xxpermdi vs34, vs1, vs9, 0 + xxpermdi vs35, vs1, vs9, 3 + + + stxvd2x vs32, o0, BO + stxvd2x vs33, o16, BO + stxvd2x vs34, o32, BO + stxvd2x vs35, o48, BO + addi BO, BO, 64 + + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=2 +**********************************************************************************************/ + +.macro COPY_2x2 + + lxvd2x vs0, o0, A0 + addi A0, A0, 
16 + + + lxvd2x vs8, o0, A1 + addi A1, A1, 16 + + + xxpermdi vs32, vs0, vs8, 0 + xxpermdi vs33, vs0, vs8, 3 + + + stxvd2x vs32, o0, BO + stxvd2x vs33, o16, BO + addi BO, BO, 32 + + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=1 +**********************************************************************************************/ + +.macro COPY_2x1 + + lxsdx vs0, o0, A0 + addi A0, A0, 8 + + + lxsdx vs8, o0, A1 + addi A1, A1, 8 + + + xxpermdi vs32, vs0, vs8, 0 + + + stxvd2x vs32, o0, BO + addi BO, BO, 16 + + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=16 +**********************************************************************************************/ + +.macro COPY_1x16 + + lxvd2x vs0, o0, A0 + lxvd2x vs1, o16, A0 + lxvd2x vs2, o32, A0 + lxvd2x vs3, o48, A0 + lxvd2x vs4, o64, A0 + lxvd2x vs5, o80, A0 + lxvd2x vs6, o96, A0 + lxvd2x vs7, o112, A0 + addi A0, A0, 128 + + + stxvd2x vs0, o0, BO + stxvd2x vs1, o16, BO + stxvd2x vs2, o32, BO + stxvd2x vs3, o48, BO + addi BO, BO, 64 + + stxvd2x vs4, o0, BO + stxvd2x vs5, o16, BO + stxvd2x vs6, o32, BO + stxvd2x vs7, o48, BO + addi BO, BO, 64 + + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=8 +**********************************************************************************************/ + +.macro COPY_1x8 + + lxvd2x vs0, o0, A0 + lxvd2x vs1, o16, A0 + lxvd2x vs2, o32, A0 + lxvd2x vs3, o48, A0 + addi A0, A0, 64 + + + stxvd2x vs0, o0, BO + stxvd2x vs1, o16, BO + stxvd2x vs2, o32, BO + stxvd2x vs3, o48, BO + addi BO, BO, 64 + + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=4 +**********************************************************************************************/ + +.macro COPY_1x4 + + lxvd2x vs0, o0, A0 + lxvd2x vs1, o16, A0 + addi A0, A0, 32 + + + stxvd2x vs0, o0, BO + stxvd2x vs1, o16, BO + addi BO, BO, 32 + + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=2 +**********************************************************************************************/ + +.macro COPY_1x2 + + lxvd2x vs0, o0, A0 + addi A0, A0, 16 + + + stxvd2x vs0, o0, BO + addi BO, BO, 16 + + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=1 +**********************************************************************************************/ + +.macro COPY_1x1 + + lxsdx vs0, o0, A0 + addi A0, A0, 8 + + + stxsdx vs0, o0, BO + addi BO, BO, 8 + + +.endm + diff --git a/kernel/power/dgemm_tcopy_16_power8.S b/kernel/power/dgemm_tcopy_16_power8.S new file mode 100644 index 0000000000..eb37877e03 --- /dev/null +++ b/kernel/power/dgemm_tcopy_16_power8.S @@ -0,0 +1,211 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/21 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#define M r3 +#define N r4 +#define A r5 +#define LDA r6 +#define B r7 + +#define A0 r8 +#define A1 r9 +#define A2 r10 +#define A3 r11 + +#define J r12 + +#define PREA r14 +#define PREB r15 +#define BO r16 +#define B8 r17 +#define B4 r18 +#define B2 r19 +#define B1 r20 +#define o8 r21 +#define T2 r22 +#define I r23 +#define o16 r24 +#define o32 r25 +#define o48 r26 +#define B16 r29 +#define M16 r30 +#define T1 r31 + +#define o0 0 + +#include "dgemm_tcopy_macros_16_power8.S" + +#define STACKSIZE 384 + + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + cmpwi cr0, M, 0 + ble- L999 + cmpwi cr0, N, 0 + ble- L999 + + slwi LDA, LDA, BASE_SHIFT + slwi M16, M, 4 + BASE_SHIFT + + li T1, -16 + li T2, -8 + li PREA, -4 + li PREB, -2 + + and B8, N, T1 + and B4, N, T2 + and B2, N, PREA + and B1, N, PREB + + mullw B8, B8, M + mullw B4, B4, M + mullw B2, B2, M + mullw B1, B1, M + + slwi B8, B8, BASE_SHIFT + slwi B4, B4, BASE_SHIFT + slwi B2, B2, BASE_SHIFT + slwi B1, B1, BASE_SHIFT + + add B8, B8, B + add B4, B4, B + add B2, B2, B + add B1, B1, B + + li PREA, 384 + addi PREB, M16, 128 + + li o8, 8 + li o16, 16 + li o32, 32 + li o48, 48 + +#include "dgemm_tcopy_logic_16_power8.S" + +L999: + + li r3, 0 + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + addi SP, SP, STACKSIZE + + blr + EPILOGUE + + diff --git a/kernel/power/dgemm_tcopy_logic_16_power8.S b/kernel/power/dgemm_tcopy_logic_16_power8.S new file mode 100644 index 0000000000..3c34a6167d --- /dev/null +++ b/kernel/power/dgemm_tcopy_logic_16_power8.S @@ -0,0 +1,285 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/21 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + + srawi. I, M, 2 + ble DCOPYT_L2_BEGIN + + +DCOPYT_L4_BEGIN: + + mr A0, A + add A1, A0, LDA + add A2, A1, LDA + add A3, A2, LDA + add A, A3, LDA + mr B16, B + addi B, B, 64*SIZE + + sradi. J, N, 4 + ble DCOPYT_L4x8_BEGIN + + mr BO, B16 + addi T2, M16, 384 + mtctr J + + .align 5 + +DCOPYT_L4x16_LOOP: + + addi T1, M16, 256 + + dcbt A0, PREA + dcbt A1, PREA + dcbt A2, PREA + dcbt A3, PREA + + dcbt BO, M16 + dcbt BO, PREB + dcbt BO, T1 + dcbt BO, T2 + + COPY_4x16 + + add BO, BO, M16 + + // addic. J, J, -1 + bdnz+ DCOPYT_L4x16_LOOP + +DCOPYT_L4x8_BEGIN: + + andi. T1, N, 8 + ble DCOPYT_L4x4_BEGIN + + mr BO, B8 + + COPY_4x8 + + + addi B8, B8, 32*SIZE + +DCOPYT_L4x4_BEGIN: + + andi. T1, N, 4 + ble DCOPYT_L4x2_BEGIN + + mr BO, B4 + + COPY_4x4 + + + addi B4, B4, 16*SIZE + +DCOPYT_L4x2_BEGIN: + + andi. T1, N, 2 + ble DCOPYT_L4x1_BEGIN + + mr BO, B2 + + COPY_4x2 + + + addi B2, B2, 8*SIZE + +DCOPYT_L4x1_BEGIN: + + andi. T1, N, 1 + ble DCOPYT_L4_END + + mr BO, B1 + + COPY_4x1 + + + addi B1, B1, 4*SIZE + +DCOPYT_L4_END: + + addic. I, I, -1 + bgt DCOPYT_L4_BEGIN + + + +DCOPYT_L2_BEGIN: + + andi. T1, M, 2 + ble DCOPYT_L1_BEGIN + + mr A0, A + add A1, A0, LDA + add A, A1, LDA + mr B16, B + addi B, B, 32*SIZE + + sradi. J, N, 4 + ble DCOPYT_L2x8_BEGIN + + mr BO, B16 + +DCOPYT_L2x16_LOOP: + + COPY_2x16 + + add BO, BO, M16 + + addic. J, J, -1 + bgt DCOPYT_L2x16_LOOP + +DCOPYT_L2x8_BEGIN: + + andi. T1, N, 8 + ble DCOPYT_L2x4_BEGIN + + mr BO, B8 + + COPY_2x8 + + + addi B8, B8, 16*SIZE + +DCOPYT_L2x4_BEGIN: + + andi. T1, N, 4 + ble DCOPYT_L2x2_BEGIN + + mr BO, B4 + + COPY_2x4 + + + addi B4, B4, 8*SIZE + +DCOPYT_L2x2_BEGIN: + + andi. T1, N, 2 + ble DCOPYT_L2x1_BEGIN + + mr BO, B2 + + COPY_2x2 + + + addi B2, B2, 4*SIZE + +DCOPYT_L2x1_BEGIN: + + andi. T1, N, 1 + ble DCOPYT_L2_END + + mr BO, B1 + + COPY_2x1 + + + addi B1, B1, 2*SIZE + +DCOPYT_L2_END: + + +DCOPYT_L1_BEGIN: + + andi. T1, M, 1 + ble L999 + + mr A0, A + add A, A0, LDA + mr B16, B + addi B, B, 16*SIZE + + sradi. J, N, 4 + ble DCOPYT_L1x8_BEGIN + + mr BO, B16 + +DCOPYT_L1x16_LOOP: + + COPY_1x16 + + add BO, BO, M16 + + addic. J, J, -1 + bgt DCOPYT_L1x16_LOOP + +DCOPYT_L1x8_BEGIN: + + andi. T1, N, 8 + ble DCOPYT_L1x4_BEGIN + + mr BO, B8 + + COPY_1x8 + + + addi B8, B8, 8*SIZE + +DCOPYT_L1x4_BEGIN: + + andi. T1, N, 4 + ble DCOPYT_L1x2_BEGIN + + mr BO, B4 + + COPY_1x4 + + + addi B4, B4, 4*SIZE + +DCOPYT_L1x2_BEGIN: + + andi. T1, N, 2 + ble DCOPYT_L1x1_BEGIN + + mr BO, B2 + + COPY_1x2 + + + addi B2, B2, 2*SIZE + +DCOPYT_L1x1_BEGIN: + + andi. 
T1, N, 1 + ble DCOPYT_L1_END + + mr BO, B1 + + COPY_1x1 + + + addi B1, B1, 1*SIZE + +DCOPYT_L1_END: + diff --git a/kernel/power/dgemm_tcopy_macros_16_power8.S b/kernel/power/dgemm_tcopy_macros_16_power8.S new file mode 100644 index 0000000000..333e231053 --- /dev/null +++ b/kernel/power/dgemm_tcopy_macros_16_power8.S @@ -0,0 +1,608 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/04/21 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + +/********************************************************************************************** +* Macros for N=4 and M=16 +**********************************************************************************************/ + +.macro COPY_4x16 + + lxvd2x vs32, o0, A0 + lxvd2x vs33, o16, A0 + lxvd2x vs34, o32, A0 + lxvd2x vs35, o48, A0 + addi A0, A0, 64 + + lxvd2x vs40, o0, A1 + lxvd2x vs41, o16, A1 + lxvd2x vs42, o32, A1 + lxvd2x vs43, o48, A1 + addi A1, A1, 64 + + lxvd2x vs48, o0, A2 + lxvd2x vs49, o16, A2 + lxvd2x vs50, o32, A2 + lxvd2x vs51, o48, A2 + addi A2, A2, 64 + + lxvd2x vs56, o0, A3 + lxvd2x vs57, o16, A3 + lxvd2x vs58, o32, A3 + lxvd2x vs59, o48, A3 + addi A3, A3, 64 + + lxvd2x vs36, o0, A0 + lxvd2x vs37, o16, A0 + lxvd2x vs38, o32, A0 + lxvd2x vs39, o48, A0 + addi A0, A0, 64 + + lxvd2x vs44, o0, A1 + lxvd2x vs45, o16, A1 + lxvd2x vs46, o32, A1 + lxvd2x vs47, o48, A1 + addi A1, A1, 64 + + lxvd2x vs52, o0, A2 + lxvd2x vs53, o16, A2 + lxvd2x vs54, o32, A2 + lxvd2x vs55, o48, A2 + addi A2, A2, 64 + + lxvd2x vs60, o0, A3 + lxvd2x vs61, o16, A3 + lxvd2x vs62, o32, A3 + lxvd2x vs63, o48, A3 + addi A3, A3, 64 + + mr T1, BO + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 + addi T1, T1, 64 + + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs40, o0, T1 + stxvd2x vs41, o16, T1 + stxvd2x vs42, o32, T1 + stxvd2x vs43, o48, T1 + addi T1, T1, 64 + + stxvd2x vs44, o0, T1 + stxvd2x vs45, o16, T1 + stxvd2x vs46, o32, T1 + stxvd2x vs47, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs48, o0, T1 + stxvd2x vs49, o16, T1 + stxvd2x vs50, o32, T1 + stxvd2x vs51, o48, T1 + addi T1, T1, 64 + + stxvd2x vs52, o0, T1 + stxvd2x vs53, o16, T1 + stxvd2x vs54, o32, T1 + stxvd2x vs55, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs56, o0, T1 + stxvd2x vs57, o16, T1 + stxvd2x vs58, o32, T1 + stxvd2x vs59, o48, T1 + addi T1, T1, 64 + + stxvd2x vs60, o0, T1 + stxvd2x vs61, o16, T1 + stxvd2x vs62, o32, T1 + stxvd2x vs63, o48, T1 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=8 +**********************************************************************************************/ + +.macro COPY_4x8 + + lxvd2x vs32, o0, A0 + lxvd2x vs33, o16, A0 + lxvd2x vs34, o32, A0 + lxvd2x vs35, o48, A0 + addi A0, A0, 64 + + + lxvd2x vs36, o0, A1 + lxvd2x vs37, o16, A1 + lxvd2x vs38, o32, A1 + lxvd2x vs39, o48, A1 + addi A1, A1, 64 + + + lxvd2x vs40, o0, A2 + lxvd2x vs41, o16, A2 + lxvd2x vs42, o32, A2 + lxvd2x vs43, o48, A2 + addi A2, A2, 64 + + + lxvd2x vs44, o0, A3 + lxvd2x vs45, o16, A3 + lxvd2x vs46, o32, A3 + lxvd2x vs47, o48, A3 + addi A3, A3, 64 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs40, o0, T1 + stxvd2x vs41, o16, T1 + stxvd2x vs42, o32, T1 + stxvd2x vs43, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs44, o0, T1 + stxvd2x vs45, o16, T1 + 
stxvd2x vs46, o32, T1 + stxvd2x vs47, o48, T1 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=4 +**********************************************************************************************/ + +.macro COPY_4x4 + + lxvd2x vs32, o0, A0 + lxvd2x vs33, o16, A0 + addi A0, A0, 32 + + + lxvd2x vs34, o0, A1 + lxvd2x vs35, o16, A1 + addi A1, A1, 32 + + + lxvd2x vs36, o0, A2 + lxvd2x vs37, o16, A2 + addi A2, A2, 32 + + + lxvd2x vs38, o0, A3 + lxvd2x vs39, o16, A3 + addi A3, A3, 32 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=2 +**********************************************************************************************/ + +.macro COPY_4x2 + + lxvd2x vs32, o0, A0 + addi A0, A0, 16 + + + lxvd2x vs33, o0, A1 + addi A1, A1, 16 + + + lxvd2x vs34, o0, A2 + addi A2, A2, 16 + + + lxvd2x vs35, o0, A3 + addi A3, A3, 16 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + + stxvd2x vs33, o16, T1 + + stxvd2x vs34, o32, T1 + + stxvd2x vs35, o48, T1 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=1 +**********************************************************************************************/ + +.macro COPY_4x1 + + lxsdx vs32, o0, A0 + addi A0, A0, 8 + + + lxsdx vs33, o0, A1 + addi A1, A1, 8 + + + lxsdx vs34, o0, A2 + addi A2, A2, 8 + + + lxsdx vs35, o0, A3 + addi A3, A3, 8 + + + mr T1, BO + + stxsdx vs32, o0, T1 + + stxsdx vs33, o8, T1 + + addi T1, T1, 16 + + stxsdx vs34, o0, T1 + + stxsdx vs35, o8, T1 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=16 +**********************************************************************************************/ + +.macro COPY_2x16 + + lxvd2x vs32, o0, A0 + lxvd2x vs33, o16, A0 + lxvd2x vs34, o32, A0 + lxvd2x vs35, o48, A0 + addi A0, A0, 64 + + lxvd2x vs36, o0, A0 + lxvd2x vs37, o16, A0 + lxvd2x vs38, o32, A0 + lxvd2x vs39, o48, A0 + addi A0, A0, 64 + + + lxvd2x vs40, o0, A1 + lxvd2x vs41, o16, A1 + lxvd2x vs42, o32, A1 + lxvd2x vs43, o48, A1 + addi A1, A1, 64 + + lxvd2x vs44, o0, A1 + lxvd2x vs45, o16, A1 + lxvd2x vs46, o32, A1 + lxvd2x vs47, o48, A1 + addi A1, A1, 64 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 + addi T1, T1, 64 + + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs40, o0, T1 + stxvd2x vs41, o16, T1 + stxvd2x vs42, o32, T1 + stxvd2x vs43, o48, T1 + addi T1, T1, 64 + + stxvd2x vs44, o0, T1 + stxvd2x vs45, o16, T1 + stxvd2x vs46, o32, T1 + stxvd2x vs47, o48, T1 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=8 +**********************************************************************************************/ + +.macro COPY_2x8 + + lxvd2x vs32, o0, A0 + lxvd2x vs33, o16, A0 + lxvd2x vs34, o32, A0 + lxvd2x vs35, o48, A0 + addi A0, A0, 64 + + + lxvd2x vs36, o0, A1 + lxvd2x vs37, o16, A1 + lxvd2x vs38, o32, A1 + lxvd2x vs39, o48, A1 + addi A1, A1, 64 + + + mr T1, BO + + 
stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=4 +**********************************************************************************************/ + +.macro COPY_2x4 + + lxvd2x vs32, o0, A0 + lxvd2x vs33, o16, A0 + addi A0, A0, 32 + + + lxvd2x vs34, o0, A1 + lxvd2x vs35, o16, A1 + addi A1, A1, 32 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=2 +**********************************************************************************************/ + +.macro COPY_2x2 + + lxvd2x vs32, o0, A0 + addi A0, A0, 16 + + + lxvd2x vs33, o0, A1 + addi A1, A1, 16 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + + stxvd2x vs33, o16, T1 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=1 +**********************************************************************************************/ + +.macro COPY_2x1 + + lxsdx vs32, o0, A0 + addi A0, A0, 8 + + + lxsdx vs33, o0, A1 + addi A1, A1, 8 + + + mr T1, BO + + stxsdx vs32, o0, T1 + + stxsdx vs33, o8, T1 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=16 +**********************************************************************************************/ + +.macro COPY_1x16 + + lxvd2x vs32, o0, A0 + lxvd2x vs33, o16, A0 + lxvd2x vs34, o32, A0 + lxvd2x vs35, o48, A0 + addi A0, A0, 64 + + lxvd2x vs36, o0, A0 + lxvd2x vs37, o16, A0 + lxvd2x vs38, o32, A0 + lxvd2x vs39, o48, A0 + addi A0, A0, 64 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 + addi T1, T1, 64 + + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=8 +**********************************************************************************************/ + +.macro COPY_1x8 + + lxvd2x vs32, o0, A0 + lxvd2x vs33, o16, A0 + lxvd2x vs34, o32, A0 + lxvd2x vs35, o48, A0 + addi A0, A0, 64 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=4 +**********************************************************************************************/ + +.macro COPY_1x4 + + lxvd2x vs32, o0, A0 + lxvd2x vs33, o16, A0 + addi A0, A0, 32 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=2 +**********************************************************************************************/ + +.macro COPY_1x2 + + lxvd2x vs32, o0, A0 + addi A0, A0, 16 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=1 
+**********************************************************************************************/ + +.macro COPY_1x1 + + lxsdx vs32, o0, A0 + addi A0, A0, 8 + + + mr T1, BO + + stxsdx vs32, o0, T1 + +.endm + diff --git a/kernel/power/dtrmm_kernel_16x4_power8.S b/kernel/power/dtrmm_kernel_16x4_power8.S index 2294128a21..e9dbd991ee 100644 --- a/kernel/power/dtrmm_kernel_16x4_power8.S +++ b/kernel/power/dtrmm_kernel_16x4_power8.S @@ -152,7 +152,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define PRE r30 #define T2 r31 -#include "dgemm_macros_16x4_power8.S" +#include "dtrmm_macros_16x4_power8.S" #ifndef NEEDPARAM diff --git a/kernel/power/dtrmm_macros_16x4_power8.S b/kernel/power/dtrmm_macros_16x4_power8.S new file mode 100644 index 0000000000..079144a90f --- /dev/null +++ b/kernel/power/dtrmm_macros_16x4_power8.S @@ -0,0 +1,3431 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/05 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +/********************************************************************* +* Macros for N=4, M=16 * +*********************************************************************/ + +.macro LOAD4x16_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x16_I1 + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + + addi AO, AO, 64 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + + xvmuldp vs52, vs4, vs26 + xvmuldp vs53, vs5, vs26 + xvmuldp vs54, vs6, vs26 + xvmuldp vs55, vs7, vs26 + + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + xvmuldp vs60, vs4, vs27 + xvmuldp vs61, vs5, vs27 + xvmuldp vs62, vs6, vs27 + xvmuldp vs63, vs7, vs27 + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x16_1 + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + + addi AO, AO, 64 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + + xvmaddadp vs52, vs4, vs26 + xvmaddadp vs53, vs5, vs26 + xvmaddadp vs54, vs6, vs26 + xvmaddadp vs55, vs7, vs26 + + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + + + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + xvmaddadp vs60, vs4, vs27 + xvmaddadp vs61, vs5, vs27 + xvmaddadp vs62, vs6, vs27 + xvmaddadp vs63, vs7, vs27 + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x16_2 + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + + 
lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, vs15, vs29 + + addi AO, AO, 64 + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + xvmaddadp vs50, vs10, vs30 + xvmaddadp vs51, vs11, vs30 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + + xvmaddadp vs52, vs12, vs30 + xvmaddadp vs53, vs13, vs30 + xvmaddadp vs54, vs14, vs30 + xvmaddadp vs55, vs15, vs30 + + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + xvmaddadp vs58, vs10, vs31 + xvmaddadp vs59, vs11, vs31 + + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + xvmaddadp vs60, vs12, vs31 + xvmaddadp vs61, vs13, vs31 + xvmaddadp vs62, vs14, vs31 + xvmaddadp vs63, vs15, vs31 + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x16_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, vs15, vs29 + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + xvmaddadp vs50, vs10, vs30 + xvmaddadp vs51, vs11, vs30 + xvmaddadp vs52, vs12, vs30 + xvmaddadp vs53, vs13, vs30 + xvmaddadp vs54, vs14, vs30 + xvmaddadp vs55, vs15, vs30 + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + xvmaddadp vs58, vs10, vs31 + xvmaddadp vs59, vs11, vs31 + xvmaddadp vs60, vs12, vs31 + xvmaddadp vs61, vs13, vs31 + xvmaddadp vs62, vs14, vs31 + xvmaddadp vs63, vs15, vs31 + +.endm + +.macro KERNEL4x16_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 64 + addi BO, BO, 32 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + xvmuldp vs52, vs4, vs26 + xvmuldp vs53, vs5, vs26 + xvmuldp vs54, vs6, vs26 + xvmuldp vs55, vs7, vs26 + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + xvmuldp vs60, vs4, vs27 + xvmuldp vs61, vs5, vs27 + xvmuldp vs62, vs6, vs27 + xvmuldp vs63, vs7, vs27 + +.endm + +.macro KERNEL4x16_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 64 + addi 
BO, BO, 32 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + xvmaddadp vs52, vs4, vs26 + xvmaddadp vs53, vs5, vs26 + xvmaddadp vs54, vs6, vs26 + xvmaddadp vs55, vs7, vs26 + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + xvmaddadp vs60, vs4, vs27 + xvmaddadp vs61, vs5, vs27 + xvmaddadp vs62, vs6, vs27 + xvmaddadp vs63, vs7, vs27 + +.endm + +.macro SAVE4x16 + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 + + lxvd2x vs4, 0, T2 + lxvd2x vs5, o16, T2 + lxvd2x vs6, o32, T2 + lxvd2x vs7, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r + xvmaddadp vs4, vs36, alpha_r + xvmaddadp vs5, vs37, alpha_r + xvmaddadp vs6, vs38, alpha_r + xvmaddadp vs7, vs39, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r + xvmuldp vs4, vs36, alpha_r + xvmuldp vs5, vs37, alpha_r + xvmuldp vs6, vs38, alpha_r + xvmuldp vs7, vs39, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + + stxvd2x vs4, 0, T2 + stxvd2x vs5, o16, T2 + stxvd2x vs6, o32, T2 + stxvd2x vs7, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 + + lxvd2x vs12, 0, T2 + lxvd2x vs13, o16, T2 + lxvd2x vs14, o32, T2 + lxvd2x vs15, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r + xvmaddadp vs10, vs42, alpha_r + xvmaddadp vs11, vs43, alpha_r + xvmaddadp vs12, vs44, alpha_r + xvmaddadp vs13, vs45, alpha_r + xvmaddadp vs14, vs46, alpha_r + xvmaddadp vs15, vs47, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r + xvmuldp vs10, vs42, alpha_r + xvmuldp vs11, vs43, alpha_r + xvmuldp vs12, vs44, alpha_r + xvmuldp vs13, vs45, alpha_r + xvmuldp vs14, vs46, alpha_r + xvmuldp vs15, vs47, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + + stxvd2x vs12, 0, T2 + stxvd2x vs13, o16, T2 + stxvd2x vs14, o32, T2 + stxvd2x vs15, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 + + lxvd2x vs4, 0, T2 + lxvd2x vs5, o16, T2 + lxvd2x vs6, o32, T2 + lxvd2x vs7, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs48, alpha_r + xvmaddadp vs1, vs49, alpha_r + xvmaddadp vs2, vs50, alpha_r + xvmaddadp vs3, vs51, alpha_r + xvmaddadp vs4, vs52, alpha_r + xvmaddadp vs5, vs53, alpha_r + xvmaddadp vs6, vs54, alpha_r + xvmaddadp vs7, vs55, alpha_r +#else + xvmuldp vs0, vs48, alpha_r + xvmuldp vs1, vs49, alpha_r + xvmuldp vs2, 
vs50, alpha_r + xvmuldp vs3, vs51, alpha_r + xvmuldp vs4, vs52, alpha_r + xvmuldp vs5, vs53, alpha_r + xvmuldp vs6, vs54, alpha_r + xvmuldp vs7, vs55, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + + stxvd2x vs4, 0, T2 + stxvd2x vs5, o16, T2 + stxvd2x vs6, o32, T2 + stxvd2x vs7, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 + + lxvd2x vs12, 0, T2 + lxvd2x vs13, o16, T2 + lxvd2x vs14, o32, T2 + lxvd2x vs15, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs56, alpha_r + xvmaddadp vs9, vs57, alpha_r + xvmaddadp vs10, vs58, alpha_r + xvmaddadp vs11, vs59, alpha_r + xvmaddadp vs12, vs60, alpha_r + xvmaddadp vs13, vs61, alpha_r + xvmaddadp vs14, vs62, alpha_r + xvmaddadp vs15, vs63, alpha_r +#else + xvmuldp vs8, vs56, alpha_r + xvmuldp vs9, vs57, alpha_r + xvmuldp vs10, vs58, alpha_r + xvmuldp vs11, vs59, alpha_r + xvmuldp vs12, vs60, alpha_r + xvmuldp vs13, vs61, alpha_r + xvmuldp vs14, vs62, alpha_r + xvmuldp vs15, vs63, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + + stxvd2x vs12, 0, T2 + stxvd2x vs13, o16, T2 + stxvd2x vs14, o32, T2 + stxvd2x vs15, o48, T2 + + addi CO, CO, 128 + +.endm + +/********************************************************************* +* Macros for N=4, M=8 * +*********************************************************************/ + +.macro LOAD4x8_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x8_I1 + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x8_1 + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x8_2 + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + xvmaddadp vs48, vs8, vs30 + xvmaddadp 
vs49, vs9, vs30 + + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + xvmaddadp vs50, vs10, vs30 + xvmaddadp vs51, vs11, vs30 + + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + xvmaddadp vs58, vs10, vs31 + xvmaddadp vs59, vs11, vs31 + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x8_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + xvmaddadp vs50, vs10, vs30 + xvmaddadp vs51, vs11, vs30 + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + xvmaddadp vs58, vs10, vs31 + xvmaddadp vs59, vs11, vs31 + +.endm + +.macro KERNEL4x8_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 64 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + +.endm + +.macro KERNEL4x8_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 64 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + +.endm + +.macro SAVE4x8 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r + xvmaddadp vs10, vs42, alpha_r + xvmaddadp vs11, vs43, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r + xvmuldp vs10, vs42, alpha_r + xvmuldp vs11, vs43, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs48, alpha_r + xvmaddadp vs1, vs49, alpha_r + xvmaddadp vs2, vs50, alpha_r + xvmaddadp vs3, vs51, alpha_r +#else + xvmuldp vs0, 
vs48, alpha_r + xvmuldp vs1, vs49, alpha_r + xvmuldp vs2, vs50, alpha_r + xvmuldp vs3, vs51, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs56, alpha_r + xvmaddadp vs9, vs57, alpha_r + xvmaddadp vs10, vs58, alpha_r + xvmaddadp vs11, vs59, alpha_r +#else + xvmuldp vs8, vs56, alpha_r + xvmuldp vs9, vs57, alpha_r + xvmuldp vs10, vs58, alpha_r + xvmuldp vs11, vs59, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + addi CO, CO, 64 + +.endm + +/********************************************************************* +* Macros for N=4, M=4 * +*********************************************************************/ + +.macro LOAD4x4_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x4_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + +.endm + +.macro KERNEL4x4_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + +.endm + +.macro KERNEL4x4_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + +.endm + +.macro KERNEL4x4_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + +.endm + +.macro KERNEL4x4_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + +.endm + +.macro KERNEL4x4_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + +.endm + 
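The KERNEL4x4_* macros above all perform the same rank-1 update: a strip of four doubles from the packed A panel (vs0 and vs1, two doubles each) is multiplied by the four broadcast B values vs24..vs27 and accumulated into the register tile vs32/vs33, vs40/vs41, vs48/vs49, vs56/vs57; the _I1 and _SUBI1 variants start a fresh accumulation with xvmuldp, the others continue one with xvmaddadp. A minimal C sketch of a single such step follows, for orientation only: the function and parameter names are illustrative and are not part of this kernel, and the packed-panel layout is assumed to match what the copy routines produce.

/* Illustrative reference model of one KERNEL4x4 step: a[0..3] is one strip of
 * the packed A panel, b[0..3] one strip of the packed B panel, and acc is the
 * 4x4 accumulator that the assembly keeps in VSX registers. */
static inline void kernel4x4_step(const double *a, const double *b,
                                  double acc[4][4])
{
    for (int j = 0; j < 4; j++)         /* one broadcast B value (vs24..vs27) */
        for (int i = 0; i < 4; i++)     /* FMA across the four A doubles      */
            acc[j][i] += a[i] * b[j];
}

The wider variants in this file (4x16, 4x8, 2x*, 1x*) differ only in how many A vectors and B broadcasts take part in each step, not in the shape of the update.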
+.macro SAVE4x4 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs48, alpha_r + xvmaddadp vs1, vs49, alpha_r +#else + xvmuldp vs0, vs48, alpha_r + xvmuldp vs1, vs49, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs56, alpha_r + xvmaddadp vs9, vs57, alpha_r +#else + xvmuldp vs8, vs56, alpha_r + xvmuldp vs9, vs57, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + + addi CO, CO, 32 + +.endm + +/********************************************************************* +* Macros for N=4, M=2 * +*********************************************************************/ + +.macro LOAD4x2_1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x2_I1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + + xvmuldp vs48, vs0, vs26 + + xvmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x2_1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + + xvmaddadp vs48, vs0, vs26 + + xvmaddadp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x2_2 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, vs29 + + xvmaddadp vs48, vs8, vs30 + + xvmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x2_E2 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, vs29 + + xvmaddadp vs48, vs8, vs30 + + xvmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x2_SUBI1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + + xvmuldp vs48, vs0, vs26 + + xvmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x2_SUB1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + + xvmaddadp vs48, vs0, vs26 + + xvmaddadp vs56, vs0, vs27 + +.endm + +.macro SAVE4x2 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r +#else + xvmuldp vs0, vs32, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, 
alpha_r +#else + xvmuldp vs8, vs40, alpha_r +#endif + + stxvd2x vs8, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs48, alpha_r +#else + xvmuldp vs0, vs48, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs56, alpha_r +#else + xvmuldp vs8, vs56, alpha_r +#endif + + stxvd2x vs8, 0, T1 + + addi CO, CO, 16 + +.endm + +/********************************************************************* +* Macros for N=4, M=1 * +*********************************************************************/ + +.macro LOAD4x1_1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x1_I1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + lxsdx vs30, o16, BO + lxsdx vs31, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + + xsmuldp vs48, vs0, vs26 + + xsmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x1_1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + lxsdx vs30, o16, BO + lxsdx vs31, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + + xsmaddadp vs48, vs0, vs26 + + xsmaddadp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x1_2 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + + xsmaddadp vs48, vs8, vs30 + + xsmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x1_E2 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + + xsmaddadp vs48, vs8, vs30 + + xsmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x1_SUBI1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + + xsmuldp vs48, vs0, vs26 + + xsmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x1_SUB1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + + xsmaddadp vs48, vs0, vs26 + + xsmaddadp vs56, vs0, vs27 + +.endm + +.macro SAVE4x1 + + mr T1, CO + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs32, alpha_r +#else + xsmuldp vs0, vs32, alpha_r +#endif + + stxsdx vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs8, vs40, alpha_r +#else + xsmuldp vs8, vs40, alpha_r +#endif + + stxsdx vs8, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs48, alpha_r +#else + xsmuldp vs0, vs48, alpha_r +#endif + + stxsdx vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs8, vs56, alpha_r +#else + xsmuldp vs8, vs56, alpha_r +#endif + + stxsdx vs8, 0, T1 + + addi CO, CO, 8 + +.endm + +/********************************************************************* +* Macros for N=2, M=16 * +*********************************************************************/ + +.macro LOAD2x16_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 
0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + +.endm + +.macro KERNEL2x16_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + +.endm + +.macro KERNEL2x16_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + +.endm + +.macro KERNEL2x16_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, vs15, vs29 + +.endm + +.macro KERNEL2x16_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, vs15, vs29 + +.endm + +.macro KERNEL2x16_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp 
vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + +.endm + +.macro KERNEL2x16_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + +.endm + +.macro SAVE2x16 + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 + + lxvd2x vs4, 0, T2 + lxvd2x vs5, o16, T2 + lxvd2x vs6, o32, T2 + lxvd2x vs7, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r + xvmaddadp vs4, vs36, alpha_r + xvmaddadp vs5, vs37, alpha_r + xvmaddadp vs6, vs38, alpha_r + xvmaddadp vs7, vs39, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r + xvmuldp vs4, vs36, alpha_r + xvmuldp vs5, vs37, alpha_r + xvmuldp vs6, vs38, alpha_r + xvmuldp vs7, vs39, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + stxvd2x vs4, 0, T2 + stxvd2x vs5, o16, T2 + stxvd2x vs6, o32, T2 + stxvd2x vs7, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 + + lxvd2x vs12, 0, T2 + lxvd2x vs13, o16, T2 + lxvd2x vs14, o32, T2 + lxvd2x vs15, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r + xvmaddadp vs10, vs42, alpha_r + xvmaddadp vs11, vs43, alpha_r + xvmaddadp vs12, vs44, alpha_r + xvmaddadp vs13, vs45, alpha_r + xvmaddadp vs14, vs46, alpha_r + xvmaddadp vs15, vs47, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r + xvmuldp vs10, vs42, alpha_r + xvmuldp vs11, vs43, alpha_r + xvmuldp vs12, vs44, alpha_r + xvmuldp vs13, vs45, alpha_r + xvmuldp vs14, vs46, alpha_r + xvmuldp vs15, vs47, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + stxvd2x vs12, 0, T2 + stxvd2x vs13, o16, T2 + stxvd2x vs14, o32, T2 + stxvd2x vs15, o48, T2 + + addi CO, CO, 128 + +.endm + +/********************************************************************* +* Macros for N=4, M=8 * +*********************************************************************/ + +.macro LOAD2x8_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x8_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + 
xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + +.endm + +.macro KERNEL2x8_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + +.endm + +.macro KERNEL2x8_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + +.endm + +.macro KERNEL2x8_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + +.endm + +.macro KERNEL2x8_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + +.endm + +.macro KERNEL2x8_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + +.endm + +.macro SAVE2x8 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r + xvmaddadp vs10, vs42, alpha_r + xvmaddadp vs11, vs43, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r + xvmuldp vs10, vs42, alpha_r + xvmuldp vs11, vs43, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + addi CO, CO, 64 + +.endm + +/********************************************************************* +* Macros for N=2, M=4 * +*********************************************************************/ + +.macro LOAD2x4_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x4_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO 
+ + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + +.endm + +.macro KERNEL2x4_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + +.endm + +.macro KERNEL2x4_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + +.endm + +.macro KERNEL2x4_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + +.endm + +.macro KERNEL2x4_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + +.endm + +.macro KERNEL2x4_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + +.endm + +.macro SAVE2x4 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + + addi CO, CO, 32 + +.endm + +/********************************************************************* +* Macros for N=2, M=2 * +*********************************************************************/ + +.macro LOAD2x2_1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x2_I1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x2_1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x2_2 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x2_E2 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x2_SUBI1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x2_SUB1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + +.endm + +.macro SAVE2x2 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 
+#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r +#else + xvmuldp vs0, vs32, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r +#else + xvmuldp vs8, vs40, alpha_r +#endif + + stxvd2x vs8, 0, T1 + + addi CO, CO, 16 + +.endm + +/********************************************************************* +* Macros for N=2, M=1 * +*********************************************************************/ + +.macro LOAD2x1_1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x1_I1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x1_1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x1_2 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x1_E2 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x1_SUBI1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x1_SUB1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + +.endm + +.macro SAVE2x1 + + mr T1, CO + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs32, alpha_r +#else + xsmuldp vs0, vs32, alpha_r +#endif + + stxsdx vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs8, vs40, alpha_r +#else + xsmuldp vs8, vs40, alpha_r +#endif + + stxsdx vs8, 0, T1 + + addi CO, CO, 8 + +.endm + +/********************************************************************* +* Macros for N=1, M=16 * +*********************************************************************/ + +.macro LOAD1x16_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + +.endm + +.macro KERNEL1x16_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + +.endm + +.macro KERNEL1x16_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + 
xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + +.endm + +.macro KERNEL1x16_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + +.endm + +.macro KERNEL1x16_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + +.endm + +.macro KERNEL1x16_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + +.endm + +.macro KERNEL1x16_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + +.endm + +.macro SAVE1x16 + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 + + lxvd2x vs4, 0, T2 + lxvd2x vs5, o16, T2 + lxvd2x vs6, o32, T2 + lxvd2x vs7, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r + xvmaddadp vs4, vs36, alpha_r + xvmaddadp vs5, vs37, alpha_r + xvmaddadp vs6, vs38, alpha_r + xvmaddadp vs7, vs39, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r + xvmuldp vs4, vs36, alpha_r + xvmuldp vs5, vs37, alpha_r + xvmuldp vs6, vs38, alpha_r + xvmuldp vs7, vs39, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + stxvd2x vs4, 0, T2 + stxvd2x vs5, o16, T2 + stxvd2x vs6, o32, T2 + stxvd2x vs7, o48, T2 + + addi CO, CO, 128 + +.endm + +/********************************************************************* +* Macros for N=4, M=8 * +*********************************************************************/ + +.macro LOAD1x8_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x8_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + +.endm + +.macro KERNEL1x8_1 + + lxvd2x vs8, 0, AO + lxvd2x 
vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + +.endm + +.macro KERNEL1x8_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + +.endm + +.macro KERNEL1x8_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + +.endm + +.macro KERNEL1x8_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + +.endm + +.macro KERNEL1x8_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + +.endm + +.macro SAVE1x8 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + addi CO, CO, 64 + +.endm + +/********************************************************************* +* Macros for N=1, M=4 * +*********************************************************************/ + +.macro LOAD1x4_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x4_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + +.endm + +.macro KERNEL1x4_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + +.endm + +.macro KERNEL1x4_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + +.endm + +.macro KERNEL1x4_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + +.endm + +.macro KERNEL1x4_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + +.endm + +.macro KERNEL1x4_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + +.endm + +.macro SAVE1x4 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + addi CO, CO, 32 + +.endm + 
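Every SAVE* macro in this file follows the pattern visible in SAVE1x4 above: in the plain GEMM build the existing C tile is loaded and updated with xvmaddadp, i.e. C = C + alpha*acc, while under TRMMKERNEL the load is skipped and xvmuldp writes C = alpha*acc outright, since the triangular-multiply result overwrites C. A small C sketch of that branch, with illustrative names only (none of these identifiers exist in the kernel):

/* Illustrative model of the SAVE macros' store path. acc is one register
 * tile produced by the KERNEL macros, c the matching block of the output
 * matrix, and trmm mirrors the TRMMKERNEL compile-time switch. */
static void save_tile(double *c, const double *acc, double alpha,
                      int n, int trmm)
{
    for (int i = 0; i < n; i++)
        c[i] = trmm ? alpha * acc[i]           /* TRMMKERNEL: overwrite C      */
                    : c[i] + alpha * acc[i];   /* GEMM path: accumulate into C */
}

In the assembly the same choice is made once per row of the tile, with T1 stepping through C by LDC between rows.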
+/********************************************************************* +* Macros for N=1, M=2 * +*********************************************************************/ + +.macro LOAD1x2_1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x2_I1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x2_1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x2_2 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x2_E2 + + + xvmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x2_SUBI1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x2_SUB1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + +.endm + +.macro SAVE1x2 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r +#else + xvmuldp vs0, vs32, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + addi CO, CO, 16 + +.endm + +/********************************************************************* +* Macros for N=1, M=1 * +*********************************************************************/ + +.macro LOAD1x1_1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x1_I1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x1_1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmaddadp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x1_2 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x1_E2 + + + xsmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x1_SUBI1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x1_SUB1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmaddadp vs32, vs0, vs24 + +.endm + +.macro SAVE1x1 + + mr T1, CO + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs32, alpha_r +#else + xsmuldp vs0, vs32, alpha_r +#endif + + stxsdx vs0, 0, T1 + + addi CO, CO, 8 + +.endm + diff --git a/kernel/power/dtrsm_kernel_LT_16x4_power8.S b/kernel/power/dtrsm_kernel_LT_16x4_power8.S new file mode 100644 index 0000000000..fdfc5ac704 --- /dev/null +++ b/kernel/power/dtrsm_kernel_LT_16x4_power8.S @@ -0,0 +1,294 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA 296(SP) +#define FZERO 304(SP) +#else +#define STACKSIZE 240 +#define ALPHA 224(SP) +#define FZERO 232(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#define OFFSET r6 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#define o0 0 + +#define PRE r15 +#define T4 r16 +#define L r17 +#define T3 r18 +#define T2 r19 +#define KK r20 +#define I r21 +#define J r22 +#define AO r23 +#define BO r24 +#define CO r25 +#define o8 r26 +#define o16 r27 +#define o24 r28 +#define o32 r29 +#define o48 r30 +#define T1 r31 + +#include "dtrsm_macros_LT_16x4_power8.S" + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) +#endif + + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, FRAMESLOT(0) + 
STACKSIZE(SP) +#endif +#endif + + +#if defined(linux) && defined(__64BIT__) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#else + lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#endif +#endif +#endif + + + cmpwi cr0, M, 0 + ble L999 + cmpwi cr0, N, 0 + ble L999 + cmpwi cr0, K, 0 + ble L999 + + slwi LDC, LDC, BASE_SHIFT + + li o8, 8 + li o16, 16 + li o24, 24 + li o32, 32 + li o48, 48 + li PRE, 384 + + mr KK, OFFSET + +#include "dtrsm_logic_LT_16x4_power8.S" + +L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/dtrsm_logic_LT_16x4_power8.S b/kernel/power/dtrsm_logic_LT_16x4_power8.S new file mode 100644 index 0000000000..04f5fdd904 --- /dev/null +++ b/kernel/power/dtrsm_logic_LT_16x4_power8.S @@ -0,0 +1,755 @@ + srawi. J, N, 2 + ble DSTRM_LT_L4_END + + +DSTRM_LT_L4_BEGIN: + + mr CO, C + mr AO, A + slwi T1, LDC , 2 + add C, C, T1 + + mr KK, OFFSET + srawi. I, M, 4 + ble DSTRM_LT_L4x16_END + + +DSTRM_LT_L4x16_BEGIN: + + mr BO, B + + li L, -128 + + mr T1, CO + add T2, T1, LDC + add T3, T2, LDC + add T4, T3, LDC + + and T1, T1, L + and T2, T2, L + and T3, T3, L + and T4, T4, L + + dcbt T1, r0 + dcbt T2, r0 + dcbt T3, r0 + dcbt T4, r0 + + addi T1, T1, 128 + addi T2, T2, 128 + addi T3, T3, 128 + addi T4, T4, 128 + + dcbt T1, r0 + dcbt T2, r0 + dcbt T3, r0 + dcbt T4, r0 + + +DSTRM_LT_L4x16_LOOP_START: + + + INIT_16x4 + + + addic. L, KK, 0 + ble- DSTRM_LT_L4x16_SAVE + mtctr L + +DSTRM_LT_L4x16_LOOP: + + dcbt AO, PRE + dcbt BO, PRE + KERNEL_16x4 + bdz- DSTRM_LT_L4x16_SAVE + + dcbt AO, PRE + KERNEL_16x4 + bdz- DSTRM_LT_L4x16_SAVE + + dcbt AO, PRE + KERNEL_16x4 + bdz- DSTRM_LT_L4x16_SAVE + + dcbt AO, PRE + KERNEL_16x4 + bdnz+ DSTRM_LT_L4x16_LOOP + + +DSTRM_LT_L4x16_SAVE: + + SOLVE_LT_16x4 + + addi CO, CO, 16*SIZE + + sub T3, K, KK + sub T4, K, KK + slwi T3, T3, 4+BASE_SHIFT + slwi T4, T4, 2+BASE_SHIFT + add AO, AO, T3 + add BO, BO, T4 + addi KK, KK, 16 + + addic. I, I, -1 + bgt DSTRM_LT_L4x16_BEGIN + +DSTRM_LT_L4x16_END: + + +DSTRM_LT_L4x8_BEGIN: + + andi. T2, M, 15 + ble DSTRM_LT_L4x1_END + + andi. T1, M, 8 + ble DSTRM_LT_L4x8_END + + mr BO, B + + +DSTRM_LT_L4x8_LOOP_START: + + + INIT_8x4 + + + addic. L, KK, 0 + ble DSTRM_LT_L4x8_SAVE + +DSTRM_LT_L4x8_LOOP: + + + KERNEL_8x4 + + addic. 
L, L, -1 + bgt DSTRM_LT_L4x8_LOOP + + +DSTRM_LT_L4x8_SAVE: + + SOLVE_LT_8x4 + + addi CO, CO, 8*SIZE + + sub T3, K, KK + sub T4, K, KK + slwi T3, T3, 3+BASE_SHIFT + slwi T4, T4, 2+BASE_SHIFT + add AO, AO, T3 + add BO, BO, T4 + addi KK, KK, 8 + +DSTRM_LT_L4x8_END: + + +DSTRM_LT_L4x4_BEGIN: + + andi. T1, M, 4 + ble DSTRM_LT_L4x4_END + + mr BO, B + + +DSTRM_LT_L4x4_LOOP_START: + + + INIT_4x4 + + + addic. L, KK, 0 + ble DSTRM_LT_L4x4_SAVE + +DSTRM_LT_L4x4_LOOP: + + + KERNEL_4x4 + + addic. L, L, -1 + bgt DSTRM_LT_L4x4_LOOP + + +DSTRM_LT_L4x4_SAVE: + + SOLVE_LT_4x4 + + addi CO, CO, 4*SIZE + + sub T3, K, KK + sub T4, K, KK + slwi T3, T3, 2+BASE_SHIFT + slwi T4, T4, 2+BASE_SHIFT + add AO, AO, T3 + add BO, BO, T4 + addi KK, KK, 4 + +DSTRM_LT_L4x4_END: + + +DSTRM_LT_L4x2_BEGIN: + + andi. T1, M, 2 + ble DSTRM_LT_L4x2_END + + mr BO, B + + +DSTRM_LT_L4x2_LOOP_START: + + + INIT_2x4 + + + addic. L, KK, 0 + ble DSTRM_LT_L4x2_SAVE + +DSTRM_LT_L4x2_LOOP: + + + KERNEL_2x4 + + addic. L, L, -1 + bgt DSTRM_LT_L4x2_LOOP + + +DSTRM_LT_L4x2_SAVE: + + SOLVE_LT_2x4 + + addi CO, CO, 2*SIZE + + sub T3, K, KK + sub T4, K, KK + slwi T3, T3, 1+BASE_SHIFT + slwi T4, T4, 2+BASE_SHIFT + add AO, AO, T3 + add BO, BO, T4 + addi KK, KK, 2 + +DSTRM_LT_L4x2_END: + + +DSTRM_LT_L4x1_BEGIN: + + andi. T1, M, 1 + ble DSTRM_LT_L4x1_END + + mr BO, B + + +DSTRM_LT_L4x1_LOOP_START: + + + INIT_1x4 + + + addic. L, KK, 0 + ble DSTRM_LT_L4x1_SAVE + +DSTRM_LT_L4x1_LOOP: + + + KERNEL_1x4 + + addic. L, L, -1 + bgt DSTRM_LT_L4x1_LOOP + + +DSTRM_LT_L4x1_SAVE: + + SOLVE_LT_1x4 + + addi CO, CO, 1*SIZE + + sub T3, K, KK + sub T4, K, KK + slwi T3, T3, 0+BASE_SHIFT + slwi T4, T4, 2+BASE_SHIFT + add AO, AO, T3 + add BO, BO, T4 + addi KK, KK, 1 + +DSTRM_LT_L4x1_END: + + slwi T1, K, 2+BASE_SHIFT + add B, B, T1 + + addic. J, J, -1 + bgt DSTRM_LT_L4_BEGIN + + andi. T2, N, 3 + ble L999 + +DSTRM_LT_L4_END: + + b DSTRM_LT_L2_BEGIN + +L999_H1: + + b L999 + + +DSTRM_LT_L2_BEGIN: + + andi. T1, N, 2 + ble DSTRM_LT_L2_END + + mr CO, C + mr AO, A + slwi T1, LDC , 1 + add C, C, T1 + + mr KK, OFFSET + srawi. I, M, 4 + ble DSTRM_LT_L2x16_END + + +DSTRM_LT_L2x16_BEGIN: + + mr BO, B + + +DSTRM_LT_L2x16_LOOP_START: + + + INIT_16x2 + + + addic. L, KK, 0 + ble DSTRM_LT_L2x16_SAVE + +DSTRM_LT_L2x16_LOOP: + + + KERNEL_16x2 + + addic. L, L, -1 + bgt DSTRM_LT_L2x16_LOOP + + +DSTRM_LT_L2x16_SAVE: + + SOLVE_LT_16x2 + + addi CO, CO, 16*SIZE + + sub T3, K, KK + sub T4, K, KK + slwi T3, T3, 4+BASE_SHIFT + slwi T4, T4, 1+BASE_SHIFT + add AO, AO, T3 + add BO, BO, T4 + addi KK, KK, 16 + + addic. I, I, -1 + bgt DSTRM_LT_L2x16_BEGIN + +DSTRM_LT_L2x16_END: + + +DSTRM_LT_L2x8_BEGIN: + + andi. T2, M, 15 + ble DSTRM_LT_L2x1_END + + andi. T1, M, 8 + ble DSTRM_LT_L2x8_END + + mr BO, B + + +DSTRM_LT_L2x8_LOOP_START: + + + INIT_8x2 + + + addic. L, KK, 0 + ble DSTRM_LT_L2x8_SAVE + +DSTRM_LT_L2x8_LOOP: + + + KERNEL_8x2 + + addic. L, L, -1 + bgt DSTRM_LT_L2x8_LOOP + + +DSTRM_LT_L2x8_SAVE: + + SOLVE_LT_8x2 + + addi CO, CO, 8*SIZE + + sub T3, K, KK + sub T4, K, KK + slwi T3, T3, 3+BASE_SHIFT + slwi T4, T4, 1+BASE_SHIFT + add AO, AO, T3 + add BO, BO, T4 + addi KK, KK, 8 + +DSTRM_LT_L2x8_END: + + +DSTRM_LT_L2x4_BEGIN: + + andi. T1, M, 4 + ble DSTRM_LT_L2x4_END + + mr BO, B + + +DSTRM_LT_L2x4_LOOP_START: + + + INIT_4x2 + + + addic. L, KK, 0 + ble DSTRM_LT_L2x4_SAVE + +DSTRM_LT_L2x4_LOOP: + + + KERNEL_4x2 + + addic. 
L, L, -1 + bgt DSTRM_LT_L2x4_LOOP + + +DSTRM_LT_L2x4_SAVE: + + SOLVE_LT_4x2 + + addi CO, CO, 4*SIZE + + sub T3, K, KK + sub T4, K, KK + slwi T3, T3, 2+BASE_SHIFT + slwi T4, T4, 1+BASE_SHIFT + add AO, AO, T3 + add BO, BO, T4 + addi KK, KK, 4 + +DSTRM_LT_L2x4_END: + + +DSTRM_LT_L2x2_BEGIN: + + andi. T1, M, 2 + ble DSTRM_LT_L2x2_END + + mr BO, B + + +DSTRM_LT_L2x2_LOOP_START: + + + INIT_2x2 + + + addic. L, KK, 0 + ble DSTRM_LT_L2x2_SAVE + +DSTRM_LT_L2x2_LOOP: + + + KERNEL_2x2 + + addic. L, L, -1 + bgt DSTRM_LT_L2x2_LOOP + + +DSTRM_LT_L2x2_SAVE: + + SOLVE_LT_2x2 + + addi CO, CO, 2*SIZE + + sub T3, K, KK + sub T4, K, KK + slwi T3, T3, 1+BASE_SHIFT + slwi T4, T4, 1+BASE_SHIFT + add AO, AO, T3 + add BO, BO, T4 + addi KK, KK, 2 + +DSTRM_LT_L2x2_END: + + +DSTRM_LT_L2x1_BEGIN: + + andi. T1, M, 1 + ble DSTRM_LT_L2x1_END + + mr BO, B + + +DSTRM_LT_L2x1_LOOP_START: + + + INIT_1x2 + + + addic. L, KK, 0 + ble DSTRM_LT_L2x1_SAVE + +DSTRM_LT_L2x1_LOOP: + + + KERNEL_1x2 + + addic. L, L, -1 + bgt DSTRM_LT_L2x1_LOOP + + +DSTRM_LT_L2x1_SAVE: + + SOLVE_LT_1x2 + + addi CO, CO, 1*SIZE + + sub T3, K, KK + sub T4, K, KK + slwi T3, T3, 0+BASE_SHIFT + slwi T4, T4, 1+BASE_SHIFT + add AO, AO, T3 + add BO, BO, T4 + addi KK, KK, 1 + +DSTRM_LT_L2x1_END: + + slwi T1, K, 1+BASE_SHIFT + add B, B, T1 + +DSTRM_LT_L2_END: + +DSTRM_LT_L1_BEGIN: + + andi. T1, N, 1 + ble DSTRM_LT_L1_END + + mr CO, C + mr AO, A + + mr KK, OFFSET + srawi. I, M, 4 + ble DSTRM_LT_L1x16_END + + +DSTRM_LT_L1x16_BEGIN: + + mr BO, B + + +DSTRM_LT_L1x16_LOOP_START: + + + INIT_16x1 + + + addic. L, KK, 0 + ble DSTRM_LT_L1x16_SAVE + +DSTRM_LT_L1x16_LOOP: + + + KERNEL_16x1 + + addic. L, L, -1 + bgt DSTRM_LT_L1x16_LOOP + + +DSTRM_LT_L1x16_SAVE: + + SOLVE_LT_16x1 + + addi CO, CO, 16*SIZE + + sub T3, K, KK + sub T4, K, KK + slwi T3, T3, 4+BASE_SHIFT + slwi T4, T4, 0+BASE_SHIFT + add AO, AO, T3 + add BO, BO, T4 + addi KK, KK, 16 + + addic. I, I, -1 + bgt DSTRM_LT_L1x16_BEGIN + +DSTRM_LT_L1x16_END: + + +DSTRM_LT_L1x8_BEGIN: + + andi. T1, M, 8 + ble DSTRM_LT_L1x8_END + + mr BO, B + + +DSTRM_LT_L1x8_LOOP_START: + + + INIT_8x1 + + + addic. L, KK, 0 + ble DSTRM_LT_L1x8_SAVE + +DSTRM_LT_L1x8_LOOP: + + + KERNEL_8x1 + + addic. L, L, -1 + bgt DSTRM_LT_L1x8_LOOP + + +DSTRM_LT_L1x8_SAVE: + + SOLVE_LT_8x1 + + addi CO, CO, 8*SIZE + + sub T3, K, KK + sub T4, K, KK + slwi T3, T3, 3+BASE_SHIFT + slwi T4, T4, 0+BASE_SHIFT + add AO, AO, T3 + add BO, BO, T4 + addi KK, KK, 8 + +DSTRM_LT_L1x8_END: + + +DSTRM_LT_L1x4_BEGIN: + + andi. T1, M, 4 + ble DSTRM_LT_L1x4_END + + mr BO, B + + +DSTRM_LT_L1x4_LOOP_START: + + + INIT_4x1 + + + addic. L, KK, 0 + ble DSTRM_LT_L1x4_SAVE + +DSTRM_LT_L1x4_LOOP: + + + KERNEL_4x1 + + addic. L, L, -1 + bgt DSTRM_LT_L1x4_LOOP + + +DSTRM_LT_L1x4_SAVE: + + SOLVE_LT_4x1 + + addi CO, CO, 4*SIZE + + sub T3, K, KK + sub T4, K, KK + slwi T3, T3, 2+BASE_SHIFT + slwi T4, T4, 0+BASE_SHIFT + add AO, AO, T3 + add BO, BO, T4 + addi KK, KK, 4 + +DSTRM_LT_L1x4_END: + + +DSTRM_LT_L1x2_BEGIN: + + andi. T1, M, 2 + ble DSTRM_LT_L1x2_END + + mr BO, B + + +DSTRM_LT_L1x2_LOOP_START: + + + INIT_2x1 + + + addic. L, KK, 0 + ble DSTRM_LT_L1x2_SAVE + +DSTRM_LT_L1x2_LOOP: + + + KERNEL_2x1 + + addic. L, L, -1 + bgt DSTRM_LT_L1x2_LOOP + + +DSTRM_LT_L1x2_SAVE: + + SOLVE_LT_2x1 + + addi CO, CO, 2*SIZE + + sub T3, K, KK + sub T4, K, KK + slwi T3, T3, 1+BASE_SHIFT + slwi T4, T4, 0+BASE_SHIFT + add AO, AO, T3 + add BO, BO, T4 + addi KK, KK, 2 + +DSTRM_LT_L1x2_END: + + +DSTRM_LT_L1x1_BEGIN: + + andi. 
T1, M, 1 + ble DSTRM_LT_L1x1_END + + mr BO, B + + +DSTRM_LT_L1x1_LOOP_START: + + + INIT_1x1 + + + addic. L, KK, 0 + ble DSTRM_LT_L1x1_SAVE + +DSTRM_LT_L1x1_LOOP: + + + KERNEL_1x1 + + addic. L, L, -1 + bgt DSTRM_LT_L1x1_LOOP + + +DSTRM_LT_L1x1_SAVE: + + SOLVE_LT_1x1 + + addi CO, CO, 1*SIZE + + sub T3, K, KK + sub T4, K, KK + slwi T3, T3, 0+BASE_SHIFT + slwi T4, T4, 0+BASE_SHIFT + add AO, AO, T3 + add BO, BO, T4 + addi KK, KK, 1 + +DSTRM_LT_L1x1_END: + +DSTRM_LT_L1_END: diff --git a/kernel/power/dtrsm_macros_LT_16x4_power8.S b/kernel/power/dtrsm_macros_LT_16x4_power8.S new file mode 100644 index 0000000000..dc47daa3ac --- /dev/null +++ b/kernel/power/dtrsm_macros_LT_16x4_power8.S @@ -0,0 +1,4659 @@ + +.macro INIT_16x4 + + + xxlxor vs0, vs0, vs0 + + xvmovdp vs32, vs0 + xvmovdp vs33, vs0 + xvmovdp vs34, vs0 + xvmovdp vs35, vs0 + xvmovdp vs36, vs0 + xvmovdp vs37, vs0 + xvmovdp vs38, vs0 + xvmovdp vs39, vs0 + xvmovdp vs40, vs0 + xvmovdp vs41, vs0 + xvmovdp vs42, vs0 + xvmovdp vs43, vs0 + xvmovdp vs44, vs0 + xvmovdp vs45, vs0 + xvmovdp vs46, vs0 + xvmovdp vs47, vs0 + xvmovdp vs48, vs0 + xvmovdp vs49, vs0 + xvmovdp vs50, vs0 + xvmovdp vs51, vs0 + xvmovdp vs52, vs0 + xvmovdp vs53, vs0 + xvmovdp vs54, vs0 + xvmovdp vs55, vs0 + xvmovdp vs56, vs0 + xvmovdp vs57, vs0 + xvmovdp vs58, vs0 + xvmovdp vs59, vs0 + xvmovdp vs60, vs0 + xvmovdp vs61, vs0 + xvmovdp vs62, vs0 + xvmovdp vs63, vs0 + +.endm + + +.macro KERNEL_16x4 + + + lxvd2x vs0, o0, AO + + lxvdsx vs16, o0, BO + lxvdsx vs17, o8, BO + lxvdsx vs18, o16, BO + lxvdsx vs19, o24, BO + + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + addi BO, BO, 32 + addi AO, AO, 64 + + lxvd2x vs4, o0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs34, vs0, vs18 + xvmaddadp vs35, vs0, vs19 + xvmaddadp vs36, vs1, vs16 + xvmaddadp vs37, vs1, vs17 + xvmaddadp vs38, vs1, vs18 + xvmaddadp vs39, vs1, vs19 + xvmaddadp vs40, vs2, vs16 + xvmaddadp vs41, vs2, vs17 + xvmaddadp vs42, vs2, vs18 + xvmaddadp vs43, vs2, vs19 + xvmaddadp vs44, vs3, vs16 + xvmaddadp vs45, vs3, vs17 + xvmaddadp vs46, vs3, vs18 + xvmaddadp vs47, vs3, vs19 + xvmaddadp vs48, vs4, vs16 + xvmaddadp vs49, vs4, vs17 + xvmaddadp vs50, vs4, vs18 + xvmaddadp vs51, vs4, vs19 + xvmaddadp vs52, vs5, vs16 + xvmaddadp vs53, vs5, vs17 + xvmaddadp vs54, vs5, vs18 + xvmaddadp vs55, vs5, vs19 + xvmaddadp vs56, vs6, vs16 + xvmaddadp vs57, vs6, vs17 + xvmaddadp vs58, vs6, vs18 + xvmaddadp vs59, vs6, vs19 + xvmaddadp vs60, vs7, vs16 + xvmaddadp vs61, vs7, vs17 + xvmaddadp vs62, vs7, vs18 + xvmaddadp vs63, vs7, vs19 + + +.endm + + +.macro INIT_8x4 + + + xxlxor vs0, vs0, vs0 + + xvmovdp vs32, vs0 + xvmovdp vs33, vs0 + xvmovdp vs34, vs0 + xvmovdp vs35, vs0 + xvmovdp vs36, vs0 + xvmovdp vs37, vs0 + xvmovdp vs38, vs0 + xvmovdp vs39, vs0 + xvmovdp vs40, vs0 + xvmovdp vs41, vs0 + xvmovdp vs42, vs0 + xvmovdp vs43, vs0 + xvmovdp vs44, vs0 + xvmovdp vs45, vs0 + xvmovdp vs46, vs0 + xvmovdp vs47, vs0 + +.endm + + +.macro KERNEL_8x4 + + + lxvd2x vs0, o0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO + lxvdsx vs17, o8, BO + lxvdsx vs18, o16, BO + lxvdsx vs19, o24, BO + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs34, vs0, vs18 + xvmaddadp vs35, vs0, vs19 + xvmaddadp vs36, vs1, vs16 + xvmaddadp vs37, vs1, vs17 + xvmaddadp vs38, vs1, vs18 + xvmaddadp vs39, vs1, vs19 + xvmaddadp vs40, vs2, vs16 + 
xvmaddadp vs41, vs2, vs17 + xvmaddadp vs42, vs2, vs18 + xvmaddadp vs43, vs2, vs19 + xvmaddadp vs44, vs3, vs16 + xvmaddadp vs45, vs3, vs17 + xvmaddadp vs46, vs3, vs18 + xvmaddadp vs47, vs3, vs19 + + +.endm + + +.macro INIT_4x4 + + + xxlxor vs0, vs0, vs0 + + xvmovdp vs32, vs0 + xvmovdp vs33, vs0 + xvmovdp vs34, vs0 + xvmovdp vs35, vs0 + xvmovdp vs36, vs0 + xvmovdp vs37, vs0 + xvmovdp vs38, vs0 + xvmovdp vs39, vs0 + +.endm + + +.macro KERNEL_4x4 + + + lxvd2x vs0, o0, AO + lxvd2x vs1, o16, AO + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO + lxvdsx vs17, o8, BO + lxvdsx vs18, o16, BO + lxvdsx vs19, o24, BO + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs34, vs0, vs18 + xvmaddadp vs35, vs0, vs19 + xvmaddadp vs36, vs1, vs16 + xvmaddadp vs37, vs1, vs17 + xvmaddadp vs38, vs1, vs18 + xvmaddadp vs39, vs1, vs19 + + +.endm + + +.macro INIT_2x4 + + + xxlxor vs0, vs0, vs0 + + xvmovdp vs32, vs0 + xvmovdp vs33, vs0 + xvmovdp vs34, vs0 + xvmovdp vs35, vs0 + +.endm + + +.macro KERNEL_2x4 + + + lxvd2x vs0, o0, AO + + addi AO, AO, 16 + + lxvdsx vs16, o0, BO + lxvdsx vs17, o8, BO + lxvdsx vs18, o16, BO + lxvdsx vs19, o24, BO + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs34, vs0, vs18 + xvmaddadp vs35, vs0, vs19 + + +.endm + + +.macro INIT_1x4 + + + xxlxor vs0, vs0, vs0 + + xvmovdp vs32, vs0 + xvmovdp vs33, vs0 + xvmovdp vs34, vs0 + xvmovdp vs35, vs0 + +.endm + + +.macro KERNEL_1x4 + + + lxvdsx vs0, o0, AO + + addi AO, AO, 8 + + lxvdsx vs16, o0, BO + lxvdsx vs17, o8, BO + lxvdsx vs18, o16, BO + lxvdsx vs19, o24, BO + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs34, vs0, vs18 + xvmaddadp vs35, vs0, vs19 + + +.endm + + +/*########################################################################################## + SOLVE_LT 16x4 +##########################################################################################*/ + +.macro SOLVE_LT_16x4 + +//############### LOAD B ####################### + + mr T1, BO + mr T4, BO + + xxpermdi vs0, vs32, vs33, 0 + xxpermdi vs1, vs34, vs35, 0 + xxpermdi vs2, vs32, vs33, 3 + xxpermdi vs3, vs34, vs35, 3 + + lxvd2x vs32, o0, T1 + lxvd2x vs33, o16, T1 + lxvd2x vs34, o32, T1 + lxvd2x vs35, o48, T1 + + addi T1, T1, 64 + + xxpermdi vs4, vs36, vs37, 0 + xxpermdi vs5, vs38, vs39, 0 + xxpermdi vs6, vs36, vs37, 3 + xxpermdi vs7, vs38, vs39, 3 + + lxvd2x vs36, o0, T1 + lxvd2x vs37, o16, T1 + lxvd2x vs38, o32, T1 + lxvd2x vs39, o48, T1 + + addi T1, T1, 64 + + xxpermdi vs8, vs40, vs41, 0 + xxpermdi vs9, vs42, vs43, 0 + xxpermdi vs10, vs40, vs41, 3 + xxpermdi vs11, vs42, vs43, 3 + + lxvd2x vs40, o0, T1 + lxvd2x vs41, o16, T1 + lxvd2x vs42, o32, T1 + lxvd2x vs43, o48, T1 + + addi T1, T1, 64 + + xxpermdi vs12, vs44, vs45, 0 + xxpermdi vs13, vs46, vs47, 0 + xxpermdi vs14, vs44, vs45, 3 + xxpermdi vs15, vs46, vs47, 3 + + lxvd2x vs44, o0, T1 + lxvd2x vs45, o16, T1 + lxvd2x vs46, o32, T1 + lxvd2x vs47, o48, T1 + + addi T1, T1, 64 + + xxpermdi vs16, vs48, vs49, 0 + xxpermdi vs17, vs50, vs51, 0 + xxpermdi vs18, vs48, vs49, 3 + xxpermdi vs19, vs50, vs51, 3 + + lxvd2x vs48, o0, T1 + lxvd2x vs49, o16, T1 + lxvd2x vs50, o32, T1 + lxvd2x vs51, o48, T1 + + addi T1, T1, 64 + + xxpermdi vs20, vs52, vs53, 0 + xxpermdi vs21, vs54, vs55, 0 + xxpermdi vs22, vs52, vs53, 3 + xxpermdi vs23, vs54, vs55, 3 + + lxvd2x vs52, o0, T1 + lxvd2x vs53, o16, T1 + lxvd2x vs54, o32, T1 + lxvd2x vs55, o48, T1 + + addi T1, T1, 64 + + xxpermdi vs24, vs56, vs57, 0 + xxpermdi vs25, vs58, vs59, 0 + 
xxpermdi vs26, vs56, vs57, 3 + xxpermdi vs27, vs58, vs59, 3 + + lxvd2x vs56, o0, T1 + lxvd2x vs57, o16, T1 + lxvd2x vs58, o32, T1 + lxvd2x vs59, o48, T1 + + addi T1, T1, 64 + + xxpermdi vs28, vs60, vs61, 0 + xxpermdi vs29, vs62, vs63, 0 + xxpermdi vs30, vs60, vs61, 3 + xxpermdi vs31, vs62, vs63, 3 + + + + lxvd2x vs60, o0, T1 + lxvd2x vs61, o16, T1 + lxvd2x vs62, o32, T1 + lxvd2x vs63, o48, T1 + +//############### OFFSET 0 ####################### + + dcbt AO, PRE + mr T1, AO + + xvsubdp vs32, vs32, vs0 + xvsubdp vs33, vs33, vs1 + xvsubdp vs34, vs34, vs2 + xvsubdp vs35, vs35, vs3 + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + xvsubdp vs36, vs36, vs4 + xvsubdp vs37, vs37, vs5 + xvsubdp vs38, vs38, vs6 + xvsubdp vs39, vs39, vs7 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + lxvdsx vs6, o16, T1 + lxvdsx vs7, o24, T1 + + addi T1, T1, 32 + + xvsubdp vs40, vs40, vs8 + xvsubdp vs41, vs41, vs9 + xvsubdp vs42, vs42, vs10 + xvsubdp vs43, vs43, vs11 + + lxvdsx vs8, o0, T1 + lxvdsx vs9, o8, T1 + lxvdsx vs10, o16, T1 + lxvdsx vs11, o24, T1 + + addi T1, T1, 32 + + xvsubdp vs44, vs44, vs12 + xvsubdp vs45, vs45, vs13 + xvsubdp vs46, vs46, vs14 + xvsubdp vs47, vs47, vs15 + + lxvdsx vs12, o0, T1 + lxvdsx vs13, o8, T1 + lxvdsx vs14, o16, T1 + lxvdsx vs15, o24, T1 + + addi T1, T1, 32 + + xvsubdp vs48, vs48, vs16 + xvsubdp vs49, vs49, vs17 + xvsubdp vs50, vs50, vs18 + xvsubdp vs51, vs51, vs19 + + xvsubdp vs52, vs52, vs20 + xvsubdp vs53, vs53, vs21 + xvsubdp vs54, vs54, vs22 + xvsubdp vs55, vs55, vs23 + + xvsubdp vs56, vs56, vs24 + xvsubdp vs57, vs57, vs25 + xvsubdp vs58, vs58, vs26 + xvsubdp vs59, vs59, vs27 + + xvsubdp vs60, vs60, vs28 + xvsubdp vs61, vs61, vs29 + xvsubdp vs62, vs62, vs30 + xvsubdp vs63, vs63, vs31 + +//############### OFFSET 1 ####################### + + addi T1, T1, 1*SIZE + + xvmuldp vs32, vs32, vs0 + xvmuldp vs33, vs33, vs0 + + xvnmsubadp vs34, vs32, vs1 + xvnmsubadp vs35, vs33, vs1 + xvnmsubadp vs36, vs32, vs2 + dcbt T1, PRE + xvnmsubadp vs37, vs33, vs2 + xvnmsubadp vs38, vs32, vs3 + xvnmsubadp vs39, vs33, vs3 + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + xvnmsubadp vs40, vs32, vs4 + xvnmsubadp vs41, vs33, vs4 + xvnmsubadp vs42, vs32, vs5 + xvnmsubadp vs43, vs33, vs5 + xvnmsubadp vs44, vs32, vs6 + xvnmsubadp vs45, vs33, vs6 + xvnmsubadp vs46, vs32, vs7 + xvnmsubadp vs47, vs33, vs7 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + lxvdsx vs6, o16, T1 + lxvdsx vs7, o24, T1 + + addi T1, T1, 32 + + xvnmsubadp vs48, vs32, vs8 + xvnmsubadp vs49, vs33, vs8 + xvnmsubadp vs50, vs32, vs9 + xvnmsubadp vs51, vs33, vs9 + xvnmsubadp vs52, vs32, vs10 + xvnmsubadp vs53, vs33, vs10 + xvnmsubadp vs54, vs32, vs11 + xvnmsubadp vs55, vs33, vs11 + + lxvdsx vs8, o0, T1 + lxvdsx vs9, o8, T1 + lxvdsx vs10, o16, T1 + lxvdsx vs11, o24, T1 + + addi T1, T1, 32 + + xvnmsubadp vs56, vs32, vs12 + xvnmsubadp vs57, vs33, vs12 + xvnmsubadp vs58, vs32, vs13 + xvnmsubadp vs59, vs33, vs13 + xvnmsubadp vs60, vs32, vs14 + xvnmsubadp vs61, vs33, vs14 + xvnmsubadp vs62, vs32, vs15 + xvnmsubadp vs63, vs33, vs15 + + + lxvdsx vs12, o0, T1 + lxvdsx vs13, o8, T1 + lxvdsx vs14, o16, T1 + + addi T1, T1, 24 + +//############### OFFSET 2 ####################### + + xvmuldp vs34, vs34, vs0 + xvmuldp vs35, vs35, vs0 + + addi T1, T1, 2*SIZE + + xvnmsubadp vs36, vs34, vs1 + xvnmsubadp vs37, vs35, vs1 + xvnmsubadp vs38, vs34, vs2 + dcbt T1, PRE + xvnmsubadp vs39, vs35, vs2 + xvnmsubadp vs40, vs34, vs3 + xvnmsubadp vs41, 
vs35, vs3 + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + xvnmsubadp vs42, vs34, vs4 + xvnmsubadp vs43, vs35, vs4 + xvnmsubadp vs44, vs34, vs5 + xvnmsubadp vs45, vs35, vs5 + xvnmsubadp vs46, vs34, vs6 + xvnmsubadp vs47, vs35, vs6 + xvnmsubadp vs48, vs34, vs7 + xvnmsubadp vs49, vs35, vs7 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + lxvdsx vs6, o16, T1 + lxvdsx vs7, o24, T1 + + addi T1, T1, 32 + + xvnmsubadp vs50, vs34, vs8 + xvnmsubadp vs51, vs35, vs8 + xvnmsubadp vs52, vs34, vs9 + xvnmsubadp vs53, vs35, vs9 + xvnmsubadp vs54, vs34, vs10 + xvnmsubadp vs55, vs35, vs10 + xvnmsubadp vs56, vs34, vs11 + xvnmsubadp vs57, vs35, vs11 + + lxvdsx vs8, o0, T1 + lxvdsx vs9, o8, T1 + lxvdsx vs10, o16, T1 + lxvdsx vs11, o24, T1 + + addi T1, T1, 32 + + + xvnmsubadp vs58, vs34, vs12 + xvnmsubadp vs59, vs35, vs12 + xvnmsubadp vs60, vs34, vs13 + xvnmsubadp vs61, vs35, vs13 + xvnmsubadp vs62, vs34, vs14 + xvnmsubadp vs63, vs35, vs14 + + lxvdsx vs12, o0, T1 + lxvdsx vs13, o8, T1 + + addi T1, T1, 16 + +//############### OFFSET 3 ####################### + xvmuldp vs36, vs36, vs0 + xvmuldp vs37, vs37, vs0 + + addi T1, T1, 3*SIZE + + xvnmsubadp vs38, vs36, vs1 + xvnmsubadp vs39, vs37, vs1 + xvnmsubadp vs40, vs36, vs2 + dcbt T1, PRE + xvnmsubadp vs41, vs37, vs2 + xvnmsubadp vs42, vs36, vs3 + xvnmsubadp vs43, vs37, vs3 + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + xvnmsubadp vs44, vs36, vs4 + xvnmsubadp vs45, vs37, vs4 + xvnmsubadp vs46, vs36, vs5 + xvnmsubadp vs47, vs37, vs5 + xvnmsubadp vs48, vs36, vs6 + xvnmsubadp vs49, vs37, vs6 + xvnmsubadp vs50, vs36, vs7 + xvnmsubadp vs51, vs37, vs7 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + lxvdsx vs6, o16, T1 + lxvdsx vs7, o24, T1 + + addi T1, T1, 32 + + xvnmsubadp vs52, vs36, vs8 + xvnmsubadp vs53, vs37, vs8 + xvnmsubadp vs54, vs36, vs9 + xvnmsubadp vs55, vs37, vs9 + xvnmsubadp vs56, vs36, vs10 + xvnmsubadp vs57, vs37, vs10 + xvnmsubadp vs58, vs36, vs11 + xvnmsubadp vs59, vs37, vs11 + + lxvdsx vs8, o0, T1 + lxvdsx vs9, o8, T1 + lxvdsx vs10, o16, T1 + lxvdsx vs11, o24, T1 + + addi T1, T1, 32 + + xvnmsubadp vs60, vs36, vs12 + xvnmsubadp vs61, vs37, vs12 + xvnmsubadp vs62, vs36, vs13 + xvnmsubadp vs63, vs37, vs13 + + lxvdsx vs12, o0, T1 + + stxvd2x vs32, o0, T4 + stxvd2x vs33, o16, T4 + stxvd2x vs34, o32, T4 + stxvd2x vs35, o48, T4 + + addi T4, T4, 64 + + addi T1, T1, 8 + +//############### OFFSET 4 ####################### + xvmuldp vs38, vs38, vs0 + xvmuldp vs39, vs39, vs0 + + addi T1, T1, 4*SIZE + + xvnmsubadp vs40, vs38, vs1 + xvnmsubadp vs41, vs39, vs1 + xvnmsubadp vs42, vs38, vs2 + dcbt T1, PRE + xvnmsubadp vs43, vs39, vs2 + xvnmsubadp vs44, vs38, vs3 + xvnmsubadp vs45, vs39, vs3 + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + xvnmsubadp vs46, vs38, vs4 + xvnmsubadp vs47, vs39, vs4 + xvnmsubadp vs48, vs38, vs5 + xvnmsubadp vs49, vs39, vs5 + xvnmsubadp vs50, vs38, vs6 + xvnmsubadp vs51, vs39, vs6 + xvnmsubadp vs52, vs38, vs7 + xvnmsubadp vs53, vs39, vs7 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + lxvdsx vs6, o16, T1 + lxvdsx vs7, o24, T1 + + addi T1, T1, 32 + + + xvnmsubadp vs54, vs38, vs8 + xvnmsubadp vs55, vs39, vs8 + xvnmsubadp vs56, vs38, vs9 + xvnmsubadp vs57, vs39, vs9 + xvnmsubadp vs58, vs38, vs10 + xvnmsubadp vs59, vs39, vs10 + xvnmsubadp vs60, vs38, vs11 + xvnmsubadp vs61, vs39, vs11 + + lxvdsx vs8, o0, T1 + lxvdsx vs9, o8, T1 + lxvdsx vs10, o16, T1 + lxvdsx vs11, 
o24, T1 + + addi T1, T1, 32 + + xvnmsubadp vs62, vs38, vs12 + xvnmsubadp vs63, vs39, vs12 + + +//############### OFFSET 5 ####################### + xvmuldp vs40, vs40, vs0 + xvmuldp vs41, vs41, vs0 + + addi T1, T1, 5*SIZE + + xvnmsubadp vs42, vs40, vs1 + xvnmsubadp vs43, vs41, vs1 + xvnmsubadp vs44, vs40, vs2 + dcbt T1, PRE + xvnmsubadp vs45, vs41, vs2 + xvnmsubadp vs46, vs40, vs3 + xvnmsubadp vs47, vs41, vs3 + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + xvnmsubadp vs48, vs40, vs4 + xvnmsubadp vs49, vs41, vs4 + xvnmsubadp vs50, vs40, vs5 + xvnmsubadp vs51, vs41, vs5 + xvnmsubadp vs52, vs40, vs6 + xvnmsubadp vs53, vs41, vs6 + xvnmsubadp vs54, vs40, vs7 + xvnmsubadp vs55, vs41, vs7 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + lxvdsx vs6, o16, T1 + lxvdsx vs7, o24, T1 + + addi T1, T1, 32 + + xvnmsubadp vs56, vs40, vs8 + xvnmsubadp vs57, vs41, vs8 + xvnmsubadp vs58, vs40, vs9 + xvnmsubadp vs59, vs41, vs9 + xvnmsubadp vs60, vs40, vs10 + xvnmsubadp vs61, vs41, vs10 + xvnmsubadp vs62, vs40, vs11 + xvnmsubadp vs63, vs41, vs11 + + + lxvdsx vs8, o0, T1 + lxvdsx vs9, o8, T1 + lxvdsx vs10, o16, T1 + + addi T1, T1, 24 + +//############### OFFSET 6 ####################### + xvmuldp vs42, vs42, vs0 + xvmuldp vs43, vs43, vs0 + + addi T1, T1, 6*SIZE + + xvnmsubadp vs44, vs42, vs1 + xvnmsubadp vs45, vs43, vs1 + xvnmsubadp vs46, vs42, vs2 + dcbt T1, PRE + xvnmsubadp vs47, vs43, vs2 + xvnmsubadp vs48, vs42, vs3 + xvnmsubadp vs49, vs43, vs3 + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + xvnmsubadp vs50, vs42, vs4 + xvnmsubadp vs51, vs43, vs4 + xvnmsubadp vs52, vs42, vs5 + xvnmsubadp vs53, vs43, vs5 + xvnmsubadp vs54, vs42, vs6 + xvnmsubadp vs55, vs43, vs6 + xvnmsubadp vs56, vs42, vs7 + xvnmsubadp vs57, vs43, vs7 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + lxvdsx vs6, o16, T1 + lxvdsx vs7, o24, T1 + + addi T1, T1, 32 + + xvnmsubadp vs58, vs42, vs8 + xvnmsubadp vs59, vs43, vs8 + xvnmsubadp vs60, vs42, vs9 + xvnmsubadp vs61, vs43, vs9 + xvnmsubadp vs62, vs42, vs10 + xvnmsubadp vs63, vs43, vs10 + + lxvdsx vs8, o0, T1 + lxvdsx vs9, o8, T1 + + addi T1, T1, 16 + + stxvd2x vs36, o0, T4 + stxvd2x vs37, o16, T4 + stxvd2x vs38, o32, T4 + stxvd2x vs39, o48, T4 + + addi T4, T4, 64 + +//############### OFFSET 7 ####################### + xvmuldp vs44, vs44, vs0 + xvmuldp vs45, vs45, vs0 + + addi T1, T1, 7*SIZE + + xvnmsubadp vs46, vs44, vs1 + xvnmsubadp vs47, vs45, vs1 + xvnmsubadp vs48, vs44, vs2 + dcbt T1, PRE + xvnmsubadp vs49, vs45, vs2 + xvnmsubadp vs50, vs44, vs3 + xvnmsubadp vs51, vs45, vs3 + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + xvnmsubadp vs52, vs44, vs4 + xvnmsubadp vs53, vs45, vs4 + xvnmsubadp vs54, vs44, vs5 + xvnmsubadp vs55, vs45, vs5 + xvnmsubadp vs56, vs44, vs6 + xvnmsubadp vs57, vs45, vs6 + xvnmsubadp vs58, vs44, vs7 + xvnmsubadp vs59, vs45, vs7 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + lxvdsx vs6, o16, T1 + lxvdsx vs7, o24, T1 + + addi T1, T1, 32 + + xvnmsubadp vs60, vs44, vs8 + xvnmsubadp vs61, vs45, vs8 + xvnmsubadp vs62, vs44, vs9 + xvnmsubadp vs63, vs45, vs9 + + lxvdsx vs8, o0, T1 + + addi T1, T1, 8 + +//############### OFFSET 8 ####################### + xvmuldp vs46, vs46, vs0 + xvmuldp vs47, vs47, vs0 + + addi T1, T1, 8*SIZE + + xvnmsubadp vs48, vs46, vs1 + xvnmsubadp vs49, vs47, vs1 + xvnmsubadp vs50, vs46, vs2 + dcbt T1, PRE + xvnmsubadp vs51, vs47, vs2 + xvnmsubadp vs52, vs46, vs3 + 
xvnmsubadp vs53, vs47, vs3 + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + xvnmsubadp vs54, vs46, vs4 + xvnmsubadp vs55, vs47, vs4 + xvnmsubadp vs56, vs46, vs5 + xvnmsubadp vs57, vs47, vs5 + xvnmsubadp vs58, vs46, vs6 + xvnmsubadp vs59, vs47, vs6 + xvnmsubadp vs60, vs46, vs7 + xvnmsubadp vs61, vs47, vs7 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + lxvdsx vs6, o16, T1 + lxvdsx vs7, o24, T1 + + addi T1, T1, 32 + + stxvd2x vs40, o0, T4 + stxvd2x vs41, o16, T4 + stxvd2x vs42, o32, T4 + stxvd2x vs43, o48, T4 + + addi T4, T4, 64 + + xvnmsubadp vs62, vs46, vs8 + xvnmsubadp vs63, vs47, vs8 + + +//############### OFFSET 9 ####################### + xvmuldp vs48, vs48, vs0 + xvmuldp vs49, vs49, vs0 + + addi T1, T1, 9*SIZE + + xvnmsubadp vs50, vs48, vs1 + xvnmsubadp vs51, vs49, vs1 + xvnmsubadp vs52, vs48, vs2 + dcbt T1, PRE + xvnmsubadp vs53, vs49, vs2 + xvnmsubadp vs54, vs48, vs3 + xvnmsubadp vs55, vs49, vs3 + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + xvnmsubadp vs56, vs48, vs4 + xvnmsubadp vs57, vs49, vs4 + xvnmsubadp vs58, vs48, vs5 + xvnmsubadp vs59, vs49, vs5 + xvnmsubadp vs60, vs48, vs6 + xvnmsubadp vs61, vs49, vs6 + xvnmsubadp vs62, vs48, vs7 + xvnmsubadp vs63, vs49, vs7 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + lxvdsx vs6, o16, T1 + + addi T1, T1, 24 + +//############### OFFSET 10 ####################### + xvmuldp vs50, vs50, vs0 + xvmuldp vs51, vs51, vs0 + + addi T1, T1, 10*SIZE + + xvnmsubadp vs52, vs50, vs1 + xvnmsubadp vs53, vs51, vs1 + xvnmsubadp vs54, vs50, vs2 + dcbt T1, PRE + xvnmsubadp vs55, vs51, vs2 + xvnmsubadp vs56, vs50, vs3 + xvnmsubadp vs57, vs51, vs3 + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + xvnmsubadp vs58, vs50, vs4 + xvnmsubadp vs59, vs51, vs4 + xvnmsubadp vs60, vs50, vs5 + xvnmsubadp vs61, vs51, vs5 + xvnmsubadp vs62, vs50, vs6 + xvnmsubadp vs63, vs51, vs6 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + + addi T1, T1, 16 + + stxvd2x vs44, o0, T4 + stxvd2x vs45, o16, T4 + stxvd2x vs46, o32, T4 + stxvd2x vs47, o48, T4 + + addi T4, T4, 64 + +//############### OFFSET 11 ####################### + xvmuldp vs52, vs52, vs0 + xvmuldp vs53, vs53, vs0 + + addi T1, T1, 11*SIZE + + xvnmsubadp vs54, vs52, vs1 + xvnmsubadp vs55, vs53, vs1 + xvnmsubadp vs56, vs52, vs2 + dcbt T1, PRE + xvnmsubadp vs57, vs53, vs2 + xvnmsubadp vs58, vs52, vs3 + xvnmsubadp vs59, vs53, vs3 + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + xvnmsubadp vs60, vs52, vs4 + xvnmsubadp vs61, vs53, vs4 + xvnmsubadp vs62, vs52, vs5 + xvnmsubadp vs63, vs53, vs5 + + lxvdsx vs4, o0, T1 + + addi T1, T1, 8 + +//############### OFFSET 12 ####################### + xvmuldp vs54, vs54, vs0 + xvmuldp vs55, vs55, vs0 + + addi T1, T1, 12*SIZE + + xvnmsubadp vs56, vs54, vs1 + xvnmsubadp vs57, vs55, vs1 + xvnmsubadp vs58, vs54, vs2 + dcbt T1, PRE + xvnmsubadp vs59, vs55, vs2 + xvnmsubadp vs60, vs54, vs3 + xvnmsubadp vs61, vs55, vs3 + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + stxvd2x vs48, o0, T4 + stxvd2x vs49, o16, T4 + stxvd2x vs50, o32, T4 + stxvd2x vs51, o48, T4 + + addi T4, T4, 64 + + xvnmsubadp vs62, vs54, vs4 + xvnmsubadp vs63, vs55, vs4 + + +//############### OFFSET 13 ####################### + xvmuldp vs56, vs56, vs0 + xvmuldp vs57, vs57, vs0 + + addi T1, T1, 13*SIZE + + xvnmsubadp vs58, vs56, 
vs1 + xvnmsubadp vs59, vs57, vs1 + xvnmsubadp vs60, vs56, vs2 + xvnmsubadp vs61, vs57, vs2 + xvnmsubadp vs62, vs56, vs3 + xvnmsubadp vs63, vs57, vs3 + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + + addi T1, T1, 24 + +//############### OFFSET 14 ####################### + xvmuldp vs58, vs58, vs0 + xvmuldp vs59, vs59, vs0 + + addi T1, T1, 14*SIZE + + xvnmsubadp vs60, vs58, vs1 + xvnmsubadp vs61, vs59, vs1 + xvnmsubadp vs62, vs58, vs2 + xvnmsubadp vs63, vs59, vs2 + + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + + addi T1, T1, 16 + + stxvd2x vs52, o0, T4 + stxvd2x vs53, o16, T4 + stxvd2x vs54, o32, T4 + stxvd2x vs55, o48, T4 + + addi T4, T4, 64 +//############### OFFSET 15 ####################### + xvmuldp vs60, vs60, vs0 + xvmuldp vs61, vs61, vs0 + + addi T1, T1, 15*SIZE + + xvnmsubadp vs62, vs60, vs1 + xvnmsubadp vs63, vs61, vs1 + + lxvdsx vs0, o0, T1 + + addi T1, T1, 8 + + xvmuldp vs62, vs62, vs0 + xvmuldp vs63, vs63, vs0 + + +//############### SAVE B ####################### + + + + stxvd2x vs56, o0, T4 + stxvd2x vs57, o16, T4 + stxvd2x vs58, o32, T4 + stxvd2x vs59, o48, T4 + + addi T4, T4, 64 + + stxvd2x vs60, o0, T4 + stxvd2x vs61, o16, T4 + stxvd2x vs62, o32, T4 + stxvd2x vs63, o48, T4 + +//############### SAVE C ####################### + + + mr T1, CO + add T2, CO, LDC + + + stxsdx vs32, o0, T1 + xxswapd vs32, vs32 + stxsdx vs34, o8, T1 + xxswapd vs34, vs34 + stxsdx vs36, o16, T1 + xxswapd vs36, vs36 + stxsdx vs38, o24, T1 + xxswapd vs38, vs38 + + addi T1, T1, 32 + + stxsdx vs40, o0, T1 + xxswapd vs40, vs40 + stxsdx vs42, o8, T1 + xxswapd vs42, vs42 + stxsdx vs44, o16, T1 + xxswapd vs44, vs44 + stxsdx vs46, o24, T1 + xxswapd vs46, vs46 + + addi T1, T1, 32 + + stxsdx vs48, o0, T1 + xxswapd vs48, vs48 + stxsdx vs50, o8, T1 + xxswapd vs50, vs50 + stxsdx vs52, o16, T1 + xxswapd vs52, vs52 + stxsdx vs54, o24, T1 + xxswapd vs54, vs54 + + addi T1, T1, 32 + + stxsdx vs56, o0, T1 + xxswapd vs56, vs56 + stxsdx vs58, o8, T1 + xxswapd vs58, vs58 + stxsdx vs60, o16, T1 + xxswapd vs60, vs60 + stxsdx vs62, o24, T1 + xxswapd vs62, vs62 + + stxsdx vs32, o0, T2 + stxsdx vs34, o8, T2 + stxsdx vs36, o16, T2 + stxsdx vs38, o24, T2 + + addi T2, T2, 32 + + stxsdx vs40, o0, T2 + stxsdx vs42, o8, T2 + stxsdx vs44, o16, T2 + stxsdx vs46, o24, T2 + + addi T2, T2, 32 + + stxsdx vs48, o0, T2 + stxsdx vs50, o8, T2 + stxsdx vs52, o16, T2 + stxsdx vs54, o24, T2 + + addi T2, T2, 32 + + stxsdx vs56, o0, T2 + stxsdx vs58, o8, T2 + stxsdx vs60, o16, T2 + stxsdx vs62, o24, T2 + + mr T1, CO + add T2, CO, LDC + + + add T1, T2, LDC + add T2, T1, LDC + + + stxsdx vs33, o0, T1 + xxswapd vs33, vs33 + stxsdx vs35, o8, T1 + xxswapd vs35, vs35 + stxsdx vs37, o16, T1 + xxswapd vs37, vs37 + stxsdx vs39, o24, T1 + xxswapd vs39, vs39 + + addi T1, T1, 32 + + stxsdx vs41, o0, T1 + xxswapd vs41, vs41 + stxsdx vs43, o8, T1 + xxswapd vs43, vs43 + stxsdx vs45, o16, T1 + xxswapd vs45, vs45 + stxsdx vs47, o24, T1 + xxswapd vs47, vs47 + + addi T1, T1, 32 + + stxsdx vs49, o0, T1 + xxswapd vs49, vs49 + stxsdx vs51, o8, T1 + xxswapd vs51, vs51 + stxsdx vs53, o16, T1 + xxswapd vs53, vs53 + stxsdx vs55, o24, T1 + xxswapd vs55, vs55 + + addi T1, T1, 32 + + stxsdx vs57, o0, T1 + xxswapd vs57, vs57 + stxsdx vs59, o8, T1 + xxswapd vs59, vs59 + stxsdx vs61, o16, T1 + xxswapd vs61, vs61 + stxsdx vs63, o24, T1 + xxswapd vs63, vs63 + + stxsdx vs33, o0, T2 + stxsdx vs35, o8, T2 + stxsdx vs37, o16, T2 + stxsdx vs39, o24, T2 + + addi T2, T2, 32 + + stxsdx vs41, o0, T2 + stxsdx vs43, o8, T2 + stxsdx vs45, o16, T2 + stxsdx vs47, o24, T2 + + 
addi T2, T2, 32 + + stxsdx vs49, o0, T2 + stxsdx vs51, o8, T2 + stxsdx vs53, o16, T2 + stxsdx vs55, o24, T2 + + addi T2, T2, 32 + + stxsdx vs57, o0, T2 + stxsdx vs59, o8, T2 + stxsdx vs61, o16, T2 + stxsdx vs63, o24, T2 + +.endm + + +/*########################################################################################## + SOLVE_LT 8x4 +##########################################################################################*/ + +.macro SOLVE_LT_8x4 + + xxpermdi vs0, vs32, vs33, 0 + xxpermdi vs1, vs34, vs35, 0 + xxpermdi vs2, vs32, vs33, 3 + xxpermdi vs3, vs34, vs35, 3 + + xxpermdi vs4, vs36, vs37, 0 + xxpermdi vs5, vs38, vs39, 0 + xxpermdi vs6, vs36, vs37, 3 + xxpermdi vs7, vs38, vs39, 3 + + xxpermdi vs8, vs40, vs41, 0 + xxpermdi vs9, vs42, vs43, 0 + xxpermdi vs10, vs40, vs41, 3 + xxpermdi vs11, vs42, vs43, 3 + + xxpermdi vs12, vs44, vs45, 0 + xxpermdi vs13, vs46, vs47, 0 + xxpermdi vs14, vs44, vs45, 3 + xxpermdi vs15, vs46, vs47, 3 + + +//############### LOAD B ####################### + + + mr T1, BO + + lxvd2x vs32, o0, T1 + lxvd2x vs33, o16, T1 + lxvd2x vs34, o32, T1 + lxvd2x vs35, o48, T1 + + addi T1, T1, 64 + + lxvd2x vs36, o0, T1 + lxvd2x vs37, o16, T1 + lxvd2x vs38, o32, T1 + lxvd2x vs39, o48, T1 + + addi T1, T1, 64 + + lxvd2x vs40, o0, T1 + lxvd2x vs41, o16, T1 + lxvd2x vs42, o32, T1 + lxvd2x vs43, o48, T1 + + addi T1, T1, 64 + + lxvd2x vs44, o0, T1 + lxvd2x vs45, o16, T1 + lxvd2x vs46, o32, T1 + lxvd2x vs47, o48, T1 + + xvsubdp vs32, vs32, vs0 + xvsubdp vs33, vs33, vs1 + xvsubdp vs34, vs34, vs2 + xvsubdp vs35, vs35, vs3 + xvsubdp vs36, vs36, vs4 + xvsubdp vs37, vs37, vs5 + xvsubdp vs38, vs38, vs6 + xvsubdp vs39, vs39, vs7 + xvsubdp vs40, vs40, vs8 + xvsubdp vs41, vs41, vs9 + xvsubdp vs42, vs42, vs10 + xvsubdp vs43, vs43, vs11 + xvsubdp vs44, vs44, vs12 + xvsubdp vs45, vs45, vs13 + xvsubdp vs46, vs46, vs14 + xvsubdp vs47, vs47, vs15 + + mr T1, AO + + +//############### OFFSET 0 ####################### + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + lxvdsx vs6, o16, T1 + lxvdsx vs7, o24, T1 + + addi T1, T1, 32 + + xvmuldp vs32, vs32, vs0 + xvmuldp vs33, vs33, vs0 + + xvnmsubadp vs34, vs32, vs1 + xvnmsubadp vs35, vs33, vs1 + xvnmsubadp vs36, vs32, vs2 + xvnmsubadp vs37, vs33, vs2 + xvnmsubadp vs38, vs32, vs3 + xvnmsubadp vs39, vs33, vs3 + xvnmsubadp vs40, vs32, vs4 + xvnmsubadp vs41, vs33, vs4 + xvnmsubadp vs42, vs32, vs5 + xvnmsubadp vs43, vs33, vs5 + xvnmsubadp vs44, vs32, vs6 + xvnmsubadp vs45, vs33, vs6 + xvnmsubadp vs46, vs32, vs7 + xvnmsubadp vs47, vs33, vs7 + +//############### OFFSET 1 ####################### + + addi T1, T1, 1*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + lxvdsx vs6, o16, T1 + + addi T1, T1, 24 + + xvmuldp vs34, vs34, vs0 + xvmuldp vs35, vs35, vs0 + + xvnmsubadp vs36, vs34, vs1 + xvnmsubadp vs37, vs35, vs1 + xvnmsubadp vs38, vs34, vs2 + xvnmsubadp vs39, vs35, vs2 + xvnmsubadp vs40, vs34, vs3 + xvnmsubadp vs41, vs35, vs3 + xvnmsubadp vs42, vs34, vs4 + xvnmsubadp vs43, vs35, vs4 + xvnmsubadp vs44, vs34, vs5 + xvnmsubadp vs45, vs35, vs5 + xvnmsubadp vs46, vs34, vs6 + xvnmsubadp vs47, vs35, vs6 + +//############### OFFSET 2 ####################### + + addi T1, T1, 2*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + + addi T1, T1, 16 + 
+ xvmuldp vs36, vs36, vs0 + xvmuldp vs37, vs37, vs0 + + xvnmsubadp vs38, vs36, vs1 + xvnmsubadp vs39, vs37, vs1 + xvnmsubadp vs40, vs36, vs2 + xvnmsubadp vs41, vs37, vs2 + xvnmsubadp vs42, vs36, vs3 + xvnmsubadp vs43, vs37, vs3 + xvnmsubadp vs44, vs36, vs4 + xvnmsubadp vs45, vs37, vs4 + xvnmsubadp vs46, vs36, vs5 + xvnmsubadp vs47, vs37, vs5 + +//############### OFFSET 3 ####################### + + addi T1, T1, 3*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs4, o0, T1 + + addi T1, T1, 8 + + xvmuldp vs38, vs38, vs0 + xvmuldp vs39, vs39, vs0 + + xvnmsubadp vs40, vs38, vs1 + xvnmsubadp vs41, vs39, vs1 + xvnmsubadp vs42, vs38, vs2 + xvnmsubadp vs43, vs39, vs2 + xvnmsubadp vs44, vs38, vs3 + xvnmsubadp vs45, vs39, vs3 + xvnmsubadp vs46, vs38, vs4 + xvnmsubadp vs47, vs39, vs4 + +//############### OFFSET 4 ####################### + + addi T1, T1, 4*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + xvmuldp vs40, vs40, vs0 + xvmuldp vs41, vs41, vs0 + + xvnmsubadp vs42, vs40, vs1 + xvnmsubadp vs43, vs41, vs1 + xvnmsubadp vs44, vs40, vs2 + xvnmsubadp vs45, vs41, vs2 + xvnmsubadp vs46, vs40, vs3 + xvnmsubadp vs47, vs41, vs3 + +//############### OFFSET 5 ####################### + + addi T1, T1, 5*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + + addi T1, T1, 24 + + xvmuldp vs42, vs42, vs0 + xvmuldp vs43, vs43, vs0 + + xvnmsubadp vs44, vs42, vs1 + xvnmsubadp vs45, vs43, vs1 + xvnmsubadp vs46, vs42, vs2 + xvnmsubadp vs47, vs43, vs2 + +//############### OFFSET 6 ####################### + + addi T1, T1, 6*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + + addi T1, T1, 16 + + xvmuldp vs44, vs44, vs0 + xvmuldp vs45, vs45, vs0 + + xvnmsubadp vs46, vs44, vs1 + xvnmsubadp vs47, vs45, vs1 + +//############### OFFSET 7 ####################### + + addi T1, T1, 7*SIZE + + lxvdsx vs0, o0, T1 + + addi T1, T1, 8 + + xvmuldp vs46, vs46, vs0 + xvmuldp vs47, vs47, vs0 + + +//############### SAVE B ####################### + + + mr T1, BO + + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs40, o0, T1 + stxvd2x vs41, o16, T1 + stxvd2x vs42, o32, T1 + stxvd2x vs43, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs44, o0, T1 + stxvd2x vs45, o16, T1 + stxvd2x vs46, o32, T1 + stxvd2x vs47, o48, T1 + +//############### SAVE C ####################### + + + mr T1, CO + add T2, CO, LDC + + + stxsdx vs32, o0, T1 + xxswapd vs32, vs32 + stxsdx vs34, o8, T1 + xxswapd vs34, vs34 + stxsdx vs36, o16, T1 + xxswapd vs36, vs36 + stxsdx vs38, o24, T1 + xxswapd vs38, vs38 + + addi T1, T1, 32 + + stxsdx vs40, o0, T1 + xxswapd vs40, vs40 + stxsdx vs42, o8, T1 + xxswapd vs42, vs42 + stxsdx vs44, o16, T1 + xxswapd vs44, vs44 + stxsdx vs46, o24, T1 + xxswapd vs46, vs46 + + stxsdx vs32, o0, T2 + stxsdx vs34, o8, T2 + stxsdx vs36, o16, T2 + stxsdx vs38, o24, T2 + + addi T2, T2, 32 + + stxsdx vs40, o0, T2 + stxsdx vs42, o8, T2 + stxsdx vs44, o16, T2 + stxsdx vs46, o24, T2 + + mr T1, CO + add T2, CO, LDC + + + add T1, T2, LDC + add T2, T1, LDC + + + stxsdx vs33, o0, T1 + xxswapd vs33, vs33 + stxsdx vs35, o8, T1 + xxswapd vs35, vs35 + stxsdx vs37, o16, T1 + xxswapd vs37, vs37 + stxsdx vs39, o24, T1 + xxswapd vs39, vs39 + + addi T1, T1, 32 + + stxsdx vs41, o0, T1 + xxswapd vs41, vs41 + stxsdx 
vs43, o8, T1 + xxswapd vs43, vs43 + stxsdx vs45, o16, T1 + xxswapd vs45, vs45 + stxsdx vs47, o24, T1 + xxswapd vs47, vs47 + + stxsdx vs33, o0, T2 + stxsdx vs35, o8, T2 + stxsdx vs37, o16, T2 + stxsdx vs39, o24, T2 + + addi T2, T2, 32 + + stxsdx vs41, o0, T2 + stxsdx vs43, o8, T2 + stxsdx vs45, o16, T2 + stxsdx vs47, o24, T2 + +.endm + + +/*########################################################################################## + SOLVE_LT 4x4 +##########################################################################################*/ + +.macro SOLVE_LT_4x4 + + xxpermdi vs0, vs32, vs33, 0 + xxpermdi vs1, vs34, vs35, 0 + xxpermdi vs2, vs32, vs33, 3 + xxpermdi vs3, vs34, vs35, 3 + + xxpermdi vs4, vs36, vs37, 0 + xxpermdi vs5, vs38, vs39, 0 + xxpermdi vs6, vs36, vs37, 3 + xxpermdi vs7, vs38, vs39, 3 + + +//############### LOAD B ####################### + + + mr T1, BO + + lxvd2x vs32, o0, T1 + lxvd2x vs33, o16, T1 + lxvd2x vs34, o32, T1 + lxvd2x vs35, o48, T1 + + addi T1, T1, 64 + + lxvd2x vs36, o0, T1 + lxvd2x vs37, o16, T1 + lxvd2x vs38, o32, T1 + lxvd2x vs39, o48, T1 + + xvsubdp vs32, vs32, vs0 + xvsubdp vs33, vs33, vs1 + xvsubdp vs34, vs34, vs2 + xvsubdp vs35, vs35, vs3 + xvsubdp vs36, vs36, vs4 + xvsubdp vs37, vs37, vs5 + xvsubdp vs38, vs38, vs6 + xvsubdp vs39, vs39, vs7 + + mr T1, AO + + +//############### OFFSET 0 ####################### + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + xvmuldp vs32, vs32, vs0 + xvmuldp vs33, vs33, vs0 + + xvnmsubadp vs34, vs32, vs1 + xvnmsubadp vs35, vs33, vs1 + xvnmsubadp vs36, vs32, vs2 + xvnmsubadp vs37, vs33, vs2 + xvnmsubadp vs38, vs32, vs3 + xvnmsubadp vs39, vs33, vs3 + +//############### OFFSET 1 ####################### + + addi T1, T1, 1*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + + addi T1, T1, 24 + + xvmuldp vs34, vs34, vs0 + xvmuldp vs35, vs35, vs0 + + xvnmsubadp vs36, vs34, vs1 + xvnmsubadp vs37, vs35, vs1 + xvnmsubadp vs38, vs34, vs2 + xvnmsubadp vs39, vs35, vs2 + +//############### OFFSET 2 ####################### + + addi T1, T1, 2*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + + addi T1, T1, 16 + + xvmuldp vs36, vs36, vs0 + xvmuldp vs37, vs37, vs0 + + xvnmsubadp vs38, vs36, vs1 + xvnmsubadp vs39, vs37, vs1 + +//############### OFFSET 3 ####################### + + addi T1, T1, 3*SIZE + + lxvdsx vs0, o0, T1 + + addi T1, T1, 8 + + xvmuldp vs38, vs38, vs0 + xvmuldp vs39, vs39, vs0 + + +//############### SAVE B ####################### + + + mr T1, BO + + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 + +//############### SAVE C ####################### + + + mr T1, CO + add T2, CO, LDC + + + stxsdx vs32, o0, T1 + xxswapd vs32, vs32 + stxsdx vs34, o8, T1 + xxswapd vs34, vs34 + stxsdx vs36, o16, T1 + xxswapd vs36, vs36 + stxsdx vs38, o24, T1 + xxswapd vs38, vs38 + + stxsdx vs32, o0, T2 + stxsdx vs34, o8, T2 + stxsdx vs36, o16, T2 + stxsdx vs38, o24, T2 + + mr T1, CO + add T2, CO, LDC + + + add T1, T2, LDC + add T2, T1, LDC + + + stxsdx vs33, o0, T1 + xxswapd vs33, vs33 + stxsdx vs35, o8, T1 + xxswapd vs35, vs35 + stxsdx vs37, o16, T1 + xxswapd vs37, vs37 + stxsdx vs39, o24, T1 + xxswapd vs39, vs39 + + stxsdx vs33, o0, T2 + stxsdx vs35, o8, T2 + stxsdx vs37, o16, T2 + stxsdx vs39, o24, T2 + +.endm + + 
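+/*********************************************************************
+* Note on the SOLVE_LT_Mx4 macros:
+*   - the leading xxpermdi pairs transpose the GEMM accumulators so
+*     each vector holds two adjacent columns of one row, matching the
+*     packed B layout,
+*   - the packed B panel is loaded and the accumulated product is
+*     subtracted from it (xvsubdp),
+*   - forward substitution then walks the packed A panel one row per
+*     "OFFSET n" block: the pivot row is scaled with xvmuldp (the
+*     diagonal is stored pre-inverted by the trsm packing, so no
+*     divide is needed here) and removed from the remaining rows with
+*     xvnmsubadp,
+*   - the solved panel is stored back to BO and scattered into the
+*     four columns of C with stxsdx/xxswapd.
+*********************************************************************/
+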
+/*########################################################################################## + SOLVE_LT 2x4 +##########################################################################################*/ + +.macro SOLVE_LT_2x4 + + xxpermdi vs0, vs32, vs33, 0 + xxpermdi vs1, vs34, vs35, 0 + xxpermdi vs2, vs32, vs33, 3 + xxpermdi vs3, vs34, vs35, 3 + + +//############### LOAD B ####################### + + + mr T1, BO + + lxvd2x vs32, o0, T1 + lxvd2x vs33, o16, T1 + lxvd2x vs34, o32, T1 + lxvd2x vs35, o48, T1 + + xvsubdp vs32, vs32, vs0 + xvsubdp vs33, vs33, vs1 + xvsubdp vs34, vs34, vs2 + xvsubdp vs35, vs35, vs3 + + mr T1, AO + + +//############### OFFSET 0 ####################### + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + + addi T1, T1, 16 + + xvmuldp vs32, vs32, vs0 + xvmuldp vs33, vs33, vs0 + + xvnmsubadp vs34, vs32, vs1 + xvnmsubadp vs35, vs33, vs1 + +//############### OFFSET 1 ####################### + + addi T1, T1, 1*SIZE + + lxvdsx vs0, o0, T1 + + addi T1, T1, 8 + + xvmuldp vs34, vs34, vs0 + xvmuldp vs35, vs35, vs0 + + +//############### SAVE B ####################### + + + mr T1, BO + + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 + +//############### SAVE C ####################### + + + mr T1, CO + add T2, CO, LDC + + + stxsdx vs32, o0, T1 + xxswapd vs32, vs32 + stxsdx vs34, o8, T1 + xxswapd vs34, vs34 + + stxsdx vs32, o0, T2 + stxsdx vs34, o8, T2 + + mr T1, CO + add T2, CO, LDC + + + add T1, T2, LDC + add T2, T1, LDC + + + stxsdx vs33, o0, T1 + xxswapd vs33, vs33 + stxsdx vs35, o8, T1 + xxswapd vs35, vs35 + + stxsdx vs33, o0, T2 + stxsdx vs35, o8, T2 + +.endm + + +/*########################################################################################## + SOLVE_LT 1x4 +##########################################################################################*/ + +.macro SOLVE_LT_1x4 + + xxpermdi vs0, vs32, vs33, 0 + xxpermdi vs1, vs34, vs35, 0 + +//############### LOAD B ####################### + + + mr T1, BO + + lxvd2x vs32, o0, T1 + lxvd2x vs33, o16, T1 + + xvsubdp vs32, vs32, vs0 + xvsubdp vs33, vs33, vs1 + + mr T1, AO + + +//############### OFFSET 0 ####################### + + lxvdsx vs0, o0, T1 + + addi T1, T1, 8 + + xvmuldp vs32, vs32, vs0 + xvmuldp vs33, vs33, vs0 + + +//############### SAVE B ####################### + + + mr T1, BO + + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + +//############### SAVE C ####################### + + + mr T1, CO + add T2, CO, LDC + + + stxsdx vs32, o0, T1 + xxswapd vs32, vs32 + + stxsdx vs32, o0, T2 + + mr T1, CO + add T2, CO, LDC + + + add T1, T2, LDC + add T2, T1, LDC + + + stxsdx vs33, o0, T1 + xxswapd vs33, vs33 + + stxsdx vs33, o0, T2 + +.endm + + +.macro INIT_16x2 + + + xxlxor vs0, vs0, vs0 + + xvmovdp vs32, vs0 + xvmovdp vs33, vs0 + xvmovdp vs34, vs0 + xvmovdp vs35, vs0 + xvmovdp vs36, vs0 + xvmovdp vs37, vs0 + xvmovdp vs38, vs0 + xvmovdp vs39, vs0 + xvmovdp vs40, vs0 + xvmovdp vs41, vs0 + xvmovdp vs42, vs0 + xvmovdp vs43, vs0 + xvmovdp vs44, vs0 + xvmovdp vs45, vs0 + xvmovdp vs46, vs0 + xvmovdp vs47, vs0 + +.endm + + +.macro KERNEL_16x2 + + + lxvd2x vs0, o0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO + lxvdsx vs17, o8, BO + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs37, 
vs2, vs17 + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs39, vs3, vs17 + xvmaddadp vs40, vs4, vs16 + xvmaddadp vs41, vs4, vs17 + xvmaddadp vs42, vs5, vs16 + xvmaddadp vs43, vs5, vs17 + xvmaddadp vs44, vs6, vs16 + xvmaddadp vs45, vs6, vs17 + xvmaddadp vs46, vs7, vs16 + xvmaddadp vs47, vs7, vs17 + + +.endm + + +.macro INIT_8x2 + + + xxlxor vs0, vs0, vs0 + + xvmovdp vs32, vs0 + xvmovdp vs33, vs0 + xvmovdp vs34, vs0 + xvmovdp vs35, vs0 + xvmovdp vs36, vs0 + xvmovdp vs37, vs0 + xvmovdp vs38, vs0 + xvmovdp vs39, vs0 + +.endm + + +.macro KERNEL_8x2 + + + lxvd2x vs0, o0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO + lxvdsx vs17, o8, BO + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs37, vs2, vs17 + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs39, vs3, vs17 + + +.endm + + +.macro INIT_4x2 + + + xxlxor vs0, vs0, vs0 + + xvmovdp vs32, vs0 + xvmovdp vs33, vs0 + xvmovdp vs34, vs0 + xvmovdp vs35, vs0 + +.endm + + +.macro KERNEL_4x2 + + + lxvd2x vs0, o0, AO + lxvd2x vs1, o16, AO + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO + lxvdsx vs17, o8, BO + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + + +.endm + + +.macro INIT_2x2 + + + xxlxor vs0, vs0, vs0 + + xvmovdp vs32, vs0 + xvmovdp vs33, vs0 + +.endm + + +.macro KERNEL_2x2 + + + lxvd2x vs0, o0, AO + + addi AO, AO, 16 + + lxvdsx vs16, o0, BO + lxvdsx vs17, o8, BO + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + + +.endm + + +.macro INIT_1x2 + + + xxlxor vs0, vs0, vs0 + + xvmovdp vs32, vs0 + xvmovdp vs33, vs0 + +.endm + + +.macro KERNEL_1x2 + + + lxvdsx vs0, o0, AO + + addi AO, AO, 8 + + lxvdsx vs16, o0, BO + lxvdsx vs17, o8, BO + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + + +.endm + + +/*########################################################################################## + SOLVE_LT 16x2 +##########################################################################################*/ + +.macro SOLVE_LT_16x2 + + xxpermdi vs0, vs32, vs33, 0 + xxpermdi vs1, vs32, vs33, 3 + + xxpermdi vs2, vs34, vs35, 0 + xxpermdi vs3, vs34, vs35, 3 + + xxpermdi vs4, vs36, vs37, 0 + xxpermdi vs5, vs36, vs37, 3 + + xxpermdi vs6, vs38, vs39, 0 + xxpermdi vs7, vs38, vs39, 3 + + xxpermdi vs8, vs40, vs41, 0 + xxpermdi vs9, vs40, vs41, 3 + + xxpermdi vs10, vs42, vs43, 0 + xxpermdi vs11, vs42, vs43, 3 + + xxpermdi vs12, vs44, vs45, 0 + xxpermdi vs13, vs44, vs45, 3 + + xxpermdi vs14, vs46, vs47, 0 + xxpermdi vs15, vs46, vs47, 3 + + +//############### LOAD B ####################### + + + mr T1, BO + + lxvd2x vs32, o0, T1 + lxvd2x vs33, o16, T1 + lxvd2x vs34, o32, T1 + lxvd2x vs35, o48, T1 + + addi T1, T1, 64 + + lxvd2x vs36, o0, T1 + lxvd2x vs37, o16, T1 + lxvd2x vs38, o32, T1 + lxvd2x vs39, o48, T1 + + addi T1, T1, 64 + + lxvd2x vs40, o0, T1 + lxvd2x vs41, o16, T1 + lxvd2x vs42, o32, T1 + lxvd2x vs43, o48, T1 + + addi T1, T1, 64 + + lxvd2x vs44, o0, T1 + lxvd2x vs45, o16, T1 + lxvd2x vs46, o32, T1 + lxvd2x vs47, o48, T1 + + xvsubdp vs32, vs32, vs0 + xvsubdp vs33, vs33, vs1 + xvsubdp vs34, vs34, vs2 + xvsubdp vs35, vs35, vs3 + xvsubdp vs36, vs36, vs4 + xvsubdp vs37, vs37, vs5 + xvsubdp vs38, vs38, vs6 + xvsubdp vs39, vs39, vs7 + xvsubdp vs40, vs40, vs8 + xvsubdp vs41, vs41, vs9 + xvsubdp vs42, vs42, vs10 + xvsubdp vs43, vs43, vs11 + xvsubdp vs44, vs44, 
vs12 + xvsubdp vs45, vs45, vs13 + xvsubdp vs46, vs46, vs14 + xvsubdp vs47, vs47, vs15 + + mr T1, AO + + +//############### OFFSET 0 ####################### + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + lxvdsx vs6, o16, T1 + lxvdsx vs7, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs8, o0, T1 + lxvdsx vs9, o8, T1 + lxvdsx vs10, o16, T1 + lxvdsx vs11, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs12, o0, T1 + lxvdsx vs13, o8, T1 + lxvdsx vs14, o16, T1 + lxvdsx vs15, o24, T1 + + addi T1, T1, 32 + + xvmuldp vs32, vs32, vs0 + xvnmsubadp vs33, vs32, vs1 + xvnmsubadp vs34, vs32, vs2 + xvnmsubadp vs35, vs32, vs3 + xvnmsubadp vs36, vs32, vs4 + xvnmsubadp vs37, vs32, vs5 + xvnmsubadp vs38, vs32, vs6 + xvnmsubadp vs39, vs32, vs7 + xvnmsubadp vs40, vs32, vs8 + xvnmsubadp vs41, vs32, vs9 + xvnmsubadp vs42, vs32, vs10 + xvnmsubadp vs43, vs32, vs11 + xvnmsubadp vs44, vs32, vs12 + xvnmsubadp vs45, vs32, vs13 + xvnmsubadp vs46, vs32, vs14 + xvnmsubadp vs47, vs32, vs15 + +//############### OFFSET 1 ####################### + + addi T1, T1, 1*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + lxvdsx vs6, o16, T1 + lxvdsx vs7, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs8, o0, T1 + lxvdsx vs9, o8, T1 + lxvdsx vs10, o16, T1 + lxvdsx vs11, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs12, o0, T1 + lxvdsx vs13, o8, T1 + lxvdsx vs14, o16, T1 + + addi T1, T1, 24 + + xvmuldp vs33, vs33, vs0 + xvnmsubadp vs34, vs33, vs1 + xvnmsubadp vs35, vs33, vs2 + xvnmsubadp vs36, vs33, vs3 + xvnmsubadp vs37, vs33, vs4 + xvnmsubadp vs38, vs33, vs5 + xvnmsubadp vs39, vs33, vs6 + xvnmsubadp vs40, vs33, vs7 + xvnmsubadp vs41, vs33, vs8 + xvnmsubadp vs42, vs33, vs9 + xvnmsubadp vs43, vs33, vs10 + xvnmsubadp vs44, vs33, vs11 + xvnmsubadp vs45, vs33, vs12 + xvnmsubadp vs46, vs33, vs13 + xvnmsubadp vs47, vs33, vs14 + +//############### OFFSET 2 ####################### + + addi T1, T1, 2*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + lxvdsx vs6, o16, T1 + lxvdsx vs7, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs8, o0, T1 + lxvdsx vs9, o8, T1 + lxvdsx vs10, o16, T1 + lxvdsx vs11, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs12, o0, T1 + lxvdsx vs13, o8, T1 + + addi T1, T1, 16 + + xvmuldp vs34, vs34, vs0 + xvnmsubadp vs35, vs34, vs1 + xvnmsubadp vs36, vs34, vs2 + xvnmsubadp vs37, vs34, vs3 + xvnmsubadp vs38, vs34, vs4 + xvnmsubadp vs39, vs34, vs5 + xvnmsubadp vs40, vs34, vs6 + xvnmsubadp vs41, vs34, vs7 + xvnmsubadp vs42, vs34, vs8 + xvnmsubadp vs43, vs34, vs9 + xvnmsubadp vs44, vs34, vs10 + xvnmsubadp vs45, vs34, vs11 + xvnmsubadp vs46, vs34, vs12 + xvnmsubadp vs47, vs34, vs13 + +//############### OFFSET 3 ####################### + + addi T1, T1, 3*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + lxvdsx vs6, o16, T1 + lxvdsx vs7, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs8, o0, T1 + lxvdsx vs9, o8, T1 + lxvdsx vs10, o16, T1 + lxvdsx vs11, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs12, o0, T1 + + addi T1, T1, 8 + + xvmuldp vs35, vs35, vs0 + xvnmsubadp vs36, vs35, vs1 + xvnmsubadp vs37, vs35, vs2 + xvnmsubadp vs38, vs35, vs3 + xvnmsubadp vs39, vs35, vs4 + xvnmsubadp vs40, vs35, vs5 + xvnmsubadp vs41, vs35, vs6 + xvnmsubadp vs42, vs35, vs7 + 
xvnmsubadp vs43, vs35, vs8 + xvnmsubadp vs44, vs35, vs9 + xvnmsubadp vs45, vs35, vs10 + xvnmsubadp vs46, vs35, vs11 + xvnmsubadp vs47, vs35, vs12 + +//############### OFFSET 4 ####################### + + addi T1, T1, 4*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + lxvdsx vs6, o16, T1 + lxvdsx vs7, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs8, o0, T1 + lxvdsx vs9, o8, T1 + lxvdsx vs10, o16, T1 + lxvdsx vs11, o24, T1 + + addi T1, T1, 32 + + xvmuldp vs36, vs36, vs0 + xvnmsubadp vs37, vs36, vs1 + xvnmsubadp vs38, vs36, vs2 + xvnmsubadp vs39, vs36, vs3 + xvnmsubadp vs40, vs36, vs4 + xvnmsubadp vs41, vs36, vs5 + xvnmsubadp vs42, vs36, vs6 + xvnmsubadp vs43, vs36, vs7 + xvnmsubadp vs44, vs36, vs8 + xvnmsubadp vs45, vs36, vs9 + xvnmsubadp vs46, vs36, vs10 + xvnmsubadp vs47, vs36, vs11 + +//############### OFFSET 5 ####################### + + addi T1, T1, 5*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + lxvdsx vs6, o16, T1 + lxvdsx vs7, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs8, o0, T1 + lxvdsx vs9, o8, T1 + lxvdsx vs10, o16, T1 + + addi T1, T1, 24 + + xvmuldp vs37, vs37, vs0 + xvnmsubadp vs38, vs37, vs1 + xvnmsubadp vs39, vs37, vs2 + xvnmsubadp vs40, vs37, vs3 + xvnmsubadp vs41, vs37, vs4 + xvnmsubadp vs42, vs37, vs5 + xvnmsubadp vs43, vs37, vs6 + xvnmsubadp vs44, vs37, vs7 + xvnmsubadp vs45, vs37, vs8 + xvnmsubadp vs46, vs37, vs9 + xvnmsubadp vs47, vs37, vs10 + +//############### OFFSET 6 ####################### + + addi T1, T1, 6*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + lxvdsx vs6, o16, T1 + lxvdsx vs7, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs8, o0, T1 + lxvdsx vs9, o8, T1 + + addi T1, T1, 16 + + xvmuldp vs38, vs38, vs0 + xvnmsubadp vs39, vs38, vs1 + xvnmsubadp vs40, vs38, vs2 + xvnmsubadp vs41, vs38, vs3 + xvnmsubadp vs42, vs38, vs4 + xvnmsubadp vs43, vs38, vs5 + xvnmsubadp vs44, vs38, vs6 + xvnmsubadp vs45, vs38, vs7 + xvnmsubadp vs46, vs38, vs8 + xvnmsubadp vs47, vs38, vs9 + +//############### OFFSET 7 ####################### + + addi T1, T1, 7*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + lxvdsx vs6, o16, T1 + lxvdsx vs7, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs8, o0, T1 + + addi T1, T1, 8 + + xvmuldp vs39, vs39, vs0 + xvnmsubadp vs40, vs39, vs1 + xvnmsubadp vs41, vs39, vs2 + xvnmsubadp vs42, vs39, vs3 + xvnmsubadp vs43, vs39, vs4 + xvnmsubadp vs44, vs39, vs5 + xvnmsubadp vs45, vs39, vs6 + xvnmsubadp vs46, vs39, vs7 + xvnmsubadp vs47, vs39, vs8 + +//############### OFFSET 8 ####################### + + addi T1, T1, 8*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + lxvdsx vs6, o16, T1 + lxvdsx vs7, o24, T1 + + addi T1, T1, 32 + + xvmuldp vs40, vs40, vs0 + xvnmsubadp vs41, vs40, vs1 + xvnmsubadp vs42, vs40, vs2 + xvnmsubadp vs43, vs40, vs3 + xvnmsubadp vs44, vs40, vs4 + xvnmsubadp vs45, vs40, vs5 + xvnmsubadp vs46, vs40, vs6 + xvnmsubadp vs47, vs40, vs7 + +//############### OFFSET 9 ####################### + + addi T1, T1, 9*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + 
+ lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + lxvdsx vs6, o16, T1 + + addi T1, T1, 24 + + xvmuldp vs41, vs41, vs0 + xvnmsubadp vs42, vs41, vs1 + xvnmsubadp vs43, vs41, vs2 + xvnmsubadp vs44, vs41, vs3 + xvnmsubadp vs45, vs41, vs4 + xvnmsubadp vs46, vs41, vs5 + xvnmsubadp vs47, vs41, vs6 + +//############### OFFSET 10 ####################### + + addi T1, T1, 10*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + + addi T1, T1, 16 + + xvmuldp vs42, vs42, vs0 + xvnmsubadp vs43, vs42, vs1 + xvnmsubadp vs44, vs42, vs2 + xvnmsubadp vs45, vs42, vs3 + xvnmsubadp vs46, vs42, vs4 + xvnmsubadp vs47, vs42, vs5 + +//############### OFFSET 11 ####################### + + addi T1, T1, 11*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs4, o0, T1 + + addi T1, T1, 8 + + xvmuldp vs43, vs43, vs0 + xvnmsubadp vs44, vs43, vs1 + xvnmsubadp vs45, vs43, vs2 + xvnmsubadp vs46, vs43, vs3 + xvnmsubadp vs47, vs43, vs4 + +//############### OFFSET 12 ####################### + + addi T1, T1, 12*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + xvmuldp vs44, vs44, vs0 + xvnmsubadp vs45, vs44, vs1 + xvnmsubadp vs46, vs44, vs2 + xvnmsubadp vs47, vs44, vs3 + +//############### OFFSET 13 ####################### + + addi T1, T1, 13*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + + addi T1, T1, 24 + + xvmuldp vs45, vs45, vs0 + xvnmsubadp vs46, vs45, vs1 + xvnmsubadp vs47, vs45, vs2 + +//############### OFFSET 14 ####################### + + addi T1, T1, 14*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + + addi T1, T1, 16 + + xvmuldp vs46, vs46, vs0 + xvnmsubadp vs47, vs46, vs1 + +//############### OFFSET 15 ####################### + + addi T1, T1, 15*SIZE + + lxvdsx vs0, o0, T1 + + addi T1, T1, 8 + + xvmuldp vs47, vs47, vs0 + +//############### SAVE B ####################### + + + mr T1, BO + + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs40, o0, T1 + stxvd2x vs41, o16, T1 + stxvd2x vs42, o32, T1 + stxvd2x vs43, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs44, o0, T1 + stxvd2x vs45, o16, T1 + stxvd2x vs46, o32, T1 + stxvd2x vs47, o48, T1 + +//############### SAVE C ####################### + + + mr T1, CO + add T2, CO, LDC + + + stxsdx vs32, o0, T1 + xxswapd vs32, vs32 + stxsdx vs33, o8, T1 + xxswapd vs33, vs33 + stxsdx vs34, o16, T1 + xxswapd vs34, vs34 + stxsdx vs35, o24, T1 + xxswapd vs35, vs35 + + addi T1, T1, 32 + + stxsdx vs36, o0, T1 + xxswapd vs36, vs36 + stxsdx vs37, o8, T1 + xxswapd vs37, vs37 + stxsdx vs38, o16, T1 + xxswapd vs38, vs38 + stxsdx vs39, o24, T1 + xxswapd vs39, vs39 + + addi T1, T1, 32 + + stxsdx vs40, o0, T1 + xxswapd vs40, vs40 + stxsdx vs41, o8, T1 + xxswapd vs41, vs41 + stxsdx vs42, o16, T1 + xxswapd vs42, vs42 + stxsdx vs43, o24, T1 + xxswapd vs43, vs43 + + addi T1, T1, 32 + + stxsdx vs44, o0, T1 + xxswapd vs44, vs44 + stxsdx vs45, o8, T1 + xxswapd vs45, vs45 + stxsdx vs46, o16, T1 + xxswapd vs46, vs46 + stxsdx vs47, o24, T1 + xxswapd vs47, vs47 + + stxsdx vs32, o0, T2 + stxsdx vs33, o8, T2 + stxsdx vs34, o16, T2 + stxsdx vs35, o24, T2 + + addi T2, T2, 32 + + stxsdx vs36, o0, T2 + stxsdx vs37, o8, T2 + stxsdx vs38, o16, T2 + stxsdx 
vs39, o24, T2 + + addi T2, T2, 32 + + stxsdx vs40, o0, T2 + stxsdx vs41, o8, T2 + stxsdx vs42, o16, T2 + stxsdx vs43, o24, T2 + + addi T2, T2, 32 + + stxsdx vs44, o0, T2 + stxsdx vs45, o8, T2 + stxsdx vs46, o16, T2 + stxsdx vs47, o24, T2 + +.endm + + +/*########################################################################################## + SOLVE_LT 8x2 +##########################################################################################*/ + +.macro SOLVE_LT_8x2 + + xxpermdi vs0, vs32, vs33, 0 + xxpermdi vs1, vs32, vs33, 3 + + xxpermdi vs2, vs34, vs35, 0 + xxpermdi vs3, vs34, vs35, 3 + + xxpermdi vs4, vs36, vs37, 0 + xxpermdi vs5, vs36, vs37, 3 + + xxpermdi vs6, vs38, vs39, 0 + xxpermdi vs7, vs38, vs39, 3 + + +//############### LOAD B ####################### + + + mr T1, BO + + lxvd2x vs32, o0, T1 + lxvd2x vs33, o16, T1 + lxvd2x vs34, o32, T1 + lxvd2x vs35, o48, T1 + + addi T1, T1, 64 + + lxvd2x vs36, o0, T1 + lxvd2x vs37, o16, T1 + lxvd2x vs38, o32, T1 + lxvd2x vs39, o48, T1 + + xvsubdp vs32, vs32, vs0 + xvsubdp vs33, vs33, vs1 + xvsubdp vs34, vs34, vs2 + xvsubdp vs35, vs35, vs3 + xvsubdp vs36, vs36, vs4 + xvsubdp vs37, vs37, vs5 + xvsubdp vs38, vs38, vs6 + xvsubdp vs39, vs39, vs7 + + mr T1, AO + + +//############### OFFSET 0 ####################### + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + lxvdsx vs6, o16, T1 + lxvdsx vs7, o24, T1 + + addi T1, T1, 32 + + xvmuldp vs32, vs32, vs0 + xvnmsubadp vs33, vs32, vs1 + xvnmsubadp vs34, vs32, vs2 + xvnmsubadp vs35, vs32, vs3 + xvnmsubadp vs36, vs32, vs4 + xvnmsubadp vs37, vs32, vs5 + xvnmsubadp vs38, vs32, vs6 + xvnmsubadp vs39, vs32, vs7 + +//############### OFFSET 1 ####################### + + addi T1, T1, 1*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + lxvdsx vs6, o16, T1 + + addi T1, T1, 24 + + xvmuldp vs33, vs33, vs0 + xvnmsubadp vs34, vs33, vs1 + xvnmsubadp vs35, vs33, vs2 + xvnmsubadp vs36, vs33, vs3 + xvnmsubadp vs37, vs33, vs4 + xvnmsubadp vs38, vs33, vs5 + xvnmsubadp vs39, vs33, vs6 + +//############### OFFSET 2 ####################### + + addi T1, T1, 2*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + + addi T1, T1, 16 + + xvmuldp vs34, vs34, vs0 + xvnmsubadp vs35, vs34, vs1 + xvnmsubadp vs36, vs34, vs2 + xvnmsubadp vs37, vs34, vs3 + xvnmsubadp vs38, vs34, vs4 + xvnmsubadp vs39, vs34, vs5 + +//############### OFFSET 3 ####################### + + addi T1, T1, 3*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs4, o0, T1 + + addi T1, T1, 8 + + xvmuldp vs35, vs35, vs0 + xvnmsubadp vs36, vs35, vs1 + xvnmsubadp vs37, vs35, vs2 + xvnmsubadp vs38, vs35, vs3 + xvnmsubadp vs39, vs35, vs4 + +//############### OFFSET 4 ####################### + + addi T1, T1, 4*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + xvmuldp vs36, vs36, vs0 + xvnmsubadp vs37, vs36, vs1 + xvnmsubadp vs38, vs36, vs2 + xvnmsubadp vs39, vs36, vs3 + +//############### OFFSET 5 ####################### + + addi T1, T1, 5*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + + addi T1, T1, 24 + + xvmuldp vs37, vs37, vs0 + xvnmsubadp vs38, vs37, vs1 + xvnmsubadp vs39, vs37, vs2 + 
+//############### OFFSET 6 ####################### + + addi T1, T1, 6*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + + addi T1, T1, 16 + + xvmuldp vs38, vs38, vs0 + xvnmsubadp vs39, vs38, vs1 + +//############### OFFSET 7 ####################### + + addi T1, T1, 7*SIZE + + lxvdsx vs0, o0, T1 + + addi T1, T1, 8 + + xvmuldp vs39, vs39, vs0 + +//############### SAVE B ####################### + + + mr T1, BO + + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 + +//############### SAVE C ####################### + + + mr T1, CO + add T2, CO, LDC + + + stxsdx vs32, o0, T1 + xxswapd vs32, vs32 + stxsdx vs33, o8, T1 + xxswapd vs33, vs33 + stxsdx vs34, o16, T1 + xxswapd vs34, vs34 + stxsdx vs35, o24, T1 + xxswapd vs35, vs35 + + addi T1, T1, 32 + + stxsdx vs36, o0, T1 + xxswapd vs36, vs36 + stxsdx vs37, o8, T1 + xxswapd vs37, vs37 + stxsdx vs38, o16, T1 + xxswapd vs38, vs38 + stxsdx vs39, o24, T1 + xxswapd vs39, vs39 + + stxsdx vs32, o0, T2 + stxsdx vs33, o8, T2 + stxsdx vs34, o16, T2 + stxsdx vs35, o24, T2 + + addi T2, T2, 32 + + stxsdx vs36, o0, T2 + stxsdx vs37, o8, T2 + stxsdx vs38, o16, T2 + stxsdx vs39, o24, T2 + +.endm + + +/*########################################################################################## + SOLVE_LT 4x2 +##########################################################################################*/ + +.macro SOLVE_LT_4x2 + + xxpermdi vs0, vs32, vs33, 0 + xxpermdi vs1, vs32, vs33, 3 + + xxpermdi vs2, vs34, vs35, 0 + xxpermdi vs3, vs34, vs35, 3 + + +//############### LOAD B ####################### + + + mr T1, BO + + lxvd2x vs32, o0, T1 + lxvd2x vs33, o16, T1 + lxvd2x vs34, o32, T1 + lxvd2x vs35, o48, T1 + + xvsubdp vs32, vs32, vs0 + xvsubdp vs33, vs33, vs1 + xvsubdp vs34, vs34, vs2 + xvsubdp vs35, vs35, vs3 + + mr T1, AO + + +//############### OFFSET 0 ####################### + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + xvmuldp vs32, vs32, vs0 + xvnmsubadp vs33, vs32, vs1 + xvnmsubadp vs34, vs32, vs2 + xvnmsubadp vs35, vs32, vs3 + +//############### OFFSET 1 ####################### + + addi T1, T1, 1*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + + addi T1, T1, 24 + + xvmuldp vs33, vs33, vs0 + xvnmsubadp vs34, vs33, vs1 + xvnmsubadp vs35, vs33, vs2 + +//############### OFFSET 2 ####################### + + addi T1, T1, 2*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + + addi T1, T1, 16 + + xvmuldp vs34, vs34, vs0 + xvnmsubadp vs35, vs34, vs1 + +//############### OFFSET 3 ####################### + + addi T1, T1, 3*SIZE + + lxvdsx vs0, o0, T1 + + addi T1, T1, 8 + + xvmuldp vs35, vs35, vs0 + +//############### SAVE B ####################### + + + mr T1, BO + + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 + +//############### SAVE C ####################### + + + mr T1, CO + add T2, CO, LDC + + + stxsdx vs32, o0, T1 + xxswapd vs32, vs32 + stxsdx vs33, o8, T1 + xxswapd vs33, vs33 + stxsdx vs34, o16, T1 + xxswapd vs34, vs34 + stxsdx vs35, o24, T1 + xxswapd vs35, vs35 + + stxsdx vs32, o0, T2 + stxsdx vs33, o8, T2 + stxsdx vs34, o16, T2 + stxsdx vs35, o24, T2 + +.endm + + +/*########################################################################################## + SOLVE_LT 2x2 +##########################################################################################*/ 
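+
+// Editorial note: the SOLVE_LT_* macros in this file (this 2x2 case and the wider ones
+// above) all follow the same pattern: each row i of the current B block is scaled by the
+// packed pivot entry (apparently the pre-inverted diagonal, as in the usual OpenBLAS trsm
+// packing) and then eliminated from the rows that follow it, i.e. a forward-substitution
+// sweep over the packed panel. The result is written back to the packed B (SAVE B) and
+// scattered to C (SAVE C). As an illustrative C sketch only (m rows, n columns; names are
+// not taken from this file; the packed A panel appears to hold m entries per row, with
+// entry i being the pivot):
+//
+//   for (i = 0; i < m; i++) {
+//       for (j = 0; j < n; j++)
+//           b[i][j] *= a[i*m + i];                 /* xvmuldp / xsmuldp       */
+//       for (k = i + 1; k < m; k++)
+//           for (j = 0; j < n; j++)
+//               b[k][j] -= a[i*m + k] * b[i][j];   /* xvnmsubadp / xsnmsubadp */
+//   }
+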
+ +.macro SOLVE_LT_2x2 + + xxpermdi vs0, vs32, vs33, 0 + xxpermdi vs1, vs32, vs33, 3 + + +//############### LOAD B ####################### + + + mr T1, BO + + lxvd2x vs32, o0, T1 + lxvd2x vs33, o16, T1 + + xvsubdp vs32, vs32, vs0 + xvsubdp vs33, vs33, vs1 + + mr T1, AO + + +//############### OFFSET 0 ####################### + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + + addi T1, T1, 16 + + xvmuldp vs32, vs32, vs0 + xvnmsubadp vs33, vs32, vs1 + +//############### OFFSET 1 ####################### + + addi T1, T1, 1*SIZE + + lxvdsx vs0, o0, T1 + + addi T1, T1, 8 + + xvmuldp vs33, vs33, vs0 + +//############### SAVE B ####################### + + + mr T1, BO + + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + +//############### SAVE C ####################### + + + mr T1, CO + add T2, CO, LDC + + + stxsdx vs32, o0, T1 + xxswapd vs32, vs32 + stxsdx vs33, o8, T1 + xxswapd vs33, vs33 + + stxsdx vs32, o0, T2 + stxsdx vs33, o8, T2 + +.endm + + +/*########################################################################################## + SOLVE_LT 1x2 +##########################################################################################*/ + +.macro SOLVE_LT_1x2 + + xxpermdi vs0, vs32, vs33, 0 + +//############### LOAD B ####################### + + + mr T1, BO + + lxvd2x vs32, o0, T1 + + xvsubdp vs32, vs32, vs0 + + mr T1, AO + + +//############### OFFSET 0 ####################### + + lxvdsx vs0, o0, T1 + + addi T1, T1, 8 + + xvmuldp vs32, vs32, vs0 + +//############### SAVE B ####################### + + + mr T1, BO + + + stxvd2x vs32, o0, T1 + +//############### SAVE C ####################### + + + mr T1, CO + add T2, CO, LDC + + + stxsdx vs32, o0, T1 + xxswapd vs32, vs32 + + stxsdx vs32, o0, T2 + +.endm + + +.macro INIT_16x1 + + + xxlxor vs0, vs0, vs0 + + xvmovdp vs32, vs0 + xvmovdp vs33, vs0 + xvmovdp vs34, vs0 + xvmovdp vs35, vs0 + xvmovdp vs36, vs0 + xvmovdp vs37, vs0 + xvmovdp vs38, vs0 + xvmovdp vs39, vs0 + xvmovdp vs40, vs0 + xvmovdp vs41, vs0 + xvmovdp vs42, vs0 + xvmovdp vs43, vs0 + xvmovdp vs44, vs0 + xvmovdp vs45, vs0 + xvmovdp vs46, vs0 + xvmovdp vs47, vs0 + +.endm + + +.macro KERNEL_16x1 + + + lxvdsx vs0, o0, AO + lxvdsx vs1, o8, AO + lxvdsx vs2, o16, AO + lxvdsx vs3, o24, AO + + addi AO, AO, 32 + + lxvdsx vs4, o0, AO + lxvdsx vs5, o8, AO + lxvdsx vs6, o16, AO + lxvdsx vs7, o24, AO + + addi AO, AO, 32 + + lxvdsx vs8, o0, AO + lxvdsx vs9, o8, AO + lxvdsx vs10, o16, AO + lxvdsx vs11, o24, AO + + addi AO, AO, 32 + + lxvdsx vs12, o0, AO + lxvdsx vs13, o8, AO + lxvdsx vs14, o16, AO + lxvdsx vs15, o24, AO + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO + + addi BO, BO, 8 + + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs1, vs16 + xvmaddadp vs34, vs2, vs16 + xvmaddadp vs35, vs3, vs16 + xvmaddadp vs36, vs4, vs16 + xvmaddadp vs37, vs5, vs16 + xvmaddadp vs38, vs6, vs16 + xvmaddadp vs39, vs7, vs16 + xvmaddadp vs40, vs8, vs16 + xvmaddadp vs41, vs9, vs16 + xvmaddadp vs42, vs10, vs16 + xvmaddadp vs43, vs11, vs16 + xvmaddadp vs44, vs12, vs16 + xvmaddadp vs45, vs13, vs16 + xvmaddadp vs46, vs14, vs16 + xvmaddadp vs47, vs15, vs16 + + +.endm + + +.macro INIT_8x1 + + + xxlxor vs0, vs0, vs0 + + xvmovdp vs32, vs0 + xvmovdp vs33, vs0 + xvmovdp vs34, vs0 + xvmovdp vs35, vs0 + xvmovdp vs36, vs0 + xvmovdp vs37, vs0 + xvmovdp vs38, vs0 + xvmovdp vs39, vs0 + +.endm + + +.macro KERNEL_8x1 + + + lxvdsx vs0, o0, AO + lxvdsx vs1, o8, AO + lxvdsx vs2, o16, AO + lxvdsx vs3, o24, AO + + addi AO, AO, 32 + + lxvdsx vs4, o0, AO + lxvdsx vs5, o8, AO + lxvdsx vs6, o16, AO + lxvdsx vs7, o24, AO + + addi AO, AO, 32 + + lxvdsx 
vs16, o0, BO + + addi BO, BO, 8 + + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs1, vs16 + xvmaddadp vs34, vs2, vs16 + xvmaddadp vs35, vs3, vs16 + xvmaddadp vs36, vs4, vs16 + xvmaddadp vs37, vs5, vs16 + xvmaddadp vs38, vs6, vs16 + xvmaddadp vs39, vs7, vs16 + + +.endm + + +.macro INIT_4x1 + + + xxlxor vs0, vs0, vs0 + + xvmovdp vs32, vs0 + xvmovdp vs33, vs0 + xvmovdp vs34, vs0 + xvmovdp vs35, vs0 + +.endm + + +.macro KERNEL_4x1 + + + lxvdsx vs0, o0, AO + lxvdsx vs1, o8, AO + lxvdsx vs2, o16, AO + lxvdsx vs3, o24, AO + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO + + addi BO, BO, 8 + + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs1, vs16 + xvmaddadp vs34, vs2, vs16 + xvmaddadp vs35, vs3, vs16 + + +.endm + + +.macro INIT_2x1 + + + xxlxor vs0, vs0, vs0 + + xvmovdp vs32, vs0 + xvmovdp vs33, vs0 + +.endm + + +.macro KERNEL_2x1 + + + lxvdsx vs0, o0, AO + lxvdsx vs1, o8, AO + + addi AO, AO, 16 + + lxvdsx vs16, o0, BO + + addi BO, BO, 8 + + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs1, vs16 + + +.endm + + +.macro INIT_1x1 + + + xxlxor vs0, vs0, vs0 + + xvmovdp vs32, vs0 + +.endm + + +.macro KERNEL_1x1 + + + lxvdsx vs0, o0, AO + + addi AO, AO, 8 + + lxvdsx vs16, o0, BO + + addi BO, BO, 8 + + xvmaddadp vs32, vs0, vs16 + + +.endm + + +/*########################################################################################## + SOLVE_LT 16x1 +##########################################################################################*/ + +.macro SOLVE_LT_16x1 + + xxswapd vs0, vs32 + xxswapd vs1, vs33 + xxswapd vs2, vs34 + xxswapd vs3, vs35 + xxswapd vs4, vs36 + xxswapd vs5, vs37 + xxswapd vs6, vs38 + xxswapd vs7, vs39 + xxswapd vs8, vs40 + xxswapd vs9, vs41 + xxswapd vs10, vs42 + xxswapd vs11, vs43 + xxswapd vs12, vs44 + xxswapd vs13, vs45 + xxswapd vs14, vs46 + xxswapd vs15, vs47 + +//############### LOAD B ####################### + + + mr T1, BO + + lxsdx vs32, o0, T1 + lxsdx vs33, o8, T1 + lxsdx vs34, o16, T1 + lxsdx vs35, o24, T1 + + addi T1, T1, 32 + + lxsdx vs36, o0, T1 + lxsdx vs37, o8, T1 + lxsdx vs38, o16, T1 + lxsdx vs39, o24, T1 + + addi T1, T1, 32 + + lxsdx vs40, o0, T1 + lxsdx vs41, o8, T1 + lxsdx vs42, o16, T1 + lxsdx vs43, o24, T1 + + addi T1, T1, 32 + + lxsdx vs44, o0, T1 + lxsdx vs45, o8, T1 + lxsdx vs46, o16, T1 + lxsdx vs47, o24, T1 + + xssubdp vs32, vs32, vs0 + xssubdp vs33, vs33, vs1 + xssubdp vs34, vs34, vs2 + xssubdp vs35, vs35, vs3 + xssubdp vs36, vs36, vs4 + xssubdp vs37, vs37, vs5 + xssubdp vs38, vs38, vs6 + xssubdp vs39, vs39, vs7 + xssubdp vs40, vs40, vs8 + xssubdp vs41, vs41, vs9 + xssubdp vs42, vs42, vs10 + xssubdp vs43, vs43, vs11 + xssubdp vs44, vs44, vs12 + xssubdp vs45, vs45, vs13 + xssubdp vs46, vs46, vs14 + xssubdp vs47, vs47, vs15 + + mr T1, AO + + +//############### OFFSET 0 ####################### + + lxsdx vs0, o0, T1 + lxsdx vs1, o8, T1 + lxsdx vs2, o16, T1 + lxsdx vs3, o24, T1 + + addi T1, T1, 32 + + lxsdx vs4, o0, T1 + lxsdx vs5, o8, T1 + lxsdx vs6, o16, T1 + lxsdx vs7, o24, T1 + + addi T1, T1, 32 + + lxsdx vs8, o0, T1 + lxsdx vs9, o8, T1 + lxsdx vs10, o16, T1 + lxsdx vs11, o24, T1 + + addi T1, T1, 32 + + lxsdx vs12, o0, T1 + lxsdx vs13, o8, T1 + lxsdx vs14, o16, T1 + lxsdx vs15, o24, T1 + + addi T1, T1, 32 + + xsmuldp vs32, vs32, vs0 + xsnmsubadp vs33, vs32, vs1 + xsnmsubadp vs34, vs32, vs2 + xsnmsubadp vs35, vs32, vs3 + xsnmsubadp vs36, vs32, vs4 + xsnmsubadp vs37, vs32, vs5 + xsnmsubadp vs38, vs32, vs6 + xsnmsubadp vs39, vs32, vs7 + xsnmsubadp vs40, vs32, vs8 + xsnmsubadp vs41, vs32, vs9 + xsnmsubadp vs42, vs32, vs10 + xsnmsubadp vs43, vs32, vs11 + 
xsnmsubadp vs44, vs32, vs12 + xsnmsubadp vs45, vs32, vs13 + xsnmsubadp vs46, vs32, vs14 + xsnmsubadp vs47, vs32, vs15 + +//############### OFFSET 1 ####################### + + addi T1, T1, 1*SIZE + + lxsdx vs0, o0, T1 + lxsdx vs1, o8, T1 + lxsdx vs2, o16, T1 + lxsdx vs3, o24, T1 + + addi T1, T1, 32 + + lxsdx vs4, o0, T1 + lxsdx vs5, o8, T1 + lxsdx vs6, o16, T1 + lxsdx vs7, o24, T1 + + addi T1, T1, 32 + + lxsdx vs8, o0, T1 + lxsdx vs9, o8, T1 + lxsdx vs10, o16, T1 + lxsdx vs11, o24, T1 + + addi T1, T1, 32 + + lxsdx vs12, o0, T1 + lxsdx vs13, o8, T1 + lxsdx vs14, o16, T1 + + addi T1, T1, 24 + + xsmuldp vs33, vs33, vs0 + xsnmsubadp vs34, vs33, vs1 + xsnmsubadp vs35, vs33, vs2 + xsnmsubadp vs36, vs33, vs3 + xsnmsubadp vs37, vs33, vs4 + xsnmsubadp vs38, vs33, vs5 + xsnmsubadp vs39, vs33, vs6 + xsnmsubadp vs40, vs33, vs7 + xsnmsubadp vs41, vs33, vs8 + xsnmsubadp vs42, vs33, vs9 + xsnmsubadp vs43, vs33, vs10 + xsnmsubadp vs44, vs33, vs11 + xsnmsubadp vs45, vs33, vs12 + xsnmsubadp vs46, vs33, vs13 + xsnmsubadp vs47, vs33, vs14 + +//############### OFFSET 2 ####################### + + addi T1, T1, 2*SIZE + + lxsdx vs0, o0, T1 + lxsdx vs1, o8, T1 + lxsdx vs2, o16, T1 + lxsdx vs3, o24, T1 + + addi T1, T1, 32 + + lxsdx vs4, o0, T1 + lxsdx vs5, o8, T1 + lxsdx vs6, o16, T1 + lxsdx vs7, o24, T1 + + addi T1, T1, 32 + + lxsdx vs8, o0, T1 + lxsdx vs9, o8, T1 + lxsdx vs10, o16, T1 + lxsdx vs11, o24, T1 + + addi T1, T1, 32 + + lxsdx vs12, o0, T1 + lxsdx vs13, o8, T1 + + addi T1, T1, 16 + + xsmuldp vs34, vs34, vs0 + xsnmsubadp vs35, vs34, vs1 + xsnmsubadp vs36, vs34, vs2 + xsnmsubadp vs37, vs34, vs3 + xsnmsubadp vs38, vs34, vs4 + xsnmsubadp vs39, vs34, vs5 + xsnmsubadp vs40, vs34, vs6 + xsnmsubadp vs41, vs34, vs7 + xsnmsubadp vs42, vs34, vs8 + xsnmsubadp vs43, vs34, vs9 + xsnmsubadp vs44, vs34, vs10 + xsnmsubadp vs45, vs34, vs11 + xsnmsubadp vs46, vs34, vs12 + xsnmsubadp vs47, vs34, vs13 + +//############### OFFSET 3 ####################### + + addi T1, T1, 3*SIZE + + lxsdx vs0, o0, T1 + lxsdx vs1, o8, T1 + lxsdx vs2, o16, T1 + lxsdx vs3, o24, T1 + + addi T1, T1, 32 + + lxsdx vs4, o0, T1 + lxsdx vs5, o8, T1 + lxsdx vs6, o16, T1 + lxsdx vs7, o24, T1 + + addi T1, T1, 32 + + lxsdx vs8, o0, T1 + lxsdx vs9, o8, T1 + lxsdx vs10, o16, T1 + lxsdx vs11, o24, T1 + + addi T1, T1, 32 + + lxsdx vs12, o0, T1 + + addi T1, T1, 8 + + xsmuldp vs35, vs35, vs0 + xsnmsubadp vs36, vs35, vs1 + xsnmsubadp vs37, vs35, vs2 + xsnmsubadp vs38, vs35, vs3 + xsnmsubadp vs39, vs35, vs4 + xsnmsubadp vs40, vs35, vs5 + xsnmsubadp vs41, vs35, vs6 + xsnmsubadp vs42, vs35, vs7 + xsnmsubadp vs43, vs35, vs8 + xsnmsubadp vs44, vs35, vs9 + xsnmsubadp vs45, vs35, vs10 + xsnmsubadp vs46, vs35, vs11 + xsnmsubadp vs47, vs35, vs12 + +//############### OFFSET 4 ####################### + + addi T1, T1, 4*SIZE + + lxsdx vs0, o0, T1 + lxsdx vs1, o8, T1 + lxsdx vs2, o16, T1 + lxsdx vs3, o24, T1 + + addi T1, T1, 32 + + lxsdx vs4, o0, T1 + lxsdx vs5, o8, T1 + lxsdx vs6, o16, T1 + lxsdx vs7, o24, T1 + + addi T1, T1, 32 + + lxsdx vs8, o0, T1 + lxsdx vs9, o8, T1 + lxsdx vs10, o16, T1 + lxsdx vs11, o24, T1 + + addi T1, T1, 32 + + xsmuldp vs36, vs36, vs0 + xsnmsubadp vs37, vs36, vs1 + xsnmsubadp vs38, vs36, vs2 + xsnmsubadp vs39, vs36, vs3 + xsnmsubadp vs40, vs36, vs4 + xsnmsubadp vs41, vs36, vs5 + xsnmsubadp vs42, vs36, vs6 + xsnmsubadp vs43, vs36, vs7 + xsnmsubadp vs44, vs36, vs8 + xsnmsubadp vs45, vs36, vs9 + xsnmsubadp vs46, vs36, vs10 + xsnmsubadp vs47, vs36, vs11 + +//############### OFFSET 5 ####################### + + addi T1, T1, 5*SIZE + + lxsdx vs0, o0, T1 
+ lxsdx vs1, o8, T1 + lxsdx vs2, o16, T1 + lxsdx vs3, o24, T1 + + addi T1, T1, 32 + + lxsdx vs4, o0, T1 + lxsdx vs5, o8, T1 + lxsdx vs6, o16, T1 + lxsdx vs7, o24, T1 + + addi T1, T1, 32 + + lxsdx vs8, o0, T1 + lxsdx vs9, o8, T1 + lxsdx vs10, o16, T1 + + addi T1, T1, 24 + + xsmuldp vs37, vs37, vs0 + xsnmsubadp vs38, vs37, vs1 + xsnmsubadp vs39, vs37, vs2 + xsnmsubadp vs40, vs37, vs3 + xsnmsubadp vs41, vs37, vs4 + xsnmsubadp vs42, vs37, vs5 + xsnmsubadp vs43, vs37, vs6 + xsnmsubadp vs44, vs37, vs7 + xsnmsubadp vs45, vs37, vs8 + xsnmsubadp vs46, vs37, vs9 + xsnmsubadp vs47, vs37, vs10 + +//############### OFFSET 6 ####################### + + addi T1, T1, 6*SIZE + + lxsdx vs0, o0, T1 + lxsdx vs1, o8, T1 + lxsdx vs2, o16, T1 + lxsdx vs3, o24, T1 + + addi T1, T1, 32 + + lxsdx vs4, o0, T1 + lxsdx vs5, o8, T1 + lxsdx vs6, o16, T1 + lxsdx vs7, o24, T1 + + addi T1, T1, 32 + + lxsdx vs8, o0, T1 + lxsdx vs9, o8, T1 + + addi T1, T1, 16 + + xsmuldp vs38, vs38, vs0 + xsnmsubadp vs39, vs38, vs1 + xsnmsubadp vs40, vs38, vs2 + xsnmsubadp vs41, vs38, vs3 + xsnmsubadp vs42, vs38, vs4 + xsnmsubadp vs43, vs38, vs5 + xsnmsubadp vs44, vs38, vs6 + xsnmsubadp vs45, vs38, vs7 + xsnmsubadp vs46, vs38, vs8 + xsnmsubadp vs47, vs38, vs9 + +//############### OFFSET 7 ####################### + + addi T1, T1, 7*SIZE + + lxsdx vs0, o0, T1 + lxsdx vs1, o8, T1 + lxsdx vs2, o16, T1 + lxsdx vs3, o24, T1 + + addi T1, T1, 32 + + lxsdx vs4, o0, T1 + lxsdx vs5, o8, T1 + lxsdx vs6, o16, T1 + lxsdx vs7, o24, T1 + + addi T1, T1, 32 + + lxsdx vs8, o0, T1 + + addi T1, T1, 8 + + xsmuldp vs39, vs39, vs0 + xsnmsubadp vs40, vs39, vs1 + xsnmsubadp vs41, vs39, vs2 + xsnmsubadp vs42, vs39, vs3 + xsnmsubadp vs43, vs39, vs4 + xsnmsubadp vs44, vs39, vs5 + xsnmsubadp vs45, vs39, vs6 + xsnmsubadp vs46, vs39, vs7 + xsnmsubadp vs47, vs39, vs8 + +//############### OFFSET 8 ####################### + + addi T1, T1, 8*SIZE + + lxsdx vs0, o0, T1 + lxsdx vs1, o8, T1 + lxsdx vs2, o16, T1 + lxsdx vs3, o24, T1 + + addi T1, T1, 32 + + lxsdx vs4, o0, T1 + lxsdx vs5, o8, T1 + lxsdx vs6, o16, T1 + lxsdx vs7, o24, T1 + + addi T1, T1, 32 + + xsmuldp vs40, vs40, vs0 + xsnmsubadp vs41, vs40, vs1 + xsnmsubadp vs42, vs40, vs2 + xsnmsubadp vs43, vs40, vs3 + xsnmsubadp vs44, vs40, vs4 + xsnmsubadp vs45, vs40, vs5 + xsnmsubadp vs46, vs40, vs6 + xsnmsubadp vs47, vs40, vs7 + +//############### OFFSET 9 ####################### + + addi T1, T1, 9*SIZE + + lxsdx vs0, o0, T1 + lxsdx vs1, o8, T1 + lxsdx vs2, o16, T1 + lxsdx vs3, o24, T1 + + addi T1, T1, 32 + + lxsdx vs4, o0, T1 + lxsdx vs5, o8, T1 + lxsdx vs6, o16, T1 + + addi T1, T1, 24 + + xsmuldp vs41, vs41, vs0 + xsnmsubadp vs42, vs41, vs1 + xsnmsubadp vs43, vs41, vs2 + xsnmsubadp vs44, vs41, vs3 + xsnmsubadp vs45, vs41, vs4 + xsnmsubadp vs46, vs41, vs5 + xsnmsubadp vs47, vs41, vs6 + +//############### OFFSET 10 ####################### + + addi T1, T1, 10*SIZE + + lxsdx vs0, o0, T1 + lxsdx vs1, o8, T1 + lxsdx vs2, o16, T1 + lxsdx vs3, o24, T1 + + addi T1, T1, 32 + + lxsdx vs4, o0, T1 + lxsdx vs5, o8, T1 + + addi T1, T1, 16 + + xsmuldp vs42, vs42, vs0 + xsnmsubadp vs43, vs42, vs1 + xsnmsubadp vs44, vs42, vs2 + xsnmsubadp vs45, vs42, vs3 + xsnmsubadp vs46, vs42, vs4 + xsnmsubadp vs47, vs42, vs5 + +//############### OFFSET 11 ####################### + + addi T1, T1, 11*SIZE + + lxsdx vs0, o0, T1 + lxsdx vs1, o8, T1 + lxsdx vs2, o16, T1 + lxsdx vs3, o24, T1 + + addi T1, T1, 32 + + lxsdx vs4, o0, T1 + + addi T1, T1, 8 + + xsmuldp vs43, vs43, vs0 + xsnmsubadp vs44, vs43, vs1 + xsnmsubadp vs45, vs43, vs2 + xsnmsubadp vs46, vs43, 
vs3 + xsnmsubadp vs47, vs43, vs4 + +//############### OFFSET 12 ####################### + + addi T1, T1, 12*SIZE + + lxsdx vs0, o0, T1 + lxsdx vs1, o8, T1 + lxsdx vs2, o16, T1 + lxsdx vs3, o24, T1 + + addi T1, T1, 32 + + xsmuldp vs44, vs44, vs0 + xsnmsubadp vs45, vs44, vs1 + xsnmsubadp vs46, vs44, vs2 + xsnmsubadp vs47, vs44, vs3 + +//############### OFFSET 13 ####################### + + addi T1, T1, 13*SIZE + + lxsdx vs0, o0, T1 + lxsdx vs1, o8, T1 + lxsdx vs2, o16, T1 + + addi T1, T1, 24 + + xsmuldp vs45, vs45, vs0 + xsnmsubadp vs46, vs45, vs1 + xsnmsubadp vs47, vs45, vs2 + +//############### OFFSET 14 ####################### + + addi T1, T1, 14*SIZE + + lxsdx vs0, o0, T1 + lxsdx vs1, o8, T1 + + addi T1, T1, 16 + + xsmuldp vs46, vs46, vs0 + xsnmsubadp vs47, vs46, vs1 + +//############### OFFSET 15 ####################### + + addi T1, T1, 15*SIZE + + lxsdx vs0, o0, T1 + + addi T1, T1, 8 + + xsmuldp vs47, vs47, vs0 + +//############### SAVE B ####################### + + + mr T1, BO + + + stxsdx vs32, o0, T1 + stxsdx vs33, o8, T1 + stxsdx vs34, o16, T1 + stxsdx vs35, o24, T1 + + addi T1, T1, 32 + + stxsdx vs36, o0, T1 + stxsdx vs37, o8, T1 + stxsdx vs38, o16, T1 + stxsdx vs39, o24, T1 + + addi T1, T1, 32 + + stxsdx vs40, o0, T1 + stxsdx vs41, o8, T1 + stxsdx vs42, o16, T1 + stxsdx vs43, o24, T1 + + addi T1, T1, 32 + + stxsdx vs44, o0, T1 + stxsdx vs45, o8, T1 + stxsdx vs46, o16, T1 + stxsdx vs47, o24, T1 + +//############### SAVE C ####################### + + + mr T1, CO + + stxsdx vs32, o0, T1 + stxsdx vs33, o8, T1 + stxsdx vs34, o16, T1 + stxsdx vs35, o24, T1 + + addi T1, T1, 32 + + stxsdx vs36, o0, T1 + stxsdx vs37, o8, T1 + stxsdx vs38, o16, T1 + stxsdx vs39, o24, T1 + + addi T1, T1, 32 + + stxsdx vs40, o0, T1 + stxsdx vs41, o8, T1 + stxsdx vs42, o16, T1 + stxsdx vs43, o24, T1 + + addi T1, T1, 32 + + stxsdx vs44, o0, T1 + stxsdx vs45, o8, T1 + stxsdx vs46, o16, T1 + stxsdx vs47, o24, T1 + +.endm + + +/*########################################################################################## + SOLVE_LT 8x1 +##########################################################################################*/ + +.macro SOLVE_LT_8x1 + + xxswapd vs0, vs32 + xxswapd vs1, vs33 + xxswapd vs2, vs34 + xxswapd vs3, vs35 + xxswapd vs4, vs36 + xxswapd vs5, vs37 + xxswapd vs6, vs38 + xxswapd vs7, vs39 + +//############### LOAD B ####################### + + + mr T1, BO + + lxsdx vs32, o0, T1 + lxsdx vs33, o8, T1 + lxsdx vs34, o16, T1 + lxsdx vs35, o24, T1 + + addi T1, T1, 32 + + lxsdx vs36, o0, T1 + lxsdx vs37, o8, T1 + lxsdx vs38, o16, T1 + lxsdx vs39, o24, T1 + + xssubdp vs32, vs32, vs0 + xssubdp vs33, vs33, vs1 + xssubdp vs34, vs34, vs2 + xssubdp vs35, vs35, vs3 + xssubdp vs36, vs36, vs4 + xssubdp vs37, vs37, vs5 + xssubdp vs38, vs38, vs6 + xssubdp vs39, vs39, vs7 + + mr T1, AO + + +//############### OFFSET 0 ####################### + + lxsdx vs0, o0, T1 + lxsdx vs1, o8, T1 + lxsdx vs2, o16, T1 + lxsdx vs3, o24, T1 + + addi T1, T1, 32 + + lxsdx vs4, o0, T1 + lxsdx vs5, o8, T1 + lxsdx vs6, o16, T1 + lxsdx vs7, o24, T1 + + addi T1, T1, 32 + + xsmuldp vs32, vs32, vs0 + xsnmsubadp vs33, vs32, vs1 + xsnmsubadp vs34, vs32, vs2 + xsnmsubadp vs35, vs32, vs3 + xsnmsubadp vs36, vs32, vs4 + xsnmsubadp vs37, vs32, vs5 + xsnmsubadp vs38, vs32, vs6 + xsnmsubadp vs39, vs32, vs7 + +//############### OFFSET 1 ####################### + + addi T1, T1, 1*SIZE + + lxsdx vs0, o0, T1 + lxsdx vs1, o8, T1 + lxsdx vs2, o16, T1 + lxsdx vs3, o24, T1 + + addi T1, T1, 32 + + lxsdx vs4, o0, T1 + lxsdx vs5, o8, T1 + lxsdx vs6, o16, T1 + 
+ addi T1, T1, 24 + + xsmuldp vs33, vs33, vs0 + xsnmsubadp vs34, vs33, vs1 + xsnmsubadp vs35, vs33, vs2 + xsnmsubadp vs36, vs33, vs3 + xsnmsubadp vs37, vs33, vs4 + xsnmsubadp vs38, vs33, vs5 + xsnmsubadp vs39, vs33, vs6 + +//############### OFFSET 2 ####################### + + addi T1, T1, 2*SIZE + + lxsdx vs0, o0, T1 + lxsdx vs1, o8, T1 + lxsdx vs2, o16, T1 + lxsdx vs3, o24, T1 + + addi T1, T1, 32 + + lxsdx vs4, o0, T1 + lxsdx vs5, o8, T1 + + addi T1, T1, 16 + + xsmuldp vs34, vs34, vs0 + xsnmsubadp vs35, vs34, vs1 + xsnmsubadp vs36, vs34, vs2 + xsnmsubadp vs37, vs34, vs3 + xsnmsubadp vs38, vs34, vs4 + xsnmsubadp vs39, vs34, vs5 + +//############### OFFSET 3 ####################### + + addi T1, T1, 3*SIZE + + lxsdx vs0, o0, T1 + lxsdx vs1, o8, T1 + lxsdx vs2, o16, T1 + lxsdx vs3, o24, T1 + + addi T1, T1, 32 + + lxsdx vs4, o0, T1 + + addi T1, T1, 8 + + xsmuldp vs35, vs35, vs0 + xsnmsubadp vs36, vs35, vs1 + xsnmsubadp vs37, vs35, vs2 + xsnmsubadp vs38, vs35, vs3 + xsnmsubadp vs39, vs35, vs4 + +//############### OFFSET 4 ####################### + + addi T1, T1, 4*SIZE + + lxsdx vs0, o0, T1 + lxsdx vs1, o8, T1 + lxsdx vs2, o16, T1 + lxsdx vs3, o24, T1 + + addi T1, T1, 32 + + xsmuldp vs36, vs36, vs0 + xsnmsubadp vs37, vs36, vs1 + xsnmsubadp vs38, vs36, vs2 + xsnmsubadp vs39, vs36, vs3 + +//############### OFFSET 5 ####################### + + addi T1, T1, 5*SIZE + + lxsdx vs0, o0, T1 + lxsdx vs1, o8, T1 + lxsdx vs2, o16, T1 + + addi T1, T1, 24 + + xsmuldp vs37, vs37, vs0 + xsnmsubadp vs38, vs37, vs1 + xsnmsubadp vs39, vs37, vs2 + +//############### OFFSET 6 ####################### + + addi T1, T1, 6*SIZE + + lxsdx vs0, o0, T1 + lxsdx vs1, o8, T1 + + addi T1, T1, 16 + + xsmuldp vs38, vs38, vs0 + xsnmsubadp vs39, vs38, vs1 + +//############### OFFSET 7 ####################### + + addi T1, T1, 7*SIZE + + lxsdx vs0, o0, T1 + + addi T1, T1, 8 + + xsmuldp vs39, vs39, vs0 + +//############### SAVE B ####################### + + + mr T1, BO + + + stxsdx vs32, o0, T1 + stxsdx vs33, o8, T1 + stxsdx vs34, o16, T1 + stxsdx vs35, o24, T1 + + addi T1, T1, 32 + + stxsdx vs36, o0, T1 + stxsdx vs37, o8, T1 + stxsdx vs38, o16, T1 + stxsdx vs39, o24, T1 + +//############### SAVE C ####################### + + + mr T1, CO + + stxsdx vs32, o0, T1 + stxsdx vs33, o8, T1 + stxsdx vs34, o16, T1 + stxsdx vs35, o24, T1 + + addi T1, T1, 32 + + stxsdx vs36, o0, T1 + stxsdx vs37, o8, T1 + stxsdx vs38, o16, T1 + stxsdx vs39, o24, T1 + +.endm + + +/*########################################################################################## + SOLVE_LT 4x1 +##########################################################################################*/ + +.macro SOLVE_LT_4x1 + + xxswapd vs0, vs32 + xxswapd vs1, vs33 + xxswapd vs2, vs34 + xxswapd vs3, vs35 + +//############### LOAD B ####################### + + + mr T1, BO + + lxsdx vs32, o0, T1 + lxsdx vs33, o8, T1 + lxsdx vs34, o16, T1 + lxsdx vs35, o24, T1 + + xssubdp vs32, vs32, vs0 + xssubdp vs33, vs33, vs1 + xssubdp vs34, vs34, vs2 + xssubdp vs35, vs35, vs3 + + mr T1, AO + + +//############### OFFSET 0 ####################### + + lxsdx vs0, o0, T1 + lxsdx vs1, o8, T1 + lxsdx vs2, o16, T1 + lxsdx vs3, o24, T1 + + addi T1, T1, 32 + + xsmuldp vs32, vs32, vs0 + xsnmsubadp vs33, vs32, vs1 + xsnmsubadp vs34, vs32, vs2 + xsnmsubadp vs35, vs32, vs3 + +//############### OFFSET 1 ####################### + + addi T1, T1, 1*SIZE + + lxsdx vs0, o0, T1 + lxsdx vs1, o8, T1 + lxsdx vs2, o16, T1 + + addi T1, T1, 24 + + xsmuldp vs33, vs33, vs0 + xsnmsubadp vs34, vs33, vs1 + xsnmsubadp vs35, vs33, 
vs2 + +//############### OFFSET 2 ####################### + + addi T1, T1, 2*SIZE + + lxsdx vs0, o0, T1 + lxsdx vs1, o8, T1 + + addi T1, T1, 16 + + xsmuldp vs34, vs34, vs0 + xsnmsubadp vs35, vs34, vs1 + +//############### OFFSET 3 ####################### + + addi T1, T1, 3*SIZE + + lxsdx vs0, o0, T1 + + addi T1, T1, 8 + + xsmuldp vs35, vs35, vs0 + +//############### SAVE B ####################### + + + mr T1, BO + + + stxsdx vs32, o0, T1 + stxsdx vs33, o8, T1 + stxsdx vs34, o16, T1 + stxsdx vs35, o24, T1 + +//############### SAVE C ####################### + + + mr T1, CO + + stxsdx vs32, o0, T1 + stxsdx vs33, o8, T1 + stxsdx vs34, o16, T1 + stxsdx vs35, o24, T1 + +.endm + + +/*########################################################################################## + SOLVE_LT 2x1 +##########################################################################################*/ + +.macro SOLVE_LT_2x1 + + xxswapd vs0, vs32 + xxswapd vs1, vs33 + +//############### LOAD B ####################### + + + mr T1, BO + + lxsdx vs32, o0, T1 + lxsdx vs33, o8, T1 + + xssubdp vs32, vs32, vs0 + xssubdp vs33, vs33, vs1 + + mr T1, AO + + +//############### OFFSET 0 ####################### + + lxsdx vs0, o0, T1 + lxsdx vs1, o8, T1 + + addi T1, T1, 16 + + xsmuldp vs32, vs32, vs0 + xsnmsubadp vs33, vs32, vs1 + +//############### OFFSET 1 ####################### + + addi T1, T1, 1*SIZE + + lxsdx vs0, o0, T1 + + addi T1, T1, 8 + + xsmuldp vs33, vs33, vs0 + +//############### SAVE B ####################### + + + mr T1, BO + + + stxsdx vs32, o0, T1 + stxsdx vs33, o8, T1 + +//############### SAVE C ####################### + + + mr T1, CO + + stxsdx vs32, o0, T1 + stxsdx vs33, o8, T1 + +.endm + + +/*########################################################################################## + SOLVE_LT 1x1 +##########################################################################################*/ + +.macro SOLVE_LT_1x1 + + xxswapd vs0, vs32 + +//############### LOAD B ####################### + + + mr T1, BO + + lxsdx vs32, o0, T1 + + xssubdp vs32, vs32, vs0 + + mr T1, AO + + +//############### OFFSET 0 ####################### + + lxsdx vs0, o0, T1 + + addi T1, T1, 8 + + xsmuldp vs32, vs32, vs0 + +//############### SAVE B ####################### + + + mr T1, BO + + + stxsdx vs32, o0, T1 + +//############### SAVE C ####################### + + + mr T1, CO + + stxsdx vs32, o0, T1 + +.endm + diff --git a/kernel/power/sgemm_kernel_16x8_power8.S b/kernel/power/sgemm_kernel_16x8_power8.S index 77f3f7cfbc..e169eb9703 100644 --- a/kernel/power/sgemm_kernel_16x8_power8.S +++ b/kernel/power/sgemm_kernel_16x8_power8.S @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2016/04/02 Werner Saar (wernsaar@googlemail.com) +* 2016/04/21 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK diff --git a/kernel/power/sgemm_logic_16x8_power8.S b/kernel/power/sgemm_logic_16x8_power8.S index 06bb79ea38..8907fe6adf 100644 --- a/kernel/power/sgemm_logic_16x8_power8.S +++ b/kernel/power/sgemm_logic_16x8_power8.S @@ -26,11 +26,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** -* 2016/04/02 Werner Saar (wernsaar@googlemail.com) +* 2016/04/21 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK -* LAPACK-TEST : OK +* LAPACK-TEST : OK **************************************************************************************/ srawi. J, N, 3 @@ -40,35 +40,48 @@ SGEMM_L8_BEGIN: mr BO, B mr BBO, BBUFFER - slwi T1, K, 3 + srawi. T1, K, 2 + ble SGEMM_L8_COPYB1 + -SGEMM_L8_COPYB: +SGEMM_L8_COPYB4: + + dcbt BO, PRE dcbtst BBO, PRE + COPYB_4x8 + addic. T1, T1, -1 + ble SGEMM_L8_COPYB1 - lxvw4x vs3, o0, BO - lxvw4x vs11, o16, BO - xxspltw vs4, vs3, 0 - xxspltw vs5, vs3, 1 - xxspltw vs6, vs3, 2 - xxspltw vs7, vs3, 3 - xxspltw vs12, vs11, 0 - xxspltw vs13, vs11, 1 - xxspltw vs14, vs11, 2 - xxspltw vs15, vs11, 3 - stxvw4x vs4, o0, BBO - stxvw4x vs5, o16, BBO - stxvw4x vs6, o32, BBO - stxvw4x vs7, o48, BBO - addi BO, BO, 32 - addi BBO, BBO, 64 - stxvw4x vs12, o0, BBO - stxvw4x vs13, o16, BBO - stxvw4x vs14, o32, BBO - stxvw4x vs15, o48, BBO - addic. T1, T1, -8 - addi BBO, BBO, 64 + dcbtst BBO, PRE + COPYB_4x8 + addic. T1, T1, -1 + ble SGEMM_L8_COPYB1 - bge SGEMM_L8_COPYB + dcbtst BBO, PRE + COPYB_4x8 + addic. T1, T1, -1 + ble SGEMM_L8_COPYB1 + + dcbtst BBO, PRE + COPYB_4x8 + addic. T1, T1, -1 + + bgt SGEMM_L8_COPYB4 + +SGEMM_L8_COPYB1: + + andi. T1, K, 3 + ble SGEMM_L8_COPYB_END + +SGEMM_L8_COPYB1_LOOP: + + + COPYB_1x8 + addic. T1, T1, -1 + + bgt SGEMM_L8_COPYB1_LOOP + +SGEMM_L8_COPYB_END: mr CO, C mr AO, A @@ -93,24 +106,24 @@ SGEMM_L8x16_LOOP_START: LOAD8x16_1 dcbt BO, PRE KERNEL8x16_I1 - dcbt BO, PRE dcbt AO, PRE + dcbt BO, PRE KERNEL8x16_2 dcbt BO, PRE KERNEL8x16_1 - dcbt BO, PRE dcbt AO, PRE + dcbt BO, PRE KERNEL8x16_2 dcbt BO, PRE KERNEL8x16_1 - dcbt BO, PRE dcbt AO, PRE + dcbt BO, PRE KERNEL8x16_2 dcbt BO, PRE KERNEL8x16_1 - dcbt BO, PRE dcbt AO, PRE + dcbt BO, PRE KERNEL8x16_2 addic. L, L, -2 @@ -122,24 +135,24 @@ SGEMM_L8x16_LOOP: dcbt BO, PRE KERNEL8x16_1 - dcbt BO, PRE dcbt AO, PRE + dcbt BO, PRE KERNEL8x16_2 dcbt BO, PRE KERNEL8x16_1 - dcbt BO, PRE dcbt AO, PRE + dcbt BO, PRE KERNEL8x16_2 dcbt BO, PRE KERNEL8x16_1 - dcbt BO, PRE dcbt AO, PRE + dcbt BO, PRE KERNEL8x16_2 dcbt BO, PRE KERNEL8x16_1 - dcbt BO, PRE dcbt AO, PRE + dcbt BO, PRE KERNEL8x16_2 addic. L, L, -1 @@ -149,18 +162,15 @@ SGEMM_L8x16_LOOP_END: dcbt BO, PRE KERNEL8x16_1 - dcbt BO, PRE dcbt AO, PRE + dcbt BO, PRE KERNEL8x16_2 dcbt BO, PRE KERNEL8x16_1 - dcbt BO, PRE dcbt AO, PRE KERNEL8x16_2 - dcbt BO, PRE KERNEL8x16_1 - dcbt BO, PRE dcbt AO, PRE KERNEL8x16_2 KERNEL8x16_1 diff --git a/kernel/power/sgemm_macros_16x8_power8.S b/kernel/power/sgemm_macros_16x8_power8.S index 71dc52979d..98414857fd 100644 --- a/kernel/power/sgemm_macros_16x8_power8.S +++ b/kernel/power/sgemm_macros_16x8_power8.S @@ -26,11 +26,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2016/04/02 Werner Saar (wernsaar@googlemail.com) +* 2016/04/21 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK -* LAPACK-TEST : OK +* LAPACK-TEST : OK **************************************************************************************/ @@ -5886,3 +5886,145 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
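+
+// Editorial note: the COPYB_4x8 and COPYB_1x8 macros added below back the restructured
+// copy loop in sgemm_logic_16x8_power8.S (SGEMM_L8_COPYB4 / SGEMM_L8_COPYB1). They expand
+// the packed single-precision B panel into BBUFFER by broadcasting every 32-bit element
+// across a full vector register, so the compute kernels can use plain vector loads and
+// multiplies. Roughly, as an illustrative C sketch (names are not taken from this file):
+//
+//   /* COPYB_1x8: one k-step of an 8-column B panel */
+//   for (j = 0; j < 8; j++)
+//       for (l = 0; l < 4; l++)
+//           bbuffer[4*j + l] = bo[j];   /* xxspltw + stxvw4x */
+//
+//   /* COPYB_4x8 repeats this for four consecutive k-steps */
+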
.endm + + + + +.macro COPYB_4x8 + + + lxvw4x vs5, o0, BO + xxspltw vs6, vs5, 0 + xxspltw vs7, vs5, 1 + xxspltw vs8, vs5, 2 + xxspltw vs9, vs5, 3 + + lxvw4x vs10, o16, BO + xxspltw vs11, vs10, 0 + xxspltw vs12, vs10, 1 + xxspltw vs13, vs10, 2 + xxspltw vs14, vs10, 3 + + lxvw4x vs15, o32, BO + xxspltw vs16, vs15, 0 + xxspltw vs17, vs15, 1 + xxspltw vs18, vs15, 2 + xxspltw vs19, vs15, 3 + + lxvw4x vs20, o48, BO + xxspltw vs21, vs20, 0 + xxspltw vs22, vs20, 1 + xxspltw vs23, vs20, 2 + xxspltw vs24, vs20, 3 + + addi BO, BO, 64 + lxvw4x vs35, o0, BO + xxspltw vs36, vs35, 0 + xxspltw vs37, vs35, 1 + xxspltw vs38, vs35, 2 + xxspltw vs39, vs35, 3 + + lxvw4x vs40, o16, BO + xxspltw vs41, vs40, 0 + xxspltw vs42, vs40, 1 + xxspltw vs43, vs40, 2 + xxspltw vs44, vs40, 3 + + lxvw4x vs45, o32, BO + xxspltw vs46, vs45, 0 + xxspltw vs47, vs45, 1 + xxspltw vs48, vs45, 2 + xxspltw vs49, vs45, 3 + + lxvw4x vs50, o48, BO + xxspltw vs51, vs50, 0 + xxspltw vs52, vs50, 1 + xxspltw vs53, vs50, 2 + xxspltw vs54, vs50, 3 + + addi BO, BO, 64 + + + stxvw4x vs6, o0, BBO + stxvw4x vs7, o16, BBO + stxvw4x vs8, o32, BBO + stxvw4x vs9, o48, BBO + + addi BBO, BBO, 64 + stxvw4x vs11, o0, BBO + stxvw4x vs12, o16, BBO + stxvw4x vs13, o32, BBO + stxvw4x vs14, o48, BBO + + addi BBO, BBO, 64 + stxvw4x vs16, o0, BBO + stxvw4x vs17, o16, BBO + stxvw4x vs18, o32, BBO + stxvw4x vs19, o48, BBO + + addi BBO, BBO, 64 + stxvw4x vs21, o0, BBO + stxvw4x vs22, o16, BBO + stxvw4x vs23, o32, BBO + stxvw4x vs24, o48, BBO + + addi BBO, BBO, 64 + stxvw4x vs36, o0, BBO + stxvw4x vs37, o16, BBO + stxvw4x vs38, o32, BBO + stxvw4x vs39, o48, BBO + + addi BBO, BBO, 64 + stxvw4x vs41, o0, BBO + stxvw4x vs42, o16, BBO + stxvw4x vs43, o32, BBO + stxvw4x vs44, o48, BBO + + addi BBO, BBO, 64 + stxvw4x vs46, o0, BBO + stxvw4x vs47, o16, BBO + stxvw4x vs48, o32, BBO + stxvw4x vs49, o48, BBO + + addi BBO, BBO, 64 + stxvw4x vs51, o0, BBO + stxvw4x vs52, o16, BBO + stxvw4x vs53, o32, BBO + stxvw4x vs54, o48, BBO + + addi BBO, BBO, 64 +.endm + + +.macro COPYB_1x8 + + + lxvw4x vs5, o0, BO + xxspltw vs6, vs5, 0 + xxspltw vs7, vs5, 1 + xxspltw vs8, vs5, 2 + xxspltw vs9, vs5, 3 + + lxvw4x vs10, o16, BO + xxspltw vs11, vs10, 0 + xxspltw vs12, vs10, 1 + xxspltw vs13, vs10, 2 + xxspltw vs14, vs10, 3 + + + addi BO, BO, 32 + + stxvw4x vs6, o0, BBO + stxvw4x vs7, o16, BBO + stxvw4x vs8, o32, BBO + stxvw4x vs9, o48, BBO + + addi BBO, BBO, 64 + stxvw4x vs11, o0, BBO + stxvw4x vs12, o16, BBO + stxvw4x vs13, o32, BBO + stxvw4x vs14, o48, BBO + + addi BBO, BBO, 64 +.endm + diff --git a/kernel/power/sgemm_tcopy_16_power8.S b/kernel/power/sgemm_tcopy_16_power8.S new file mode 100644 index 0000000000..764d5b1872 --- /dev/null +++ b/kernel/power/sgemm_tcopy_16_power8.S @@ -0,0 +1,212 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/21 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#define M r3 +#define N r4 +#define A r5 +#define LDA r6 +#define B r7 + +#define A0 r8 +#define A1 r9 +#define A2 r10 +#define A3 r11 + +#define J r12 + +#define PREA r14 +#define PREB r15 +#define BO r16 +#define B8 r17 +#define B4 r18 +#define B2 r19 +#define B1 r20 +#define o4 r21 +#define T2 r22 +#define I r23 +#define o16 r24 +#define o32 r25 +#define o48 r26 +#define B16 r29 +#define M16 r30 +#define T1 r31 + +#define o0 0 + +#include "sgemm_tcopy_macros_16_power8.S" + +#define STACKSIZE 384 + + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + cmpwi cr0, M, 0 + ble- L999 + cmpwi cr0, N, 0 + ble- L999 + + slwi LDA, LDA, BASE_SHIFT + slwi M16, M, 4 + BASE_SHIFT + + li T1, -16 + li T2, -8 + li PREA, -4 + li PREB, -2 + + and B8, N, T1 + and B4, N, T2 + and B2, N, PREA + and B1, N, PREB + + mullw B8, B8, M + mullw B4, B4, M + mullw B2, B2, M + mullw B1, B1, M + + slwi B8, B8, BASE_SHIFT + slwi B4, B4, BASE_SHIFT + slwi B2, B2, BASE_SHIFT + slwi B1, B1, BASE_SHIFT + + add B8, B8, B + add B4, B4, B + add B2, B2, B + add B1, B1, B + + li PREA, 768 + addi PREB, M16, 128 + + li o4, 4 + li o16, 16 + li o32, 32 + li o48, 48 + +#include "sgemm_tcopy_logic_16_power8.S" + +L999: + + li r3, 0 + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + addi SP, SP, STACKSIZE + + blr + EPILOGUE + + diff --git a/kernel/power/sgemm_tcopy_8_power8.S b/kernel/power/sgemm_tcopy_8_power8.S new file mode 100644 index 0000000000..2bbd6e6969 --- /dev/null +++ b/kernel/power/sgemm_tcopy_8_power8.S @@ -0,0 +1,207 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/23 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#define M r3 +#define N r4 +#define A r5 +#define LDA r6 +#define B r7 + +#define A0 r8 +#define A1 r9 +#define A2 r10 +#define A3 r11 + +#define J r12 + +#define PREA r14 +#define PREB r15 +#define BO r16 +#define B8 r17 +#define B4 r18 +#define B2 r19 +#define B1 r20 +#define o4 r21 +#define T2 r22 +#define I r23 +#define o16 r24 +#define o32 r25 +#define o48 r26 +#define NOTU1 r29 +#define M8 r30 +#define T1 r31 + +#define o0 0 + +#include "sgemm_tcopy_macros_8_power8.S" + +#define STACKSIZE 384 + + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + cmpwi cr0, M, 0 + ble- L999 + cmpwi cr0, N, 0 + ble- L999 + + slwi LDA, LDA, BASE_SHIFT + slwi M8, M, 3 + BASE_SHIFT + + li T2, -8 + li PREA, -4 + li PREB, -2 + + and B4, N, T2 + and B2, N, PREA + and B1, N, PREB + + mullw B4, B4, M + mullw B2, B2, M + mullw B1, B1, M + + slwi B4, B4, BASE_SHIFT + slwi B2, B2, BASE_SHIFT + slwi B1, B1, BASE_SHIFT + + add B4, B4, B + add B2, B2, B + add B1, B1, B + + li PREA, 384 + addi PREB, M8, 128 + + li o4, 4 + li o16, 16 + li o32, 32 + li o48, 48 + +#include "sgemm_tcopy_logic_8_power8.S" + +L999: + + li r3, 0 + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + addi SP, SP, STACKSIZE + + blr + EPILOGUE + + diff --git a/kernel/power/sgemm_tcopy_logic_16_power8.S b/kernel/power/sgemm_tcopy_logic_16_power8.S new file mode 100644 index 0000000000..7dfb6fa465 --- /dev/null +++ b/kernel/power/sgemm_tcopy_logic_16_power8.S @@ -0,0 +1,324 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/21 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + + srawi. I, M, 2 + ble SCOPYT_L2_BEGIN + + +SCOPYT_L4_BEGIN: + + mr A0, A + add A1, A0, LDA + add A2, A1, LDA + add A3, A2, LDA + add A, A3, LDA + mr B16, B + addi B, B, 64*SIZE + + sradi. J, N, 4 + ble SCOPYT_L4x8_BEGIN + + mr BO, B16 + +SCOPYT_L4x16_LOOP: + + dcbtst BO, M16 + dcbtst BO, PREB + dcbt A0, PREA + dcbt A1, PREA + dcbt A2, PREA + dcbt A3, PREA + COPY_4x16 + + addi A0, A0, 16*SIZE + addi A1, A1, 16*SIZE + addi A2, A2, 16*SIZE + addi A3, A3, 16*SIZE + add BO, BO, M16 + + addic. J, J, -1 + ble SCOPYT_L4x8_BEGIN + + + dcbtst BO, M16 + dcbtst BO, PREB + COPY_4x16 + + addi A0, A0, 16*SIZE + addi A1, A1, 16*SIZE + addi A2, A2, 16*SIZE + addi A3, A3, 16*SIZE + add BO, BO, M16 + + addic. J, J, -1 + bgt SCOPYT_L4x16_LOOP + +SCOPYT_L4x8_BEGIN: + + andi. T1, N, 8 + ble SCOPYT_L4x4_BEGIN + + mr BO, B8 + + COPY_4x8 + + addi A0, A0, 8*SIZE + addi A1, A1, 8*SIZE + addi A2, A2, 8*SIZE + addi A3, A3, 8*SIZE + + addi B8, B8, 32*SIZE + +SCOPYT_L4x4_BEGIN: + + andi. T1, N, 4 + ble SCOPYT_L4x2_BEGIN + + mr BO, B4 + + COPY_4x4 + + addi A0, A0, 4*SIZE + addi A1, A1, 4*SIZE + addi A2, A2, 4*SIZE + addi A3, A3, 4*SIZE + + addi B4, B4, 16*SIZE + +SCOPYT_L4x2_BEGIN: + + andi. T1, N, 2 + ble SCOPYT_L4x1_BEGIN + + mr BO, B2 + + COPY_4x2 + + addi A0, A0, 2*SIZE + addi A1, A1, 2*SIZE + addi A2, A2, 2*SIZE + addi A3, A3, 2*SIZE + + addi B2, B2, 8*SIZE + +SCOPYT_L4x1_BEGIN: + + andi. T1, N, 1 + ble SCOPYT_L4_END + + mr BO, B1 + + COPY_4x1 + + addi A0, A0, 1*SIZE + addi A1, A1, 1*SIZE + addi A2, A2, 1*SIZE + addi A3, A3, 1*SIZE + + addi B1, B1, 4*SIZE + +SCOPYT_L4_END: + + addic. I, I, -1 + bgt SCOPYT_L4_BEGIN + + + +SCOPYT_L2_BEGIN: + + andi. T1, M, 2 + ble SCOPYT_L1_BEGIN + + mr A0, A + add A1, A0, LDA + add A, A1, LDA + mr B16, B + addi B, B, 32*SIZE + + sradi. J, N, 4 + ble SCOPYT_L2x8_BEGIN + + mr BO, B16 + +SCOPYT_L2x16_LOOP: + + COPY_2x16 + + addi A0, A0, 16*SIZE + addi A1, A1, 16*SIZE + add BO, BO, M16 + + addic. J, J, -1 + bgt SCOPYT_L2x16_LOOP + +SCOPYT_L2x8_BEGIN: + + andi. T1, N, 8 + ble SCOPYT_L2x4_BEGIN + + mr BO, B8 + + COPY_2x8 + + addi A0, A0, 8*SIZE + addi A1, A1, 8*SIZE + + addi B8, B8, 16*SIZE + +SCOPYT_L2x4_BEGIN: + + andi. T1, N, 4 + ble SCOPYT_L2x2_BEGIN + + mr BO, B4 + + COPY_2x4 + + addi A0, A0, 4*SIZE + addi A1, A1, 4*SIZE + + addi B4, B4, 8*SIZE + +SCOPYT_L2x2_BEGIN: + + andi. T1, N, 2 + ble SCOPYT_L2x1_BEGIN + + mr BO, B2 + + COPY_2x2 + + addi A0, A0, 2*SIZE + addi A1, A1, 2*SIZE + + addi B2, B2, 4*SIZE + +SCOPYT_L2x1_BEGIN: + + andi. 
T1, N, 1 + ble SCOPYT_L2_END + + mr BO, B1 + + COPY_2x1 + + addi A0, A0, 1*SIZE + addi A1, A1, 1*SIZE + + addi B1, B1, 2*SIZE + +SCOPYT_L2_END: + + +SCOPYT_L1_BEGIN: + + andi. T1, M, 1 + ble L999 + + mr A0, A + add A, A0, LDA + mr B16, B + addi B, B, 16*SIZE + + sradi. J, N, 4 + ble SCOPYT_L1x8_BEGIN + + mr BO, B16 + +SCOPYT_L1x16_LOOP: + + COPY_1x16 + + addi A0, A0, 16*SIZE + add BO, BO, M16 + + addic. J, J, -1 + bgt SCOPYT_L1x16_LOOP + +SCOPYT_L1x8_BEGIN: + + andi. T1, N, 8 + ble SCOPYT_L1x4_BEGIN + + mr BO, B8 + + COPY_1x8 + + addi A0, A0, 8*SIZE + + addi B8, B8, 8*SIZE + +SCOPYT_L1x4_BEGIN: + + andi. T1, N, 4 + ble SCOPYT_L1x2_BEGIN + + mr BO, B4 + + COPY_1x4 + + addi A0, A0, 4*SIZE + + addi B4, B4, 4*SIZE + +SCOPYT_L1x2_BEGIN: + + andi. T1, N, 2 + ble SCOPYT_L1x1_BEGIN + + mr BO, B2 + + COPY_1x2 + + addi A0, A0, 2*SIZE + + addi B2, B2, 2*SIZE + +SCOPYT_L1x1_BEGIN: + + andi. T1, N, 1 + ble SCOPYT_L1_END + + mr BO, B1 + + COPY_1x1 + + addi A0, A0, 1*SIZE + + addi B1, B1, 1*SIZE + +SCOPYT_L1_END: + diff --git a/kernel/power/sgemm_tcopy_logic_8_power8.S b/kernel/power/sgemm_tcopy_logic_8_power8.S new file mode 100644 index 0000000000..4cf74baa3c --- /dev/null +++ b/kernel/power/sgemm_tcopy_logic_8_power8.S @@ -0,0 +1,299 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/23 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + + srawi. I, M, 2 + ble SCOPYOT_L2_BEGIN + + +SCOPYOT_L4_BEGIN: + + mr A0, A + add A1, A0, LDA + add A2, A1, LDA + add A3, A2, LDA + add A, A3, LDA + mr B8, B + addi B, B, 32*SIZE + + sradi. 
J, N, 3 + ble SCOPYOT_L4x4_BEGIN + + mr BO, B8 + .align 5 + +SCOPYOT_L4x8_LOOP: + + dcbt A0, PREA + dcbt A1, PREA + dcbt A2, PREA + dcbt A3, PREA + COPY_4x8 + + addi A0, A0, 8*SIZE + addi A1, A1, 8*SIZE + addi A2, A2, 8*SIZE + addi A3, A3, 8*SIZE + add BO, BO, M8 + + addic. J, J, -1 + ble SCOPYOT_L4x4_BEGIN + + COPY_4x8 + + addi A0, A0, 8*SIZE + addi A1, A1, 8*SIZE + addi A2, A2, 8*SIZE + addi A3, A3, 8*SIZE + add BO, BO, M8 + + addic. J, J, -1 + ble SCOPYOT_L4x4_BEGIN + + COPY_4x8 + + addi A0, A0, 8*SIZE + addi A1, A1, 8*SIZE + addi A2, A2, 8*SIZE + addi A3, A3, 8*SIZE + add BO, BO, M8 + + addic. J, J, -1 + ble SCOPYOT_L4x4_BEGIN + + COPY_4x8 + + addi A0, A0, 8*SIZE + addi A1, A1, 8*SIZE + addi A2, A2, 8*SIZE + addi A3, A3, 8*SIZE + add BO, BO, M8 + + addic. J, J, -1 + bgt SCOPYOT_L4x8_LOOP + +SCOPYOT_L4x4_BEGIN: + + andi. T1, N, 4 + ble SCOPYOT_L4x2_BEGIN + + mr BO, B4 + + COPY_4x4 + + addi A0, A0, 4*SIZE + addi A1, A1, 4*SIZE + addi A2, A2, 4*SIZE + addi A3, A3, 4*SIZE + + addi B4, B4, 16*SIZE + +SCOPYOT_L4x2_BEGIN: + + andi. T1, N, 2 + ble SCOPYOT_L4x1_BEGIN + + mr BO, B2 + + COPY_4x2 + + addi A0, A0, 2*SIZE + addi A1, A1, 2*SIZE + addi A2, A2, 2*SIZE + addi A3, A3, 2*SIZE + + addi B2, B2, 8*SIZE + +SCOPYOT_L4x1_BEGIN: + + andi. T1, N, 1 + ble SCOPYOT_L4_END + + mr BO, B1 + + COPY_4x1 + + addi A0, A0, 1*SIZE + addi A1, A1, 1*SIZE + addi A2, A2, 1*SIZE + addi A3, A3, 1*SIZE + + addi B1, B1, 4*SIZE + +SCOPYOT_L4_END: + + addic. I, I, -1 + bgt SCOPYOT_L4_BEGIN + + + +SCOPYOT_L2_BEGIN: + + andi. T1, M, 2 + ble SCOPYOT_L1_BEGIN + + mr A0, A + add A1, A0, LDA + add A, A1, LDA + mr B8, B + addi B, B, 16*SIZE + + sradi. J, N, 3 + ble SCOPYOT_L2x4_BEGIN + + mr BO, B8 + +SCOPYOT_L2x8_LOOP: + + COPY_2x8 + + addi A0, A0, 8*SIZE + addi A1, A1, 8*SIZE + add BO, BO, M8 + + addic. J, J, -1 + bgt SCOPYOT_L2x8_LOOP + +SCOPYOT_L2x4_BEGIN: + + andi. T1, N, 4 + ble SCOPYOT_L2x2_BEGIN + + mr BO, B4 + + COPY_2x4 + + addi A0, A0, 4*SIZE + addi A1, A1, 4*SIZE + + addi B4, B4, 8*SIZE + +SCOPYOT_L2x2_BEGIN: + + andi. T1, N, 2 + ble SCOPYOT_L2x1_BEGIN + + mr BO, B2 + + COPY_2x2 + + addi A0, A0, 2*SIZE + addi A1, A1, 2*SIZE + + addi B2, B2, 4*SIZE + +SCOPYOT_L2x1_BEGIN: + + andi. T1, N, 1 + ble SCOPYOT_L2_END + + mr BO, B1 + + COPY_2x1 + + addi A0, A0, 1*SIZE + addi A1, A1, 1*SIZE + + addi B1, B1, 2*SIZE + +SCOPYOT_L2_END: + + +SCOPYOT_L1_BEGIN: + + andi. T1, M, 1 + ble L999 + + mr A0, A + add A, A0, LDA + mr B8, B + addi B, B, 8*SIZE + + sradi. J, N, 3 + ble SCOPYOT_L1x4_BEGIN + + mr BO, B8 + +SCOPYOT_L1x8_LOOP: + + COPY_1x8 + + addi A0, A0, 8*SIZE + add BO, BO, M8 + + addic. J, J, -1 + bgt SCOPYOT_L1x8_LOOP + +SCOPYOT_L1x4_BEGIN: + + andi. T1, N, 4 + ble SCOPYOT_L1x2_BEGIN + + mr BO, B4 + + COPY_1x4 + + addi A0, A0, 4*SIZE + + addi B4, B4, 4*SIZE + +SCOPYOT_L1x2_BEGIN: + + andi. T1, N, 2 + ble SCOPYOT_L1x1_BEGIN + + mr BO, B2 + + COPY_1x2 + + addi A0, A0, 2*SIZE + + addi B2, B2, 2*SIZE + +SCOPYOT_L1x1_BEGIN: + + andi. T1, N, 1 + ble SCOPYOT_L1_END + + mr BO, B1 + + COPY_1x1 + + addi A0, A0, 1*SIZE + + addi B1, B1, 1*SIZE + +SCOPYOT_L1_END: + diff --git a/kernel/power/sgemm_tcopy_macros_16_power8.S b/kernel/power/sgemm_tcopy_macros_16_power8.S new file mode 100644 index 0000000000..53f9c8b823 --- /dev/null +++ b/kernel/power/sgemm_tcopy_macros_16_power8.S @@ -0,0 +1,416 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/21 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + +/********************************************************************************************** +* Macros for N=4 and M=16 +**********************************************************************************************/ + +.macro COPY_4x16 + + lxvw4x vs32, o0, A0 + lxvw4x vs33, o16, A0 + lxvw4x vs34, o32, A0 + lxvw4x vs35, o48, A0 + + lxvw4x vs36, o0, A1 + lxvw4x vs37, o16, A1 + lxvw4x vs38, o32, A1 + lxvw4x vs39, o48, A1 + + lxvw4x vs40, o0, A2 + lxvw4x vs41, o16, A2 + lxvw4x vs42, o32, A2 + lxvw4x vs43, o48, A2 + + lxvw4x vs44, o0, A3 + lxvw4x vs45, o16, A3 + lxvw4x vs46, o32, A3 + lxvw4x vs47, o48, A3 + + mr T1, BO + + stxvw4x vs32, o0, T1 + stxvw4x vs33, o16, T1 + stxvw4x vs34, o32, T1 + stxvw4x vs35, o48, T1 + + addi T1, T1, 64 + + stxvw4x vs36, o0, T1 + stxvw4x vs37, o16, T1 + stxvw4x vs38, o32, T1 + stxvw4x vs39, o48, T1 + + addi T1, T1, 64 + + stxvw4x vs40, o0, T1 + stxvw4x vs41, o16, T1 + stxvw4x vs42, o32, T1 + stxvw4x vs43, o48, T1 + + addi T1, T1, 64 + + stxvw4x vs44, o0, T1 + stxvw4x vs45, o16, T1 + stxvw4x vs46, o32, T1 + stxvw4x vs47, o48, T1 + +.endm + +/********************************************************************************************** +* Macros for N=4 and M=8 +**********************************************************************************************/ + +.macro COPY_4x8 + + lxvw4x vs32, o0, A0 + lxvw4x vs33, o16, A0 + + lxvw4x vs34, o0, A1 + lxvw4x vs35, o16, A1 + + lxvw4x vs36, o0, A2 + lxvw4x vs37, o16, A2 + + lxvw4x vs38, o0, A3 + lxvw4x vs39, o16, A3 + + mr T1, BO + + stxvw4x vs32, o0, T1 + stxvw4x vs33, o16, T1 + + stxvw4x vs34, o32, T1 + stxvw4x vs35, o48, T1 + + addi T1, T1, 64 + + stxvw4x vs36, o0, T1 + stxvw4x vs37, o16, T1 + + stxvw4x vs38, o32, T1 + 
stxvw4x vs39, o48, T1 + +.endm + +/********************************************************************************************** +* Macros for N=4 and M=4 +**********************************************************************************************/ + +.macro COPY_4x4 + + lxvw4x vs32, o0, A0 + + lxvw4x vs33, o0, A1 + + lxvw4x vs34, o0, A2 + + lxvw4x vs35, o0, A3 + + mr T1, BO + + stxvw4x vs32, o0, T1 + + stxvw4x vs33, o16, T1 + + stxvw4x vs34, o32, T1 + + stxvw4x vs35, o48, T1 + +.endm + +/********************************************************************************************** +* Macros for N=4 and M=2 +**********************************************************************************************/ + +.macro COPY_4x2 + + lxsspx vs32, o0, A0 + lxsspx vs33, o4, A0 + + lxsspx vs34, o0, A1 + lxsspx vs35, o4, A1 + + lxsspx vs36, o0, A2 + lxsspx vs37, o4, A2 + + lxsspx vs38, o0, A3 + lxsspx vs39, o4, A3 + + mr T1, BO + + stxsspx vs32, o0, T1 + stxsspx vs33, o4, T1 + + addi T1, T1, 8 + + stxsspx vs34, o0, T1 + stxsspx vs35, o4, T1 + + addi T1, T1, 8 + + stxsspx vs36, o0, T1 + stxsspx vs37, o4, T1 + + addi T1, T1, 8 + + stxsspx vs38, o0, T1 + stxsspx vs39, o4, T1 + +.endm + +/********************************************************************************************** +* Macros for N=4 and M=1 +**********************************************************************************************/ + +.macro COPY_4x1 + + lxsspx vs32, o0, A0 + + lxsspx vs33, o0, A1 + + lxsspx vs34, o0, A2 + + lxsspx vs35, o0, A3 + + mr T1, BO + + stxsspx vs32, o0, T1 + + stxsspx vs33, o4, T1 + + addi T1, T1, 8 + + stxsspx vs34, o0, T1 + + stxsspx vs35, o4, T1 + +.endm + +/********************************************************************************************** +* Macros for N=2 and M=16 +**********************************************************************************************/ + +.macro COPY_2x16 + + lxvw4x vs32, o0, A0 + lxvw4x vs33, o16, A0 + lxvw4x vs34, o32, A0 + lxvw4x vs35, o48, A0 + + lxvw4x vs36, o0, A1 + lxvw4x vs37, o16, A1 + lxvw4x vs38, o32, A1 + lxvw4x vs39, o48, A1 + + mr T1, BO + + stxvw4x vs32, o0, T1 + stxvw4x vs33, o16, T1 + stxvw4x vs34, o32, T1 + stxvw4x vs35, o48, T1 + + addi T1, T1, 64 + + stxvw4x vs36, o0, T1 + stxvw4x vs37, o16, T1 + stxvw4x vs38, o32, T1 + stxvw4x vs39, o48, T1 + +.endm + +/********************************************************************************************** +* Macros for N=2 and M=8 +**********************************************************************************************/ + +.macro COPY_2x8 + + lxvw4x vs32, o0, A0 + lxvw4x vs33, o16, A0 + + lxvw4x vs34, o0, A1 + lxvw4x vs35, o16, A1 + + mr T1, BO + + stxvw4x vs32, o0, T1 + stxvw4x vs33, o16, T1 + + stxvw4x vs34, o32, T1 + stxvw4x vs35, o48, T1 + +.endm + +/********************************************************************************************** +* Macros for N=2 and M=4 +**********************************************************************************************/ + +.macro COPY_2x4 + + lxvw4x vs32, o0, A0 + + lxvw4x vs33, o0, A1 + + mr T1, BO + + stxvw4x vs32, o0, T1 + + stxvw4x vs33, o16, T1 + +.endm + +/********************************************************************************************** +* Macros for N=2 and M=2 +**********************************************************************************************/ + +.macro COPY_2x2 + + lxsspx vs32, o0, A0 + lxsspx vs33, o4, A0 + + lxsspx vs34, o0, A1 + lxsspx vs35, o4, A1 + + mr T1, BO + + stxsspx vs32, o0, T1 + stxsspx vs33, o4, T1 + + 
addi T1, T1, 8 + + stxsspx vs34, o0, T1 + stxsspx vs35, o4, T1 + +.endm + +/********************************************************************************************** +* Macros for N=2 and M=1 +**********************************************************************************************/ + +.macro COPY_2x1 + + lxsspx vs32, o0, A0 + + lxsspx vs33, o0, A1 + + mr T1, BO + + stxsspx vs32, o0, T1 + + stxsspx vs33, o4, T1 + +.endm + +/********************************************************************************************** +* Macros for N=1 and M=16 +**********************************************************************************************/ + +.macro COPY_1x16 + + lxvw4x vs32, o0, A0 + lxvw4x vs33, o16, A0 + lxvw4x vs34, o32, A0 + lxvw4x vs35, o48, A0 + + mr T1, BO + + stxvw4x vs32, o0, T1 + stxvw4x vs33, o16, T1 + stxvw4x vs34, o32, T1 + stxvw4x vs35, o48, T1 + +.endm + +/********************************************************************************************** +* Macros for N=1 and M=8 +**********************************************************************************************/ + +.macro COPY_1x8 + + lxvw4x vs32, o0, A0 + lxvw4x vs33, o16, A0 + + mr T1, BO + + stxvw4x vs32, o0, T1 + stxvw4x vs33, o16, T1 + +.endm + +/********************************************************************************************** +* Macros for N=1 and M=4 +**********************************************************************************************/ + +.macro COPY_1x4 + + lxvw4x vs32, o0, A0 + + mr T1, BO + + stxvw4x vs32, o0, T1 + +.endm + +/********************************************************************************************** +* Macros for N=1 and M=2 +**********************************************************************************************/ + +.macro COPY_1x2 + + lxsspx vs32, o0, A0 + lxsspx vs33, o4, A0 + + mr T1, BO + + stxsspx vs32, o0, T1 + stxsspx vs33, o4, T1 + +.endm + +/********************************************************************************************** +* Macros for N=1 and M=1 +**********************************************************************************************/ + +.macro COPY_1x1 + + lxsspx vs32, o0, A0 + + mr T1, BO + + stxsspx vs32, o0, T1 + +.endm + diff --git a/kernel/power/sgemm_tcopy_macros_8_power8.S b/kernel/power/sgemm_tcopy_macros_8_power8.S new file mode 100644 index 0000000000..1b71d5bb3b --- /dev/null +++ b/kernel/power/sgemm_tcopy_macros_8_power8.S @@ -0,0 +1,308 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/23 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + +/********************************************************************************************** +* Macros for N=4 and M=8 +**********************************************************************************************/ + +.macro COPY_4x8 + + lxvw4x vs32, o0, A0 + lxvw4x vs33, o16, A0 + + lxvw4x vs34, o0, A1 + lxvw4x vs35, o16, A1 + + lxvw4x vs36, o0, A2 + lxvw4x vs37, o16, A2 + + lxvw4x vs38, o0, A3 + lxvw4x vs39, o16, A3 + + mr T1, BO + + stxvw4x vs32, o0, T1 + stxvw4x vs33, o16, T1 + + stxvw4x vs34, o32, T1 + stxvw4x vs35, o48, T1 + + addi T1, T1, 64 + + stxvw4x vs36, o0, T1 + stxvw4x vs37, o16, T1 + + stxvw4x vs38, o32, T1 + stxvw4x vs39, o48, T1 + +.endm + +/********************************************************************************************** +* Macros for N=4 and M=4 +**********************************************************************************************/ + +.macro COPY_4x4 + + lxvw4x vs32, o0, A0 + + lxvw4x vs33, o0, A1 + + lxvw4x vs34, o0, A2 + + lxvw4x vs35, o0, A3 + + mr T1, BO + + stxvw4x vs32, o0, T1 + + stxvw4x vs33, o16, T1 + + stxvw4x vs34, o32, T1 + + stxvw4x vs35, o48, T1 + +.endm + +/********************************************************************************************** +* Macros for N=4 and M=2 +**********************************************************************************************/ + +.macro COPY_4x2 + + lxsspx vs32, o0, A0 + lxsspx vs33, o4, A0 + + lxsspx vs34, o0, A1 + lxsspx vs35, o4, A1 + + lxsspx vs36, o0, A2 + lxsspx vs37, o4, A2 + + lxsspx vs38, o0, A3 + lxsspx vs39, o4, A3 + + mr T1, BO + + stxsspx vs32, o0, T1 + stxsspx vs33, o4, T1 + + addi T1, T1, 8 + + stxsspx vs34, o0, T1 + stxsspx vs35, o4, T1 + + addi T1, T1, 8 + + stxsspx vs36, o0, T1 + stxsspx vs37, o4, T1 + + addi T1, T1, 8 + + stxsspx vs38, o0, T1 + stxsspx vs39, o4, T1 + +.endm + +/********************************************************************************************** +* Macros for N=4 and M=1 +**********************************************************************************************/ + +.macro COPY_4x1 + + lxsspx vs32, o0, A0 + + lxsspx vs33, o0, A1 + + lxsspx vs34, o0, A2 + + lxsspx vs35, o0, A3 + + mr T1, BO + + stxsspx vs32, o0, T1 + + stxsspx vs33, o4, T1 + + addi T1, T1, 8 + + stxsspx vs34, o0, T1 + + stxsspx vs35, o4, T1 + +.endm + +/********************************************************************************************** +* Macros for N=2 and M=8 +**********************************************************************************************/ + +.macro COPY_2x8 + + lxvw4x vs32, o0, A0 + lxvw4x vs33, o16, A0 + + lxvw4x vs34, o0, A1 + lxvw4x vs35, o16, A1 + + mr T1, 
BO + + stxvw4x vs32, o0, T1 + stxvw4x vs33, o16, T1 + + stxvw4x vs34, o32, T1 + stxvw4x vs35, o48, T1 + +.endm + +/********************************************************************************************** +* Macros for N=2 and M=4 +**********************************************************************************************/ + +.macro COPY_2x4 + + lxvw4x vs32, o0, A0 + + lxvw4x vs33, o0, A1 + + mr T1, BO + + stxvw4x vs32, o0, T1 + + stxvw4x vs33, o16, T1 + +.endm + +/********************************************************************************************** +* Macros for N=2 and M=2 +**********************************************************************************************/ + +.macro COPY_2x2 + + lxsspx vs32, o0, A0 + lxsspx vs33, o4, A0 + + lxsspx vs34, o0, A1 + lxsspx vs35, o4, A1 + + mr T1, BO + + stxsspx vs32, o0, T1 + stxsspx vs33, o4, T1 + + addi T1, T1, 8 + + stxsspx vs34, o0, T1 + stxsspx vs35, o4, T1 + +.endm + +/********************************************************************************************** +* Macros for N=2 and M=1 +**********************************************************************************************/ + +.macro COPY_2x1 + + lxsspx vs32, o0, A0 + + lxsspx vs33, o0, A1 + + mr T1, BO + + stxsspx vs32, o0, T1 + + stxsspx vs33, o4, T1 + +.endm + +/********************************************************************************************** +* Macros for N=1 and M=8 +**********************************************************************************************/ + +.macro COPY_1x8 + + lxvw4x vs32, o0, A0 + lxvw4x vs33, o16, A0 + + mr T1, BO + + stxvw4x vs32, o0, T1 + stxvw4x vs33, o16, T1 + +.endm + +/********************************************************************************************** +* Macros for N=1 and M=4 +**********************************************************************************************/ + +.macro COPY_1x4 + + lxvw4x vs32, o0, A0 + + mr T1, BO + + stxvw4x vs32, o0, T1 + +.endm + +/********************************************************************************************** +* Macros for N=1 and M=2 +**********************************************************************************************/ + +.macro COPY_1x2 + + lxsspx vs32, o0, A0 + lxsspx vs33, o4, A0 + + mr T1, BO + + stxsspx vs32, o0, T1 + stxsspx vs33, o4, T1 + +.endm + +/********************************************************************************************** +* Macros for N=1 and M=1 +**********************************************************************************************/ + +.macro COPY_1x1 + + lxsspx vs32, o0, A0 + + mr T1, BO + + stxsspx vs32, o0, T1 + +.endm + diff --git a/kernel/power/zgemm_kernel_8x2_power8.S b/kernel/power/zgemm_kernel_8x2_power8.S index 336b13b1fc..02c94a88ab 100644 --- a/kernel/power/zgemm_kernel_8x2_power8.S +++ b/kernel/power/zgemm_kernel_8x2_power8.S @@ -1,3 +1,73 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/22 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/22 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
*/ @@ -250,7 +320,7 @@ ble L999 slwi LDC, LDC, ZBASE_SHIFT - li PRE, 384 + li PRE, 512 li o8 , 8 li o16 , 16 li o24 , 24 diff --git a/kernel/power/zgemm_logic_8x2_power8.S b/kernel/power/zgemm_logic_8x2_power8.S index 96612da82e..0cd784cc01 100644 --- a/kernel/power/zgemm_logic_8x2_power8.S +++ b/kernel/power/zgemm_logic_8x2_power8.S @@ -1,3 +1,39 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/22 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + srawi. J, N, 1 ble ZGEMM_L2_END @@ -5,20 +41,34 @@ ZGEMM_L2_BEGIN: mr BO, B mr BBO, BBUFFER - slwi T1, K, 1 + srawi. T1, K, 2 + ble ZGEMM_L2_COPYB1 -ZGEMM_L2_COPYB: +ZGEMM_L2_COPYB8: - lxvdsx vs4, o0, BO // b0_r - lxvdsx vs5, o8, BO // b0_i - addi BO, BO, 16 - stxvd2x vs4, o0, BBO - stxvd2x vs5, o16, BBO + addi T2, PRE, 128 + dcbt BO, PRE + dcbtst BBO, PRE + dcbtst BBO, T2 + ZCOPYB_8x1 addic. T1, T1, -1 - addi BBO, BBO, 32 - bge ZGEMM_L2_COPYB + bgt ZGEMM_L2_COPYB8 + +ZGEMM_L2_COPYB1: + + andi. T1, K, 3 + ble ZGEMM_L2_COPYB_END + +ZGEMM_L2_COPYB_LOOP: + + ZCOPYB_1x1 + ZCOPYB_1x1 + addic. T1, T1, -1 + + bgt ZGEMM_L2_COPYB_LOOP +ZGEMM_L2_COPYB_END: mr CO, C mr AO, A @@ -493,6 +543,7 @@ ZGEMM_L1_BEGIN: slwi T1, K, 0 ZGEMM_L1_COPYB: + dcbtst BBO, PRE lxvdsx vs4, o0, BO // b0_r lxvdsx vs5, o8, BO // b0_i diff --git a/kernel/power/zgemm_macros_8x2_power8.S b/kernel/power/zgemm_macros_8x2_power8.S index a0fbb2e112..c43a115b28 100644 --- a/kernel/power/zgemm_macros_8x2_power8.S +++ b/kernel/power/zgemm_macros_8x2_power8.S @@ -1,3 +1,38 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/22 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define XSFADD_R1 xsadddp @@ -3055,3 +3090,76 @@ .endm + + +.macro ZCOPYB_1x1 + + lxvdsx vs4, o0, BO // b0_r + lxvdsx vs5, o8, BO // b0_i + addi BO, BO, 16 + stxvd2x vs4, o0, BBO + stxvd2x vs5, o16, BBO + addi BBO, BBO, 32 + +.endm + + +.macro ZCOPYB_8x1 + + lxvd2x vs32, o0, BO + lxvd2x vs33, o16, BO + lxvd2x vs34, o32, BO + lxvd2x vs35, o48, BO + addi BO, BO, 64 + + lxvd2x vs36, o0, BO + lxvd2x vs37, o16, BO + lxvd2x vs38, o32, BO + lxvd2x vs39, o48, BO + addi BO, BO, 64 + + xxspltd vs40, vs32, 0 + xxspltd vs41, vs32, 1 + xxspltd vs42, vs33, 0 + xxspltd vs43, vs33, 1 + xxspltd vs44, vs34, 0 + xxspltd vs45, vs34, 1 + xxspltd vs46, vs35, 0 + xxspltd vs47, vs35, 1 + + xxspltd vs48, vs36, 0 + xxspltd vs49, vs36, 1 + xxspltd vs50, vs37, 0 + xxspltd vs51, vs37, 1 + xxspltd vs52, vs38, 0 + xxspltd vs53, vs38, 1 + xxspltd vs54, vs39, 0 + xxspltd vs55, vs39, 1 + + stxvd2x vs40, o0, BBO + stxvd2x vs41, o16, BBO + stxvd2x vs42, o32, BBO + stxvd2x vs43, o48, BBO + addi BBO, BBO, 64 + + stxvd2x vs44, o0, BBO + stxvd2x vs45, o16, BBO + stxvd2x vs46, o32, BBO + stxvd2x vs47, o48, BBO + addi BBO, BBO, 64 + + stxvd2x vs48, o0, BBO + stxvd2x vs49, o16, BBO + stxvd2x vs50, o32, BBO + stxvd2x vs51, o48, BBO + addi BBO, BBO, 64 + + stxvd2x vs52, o0, BBO + stxvd2x vs53, o16, BBO + stxvd2x vs54, o32, BBO + stxvd2x vs55, o48, BBO + addi BBO, BBO, 64 + +.endm + + diff --git a/kernel/power/zgemm_tcopy_8_power8.S b/kernel/power/zgemm_tcopy_8_power8.S new file mode 100644 index 0000000000..1f3f35419c --- /dev/null +++ b/kernel/power/zgemm_tcopy_8_power8.S @@ -0,0 +1,205 @@ +/*************************************************************************** +Copyright (c) 
2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/22 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#define M r3 +#define N r4 +#define A r5 +#define LDA r6 +#define B r7 + +#define A0 r8 +#define A1 r9 +#define A2 r10 +#define A3 r11 + +#define J r12 + +#define PREA r14 +#define PREB r15 +#define BO r16 +#define B8 r17 +#define B4 r18 +#define B2 r19 +#define B1 r20 +#define NOTUS1 r21 +#define T2 r22 +#define I r23 +#define o16 r24 +#define o32 r25 +#define o48 r26 +#define NOTUS2 r27 +#define M8 r30 +#define T1 r31 + +#define o0 0 + +#include "zgemm_tcopy_macros_8_power8.S" + +#define STACKSIZE 384 + + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + cmpwi cr0, M, 0 + ble- L999 + cmpwi cr0, N, 0 + ble- L999 + + slwi LDA, LDA, ZBASE_SHIFT + slwi M8, M, 3 + ZBASE_SHIFT + + li T2, -8 + li PREA, -4 + li PREB, -2 + + and B4, N, T2 + and B2, N, PREA + and B1, N, PREB + + mullw B4, B4, M + mullw B2, B2, M + mullw B1, B1, M + + slwi B4, B4, ZBASE_SHIFT + slwi B2, B2, ZBASE_SHIFT + slwi B1, B1, ZBASE_SHIFT + + add B4, B4, B + add B2, B2, B + add B1, B1, B + + li PREA, 384 + addi PREB, M8, 128 + + li o16, 16 + li o32, 32 + li o48, 48 + +#include "zgemm_tcopy_logic_8_power8.S" + +L999: + + li r3, 0 + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + addi SP, SP, STACKSIZE + + blr + EPILOGUE + + diff --git a/kernel/power/zgemm_tcopy_logic_8_power8.S b/kernel/power/zgemm_tcopy_logic_8_power8.S new file mode 100644 index 0000000000..34fd307bdc --- /dev/null +++ b/kernel/power/zgemm_tcopy_logic_8_power8.S @@ -0,0 +1,246 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/22 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + + srawi. I, M, 2 + ble ZCOPYT_L2_BEGIN + + +ZCOPYT_L4_BEGIN: + + mr A0, A + add A1, A0, LDA + add A2, A1, LDA + add A3, A2, LDA + add A, A3, LDA + mr B8, B + addi B, B, 64*SIZE + + sradi. J, N, 3 + ble ZCOPYT_L4x4_BEGIN + + mr BO, B8 + + .align 5 + +ZCOPYT_L4x8_LOOP: + + addi T1, PREB, 128 + addi T2, PREB, 256 + dcbt A0, PREA + dcbt A1, PREA + dcbt A2, PREA + dcbt A3, PREA + dcbtst BO, M8 + dcbtst BO, PREB + dcbtst BO, T1 + dcbtst BO, T2 + + COPY_4x8 + + add BO, BO, M8 + + addic. J, J, -1 + bgt ZCOPYT_L4x8_LOOP + +ZCOPYT_L4x4_BEGIN: + + andi. T1, N, 4 + ble ZCOPYT_L4x2_BEGIN + + mr BO, B4 + + COPY_4x4 + + + addi B4, B4, 32*SIZE + +ZCOPYT_L4x2_BEGIN: + + andi. T1, N, 2 + ble ZCOPYT_L4x1_BEGIN + + mr BO, B2 + + COPY_4x2 + + + addi B2, B2, 16*SIZE + +ZCOPYT_L4x1_BEGIN: + + andi. T1, N, 1 + ble ZCOPYT_L4_END + + mr BO, B1 + + COPY_4x1 + + + addi B1, B1, 8*SIZE + +ZCOPYT_L4_END: + + addic. I, I, -1 + bgt ZCOPYT_L4_BEGIN + + + +ZCOPYT_L2_BEGIN: + + andi. T1, M, 2 + ble ZCOPYT_L1_BEGIN + + mr A0, A + add A1, A0, LDA + add A, A1, LDA + mr B8, B + addi B, B, 32*SIZE + + sradi. J, N, 3 + ble ZCOPYT_L2x4_BEGIN + + mr BO, B8 + +ZCOPYT_L2x8_LOOP: + + COPY_2x8 + + add BO, BO, M8 + + addic. J, J, -1 + bgt ZCOPYT_L2x8_LOOP + +ZCOPYT_L2x4_BEGIN: + + andi. T1, N, 4 + ble ZCOPYT_L2x2_BEGIN + + mr BO, B4 + + COPY_2x4 + + + addi B4, B4, 16*SIZE + +ZCOPYT_L2x2_BEGIN: + + andi. T1, N, 2 + ble ZCOPYT_L2x1_BEGIN + + mr BO, B2 + + COPY_2x2 + + + addi B2, B2, 8*SIZE + +ZCOPYT_L2x1_BEGIN: + + andi. T1, N, 1 + ble ZCOPYT_L2_END + + mr BO, B1 + + COPY_2x1 + + + addi B1, B1, 4*SIZE + +ZCOPYT_L2_END: + + +ZCOPYT_L1_BEGIN: + + andi. T1, M, 1 + ble L999 + + mr A0, A + add A, A0, LDA + mr B8, B + addi B, B, 16*SIZE + + sradi. J, N, 3 + ble ZCOPYT_L1x4_BEGIN + + mr BO, B8 + +ZCOPYT_L1x8_LOOP: + + COPY_1x8 + + add BO, BO, M8 + + addic. J, J, -1 + bgt ZCOPYT_L1x8_LOOP + +ZCOPYT_L1x4_BEGIN: + + andi. T1, N, 4 + ble ZCOPYT_L1x2_BEGIN + + mr BO, B4 + + COPY_1x4 + + + addi B4, B4, 8*SIZE + +ZCOPYT_L1x2_BEGIN: + + andi. T1, N, 2 + ble ZCOPYT_L1x1_BEGIN + + mr BO, B2 + + COPY_1x2 + + + addi B2, B2, 4*SIZE + +ZCOPYT_L1x1_BEGIN: + + andi. T1, N, 1 + ble ZCOPYT_L1_END + + mr BO, B1 + + COPY_1x1 + + + addi B1, B1, 2*SIZE + +ZCOPYT_L1_END: + diff --git a/kernel/power/zgemm_tcopy_macros_8_power8.S b/kernel/power/zgemm_tcopy_macros_8_power8.S new file mode 100644 index 0000000000..e8c2f0baa8 --- /dev/null +++ b/kernel/power/zgemm_tcopy_macros_8_power8.S @@ -0,0 +1,535 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/22 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + +/********************************************************************************************** +* Macros for N=4 and M=8 +**********************************************************************************************/ + +.macro COPY_4x8 + + lxvd2x vs32, o0, A0 + lxvd2x vs33, o16, A0 + lxvd2x vs34, o32, A0 + lxvd2x vs35, o48, A0 + addi A0, A0, 64 + + lxvd2x vs36, o0, A0 + lxvd2x vs37, o16, A0 + lxvd2x vs38, o32, A0 + lxvd2x vs39, o48, A0 + addi A0, A0, 64 + + + lxvd2x vs40, o0, A1 + lxvd2x vs41, o16, A1 + lxvd2x vs42, o32, A1 + lxvd2x vs43, o48, A1 + addi A1, A1, 64 + + lxvd2x vs44, o0, A1 + lxvd2x vs45, o16, A1 + lxvd2x vs46, o32, A1 + lxvd2x vs47, o48, A1 + addi A1, A1, 64 + + + lxvd2x vs48, o0, A2 + lxvd2x vs49, o16, A2 + lxvd2x vs50, o32, A2 + lxvd2x vs51, o48, A2 + addi A2, A2, 64 + + lxvd2x vs52, o0, A2 + lxvd2x vs53, o16, A2 + lxvd2x vs54, o32, A2 + lxvd2x vs55, o48, A2 + addi A2, A2, 64 + + + lxvd2x vs56, o0, A3 + lxvd2x vs57, o16, A3 + lxvd2x vs58, o32, A3 + lxvd2x vs59, o48, A3 + addi A3, A3, 64 + + lxvd2x vs60, o0, A3 + lxvd2x vs61, o16, A3 + lxvd2x vs62, o32, A3 + lxvd2x vs63, o48, A3 + addi A3, A3, 64 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 + addi T1, T1, 64 + + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs40, o0, T1 + stxvd2x vs41, o16, T1 + stxvd2x vs42, o32, T1 + stxvd2x vs43, o48, T1 + addi T1, T1, 64 + + stxvd2x vs44, o0, T1 + stxvd2x vs45, o16, T1 + stxvd2x vs46, o32, T1 + stxvd2x vs47, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs48, o0, T1 + stxvd2x vs49, o16, T1 + stxvd2x vs50, o32, T1 + stxvd2x vs51, o48, T1 + addi T1, T1, 64 
+ + stxvd2x vs52, o0, T1 + stxvd2x vs53, o16, T1 + stxvd2x vs54, o32, T1 + stxvd2x vs55, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs56, o0, T1 + stxvd2x vs57, o16, T1 + stxvd2x vs58, o32, T1 + stxvd2x vs59, o48, T1 + addi T1, T1, 64 + + stxvd2x vs60, o0, T1 + stxvd2x vs61, o16, T1 + stxvd2x vs62, o32, T1 + stxvd2x vs63, o48, T1 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=4 +**********************************************************************************************/ + +.macro COPY_4x4 + + lxvd2x vs32, o0, A0 + lxvd2x vs33, o16, A0 + lxvd2x vs34, o32, A0 + lxvd2x vs35, o48, A0 + addi A0, A0, 64 + + + lxvd2x vs36, o0, A1 + lxvd2x vs37, o16, A1 + lxvd2x vs38, o32, A1 + lxvd2x vs39, o48, A1 + addi A1, A1, 64 + + + lxvd2x vs40, o0, A2 + lxvd2x vs41, o16, A2 + lxvd2x vs42, o32, A2 + lxvd2x vs43, o48, A2 + addi A2, A2, 64 + + + lxvd2x vs44, o0, A3 + lxvd2x vs45, o16, A3 + lxvd2x vs46, o32, A3 + lxvd2x vs47, o48, A3 + addi A3, A3, 64 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs40, o0, T1 + stxvd2x vs41, o16, T1 + stxvd2x vs42, o32, T1 + stxvd2x vs43, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs44, o0, T1 + stxvd2x vs45, o16, T1 + stxvd2x vs46, o32, T1 + stxvd2x vs47, o48, T1 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=2 +**********************************************************************************************/ + +.macro COPY_4x2 + + lxvd2x vs32, o0, A0 + lxvd2x vs33, o16, A0 + addi A0, A0, 32 + + + lxvd2x vs34, o0, A1 + lxvd2x vs35, o16, A1 + addi A1, A1, 32 + + + lxvd2x vs36, o0, A2 + lxvd2x vs37, o16, A2 + addi A2, A2, 32 + + + lxvd2x vs38, o0, A3 + lxvd2x vs39, o16, A3 + addi A3, A3, 32 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=1 +**********************************************************************************************/ + +.macro COPY_4x1 + + lxvd2x vs32, o0, A0 + addi A0, A0, 16 + + + lxvd2x vs33, o0, A1 + addi A1, A1, 16 + + + lxvd2x vs34, o0, A2 + addi A2, A2, 16 + + + lxvd2x vs35, o0, A3 + addi A3, A3, 16 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + + stxvd2x vs33, o16, T1 + + stxvd2x vs34, o32, T1 + + stxvd2x vs35, o48, T1 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=8 +**********************************************************************************************/ + +.macro COPY_2x8 + + lxvd2x vs32, o0, A0 + lxvd2x vs33, o16, A0 + lxvd2x vs34, o32, A0 + lxvd2x vs35, o48, A0 + addi A0, A0, 64 + + lxvd2x vs36, o0, A0 + lxvd2x vs37, o16, A0 + lxvd2x vs38, o32, A0 + lxvd2x vs39, o48, A0 + addi A0, A0, 64 + + + lxvd2x vs40, o0, A1 + lxvd2x vs41, o16, A1 + lxvd2x vs42, o32, A1 + lxvd2x vs43, o48, A1 + addi A1, A1, 64 + + lxvd2x vs44, o0, A1 + lxvd2x vs45, o16, A1 + lxvd2x vs46, o32, A1 + lxvd2x vs47, o48, A1 + addi A1, A1, 64 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 
+ stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 + addi T1, T1, 64 + + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs40, o0, T1 + stxvd2x vs41, o16, T1 + stxvd2x vs42, o32, T1 + stxvd2x vs43, o48, T1 + addi T1, T1, 64 + + stxvd2x vs44, o0, T1 + stxvd2x vs45, o16, T1 + stxvd2x vs46, o32, T1 + stxvd2x vs47, o48, T1 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=4 +**********************************************************************************************/ + +.macro COPY_2x4 + + lxvd2x vs32, o0, A0 + lxvd2x vs33, o16, A0 + lxvd2x vs34, o32, A0 + lxvd2x vs35, o48, A0 + addi A0, A0, 64 + + + lxvd2x vs36, o0, A1 + lxvd2x vs37, o16, A1 + lxvd2x vs38, o32, A1 + lxvd2x vs39, o48, A1 + addi A1, A1, 64 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=2 +**********************************************************************************************/ + +.macro COPY_2x2 + + lxvd2x vs32, o0, A0 + lxvd2x vs33, o16, A0 + addi A0, A0, 32 + + + lxvd2x vs34, o0, A1 + lxvd2x vs35, o16, A1 + addi A1, A1, 32 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=1 +**********************************************************************************************/ + +.macro COPY_2x1 + + lxvd2x vs32, o0, A0 + addi A0, A0, 16 + + + lxvd2x vs33, o0, A1 + addi A1, A1, 16 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + + stxvd2x vs33, o16, T1 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=8 +**********************************************************************************************/ + +.macro COPY_1x8 + + lxvd2x vs32, o0, A0 + lxvd2x vs33, o16, A0 + lxvd2x vs34, o32, A0 + lxvd2x vs35, o48, A0 + addi A0, A0, 64 + + lxvd2x vs36, o0, A0 + lxvd2x vs37, o16, A0 + lxvd2x vs38, o32, A0 + lxvd2x vs39, o48, A0 + addi A0, A0, 64 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 + addi T1, T1, 64 + + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=4 +**********************************************************************************************/ + +.macro COPY_1x4 + + lxvd2x vs32, o0, A0 + lxvd2x vs33, o16, A0 + lxvd2x vs34, o32, A0 + lxvd2x vs35, o48, A0 + addi A0, A0, 64 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=2 +**********************************************************************************************/ + +.macro COPY_1x2 + + lxvd2x vs32, o0, A0 + lxvd2x vs33, o16, A0 + addi A0, A0, 32 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + stxvd2x 
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=1
+**********************************************************************************************/
+
+.macro COPY_1x1
+
+ lxvd2x vs32, o0, A0
+ addi A0, A0, 16
+
+
+ mr T1, BO
+
+ stxvd2x vs32, o0, T1
+
+.endm
+
diff --git a/kernel/power/zscal.c b/kernel/power/zscal.c
index 213839a8f3..410fc9840a 100644
--- a/kernel/power/zscal.c
+++ b/kernel/power/zscal.c
@@ -39,8 +39,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #pragma GCC optimize "O1"
 #if defined(POWER8)
+#if defined(DOUBLE)
 #include "zscal_microk_power8.c"
 #endif
+#endif
 #ifndef HAVE_KERNEL_8
@@ -123,6 +125,21 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
 if ( inc_x <= 0 )
 return(0);
+ if (da_r == ZERO && da_i == ZERO) {
+ //clear the vector and return
+ if (inc_x == 1) {
+ memset(x, 0, n*COMPSIZE*SIZE);
+ }else{
+ inc_x2 = 2 * inc_x;
+ for(i=0; i a END PROGRAM ")
- try_compile( SIZEOF_${_TYPE_NAME} ${CMAKE_BINARY_DIR} ${__TEST_FILE} )
+ try_compile( SIZEOF_${_TYPE_NAME} ${PROJECT_BINARY_DIR} ${__TEST_FILE} )
 if( SIZEOF_${_TYPE_NAME} )
 message( STATUS "Testing default ${_TYPE_NAME}*${__TEST_SIZE} - found" )
 set( SIZEOF_${_TYPE_NAME} ${__TEST_SIZE} CACHE INTERNAL "Size of the default ${_TYPE_NAME} type" FORCE )
diff --git a/lapack-netlib/CMAKE/CheckTimeFunction.cmake b/lapack-netlib/CMAKE/CheckTimeFunction.cmake
index 350a591324..1a65f242bc 100644
--- a/lapack-netlib/CMAKE/CheckTimeFunction.cmake
+++ b/lapack-netlib/CMAKE/CheckTimeFunction.cmake
@@ -16,11 +16,11 @@ macro(CHECK_TIME_FUNCTION FUNCTION VARIABLE)
 if(RES)
 set(${VARIABLE} ${FUNCTION} CACHE INTERNAL "Have Fortran function ${FUNCTION}")
 message(STATUS "Looking for Fortran ${FUNCTION} - found")
- file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeOutput.log
+ file(APPEND ${PROJECT_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeOutput.log
 "Fortran ${FUNCTION} exists. ${OUTPUT} \n\n")
 else(RES)
 message(STATUS "Looking for Fortran ${FUNCTION} - not found")
- file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeError.log
+ file(APPEND ${PROJECT_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeError.log
 "Fortran ${FUNCTION} does not exist. \n ${OUTPUT} \n")
\n ${OUTPUT} \n") endif(RES) endmacro(CHECK_TIME_FUNCTION) diff --git a/lapack-netlib/CMAKE/FortranMangling.cmake b/lapack-netlib/CMAKE/FortranMangling.cmake index 98b8443ef4..538c80218c 100644 --- a/lapack-netlib/CMAKE/FortranMangling.cmake +++ b/lapack-netlib/CMAKE/FortranMangling.cmake @@ -43,7 +43,7 @@ MESSAGE(STATUS "Testing FORTRAN_MANGLING") MESSAGE(STATUS "Compiling Finface.f...") execute_process ( COMMAND ${CMAKE_Fortran_COMPILER} ${F77_OPTION_COMPILE} ${PROJECT_SOURCE_DIR}/lapacke/mangling/Fintface.f - WORKING_DIRECTORY ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp + WORKING_DIRECTORY ${PROJECT_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp OUTPUT_VARIABLE OUTPUT RESULT_VARIABLE RESULT ERROR_VARIABLE ERROR) @@ -58,7 +58,7 @@ MESSAGE(STATUS "Compiling Finface.f...") MESSAGE(STATUS "Compiling Cintface.c...") execute_process ( COMMAND ${CMAKE_C_COMPILER} ${F77_OPTION_COMPILE} ${PROJECT_SOURCE_DIR}/lapacke/mangling/Cintface.c - WORKING_DIRECTORY ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp + WORKING_DIRECTORY ${PROJECT_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp OUTPUT_VARIABLE OUTPUT RESULT_VARIABLE RESULT ERROR_VARIABLE ERROR) @@ -73,7 +73,7 @@ MESSAGE(STATUS "Compiling Cintface.c...") MESSAGE(STATUS "Linking Finface.f and Cintface.c...") execute_process ( COMMAND ${CMAKE_Fortran_COMPILER} ${F77_OUTPUT_OBJ} xintface.exe Fintface.o Cintface.o - WORKING_DIRECTORY ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp + WORKING_DIRECTORY ${PROJECT_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp OUTPUT_VARIABLE OUTPUT RESULT_VARIABLE RESULT ERROR_VARIABLE ERROR) @@ -88,7 +88,7 @@ MESSAGE(STATUS "Linking Finface.f and Cintface.c...") MESSAGE(STATUS "Running ./xintface...") execute_process ( COMMAND ./xintface.exe - WORKING_DIRECTORY ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp + WORKING_DIRECTORY ${PROJECT_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp RESULT_VARIABLE xintface_RES OUTPUT_VARIABLE xintface_OUT ERROR_VARIABLE xintface_ERR) diff --git a/lapack-netlib/SRC/zgetrf2.f b/lapack-netlib/SRC/zgetrf2.f index 290d4847e7..7d28b58129 100644 --- a/lapack-netlib/SRC/zgetrf2.f +++ b/lapack-netlib/SRC/zgetrf2.f @@ -144,7 +144,7 @@ RECURSIVE SUBROUTINE ZGETRF2( M, N, A, LDA, IPIV, INFO ) EXTERNAL DLAMCH, IZAMAX * .. * .. External Subroutines .. - EXTERNAL ZGEMM, ZSCAL, ZLASWP, ZTRSM, ZERBLA + EXTERNAL ZGEMM, ZSCAL, ZLASWP, ZTRSM, XERBLA * .. * .. Intrinsic Functions .. 
 INTRINSIC MAX, MIN
diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt
index de42e1ab66..afd583c113 100644
--- a/lapack/CMakeLists.txt
+++ b/lapack/CMakeLists.txt
@@ -1,5 +1,5 @@
-include_directories(${CMAKE_SOURCE_DIR})
+include_directories(${PROJECT_SOURCE_DIR})
 set(LAPACK_SOURCES
diff --git a/lapack/getrf/getrf_parallel_omp.c b/lapack/getrf/getrf_parallel_omp.c
index 7e23197186..6b8cbda2f5 100644
--- a/lapack/getrf/getrf_parallel_omp.c
+++ b/lapack/getrf/getrf_parallel_omp.c
@@ -173,10 +173,17 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
 blocking = (mn / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);
 if (blocking > GEMM_Q) blocking = GEMM_Q;
- if (blocking <= GEMM_UNROLL_N * 2) {
+#ifdef POWER8
+ if (blocking <= GEMM_UNROLL_N) {
 info = GETF2(args, NULL, range_n, sa, sb, 0);
 return info;
 }
+#else
+ if (blocking <= GEMM_UNROLL_N*2) {
+ info = GETF2(args, NULL, range_n, sa, sb, 0);
+ return info;
+ }
+#endif
 sbb = (FLOAT *)((((BLASULONG)(sb + blocking * blocking * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
diff --git a/lapack/getrf/getrf_single.c b/lapack/getrf/getrf_single.c
index e60a16c116..9f0f36b78b 100644
--- a/lapack/getrf/getrf_single.c
+++ b/lapack/getrf/getrf_single.c
@@ -77,10 +77,17 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
 blocking = (mn / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);
 if (blocking > GEMM_Q) blocking = GEMM_Q;
+#ifdef POWER8
+ if (blocking <= GEMM_UNROLL_N) {
+ info = GETF2(args, NULL, range_n, sa, sb, 0);
+ return info;
+ }
+#else
 if (blocking <= GEMM_UNROLL_N * 2) {
 info = GETF2(args, NULL, range_n, sa, sb, 0);
 return info;
 }
+#endif
 sbb = (FLOAT *)((((BLASULONG)(sb + blocking * blocking * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
diff --git a/lapack/laswp/mips/Makefile b/lapack/laswp/mips/Makefile
new file mode 100644
index 0000000000..75411deb54
--- /dev/null
+++ b/lapack/laswp/mips/Makefile
@@ -0,0 +1,13 @@
+TOPDIR = ../../..
+include ../../../Makefile.system
+
+ifndef LASWP
+LASWP = ../generic/laswp_k.c
+endif
+
+ifndef ZLASWP
+ZLASWP = ../generic/zlaswp_k.c
+endif
+
+include ../generic/Makefile
+
diff --git a/param.h b/param.h
index a6ead4b64e..480518cd4b 100644
--- a/param.h
+++ b/param.h
@@ -1964,9 +1964,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SNUMOPT 16
 #define DNUMOPT 8
-#define GEMM_DEFAULT_OFFSET_A 4096
-#define GEMM_DEFAULT_OFFSET_B 4096
-#define GEMM_DEFAULT_ALIGN 0x03fffUL
+#define GEMM_DEFAULT_OFFSET_A 0
+#define GEMM_DEFAULT_OFFSET_B 65536
+#define GEMM_DEFAULT_ALIGN 0x0ffffUL
 #define SGEMM_DEFAULT_UNROLL_M 16
 #define SGEMM_DEFAULT_UNROLL_N 8
@@ -1977,20 +1977,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define ZGEMM_DEFAULT_UNROLL_M 8
 #define ZGEMM_DEFAULT_UNROLL_N 2
-#define SGEMM_DEFAULT_P 960
-#define DGEMM_DEFAULT_P 480
-#define CGEMM_DEFAULT_P 720
-#define ZGEMM_DEFAULT_P 480
+#define SGEMM_DEFAULT_P 1280
+#define DGEMM_DEFAULT_P 640
+#define CGEMM_DEFAULT_P 640
+#define ZGEMM_DEFAULT_P 320
-#define SGEMM_DEFAULT_Q 720
+#define SGEMM_DEFAULT_Q 640
 #define DGEMM_DEFAULT_Q 720
-#define CGEMM_DEFAULT_Q 720
-#define ZGEMM_DEFAULT_Q 720
-
-#define SGEMM_DEFAULT_R 21600
-#define DGEMM_DEFAULT_R 14400
-#define CGEMM_DEFAULT_R 16200
-#define ZGEMM_DEFAULT_R 21600
+#define CGEMM_DEFAULT_Q 640
+#define ZGEMM_DEFAULT_Q 640
 #define SYMV_P 8
@@ -2179,6 +2174,57 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SYMV_P 16
 #endif
+#if defined(P5600) || defined(I6400) || defined(P6600)
+#define SNUMOPT 2
+#define DNUMOPT 2
+
+#define GEMM_DEFAULT_OFFSET_A 0
+#define GEMM_DEFAULT_OFFSET_B 0
+#define GEMM_DEFAULT_ALIGN 0x03fffUL
+
+#ifdef HAVE_MSA
+#define SGEMM_DEFAULT_UNROLL_M 8
+#define SGEMM_DEFAULT_UNROLL_N 8
+
+#define DGEMM_DEFAULT_UNROLL_M 8
+#define DGEMM_DEFAULT_UNROLL_N 4
+
+#define CGEMM_DEFAULT_UNROLL_M 8
+#define CGEMM_DEFAULT_UNROLL_N 4
+
+#define ZGEMM_DEFAULT_UNROLL_M 4
+#define ZGEMM_DEFAULT_UNROLL_N 4
+#else
+#define SGEMM_DEFAULT_UNROLL_M 2
+#define SGEMM_DEFAULT_UNROLL_N 2
+
+#define DGEMM_DEFAULT_UNROLL_M 2
+#define DGEMM_DEFAULT_UNROLL_N 2
+
+#define CGEMM_DEFAULT_UNROLL_M 2
+#define CGEMM_DEFAULT_UNROLL_N 2
+
+#define ZGEMM_DEFAULT_UNROLL_M 2
+#define ZGEMM_DEFAULT_UNROLL_N 2
+#endif
+
+#define SGEMM_DEFAULT_P 128
+#define DGEMM_DEFAULT_P 128
+#define CGEMM_DEFAULT_P 96
+#define ZGEMM_DEFAULT_P 64
+
+#define SGEMM_DEFAULT_Q 240
+#define DGEMM_DEFAULT_Q 120
+#define CGEMM_DEFAULT_Q 120
+#define ZGEMM_DEFAULT_Q 120
+
+#define SGEMM_DEFAULT_R 12288
+#define DGEMM_DEFAULT_R 8192
+#define CGEMM_DEFAULT_R 4096
+#define ZGEMM_DEFAULT_R 4096
+
+#define SYMV_P 16
+#endif
 #ifdef ARMV7
 #define SNUMOPT 2
@@ -2269,13 +2315,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GEMM_DEFAULT_OFFSET_B 0
 #define GEMM_DEFAULT_ALIGN 0x03fffUL
-#define SGEMM_DEFAULT_UNROLL_M 4
+#define SGEMM_DEFAULT_UNROLL_M 16
 #define SGEMM_DEFAULT_UNROLL_N 4
-#define DGEMM_DEFAULT_UNROLL_M 4
+#define DGEMM_DEFAULT_UNROLL_M 8
 #define DGEMM_DEFAULT_UNROLL_N 4
-#define CGEMM_DEFAULT_UNROLL_M 4
+#define CGEMM_DEFAULT_UNROLL_M 8
 #define CGEMM_DEFAULT_UNROLL_N 4
 #define ZGEMM_DEFAULT_UNROLL_M 4
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index cd4497117d..5e9baf9280 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -1,4 +1,4 @@
-include_directories(${CMAKE_SOURCE_DIR})
+include_directories(${PROJECT_SOURCE_DIR})
 enable_language(Fortran)
diff --git a/test/Makefile b/test/Makefile
index 75ea6de604..65fb6f4387 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -4,6 +4,7 @@ include ../Makefile.system
 all :: level1 level2 level3
 level1 : sblat1 dblat1 cblat1 zblat1
+ifndef CROSS
 OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat1
 OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./dblat1
 OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat1
@@ -21,8 +22,10 @@ else
 OPENBLAS_NUM_THREADS=2 ./zblat1
 endif
 endif
+endif
 level2 : sblat2 dblat2 cblat2 zblat2
+ifndef CROSS
 rm -f ?BLAT2.SUMM
 OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat2 < ./sblat2.dat
 @$(GREP) -q FATAL SBLAT2.SUMM && cat SBLAT2.SUMM || exit 0
@@ -54,8 +57,10 @@ else
 @$(GREP) -q FATAL ZBLAT2.SUMM && cat ZBLAT2.SUMM || exit 0
 endif
 endif
+endif
 level3 : sblat3 dblat3 cblat3 zblat3
+ifndef CROSS
 rm -f ?BLAT3.SUMM
 OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat3 < ./sblat3.dat
 @$(GREP) -q FATAL SBLAT3.SUMM && cat SBLAT3.SUMM || exit 0
@@ -87,9 +92,11 @@ else
 @$(GREP) -q FATAL ZBLAT3.SUMM && cat ZBLAT3.SUMM || exit 0
 endif
 endif
+endif
 level3_3m : zblat3_3m cblat3_3m
+ifndef CROSS
 rm -f ?BLAT3_3M.SUMM
 OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat3_3m < ./cblat3_3m.dat
 @$(GREP) -q FATAL CBLAT3_3M.SUMM && cat CBLAT3_3M.SUMM || exit 0
@@ -109,6 +116,7 @@ else
 @$(GREP) -q FATAL ZBLAT3_3M.SUMM && cat ZBLAT3_3M.SUMM || exit 0
 endif
 endif
+endif
diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt
index dfa42df671..f0ffee0888 100644
--- a/utest/CMakeLists.txt
+++ b/utest/CMakeLists.txt
@@ -1,4 +1,4 @@
-include_directories(${CMAKE_SOURCE_DIR})
+include_directories(${PROJECT_SOURCE_DIR})
 set(OpenBLAS_utest_src
 utest_main.c
diff --git a/utest/Makefile b/utest/Makefile
index 9f98089205..3ccc0a041d 100644
--- a/utest/Makefile
+++ b/utest/Makefile
@@ -21,7 +21,9 @@ $(UTESTBIN): $(OBJS)
 $(CC) $(CFLAGS) -o $@ $^ ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB)
 run_test: $(UTESTBIN)
+ifndef CROSS
 ./$(UTESTBIN)
+endif
 clean:
 -rm -f *.o $(UTESTBIN)
diff --git a/utest/ctest.h b/utest/ctest.h
index a62103ff52..1deea32f6a 100644
--- a/utest/ctest.h
+++ b/utest/ctest.h
@@ -637,7 +637,7 @@ static void *find_symbol(struct ctest *test, const char *fname)
 static void sighandler(int signum)
 {
 char msg[128];
- sprintf(msg, "[SIGNAL %d: %s]", signum, sys_siglist[signum]);
+ snprintf(msg, sizeof(msg), "[SIGNAL %d: %s]", signum, strsignal(signum));
 color_print(ANSI_BRED, msg);
 fflush(stdout);
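
Note on the final utest/ctest.h hunk: the signal message is now built with a bounded snprintf() and POSIX strsignal() instead of sprintf() and the BSD-style sys_siglist[] array, which not every libc exports. The stand-alone C sketch below is illustrative only; it is not part of the patch, and the helper name report_signal is made up for the example:

/* Illustrative sketch (not from the patch): the portable pattern the
 * ctest.h change switches to. strsignal() is POSIX.1-2008 and declared
 * in <string.h>; snprintf() bounds the write to the buffer size. */
#define _POSIX_C_SOURCE 200809L
#include <signal.h>
#include <stdio.h>
#include <string.h>

static void report_signal(int signum)
{
    char msg[128];

    /* writes at most sizeof(msg) bytes; strsignal() returns a readable
       description such as "Segmentation fault" for the signal number */
    snprintf(msg, sizeof(msg), "[SIGNAL %d: %s]", signum, strsignal(signum));
    puts(msg);
}

int main(void)
{
    report_signal(SIGINT);   /* e.g. prints "[SIGNAL 2: Interrupt]" */
    report_signal(SIGSEGV);  /* e.g. prints "[SIGNAL 11: Segmentation fault]" */
    return 0;
}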