From 0fd8363adc3f248820f3908d5845c61c2fa36f6f Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Thu, 20 Apr 2023 02:04:00 +0300 Subject: [PATCH 01/51] use hipblas based on cublas --- CMakeLists.txt | 26 ++++++++++++++++++++++++++ Makefile | 4 ++++ ggml-cuda.cu | 6 ++++++ ggml.c | 35 +++++++++++++++++++++++++++++++++-- 4 files changed, 69 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1f9fdd30f0830..57cce9bb05d76 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -67,6 +67,7 @@ endif() option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON) option(LLAMA_OPENBLAS "llama: use OpenBLAS" OFF) option(LLAMA_CUBLAS "llama: use cuBLAS" OFF) +option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF) option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) @@ -168,6 +169,31 @@ if (LLAMA_CUBLAS) endif() endif() +if (LLAMA_HIPBLAS) + cmake_minimum_required(VERSION 3.21) + + find_package(hip) + find_package(hipblas) + + if (hipblas_FOUND) + message(STATUS "hipBLAS found") + + set(LLAMA_HIPBLAS_PLATFORM "AMD" CACHE STRING "hip device type" FORCE) + set_property(CACHE LLAMA_HIPBLAS_PLATFORM PROPERTY STRINGS "AMD" "NVIDIA") + + add_compile_definitions(GGML_USE_HIPBLAS "__HIP_PLATFORM_${LLAMA_HIPBLAS_PLATFORM}__") + + add_library(ggml-hip OBJECT ggml-cuda.cu) + set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX) + target_link_libraries(ggml-hip hip::device) + + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} hip::host roc::hipblas ggml-hip) + + else() + message(WARNING "hipBLAS not found") + endif() +endif() + if (LLAMA_ALL_WARNINGS) if (NOT MSVC) set(c_flags diff --git a/Makefile b/Makefile index f267d086415ee..d2f30e1cc9674 100644 --- a/Makefile +++ b/Makefile @@ -107,6 +107,10 @@ ifdef LLAMA_CUBLAS ggml-cuda.o: ggml-cuda.cu ggml-cuda.h nvcc -arch=native -c -o $@ $< endif +ifdef LLAMA_HIPBLAS + CFLAGS += -DGGML_USE_HIPBLAS -D__HIP_PLATFORM_AMD__ -I/opt/rocm/include + LDFLAGS += -lhipblas -lamdhip64 -L/opt/rocm/lib +endif ifdef LLAMA_GPROF CFLAGS += -pg CXXFLAGS += -pg diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 0baa989a36ca9..90830e5fd2976 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -1,5 +1,11 @@ #include +#if defined(__HIP_PLATFORM_AMD__) +#include "hip/hip_runtime.h" +#define cudaStream_t hipStream_t +#define __half _Float16 +#else #include +#endif #include "ggml-cuda.h" typedef uint16_t ggml_fp16_t; diff --git a/ggml.c b/ggml.c index da0f5d1d549ab..23befa297136d 100644 --- a/ggml.c +++ b/ggml.c @@ -147,9 +147,41 @@ inline static void* ggml_aligned_malloc(size_t size) { #include #elif defined(GGML_USE_OPENBLAS) #include -#elif defined(GGML_USE_CUBLAS) +#elif defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS) + +#if defined(GGML_USE_HIPBLAS) +#include "hipblas/hipblas.h" +#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F +#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT +#define CUBLAS_OP_N HIPBLAS_OP_N +#define CUBLAS_OP_T HIPBLAS_OP_T +#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#define cublasCreate hipblasCreate +#define cublasGemmEx hipblasGemmEx +#define cublasHandle_t hipblasHandle_t +#define cublasSetStream hipblasSetStream +#define cublasSgemm hipblasSgemm +#define cublasStatus_t hipblasStatus_t +#define CUDA_R_16F HIPBLAS_R_16F +#define CUDA_R_32F HIPBLAS_R_32F +#define cudaError_t hipError_t +#define cudaFree hipFree +#define cudaGetErrorString hipGetErrorString +#define cudaGetLastError hipGetLastError +#define cudaMalloc hipMalloc +#define cudaMemcpyAsync hipMemcpyAsync +#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost +#define cudaMemcpyHostToDevice hipMemcpyHostToDevice +#define cudaStream_t hipStream_t +#define cudaStreamCreateWithFlags hipStreamCreateWithFlags +#define cudaStreamNonBlocking hipStreamNonBlocking +#define cudaStreamSynchronize hipStreamSynchronize +#define cudaSuccess hipSuccess +#define GGML_USE_CUBLAS +#else #include #include +#endif #include "ggml-cuda.h" #define CUDA_CHECK(err) \ @@ -8073,7 +8105,6 @@ static void ggml_compute_forward_mul_mat_q_f32( const float * x = wdata; #endif - #if defined(GGML_USE_CUBLAS) // copy data to device CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(float) * y_ne, cudaMemcpyHostToDevice, cudaStream)); From 54a63c10e85bf454eb1ea99cc27d89cce06144b6 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Thu, 20 Apr 2023 22:19:22 +0300 Subject: [PATCH 02/51] Update Makefile for the Cuda kernels --- Makefile | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index d2f30e1cc9674..8e0ada581e698 100644 --- a/Makefile +++ b/Makefile @@ -108,8 +108,15 @@ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h nvcc -arch=native -c -o $@ $< endif ifdef LLAMA_HIPBLAS - CFLAGS += -DGGML_USE_HIPBLAS -D__HIP_PLATFORM_AMD__ -I/opt/rocm/include - LDFLAGS += -lhipblas -lamdhip64 -L/opt/rocm/lib + ROCMPATH?= /opt/rocm + CFLAGS += -DGGML_USE_HIPBLAS -D__HIP_PLATFORM_AMD__ -I$(ROCMPATH)/include + CXXFLAGS+= -D__HIP_PLATFORM_AMD__ -I$(ROCMPATH)/include + HIPFLAGS?= -amdgpu-early-inline-all=true -amdgpu-function-calls=false -march=native + LDFLAGS += -lhipblas -lamdhip64 -L$(ROCMPATH)/lib + HIPCC ?= $(ROCMPATH)/bin/hipcc + OBJS += ggml-cuda.o +ggml-cuda.o: ggml-cuda.cu ggml-cuda.h + $(HIPCC) $(CXXFLAGS) -x hip $(HIPFLAGS) -c -o $@ $< endif ifdef LLAMA_GPROF CFLAGS += -pg From 0e005f779357c9594b942adaf8d985edb071642a Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Fri, 21 Apr 2023 02:13:00 +0300 Subject: [PATCH 03/51] Build file changes Now HIP Clang is not required, the CMake scripts will configure the needed compiler, which can be system clang++. Also other code can still use GCC, but CMake will force the clang to link. --- CMakeLists.txt | 24 ++++++++++-------------- Makefile | 13 ++++++------- 2 files changed, 16 insertions(+), 21 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 57cce9bb05d76..cea51078dcdd9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -170,27 +170,23 @@ if (LLAMA_CUBLAS) endif() if (LLAMA_HIPBLAS) - cmake_minimum_required(VERSION 3.21) - find_package(hip) find_package(hipblas) - if (hipblas_FOUND) + if (${hipblas_FOUND} AND ${hip_FOUND}) message(STATUS "hipBLAS found") + add_compile_definitions(GGML_USE_HIPBLAS) + enable_language(HIP) + add_library(ggml-hip OBJECT ggml-cuda.cu ggml-cuda.h) + set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE HIP) + target_link_libraries(ggml-hip PRIVATE hip::device) - set(LLAMA_HIPBLAS_PLATFORM "AMD" CACHE STRING "hip device type" FORCE) - set_property(CACHE LLAMA_HIPBLAS_PLATFORM PROPERTY STRINGS "AMD" "NVIDIA") - - add_compile_definitions(GGML_USE_HIPBLAS "__HIP_PLATFORM_${LLAMA_HIPBLAS_PLATFORM}__") - - add_library(ggml-hip OBJECT ggml-cuda.cu) - set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX) - target_link_libraries(ggml-hip hip::device) - + if (LLAMA_STATIC) + message(FATAL_ERROR "Static linking not supported for HIP/ROCm") + endif() set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} hip::host roc::hipblas ggml-hip) - else() - message(WARNING "hipBLAS not found") + message(WARNING "hipBLAS or HIP not found. Try setting CMAKE_PREFIX_PATH=/opt/rocm") endif() endif() diff --git a/Makefile b/Makefile index 8e0ada581e698..5b856a3677b96 100644 --- a/Makefile +++ b/Makefile @@ -108,13 +108,12 @@ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h nvcc -arch=native -c -o $@ $< endif ifdef LLAMA_HIPBLAS - ROCMPATH?= /opt/rocm - CFLAGS += -DGGML_USE_HIPBLAS -D__HIP_PLATFORM_AMD__ -I$(ROCMPATH)/include - CXXFLAGS+= -D__HIP_PLATFORM_AMD__ -I$(ROCMPATH)/include - HIPFLAGS?= -amdgpu-early-inline-all=true -amdgpu-function-calls=false -march=native - LDFLAGS += -lhipblas -lamdhip64 -L$(ROCMPATH)/lib - HIPCC ?= $(ROCMPATH)/bin/hipcc - OBJS += ggml-cuda.o + ROCM_PATH ?= /opt/rocm + LDFLAGS += -lhipblas -lamdhip64 -L$(ROCM_PATH)/lib + HIPCC ?= $(ROCM_PATH)/bin/hipcc + OBJS += ggml-cuda.o +ggml.o: CFLAGS += -DGGML_USE_HIPBLAS -D__HIP_PLATFORM_AMD__ -I$(ROCM_PATH)/include +ggml-cuda.o: CXXFLAGS += -march=native -D__HIP_PLATFORM_AMD__ -I$(ROCMPATH)/include ggml-cuda.o: ggml-cuda.cu ggml-cuda.h $(HIPCC) $(CXXFLAGS) -x hip $(HIPFLAGS) -c -o $@ $< endif From d3e1984ce0df5af62ab69c1bdd55a743af4157cc Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Fri, 21 Apr 2023 03:32:06 +0300 Subject: [PATCH 04/51] add rpath --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 5b856a3677b96..d27f716ec1fe1 100644 --- a/Makefile +++ b/Makefile @@ -109,7 +109,7 @@ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h endif ifdef LLAMA_HIPBLAS ROCM_PATH ?= /opt/rocm - LDFLAGS += -lhipblas -lamdhip64 -L$(ROCM_PATH)/lib + LDFLAGS += -lhipblas -lamdhip64 -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib HIPCC ?= $(ROCM_PATH)/bin/hipcc OBJS += ggml-cuda.o ggml.o: CFLAGS += -DGGML_USE_HIPBLAS -D__HIP_PLATFORM_AMD__ -I$(ROCM_PATH)/include From 367723544c2187a2a6cd5954ca37a8faf5335e5a Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Sat, 22 Apr 2023 23:28:00 +0300 Subject: [PATCH 05/51] More build file changes --- CMakeLists.txt | 15 ++++++++++++--- Makefile | 21 ++++++++++++--------- 2 files changed, 24 insertions(+), 12 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index cea51078dcdd9..2c1958f6acfbc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -170,16 +170,24 @@ if (LLAMA_CUBLAS) endif() if (LLAMA_HIPBLAS) + list(APPEND CMAKE_PREFIX_PATH /opt/rocm) + + if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang") + message(WARNING "Only LLVM is supported for HIP, hint: CC=/opt/rocm/llvm/bin/clang") + endif() + if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang") + message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++") + endif() + find_package(hip) find_package(hipblas) if (${hipblas_FOUND} AND ${hip_FOUND}) message(STATUS "hipBLAS found") add_compile_definitions(GGML_USE_HIPBLAS) - enable_language(HIP) add_library(ggml-hip OBJECT ggml-cuda.cu ggml-cuda.h) - set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE HIP) - target_link_libraries(ggml-hip PRIVATE hip::device) + set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX) + target_link_libraries(ggml-hip PUBLIC hip::device) if (LLAMA_STATIC) message(FATAL_ERROR "Static linking not supported for HIP/ROCm") @@ -188,6 +196,7 @@ if (LLAMA_HIPBLAS) else() message(WARNING "hipBLAS or HIP not found. Try setting CMAKE_PREFIX_PATH=/opt/rocm") endif() + endif() if (LLAMA_ALL_WARNINGS) diff --git a/Makefile b/Makefile index d27f716ec1fe1..5339d5765082c 100644 --- a/Makefile +++ b/Makefile @@ -13,8 +13,8 @@ ifndef UNAME_M UNAME_M := $(shell uname -m) endif -CCV := $(shell $(CC) --version | head -n 1) -CXXV := $(shell $(CXX) --version | head -n 1) +CCV = $(shell $(CC) --version | head -n 1) +CXXV = $(shell $(CXX) --version | head -n 1) # Mac OS + Arm can report x86_64 # ref: /~https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789 @@ -108,14 +108,17 @@ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h nvcc -arch=native -c -o $@ $< endif ifdef LLAMA_HIPBLAS - ROCM_PATH ?= /opt/rocm - LDFLAGS += -lhipblas -lamdhip64 -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib - HIPCC ?= $(ROCM_PATH)/bin/hipcc - OBJS += ggml-cuda.o -ggml.o: CFLAGS += -DGGML_USE_HIPBLAS -D__HIP_PLATFORM_AMD__ -I$(ROCM_PATH)/include -ggml-cuda.o: CXXFLAGS += -march=native -D__HIP_PLATFORM_AMD__ -I$(ROCMPATH)/include + ROCM_PATH ?= /opt/rocm + CC := $(ROCM_PATH)/llvm/bin/clang + CXX := $(ROCM_PATH)/llvm/bin/clang++ + GPU_TARGETS!= $(ROCM_PATH)/llvm/bin/offload-arch + CFLAGS += -DGGML_USE_HIPBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) + CXXFLAGS += -DGGML_USE_HIPBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) + LDFLAGS += -L/opt/rocm/lib -lhipblas -lamdhip64 + OBJS += ggml-cuda.o +ggml-cuda.o: CXXFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) ggml-cuda.o: ggml-cuda.cu ggml-cuda.h - $(HIPCC) $(CXXFLAGS) -x hip $(HIPFLAGS) -c -o $@ $< + $(CXX) $(CXXFLAGS) -x hip -c -o $@ $< endif ifdef LLAMA_GPROF CFLAGS += -pg From 3a004b2a0166e412d8d54052c50bfd093611ad95 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Mon, 24 Apr 2023 02:24:54 +0300 Subject: [PATCH 06/51] add rpath --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index e9b9018acd726..4af8aa78ad878 100644 --- a/Makefile +++ b/Makefile @@ -120,7 +120,7 @@ ifdef LLAMA_HIPBLAS GPU_TARGETS!= $(ROCM_PATH)/llvm/bin/offload-arch CFLAGS += -DGGML_USE_HIPBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) CXXFLAGS += -DGGML_USE_HIPBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) - LDFLAGS += -L/opt/rocm/lib -lhipblas -lamdhip64 + LDFLAGS += -L/opt/rocm/lib -Wl,-rpath=$(ROCM_PATH)/lib -lhipblas -lamdhip64 OBJS += ggml-cuda.o ggml-cuda.o: CXXFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) ggml-cuda.o: ggml-cuda.cu ggml-cuda.h From 608aa33d9f0ee8a7183ed4f9fb62532a65f5b097 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Tue, 25 Apr 2023 21:15:04 +0300 Subject: [PATCH 07/51] change default GPU arch to match CMake --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 4af8aa78ad878..bd0139ed4c695 100644 --- a/Makefile +++ b/Makefile @@ -117,7 +117,7 @@ ifdef LLAMA_HIPBLAS ROCM_PATH ?= /opt/rocm CC := $(ROCM_PATH)/llvm/bin/clang CXX := $(ROCM_PATH)/llvm/bin/clang++ - GPU_TARGETS!= $(ROCM_PATH)/llvm/bin/offload-arch + GPU_TARGETS = gfx900 gfx906 gfx908 gfx90a gfx1030 CFLAGS += -DGGML_USE_HIPBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) CXXFLAGS += -DGGML_USE_HIPBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) LDFLAGS += -L/opt/rocm/lib -Wl,-rpath=$(ROCM_PATH)/lib -lhipblas -lamdhip64 From ecc056519fd08363922875b23956a13a7b6fbdcf Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Fri, 28 Apr 2023 01:58:27 +0300 Subject: [PATCH 08/51] only .cu file needs to be complied as device --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index cf087905cbf37..b1fd6e218c661 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -187,7 +187,7 @@ if (LLAMA_HIPBLAS) add_compile_definitions(GGML_USE_HIPBLAS) add_library(ggml-hip OBJECT ggml-cuda.cu ggml-cuda.h) set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX) - target_link_libraries(ggml-hip PUBLIC hip::device) + target_link_libraries(ggml-hip PRIVATE hip::device) if (LLAMA_STATIC) message(FATAL_ERROR "Static linking not supported for HIP/ROCm") From a1caa486113eb3d1192c6d554feaff7419194313 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Fri, 28 Apr 2023 10:08:21 +0300 Subject: [PATCH 09/51] add more cuda defines This is so 'slaren/cuda-f16f32' would merge. --- ggml-cuda.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/ggml-cuda.h b/ggml-cuda.h index c00d83ba64748..6ab5b3944301d 100644 --- a/ggml-cuda.h +++ b/ggml-cuda.h @@ -6,19 +6,28 @@ #define CUBLAS_OP_N HIPBLAS_OP_N #define CUBLAS_OP_T HIPBLAS_OP_T #define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#define CUBLAS_TF32_TENSOR_OP_MATH 0 #define cublasCreate hipblasCreate #define cublasGemmEx hipblasGemmEx #define cublasHandle_t hipblasHandle_t +#define cublasSetMathMode(h, m) HIPBLAS_STATUS_SUCCESS #define cublasSetStream hipblasSetStream #define cublasSgemm hipblasSgemm #define cublasStatus_t hipblasStatus_t #define CUDA_R_16F HIPBLAS_R_16F #define CUDA_R_32F HIPBLAS_R_32F +#define cudaDeviceSynchronize hipDeviceSynchronize #define cudaError_t hipError_t +#define cudaEvent_t hipEvent_t +#define cudaEventCreateWithFlags hipEventCreateWithFlags +#define cudaEventDisableTiming hipEventDisableTiming +#define cudaEventRecord hipEventRecord #define cudaFree hipFree +#define cudaFreeHost hipFreeHost #define cudaGetErrorString hipGetErrorString #define cudaGetLastError hipGetLastError #define cudaMalloc hipMalloc +#define cudaMallocHost hipMallocHost #define cudaMemcpyAsync hipMemcpyAsync #define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost #define cudaMemcpyHostToDevice hipMemcpyHostToDevice @@ -26,6 +35,7 @@ #define cudaStreamCreateWithFlags hipStreamCreateWithFlags #define cudaStreamNonBlocking hipStreamNonBlocking #define cudaStreamSynchronize hipStreamSynchronize +#define cudaStreamWaitEvent hipStreamWaitEvent #define cudaSuccess hipSuccess #define GGML_USE_CUBLAS #else From 04c0d480d780b7e43f9cd5726b1c1d66570b57d8 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Thu, 4 May 2023 12:31:16 +0300 Subject: [PATCH 10/51] Move all HIP stuff to ggml-cuda.cu --- CMakeLists.txt | 10 +++++----- ggml-cuda.cu | 44 +++++++++++++++++++++++++++++++++++++++++--- ggml-cuda.h | 46 ---------------------------------------------- 3 files changed, 46 insertions(+), 54 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e01bb2edd4815..79393a54e4ee9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -232,16 +232,16 @@ if (LLAMA_HIPBLAS) find_package(hipblas) if (${hipblas_FOUND} AND ${hip_FOUND}) - message(STATUS "hipBLAS found") - add_compile_definitions(GGML_USE_HIPBLAS) - add_library(ggml-hip OBJECT ggml-cuda.cu ggml-cuda.h) + message(STATUS "HIP and hipBLAS found") + add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS) + add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h) set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX) - target_link_libraries(ggml-hip PRIVATE hip::device) + target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::hipblas) if (LLAMA_STATIC) message(FATAL_ERROR "Static linking not supported for HIP/ROCm") endif() - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} hip::host roc::hipblas ggml-hip) + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ggml-rocm) else() message(WARNING "hipBLAS or HIP not found. Try setting CMAKE_PREFIX_PATH=/opt/rocm") endif() diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 033c7d5c88ff0..9007f6dcbf626 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -5,9 +5,47 @@ #include #if defined(GGML_USE_HIPBLAS) -#include "hip/hip_runtime.h" -#include "hipblas/hipblas.h" -#include "hip/hip_fp16.h" +#include +#include +#include +#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F +#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F +#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT +#define CUBLAS_OP_N HIPBLAS_OP_N +#define CUBLAS_OP_T HIPBLAS_OP_T +#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#define CUBLAS_TF32_TENSOR_OP_MATH 0 +#define CUDA_R_16F HIPBLAS_R_16F +#define CUDA_R_32F HIPBLAS_R_32F +#define cublasCreate hipblasCreate +#define cublasGemmEx hipblasGemmEx +#define cublasHandle_t hipblasHandle_t +#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS +#define cublasSetStream hipblasSetStream +#define cublasSgemm hipblasSgemm +#define cublasStatus_t hipblasStatus_t +#define cudaDeviceSynchronize hipDeviceSynchronize +#define cudaError_t hipError_t +#define cudaEventCreateWithFlags hipEventCreateWithFlags +#define cudaEventDisableTiming hipEventDisableTiming +#define cudaEventRecord hipEventRecord +#define cudaEvent_t hipEvent_t +#define cudaFree hipFree +#define cudaFreeHost hipHostFree +#define cudaGetErrorString hipGetErrorString +#define cudaGetLastError hipGetLastError +#define cudaMalloc hipMalloc +#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocPortable) +#define cudaMemcpy2DAsync hipMemcpy2DAsync +#define cudaMemcpyAsync hipMemcpyAsync +#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost +#define cudaMemcpyHostToDevice hipMemcpyHostToDevice +#define cudaStreamCreateWithFlags hipStreamCreateWithFlags +#define cudaStreamNonBlocking hipStreamNonBlocking +#define cudaStreamSynchronize hipStreamSynchronize +#define cudaStreamWaitEvent hipStreamWaitEvent +#define cudaStream_t hipStream_t +#define cudaSuccess hipSuccess #else #include #include diff --git a/ggml-cuda.h b/ggml-cuda.h index 0e740e30908bc..f7d6a8bc1842a 100644 --- a/ggml-cuda.h +++ b/ggml-cuda.h @@ -1,49 +1,3 @@ -#if defined(GGML_USE_HIPBLAS) -#include "hipblas/hipblas.h" -#include "hip/hip_runtime.h" -#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F -#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F -#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT -#define CUBLAS_OP_N HIPBLAS_OP_N -#define CUBLAS_OP_T HIPBLAS_OP_T -#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS -#define CUBLAS_TF32_TENSOR_OP_MATH 0 -#define CUDA_R_16F HIPBLAS_R_16F -#define CUDA_R_32F HIPBLAS_R_32F -#define cublasCreate hipblasCreate -#define cublasGemmEx hipblasGemmEx -#define cublasHandle_t hipblasHandle_t -#define cublasSetMathMode(h, m) HIPBLAS_STATUS_SUCCESS -#define cublasSetStream hipblasSetStream -#define cublasSgemm hipblasSgemm -#define cublasStatus_t hipblasStatus_t -#define cudaDeviceSynchronize hipDeviceSynchronize -#define cudaError_t hipError_t -#define cudaEventCreateWithFlags hipEventCreateWithFlags -#define cudaEventDisableTiming hipEventDisableTiming -#define cudaEventRecord hipEventRecord -#define cudaEvent_t hipEvent_t -#define cudaFree hipFree -#define cudaFreeHost hipFreeHost -#define cudaGetErrorString hipGetErrorString -#define cudaGetLastError hipGetLastError -#define cudaMalloc hipMalloc -#define cudaMallocHost hipMallocHost -#define cudaMemcpy2DAsync hipMemcpy2DAsync -#define cudaMemcpyAsync hipMemcpyAsync -#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost -#define cudaMemcpyHostToDevice hipMemcpyHostToDevice -#define cudaStreamCreateWithFlags hipStreamCreateWithFlags -#define cudaStreamNonBlocking hipStreamNonBlocking -#define cudaStreamSynchronize hipStreamSynchronize -#define cudaStreamWaitEvent hipStreamWaitEvent -#define cudaStream_t hipStream_t -#define cudaSuccess hipSuccess -#define GGML_USE_CUBLAS -#else -#include -#include -#endif #include "ggml.h" #ifdef __cplusplus From baeb482a9429cb7d962da34e9820e62d14ffbe31 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Sun, 7 May 2023 12:24:12 +0300 Subject: [PATCH 11/51] Revert to default copy --- ggml-cuda.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 9007f6dcbf626..7760f0de133a5 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -35,7 +35,7 @@ #define cudaGetErrorString hipGetErrorString #define cudaGetLastError hipGetLastError #define cudaMalloc hipMalloc -#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocPortable) +#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault) #define cudaMemcpy2DAsync hipMemcpy2DAsync #define cudaMemcpyAsync hipMemcpyAsync #define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost From 070cbcc1bd7f1b5049feec43507a320d22aac815 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Sun, 7 May 2023 18:10:56 +0300 Subject: [PATCH 12/51] occupanct function --- ggml-cuda.cu | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 7760f0de133a5..1b862fe82dff2 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -26,10 +26,10 @@ #define cublasStatus_t hipblasStatus_t #define cudaDeviceSynchronize hipDeviceSynchronize #define cudaError_t hipError_t +#define cudaEvent_t hipEvent_t #define cudaEventCreateWithFlags hipEventCreateWithFlags #define cudaEventDisableTiming hipEventDisableTiming #define cudaEventRecord hipEventRecord -#define cudaEvent_t hipEvent_t #define cudaFree hipFree #define cudaFreeHost hipHostFree #define cudaGetErrorString hipGetErrorString @@ -40,11 +40,12 @@ #define cudaMemcpyAsync hipMemcpyAsync #define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost #define cudaMemcpyHostToDevice hipMemcpyHostToDevice +#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize +#define cudaStream_t hipStream_t #define cudaStreamCreateWithFlags hipStreamCreateWithFlags #define cudaStreamNonBlocking hipStreamNonBlocking #define cudaStreamSynchronize hipStreamSynchronize #define cudaStreamWaitEvent hipStreamWaitEvent -#define cudaStream_t hipStream_t #define cudaSuccess hipSuccess #else #include From 0fe6384755b478bd57c38e626db36f144c617b40 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Fri, 12 May 2023 17:22:11 +0300 Subject: [PATCH 13/51] fix makefile --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 80413517ff8f2..976eefab4f783 100644 --- a/Makefile +++ b/Makefile @@ -140,8 +140,8 @@ ifdef LLAMA_HIPBLAS CC := $(ROCM_PATH)/llvm/bin/clang CXX := $(ROCM_PATH)/llvm/bin/clang++ GPU_TARGETS = gfx900 gfx906 gfx908 gfx90a gfx1030 - CFLAGS += -DGGML_USE_HIPBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) - CXXFLAGS += -DGGML_USE_HIPBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) + CFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) + CXXFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) LDFLAGS += -L/opt/rocm/lib -Wl,-rpath=$(ROCM_PATH)/lib -lhipblas -lamdhip64 OBJS += ggml-cuda.o ggml-cuda.o: CXXFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) From b19fefef943d974db2eda8a8908e67e1d08e317c Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Sat, 20 May 2023 23:28:08 +0300 Subject: [PATCH 14/51] Forwardcompat --- ggml-cuda.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 7f0975a615c5a..44d0fa0489ccb 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -17,6 +17,7 @@ #define CUBLAS_TF32_TENSOR_OP_MATH 0 #define CUDA_R_16F HIPBLAS_R_16F #define CUDA_R_32F HIPBLAS_R_32F +#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width) #define cublasCreate hipblasCreate #define cublasGemmEx hipblasGemmEx #define cublasHandle_t hipblasHandle_t From 600ace39c8f1d311b8f3c49003f5a6448a44b18e Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Sat, 20 May 2023 23:42:20 +0300 Subject: [PATCH 15/51] update warp size --- ggml-cuda.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 44d0fa0489ccb..64ddc68ccd6d0 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -132,7 +132,7 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 blo #define CUDA_MUL_BLOCK_SIZE 256 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256 -#define CUDA_DMMV_BLOCK_SIZE 32 // dmmv = dequantize_mul_mat_vec +#define CUDA_DMMV_BLOCK_SIZE 64 // dmmv = dequantize_mul_mat_vec static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) { const int i = blockDim.x*blockIdx.x + threadIdx.x; From a593a4f6c24389528a5eed8e6dc86eb06ced38b8 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Fri, 26 May 2023 00:55:28 +0300 Subject: [PATCH 16/51] Add missing parameters --- CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 18b67b6699c1d..7c0fb0573d2fc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -233,6 +233,8 @@ if (LLAMA_HIPBLAS) message(STATUS "HIP and hipBLAS found") add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS) add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h) + target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) + target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y}) set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX) target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::hipblas) From 30d921af3e0b21f511652c98448ccb631434d0d4 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Fri, 26 May 2023 01:03:56 +0300 Subject: [PATCH 17/51] and makefile --- Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile b/Makefile index 6bb362cb20d6f..ea6ee20414b2e 100644 --- a/Makefile +++ b/Makefile @@ -169,6 +169,8 @@ ifdef LLAMA_HIPBLAS LDFLAGS += -L/opt/rocm/lib -Wl,-rpath=$(ROCM_PATH)/lib -lhipblas -lamdhip64 OBJS += ggml-cuda.o ggml-cuda.o: CXXFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) +ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_X=64 +ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_Y=1 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h $(CXX) $(CXXFLAGS) -x hip -c -o $@ $< endif From 4c8b3fb1071dff0cd0c4b4f96e506294ba6473f4 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Fri, 26 May 2023 01:08:53 +0300 Subject: [PATCH 18/51] add configurable vars --- Makefile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index ea6ee20414b2e..a5dd2a3042b2d 100644 --- a/Makefile +++ b/Makefile @@ -164,13 +164,15 @@ ifdef LLAMA_HIPBLAS CC := $(ROCM_PATH)/llvm/bin/clang CXX := $(ROCM_PATH)/llvm/bin/clang++ GPU_TARGETS = gfx900 gfx906 gfx908 gfx90a gfx1030 + LLAMA_CUDA_DMMV_X ?= 64 + LLAMA_CUDA_DMMV_Y ?= 1 CFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) CXXFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) LDFLAGS += -L/opt/rocm/lib -Wl,-rpath=$(ROCM_PATH)/lib -lhipblas -lamdhip64 OBJS += ggml-cuda.o ggml-cuda.o: CXXFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) -ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_X=64 -ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_Y=1 +ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) +ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_Y=$(LLAMA_CUDA_DMMV_Y) ggml-cuda.o: ggml-cuda.cu ggml-cuda.h $(CXX) $(CXXFLAGS) -x hip -c -o $@ $< endif From 9fdaa1d2501a2c4a030af6d34e97b2e4766b27c4 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Sat, 27 May 2023 19:17:53 +0300 Subject: [PATCH 19/51] Add more defs For forward compatibility #1607 --- ggml-cuda.cu | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 4b4c678ead265..1253f086189ba 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -25,14 +25,18 @@ #define cublasSetStream hipblasSetStream #define cublasSgemm hipblasSgemm #define cublasStatus_t hipblasStatus_t +#define cudaDeviceProp hipDeviceProp_t #define cudaDeviceSynchronize hipDeviceSynchronize #define cudaError_t hipError_t -#define cudaEvent_t hipEvent_t #define cudaEventCreateWithFlags hipEventCreateWithFlags #define cudaEventDisableTiming hipEventDisableTiming #define cudaEventRecord hipEventRecord +#define cudaEvent_t hipEvent_t #define cudaFree hipFree #define cudaFreeHost hipHostFree +#define cudaGetDevice hipGetDevice +#define cudaGetDeviceCount hipGetDeviceCount +#define cudaGetDeviceProperties hipGetDeviceProperties #define cudaGetErrorString hipGetErrorString #define cudaGetLastError hipGetLastError #define cudaMalloc hipMalloc @@ -43,11 +47,12 @@ #define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost #define cudaMemcpyHostToDevice hipMemcpyHostToDevice #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize -#define cudaStream_t hipStream_t +#define cudaSetDevice hipSetDevice #define cudaStreamCreateWithFlags hipStreamCreateWithFlags #define cudaStreamNonBlocking hipStreamNonBlocking #define cudaStreamSynchronize hipStreamSynchronize #define cudaStreamWaitEvent hipStreamWaitEvent +#define cudaStream_t hipStream_t #define cudaSuccess hipSuccess #else #include From 5d6eb72164e5ae000d07dd725e635faa7a2f723d Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Tue, 6 Jun 2023 18:32:41 +0300 Subject: [PATCH 20/51] warp size fixes --- ggml-cuda.cu | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 8b2fc690e03e4..3a5e1527fb5f7 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -182,7 +182,11 @@ typedef struct { } block_q6_k; static_assert(sizeof(block_q6_k) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_k block size/padding"); +#if defined(GGML_USE_HIPBLAS) +#define WARP_SIZE warpSize +#else #define WARP_SIZE 32 +#endif #define CUDA_MUL_BLOCK_SIZE 256 @@ -679,8 +683,8 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y, // sum up partial sums and write back result __syncthreads(); #pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); + for (int mask = WARP_SIZE/2; mask > 0; mask >>= 1) { + tmp += __shfl_xor_sync(0xffffffff, tmp, mask, WARP_SIZE); } if (tid == 0) { From 1ba4ce4ad792f9672eecc37bf982386d3a007914 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Tue, 6 Jun 2023 18:41:08 +0300 Subject: [PATCH 21/51] Revert "warp size fixes" It seems like 32 is faster for me, at least and it won't cause so many conflicts. This reverts commit 5d6eb72164e5ae000d07dd725e635faa7a2f723d. --- ggml-cuda.cu | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 3a5e1527fb5f7..8b2fc690e03e4 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -182,11 +182,7 @@ typedef struct { } block_q6_k; static_assert(sizeof(block_q6_k) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_k block size/padding"); -#if defined(GGML_USE_HIPBLAS) -#define WARP_SIZE warpSize -#else #define WARP_SIZE 32 -#endif #define CUDA_MUL_BLOCK_SIZE 256 @@ -683,8 +679,8 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y, // sum up partial sums and write back result __syncthreads(); #pragma unroll - for (int mask = WARP_SIZE/2; mask > 0; mask >>= 1) { - tmp += __shfl_xor_sync(0xffffffff, tmp, mask, WARP_SIZE); + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); } if (tid == 0) { From fa5b3d7365266a9903450c1105551ffec7f51d92 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Tue, 6 Jun 2023 18:47:00 +0300 Subject: [PATCH 22/51] fix makefile. --- Makefile | 1 - 1 file changed, 1 deletion(-) diff --git a/Makefile b/Makefile index 94946d6f92522..0b2849712af89 100644 --- a/Makefile +++ b/Makefile @@ -196,7 +196,6 @@ ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_Y=$(LLAMA_CUDA_DMMV_Y) ggml-cuda.o: ggml-cuda.cu ggml-cuda.h $(CXX) $(CXXFLAGS) -x hip -c -o $@ $< -endif endif # LLAMA_HIPBLAS ifdef LLAMA_METAL From 61df8e92179b84af9041e53f61d0194dfd791de0 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Wed, 14 Jun 2023 22:46:10 +0300 Subject: [PATCH 23/51] add cudaMemset --- ggml-cuda.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index fe55cc8cf2743..e54ea6d469863 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -50,6 +50,7 @@ #define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost #define cudaMemcpyHostToDevice hipMemcpyHostToDevice #define cudaMemcpyKind hipMemcpyKind +#define cudaMemset hipMemset #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize #define cudaSetDevice hipSetDevice #define cudaStreamCreateWithFlags hipStreamCreateWithFlags From bb16effc750e2706050f5d4ec89cecc42cc13882 Mon Sep 17 00:00:00 2001 From: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Wed, 28 Jun 2023 15:27:10 -0500 Subject: [PATCH 24/51] headers fix; add kquants_iter for hipblas and add gfx803 (#1) * kquants_iter for hipblas and add gfx803 * Update CMakeLists.txt with hipblas kquants_iter and DMMV_F16 * remove dmmv_f16 for now --- CMakeLists.txt | 1 + Makefile | 11 ++++++++--- ggml.c | 6 ++++-- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 296f5043001db..23c28c3589ac1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -335,6 +335,7 @@ if (LLAMA_HIPBLAS) target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y}) set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX) target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::hipblas) + add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER}) if (LLAMA_STATIC) message(FATAL_ERROR "Static linking not supported for HIP/ROCm") diff --git a/Makefile b/Makefile index f49c57edf5ece..49bbfaf4e72fb 100644 --- a/Makefile +++ b/Makefile @@ -21,8 +21,8 @@ ifndef UNAME_M UNAME_M := $(shell uname -m) endif -CCV = $(shell $(CC) --version | head -n 1) -CXXV = $(shell $(CXX) --version | head -n 1) +CCV := $(shell $(CC) --version | head -n 1) +CXXV := $(shell $(CXX) --version | head -n 1) # Mac OS + Arm can report x86_64 # ref: /~https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789 @@ -207,13 +207,18 @@ ifdef LLAMA_HIPBLAS ROCM_PATH ?= /opt/rocm CC := $(ROCM_PATH)/llvm/bin/clang CXX := $(ROCM_PATH)/llvm/bin/clang++ - GPU_TARGETS = gfx900 gfx906 gfx908 gfx90a gfx1030 + GPU_TARGETS = gfx803 gfx900 gfx906 gfx908 gfx90a gfx1030 LLAMA_CUDA_DMMV_X ?= 32 LLAMA_CUDA_DMMV_Y ?= 1 CFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) CXXFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) LDFLAGS += -L/opt/rocm/lib -Wl,-rpath=$(ROCM_PATH)/lib -lhipblas -lamdhip64 OBJS += ggml-cuda.o +ifdef LLAMA_CUDA_KQUANTS_ITER + CXXFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER) +else + CXXFLAGS += -DK_QUANTS_PER_ITERATION=2 +endif ggml-cuda.o: CXXFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_Y=$(LLAMA_CUDA_DMMV_Y) diff --git a/ggml.c b/ggml.c index 89379516e2bcd..5713a9f43569f 100644 --- a/ggml.c +++ b/ggml.c @@ -230,9 +230,11 @@ inline static void* ggml_aligned_malloc(size_t size) { #endif #elif defined(GGML_USE_OPENBLAS) #include -#elif defined(GGML_USE_CUBLAS) | defined(GGML_USE_HIPBLAS) +#endif +#if defined(GGML_USE_CUBLAS) #include "ggml-cuda.h" -#elif defined(GGML_USE_CLBLAST) +#endif +#if defined(GGML_USE_CLBLAST) #include "ggml-opencl.h" #endif From c3e3733c61f7705ea00fd593ee94527da8c12f1b Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Sun, 2 Jul 2023 15:51:31 +0300 Subject: [PATCH 25/51] ROCm fixes --- ggml-cuda.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index dca90d9997ea7..8fc37ba1bb9fb 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -34,6 +34,7 @@ #define cudaEventDisableTiming hipEventDisableTiming #define cudaEventRecord hipEventRecord #define cudaEvent_t hipEvent_t +#define cudaEventDestroy hipEventDestroy #define cudaFree hipFree #define cudaFreeHost hipHostFree #define cudaGetDevice hipGetDevice @@ -56,7 +57,7 @@ #define cudaStreamCreateWithFlags hipStreamCreateWithFlags #define cudaStreamNonBlocking hipStreamNonBlocking #define cudaStreamSynchronize hipStreamSynchronize -#define cudaStreamWaitEvent hipStreamWaitEvent +#define cudaStreamWaitEvent(stream, event) hipStreamWaitEvent(stream, event, 0) #define cudaStream_t hipStream_t #define cudaSuccess hipSuccess #else From e610466307abc8f8bae641682ab3f91dbc33930e Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Tue, 11 Jul 2023 17:53:14 +0300 Subject: [PATCH 26/51] Expand arch list and make it overrideable --- Makefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 38d65aebc6c01..d3fc7c4c6c785 100644 --- a/Makefile +++ b/Makefile @@ -213,10 +213,10 @@ ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h endif # LLAMA_CLBLAST ifdef LLAMA_HIPBLAS - ROCM_PATH ?= /opt/rocm - CC := $(ROCM_PATH)/llvm/bin/clang - CXX := $(ROCM_PATH)/llvm/bin/clang++ - GPU_TARGETS = gfx803 gfx900 gfx906 gfx908 gfx90a gfx1030 + ROCM_PATH ?= /opt/rocm + CC := $(ROCM_PATH)/llvm/bin/clang + CXX := $(ROCM_PATH)/llvm/bin/clang++ + GPU_TARGETS ?= gfx803 gfx900 gfx906 gfx908 gfx90a gfx1030 gfx1100 LLAMA_CUDA_DMMV_X ?= 32 LLAMA_CUDA_DMMV_Y ?= 1 CFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) From afcb8fe0c4f5e918422ea41d08824653d58575ed Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Tue, 11 Jul 2023 18:09:27 +0300 Subject: [PATCH 27/51] Add new config option --- CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 34d4a33fe4fd2..54c091413d987 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -367,6 +367,9 @@ if (LLAMA_HIPBLAS) target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y}) target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER}) + if (LLAMA_CUDA_FORCE_DMMV) + target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV) + endif() set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX) target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::hipblas) From 2ec4466db54fd2f42f2ab7713cc1061e0cf59bf3 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Thu, 13 Jul 2023 13:44:02 +0300 Subject: [PATCH 28/51] Update build flags. GGML_CUDA_DMMV_Y is now GGML_CUDA_MMV_Y so update your build instructions. GGML_CUDA_FORCE_DMMV is always enabled. --------- Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> --- CMakeLists.txt | 6 ++---- Makefile | 14 ++++++-------- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9f9c55a671c93..016d850f4466e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -365,11 +365,9 @@ if (LLAMA_HIPBLAS) add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS) add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h) target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) - target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y}) + target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y}) target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER}) - if (LLAMA_CUDA_FORCE_DMMV) - target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV) - endif() + target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV) set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX) target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::hipblas) diff --git a/Makefile b/Makefile index 88cc288aadb20..039a75365d18e 100644 --- a/Makefile +++ b/Makefile @@ -226,20 +226,18 @@ ifdef LLAMA_HIPBLAS CC := $(ROCM_PATH)/llvm/bin/clang CXX := $(ROCM_PATH)/llvm/bin/clang++ GPU_TARGETS ?= gfx803 gfx900 gfx906 gfx908 gfx90a gfx1030 gfx1100 - LLAMA_CUDA_DMMV_X ?= 32 - LLAMA_CUDA_DMMV_Y ?= 1 + LLAMA_CUDA_DMMV_X ?= 32 + LLAMA_CUDA_MMV_Y ?= 1 + LLAMA_CUDA_KQUANTS_ITER ?= 2 CFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) CXXFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) LDFLAGS += -L/opt/rocm/lib -Wl,-rpath=$(ROCM_PATH)/lib -lhipblas -lamdhip64 OBJS += ggml-cuda.o -ifdef LLAMA_CUDA_KQUANTS_ITER - CXXFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER) -else - CXXFLAGS += -DK_QUANTS_PER_ITERATION=2 -endif ggml-cuda.o: CXXFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) -ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_Y=$(LLAMA_CUDA_DMMV_Y) +ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y) +ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_FORCE_DMMV +ggml-cuda.o: CXXFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER) ggml-cuda.o: ggml-cuda.cu ggml-cuda.h $(CXX) $(CXXFLAGS) -x hip -c -o $@ $< endif # LLAMA_HIPBLAS From 1f6294dc4473701b5be791d47e4b3733f95dbc0a Mon Sep 17 00:00:00 2001 From: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Mon, 24 Jul 2023 03:52:01 -0500 Subject: [PATCH 29/51] Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5) * initialize rocblas --- ggml-cuda.cu | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index f6426d4bad168..cac029b480b7a 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -10,6 +10,7 @@ #include #include #include +#include "rocblas/rocblas.h" #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT @@ -2531,6 +2532,10 @@ void ggml_init_cublas() { static bool initialized = false; if (!initialized) { +#ifdef GGML_USE_HIPBLAS + rocblas_initialize(); + hipDeviceSynchronize(); +#endif CUDA_CHECK(cudaGetDeviceCount(&g_device_count)); GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES); int64_t total_vram = 0; From 8e8054ad83e794b261914ad4f337d43e2c76882d Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Mon, 24 Jul 2023 12:20:49 +0300 Subject: [PATCH 30/51] Add rocblas to build files --- CMakeLists.txt | 3 ++- Makefile | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 016d850f4466e..0488443249560 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -359,6 +359,7 @@ if (LLAMA_HIPBLAS) find_package(hip) find_package(hipblas) + find_package(rocblas) if (${hipblas_FOUND} AND ${hip_FOUND}) message(STATUS "HIP and hipBLAS found") @@ -369,7 +370,7 @@ if (LLAMA_HIPBLAS) target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER}) target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV) set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX) - target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::hipblas) + target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::rocblas roc::hipblas) if (LLAMA_STATIC) message(FATAL_ERROR "Static linking not supported for HIP/ROCm") diff --git a/Makefile b/Makefile index 4adaaaad87922..5cf0943fac552 100644 --- a/Makefile +++ b/Makefile @@ -228,7 +228,7 @@ ifdef LLAMA_HIPBLAS LLAMA_CUDA_KQUANTS_ITER ?= 2 CFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) CXXFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) - LDFLAGS += -L/opt/rocm/lib -Wl,-rpath=$(ROCM_PATH)/lib -lhipblas -lamdhip64 + LDFLAGS += -L/opt/rocm/lib -Wl,-rpath=$(ROCM_PATH)/lib -lhipblas -lamdhip64 -lrocblas OBJS += ggml-cuda.o ggml-cuda.o: CXXFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) From f8e3fc6c746b37d69656fb5ae6af8e411d85dbca Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Sat, 29 Jul 2023 14:16:46 +0300 Subject: [PATCH 31/51] rocblas init stuff --- ggml-cuda.cu | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 91e6c078ecc45..cd122c5be6155 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -10,7 +10,10 @@ #include #include #include +#ifdef __HIP_PLATFORM_AMD__ +// for rocblas_initialize() #include "rocblas/rocblas.h" +#endif #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT @@ -2746,10 +2749,14 @@ void ggml_init_cublas() { static bool initialized = false; if (!initialized) { -#ifdef GGML_USE_HIPBLAS - rocblas_initialize(); - hipDeviceSynchronize(); + +#ifdef __HIP_PLATFORM_AMD__ + // Workaround for a rocBLAS bug when using multiple graphics cards: + // /~https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346 + rocblas_initialize(); + CUDA_CHECK(cudaDeviceSynchronize()); #endif + CUDA_CHECK(cudaGetDeviceCount(&g_device_count)); GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES); int64_t total_vram = 0; From 4336231a32a0c6168da5d79801752289622e9e58 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Sat, 29 Jul 2023 18:35:56 +0300 Subject: [PATCH 32/51] add hipBLAS to README --------- Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com> --- README.md | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/README.md b/README.md index 6a3268d129b55..05c9d3b5df799 100644 --- a/README.md +++ b/README.md @@ -408,6 +408,35 @@ Building the program with BLAS support may lead to some performance improvements | LLAMA_CUDA_DMMV_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels. Can improve performance on relatively recent GPUs. | | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. | +- #### hipBLAS + + This provide BLAS acceleation on HIP supported GPU like AMD GPU. + Make sure to have ROCm installed. + You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html). + Windows support is coming soon... + + - Using `make`: + ```bash + make LLAMA_HIPBLAS=1 + ``` + - Using `CMake`: + ```bash + mkdir build + cd build + CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ cmake .. -DLLAMA_HIPBLAS=ON + cmake --build . + ``` + + The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used. + If your GPU is not officialy supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 or 11.0.0 on RDNA3. + The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above): + + | Option | Legal values | Default | Description | + |-------------------------|------------------------|---------|-------------| + | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. | + | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. | + | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. | + - #### CLBlast OpenCL acceleration is provided by the matrix multiplication kernels from the [CLBlast](/~https://github.com/CNugteren/CLBlast) project and custom kernels for ggml that can generate tokens on the GPU. From c1cb70d64d307d3fd9b7b9f61bb574e36520499a Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Mon, 31 Jul 2023 19:56:44 +0300 Subject: [PATCH 33/51] new build arg LLAMA_CUDA_MMQ_Y --- CMakeLists.txt | 1 + Makefile | 2 ++ README.md | 7 ++++--- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 969a3de693dea..14eefe0051670 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -375,6 +375,7 @@ if (LLAMA_HIPBLAS) message(STATUS "HIP and hipBLAS found") add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS) add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h) + target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMQ_Y=${LLAMA_CUDA_MMQ_Y}) target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y}) target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER}) diff --git a/Makefile b/Makefile index 21fed1a46d1a5..cdb12a872f784 100644 --- a/Makefile +++ b/Makefile @@ -270,6 +270,7 @@ ifdef LLAMA_HIPBLAS GPU_TARGETS ?= gfx803 gfx900 gfx906 gfx908 gfx90a gfx1030 gfx1100 LLAMA_CUDA_DMMV_X ?= 32 LLAMA_CUDA_MMV_Y ?= 1 + LLAMA_CUDA_MMQ_Y ?= 64 LLAMA_CUDA_KQUANTS_ITER ?= 2 CFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) CXXFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) @@ -278,6 +279,7 @@ ifdef LLAMA_HIPBLAS ggml-cuda.o: CXXFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y) +ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_MMQ_Y=$(LLAMA_CUDA_MMQ_Y) ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_FORCE_DMMV ggml-cuda.o: CXXFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER) ggml-cuda.o: ggml-cuda.cu ggml-cuda.h diff --git a/README.md b/README.md index a56b64a88dbd0..947c2b6940d26 100644 --- a/README.md +++ b/README.md @@ -437,9 +437,10 @@ Building the program with BLAS support may lead to some performance improvements | Option | Legal values | Default | Description | |-------------------------|------------------------|---------|-------------| - | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. | - | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. | - | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. | + | LLAMA_CUDA_MMQ_Y | Positive integer >= 32 | 64 | Tile size in y direction when using the custom HIP kernels for prompt processing. Higher values can be faster depending on the amount of shared memory available. Power of 2 heavily recommended. | + | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. | + | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. | + | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. | - #### CLBlast From d91456aaf138566fa0aa3d507964049c8a09499b Mon Sep 17 00:00:00 2001 From: ardfork <134447697+ardfork@users.noreply.github.com> Date: Mon, 31 Jul 2023 20:35:00 +0300 Subject: [PATCH 34/51] fix half2 decomposition --- ggml-cuda.cu | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index e62891d60f47a..f19c7c7c71ead 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -472,8 +472,8 @@ static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const in static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){ const block_q4_1 * x = (const block_q4_1 *) vx; - const dfloat d = x[ib].dm.x; - const dfloat m = x[ib].dm.y; + const dfloat d = __low2half(x[ib].dm); + const dfloat m = __high2half(x[ib].dm); const int vui = x[ib].qs[iqs]; @@ -515,8 +515,8 @@ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const in static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){ const block_q5_1 * x = (const block_q5_1 *) vx; - const dfloat d = x[ib].dm.x; - const dfloat m = x[ib].dm.y; + const dfloat d = __low2half(x[ib].dm); + const dfloat m = __high2half(x[ib].dm); uint32_t qh; memcpy(&qh, x[ib].qh, sizeof(qh)); @@ -568,8 +568,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float const uint8_t q = x[i].qs[32*n + l]; float * y = yy + i*QK_K + 128*n; - float dall = x[i].dm.x; - float dmin = x[i].dm.y; + float dall = __low2half(x[i].dm); + float dmin = __high2half(x[i].dm); y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4); y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4); y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4); @@ -579,8 +579,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float const int il = tid%16; // 0...15 const uint8_t q = x[i].qs[il] >> (2*is); float * y = yy + i*QK_K + 16*is + il; - float dall = x[i].dm.x; - float dmin = x[i].dm.y; + float dall = __low2half(x[i].dm); + float dmin = __high2half(x[i].dm); y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4); y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4); #endif @@ -666,8 +666,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float float * y = yy + i*QK_K + 64*il + n*ir; - const float dall = x[i].dm.x; - const float dmin = x[i].dm.y; + const float dall = __low2half(x[i].dm); + const float dmin = __high2half(x[i].dm); const uint8_t * q = x[i].qs + 32*il + n*ir; @@ -705,8 +705,8 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float float * y = yy + i*QK_K + 64*il + 2*ir; - const float dall = x[i].dm.x; - const float dmin = x[i].dm.y; + const float dall = __low2half(x[i].dm); + const float dmin = __high2half(x[i].dm); const uint8_t * ql = x[i].qs + 32*il + 2*ir; const uint8_t * qh = x[i].qh + 2*ir; @@ -818,8 +818,8 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * y = yy + i * QK_K + y_offset; const uint8_t * q = x[i].qs + q_offset; - const float dall = x[i].dm.x; - const float dmin = x[i].dm.y; + const float dall = __low2half(x[i].dm); + const float dmin = __high2half(x[i].dm); const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset); aux[0] = a[0] & 0x0f0f0f0f; @@ -1039,8 +1039,8 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * y1 = yy + i*QK_K + y_offset; const float * y2 = y1 + 128; - const float dall = x[i].dm.x; - const float dmin = x[i].dm.y; + const float dall = __low2half(x[i].dm); + const float dmin = __high2half(x[i].dm); const uint16_t * a = (const uint16_t *)x[i].scales; aux[0] = a[im+0] & kmask1; @@ -1172,8 +1172,8 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx, const float * y1 = yy + i*QK_K + y_offset; const float * y2 = y1 + 128; - const float dall = x[i].dm.x; - const float dmin = x[i].dm.y; + const float dall = __low2half(x[i].dm); + const float dmin = __high2half(x[i].dm); const uint16_t * a = (const uint16_t *)x[i].scales; aux[0] = a[im+0] & kmask1; From 4024f91a665d83b6de8658d45ec9d004c5d90c79 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Wed, 9 Aug 2023 01:56:44 +0300 Subject: [PATCH 35/51] Add intrinsics polyfills for AMD --------- Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com> Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com> Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com> --- CMakeLists.txt | 1 - Makefile | 1 - ggml-cuda.cu | 53 ++++++++++++++++++++++++++++++++++++-------------- 3 files changed, 38 insertions(+), 17 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0da4db55820df..5d64cf77001ea 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -379,7 +379,6 @@ if (LLAMA_HIPBLAS) target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y}) target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER}) - target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV) set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX) target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::rocblas roc::hipblas) diff --git a/Makefile b/Makefile index 450e055fd947f..c3ef75f3fdea0 100644 --- a/Makefile +++ b/Makefile @@ -302,7 +302,6 @@ ggml-cuda.o: CXXFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y) ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_MMQ_Y=$(LLAMA_CUDA_MMQ_Y) -ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_FORCE_DMMV ggml-cuda.o: CXXFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER) ggml-cuda.o: ggml-cuda.cu ggml-cuda.h $(CXX) $(CXXFLAGS) -x hip -c -o $@ $< diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 033df63349704..96e558e4cbbab 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -75,6 +75,29 @@ #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products +#if defined(GGML_USE_HIPBLAS) +#define __CUDA_ARCH__ 1300 + +typedef int8_t int8x4_t __attribute__((ext_vector_type(4))); +static __device__ __forceinline__ int __vsubss4(const int a, const int b) { + const int8x4_t va = reinterpret_cast(a); + const int8x4_t vb = reinterpret_cast(b); + const int8x4_t c = __builtin_elementwise_sub_sat(va, vb); + return reinterpret_cast(c); +} + +static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) { +#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__) + c = __builtin_amdgcn_sdot4(a, b, c, false); +#else + const int8x4_t va = reinterpret_cast(a); + const int8x4_t vb = reinterpret_cast(b); + c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3]; +#endif + return c; +} +#endif + #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data #endif @@ -1396,8 +1419,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest return; } - y[ib].ds.x = d; - y[ib].ds.y = sum; + reinterpret_cast(y[ib].ds.x) = d; + reinterpret_cast(y[ib].ds.y) = sum; } template @@ -1609,8 +1632,8 @@ template static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp #else const float2 dm8f = __half22float2(dm8); const float2 ds8f = __half22float2(ds8); - const float d8d8 = dm8.x * ds8.x; - const float m8s8 = dm8.y * ds8.y; + const float d8d8 = __low2float(dm8) * __low2float(ds8); + const float m8s8 = __high2float(dm8) * __high2float(ds8); #endif // GGML_CUDA_F16 // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it @@ -2380,7 +2403,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1( u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); } - return vec_dot_q8_0_q8_1_impl(v, u, bq8_0->d, bq8_1->ds.x); + return vec_dot_q8_0_q8_1_impl(v, u, bq8_0->d, __low2half(bq8_1->ds)); } static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { @@ -2478,7 +2501,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1( #pragma unroll for (int i = 0; i < QR2_K; ++ i) { u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1); - d8[i] = bq8_1[bq8_offset + i].ds.x; + d8[i] = __low2half(bq8_1[bq8_offset + i].ds); } return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8); @@ -2605,7 +2628,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1( #pragma unroll for (int i = 0; i < QR3_K; ++i) { u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1); - d8[i] = bq8_1[bq8_offset + i].ds.x; + d8[i] = __low2half(bq8_1[bq8_offset + i].ds); } return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8); @@ -2782,7 +2805,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1( for (int i = 0; i < QR4_K; ++i) { const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; - d8[i] = bq8i->ds.x; + d8[i] = __low2half(bq8i->ds); const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4); u[2*i+0] = q8[0]; @@ -2809,8 +2832,8 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1( const float dall = bq4_K->d[0]; const float dmin = bq4_K->d[1]; - const float d8_1 = bq8_1[0].ds.x; - const float d8_2 = bq8_1[1].ds.x; + const float d8_1 = __low2float(bq8_1[0].ds); + const float d8_2 = __low2float(bq8_1[1].ds); const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2)); const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4); @@ -2977,7 +3000,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1( #pragma unroll for (int i = 0; i < QR5_K; ++i) { const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; - d8[i] = bq8i->ds.x; + d8[i] = __low2float(bq8i->ds); const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4); u[2*i+0] = q8[0]; @@ -2995,8 +3018,8 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1( const float d = bq5_K->d; - const float d8_1 = bq8_1[0].ds.x; - const float d8_2 = bq8_1[1].ds.x; + const float d8_1 = __low2half(bq8_1[0].ds); + const float d8_2 = __low2half(bq8_1[1].ds); const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2)); const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4); @@ -3157,7 +3180,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1( #pragma unroll for (int i = 0; i < QR6_K; ++i) { u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1); - d8[i] = bq8_1[bq8_offset + 2*i].ds.x; + d8[i] = __low2half(bq8_1[bq8_offset + 2*i].ds); } return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8); @@ -3336,7 +3359,7 @@ static __global__ void mul_mat_q( *dsi_dst = *dsi_src; } else { float * dfi_dst = (float *) dsi_dst; - *dfi_dst = (*dsi_src).x; + *dfi_dst = __low2half(*dsi_src); } } From 8f8ab6c4c049df501e9a5ed8fef3aa0fc0691421 Mon Sep 17 00:00:00 2001 From: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Wed, 9 Aug 2023 18:05:03 -0500 Subject: [PATCH 36/51] hipLDFLAG Path change Unix to multisystem in Makefile changed the hardcoded linux distro hipblas LD path from -L/opt/rocm/lib to use the defined ROCM_PATH variable to be flexible with ROCm on non-Linux OS --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 469d62de73bc3..0843d86c6b8d6 100644 --- a/Makefile +++ b/Makefile @@ -290,7 +290,7 @@ ifdef LLAMA_HIPBLAS LLAMA_CUDA_KQUANTS_ITER ?= 2 CFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) CXXFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) - LDFLAGS += -L/opt/rocm/lib -Wl,-rpath=$(ROCM_PATH)/lib -lhipblas -lamdhip64 -lrocblas + LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib -lhipblas -lamdhip64 -lrocblas OBJS += ggml-cuda.o ggml-cuda.o: CXXFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) From 29a59b5f0742d8a61b6144807549f1eee7ed4d19 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Thu, 10 Aug 2023 12:09:28 +0300 Subject: [PATCH 37/51] Fix merge --------- Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com> Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com> --- ggml-cuda.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index b196163c965b0..6d98a8857fc9d 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -1643,8 +1643,8 @@ template static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp #else const float2 dm8f = __half22float2(dm8); const float2 ds8f = __half22float2(ds8); - const float d8d8 = dm8f.x * ds8f.x; - const float m8s8 = dm8f.y * ds8f.y; + const float d8d8 = __low2float(dm8) * __low2float(ds8); + const float m8s8 = __high2float(dm8) * __high2float(ds8); #endif // GGML_CUDA_F16 // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it @@ -3283,7 +3283,7 @@ static __global__ void mul_mat_q( *dsi_dst = *dsi_src; } else { float * dfi_dst = (float *) dsi_dst; - *dfi_dst = (*dsi_src).x; + *dfi_dst = __low2half(*dsi_src); } } From f41920e3a905e161c94de75efd70545bd71ae92b Mon Sep 17 00:00:00 2001 From: Engininja2 <139037756+Engininja2@users.noreply.github.com> Date: Thu, 10 Aug 2023 12:11:27 +0300 Subject: [PATCH 38/51] AMD assembly optimized __dp4a Doesn't seem to work for gfx900, so commented out. --- ggml-cuda.cu | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 6d98a8857fc9d..1f5995b4dedb6 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -90,6 +90,26 @@ static __device__ __forceinline__ int __vsubss4(const int a, const int b) { static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) { #if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__) c = __builtin_amdgcn_sdot4(a, b, c, false); +#elif defined(__gfx1010__)// || defined(__gfx900__) + int ashift; + int bshift; + int aext; + int bext; + asm("\n \ + v_pk_ashrrev_i16 %1, 0x80008, %5 \n \ + v_pk_ashrrev_i16 %2, 0x80008, %6 \n \ + v_mov_b32_sdwa %3, sext(%5) dst_sel:WORD_1 src0_sel:BYTE_2 \n \ + v_mov_b32_sdwa %3, sext(%5) dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:BYTE_0 \n \ + v_mov_b32_sdwa %4, sext(%6) dst_sel:WORD_1 src0_sel:BYTE_2 \n \ + v_mov_b32_sdwa %4, sext(%6) dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:BYTE_0 \n \ + v_mad_i32_i16 %0, %1, %2, %0 op_sel:[0, 0, 0, 0] \n \ + v_mad_i32_i16 %0, %1, %2, %0 op_sel:[1, 1, 0, 0] \n \ + v_mad_i32_i16 %0, %3, %4, %0 op_sel:[0, 0, 0, 0] \n \ + v_mad_i32_i16 %0, %3, %4, %0 op_sel:[1, 1, 0, 0] \n \ + " + : "+v"(c), "=&v"(ashift), "=&v"(bshift), "=&v"(aext), "=&v"(bext) + : "v"(a), "v"(b) + ); #else const int8x4_t va = reinterpret_cast(a); const int8x4_t vb = reinterpret_cast(b); From 42e055d9d62092294bff60ffb7ccc440d7a7054e Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Thu, 10 Aug 2023 12:14:40 +0300 Subject: [PATCH 39/51] ws fix --- ggml-cuda.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 1f5995b4dedb6..a89cbab7f5660 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -94,7 +94,7 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) { int ashift; int bshift; int aext; - int bext; + int bext; asm("\n \ v_pk_ashrrev_i16 %1, 0x80008, %5 \n \ v_pk_ashrrev_i16 %2, 0x80008, %6 \n \ From e6b6ae55f41e338871283606912856a771b01741 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Fri, 11 Aug 2023 09:30:28 +0300 Subject: [PATCH 40/51] Undo mess --------- Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com> --- ggml-cuda.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index a89cbab7f5660..c1b179c6b4ff7 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -1663,8 +1663,8 @@ template static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp #else const float2 dm8f = __half22float2(dm8); const float2 ds8f = __half22float2(ds8); - const float d8d8 = __low2float(dm8) * __low2float(ds8); - const float m8s8 = __high2float(dm8) * __high2float(ds8); + const float d8d8 = dm8f.x * ds8f.x; + const float m8s8 = dm8f.y * ds8f.y; #endif // GGML_CUDA_F16 // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it From c299c4ac0ddf8a470686f770b7f78f356fd69704 Mon Sep 17 00:00:00 2001 From: Engininja2 <139037756+Engininja2@users.noreply.github.com> Date: Fri, 11 Aug 2023 09:43:14 +0300 Subject: [PATCH 41/51] New __dp4a assembly Now compatible with gfx900 and faster as well. --- ggml-cuda.cu | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index c1b179c6b4ff7..03ecdee7c9783 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -74,7 +74,7 @@ #include "ggml.h" #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products -#define CC_TURING 700 +#define CC_TURING 1000000000 #if defined(GGML_USE_HIPBLAS) #define __CUDA_ARCH__ 1300 @@ -90,24 +90,18 @@ static __device__ __forceinline__ int __vsubss4(const int a, const int b) { static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) { #if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__) c = __builtin_amdgcn_sdot4(a, b, c, false); -#elif defined(__gfx1010__)// || defined(__gfx900__) - int ashift; - int bshift; - int aext; - int bext; +#elif defined(__gfx1010__) || defined(__gfx900__) + int tmp1; + int tmp2; asm("\n \ - v_pk_ashrrev_i16 %1, 0x80008, %5 \n \ - v_pk_ashrrev_i16 %2, 0x80008, %6 \n \ - v_mov_b32_sdwa %3, sext(%5) dst_sel:WORD_1 src0_sel:BYTE_2 \n \ - v_mov_b32_sdwa %3, sext(%5) dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:BYTE_0 \n \ - v_mov_b32_sdwa %4, sext(%6) dst_sel:WORD_1 src0_sel:BYTE_2 \n \ - v_mov_b32_sdwa %4, sext(%6) dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:BYTE_0 \n \ - v_mad_i32_i16 %0, %1, %2, %0 op_sel:[0, 0, 0, 0] \n \ - v_mad_i32_i16 %0, %1, %2, %0 op_sel:[1, 1, 0, 0] \n \ - v_mad_i32_i16 %0, %3, %4, %0 op_sel:[0, 0, 0, 0] \n \ - v_mad_i32_i16 %0, %3, %4, %0 op_sel:[1, 1, 0, 0] \n \ + v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \ + v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \ + v_add3_u32 %0, %1, %2, %0 \n \ + v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \ + v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \ + v_add3_u32 %0, %1, %2, %0 \n \ " - : "+v"(c), "=&v"(ashift), "=&v"(bshift), "=&v"(aext), "=&v"(bext) + : "+v"(c), "=&v"(tmp1), "=&v"(tmp2) : "v"(a), "v"(b) ); #else From 4e58a0524944d8e5a374da12cbf002f47bccb44c Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Fri, 11 Aug 2023 10:16:02 +0300 Subject: [PATCH 42/51] Allow overriding CC_TURING --- CMakeLists.txt | 1 + Makefile | 1 + ggml-cuda.cu | 4 +++- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 26cd0a4944e4c..13d862c4b9314 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -378,6 +378,7 @@ if (LLAMA_HIPBLAS) target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y}) target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER}) + target_compile_definitions(ggml-rocm PRIVATE CC_TURING=1000000000) set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX) target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::rocblas roc::hipblas) diff --git a/Makefile b/Makefile index 0843d86c6b8d6..59b4abebac35f 100644 --- a/Makefile +++ b/Makefile @@ -296,6 +296,7 @@ ggml-cuda.o: CXXFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y) ggml-cuda.o: CXXFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER) +ggml-cuda.o: CXXFLAGS += -DCC_TURING=1000000000 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h $(CXX) $(CXXFLAGS) -x hip -c -o $@ $< endif # LLAMA_HIPBLAS diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 03ecdee7c9783..aa4a2e9192bdd 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -74,7 +74,9 @@ #include "ggml.h" #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products -#define CC_TURING 1000000000 +#ifndef CC_TURING +#define CC_TURING 700 +#endif #if defined(GGML_USE_HIPBLAS) #define __CUDA_ARCH__ 1300 From 641561058bc1c366530871aa1fd1f7deb4be8fdb Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Sat, 12 Aug 2023 10:51:46 +0300 Subject: [PATCH 43/51] gfx1100 support --------- Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com> Co-authored-by: jammm <2500920+jammm@users.noreply.github.com> Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com> --- ggml-cuda.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index aa4a2e9192bdd..1b4755d22ea45 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -92,6 +92,8 @@ static __device__ __forceinline__ int __vsubss4(const int a, const int b) { static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) { #if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__) c = __builtin_amdgcn_sdot4(a, b, c, false); +#elif defined(__gfx1100__) + c = __builtin_amdgcn_sudot4( true, a, true, b, c, false); #elif defined(__gfx1010__) || defined(__gfx900__) int tmp1; int tmp2; From 3de6a9aed22b7b864ce61c610613e3f679c7bc09 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Wed, 16 Aug 2023 18:35:16 +0300 Subject: [PATCH 44/51] reenable LLAMA_CUDA_FORCE_DMMV --- CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 58983a3fcdb0d..6680ae283ccaa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -373,6 +373,9 @@ if (LLAMA_HIPBLAS) message(STATUS "HIP and hipBLAS found") add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS) add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h) + if (LLAMA_CUDA_FORCE_DMMV) + target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV) + endif() target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y}) target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER}) From bbbc0ce717e00faf1e42f427716508b19e3ff19d Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Wed, 16 Aug 2023 21:28:54 +0300 Subject: [PATCH 45/51] makefile rewrite --- Makefile | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/Makefile b/Makefile index 781a0e403ac87..04b91720ef853 100644 --- a/Makefile +++ b/Makefile @@ -281,24 +281,27 @@ ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h endif # LLAMA_CLBLAST ifdef LLAMA_HIPBLAS - ROCM_PATH ?= /opt/rocm - CC := $(ROCM_PATH)/llvm/bin/clang - CXX := $(ROCM_PATH)/llvm/bin/clang++ - GPU_TARGETS ?= gfx803 gfx900 gfx906 gfx908 gfx90a gfx1030 gfx1100 + ROCM_PATH ?= /opt/rocm + HIPCC ?= $(ROCM_PATH)/bin/hipcc + GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch) LLAMA_CUDA_DMMV_X ?= 32 LLAMA_CUDA_MMV_Y ?= 1 LLAMA_CUDA_KQUANTS_ITER ?= 2 - CFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) - CXXFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) - LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib -lhipblas -lamdhip64 -lrocblas - OBJS += ggml-cuda.o -ggml-cuda.o: CXXFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) -ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) -ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y) -ggml-cuda.o: CXXFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER) -ggml-cuda.o: CXXFLAGS += -DCC_TURING=1000000000 + CFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS + CXXFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS + LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib -fuse-ld=lld + LDFLAGS += -lhipblas -lamdhip64 -lrocblas + HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) + HIPFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) + HIPFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y) + HIPFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER) + HIPFLAGS += -DCC_TURING=1000000000 +ifdef LLAMA_CUDA_FORCE_DMMV + HIPFLAGS += -DGGML_CUDA_FORCE_DMMV +endif # LLAMA_CUDA_FORCE_DMMV + OBJS += ggml-cuda.o ggml-cuda.o: ggml-cuda.cu ggml-cuda.h - $(CXX) $(CXXFLAGS) -x hip -c -o $@ $< + $(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $< endif # LLAMA_HIPBLAS ifdef LLAMA_METAL From c88c2a992a42d8fe40bc61e56dd44c7fade1c5a3 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Wed, 16 Aug 2023 23:17:52 +0300 Subject: [PATCH 46/51] probably lld is not required --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 04b91720ef853..c84764207cc44 100644 --- a/Makefile +++ b/Makefile @@ -289,7 +289,7 @@ ifdef LLAMA_HIPBLAS LLAMA_CUDA_KQUANTS_ITER ?= 2 CFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS CXXFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS - LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib -fuse-ld=lld + LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib LDFLAGS += -lhipblas -lamdhip64 -lrocblas HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) HIPFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) From 5d3e7b25e0fce955304a5b166af3459444edcad1 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Tue, 22 Aug 2023 19:24:35 +0300 Subject: [PATCH 47/51] use "ROCm" instead of "CUDA" --- common/common.cpp | 4 ++++ examples/llama-bench/llama-bench.cpp | 2 +- ggml-cuda.cu | 2 +- ggml-cuda.h | 6 ++++++ llama.cpp | 2 +- 5 files changed, 13 insertions(+), 3 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 1623ba21f461a..2c42b0cd4a309 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -601,7 +601,11 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n" ); fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n" ); +#if defined(GGML_USE_HIPBLAS) + fprintf(stdout, " -mmq, --mul-mat-q use experimental mul_mat_q HIP kernels instead of hipBLAS. TEMP!!!\n" ); +#else fprintf(stdout, " -mmq, --mul-mat-q use experimental mul_mat_q CUDA kernels instead of cuBLAS. TEMP!!!\n" ); +#endif fprintf(stdout, " Reduces VRAM usage by 700/970/1430 MiB for 7b/13b/33b but prompt processing speed\n" ); fprintf(stdout, " is still suboptimal, especially q2_K, q3_K, q5_K, and q6_K.\n" ); #endif diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 36057bfca5605..99edf868886c1 100755 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -504,7 +504,7 @@ struct test { static std::string get_backend() { if (cuda) { - return "CUDA"; + return GGML_CUDA_NAME; } if (opencl) { return "OpenCL"; diff --git a/ggml-cuda.cu b/ggml-cuda.cu index dd82011ae9cf3..7cd4a517bb61c 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -5025,7 +5025,7 @@ void ggml_init_cublas() { CUDA_CHECK(cudaGetDeviceCount(&g_device_count)); GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES); int64_t total_vram = 0; - fprintf(stderr, "%s: found %d CUDA devices:\n", __func__, g_device_count); + fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count); for (int id = 0; id < g_device_count; ++id) { cudaDeviceProp prop; CUDA_CHECK(cudaGetDeviceProperties(&prop, id)); diff --git a/ggml-cuda.h b/ggml-cuda.h index f66bb16786af9..17e8d471dbf3a 100644 --- a/ggml-cuda.h +++ b/ggml-cuda.h @@ -2,6 +2,12 @@ #include "ggml.h" +#ifdef GGML_USE_HIPBLAS +#define GGML_CUDA_NAME "ROCm" +#else +#define GGML_CUDA_NAME "CUDA" +#endif + #ifdef __cplusplus extern "C" { #endif diff --git a/llama.cpp b/llama.cpp index 8b151dc84c90c..42454d64b6e19 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1478,7 +1478,7 @@ static void llama_model_load_internal( (void) main_gpu; (void) mul_mat_q; #if defined(GGML_USE_CUBLAS) - LLAMA_LOG_INFO("%s: using CUDA for GPU acceleration\n", __func__); + LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__); ggml_cuda_set_main_device(main_gpu); ggml_cuda_set_mul_mat_q(mul_mat_q); #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU From 058f905ef99010b4928803156a886b10785813c0 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Thu, 24 Aug 2023 13:23:23 +0300 Subject: [PATCH 48/51] ignore all build dirs --- .dockerignore | 9 +-------- .gitignore | 15 +-------------- 2 files changed, 2 insertions(+), 22 deletions(-) diff --git a/.dockerignore b/.dockerignore index 462fac23a6932..c6ef6c86c9fe1 100644 --- a/.dockerignore +++ b/.dockerignore @@ -5,14 +5,7 @@ .vscode/ .DS_Store -build/ -build-em/ -build-debug/ -build-release/ -build-static/ -build-no-accel/ -build-sanitize-addr/ -build-sanitize-thread/ +build*/ models/* diff --git a/.gitignore b/.gitignore index f3121794ac0b7..187527e0e71a7 100644 --- a/.gitignore +++ b/.gitignore @@ -16,20 +16,7 @@ .vs/ .vscode/ -build/ -build-em/ -build-debug/ -build-release/ -build-ci-debug/ -build-ci-release/ -build-static/ -build-cublas/ -build-opencl/ -build-metal/ -build-mpi/ -build-no-accel/ -build-sanitize-addr/ -build-sanitize-thread/ +build*/ out/ tmp/ From a60231f786e5bb7b8cdb42021c3e6d113f96a9e2 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Thu, 24 Aug 2023 13:45:05 +0300 Subject: [PATCH 49/51] Add Dockerfiles --- .devops/full-rocm.Dockerfile | 44 ++++++++++++++++++++++++++++++++++++ .devops/main-rocm.Dockerfile | 44 ++++++++++++++++++++++++++++++++++++ 2 files changed, 88 insertions(+) create mode 100644 .devops/full-rocm.Dockerfile create mode 100644 .devops/main-rocm.Dockerfile diff --git a/.devops/full-rocm.Dockerfile b/.devops/full-rocm.Dockerfile new file mode 100644 index 0000000000000..6c521e9b4101f --- /dev/null +++ b/.devops/full-rocm.Dockerfile @@ -0,0 +1,44 @@ +ARG UBUNTU_VERSION=22.04 + +# This needs to generally match the container host's environment. +ARG ROCM_VERSION=5.6 + +# Target the CUDA build image +ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete + +FROM ${BASE_ROCM_DEV_CONTAINER} as build + +# Unless otherwise specified, we make a fat build. +# List from /~https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878 +# This is mostly tied to rocBLAS supported archs. +ARG ROCM_DOCKER_ARCH=\ + gfx803 \ + gfx900 \ + gfx906 \ + gfx908 \ + gfx90a \ + gfx1010 \ + gfx1030 \ + gfx1100 \ + gfx1101 \ + gfx1102 + +COPY requirements.txt requirements.txt + +RUN pip install --upgrade pip setuptools wheel \ + && pip install -r requirements.txt + +WORKDIR /app + +COPY . . + +# Set nvcc architecture +ENV GPU_TARGETS=${ROCM_DOCKER_ARCH} +# Enable ROCm +ENV LLAMA_HIPBLAS=1 +ENV CC=/opt/rocm/llvm/bin/clang +ENV CXX=/opt/rocm/llvm/bin/clang++ + +RUN make + +ENTRYPOINT ["/app/.devops/tools.sh"] diff --git a/.devops/main-rocm.Dockerfile b/.devops/main-rocm.Dockerfile new file mode 100644 index 0000000000000..789deff6dc8c1 --- /dev/null +++ b/.devops/main-rocm.Dockerfile @@ -0,0 +1,44 @@ +ARG UBUNTU_VERSION=22.04 + +# This needs to generally match the container host's environment. +ARG ROCM_VERSION=5.6 + +# Target the CUDA build image +ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete + +FROM ${BASE_ROCM_DEV_CONTAINER} as build + +# Unless otherwise specified, we make a fat build. +# List from /~https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878 +# This is mostly tied to rocBLAS supported archs. +ARG ROCM_DOCKER_ARCH=\ + gfx803 \ + gfx900 \ + gfx906 \ + gfx908 \ + gfx90a \ + gfx1010 \ + gfx1030 \ + gfx1100 \ + gfx1101 \ + gfx1102 + +COPY requirements.txt requirements.txt + +RUN pip install --upgrade pip setuptools wheel \ + && pip install -r requirements.txt + +WORKDIR /app + +COPY . . + +# Set nvcc architecture +ENV GPU_TARGETS=${ROCM_DOCKER_ARCH} +# Enable ROCm +ENV LLAMA_HIPBLAS=1 +ENV CC=/opt/rocm/llvm/bin/clang +ENV CXX=/opt/rocm/llvm/bin/clang++ + +RUN make + +ENTRYPOINT [ "/app/main" ] From 81ecaa4b6c1c0f2422076dc012c15771d59c29e2 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Thu, 24 Aug 2023 13:52:51 +0300 Subject: [PATCH 50/51] fix llama-bench --- examples/llama-bench/llama-bench.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 99edf868886c1..7a28115841fc3 100755 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -18,9 +18,7 @@ #include "llama.h" #include "common.h" #include "build-info.h" -#ifdef GGML_USE_CUBLAS #include "ggml-cuda.h" -#endif // utils static uint64_t get_time_ns() { From 238335f54f4275fc4389de9a70898d458df6be1e Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Thu, 24 Aug 2023 14:03:31 +0300 Subject: [PATCH 51/51] fix -nommq help for non CUDA/HIP --- common/common.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/common/common.cpp b/common/common.cpp index 62c5e9cee85d3..ff19ec4e50f60 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -613,9 +613,11 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n"); fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n"); +#ifdef GGML_USE_CUBLAS fprintf(stdout, " -nommq, --no-mul-mat-q\n"); fprintf(stdout, " use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n"); fprintf(stdout, " Not recommended since this is both slower and uses more VRAM.\n"); +#endif // GGML_USE_CUBLAS #endif fprintf(stdout, " --mtest compute maximum memory usage\n"); fprintf(stdout, " --export export the computation graph to 'llama.ggml'\n");