Note: the added ("+") lines of this patch were edited by hand, so the
post-image blob ids on the "index" lines of the modified files are stale.

diff --git a/llama/build_cuda.ps1 b/llama/build_cuda.ps1
deleted file mode 100644
index 736c722d..00000000
--- a/llama/build_cuda.ps1
+++ /dev/null
@@ -1,24 +0,0 @@
-nvcc -t 12 `
-    --generate-code=arch=compute_50,code=[compute_50,sm_50] `
-    --generate-code=arch=compute_52,code=[compute_52,sm_52] `
-    --generate-code=arch=compute_61,code=[compute_61,sm_61] `
-    --generate-code=arch=compute_70,code=[compute_70,sm_70] `
-    --generate-code=arch=compute_75,code=[compute_75,sm_75] `
-    --generate-code=arch=compute_80,code=[compute_80,sm_80] `
-    -DGGML_CUDA_DMMV_X=32 `
-    -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 `
-    -DGGML_CUDA_MMV_Y=1 `
-    -DGGML_USE_CUDA=1 `
-    -DGGML_SHARED=1 `
-    -DGGML_BUILD=1 `
-    -DGGML_USE_LLAMAFILE `
-    -Wno-deprecated-gpu-targets `
-    --forward-unknown-to-host-compiler `
-    -use_fast_math `
-    -link `
-    -shared `
-    -I. `
-    -lcuda -lcublas -lcudart -lcublasLt `
-    -O3 `
-    -o ggml-cuda.dll `
-    ggml-cuda.cu ggml-cuda/*.cu ggml.c ggml-backend.c ggml-alloc.c ggml-quants.c sgemm.cpp
diff --git a/llama/build_cuda.sh b/llama/build_cuda.sh
index e65ac4b1..c2f0ec03 100755
--- a/llama/build_cuda.sh
+++ b/llama/build_cuda.sh
@@ -1,3 +1,17 @@
+#!/bin/bash
+#
+# Compiles the ggml CUDA backend sources with nvcc.
+# The output file is ggml-cuda.dll when uname reports Windows_NT or a
+# MINGW64_NT* kernel name, and libggml-cuda.so otherwise.
+
+os="$(uname -s)"
+
+if [[ "$os" == "Windows_NT" || "$os" == "MINGW64_NT"* ]]; then
+    output="ggml-cuda.dll"
+else
+    output="libggml-cuda.so"
+fi
+
 nvcc \
     -t 12 \
     --generate-code=arch=compute_50,code=[compute_50,sm_50] \
@@ -14,6 +28,7 @@ nvcc \
     -DGGML_BUILD=1 \
     -DGGML_USE_LLAMAFILE \
     -D_GNU_SOURCE \
+    -DCMAKE_POSITION_INDEPENDENT_CODE=on \
     -Wno-deprecated-gpu-targets \
     --forward-unknown-to-host-compiler \
     -use_fast_math \
@@ -23,5 +38,5 @@ nvcc \
     -I. \
     -lcuda -lcublas -lcudart -lcublasLt \
     -O3 \
-    -o libggml-cuda.so \
+    -o "$output" \
     ggml-cuda.cu ggml-cuda/*.cu ggml.c ggml-backend.c ggml-alloc.c ggml-quants.c sgemm.cpp
diff --git a/llama/build_hipblas.sh b/llama/build_hipblas.sh
index a9a855df..f09106e6 100644
--- a/llama/build_hipblas.sh
+++ b/llama/build_hipblas.sh
@@ -1,16 +1,51 @@
+#!/bin/bash
+#
+# Compiles the ggml GPU backend sources with hipcc.
+# Both the output file name and the list of --offload-arch targets depend
+# on the kernel name reported by uname.
+
+# Targets compiled on every host.
+archs=(
+    gfx900
+    gfx940
+    gfx941
+    gfx942
+    gfx1010
+    gfx1012
+    gfx1030
+    gfx1100
+    gfx1101
+    gfx1102
+)
+
+# Targets with explicit xnack settings; appended only when the host is not detected as Windows.
+linux_archs=(
+    gfx906:xnack-
+    gfx908:xnack-
+    gfx90a:xnack+
+    gfx90a:xnack-
+)
+
+os="$(uname -s)"
+
+if [[ "$os" == "Windows_NT" || "$os" == "MINGW64_NT"* ]]; then
+    output="ggml-hipblas.dll"
+else
+    output="libggml-hipblas.so"
+    archs+=("${linux_archs[@]}")
+fi
+
+# One --offload-arch flag per target, kept in an array so that each flag
+# reaches hipcc as its own argument without relying on word splitting.
+offload_arch_flags=()
+for arch in "${archs[@]}"; do
+    offload_arch_flags+=("--offload-arch=$arch")
+done
+
 hipcc \
     -parallel-jobs=12 \
     -O3 \
-    --offload-arch=gfx900 \
-    --offload-arch=gfx940 \
-    --offload-arch=gfx941 \
-    --offload-arch=gfx942 \
-    --offload-arch=gfx1010 \
-    --offload-arch=gfx1012 \
-    --offload-arch=gfx1030 \
-    --offload-arch=gfx1100 \
-    --offload-arch=gfx1101 \
-    --offload-arch=gfx1102 \
+    "${offload_arch_flags[@]}" \
     -DGGML_USE_CUDA \
     -DGGML_BUILD=1 \
     -DGGML_SHARED=1 \
@@ -23,6 +58,7 @@ hipcc \
     -DNDEBUG \
     -DK_QUANTS_PER_ITERATION=2 \
     -D_CRT_SECURE_NO_WARNINGS \
+    -DCMAKE_POSITION_INDEPENDENT_CODE=on \
     -Xclang --dependent-lib=msvcrt -Wl,/subsystem:console \
     -Wno-expansion-to-defined \
     -Wno-invalid-noreturn \
@@ -35,10 +71,6 @@ hipcc \
-    -o ggml-hipblas.dll \
+    -o "$output" \
     ggml-cuda.cu ggml-cuda/*.cu ggml.c ggml-backend.c ggml-alloc.c ggml-quants.c sgemm.cpp
 
-    # --offload-arch='gfx906:xnack-' \
-    # --offload-arch='gfx908:xnack-' \
-    # --offload-arch='gfx90a:xnack+' \
-    # --offload-arch='gfx90a:xnack-' \
     # -D_DLL \
     # -D_MT \
     # -D_XOPEN_SOURCE=600 \