From b22d78720ea33898c114f1df5ab8557d54007d89 Mon Sep 17 00:00:00 2001
From: jmorganca
Date: Sun, 19 May 2024 23:11:30 -0700
Subject: [PATCH] cuda linux

---
 llama/README.md      | 25 +++++++++++++++++++++----
 llama/build_cuda.ps1 | 24 ++++++++++++++++++++++++
 llama/build_cuda.sh  |  5 +++--
 llama/llama.go       |  7 +++++--
 4 files changed, 53 insertions(+), 8 deletions(-)
 create mode 100644 llama/build_cuda.ps1
 mode change 100644 => 100755 llama/build_cuda.sh

diff --git a/llama/README.md b/llama/README.md
index fae9ea9a..c3228b31 100644
--- a/llama/README.md
+++ b/llama/README.md
@@ -6,11 +6,10 @@ Supported:
 
 - [x] CPU
 - [x] avx, avx2
-- [ ] avx512
 - [x] macOS Metal
 - [x] Windows CUDA
 - [x] Windows ROCm
-- [ ] Linux CUDA
+- [x] Linux CUDA
 - [ ] Linux ROCm
 - [x] Llava
 - [ ] Parallel Requests
@@ -44,14 +43,32 @@ go env -w "CGO_CXXFLAGS_ALLOW=-mfma|-mf16c"
 go build -tags=avx,avx2 .
 ```
 
+## Linux
+
 ### CUDA
 
-Install the [CUDA toolkit v11.3.1](https://developer.nvidia.com/cuda-11-3-1-download-archive) then build ggml-cuda:
+Install the [CUDA toolkit v11.3.1](https://developer.nvidia.com/cuda-11-3-1-download-archive) then build `libggml-cuda.so`:
+
+```shell
+./build_cuda.sh
+```
+
+Then build the package with the `cuda` tag:
+
+```shell
+go build -tags=cuda .
+```
+
+## Windows
+
+### CUDA
+
+Install the [CUDA toolkit v11.3.1](https://developer.nvidia.com/cuda-11-3-1-download-archive) then build the cuda code:
 
 Build `ggml-cuda.dll`:
 
 ```shell
-./build_cuda.sh
+./build_cuda.ps1
 ```
 
 Then build the package with the `cuda` tag:
 
diff --git a/llama/build_cuda.ps1 b/llama/build_cuda.ps1
new file mode 100644
index 00000000..736c722d
--- /dev/null
+++ b/llama/build_cuda.ps1
@@ -0,0 +1,24 @@
+nvcc -t 12 `
+    --generate-code=arch=compute_50,code=[compute_50,sm_50] `
+    --generate-code=arch=compute_52,code=[compute_52,sm_52] `
+    --generate-code=arch=compute_61,code=[compute_61,sm_61] `
+    --generate-code=arch=compute_70,code=[compute_70,sm_70] `
+    --generate-code=arch=compute_75,code=[compute_75,sm_75] `
+    --generate-code=arch=compute_80,code=[compute_80,sm_80] `
+    -DGGML_CUDA_DMMV_X=32 `
+    -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 `
+    -DGGML_CUDA_MMV_Y=1 `
+    -DGGML_USE_CUDA=1 `
+    -DGGML_SHARED=1 `
+    -DGGML_BUILD=1 `
+    -DGGML_USE_LLAMAFILE `
+    -Wno-deprecated-gpu-targets `
+    --forward-unknown-to-host-compiler `
+    -use_fast_math `
+    -link `
+    -shared `
+    -I. `
+    -lcuda -lcublas -lcudart -lcublasLt `
+    -O3 `
+    -o ggml-cuda.dll `
+    ggml-cuda.cu ggml-cuda/*.cu ggml.c ggml-backend.c ggml-alloc.c ggml-quants.c sgemm.cpp
diff --git a/llama/build_cuda.sh b/llama/build_cuda.sh
old mode 100644
new mode 100755
index 7a49a4b9..e65ac4b1
--- a/llama/build_cuda.sh
+++ b/llama/build_cuda.sh
@@ -8,19 +8,20 @@ nvcc \
     --generate-code=arch=compute_80,code=[compute_80,sm_80] \
     -DGGML_CUDA_DMMV_X=32 \
     -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 \
-    -DGGML_MULTIPLATFORM \
     -DGGML_CUDA_MMV_Y=1 \
     -DGGML_USE_CUDA=1 \
     -DGGML_SHARED=1 \
     -DGGML_BUILD=1 \
     -DGGML_USE_LLAMAFILE \
+    -D_GNU_SOURCE \
     -Wno-deprecated-gpu-targets \
     --forward-unknown-to-host-compiler \
     -use_fast_math \
     -link \
     -shared \
+    -fPIC \
     -I. \
     -lcuda -lcublas -lcudart -lcublasLt \
     -O3 \
-    -o ggml-cuda.dll \
+    -o libggml-cuda.so \
     ggml-cuda.cu ggml-cuda/*.cu ggml.c ggml-backend.c ggml-alloc.c ggml-quants.c sgemm.cpp
diff --git a/llama/llama.go b/llama/llama.go
index c82148c4..0de3e21f 100644
--- a/llama/llama.go
+++ b/llama/llama.go
@@ -8,16 +8,19 @@ package llama
 // #cgo darwin,amd64 CFLAGS: -Wno-incompatible-pointer-types-discards-qualifiers
 // #cgo darwin,amd64 CXXFLAGS: -Wno-incompatible-pointer-types-discards-qualifiers
 // #cgo darwin,amd64 LDFLAGS: -ld_classic -framework Foundation -framework Accelerate
+// #cgo linux CFLAGS: -D_GNU_SOURCE
+// #cgo linux CXXFLAGS: -D_GNU_SOURCE
 // #cgo windows LDFLAGS: -lmsvcrt
 // #cgo avx CFLAGS: -mavx
 // #cgo avx CXXFLAGS: -mavx
 // #cgo avx2 CFLAGS: -mavx2 -mfma
 // #cgo avx2 CXXFLAGS: -mavx2 -mfma
-// #cgo cuda CFLAGS: -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_MULTIPLATFORM -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
-// #cgo cuda CXXFLAGS: -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_MULTIPLATFORM -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
+// #cgo cuda CFLAGS: -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
+// #cgo cuda CXXFLAGS: -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
 // #cgo rocm CXXFLAGS: -DGGML_USE_CUDA -DGGML_USE_HIPBLAS -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_MULTIPLATFORM -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
 // #cgo windows,cuda LDFLAGS: -L. -L"C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.3/lib/x64" -lggml-cuda -lcuda -lcudart -lcublas -lcublasLt
 // #cgo windows,rocm LDFLAGS: -L. -L"C:/Program Files/AMD/ROCm/5.7/lib" -lggml-hipblas -lhipblas -lamdhip64 -lrocblas
+// #cgo linux,cuda LDFLAGS: -L${SRCDIR} -L/usr/local/cuda/lib64 -lggml-cuda -lcuda -lcudart -lcublas -lcublasLt -lpthread -ldl -lrt
 // #include
 // #include "llama.h"
 // #include "clip.h"
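
The Linux build flow this patch adds is: run `build_cuda.sh` to produce `libggml-cuda.so`, then compile the Go package with the `cuda` build tag so the new `linux,cuda` cgo flags are used. A minimal end-to-end sketch, assuming the commands are run from the `llama` directory and the CUDA 11.3 toolkit lives under `/usr/local/cuda` (that path and the `LD_LIBRARY_PATH` export are typical-install assumptions, not part of the patch):

```shell
# Confirm nvcc from the CUDA toolkit (v11.3.1 per the README) is on PATH.
nvcc --version

# Build the shared CUDA backend; build_cuda.sh writes libggml-cuda.so into the
# current directory, which the linux,cuda LDFLAGS find via -L${SRCDIR}.
./build_cuda.sh

# Build the Go package with the cuda tag to enable the CUDA cgo flags.
go build -tags=cuda .

# Assumed runtime setup: make the CUDA runtime and libggml-cuda.so resolvable
# for any binary that links this package.
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$PWD:${LD_LIBRARY_PATH}
```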