forked from third-party-mirrors/ollama

commit b22d78720e (parent 905568a47f)

cuda linux
@@ -6,11 +6,10 @@ Supported:
 
 - [x] CPU
 - [x] avx, avx2
-- [ ] avx512
 - [x] macOS Metal
 - [x] Windows CUDA
 - [x] Windows ROCm
-- [ ] Linux CUDA
+- [x] Linux CUDA
 - [ ] Linux ROCm
 - [x] Llava
 - [ ] Parallel Requests
@@ -44,14 +43,32 @@ go env -w "CGO_CXXFLAGS_ALLOW=-mfma|-mf16c"
 go build -tags=avx,avx2 .
 ```
 
+## Linux
+
 ### CUDA
 
-Install the [CUDA toolkit v11.3.1](https://developer.nvidia.com/cuda-11-3-1-download-archive) then build ggml-cuda:
+Install the [CUDA toolkit v11.3.1](https://developer.nvidia.com/cuda-11-3-1-download-archive) then build `libggml-cuda.so`:
+
+```shell
+./build_cuda.sh
+```
+
+Then build the package with the `cuda` tag:
+
+```shell
+go build -tags=cuda .
+```
+
+## Windows
+
+### CUDA
+
+Install the [CUDA toolkit v11.3.1](https://developer.nvidia.com/cuda-11-3-1-download-archive) then build the cuda code:
 
 Build `ggml-cuda.dll`:
 
 ```shell
-./build_cuda.sh
+./build_cuda.ps1
 ```
 
 Then build the package with the `cuda` tag:
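Taken together, the new Linux instructions boil down to a short sequence. A minimal sketch (assuming the v11.3.1 toolkit's `nvcc` is on `PATH` and the commands are run from the `llama/` directory):

```shell
nvcc --version         # confirm the CUDA toolkit is visible
./build_cuda.sh        # emits libggml-cuda.so into the current directory
go build -tags=cuda .  # cgo picks the library up via the linux,cuda LDFLAGS
```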
llama/build_cuda.ps1 (new file)

@@ -0,0 +1,24 @@
+nvcc -t 12 `
+--generate-code=arch=compute_50,code=[compute_50,sm_50] `
+--generate-code=arch=compute_52,code=[compute_52,sm_52] `
+--generate-code=arch=compute_61,code=[compute_61,sm_61] `
+--generate-code=arch=compute_70,code=[compute_70,sm_70] `
+--generate-code=arch=compute_75,code=[compute_75,sm_75] `
+--generate-code=arch=compute_80,code=[compute_80,sm_80] `
+-DGGML_CUDA_DMMV_X=32 `
+-DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 `
+-DGGML_CUDA_MMV_Y=1 `
+-DGGML_USE_CUDA=1 `
+-DGGML_SHARED=1 `
+-DGGML_BUILD=1 `
+-DGGML_USE_LLAMAFILE `
+-Wno-deprecated-gpu-targets `
+--forward-unknown-to-host-compiler `
+-use_fast_math `
+-link `
+-shared `
+-I. `
+-lcuda -lcublas -lcudart -lcublasLt `
+-O3 `
+-o ggml-cuda.dll `
+ggml-cuda.cu ggml-cuda/*.cu ggml.c ggml-backend.c ggml-alloc.c ggml-quants.c sgemm.cpp
llama/build_cuda.sh (Normal file → Executable file)

@@ -8,19 +8,20 @@ nvcc \
 --generate-code=arch=compute_80,code=[compute_80,sm_80] \
 -DGGML_CUDA_DMMV_X=32 \
 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 \
--DGGML_MULTIPLATFORM \
 -DGGML_CUDA_MMV_Y=1 \
 -DGGML_USE_CUDA=1 \
 -DGGML_SHARED=1 \
 -DGGML_BUILD=1 \
 -DGGML_USE_LLAMAFILE \
+-D_GNU_SOURCE \
 -Wno-deprecated-gpu-targets \
 --forward-unknown-to-host-compiler \
 -use_fast_math \
 -link \
 -shared \
+-fPIC \
 -I. \
 -lcuda -lcublas -lcudart -lcublasLt \
 -O3 \
--o ggml-cuda.dll \
+-o libggml-cuda.so \
 ggml-cuda.cu ggml-cuda/*.cu ggml.c ggml-backend.c ggml-alloc.c ggml-quants.c sgemm.cpp
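The two added flags reflect the switch from a Windows DLL to a Linux shared object: `-fPIC` makes the host-compiled objects position-independent, as a `.so` requires, and `-D_GNU_SOURCE` exposes the GNU extensions the Linux build expects (mirrored in the cgo flags below). A hedged sanity check on the output, using standard tools:

```shell
file libggml-cuda.so                         # should report an ELF shared object
nm -D libggml-cuda.so | grep ggml | head     # exported ggml symbols
ldd libggml-cuda.so | grep -E 'cuda|cublas'  # CUDA runtime / cuBLAS dependencies
```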
@@ -8,16 +8,19 @@ package llama
 // #cgo darwin,amd64 CFLAGS: -Wno-incompatible-pointer-types-discards-qualifiers
 // #cgo darwin,amd64 CXXFLAGS: -Wno-incompatible-pointer-types-discards-qualifiers
 // #cgo darwin,amd64 LDFLAGS: -ld_classic -framework Foundation -framework Accelerate
+// #cgo linux CFLAGS: -D_GNU_SOURCE
+// #cgo linux CXXFLAGS: -D_GNU_SOURCE
 // #cgo windows LDFLAGS: -lmsvcrt
 // #cgo avx CFLAGS: -mavx
 // #cgo avx CXXFLAGS: -mavx
 // #cgo avx2 CFLAGS: -mavx2 -mfma
 // #cgo avx2 CXXFLAGS: -mavx2 -mfma
-// #cgo cuda CFLAGS: -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_MULTIPLATFORM -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
-// #cgo cuda CXXFLAGS: -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_MULTIPLATFORM -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
+// #cgo cuda CFLAGS: -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
+// #cgo cuda CXXFLAGS: -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
 // #cgo rocm CXXFLAGS: -DGGML_USE_CUDA -DGGML_USE_HIPBLAS -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_MULTIPLATFORM -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
 // #cgo windows,cuda LDFLAGS: -L. -L"C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.3/lib/x64" -lggml-cuda -lcuda -lcudart -lcublas -lcublasLt
 // #cgo windows,rocm LDFLAGS: -L. -L"C:/Program Files/AMD/ROCm/5.7/lib" -lggml-hipblas -lhipblas -lamdhip64 -lrocblas
+// #cgo linux,cuda LDFLAGS: -L${SRCDIR} -L/usr/local/cuda/lib64 -lggml-cuda -lcuda -lcudart -lcublas -lcublasLt -lpthread -ldl -lrt
 // #include <stdlib.h>
 // #include "llama.h"
 // #include "clip.h"
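The `cuda` prefix on these `#cgo` lines is an ordinary Go build constraint, so the GPU flags apply only when the tag is passed explicitly. A sketch of the two resulting build modes (assuming `libggml-cuda.so` has already been built into the package directory):

```shell
go build .             # tag unset: the cuda #cgo lines are ignored, CPU-only build
go build -tags=cuda .  # tag set: adds the GGML_USE_CUDA defines and -lggml-cuda link flags
```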