forked from third-party-mirrors/ollama
Compare commits
61 Commits
dhiltgen/r ... main
SHA1:
67691e410d
d7eb05b936
636a743c2b
df011054fa
ac07160c8d
6606e4243c
65973ceb64
bebef1e50d
d48c1c5a44
36a8372b28
4e94227b5d
479d551766
76b2b723b2
b8d77cdeab
c2e8cbaa14
771fab1dd8
3a5239e6bf
3d25e7bf8c
1618700c5a
b111aa5a91
9e83e550e1
fc2a0715df
3020d2dc58
a909417602
6cd566872b
9d71bcc3e2
a4c70fe157
34a75102f7
4157d1f7b6
4ebfa2cb91
046054fa3b
95483f348b
f247a6233e
44bd9e5994
18237be9b2
29ab9fa7d7
b8d5036e33
312d9de1d1
a103dae01e
d07cf41a97
8c238e70ab
8a9bb0d000
26acdcf44e
921779bb10
16f4eabe2d
c826e57475
712e99d477
b754f5a6a3
a805e5947e
91dfbb1bba
db1842b9e1
c9ca386131
078f666f73
de1557a0dc
084929c293
abd5dfd06a
099f7077a1
d7c94e0ca6
35ec7f079f
5231ae52d9
3085c47bea
.github/workflows/release.yaml (vendored, 231 changes)
@@ -1,5 +1,9 @@
 name: release
 
+env:
+  ROCM_WINDOWS_URL: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe
+  MSYS2_URL: https://github.com/msys2/msys2-installer/releases/download/2024-07-27/msys2-x86_64-20240727.exe
+
 on:
   push:
     tags:
@@ -8,7 +12,7 @@ on:
 jobs:
   # Full build of the Mac assets
   build-darwin:
-    runs-on: macos-12
+    runs-on: macos-13
     environment: release
     steps:
       - uses: actions/checkout@v4
@@ -39,8 +43,8 @@ jobs:
           APPLE_PASSWORD: ${{ secrets.APPLE_PASSWORD }}
           APPLE_TEAM_ID: ${{ vars.APPLE_TEAM_ID }}
           APPLE_ID: ${{ vars.APPLE_ID }}
-          SDKROOT: /Applications/Xcode_13.4.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk
-          DEVELOPER_DIR: /Applications/Xcode_13.4.1.app/Contents/Developer
+          SDKROOT: /Applications/Xcode_14.1.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk
+          DEVELOPER_DIR: /Applications/Xcode_14.1.0.app/Contents/Developer
         run: |
           ./scripts/build_darwin.sh
 
@@ -60,51 +64,34 @@ jobs:
       KEY_CONTAINER: ${{ vars.KEY_CONTAINER }}
     steps:
       - uses: actions/checkout@v4
+      - name: Set make jobs default
+        run: |
+          echo "MAKEFLAGS=--jobs=$((Get-ComputerInfo -Property CsProcessors).CsProcessors.NumberOfCores)" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
       - name: Set Version
         shell: bash
         run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
-      - uses: 'google-github-actions/auth@v2'
-        with:
-          project_id: 'ollama'
-          credentials_json: '${{ secrets.GOOGLE_SIGNING_CREDENTIALS }}'
-      - run: echo "${{ vars.OLLAMA_CERT }}" > ollama_inc.crt
-      - name: install Windows SDK 8.1 to get signtool
+      - name: Add msys paths
         run: |
-          $ErrorActionPreference = "Stop"
-          write-host "downloading SDK"
-          Invoke-WebRequest -Uri "https://go.microsoft.com/fwlink/p/?LinkId=323507" -OutFile "${env:RUNNER_TEMP}\sdksetup.exe"
-          Start-Process "${env:RUNNER_TEMP}\sdksetup.exe" -ArgumentList @("/q") -NoNewWindow -Wait
-          write-host "Win SDK 8.1 installed"
-          gci -path 'C:\Program Files (x86)\Windows Kits\' -r -fi 'signtool.exe'
-      - name: install signing plugin
+          echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+      - name: Install msys2 tools
         run: |
-          $ErrorActionPreference = "Stop"
-          write-host "downloading plugin"
-          Invoke-WebRequest -Uri "https://github.com/GoogleCloudPlatform/kms-integrations/releases/download/cng-v1.0/kmscng-1.0-windows-amd64.zip" -OutFile "${env:RUNNER_TEMP}\plugin.zip"
-          Expand-Archive -Path "${env:RUNNER_TEMP}\plugin.zip" -DestinationPath ${env:RUNNER_TEMP}\plugin\
-          write-host "Installing plugin"
-          & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet
-          write-host "plugin installed"
+          Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang") -NoNewWindow -Wait
       - uses: actions/setup-go@v5
         with:
           go-version-file: go.mod
           cache: true
-      - run: go get ./...
       - run: |
-          $gopath=(get-command go).source | split-path -parent
           import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
           Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
-          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
-          $env:PATH="$gopath;$env:PATH"
-          $cores = (Get-ComputerInfo -Property CsProcessors).CsProcessors.NumberOfCores
-          make -j $cores
+          if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" }
+          make
         name: make
       - uses: actions/upload-artifact@v4
         with:
          name: generate-windows-cpu
          path: |
            build/**/*
-            build/**/*.a
            dist/windows-amd64/**
 
   # ROCm generation step
@@ -115,58 +102,49 @@ jobs:
       KEY_CONTAINER: ${{ vars.KEY_CONTAINER }}
     steps:
       - uses: actions/checkout@v4
+      - name: Set make jobs default
+        run: |
+          echo "MAKEFLAGS=--jobs=$((Get-ComputerInfo -Property CsProcessors).CsProcessors.NumberOfCores)" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
       - name: Set Version
         shell: bash
         run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
-      - uses: 'google-github-actions/auth@v2'
-        with:
-          project_id: 'ollama'
-          credentials_json: '${{ secrets.GOOGLE_SIGNING_CREDENTIALS }}'
-      - run: echo "${{ vars.OLLAMA_CERT }}" > ollama_inc.crt
-      - name: install Windows SDK 8.1 to get signtool
+      - name: Add msys paths
         run: |
-          $ErrorActionPreference = "Stop"
-          write-host "downloading SDK"
-          Invoke-WebRequest -Uri "https://go.microsoft.com/fwlink/p/?LinkId=323507" -OutFile "${env:RUNNER_TEMP}\sdksetup.exe"
-          Start-Process "${env:RUNNER_TEMP}\sdksetup.exe" -ArgumentList @("/q") -NoNewWindow -Wait
-          write-host "Win SDK 8.1 installed"
-          gci -path 'C:\Program Files (x86)\Windows Kits\' -r -fi 'signtool.exe'
-      - name: install signing plugin
+          echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+      - name: Install msys2 tools
         run: |
-          $ErrorActionPreference = "Stop"
-          write-host "downloading plugin"
-          Invoke-WebRequest -Uri "https://github.com/GoogleCloudPlatform/kms-integrations/releases/download/cng-v1.0/kmscng-1.0-windows-amd64.zip" -OutFile "${env:RUNNER_TEMP}\plugin.zip"
-          Expand-Archive -Path "${env:RUNNER_TEMP}\plugin.zip" -DestinationPath ${env:RUNNER_TEMP}\plugin\
-          write-host "Installing plugin"
-          & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet
-          write-host "plugin installed"
+          Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang") -NoNewWindow -Wait
       - uses: actions/setup-go@v5
         with:
          go-version-file: go.mod
          cache: true
-      - name: 'Install ROCm'
+      # ROCM installation steps
+      - name: 'Cache ROCm installer'
+        id: cache-rocm
+        uses: actions/cache@v4
+        with:
+          path: rocm-install.exe
+          key: ${{ env.ROCM_WINDOWS_URL }}
+      - name: 'Conditionally Download ROCm'
+        if: steps.cache-rocm.outputs.cache-hit != 'true'
         run: |
           $ErrorActionPreference = "Stop"
-          write-host "downloading AMD HIP Installer"
-          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
-          write-host "Installing AMD HIP"
-          Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
-          write-host "Completed AMD HIP"
+          Invoke-WebRequest -Uri "${env:ROCM_WINDOWS_URL}" -OutFile "rocm-install.exe"
+      - name: 'Install ROCm'
+        run: |
+          Start-Process "rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
       - name: 'Verify ROCm'
         run: |
           & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
-      - run: go get ./...
-      - run: |
-          $gopath=(get-command go).source | split-path -parent
+          echo "HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path | select -first 1)" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
+      - name: make rocm runner
+        run: |
           import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
           Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
-          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
-          $env:PATH="$gopath;$env:PATH"
-          $env:OLLAMA_SKIP_CPU_GENERATE="1"
-          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
-          $cores = (Get-ComputerInfo -Property CsProcessors).CsProcessors.NumberOfCores
-          make -j $cores
-        name: make
+          if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" }
+          make -C llama print-HIP_PATH print-HIP_LIB_DIR
+          make rocm
       - uses: actions/upload-artifact@v4
         with:
           name: generate-windows-rocm
@@ -181,71 +159,74 @@ jobs:
     strategy:
       matrix:
         cuda:
-          - version: "11"
-            url: 'https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe'
-          - version: "12"
-            url: 'https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_551.61_windows.exe'
+          - version: "11.3"
+            url: https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe
+          - version: "12.4"
+            url: https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_551.61_windows.exe
     env:
       KEY_CONTAINER: ${{ vars.KEY_CONTAINER }}
     steps:
       - uses: actions/checkout@v4
+      - name: Set make jobs default
+        run: |
+          echo "MAKEFLAGS=--jobs=$((Get-ComputerInfo -Property CsProcessors).CsProcessors.NumberOfCores)" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
       - name: Set Version
         shell: bash
         run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
-      - uses: 'google-github-actions/auth@v2'
-        with:
-          project_id: 'ollama'
-          credentials_json: '${{ secrets.GOOGLE_SIGNING_CREDENTIALS }}'
-      - run: echo "${{ vars.OLLAMA_CERT }}" > ollama_inc.crt
-      - name: install Windows SDK 8.1 to get signtool
+      - name: Install msys2
         run: |
-          $ErrorActionPreference = "Stop"
-          write-host "downloading SDK"
-          Invoke-WebRequest -Uri "https://go.microsoft.com/fwlink/p/?LinkId=323507" -OutFile "${env:RUNNER_TEMP}\sdksetup.exe"
-          Start-Process "${env:RUNNER_TEMP}\sdksetup.exe" -ArgumentList @("/q") -NoNewWindow -Wait
-          write-host "Win SDK 8.1 installed"
-          gci -path 'C:\Program Files (x86)\Windows Kits\' -r -fi 'signtool.exe'
-      - name: install signing plugin
+          $msys2_url="https://github.com/msys2/msys2-installer/releases/download/2024-07-27/msys2-x86_64-20240727.exe"
+          write-host "Downloading msys2"
+          Invoke-WebRequest -Uri "${msys2_url}" -OutFile "${env:RUNNER_TEMP}\msys2.exe"
+          write-host "Installing msys2"
+          Start-Process "${env:RUNNER_TEMP}\msys2.exe" -ArgumentList @("in", "--confirm-command", "--accept-messages", "--root", "C:/msys64") -NoNewWindow -Wait
+          echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+      - name: Install msys2 tools
         run: |
-          $ErrorActionPreference = "Stop"
-          write-host "downloading plugin"
-          Invoke-WebRequest -Uri "https://github.com/GoogleCloudPlatform/kms-integrations/releases/download/cng-v1.0/kmscng-1.0-windows-amd64.zip" -OutFile "${env:RUNNER_TEMP}\plugin.zip"
-          Expand-Archive -Path "${env:RUNNER_TEMP}\plugin.zip" -DestinationPath ${env:RUNNER_TEMP}\plugin\
-          write-host "Installing plugin"
-          & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet
-          write-host "plugin installed"
+          Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang", "make") -NoNewWindow -Wait
+          echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+      - name: verify tools
+        run: |
+          get-command gcc
+          gcc --version
+          get-command make
+          make --version
       - uses: actions/setup-go@v5
         with:
          go-version-file: go.mod
          cache: true
-      - name: 'Install CUDA ${{ matrix.cuda.version }}'
+      # CUDA installation steps
+      - name: 'Cache CUDA installer'
+        id: cache-cuda
+        uses: actions/cache@v4
+        with:
+          path: cuda-install.exe
+          key: ${{ matrix.cuda.url }}
+      - name: 'Conditionally Download CUDA'
+        if: steps.cache-cuda.outputs.cache-hit != 'true'
         run: |
           $ErrorActionPreference = "Stop"
-          write-host "downloading CUDA Installer"
-          Invoke-WebRequest -Uri "${{ matrix.cuda.url }}" -OutFile "${env:RUNNER_TEMP}\cuda-install.exe"
-          write-host "Installing CUDA"
-          Start-Process "${env:RUNNER_TEMP}\cuda-install.exe" -ArgumentList '-s' -NoNewWindow -Wait
-          write-host "Completed CUDA"
+          Invoke-WebRequest -Uri "${{ matrix.cuda.url }}" -OutFile "cuda-install.exe"
+      - name: 'Install CUDA'
+        run: |
+          $subpackages = @("cudart", "nvcc", "cublas", "cublas_dev") | foreach-object {"${_}_${{ matrix.cuda.version }}"}
+          Start-Process "cuda-install.exe" -ArgumentList (@("-s") + $subpackages) -NoNewWindow -Wait
+      - name: 'Verify CUDA'
+        run: |
+          & (resolve-path "c:\Program Files\NVIDIA*\CUDA\v*\bin\nvcc.exe")[0] --version
           $cudaPath=((resolve-path "c:\Program Files\NVIDIA*\CUDA\v*\bin\nvcc.exe")[0].path | split-path | split-path)
           $cudaVer=($cudaPath | split-path -leaf ) -replace 'v(\d+).(\d+)', '$1_$2'
-          echo "$cudaPath\bin" >> $env:GITHUB_PATH
-          echo "CUDA_PATH=$cudaPath" >> $env:GITHUB_ENV
-          echo "CUDA_PATH_V${cudaVer}=$cudaPath" >> $env:GITHUB_ENV
-          echo "CUDA_PATH_VX_Y=CUDA_PATH_V${cudaVer}" >> $env:GITHUB_ENV
-      - name: 'Verify CUDA'
-        run: nvcc -V
-      - run: go get ./...
-      - name: make
+          echo "$cudaPath\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          echo "CUDA_PATH=$cudaPath" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
+          echo "CUDA_PATH_V${cudaVer}=$cudaPath" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
+          echo "CUDA_PATH_VX_Y=CUDA_PATH_V${cudaVer}" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
+      - name: make cuda runner
         run: |
-          $gopath=(get-command go).source | split-path -parent
-          $cudabin=(get-command nvcc).source | split-path
           import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
           Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
-          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
-          $env:PATH="$gopath;$cudabin;$env:PATH"
-          $env:OLLAMA_SKIP_CPU_GENERATE="1"
-          $cores = (Get-ComputerInfo -Property CsProcessors).CsProcessors.NumberOfCores
-          make -j $cores
+          if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" }
+          make cuda_v$(($env:CUDA_PATH | split-path -leaf) -replace 'v(\d+).*', '$1')
       - uses: actions/upload-artifact@v4
         with:
           name: generate-windows-cuda-${{ matrix.cuda.version }}
@@ -393,7 +374,7 @@ jobs:
           $env:PATH="$gopath;$gccpath;$env:PATH"
           echo $env:PATH
           $env:ARCH="arm64"
-          .\scripts\build_windows.ps1 buildOllama buildApp gatherDependencies distZip
+          .\scripts\build_windows.ps1 buildOllama buildApp gatherDependencies sign distZip
         name: 'Windows Build'
       - uses: actions/upload-artifact@v4
         with:
@@ -443,6 +424,24 @@ jobs:
           write-host "Installing plugin"
           & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet
           write-host "plugin installed"
+      - name: Install msys2
+        run: |
+          $msys2_url="https://github.com/msys2/msys2-installer/releases/download/2024-07-27/msys2-x86_64-20240727.exe"
+          write-host "Downloading msys2"
+          Invoke-WebRequest -Uri "${msys2_url}" -OutFile "${env:RUNNER_TEMP}\msys2.exe"
+          write-host "Installing msys2"
+          Start-Process "${env:RUNNER_TEMP}\msys2.exe" -ArgumentList @("in", "--confirm-command", "--accept-messages", "--root", "C:/msys64") -NoNewWindow -Wait
+          echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+      - name: Install msys2 tools
+        run: |
+          Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang", "make") -NoNewWindow -Wait
+          echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+      - name: verify tools
+        run: |
+          get-command gcc
+          gcc --version
+          get-command make
+          make --version
       - uses: actions/setup-go@v5
         with:
           go-version-file: go.mod
@@ -453,10 +452,10 @@ jobs:
           name: generate-windows-cpu
       - uses: actions/download-artifact@v4
         with:
-          name: generate-windows-cuda-11
+          name: generate-windows-cuda-11.3
       - uses: actions/download-artifact@v4
         with:
-          name: generate-windows-cuda-12
+          name: generate-windows-cuda-12.4
       - uses: actions/download-artifact@v4
         with:
           name: generate-windows-rocm
@@ -466,13 +465,11 @@ jobs:
           path: dist
       - run: dir build
       - run: |
-          $gopath=(get-command go).source | split-path -parent
           import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
           Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
-          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
-          $env:PATH="$gopath;$env:PATH"
           $env:OLLAMA_SKIP_GENERATE="1"
           $env:ARCH="amd64"
+          if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" }
           & .\scripts\build_windows.ps1
       - uses: actions/upload-artifact@v4
         with:
.github/workflows/test.yaml (vendored, 125 changes)
@@ -1,5 +1,11 @@
 name: test
 
+env:
+  ROCM_WINDOWS_URL: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe
+  MSYS2_URL: https://github.com/msys2/msys2-installer/releases/download/2024-07-27/msys2-x86_64-20240727.exe
+  CUDA_12_WINDOWS_URL: https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_551.61_windows.exe
+  CUDA_12_WINDOWS_VER: 12.4
+
 concurrency:
   # For PRs, later CI runs preempt previous ones. e.g. a force push on a PR
   # cancels running CI jobs and starts all new ones.
@@ -99,30 +105,45 @@
         with:
           go-version-file: go.mod
           cache: true
-      - name: 'Install ROCm'
+      - name: Set make jobs default
+        run: |
+          echo "MAKEFLAGS=--jobs=$((Get-ComputerInfo -Property CsProcessors).CsProcessors.NumberOfCores)" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
+
+      # ROCM installation steps
+      - name: 'Cache ROCm installer'
+        id: cache-rocm
+        uses: actions/cache@v4
+        with:
+          path: rocm-install.exe
+          key: ${{ env.ROCM_WINDOWS_URL }}
+      - name: 'Conditionally Download ROCm'
+        if: steps.cache-rocm.outputs.cache-hit != 'true'
         run: |
           $ErrorActionPreference = "Stop"
-          write-host "downloading AMD HIP Installer"
-          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
-          write-host "Installing AMD HIP"
-          Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
-          write-host "Completed AMD HIP"
+          Invoke-WebRequest -Uri "${env:ROCM_WINDOWS_URL}" -OutFile "rocm-install.exe"
+      - name: 'Install ROCm'
+        run: |
+          Start-Process "rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
       - name: 'Verify ROCm'
         run: |
           & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
-      - run: go get ./...
-      - run: |
-          $gopath=(get-command go).source | split-path -parent
+          echo "HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path | select -first 1)" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
+
+      - name: Add msys paths
+        run: |
+          echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+      - name: Install msys2 tools
+        run: |
+          Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang") -NoNewWindow -Wait
+
+      - name: make rocm runner
+        run: |
           import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
           Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
-          $env:PATH="$gopath;$env:PATH"
-          $env:OLLAMA_SKIP_CPU_GENERATE="1"
-          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
-          $cores = (Get-ComputerInfo -Property CsProcessors).CsProcessors.NumberOfCores
-          write-host $env:HIP_PATH
+          if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" }
           make -C llama print-HIP_PATH print-HIP_LIB_DIR
-          make -j $cores rocm
-        name: make
+          make rocm
 
   # CUDA generation step
   runners-windows-cuda:
@@ -135,36 +156,49 @@
         with:
           go-version-file: go.mod
           cache: true
-      - name: 'Install CUDA'
+      - name: Set make jobs default
+        run: |
+          echo "MAKEFLAGS=--jobs=$((Get-ComputerInfo -Property CsProcessors).CsProcessors.NumberOfCores)" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
+
+      # CUDA installation steps
+      - name: 'Cache CUDA installer'
+        id: cache-cuda
+        uses: actions/cache@v4
+        with:
+          path: cuda-install.exe
+          key: ${{ env.CUDA_12_WINDOWS_URL }}
+      - name: 'Conditionally Download CUDA'
+        if: steps.cache-cuda.outputs.cache-hit != 'true'
         run: |
           $ErrorActionPreference = "Stop"
-          write-host "downloading CUDA Installer"
-          Invoke-WebRequest -Uri "https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe" -OutFile "${env:RUNNER_TEMP}\cuda-install.exe"
-          write-host "Installing CUDA"
-          Start-Process "${env:RUNNER_TEMP}\cuda-install.exe" -ArgumentList '-s' -NoNewWindow -Wait
-          write-host "Completed CUDA"
+          Invoke-WebRequest -Uri "${env:CUDA_12_WINDOWS_URL}" -OutFile "cuda-install.exe"
+      - name: 'Install CUDA'
+        run: |
+          $subpackages = @("cudart", "nvcc", "cublas", "cublas_dev") | foreach-object {"${_}_${{ env.CUDA_12_WINDOWS_VER }}"}
+          Start-Process "cuda-install.exe" -ArgumentList (@("-s") + $subpackages) -NoNewWindow -Wait
+      - name: 'Verify CUDA'
+        run: |
+          & (resolve-path "c:\Program Files\NVIDIA*\CUDA\v*\bin\nvcc.exe")[0] --version
           $cudaPath=((resolve-path "c:\Program Files\NVIDIA*\CUDA\v*\bin\nvcc.exe")[0].path | split-path | split-path)
           $cudaVer=($cudaPath | split-path -leaf ) -replace 'v(\d+).(\d+)', '$1_$2'
-          echo "$cudaPath\bin" >> $env:GITHUB_PATH
-          echo "CUDA_PATH=$cudaPath" >> $env:GITHUB_ENV
-          echo "CUDA_PATH_V${cudaVer}=$cudaPath" >> $env:GITHUB_ENV
-          echo "CUDA_PATH_VX_Y=CUDA_PATH_V${cudaVer}" >> $env:GITHUB_ENV
-      - name: 'Verify CUDA'
-        run: nvcc -V
-      - run: go get ./...
-      - name: make
+          echo "$cudaPath\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          echo "CUDA_PATH=$cudaPath" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
+          echo "CUDA_PATH_V${cudaVer}=$cudaPath" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
+          echo "CUDA_PATH_VX_Y=CUDA_PATH_V${cudaVer}" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
+      - name: Add msys paths
+        run: |
+          echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+      - name: Install msys2 tools
+        run: |
+          Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang") -NoNewWindow -Wait
+      - name: make cuda runner
         run: |
-          $gopath=(get-command go).source | split-path -parent
-          $cudabin=(get-command nvcc).source | split-path
           import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
           Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
-          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
-          $env:PATH="$gopath;$cudabin;$env:PATH"
-          $env:OLLAMA_SKIP_CPU_GENERATE="1"
-          $cores = (Get-ComputerInfo -Property CsProcessors).CsProcessors.NumberOfCores
-          make -j $cores cuda_v11
-        env:
-          OLLAMA_SKIP_CPU_GENERATE: '1'
+          if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" }
+          make cuda_v$(($env:CUDA_PATH | split-path -leaf) -replace 'v(\d+).*', '$1')
 
   runners-cpu:
     needs: [changes]
@@ -189,7 +223,15 @@
         with:
           go-version-file: go.mod
           cache: true
-      - run: go get ./...
+      - name: Add msys paths
+        if: ${{ startsWith(matrix.os, 'windows-') }}
+        run: |
+          echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+      - name: Install msys2 tools
+        if: ${{ startsWith(matrix.os, 'windows-') }}
+        run: |
+          Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang") -NoNewWindow -Wait
       - name: 'Build Windows Go Runners'
         if: ${{ startsWith(matrix.os, 'windows-') }}
         run: |
@@ -200,6 +242,7 @@
           $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
           $env:PATH="$gopath;$gccpath;$env:PATH"
           echo $env:PATH
+          if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" }
           make -j 4
       - name: 'Build Unix Go Runners'
         if: ${{ ! startsWith(matrix.os, 'windows-') }}
@@ -238,7 +281,7 @@
         shell: bash
       - uses: golangci/golangci-lint-action@v6
         with:
-          args: --timeout 8m0s -v
+          args: --timeout 10m0s -v
   test:
     strategy:
       matrix:
Dockerfile (75 changes)
@@ -1,11 +1,12 @@
-# Note: once we have fully transitioned to the Go server, this will replace the old Dockerfile at the top of the tree
-ARG GOLANG_VERSION=1.22.5
+ARG GOLANG_VERSION=1.22.8
 ARG CMAKE_VERSION=3.22.1
 ARG CUDA_VERSION_11=11.3.1
 ARG CUDA_V11_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86"
 ARG CUDA_VERSION_12=12.4.0
 ARG CUDA_V12_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a"
 ARG ROCM_VERSION=6.1.2
+ARG JETPACK_6=r36.2.0
+ARG JETPACK_5=r35.4.1
+
 ### To create a local image for building linux binaries on mac or windows with efficient incremental builds
 #
@@ -14,7 +15,7 @@ ARG ROCM_VERSION=6.1.2
 #
 ### Then incremental builds will be much faster in this container
 #
-# make -C llama -j 10 && go build -trimpath -o dist/linux-amd64/ollama .
+# make -j 10 && go build -trimpath -o dist/linux-amd64/ollama .
 #
 FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS unified-builder-amd64
 ARG CMAKE_VERSION
@@ -77,9 +78,9 @@ ARG CUDA_V12_ARCHITECTURES
 ARG OLLAMA_FAST_BUILD
 RUN --mount=type=cache,target=/root/.ccache \
     if grep "^flags" /proc/cpuinfo|grep avx>/dev/null; then \
-        make -C llama -j $(expr $(nproc) / 2 ) ; \
+        make -j $(expr $(nproc) / 2 ) ; \
     else \
-        make -C llama -j 5 ; \
+        make -j 5 ; \
     fi
 
 FROM --platform=linux/arm64 unified-builder-arm64 AS runners-arm64
@@ -91,7 +92,46 @@ ARG CUDA_V11_ARCHITECTURES
 ARG CUDA_V12_ARCHITECTURES
 ARG OLLAMA_FAST_BUILD
 RUN --mount=type=cache,target=/root/.ccache \
-    make -C llama -j 8
+    make -j 5
 
+# Jetsons need to be built in discrete stages
+FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_5} AS runners-jetpack5-arm64
+ARG GOLANG_VERSION
+RUN apt-get update && apt-get install -y git curl ccache && \
+    curl -s -L https://dl.google.com/go/go${GOLANG_VERSION}.linux-arm64.tar.gz | tar xz -C /usr/local && \
+    ln -s /usr/local/go/bin/go /usr/local/bin/go && \
+    ln -s /usr/local/go/bin/gofmt /usr/local/bin/gofmt && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+WORKDIR /go/src/github.com/ollama/ollama/
+COPY . .
+ARG CGO_CFLAGS
+ENV GOARCH arm64
+RUN --mount=type=cache,target=/root/.ccache \
+    make -j 5 cuda_v11 \
+        CUDA_ARCHITECTURES="72;87" \
+        GPU_RUNNER_VARIANT=_jetpack5 \
+        CGO_EXTRA_LDFLAGS_LINUX=-L/usr/local/cuda/lib64/stubs \
+        DIST_LIB_DIR=/go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack5/lib/ollama \
+        DIST_GPU_RUNNER_DEPS_DIR=/go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack5/lib/ollama/cuda_jetpack5
+
+FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_6} AS runners-jetpack6-arm64
+ARG GOLANG_VERSION
+RUN apt-get update && apt-get install -y git curl ccache && \
+    curl -s -L https://dl.google.com/go/go${GOLANG_VERSION}.linux-arm64.tar.gz | tar xz -C /usr/local && \
+    ln -s /usr/local/go/bin/go /usr/local/bin/go && \
+    ln -s /usr/local/go/bin/gofmt /usr/local/bin/gofmt && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+WORKDIR /go/src/github.com/ollama/ollama/
+COPY . .
+ARG CGO_CFLAGS
+ENV GOARCH arm64
+RUN --mount=type=cache,target=/root/.ccache \
+    make -j 5 cuda_v12 \
+        CUDA_ARCHITECTURES="87" \
+        GPU_RUNNER_VARIANT=_jetpack6 \
+        CGO_EXTRA_LDFLAGS_LINUX=-L/usr/local/cuda/lib64/stubs \
+        DIST_LIB_DIR=/go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack6/lib/ollama \
+        DIST_GPU_RUNNER_DEPS_DIR=/go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack6/lib/ollama/cuda_jetpack6
+
 
 # Intermediate stages used for ./scripts/build_linux.sh
@@ -135,12 +175,20 @@ FROM --platform=linux/arm64 builder-arm64 AS build-arm64
 COPY . .
 COPY --from=runners-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
 COPY --from=runners-arm64 /go/src/github.com/ollama/ollama/build/ build/
+COPY --from=runners-jetpack5-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
+COPY --from=runners-jetpack5-arm64 /go/src/github.com/ollama/ollama/build/ build/
+COPY --from=runners-jetpack6-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
+COPY --from=runners-jetpack6-arm64 /go/src/github.com/ollama/ollama/build/ build/
 ARG GOFLAGS
 ARG CGO_CFLAGS
 RUN --mount=type=cache,target=/root/.ccache \
     go build -trimpath -o dist/linux-arm64/bin/ollama .
 RUN cd dist/linux-$GOARCH && \
     tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz
+RUN cd dist/linux-$GOARCH-jetpack5 && \
+    tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH-jetpack5.tgz
+RUN cd dist/linux-$GOARCH-jetpack6 && \
+    tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH-jetpack6.tgz
 
 FROM --platform=linux/amd64 scratch AS dist-amd64
 COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz /
@@ -181,16 +229,23 @@ RUN rm -rf \
 FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-amd64
 RUN apt-get update && \
     apt-get install -y ca-certificates && \
-    rm -rf /var/lib/apt/lists/*
+    apt-get clean && rm -rf /var/lib/apt/lists/*
 COPY --from=container-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
 COPY --from=runners-cuda-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
 
 FROM --platform=linux/arm64 ubuntu:22.04 AS runtime-arm64
+COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack5/lib/ /lib/
+COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack6/lib/ /lib/
 RUN apt-get update && \
     apt-get install -y ca-certificates && \
-    rm -rf /var/lib/apt/lists/*
+    apt-get clean && rm -rf /var/lib/apt/lists/*
 COPY --from=container-build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/bin/ /bin/
-COPY --from=runners-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
+COPY --from=cpu-build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
+COPY --from=cuda-11-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
+COPY --from=cuda-12-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
+COPY --from=cuda-build-jetpack5-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
+COPY --from=cuda-build-jetpack6-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
+
 
 # ROCm libraries larger so we keep it distinct from the CPU/CUDA image
 FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-rocm
@@ -199,7 +254,7 @@ FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-rocm
 COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64-rocm/lib/ /lib/
 RUN apt-get update && \
     apt-get install -y ca-certificates && \
-    rm -rf /var/lib/apt/lists/*
+    apt-get clean && rm -rf /var/lib/apt/lists/*
 COPY --from=container-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
 COPY --from=runners-rocm-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
 
README.md (50 changes)
@@ -12,7 +12,7 @@ Get up and running with large language models.
 
 [Download](https://ollama.com/download/Ollama-darwin.zip)
 
-### Windows preview
+### Windows
 
 [Download](https://ollama.com/download/OllamaSetup.exe)
 
@@ -47,26 +47,28 @@ Ollama supports a list of models available on [ollama.com/library](https://ollam
 
 Here are some example models that can be downloaded:
 
 | Model | Parameters | Size | Download |
-| ------------------ | ---------- | ----- | ------------------------------ |
+| ------------------ | ---------- | ----- | -------------------------------- |
 | Llama 3.2 | 3B | 2.0GB | `ollama run llama3.2` |
 | Llama 3.2 | 1B | 1.3GB | `ollama run llama3.2:1b` |
-| Llama 3.1 | 8B | 4.7GB | `ollama run llama3.1` |
-| Llama 3.1 | 70B | 40GB | `ollama run llama3.1:70b` |
-| Llama 3.1 | 405B | 231GB | `ollama run llama3.1:405b` |
-| Phi 3 Mini | 3.8B | 2.3GB | `ollama run phi3` |
-| Phi 3 Medium | 14B | 7.9GB | `ollama run phi3:medium` |
-| Gemma 2 | 2B | 1.6GB | `ollama run gemma2:2b` |
-| Gemma 2 | 9B | 5.5GB | `ollama run gemma2` |
-| Gemma 2 | 27B | 16GB | `ollama run gemma2:27b` |
-| Mistral | 7B | 4.1GB | `ollama run mistral` |
-| Moondream 2 | 1.4B | 829MB | `ollama run moondream` |
-| Neural Chat | 7B | 4.1GB | `ollama run neural-chat` |
-| Starling | 7B | 4.1GB | `ollama run starling-lm` |
-| Code Llama | 7B | 3.8GB | `ollama run codellama` |
-| Llama 2 Uncensored | 7B | 3.8GB | `ollama run llama2-uncensored` |
-| LLaVA | 7B | 4.5GB | `ollama run llava` |
-| Solar | 10.7B | 6.1GB | `ollama run solar` |
+| Llama 3.2 Vision | 11B | 7.9GB | `ollama run llama3.2-vision` |
+| Llama 3.2 Vision | 90B | 55GB | `ollama run llama3.2-vision:90b` |
+| Llama 3.1 | 8B | 4.7GB | `ollama run llama3.1` |
+| Llama 3.1 | 70B | 40GB | `ollama run llama3.1:70b` |
+| Llama 3.1 | 405B | 231GB | `ollama run llama3.1:405b` |
+| Phi 3 Mini | 3.8B | 2.3GB | `ollama run phi3` |
+| Phi 3 Medium | 14B | 7.9GB | `ollama run phi3:medium` |
+| Gemma 2 | 2B | 1.6GB | `ollama run gemma2:2b` |
+| Gemma 2 | 9B | 5.5GB | `ollama run gemma2` |
+| Gemma 2 | 27B | 16GB | `ollama run gemma2:27b` |
+| Mistral | 7B | 4.1GB | `ollama run mistral` |
+| Moondream 2 | 1.4B | 829MB | `ollama run moondream` |
+| Neural Chat | 7B | 4.1GB | `ollama run neural-chat` |
+| Starling | 7B | 4.1GB | `ollama run starling-lm` |
+| Code Llama | 7B | 3.8GB | `ollama run codellama` |
+| Llama 2 Uncensored | 7B | 3.8GB | `ollama run llama2-uncensored` |
+| LLaVA | 7B | 4.5GB | `ollama run llava` |
+| Solar | 10.7B | 6.1GB | `ollama run solar` |
 
 > [!NOTE]
 > You should have at least 8 GB of RAM available to run the 7B models, 16 GB to run the 13B models, and 32 GB to run the 33B models.
@@ -331,6 +333,8 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [ARGO](https://github.com/xark-argo/argo) (Locally download and run Ollama and Huggingface models with RAG on Mac/Windows/Linux)
 - [G1](https://github.com/bklieger-groq/g1) (Prototype of using prompting strategies to improve the LLM's reasoning through o1-like reasoning chains.)
 - [Ollama App](https://github.com/JHubi1/ollama-app) (Modern and easy-to-use multi-platform client for Ollama)
+- [Hexabot](https://github.com/hexastack/hexabot) (A conversational AI builder)
+- [Reddit Rate]((https://github.com/rapidarchitect/reddit_analyzer)) (Search and Rate Reddit topics with a weighted summation)
 
 ### Terminal
 
@@ -357,6 +361,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Ollama eBook Summary](https://github.com/cognitivetech/ollama-ebook-summary/)
 - [Ollama Mixture of Experts (MOE) in 50 lines of code](https://github.com/rapidarchitect/ollama_moe)
 - [vim-intelligence-bridge](https://github.com/pepo-ec/vim-intelligence-bridge) Simple interaction of "Ollama" with the Vim editor
+- [aichat](https://github.com/sigoden/aichat) All-in-one LLM CLI tool featuring Shell Assistant, Chat-REPL, RAG, AI tools & agents, with access to OpenAI, Claude, Gemini, Ollama, Groq, and more.
 
 ### Apple Vision Pro
 - [Enchanted](https://github.com/AugustDev/enchanted)
@@ -413,6 +418,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Ollama PHP](https://github.com/ArdaGnsrn/ollama-php)
 - [Agents-Flex for Java](https://github.com/agents-flex/agents-flex) with [example](https://github.com/agents-flex/agents-flex/tree/main/agents-flex-llm/agents-flex-llm-ollama/src/test/java/com/agentsflex/llm/ollama)
 - [Ollama for Swift](https://github.com/mattt/ollama-swift)
+- [GoLamify](https://github.com/prasad89/golamify)
 
 ### Mobile
 
@@ -450,10 +456,12 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Discord-Ollama Chat Bot](https://github.com/kevinthedang/discord-ollama) (Generalized TypeScript Discord Bot w/ Tuning Documentation)
 - [Discord AI chat/moderation bot](https://github.com/rapmd73/Companion) Chat/moderation bot written in python. Uses Ollama to create personalities.
 - [Headless Ollama](https://github.com/nischalj10/headless-ollama) (Scripts to automatically install ollama client & models on any OS for apps that depends on ollama server)
+- [Local AI Helper](https://github.com/ivostoykov/localAI) (Chrome and Firefox extensions that enable interactions with the active tab and customisable API endpoints. Includes secure storage for user prompts.)
 - [vnc-lm](https://github.com/jk011ru/vnc-lm) (A containerized Discord bot with support for attachments and web links)
 - [LSP-AI](https://github.com/SilasMarvin/lsp-ai) (Open-source language server for AI-powered functionality)
 - [QodeAssist](https://github.com/Palm1r/QodeAssist) (AI-powered coding assistant plugin for Qt Creator)
 - [Obsidian Quiz Generator plugin](https://github.com/ECuiDev/obsidian-quiz-generator)
+- [TextCraft](https://github.com/suncloudsmoon/TextCraft) (Copilot in Word alternative using Ollama)
 
 ### Supported backends
 
@@ -55,7 +55,7 @@ func checkError(resp *http.Response, body []byte) error {
 
 // ClientFromEnvironment creates a new [Client] using configuration from the
 // environment variable OLLAMA_HOST, which points to the network host and
-// port on which the ollama service is listenting. The format of this variable
+// port on which the ollama service is listening. The format of this variable
 // is:
 //
 // <scheme>://<host>:<port>
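
The OLLAMA_HOST format documented above (<scheme>://<host>:<port>) is simple to handle by hand. The sketch below is illustrative only and is not code from this diff: the helper name is made up, and the http and 127.0.0.1 fallbacks are assumptions, although 11434 is Ollama's usual default port.

    package main

    import (
        "fmt"
        "net/url"
        "strings"
    )

    // parseOllamaHost is a hypothetical helper that splits an OLLAMA_HOST value
    // of the form <scheme>://<host>:<port> into its parts, falling back to
    // http, 127.0.0.1 and port 11434 when a piece is omitted.
    func parseOllamaHost(raw string) (scheme, host, port string, err error) {
        if !strings.Contains(raw, "://") {
            raw = "http://" + raw // assume http when no scheme is given
        }
        u, err := url.Parse(raw)
        if err != nil {
            return "", "", "", err
        }
        scheme = u.Scheme
        host = u.Hostname()
        if host == "" {
            host = "127.0.0.1"
        }
        port = u.Port()
        if port == "" {
            port = "11434" // Ollama's default port
        }
        return scheme, host, port, nil
    }

    func main() {
        s, h, p, err := parseOllamaHost("http://localhost:11434")
        if err != nil {
            panic(err)
        }
        fmt.Println(s, h, p) // prints: http localhost 11434
    }
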
api/types.go (15 changes)
@@ -12,7 +12,7 @@ import (
 	"time"
 )
 
-// StatusError is an error with and HTTP status code.
+// StatusError is an error with an HTTP status code and message.
 type StatusError struct {
 	StatusCode int
 	Status     string
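
For readers unfamiliar with the pattern, a type like the StatusError above typically satisfies Go's error interface. The following is an illustrative sketch only: the StatusCode and Status fields come from the diff, but the Error method shown here is an assumption rather than the repository's implementation.

    package main

    import (
        "fmt"
        "net/http"
    )

    // statusError is a stand-in for an error type carrying an HTTP status code
    // and message, in the spirit of the StatusError shown in the diff.
    type statusError struct {
        StatusCode int
        Status     string
    }

    // Error formats the code and message so the value can be used as an error.
    func (e statusError) Error() string {
        return fmt.Sprintf("%d %s", e.StatusCode, e.Status)
    }

    func main() {
        var err error = statusError{StatusCode: http.StatusNotFound, Status: "model not found"}
        fmt.Println(err) // prints: 404 model not found
    }
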
@@ -57,7 +57,7 @@ type GenerateRequest struct {
 	Template string `json:"template"`
 
 	// Context is the context parameter returned from a previous call to
-	// Generate call. It can be used to keep a short conversational memory.
+	// [Client.Generate]. It can be used to keep a short conversational memory.
 	Context []int `json:"context,omitempty"`
 
 	// Stream specifies whether the response is streaming; it is true by default.
@@ -90,14 +90,14 @@ type ChatRequest struct {
 	// Messages is the messages of the chat - can be used to keep a chat memory.
 	Messages []Message `json:"messages"`
 
-	// Stream enable streaming of returned response; true by default.
+	// Stream enables streaming of returned responses; true by default.
 	Stream *bool `json:"stream,omitempty"`
 
 	// Format is the format to return the response in (e.g. "json").
 	Format string `json:"format"`
 
 	// KeepAlive controls how long the model will stay loaded into memory
-	// followin the request.
+	// following the request.
 	KeepAlive *Duration `json:"keep_alive,omitempty"`
 
 	// Tools is an optional list of tools the model has access to.
@ -203,8 +203,8 @@ type Metrics struct {
|
|||||||
EvalDuration time.Duration `json:"eval_duration,omitempty"`
|
EvalDuration time.Duration `json:"eval_duration,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// Options specified in [GenerateRequest], if you add a new option here add it
|
// Options specified in [GenerateRequest]. If you add a new option here, also
|
||||||
// to the API docs also.
|
// add it to the API docs.
|
||||||
type Options struct {
|
type Options struct {
|
||||||
Runner
|
Runner
|
||||||
|
|
||||||
@ -236,7 +236,7 @@ type Runner struct {
|
|||||||
NumGPU int `json:"num_gpu,omitempty"`
|
NumGPU int `json:"num_gpu,omitempty"`
|
||||||
MainGPU int `json:"main_gpu,omitempty"`
|
MainGPU int `json:"main_gpu,omitempty"`
|
||||||
LowVRAM bool `json:"low_vram,omitempty"`
|
LowVRAM bool `json:"low_vram,omitempty"`
|
||||||
F16KV bool `json:"f16_kv,omitempty"`
|
F16KV bool `json:"f16_kv,omitempty"` // Deprecated: This option is ignored
|
||||||
LogitsAll bool `json:"logits_all,omitempty"`
|
LogitsAll bool `json:"logits_all,omitempty"`
|
||||||
VocabOnly bool `json:"vocab_only,omitempty"`
|
VocabOnly bool `json:"vocab_only,omitempty"`
|
||||||
UseMMap *bool `json:"use_mmap,omitempty"`
|
UseMMap *bool `json:"use_mmap,omitempty"`
|
||||||
@ -613,7 +613,6 @@ func DefaultOptions() Options {
|
|||||||
NumGPU: -1, // -1 here indicates that NumGPU should be set dynamically
|
NumGPU: -1, // -1 here indicates that NumGPU should be set dynamically
|
||||||
NumThread: 0, // let the runtime decide
|
NumThread: 0, // let the runtime decide
|
||||||
LowVRAM: false,
|
LowVRAM: false,
|
||||||
F16KV: true,
|
|
||||||
UseMLock: false,
|
UseMLock: false,
|
||||||
UseMMap: nil,
|
UseMMap: nil,
|
||||||
},
|
},
|
||||||
|
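These field comments describe the request/response contract for the Go client. The sketch below is not taken from this diff; it assumes the usual context and github.com/ollama/ollama/api imports, and the model name is a placeholder. It shows how the Context slice returned by a streamed Generate call can be fed back to keep the short conversational memory the comment above mentions.

// generateWithMemory is a sketch only; error handling is minimal.
func generateWithMemory(ctx context.Context, client *api.Client) error {
    var memory []int
    first := &api.GenerateRequest{Model: "llama3.2", Prompt: "Why is the sky blue?"}
    if err := client.Generate(ctx, first, func(r api.GenerateResponse) error {
        if r.Done {
            memory = r.Context // delivered with the final streamed chunk
        }
        return nil
    }); err != nil {
        return err
    }

    // Reusing the returned context carries the earlier exchange into the next call.
    second := &api.GenerateRequest{
        Model:   "llama3.2",
        Prompt:  "Now answer in a single sentence.",
        Context: memory,
    }
    return client.Generate(ctx, second, func(api.GenerateResponse) error { return nil })
}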
@@ -11,10 +11,12 @@ import (

 	"github.com/ollama/ollama/app/store"
 	"github.com/ollama/ollama/app/tray"
+	"github.com/ollama/ollama/envconfig"
 )

 func Run() {
 	InitLogging()
+	slog.Info("app config", "env", envconfig.Values())

 	ctx, cancel := context.WithCancel(context.Background())
 	var done chan int
@@ -36,8 +36,13 @@ func init() {
 	ServerLogFile = filepath.Join(AppDataDir, "server.log")
 	UpgradeLogFile = filepath.Join(AppDataDir, "upgrade.log")

-	// Executables are stored in APPDATA
-	AppDir = filepath.Join(localAppData, "Programs", "Ollama")
+	exe, err := os.Executable()
+	if err != nil {
+		slog.Warn("error discovering executable directory", "error", err)
+		AppDir = filepath.Join(localAppData, "Programs", "Ollama")
+	} else {
+		AppDir = filepath.Dir(exe)
+	}

 	// Make sure we have PATH set correctly for any spawned children
 	paths := strings.Split(os.Getenv("PATH"), ";")

@@ -64,7 +69,7 @@ func init() {
 	}

 	// Make sure our logging dir exists
-	_, err := os.Stat(AppDataDir)
+	_, err = os.Stat(AppDataDir)
 	if errors.Is(err, os.ErrNotExist) {
 		if err := os.MkdirAll(AppDataDir, 0o755); err != nil {
 			slog.Error(fmt.Sprintf("create ollama dir %s: %v", AppDataDir, err))
@@ -18,11 +18,17 @@ func getCLIFullPath(command string) string {
 	var cmdPath string
 	appExe, err := os.Executable()
 	if err == nil {
+		// Check both the same location as the tray app, as well as ./bin
 		cmdPath = filepath.Join(filepath.Dir(appExe), command)
 		_, err := os.Stat(cmdPath)
 		if err == nil {
 			return cmdPath
 		}
+		cmdPath = filepath.Join(filepath.Dir(appExe), "bin", command)
+		_, err = os.Stat(cmdPath)
+		if err == nil {
+			return cmdPath
+		}
 	}
 	cmdPath, err = exec.LookPath(command)
 	if err == nil {
@@ -26,19 +26,15 @@ func DoUpgrade(cancel context.CancelFunc, done chan int) error {
 	slog.Info("starting upgrade with " + installerExe)
 	slog.Info("upgrade log file " + UpgradeLogFile)

-	// When running in debug mode, we'll be "verbose" and let the installer pop up and prompt
+	// make the upgrade show progress, but non interactive
 	installArgs := []string{
 		"/CLOSEAPPLICATIONS",                    // Quit the tray app if it's still running
 		"/LOG=" + filepath.Base(UpgradeLogFile), // Only relative seems reliable, so set pwd
 		"/FORCECLOSEAPPLICATIONS",               // Force close the tray app - might be needed
-	}
-	// make the upgrade as quiet as possible (no GUI, no prompts)
-	installArgs = append(installArgs,
-		"/SP", // Skip the "This will install... Do you wish to continue" prompt
-		"/SUPPRESSMSGBOXES",
+		"/SP",       // Skip the "This will install... Do you wish to continue" prompt
+		"/NOCANCEL", // Disable the ability to cancel upgrade mid-flight to avoid partially installed upgrades
 		"/SILENT",
-		"/VERYSILENT",
-	)
+	}

 	// Safeguard in case we have requests in flight that need to drain...
 	slog.Info("Waiting for server to shutdown")
@@ -53,8 +53,8 @@ RestartIfNeededByRun=no
 ; https://jrsoftware.org/ishelp/index.php?topic=setup_wizardimagefile
 WizardSmallImageFile=.\assets\setup.bmp

-; TODO verifty actual min windows version...
-; OG Win 10
+; Ollama requires Windows 10 22H2 or newer for proper unicode rendering
+; TODO: consider setting this to 10.0.19045
 MinVersion=10.0.10240

 ; First release that supports WinRT UI Composition for win32 apps

@@ -136,7 +136,7 @@ Type: filesandordirs; Name: "{%TEMP}\ollama*"
 Type: filesandordirs; Name: "{%LOCALAPPDATA}\Programs\Ollama"

 [Messages]
-WizardReady=Ollama Windows Preview
+WizardReady=Ollama
 ReadyLabel1=%nLet's get you up and running with your own large language models.
 SetupAppRunningError=Another Ollama installer is running.%n%nPlease cancel or finish the other installer, then click OK to continue with this install, or Cancel to exit.
@@ -11,12 +11,13 @@ import (
 )

 const (
-	updateAvailableMenuID = 1
-	updateMenuID          = updateAvailableMenuID + 1
-	separatorMenuID       = updateMenuID + 1
-	diagLogsMenuID        = separatorMenuID + 1
-	diagSeparatorMenuID   = diagLogsMenuID + 1
-	quitMenuID            = diagSeparatorMenuID + 1
+	_ = iota
+	updateAvailableMenuID
+	updateMenuID
+	separatorMenuID
+	diagLogsMenuID
+	diagSeparatorMenuID
+	quitMenuID
 )

 func (t *winTray) initMenus() error {
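For readers unfamiliar with iota: the rewritten constant block produces exactly the values the hand-numbered version did, since the discarded first identifier consumes 0 and each following identifier increments by one. The sketch below simply restates the block above with the resulting values spelled out; it is an illustration, not additional code to apply.

const (
    _                     = iota // 0, discarded
    updateAvailableMenuID        // 1
    updateMenuID                 // 2
    separatorMenuID              // 3
    diagLogsMenuID               // 4
    diagSeparatorMenuID          // 5
    quitMenuID                   // 6
)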
@@ -800,9 +800,9 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
 	case "parameters":
 		fmt.Println(resp.Parameters)
 	case "system":
-		fmt.Println(resp.System)
+		fmt.Print(resp.System)
 	case "template":
-		fmt.Println(resp.Template)
+		fmt.Print(resp.Template)
 	}

 	return nil

@@ -1318,7 +1318,7 @@ func NewCLI() *cobra.Command {
 	log.SetFlags(log.LstdFlags | log.Lshortfile)
 	cobra.EnableCommandSorting = false

-	if runtime.GOOS == "windows" {
+	if runtime.GOOS == "windows" && term.IsTerminal(int(os.Stdout.Fd())) {
 		console.ConsoleFromFile(os.Stdin) //nolint:errcheck
 	}
@@ -9,7 +9,7 @@ import (
 	"log/slog"
 	"strings"

-	"github.com/ollama/ollama/fileutils"
+	"github.com/ollama/ollama/llm"
 )

 type ModelParameters struct {

@@ -27,8 +27,8 @@ type AdapterParameters struct {
 	} `json:"lora_parameters"`
 }

-func (ModelParameters) KV(t *Tokenizer) fileutils.KV {
-	kv := fileutils.KV{
+func (ModelParameters) KV(t *Tokenizer) llm.KV {
+	kv := llm.KV{
 		"general.file_type":            uint32(1),
 		"general.quantization_version": uint32(2),
 		"tokenizer.ggml.pre":           t.Pre,

@@ -54,7 +54,7 @@ func (ModelParameters) KV(t *Tokenizer) fileutils.KV {
 	return kv
 }

-func (p AdapterParameters) KV() fileutils.KV {
+func (p AdapterParameters) KV() llm.KV {
 	var alpha float32
 	if p.LoraParameters.Alpha == 0 {
 		alpha = float32(p.Alpha)

@@ -62,7 +62,7 @@ func (p AdapterParameters) KV() fileutils.KV {
 		alpha = p.LoraParameters.Alpha
 	}

-	kv := fileutils.KV{
+	kv := llm.KV{
 		"adapter.lora.alpha": alpha,
 		"adapter.type":       "lora",
 		"general.file_type":  uint32(1),

@@ -79,19 +79,19 @@ func (ModelParameters) specialTokenTypes() []string {
 	}
 }

-func (ModelParameters) writeFile(ws io.WriteSeeker, kv fileutils.KV, ts []fileutils.Tensor) error {
-	return fileutils.WriteGGUF(ws, kv, ts)
+func (ModelParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
+	return llm.WriteGGUF(ws, kv, ts)
 }

-func (AdapterParameters) writeFile(ws io.WriteSeeker, kv fileutils.KV, ts []fileutils.Tensor) error {
-	return fileutils.WriteGGUF(ws, kv, ts)
+func (AdapterParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
+	return llm.WriteGGUF(ws, kv, ts)
 }

 type ModelConverter interface {
 	// KV maps parameters to LLM key-values
-	KV(*Tokenizer) fileutils.KV
+	KV(*Tokenizer) llm.KV
 	// Tensors maps input tensors to LLM tensors. Model specific modifications can be done here.
-	Tensors([]Tensor) []fileutils.Tensor
+	Tensors([]Tensor) []llm.Tensor
 	// Replacements returns a list of string pairs to replace in tensor names.
 	// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
 	Replacements() []string

@@ -99,7 +99,7 @@ type ModelConverter interface {
 	// specialTokenTypes returns any special token types the model uses
 	specialTokenTypes() []string
 	// writeFile writes the model to the provided io.WriteSeeker
-	writeFile(io.WriteSeeker, fileutils.KV, []fileutils.Tensor) error
+	writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error
 }

 type moreParser interface {

@@ -108,17 +108,17 @@ type moreParser interface {
 type AdapterConverter interface {
 	// KV maps parameters to LLM key-values
-	KV(fileutils.KV) fileutils.KV
+	KV(llm.KV) llm.KV
 	// Tensors maps input tensors to LLM tensors. Adapter specific modifications can be done here.
-	Tensors([]Tensor) []fileutils.Tensor
+	Tensors([]Tensor) []llm.Tensor
 	// Replacements returns a list of string pairs to replace in tensor names.
 	// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
 	Replacements() []string

-	writeFile(io.WriteSeeker, fileutils.KV, []fileutils.Tensor) error
+	writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error
 }

-func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV fileutils.KV) error {
+func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV llm.KV) error {
 	bts, err := fs.ReadFile(fsys, "adapter_config.json")
 	if err != nil {
 		return err

@@ -8,7 +8,7 @@ import (
 	"slices"
 	"strings"

-	"github.com/ollama/ollama/fileutils"
+	"github.com/ollama/ollama/llm"
 )

 type bertModel struct {

@@ -85,7 +85,7 @@ func (p *bertModel) parseMore(fsys fs.FS) error {
 	return nil
 }

-func (p *bertModel) KV(t *Tokenizer) fileutils.KV {
+func (p *bertModel) KV(t *Tokenizer) llm.KV {
 	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "bert"
 	kv["bert.attention.causal"] = false

@@ -132,8 +132,8 @@ func (p *bertModel) KV(t *Tokenizer) fileutils.KV {
 	return kv
 }

-func (p *bertModel) Tensors(ts []Tensor) []fileutils.Tensor {
-	var out []fileutils.Tensor
+func (p *bertModel) Tensors(ts []Tensor) []llm.Tensor {
+	var out []llm.Tensor
 	for _, t := range ts {
 		if slices.Contains([]string{
 			"embeddings.position_ids",

@@ -143,7 +143,7 @@ func (p *bertModel) Tensors(ts []Tensor) []fileutils.Tensor {
 			continue
 		}

-		out = append(out, fileutils.Tensor{
+		out = append(out, llm.Tensor{
 			Name:  t.Name(),
 			Kind:  t.Kind(),
 			Shape: t.Shape(),

@@ -6,7 +6,7 @@ import (
 	"github.com/pdevine/tensor"
 	"github.com/pdevine/tensor/native"

-	"github.com/ollama/ollama/fileutils"
+	"github.com/ollama/ollama/llm"
 )

 type gemmaModel struct {

@@ -23,7 +23,7 @@ type gemmaModel struct {

 var _ ModelConverter = (*gemmaModel)(nil)

-func (p *gemmaModel) KV(t *Tokenizer) fileutils.KV {
+func (p *gemmaModel) KV(t *Tokenizer) llm.KV {
 	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "gemma"
 	kv["gemma.context_length"] = p.MaxPositionEmbeddings

@@ -42,14 +42,14 @@ func (p *gemmaModel) KV(t *Tokenizer) fileutils.KV {
 	return kv
 }

-func (p *gemmaModel) Tensors(ts []Tensor) []fileutils.Tensor {
-	var out []fileutils.Tensor
+func (p *gemmaModel) Tensors(ts []Tensor) []llm.Tensor {
+	var out []llm.Tensor
 	for _, t := range ts {
 		if strings.HasSuffix(t.Name(), "_norm.weight") {
 			t.SetRepacker(p.addOne)
 		}

-		out = append(out, fileutils.Tensor{
+		out = append(out, llm.Tensor{
 			Name:  t.Name(),
 			Kind:  t.Kind(),
 			Shape: t.Shape(),

@@ -1,7 +1,7 @@
 package convert

 import (
-	"github.com/ollama/ollama/fileutils"
+	"github.com/ollama/ollama/llm"
 )

 type gemma2Model struct {

@@ -11,7 +11,7 @@ type gemma2Model struct {
 	FinalLogitSoftcap float32 `json:"final_logit_softcapping"`
 }

-func (p *gemma2Model) KV(t *Tokenizer) fileutils.KV {
+func (p *gemma2Model) KV(t *Tokenizer) llm.KV {
 	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "gemma2"
 	kv["gemma2.context_length"] = p.MaxPositionEmbeddings

@@ -6,7 +6,7 @@ import (
 	"github.com/pdevine/tensor"
 	"github.com/pdevine/tensor/native"

-	"github.com/ollama/ollama/fileutils"
+	"github.com/ollama/ollama/llm"
 )

 type gemma2Adapter struct {

@@ -15,14 +15,14 @@ type gemma2Adapter struct {

 var _ AdapterConverter = (*gemma2Adapter)(nil)

-func (p *gemma2Adapter) KV(baseKV fileutils.KV) fileutils.KV {
+func (p *gemma2Adapter) KV(baseKV llm.KV) llm.KV {
 	kv := p.AdapterParameters.KV()
 	kv["general.architecture"] = "gemma2"
 	return kv
 }

-func (p *gemma2Adapter) Tensors(ts []Tensor) []fileutils.Tensor {
-	var out []fileutils.Tensor
+func (p *gemma2Adapter) Tensors(ts []Tensor) []llm.Tensor {
+	var out []llm.Tensor
 	for _, t := range ts {
 		shape := t.Shape()
 		if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) ||

@@ -31,7 +31,7 @@ func (p *gemma2Adapter) Tensors(ts []Tensor) []fileutils.Tensor {
 			t.SetRepacker(p.repack)
 		}

-		out = append(out, fileutils.Tensor{
+		out = append(out, llm.Tensor{
 			Name:  t.Name(),
 			Kind:  t.Kind(),
 			Shape: t.Shape(),

@@ -9,7 +9,7 @@ import (
 	"github.com/pdevine/tensor"
 	"github.com/pdevine/tensor/native"

-	"github.com/ollama/ollama/fileutils"
+	"github.com/ollama/ollama/llm"
 )

 type llamaModel struct {

@@ -46,7 +46,7 @@ type llamaModel struct {

 var _ ModelConverter = (*llamaModel)(nil)

-func (p *llamaModel) KV(t *Tokenizer) fileutils.KV {
+func (p *llamaModel) KV(t *Tokenizer) llm.KV {
 	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "llama"
 	kv["llama.vocab_size"] = p.VocabSize

@@ -120,11 +120,11 @@ func (p *llamaModel) KV(t *Tokenizer) fileutils.KV {
 	return kv
 }

-func (p *llamaModel) Tensors(ts []Tensor) []fileutils.Tensor {
-	var out []fileutils.Tensor
+func (p *llamaModel) Tensors(ts []Tensor) []llm.Tensor {
+	var out []llm.Tensor

 	if p.RopeScaling.factors != nil {
-		out = append(out, fileutils.Tensor{
+		out = append(out, llm.Tensor{
 			Name:  "rope_freqs.weight",
 			Kind:  0,
 			Shape: []uint64{uint64(len(p.RopeScaling.factors))},

@@ -138,7 +138,7 @@ func (p *llamaModel) Tensors(ts []Tensor) []fileutils.Tensor {
 			t.SetRepacker(p.repack)
 		}

-		out = append(out, fileutils.Tensor{
+		out = append(out, llm.Tensor{
 			Name:  t.Name(),
 			Kind:  t.Kind(),
 			Shape: t.Shape(),

@@ -7,7 +7,7 @@ import (
 	"github.com/pdevine/tensor"
 	"github.com/pdevine/tensor/native"

-	"github.com/ollama/ollama/fileutils"
+	"github.com/ollama/ollama/llm"
 )

 type llamaAdapter struct {

@@ -18,7 +18,7 @@ type llamaAdapter struct {

 var _ AdapterConverter = (*llamaAdapter)(nil)

-func (p *llamaAdapter) KV(baseKV fileutils.KV) fileutils.KV {
+func (p *llamaAdapter) KV(baseKV llm.KV) llm.KV {
 	kv := p.AdapterParameters.KV()
 	kv["general.architecture"] = "llama"
 	kv["llama.attention.head_count"] = baseKV["llama.attention.head_count"]

@@ -29,8 +29,8 @@ func (p *llamaAdapter) KV(baseKV fileutils.KV) fileutils.KV {
 	return kv
 }

-func (p *llamaAdapter) Tensors(ts []Tensor) []fileutils.Tensor {
-	var out []fileutils.Tensor
+func (p *llamaAdapter) Tensors(ts []Tensor) []llm.Tensor {
+	var out []llm.Tensor
 	for _, t := range ts {
 		shape := t.Shape()
 		if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) ||

@@ -41,7 +41,7 @@ func (p *llamaAdapter) Tensors(ts []Tensor) []fileutils.Tensor {
 			t.SetRepacker(p.repack)
 		}

-		out = append(out, fileutils.Tensor{
+		out = append(out, llm.Tensor{
 			Name:  t.Name(),
 			Kind:  t.Kind(),
 			Shape: shape,

@@ -6,7 +6,7 @@ import (
 	"slices"
 	"strings"

-	"github.com/ollama/ollama/fileutils"
+	"github.com/ollama/ollama/llm"
 )

 type mixtralModel struct {

@@ -15,7 +15,7 @@ type mixtralModel struct {
 	NumExpertsPerToken uint32 `json:"num_experts_per_tok"`
 }

-func (p *mixtralModel) KV(t *Tokenizer) fileutils.KV {
+func (p *mixtralModel) KV(t *Tokenizer) llm.KV {
 	kv := p.llamaModel.KV(t)

 	if p.NumLocalExperts > 0 {

@@ -29,7 +29,7 @@ func (p *mixtralModel) KV(t *Tokenizer) fileutils.KV {
 	return kv
 }

-func (p *mixtralModel) Tensors(ts []Tensor) []fileutils.Tensor {
+func (p *mixtralModel) Tensors(ts []Tensor) []llm.Tensor {
 	oldnew := []string{
 		"model.layers", "blk",
 		"w1", "ffn_gate_exps",

@@ -56,10 +56,10 @@ func (p *mixtralModel) Tensors(ts []Tensor) []fileutils.Tensor {
 		return true
 	})

-	var out []fileutils.Tensor
+	var out []llm.Tensor
 	for n, e := range experts {
 		// TODO(mxyng): sanity check experts
-		out = append(out, fileutils.Tensor{
+		out = append(out, llm.Tensor{
 			Name:  n,
 			Kind:  e[0].Kind(),
 			Shape: append([]uint64{uint64(len(e))}, e[0].Shape()...),

@@ -8,7 +8,7 @@ import (
 	"strings"
 	"sync"

-	"github.com/ollama/ollama/fileutils"
+	"github.com/ollama/ollama/llm"
 )

 type phi3Model struct {

@@ -37,7 +37,7 @@ type phi3Model struct {

 var _ ModelConverter = (*phi3Model)(nil)

-func (p *phi3Model) KV(t *Tokenizer) fileutils.KV {
+func (p *phi3Model) KV(t *Tokenizer) llm.KV {
 	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "phi3"
 	kv["phi3.context_length"] = p.MaxPositionEmbeddings

@@ -68,19 +68,19 @@ func (p *phi3Model) KV(t *Tokenizer) fileutils.KV {
 	return kv
 }

-func (p *phi3Model) Tensors(ts []Tensor) []fileutils.Tensor {
+func (p *phi3Model) Tensors(ts []Tensor) []llm.Tensor {
 	var addRopeFactors sync.Once

-	out := make([]fileutils.Tensor, 0, len(ts)+2)
+	out := make([]llm.Tensor, 0, len(ts)+2)
 	for _, t := range ts {
 		if strings.HasPrefix(t.Name(), "blk.0.") {
 			addRopeFactors.Do(func() {
-				out = append(out, fileutils.Tensor{
+				out = append(out, llm.Tensor{
 					Name:     "rope_factors_long.weight",
 					Kind:     0,
 					Shape:    []uint64{uint64(len(p.RopeScaling.LongFactor))},
 					WriterTo: p.RopeScaling.LongFactor,
-				}, fileutils.Tensor{
+				}, llm.Tensor{
 					Name:     "rope_factors_short.weight",
 					Kind:     0,
 					Shape:    []uint64{uint64(len(p.RopeScaling.ShortFactor))},

@@ -89,7 +89,7 @@ func (p *phi3Model) Tensors(ts []Tensor) []fileutils.Tensor {
 			})
 		}

-		out = append(out, fileutils.Tensor{
+		out = append(out, llm.Tensor{
 			Name:  t.Name(),
 			Kind:  t.Kind(),
 			Shape: t.Shape(),

@@ -20,7 +20,7 @@ import (
 	"golang.org/x/exp/maps"

-	"github.com/ollama/ollama/fileutils"
+	"github.com/ollama/ollama/llm"
 )

 type tensorData struct {

@@ -29,7 +29,7 @@ type tensorData struct {
 	Shape []int `json:"shape"`
 }

-func convertFull(t *testing.T, fsys fs.FS) (*os.File, fileutils.KV, *fileutils.Tensors) {
+func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, *llm.Tensors) {
 	t.Helper()

 	f, err := os.CreateTemp(t.TempDir(), "f16")

@@ -48,7 +48,7 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, fileutils.KV, *fileutils.T
 	}
 	t.Cleanup(func() { r.Close() })

-	m, _, err := fileutils.DecodeGGML(r, math.MaxInt)
+	m, _, err := llm.DecodeGGML(r, math.MaxInt)
 	if err != nil {
 		t.Fatal(err)
 	}

@@ -60,7 +60,7 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, fileutils.KV, *fileutils.T
 	return r, m.KV(), m.Tensors()
 }

-func generateResultsJSON(t *testing.T, f *os.File, kv fileutils.KV, tensors *fileutils.Tensors) map[string]string {
+func generateResultsJSON(t *testing.T, f *os.File, kv llm.KV, tensors *llm.Tensors) map[string]string {
 	actual := make(map[string]string)
 	for k, v := range kv {
 		if s, ok := v.(json.Marshaler); !ok {

@@ -330,7 +330,7 @@ func TestConvertAdapter(t *testing.T) {
 	}
 	defer r.Close()

-	m, _, err := fileutils.DecodeGGML(r, math.MaxInt)
+	m, _, err := llm.DecodeGGML(r, math.MaxInt)
 	if err != nil {
 		t.Fatal(err)
 	}
@@ -1,3 +0,0 @@
-# `discover`
-
-This package is responsible for discovering information about the system and the capabilities to run LLM. This includes GPU and CPU discovery so the optimal runner can be chosen for a given model. The ollama scheduler relies on up-to-date available memory information, so this package provides the ability to refresh free memory as efficiently as possible.
@@ -37,19 +37,6 @@ func GetSupportedGFX(libDir string) ([]string, error) {
 	return ret, nil
 }

-func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
-	ids := []string{}
-	for _, info := range gpuInfo {
-		if info.Library != "rocm" {
-			// TODO shouldn't happen if things are wired correctly...
-			slog.Debug("rocmGetVisibleDevicesEnv skipping over non-rocm device", "library", info.Library)
-			continue
-		}
-		ids = append(ids, info.ID)
-	}
-	return "HIP_VISIBLE_DEVICES", strings.Join(ids, ",")
-}
-
 func commonAMDValidateLibDir() (string, error) {
 	// Favor our bundled version
@@ -64,7 +64,7 @@ func NewHipLib() (*HipLib, error) {
 	return hl, nil
 }

-// The hip library only evaluates the HIP_VISIBLE_DEVICES variable at startup
+// The hip library only evaluates the ROCR_VISIBLE_DEVICES variable at startup
 // so we have to unload/reset the library after we do our initial discovery
 // to make sure our updates to that variable are processed by llama.cpp
 func (hl *HipLib) Release() {
@@ -64,16 +64,13 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 	// Determine if the user has already pre-selected which GPUs to look at, then ignore the others
 	var visibleDevices []string
 	hipVD := envconfig.HipVisibleDevices()   // zero based index only
-	rocrVD := envconfig.RocrVisibleDevices() // zero based index or UUID, but consumer cards seem to not support UUID
+	rocrVD := envconfig.RocrVisibleDevices() // zero based index or UUID
 	gpuDO := envconfig.GpuDeviceOrdinal()    // zero based index
 	switch {
-	// TODO is this priorty order right?
-	case hipVD != "":
-		visibleDevices = strings.Split(hipVD, ",")
 	case rocrVD != "":
 		visibleDevices = strings.Split(rocrVD, ",")
-		// TODO - since we don't yet support UUIDs, consider detecting and reporting here
-		// all our test systems show GPU-XX indicating UUID is not supported
+	case hipVD != "":
+		visibleDevices = strings.Split(hipVD, ",")
 	case gpuDO != "":
 		visibleDevices = strings.Split(gpuDO, ",")
 	}

@@ -99,7 +96,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 		}
 		return a < b
 	})
-	cpuCount := 0
+	gpuCount := 0
 	for _, match := range matches {
 		slog.Debug("evaluating amdgpu node " + match)
 		fp, err := os.Open(match)

@@ -108,11 +105,6 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 			continue
 		}
 		defer fp.Close()
-		nodeID, err := strconv.Atoi(filepath.Base(filepath.Dir(match)))
-		if err != nil {
-			slog.Debug("failed to parse node ID", "error", err)
-			continue
-		}

 		scanner := bufio.NewScanner(fp)
 		isCPU := false

@@ -186,20 +178,19 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 		// do reliably report VRAM usage.

 		if isCPU {
-			cpuCount++
 			continue
 		}

-		// CPUs are always first in the list
-		gpuID := nodeID - cpuCount
-
-		// Shouldn't happen, but just in case...
-		if gpuID < 0 {
-			err := fmt.Errorf("unexpected amdgpu sysfs data resulted in negative GPU ID, please set OLLAMA_DEBUG=1 and report an issue")
-			slog.Error(err.Error())
-			return nil, err
+		// Skip over any GPUs that are masked
+		if major == 0 && minor == 0 && patch == 0 {
+			slog.Debug("skipping gpu with gfx000")
+			continue
 		}

+		// Keep track of numeric IDs based on valid GPUs
+		gpuID := gpuCount
+		gpuCount += 1
+
 		// Look up the memory for the current node
 		totalMemory := uint64(0)
 		usedMemory := uint64(0)

@@ -273,6 +264,14 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 			name = fmt.Sprintf("%04x:%04x", vendor, device)
 		}

+		// Favor UUIDs if available to reduce possibility of getting the numeric IDs wrong
+		var ID string
+		if uniqueID != 0 {
+			ID = fmt.Sprintf("GPU-%016x", uniqueID)
+		} else {
+			ID = strconv.Itoa(gpuID)
+		}
+
 		gpuInfo := RocmGPUInfo{
 			GpuInfo: GpuInfo{
 				Library: "rocm",

@@ -280,7 +279,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 				TotalMemory: totalMemory,
 				FreeMemory:  (totalMemory - usedMemory),
 			},
-			ID:      strconv.Itoa(gpuID),
+			ID:      ID,
 			Name:    name,
 			Compute: fmt.Sprintf("gfx%d%x%x", major, minor, patch),
 			MinimumMemory: rocmMinimumMemory,

@@ -288,6 +287,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 				DriverMinor: driverMinor,
 			},
 			usedFilepath: usedFile,
+			index:        gpuID,
 		}

 		// iGPU detection, remove this check once we can support an iGPU variant of the rocm library

@@ -319,7 +319,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 		if len(visibleDevices) > 0 {
 			include := false
 			for _, visible := range visibleDevices {
-				if visible == gpuInfo.ID {
+				if visible == gpuInfo.ID || visible == strconv.Itoa(gpuInfo.index) {
 					include = true
 					break
 				}

@@ -350,7 +350,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 				return nil, err
 			}
 		}
-		gpuInfo.DependencyPath = libDir
+		gpuInfo.DependencyPath = []string{libDir}

 		if gfxOverride == "" {
 			// Only load supported list once

@@ -516,3 +516,20 @@ func verifyKFDDriverAccess() error {
 	fd.Close()
 	return nil
 }
+
+func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
+	ids := []string{}
+	for _, info := range gpuInfo {
+		if info.Library != "rocm" {
+			// TODO shouldn't happen if things are wired correctly...
+			slog.Debug("rocmGetVisibleDevicesEnv skipping over non-rocm device", "library", info.Library)
+			continue
+		}
+		ids = append(ids, info.ID)
+	}
+	// There are 3 potential env vars to use to select GPUs.
+	// ROCR_VISIBLE_DEVICES supports UUID or numeric so is our preferred on linux
+	// GPU_DEVICE_ORDINAL supports numeric IDs only
+	// HIP_VISIBLE_DEVICES supports numeric IDs only
+	return "ROCR_VISIBLE_DEVICES", strings.Join(ids, ",")
+}
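A sketch of how the (name, value) pair returned by rocmGetVisibleDevicesEnv might be applied when spawning a runner process; this is not part of the change, the runner path and flags are placeholders, and the os and os/exec imports are assumed.

// launchWithROCmSelection applies the environment pair chosen above to a child process.
func launchWithROCmSelection(gpus []GpuInfo) error {
    name, value := rocmGetVisibleDevicesEnv(gpus)
    cmd := exec.Command("/path/to/runner", "--model", "model.gguf")
    // e.g. ROCR_VISIBLE_DEVICES=0,1 or ROCR_VISIBLE_DEVICES=GPU-<uuid>
    cmd.Env = append(os.Environ(), name+"="+value)
    return cmd.Start()
}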
@@ -43,7 +43,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 		slog.Debug("error looking up amd driver version", "error", err)
 	}

-	// Note: the HIP library automatically handles subsetting to any HIP_VISIBLE_DEVICES the user specified
+	// Note: the HIP library automatically handles subsetting to any *_VISIBLE_DEVICES the user specified
 	count := hl.HipGetDeviceCount()
 	if count == 0 {
 		err := fmt.Errorf("no compatible amdgpu devices detected")

@@ -111,7 +111,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 			UnreliableFreeMemory: true,

 			ID:             strconv.Itoa(i), // TODO this is probably wrong if we specify visible devices
-			DependencyPath: libDir,
+			DependencyPath: []string{libDir},
 			MinimumMemory:  rocmMinimumMemory,
 			Name:           name,
 			Compute:        gfx,

@@ -201,3 +201,20 @@ func (gpus RocmGPUInfoList) RefreshFreeMemory() error {
 	}
 	return nil
 }
+
+func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
+	ids := []string{}
+	for _, info := range gpuInfo {
+		if info.Library != "rocm" {
+			// TODO shouldn't happen if things are wired correctly...
+			slog.Debug("rocmGetVisibleDevicesEnv skipping over non-rocm device", "library", info.Library)
+			continue
+		}
+		ids = append(ids, info.ID)
+	}
+	// There are 3 potential env vars to use to select GPUs.
+	// ROCR_VISIBLE_DEVICES supports UUID or numeric but does not work on Windows
+	// HIP_VISIBLE_DEVICES supports numeric IDs only
+	// GPU_DEVICE_ORDINAL supports numeric IDs only
+	return "HIP_VISIBLE_DEVICES", strings.Join(ids, ",")
+}
@@ -240,7 +240,7 @@ func GetGPUInfo() GpuInfoList {
 				Library: "cpu",
 				Variant: cpuCapability.String(),
 				ID:      "0",
-				DependencyPath: depPath,
+				DependencyPath: []string{depPath},
 			},
 			CPUs: details,
 		},

@@ -293,11 +293,11 @@ func GetGPUInfo() GpuInfoList {
 			gpuInfo.DriverMinor = driverMinor
 			variant := cudaVariant(gpuInfo)
 			if depPath != "" {
-				gpuInfo.DependencyPath = depPath
+				gpuInfo.DependencyPath = []string{depPath}
 				// Check for variant specific directory
 				if variant != "" {
 					if _, err := os.Stat(filepath.Join(depPath, "cuda_"+variant)); err == nil {
-						gpuInfo.DependencyPath = filepath.Join(depPath, "cuda_"+variant)
+						gpuInfo.DependencyPath = []string{filepath.Join(depPath, "cuda_"+variant), depPath}
 					}
 				}
 			}

@@ -316,7 +316,9 @@ func GetGPUInfo() GpuInfoList {
 			// query the management library as well so we can record any skew between the two
 			// which represents overhead on the GPU we must set aside on subsequent updates
 			if cHandles.nvml != nil {
-				C.nvml_get_free(*cHandles.nvml, C.int(gpuInfo.index), &memInfo.free, &memInfo.total, &memInfo.used)
+				uuid := C.CString(gpuInfo.ID)
+				defer C.free(unsafe.Pointer(uuid))
+				C.nvml_get_free(*cHandles.nvml, uuid, &memInfo.free, &memInfo.total, &memInfo.used)
 				if memInfo.err != nil {
 					slog.Warn("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
 					C.free(unsafe.Pointer(memInfo.err))

@@ -368,7 +370,7 @@ func GetGPUInfo() GpuInfoList {
 			gpuInfo.FreeMemory = uint64(memInfo.free)
 			gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
 			gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
-			gpuInfo.DependencyPath = depPath
+			gpuInfo.DependencyPath = []string{depPath}
 			oneapiGPUs = append(oneapiGPUs, gpuInfo)
 		}
 	}

@@ -417,7 +419,9 @@ func GetGPUInfo() GpuInfoList {
 	}
 	for i, gpu := range cudaGPUs {
 		if cHandles.nvml != nil {
-			C.nvml_get_free(*cHandles.nvml, C.int(gpu.index), &memInfo.free, &memInfo.total, &memInfo.used)
+			uuid := C.CString(gpu.ID)
+			defer C.free(unsafe.Pointer(uuid))
+			C.nvml_get_free(*cHandles.nvml, uuid, &memInfo.free, &memInfo.total, &memInfo.used)
 		} else if cHandles.cudart != nil {
 			C.cudart_bootstrap(*cHandles.cudart, C.int(gpu.index), &memInfo)
 		} else if cHandles.nvcuda != nil {
@@ -4,6 +4,7 @@
 #include "gpu_info_nvcuda.h"

 void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
+  LOG(resp->ch.verbose, "initializing %s\n", nvcuda_lib_path);
   CUresult ret;
   resp->err = NULL;
   resp->num_devices = 0;

@@ -57,8 +58,10 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
       resp->cudaErr = -1;
       return;
     }
+    LOG(resp->ch.verbose, "dlsym: %s - %p\n", l[i].s, *l[i].p);
   }

+  LOG(resp->ch.verbose, "calling cuInit\n");
   ret = (*resp->ch.cuInit)(0);
   if (ret != CUDA_SUCCESS) {
     LOG(resp->ch.verbose, "cuInit err: %d\n", ret);

@@ -75,15 +78,18 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
   resp->ch.driver_minor = 0;

   // Report driver version if we're in verbose mode, ignore errors
+  LOG(resp->ch.verbose, "calling cuDriverGetVersion\n");
   ret = (*resp->ch.cuDriverGetVersion)(&version);
   if (ret != CUDA_SUCCESS) {
     LOG(resp->ch.verbose, "cuDriverGetVersion failed: %d\n", ret);
   } else {
+    LOG(resp->ch.verbose, "raw version 0x%x\n", version);
     resp->ch.driver_major = version / 1000;
     resp->ch.driver_minor = (version - (resp->ch.driver_major * 1000)) / 10;
     LOG(resp->ch.verbose, "CUDA driver version: %d.%d\n", resp->ch.driver_major, resp->ch.driver_minor);
   }

+  LOG(resp->ch.verbose, "calling cuDeviceGetCount\n");
   ret = (*resp->ch.cuDeviceGetCount)(&resp->num_devices);
   if (ret != CUDA_SUCCESS) {
     LOG(resp->ch.verbose, "cuDeviceGetCount err: %d\n", ret);

@@ -94,6 +100,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
     resp->cudaErr = ret;
     return;
   }
+  LOG(resp->ch.verbose, "device count %d\n", resp->num_devices);
 }

 const int buflen = 256;
@@ -17,7 +17,7 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
   } l[] = {
       {"nvmlInit_v2", (void *)&resp->ch.nvmlInit_v2},
       {"nvmlShutdown", (void *)&resp->ch.nvmlShutdown},
-      {"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.nvmlDeviceGetHandleByIndex},
+      {"nvmlDeviceGetHandleByUUID", (void *)&resp->ch.nvmlDeviceGetHandleByUUID},
       {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.nvmlDeviceGetMemoryInfo},
       {NULL, NULL},
   };

@@ -67,20 +67,20 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
 }

-void nvml_get_free(nvml_handle_t h, int device_id, uint64_t *free, uint64_t *total, uint64_t *used) {
+void nvml_get_free(nvml_handle_t h, char *uuid, uint64_t *free, uint64_t *total, uint64_t *used) {
   nvmlDevice_t device;
   nvmlMemory_t memInfo = {0};
   nvmlReturn_t ret;
-  ret = (*h.nvmlDeviceGetHandleByIndex)(device_id, &device);
+  ret = (*h.nvmlDeviceGetHandleByUUID)((const char *)(uuid), &device);
   if (ret != NVML_SUCCESS) {
-    LOG(1, "unable to get device handle %d: %d", device_id, ret);
+    LOG(1, "unable to get device handle %s: %d", uuid, ret);
     *free = 0;
     return;
   }

   ret = (*h.nvmlDeviceGetMemoryInfo)(device, &memInfo);
   if (ret != NVML_SUCCESS) {
-    LOG(1, "device memory info lookup failure %d: %d", device_id, ret);
+    LOG(1, "device memory info lookup failure %s: %d", uuid, ret);
     *free = 0;
     return;
   }
@@ -25,7 +25,7 @@ typedef struct nvml_handle {
   uint16_t verbose;
   nvmlReturn_t (*nvmlInit_v2)(void);
   nvmlReturn_t (*nvmlShutdown)(void);
-  nvmlReturn_t (*nvmlDeviceGetHandleByIndex)(unsigned int, nvmlDevice_t *);
+  nvmlReturn_t (*nvmlDeviceGetHandleByUUID)(const char *, nvmlDevice_t *);
   nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t, nvmlMemory_t *);
 } nvml_handle_t;

@@ -41,7 +41,7 @@ typedef struct nvml_compute_capability {
 } nvml_compute_capability_t;

 void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp);
-void nvml_get_free(nvml_handle_t ch, int device_id, uint64_t *free, uint64_t *total, uint64_t *used);
+void nvml_get_free(nvml_handle_t ch, char *uuid, uint64_t *free, uint64_t *total, uint64_t *used);
 void nvml_release(nvml_handle_t ch);

 #endif // __GPU_INFO_NVML_H__
@@ -3,9 +3,11 @@ package discover
 import (
 	"bufio"
 	"fmt"
+	"io"
 	"os"
 	"reflect"
 	"regexp"
+	"sort"
 	"strings"

 	"github.com/ollama/ollama/format"

@@ -109,6 +111,10 @@ func GetCPUDetails() ([]CPU, error) {
 	if err != nil {
 		return nil, err
 	}
+	return linuxCPUDetails(file)
+}
+
+func linuxCPUDetails(file io.Reader) ([]CPU, error) {
 	reColumns := regexp.MustCompile("\t+: ")
 	scanner := bufio.NewScanner(file)
 	cpuInfos := []linuxCpuInfo{}

@@ -131,6 +137,9 @@ func GetCPUDetails() ([]CPU, error) {
 			cpu = &linuxCpuInfo{}
 		}
 	}
+	if cpu.ID != "" {
+		cpuInfos = append(cpuInfos, *cpu)
+	}

 	// Process the sockets/cores/threads
 	socketByID := map[string]*CPU{}

@@ -177,10 +186,14 @@ func GetCPUDetails() ([]CPU, error) {
 			s.EfficiencyCoreCount = efficiencyCoreCount
 		}
 	}
-	result := []CPU{}
-	for _, c := range socketByID {
-		result = append(result, *c)
+	keys := make([]string, 0, len(socketByID))
+	result := make([]CPU, 0, len(socketByID))
+	for k := range socketByID {
+		keys = append(keys, k)
+	}
+	sort.Strings(keys)
+	for _, k := range keys {
+		result = append(result, *socketByID[k])
 	}
 	return result, nil
 }
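Splitting the parsing out into linuxCPUDetails(io.Reader) is what makes the new gpu_linux_test.go possible. Below is a rough sketch of such a test, feeding a canned /proc/cpuinfo snippet through strings.NewReader; the minimal set of keys the parser needs and the expected counts are assumptions here, not taken from the real test file, and the strings and testing imports are implied.

func TestLinuxCPUDetailsSketch(t *testing.T) {
    // Two logical processors sharing one core on one socket. Keys are
    // tab-separated from values, matching the "\t+: " pattern compiled above.
    cpuinfo := "processor\t: 0\nphysical id\t: 0\ncore id\t: 0\n\n" +
        "processor\t: 1\nphysical id\t: 0\ncore id\t: 0\n"

    cpus, err := linuxCPUDetails(strings.NewReader(cpuinfo))
    if err != nil {
        t.Fatal(err)
    }
    if len(cpus) != 1 || cpus[0].CoreCount != 1 {
        t.Fatalf("unexpected CPU details: %+v", cpus)
    }
}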
2097  discover/gpu_linux_test.go  (new file; diff suppressed because it is too large)
@@ -25,7 +25,7 @@ type GpuInfo struct { // TODO better name maybe "InferenceProcessor"?
 	MinimumMemory uint64 `json:"-"`

 	// Any extra PATH/LD_LIBRARY_PATH dependencies required for the Library to operate properly
-	DependencyPath string `json:"lib_path,omitempty"`
+	DependencyPath []string `json:"lib_path,omitempty"`

 	// Extra environment variables specific to the GPU as list of [key,value]
 	EnvWorkarounds [][2]string `json:"envs,omitempty"`
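With DependencyPath now a list, callers have to flatten it into the platform's library search path. The sketch below shows one way to do that on linux; it is not part of this change, the variable name is linux-specific (Windows would target PATH instead), and the os and strings imports are assumed.

// dependencyEnv flattens the list-valued DependencyPath into an LD_LIBRARY_PATH entry.
func dependencyEnv(info GpuInfo) string {
    paths := append([]string{}, info.DependencyPath...)
    if existing := os.Getenv("LD_LIBRARY_PATH"); existing != "" {
        paths = append(paths, existing)
    }
    return "LD_LIBRARY_PATH=" + strings.Join(paths, string(os.PathListSeparator))
}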
@ -175,6 +175,11 @@ func (si SystemInfo) GetOptimalThreadCount() int {
|
|||||||
if len(si.System.CPUs) == 0 {
|
if len(si.System.CPUs) == 0 {
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
// Allocate thread count matching the performance cores on a single socket
|
|
||||||
return si.System.CPUs[0].CoreCount - si.System.CPUs[0].EfficiencyCoreCount
|
coreCount := 0
|
||||||
|
for _, c := range si.System.CPUs {
|
||||||
|
coreCount += c.CoreCount - c.EfficiencyCoreCount
|
||||||
|
}
|
||||||
|
|
||||||
|
return coreCount
|
||||||
}
|
}
|
||||||
|
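With this change `GetOptimalThreadCount` sums performance cores across every socket rather than using only the first one. A self-contained sketch of the arithmetic, with invented per-socket counts purely for illustration:

```go
package main

import "fmt"

// cpu mirrors the two fields the hunk above relies on: total cores and
// efficiency cores per socket.
type cpu struct {
	CoreCount           int
	EfficiencyCoreCount int
}

func optimalThreads(cpus []cpu) int {
	if len(cpus) == 0 {
		return 0
	}
	coreCount := 0
	for _, c := range cpus {
		// Only performance cores count toward the thread budget.
		coreCount += c.CoreCount - c.EfficiencyCoreCount
	}
	return coreCount
}

func main() {
	// Two hypothetical sockets: 16 cores (4 efficiency) and 8 cores (0 efficiency).
	fmt.Println(optimalThreads([]cpu{{16, 4}, {8, 0}})) // prints 20
}
```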
@@ -355,7 +355,6 @@ curl http://localhost:11434/api/generate -d '{
     "num_gpu": 1,
     "main_gpu": 0,
     "low_vram": false,
-    "f16_kv": true,
     "vocab_only": false,
     "use_mmap": true,
     "use_mlock": false,
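The documented request above simply drops the retired `f16_kv` flag. As a sketch only (assuming a local server on the default `http://localhost:11434` and a hypothetical model name), the same kind of `/api/generate` call can be issued from Go with the standard library:

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
)

func main() {
	// Options mirror the documented example; "f16_kv" is intentionally absent.
	body, err := json.Marshal(map[string]any{
		"model":  "llama3.2", // hypothetical model name
		"prompt": "Why is the sky blue?",
		"stream": false,
		"options": map[string]any{
			"num_gpu":   1,
			"main_gpu":  0,
			"use_mmap":  true,
			"use_mlock": false,
		},
	})
	if err != nil {
		panic(err)
	}

	resp, err := http.Post("http://localhost:11434/api/generate", "application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	out, _ := io.ReadAll(resp.Body)
	fmt.Println(string(out))
}
```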
@@ -108,7 +108,7 @@ Custom CPU settings are not currently supported in the new Go server build but w

 #### Containerized Linux Build

-If you have Docker available, you can build linux binaries with `OLLAMA_NEW_RUNNERS=1 ./scripts/build_linux.sh` which has the CUDA and ROCm dependencies included. The resulting binary is placed in `./dist`
+If you have Docker available, you can build linux binaries with `./scripts/build_linux.sh` which has the CUDA and ROCm dependencies included. The resulting binary is placed in `./dist`

 ### Windows

@@ -118,10 +118,13 @@ The following tools are required as a minimal development environment to build C
 - https://go.dev/dl/
 - Git
   - https://git-scm.com/download/win
-- GCC and Make. There are multiple options on how to go about installing these tools on Windows. We have verified the following, but others may work as well:
+- clang with gcc compat and Make. There are multiple options on how to go about installing these tools on Windows. We have verified the following, but others may work as well:
   - [MSYS2](https://www.msys2.org/)
-    - After installing, from an MSYS2 terminal, run `pacman -S mingw-w64-ucrt-x86_64-gcc make` to install the required tools
-  - Assuming you used the default install prefix for msys2 above, add `c:\msys64\ucrt64\bin` and `c:\msys64\usr\bin` to your environment variable `PATH` where you will perform the build steps below (e.g. system-wide, account-level, powershell, cmd, etc.)
+    - After installing, from an MSYS2 terminal, run `pacman -S mingw-w64-clang-x86_64-gcc-compat mingw-w64-clang-x86_64-clang make` to install the required tools
+  - Assuming you used the default install prefix for msys2 above, add `C:\msys64\clang64\bin` and `c:\msys64\usr\bin` to your environment variable `PATH` where you will perform the build steps below (e.g. system-wide, account-level, powershell, cmd, etc.)
+
+> [!NOTE]
+> Due to bugs in the GCC C++ library for unicode support, Ollama should be built with clang on windows.

 Then, build the `ollama` binary:

@@ -74,6 +74,10 @@ would set `HSA_OVERRIDE_GFX_VERSION="10.3.0"` as an environment variable for the
 server. If you have an unsupported AMD GPU you can experiment using the list of
 supported types below.

+If you have multiple GPUs with different GFX versions, append the numeric device
+number to the environment variable to set them individually. For example,
+`HSA_OVERRIDE_GFX_VERSION_0=10.3.0` and `HSA_OVERRIDE_GFX_VERSION_1=11.0.0`
+
 At this time, the known supported GPU types on linux are the following LLVM Targets.
 This table shows some example GPUs that map to these LLVM targets:
 | **LLVM Target** | **An Example GPU** |
@@ -99,9 +103,10 @@ Reach out on [Discord](https://discord.gg/ollama) or file an
 ### GPU Selection

 If you have multiple AMD GPUs in your system and want to limit Ollama to use a
-subset, you can set `HIP_VISIBLE_DEVICES` to a comma separated list of GPUs.
+subset, you can set `ROCR_VISIBLE_DEVICES` to a comma separated list of GPUs.
 You can see the list of devices with `rocminfo`. If you want to ignore the GPUs
-and force CPU usage, use an invalid GPU ID (e.g., "-1")
+and force CPU usage, use an invalid GPU ID (e.g., "-1"). When available, use the
+`Uuid` to uniquely identify the device instead of numeric value.

 ### Container Permission

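For completeness, a minimal sketch of launching the server with the device filter described above from Go, assuming the `ollama` binary is on `PATH` and that device `0` (or a `Uuid` reported by `rocminfo`) is the GPU you want to keep visible:

```go
package main

import (
	"os"
	"os/exec"
)

func main() {
	cmd := exec.Command("ollama", "serve")
	// Restrict Ollama to a single ROCm device; an invalid ID such as "-1"
	// would force CPU-only inference instead.
	cmd.Env = append(os.Environ(), "ROCR_VISIBLE_DEVICES=0")
	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr
	if err := cmd.Run(); err != nil {
		panic(err)
	}
}
```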
@@ -32,7 +32,7 @@ ollama run my-model

 Ollama supports importing adapters based on several different model architectures including:

-  * Llama (including Llama 2, Llama 3, and Llama 3.1);
+  * Llama (including Llama 2, Llama 3, Llama 3.1, and Llama 3.2);
   * Mistral (including Mistral 1, Mistral 2, and Mixtral); and
   * Gemma (including Gemma 1 and Gemma 2)

@@ -67,14 +67,12 @@ ollama run my-model

 Ollama supports importing models for several different architectures including:

-  * Llama (including Llama 2, Llama 3, and Llama 3.1);
+  * Llama (including Llama 2, Llama 3, Llama 3.1, and Llama 3.2);
   * Mistral (including Mistral 1, Mistral 2, and Mixtral);
   * Gemma (including Gemma 1 and Gemma 2); and
   * Phi3

-This includes importing foundation models as well as any fine tuned models which which have been _fused_ with a foundation model.
+This includes importing foundation models as well as any fine tuned models which have been _fused_ with a foundation model.

 ## Importing a GGUF based model or adapter

 If you have a GGUF based model or adapter it is possible to import it into Ollama. You can obtain a GGUF model or adapter by:
@@ -120,7 +120,7 @@ FROM <model directory>
 The model directory should contain the Safetensors weights for a supported architecture.

 Currently supported model architectures:
-  * Llama (including Llama 2, Llama 3, and Llama 3.1)
+  * Llama (including Llama 2, Llama 3, Llama 3.1, and Llama 3.2)
   * Mistral (including Mistral 1, Mistral 2, and Mixtral)
   * Gemma (including Gemma 1 and Gemma 2)
   * Phi3
@@ -95,7 +95,9 @@ If none of those resolve the problem, gather additional information and file an

 On linux, AMD GPU access typically requires `video` and/or `render` group membership to access the `/dev/kfd` device. If permissions are not set up correctly, Ollama will detect this and report an error in the server log.

-When running in a container, in some Linux distributions and container runtimes, the ollama process may be unable to access the GPU. Use `ls -ld /dev/kfd /dev/dri /dev/dri/*` on the host system to determine the group assignments on your system, and pass additional `--group-add ...` arguments to the container so it can access the required devices.
+When running in a container, in some Linux distributions and container runtimes, the ollama process may be unable to access the GPU. Use `ls -lnd /dev/kfd /dev/dri /dev/dri/*` on the host system to determine the **numeric** group IDs on your system, and pass additional `--group-add ...` arguments to the container so it can access the required devices. For example, in the following output `crw-rw---- 1 0 44 226, 0 Sep 16 16:55 /dev/dri/card0` the group ID column is `44`
+
+If Ollama initially works on the GPU in a docker container, but then switches to running on CPU after some period of time with errors in the server log reporting GPU discovery failures, this can be resolved by disabling systemd cgroup management in Docker. Edit `/etc/docker/daemon.json` on the host and add `"exec-opts": ["native.cgroupdriver=cgroupfs"]` to the docker configuration.

 If you are experiencing problems getting Ollama to correctly discover or use your GPU for inference, the following may help isolate the failure.
 - `AMD_LOG_LEVEL=3` Enable info log levels in the AMD HIP/ROCm libraries. This can help show more detailed error codes that can help troubleshoot problems
@@ -10,7 +10,7 @@ This sounds like a typical censored response, but even llama2-uncensored gives a

 So let's figure out how we can use **LangChain** with Ollama to ask our question to the actual document, the Odyssey by Homer, using Python.

-Let's start by asking a simple question that we can get an answer to from the **Llama2** model using **Ollama**. First, we need to install the **LangChain** package:
+Let's start by asking a simple question that we can get an answer to from the **Llama3** model using **Ollama**. First, we need to install the **LangChain** package:

 `pip install langchain_community`

@@ -1,22 +1,15 @@
-# Ollama Windows Preview
+# Ollama Windows

-Welcome to the Ollama Windows preview.
+Welcome to Ollama for Windows.

 No more WSL required!

 Ollama now runs as a native Windows application, including NVIDIA and AMD Radeon GPU support.
-After installing Ollama Windows Preview, Ollama will run in the background and
+After installing Ollama for Windows, Ollama will run in the background and
 the `ollama` command line is available in `cmd`, `powershell` or your favorite
 terminal application. As usual the Ollama [api](./api.md) will be served on
 `http://localhost:11434`.

-As this is a preview release, you should expect a few bugs here and there. If
-you run into a problem you can reach out on
-[Discord](https://discord.gg/ollama), or file an
-[issue](https://github.com/ollama/ollama/issues).
-Logs will often be helpful in diagnosing the problem (see
-[Troubleshooting](#troubleshooting) below)
-
 ## System Requirements

 * Windows 10 22H2 or newer, Home or Pro
@@ -25,6 +18,32 @@ Logs will often be helpful in diagnosing the problem (see

 Ollama uses unicode characters for progress indication, which may render as unknown squares in some older terminal fonts in Windows 10. If you see this, try changing your terminal font settings.

+## Filesystem Requirements
+
+The Ollama install does not require Administrator, and installs in your home directory by default. You'll need at least 4GB of space for the binary install. Once you've installed Ollama, you'll need additional space for storing the Large Language models, which can be tens to hundreds of GB in size. If your home directory doesn't have enough space, you can change where the binaries are installed, and where the models are stored.
+
+### Changing Install Location
+
+To install the Ollama application in a location different than your home directory, start the installer with the following flag
+
+```powershell
+OllamaSetup.exe /DIR="d:\some\location"
+```
+
+### Changing Model Location
+
+To change where Ollama stores the downloaded models instead of using your home directory, set the environment variable `OLLAMA_MODELS` in your user account.
+
+1. Start the Settings (Windows 11) or Control Panel (Windows 10) application and search for _environment variables_.
+
+2. Click on _Edit environment variables for your account_.
+
+3. Edit or create a new variable for your user account for `OLLAMA_MODELS` where you want the models stored
+
+4. Click OK/Apply to save.
+
+If Ollama is already running, Quit the tray application and relaunch it from the Start menu, or a new terminal started after you saved the environment variables.
+
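As a companion to the model-location note above, here is an illustrative Go sketch of how a tool might resolve the models directory: honor `OLLAMA_MODELS` when it is set, and otherwise fall back to a `.ollama\models` folder under the user's home directory. The fallback path is an assumption made for the example, not a statement of the installer's exact default.

```go
package main

import (
	"fmt"
	"os"
	"path/filepath"
)

// modelsDir returns OLLAMA_MODELS when the user has set it, otherwise a
// conventional per-user default under the home directory (assumed here).
func modelsDir() (string, error) {
	if dir := os.Getenv("OLLAMA_MODELS"); dir != "" {
		return dir, nil
	}
	home, err := os.UserHomeDir()
	if err != nil {
		return "", err
	}
	return filepath.Join(home, ".ollama", "models"), nil
}

func main() {
	dir, err := modelsDir()
	if err != nil {
		panic(err)
	}
	fmt.Println("models are stored in:", dir)
}
```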
 ## API Access

 Here's a quick example showing API access from `powershell`
@@ -34,10 +53,6 @@ Here's a quick example showing API access from `powershell`

 ## Troubleshooting

-While we're in preview, `OLLAMA_DEBUG` is always enabled, which adds
-a "view logs" menu item to the app, and increases logging for the GUI app and
-server.
-
 Ollama on Windows stores files in a few different locations. You can view them in
 the explorer window by hitting `<cmd>+R` and type in:
 - `explorer %LOCALAPPDATA%\Ollama` contains logs, and downloaded updates
@@ -52,6 +67,10 @@ the explorer window by hitting `<cmd>+R` and type in:

 The Ollama Windows installer registers an Uninstaller application. Under `Add or remove programs` in Windows Settings, you can uninstall Ollama.

+> [!NOTE]
+> If you have [changed the OLLAMA_MODELS location](#changing-model-location), the installer will not remove your downloaded models
+
+
 ## Standalone CLI

 The easiest way to install Ollama on Windows is to use the `OllamaSetup.exe`
@@ -265,9 +265,9 @@ func AsMap() map[string]EnvVar {

 	if runtime.GOOS != "darwin" {
 		ret["CUDA_VISIBLE_DEVICES"] = EnvVar{"CUDA_VISIBLE_DEVICES", CudaVisibleDevices(), "Set which NVIDIA devices are visible"}
-		ret["HIP_VISIBLE_DEVICES"] = EnvVar{"HIP_VISIBLE_DEVICES", HipVisibleDevices(), "Set which AMD devices are visible"}
-		ret["ROCR_VISIBLE_DEVICES"] = EnvVar{"ROCR_VISIBLE_DEVICES", RocrVisibleDevices(), "Set which AMD devices are visible"}
-		ret["GPU_DEVICE_ORDINAL"] = EnvVar{"GPU_DEVICE_ORDINAL", GpuDeviceOrdinal(), "Set which AMD devices are visible"}
+		ret["HIP_VISIBLE_DEVICES"] = EnvVar{"HIP_VISIBLE_DEVICES", HipVisibleDevices(), "Set which AMD devices are visible by numeric ID"}
+		ret["ROCR_VISIBLE_DEVICES"] = EnvVar{"ROCR_VISIBLE_DEVICES", RocrVisibleDevices(), "Set which AMD devices are visible by UUID or numeric ID"}
+		ret["GPU_DEVICE_ORDINAL"] = EnvVar{"GPU_DEVICE_ORDINAL", GpuDeviceOrdinal(), "Set which AMD devices are visible by numeric ID"}
 		ret["HSA_OVERRIDE_GFX_VERSION"] = EnvVar{"HSA_OVERRIDE_GFX_VERSION", HsaOverrideGfxVersion(), "Override the gfx used for all detected AMD GPUs"}
 		ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGPU(), "Enable experimental Intel GPU detection"}
 	}
@@ -1,3 +0,0 @@
-# `modelfile`
-
-This package provides utilities for loading and inspecting model files
@@ -1 +0,0 @@
-package fileutils
go.mod (2 changed lines)
@@ -1,6 +1,6 @@
 module github.com/ollama/ollama

-go 1.22.5
+go 1.22.8

 require (
 	github.com/containerd/console v1.0.3
@@ -30,7 +30,30 @@ func TestOrcaMiniBlueSky(t *testing.T) {
 	GenerateTestHelper(ctx, t, req, []string{"rayleigh", "scattering"})
 }

-func TestUnicodeOutput(t *testing.T) {
+func TestUnicode(t *testing.T) {
+	ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute)
+	defer cancel()
+	// Set up the test data
+	req := api.GenerateRequest{
+		// DeepSeek has a Unicode tokenizer regex, making it a unicode torture test
+		Model:  "deepseek-coder-v2:16b-lite-instruct-q2_K",
+		Prompt: "天空为什么是蓝色的?",
+		Stream: &stream,
+		Options: map[string]interface{}{
+			"temperature": 0,
+			"seed":        123,
+			// Workaround deepseek context shifting bug
+			"num_ctx":     8192,
+			"num_predict": 2048,
+		},
+	}
+	client, _, cleanup := InitServerConnection(ctx, t)
+	defer cleanup()
+	require.NoError(t, PullIfMissing(ctx, client, req.Model))
+	DoGenerate(ctx, t, client, req, []string{"散射", "频率"}, 120*time.Second, 120*time.Second)
+}
+
+func TestExtendedUnicodeOutput(t *testing.T) {
 	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
 	defer cancel()
 	// Set up the test data
@@ -43,7 +66,10 @@ func TestUnicodeOutput(t *testing.T) {
 			"seed": 123,
 		},
 	}
-	GenerateTestHelper(ctx, t, req, []string{"😀", "😊", "😁", "😂", "😄", "😃"})
+	client, _, cleanup := InitServerConnection(ctx, t)
+	defer cleanup()
+	require.NoError(t, PullIfMissing(ctx, client, req.Model))
+	DoGenerate(ctx, t, client, req, []string{"😀", "😊", "😁", "😂", "😄", "😃"}, 120*time.Second, 120*time.Second)
 }

 func TestUnicodeModelDir(t *testing.T) {
@@ -60,7 +60,8 @@ func TestMultiModelConcurrency(t *testing.T) {
 	for i := 0; i < len(req); i++ {
 		go func(i int) {
 			defer wg.Done()
-			DoGenerate(ctx, t, client, req[i], resp[i], 60*time.Second, 10*time.Second)
+			// Note: CPU based inference can crawl so don't give up too quickly
+			DoGenerate(ctx, t, client, req[i], resp[i], 90*time.Second, 30*time.Second)
 		}(i)
 	}
 	wg.Wait()
@@ -12,7 +12,7 @@ import (
 	"github.com/stretchr/testify/require"
 )

-func TestIntegrationMultimodal(t *testing.T) {
+func TestIntegrationLlava(t *testing.T) {
 	image, err := base64.StdEncoding.DecodeString(imageEncoding)
 	require.NoError(t, err)
 	req := api.GenerateRequest{
@@ -39,6 +39,33 @@ func TestIntegrationMultimodal(t *testing.T) {
 	DoGenerate(ctx, t, client, req, []string{resp}, 120*time.Second, 30*time.Second)
 }

+func TestIntegrationMllama(t *testing.T) {
+	image, err := base64.StdEncoding.DecodeString(imageEncoding)
+	require.NoError(t, err)
+	req := api.GenerateRequest{
+		// TODO fix up once we publish the final image
+		Model:  "x/llama3.2-vision",
+		Prompt: "what does the text in this image say?",
+		Stream: &stream,
+		Options: map[string]interface{}{
+			"seed":        42,
+			"temperature": 0.0,
+		},
+		Images: []api.ImageData{
+			image,
+		},
+	}
+
+	resp := "the ollamas"
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
+	defer cancel()
+	client, _, cleanup := InitServerConnection(ctx, t)
+	defer cleanup()
+	require.NoError(t, PullIfMissing(ctx, client, req.Model))
+	// mllama models on CPU can be quite slow to start,
+	DoGenerate(ctx, t, client, req, []string{resp}, 240*time.Second, 30*time.Second)
+}
+
 const imageEncoding = `iVBORw0KGgoAAAANSUhEUgAAANIAAAB4CAYAAACHHqzKAAAAAXNSR0IArs4c6QAAAIRlWElmTU0AKgAAAAgABQESAAMAAAABAAEAAAEaAAUAAAABAAAASgEb
 AAUAAAABAAAAUgEoAAMAAAABAAIAAIdpAAQAAAABAAAAWgAAAAAAAABIAAAAAQAAAEgAAAABAAOgAQADAAAAAQABAACgAgAEAAAAAQAAANKgAwAEAAAAAQAA
 AHgAAAAAXdsepgAAAAlwSFlzAAALEwAACxMBAJqcGAAAAVlpVFh0WE1MOmNvbS5hZG9iZS54bXAAAAAAADx4OnhtcG1ldGEgeG1sbnM6eD0iYWRvYmU6bnM6
llama/llama-vocab.cpp (vendored, 2 changed lines)
@@ -415,7 +415,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
         case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
             regex_exprs = {
                 "[\r\n]",
-                "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
+                "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z\U00010400-\U0001044f𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
                 "\\s?[!-/:-~!-/:-~‘-‟ -。]+",
                 "\\s+$",
                 "[一-龥ࠀ-一가-]+",
llama/llama.cpp (vendored, 105 changed lines)
@@ -2699,7 +2699,7 @@ struct llama_hparams {
         GGML_ABORT("fatal error");
     }

-    bool cross_attention_layer(uint32_t il) const {
+    bool cross_attention_layers(uint32_t il) const {
         return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end();
     }
 };
@@ -2731,6 +2731,9 @@ struct llama_cparams {
     bool offload_kqv;
     bool flash_attn;
     bool no_perf;
+    // TODO (jmorganca): this should most likely be passed in as part of a batch
+    // and not set on the context for all batches.
+    bool cross_attn = false;

     enum llama_pooling_type pooling_type;

@@ -3542,10 +3545,6 @@ struct llama_context {
     struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc]
     struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch]

-    // TODO (jmorganca): this should most likely be passed in as part of a batch
-    // and not set on the context for all batches.
-    float * cross_attn_state = nullptr;
-    bool cross_attn_state_first_pass = true;
     struct ggml_tensor * inp_cross_attn_state; // F32 [4, n_embd, 1061]
 };

@@ -3782,7 +3781,7 @@ static bool llama_kv_cache_init(

     for (int i = 0; i < (int) n_layer; i++) {
         // for cross attention layers
-        if (model.arch == LLM_ARCH_MLLAMA && hparams.cross_attention_layer(i)) {
+        if (model.arch == LLM_ARCH_MLLAMA && hparams.cross_attention_layers(i)) {
             struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
             ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_k, 6404, hparams.n_head_kv(i));
             ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_v, 6404, hparams.n_head_kv(i));
@@ -7389,7 +7388,7 @@ static bool llm_load_tensors(

         auto & layer = model.layers[i];

-        if (hparams.cross_attention_layer(i)) {
+        if (hparams.cross_attention_layers(i)) {
             layer.cross_attn_k_norm = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_K_NORM, "weight", i), {128});
             layer.cross_attn_k_proj = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_K_PROJ, "weight", i), {n_embd, 1024});
             layer.cross_attn_o_proj = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_O_PROJ, "weight", i), {n_embd, n_embd});
@@ -9346,7 +9345,7 @@ static struct ggml_tensor * llm_build_inp_embd(

         inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);
     } else {
         lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
         inpL = lctx.inp_embd;
         ggml_set_input(lctx.inp_embd);
     }
@@ -9368,11 +9367,10 @@ static struct ggml_tensor * llm_build_inp_cross_attn_state(
         const llm_build_cb & cb) {
     const int64_t n_embd = hparams.n_embd;

-    struct ggml_tensor * inpCAS;
-    lctx.inp_cross_attn_state = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, 1601, 4);
-    cb(lctx.inp_cross_attn_state, "inp_cross_attn_state", -1);
-    ggml_set_input(lctx.inp_cross_attn_state);
-    inpCAS = lctx.inp_cross_attn_state;
+    struct ggml_tensor * inpCAS = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, 1601, 4);
+    cb(inpCAS, "inp_cross_attn_state", -1);
+    ggml_set_input(inpCAS);
+    lctx.inp_cross_attn_state = inpCAS;

     return inpCAS;
 }
@@ -10979,8 +10977,8 @@ struct llm_build_context {
                 LLM_NORM_RMS, cb, il);
         cb(cur, "attn_norm", il);

-        if (hparams.cross_attention_layer(il)) {
-            if (!lctx.cross_attn_state) {
+        if (hparams.cross_attention_layers(il)) {
+            if (!batch.embd && !cparams.cross_attn) {
                 continue;
             }

@@ -10991,42 +10989,28 @@ struct llm_build_context {
             Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
             cb(Qcur, "Qcur", il);

-            Qcur = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
-            cb(Qcur, "Qcur", il);
-
-            // TODO: is this required?
-            Qcur = ggml_cont(ctx0, Qcur);
+            Qcur = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 0, 2, 1, 3));
             cb(Qcur, "Qcur", il);

             Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].cross_attn_q_norm, NULL, LLM_NORM_RMS, cb, il);
             cb(Qcur, "Qcur", il);

-            struct ggml_tensor * Kcur;
-            if (lctx.cross_attn_state_first_pass) {
+            struct ggml_tensor * Kcur, * Vcur;
+            if (batch.embd) {
                 Kcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_k_proj, inpCAS);
                 cb(Kcur, "Kcur", il);

                 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, 6404);
                 cb(Kcur, "Kcur", il);

-                Kcur = ggml_permute(ctx0, Kcur, 0, 2, 1, 3);
-                cb(Kcur, "Kcur", il);
-
-                // TODO: is this required?
-                Kcur = ggml_cont(ctx0, Kcur);
+                Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
                 cb(Kcur, "Kcur", il);

                 Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].cross_attn_k_norm, NULL, LLM_NORM_RMS, cb, il);
                 cb(Kcur, "Kcur", il);

                 ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, kv_self.k_l[il]));
-            } else {
-                Kcur = ggml_view_tensor(ctx0, kv_self.k_l[il]);
-                cb(Kcur, "Kcur (view)", il);
-            }

-            struct ggml_tensor * Vcur;
-            if (lctx.cross_attn_state_first_pass) {
                 Vcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_v_proj, inpCAS);
                 cb(Vcur, "Vcur", il);

@@ -11038,6 +11022,9 @@ struct llm_build_context {

                 ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, kv_self.v_l[il]));
             } else {
+                Kcur = ggml_view_tensor(ctx0, kv_self.k_l[il]);
+                cb(Kcur, "Kcur (view)", il);
+
                 Vcur = ggml_view_tensor(ctx0, kv_self.v_l[il]);
                 cb(Vcur, "Vcur (view)", il);
             }
@@ -11045,11 +11032,8 @@ struct llm_build_context {
             struct ggml_tensor * kq = ggml_mul_mat(ctx0, Kcur, Qcur);
             cb(kq, "kq", il);

-            kq = ggml_scale_inplace(ctx0, kq, 1.0f/sqrtf(float(n_embd_head)));
-            cb(kq, "kq_scaled", il);
-
             // TODO: apply causal masks
-            struct ggml_tensor * kq_soft_max = ggml_soft_max_inplace(ctx0, kq);
+            struct ggml_tensor * kq_soft_max = ggml_soft_max_ext(ctx0, kq, nullptr, 1.f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
             cb(kq_soft_max, "kq_soft_max", il);

             Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, Vcur));
@@ -11139,8 +11123,8 @@ struct llm_build_context {
             cb(Kcur, "Kcur", il);

             cur = llm_build_kv(ctx0, lctx, kv_self, gf,
                     model.layers[il].wo, model.layers[il].bo,
                     Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);


             if (il == n_layer - 1) {
@@ -17197,10 +17181,19 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
     }

     if (batch.embd) {
-        const int64_t n_embd = hparams.n_embd;
-        const int64_t n_tokens = batch.n_tokens;
+        if (lctx.inp_cross_attn_state && lctx.inp_cross_attn_state->buffer) {
+            ggml_backend_tensor_set(lctx.inp_cross_attn_state, batch.embd, 0, ggml_nbytes(lctx.inp_cross_attn_state));
+            // zero out inp_embd since it's not used
+            float * inp_embd_data = (float *)lctx.inp_embd->data;
+            for (int i = 0; i < ggml_nelements(lctx.inp_embd); ++i) {
+                inp_embd_data[i] = 0.0f;
+            }
+        } else {
+            const int64_t n_embd = hparams.n_embd;
+            const int64_t n_tokens = batch.n_tokens;

         ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
+        }
     }

     if (batch.pos && lctx.inp_pos) {
@@ -17209,14 +17202,6 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
         ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
     }

-    // TODO (jmorganca): this might copy a lot of data on every request of a
-    // single generation even though it doesn't change, so we should
-    // find a way to not set this more than one time per image
-    if (lctx.inp_cross_attn_state &&
-        lctx.inp_cross_attn_state->buffer) {
-        ggml_backend_tensor_set(lctx.inp_cross_attn_state, lctx.cross_attn_state, 0, hparams.n_embd * 1601 * 4 * ggml_element_size(lctx.inp_cross_attn_state));
-    }
-
     if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
         GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs");
         const int64_t n_tokens = batch.n_tokens;
@@ -17789,7 +17774,7 @@ static int llama_decode_internal(
         n_outputs = 1;
     }

-    lctx.sbatch.from_batch(batch_all, n_embd,
+    lctx.sbatch.from_batch(batch_all, batch_all.n_embd,
         /* simple_split */ !kv_self.recurrent,
         /* logits_all */ n_outputs == n_tokens_all);

@@ -17899,10 +17884,6 @@ static int llama_decode_internal(

         llama_set_inputs(lctx, ubatch);

-        // TODO: replace with something better to find out if its
-        // our first actual pass
-        lctx.cross_attn_state_first_pass = false;
-
         llama_graph_compute(lctx, gf, n_threads, threadpool);

         // update the kv ring buffer
@@ -18086,7 +18067,7 @@ static int llama_encode_internal(

     const int64_t n_embd = hparams.n_embd;

-    lctx.sbatch.from_batch(batch, n_embd, /* simple_split */ true, /* logits_all */ true);
+    lctx.sbatch.from_batch(batch, batch.n_embd, /* simple_split */ true, /* logits_all */ true);

     const llama_ubatch ubatch = lctx.sbatch.split_simple(n_tokens);

@@ -20194,11 +20175,6 @@ struct llama_context * llama_new_context_with_model(
     return ctx;
 }

-void llama_set_cross_attn_state(struct llama_context * ctx, float * cross_attn_state) {
-    ctx->cross_attn_state_first_pass = true;
-    ctx->cross_attn_state = cross_attn_state;
-}
-
 void llama_free(struct llama_context * ctx) {
     delete ctx;
 }
@@ -21686,6 +21662,10 @@ void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
     ctx->cparams.causal_attn = causal_attn;
 }

+void llama_set_cross_attention(struct llama_context * ctx, bool cross_attention) {
+    ctx->cparams.cross_attn = cross_attention;
+}
+
 struct llama_batch llama_batch_get_one(
              llama_token * tokens,
              int32_t n_tokens,
@@ -21695,6 +21675,7 @@ struct llama_batch llama_batch_get_one(
         /*n_tokens =*/ n_tokens,
         /*tokens =*/ tokens,
         /*embd =*/ nullptr,
+        /*n_embd =*/ 0,
         /*pos =*/ nullptr,
         /*n_seq_id =*/ nullptr,
         /*seq_id =*/ nullptr,
@@ -21710,6 +21691,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
         /*n_tokens =*/ 0,
         /*tokens =*/ nullptr,
         /*embd =*/ nullptr,
+        /*n_embd =*/ 0,
         /*pos =*/ nullptr,
         /*n_seq_id =*/ nullptr,
         /*seq_id =*/ nullptr,
@@ -21721,6 +21703,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_

     if (embd) {
         batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
+        batch.n_embd = embd;
     } else {
         batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
     }
llama/llama.go (291 changed lines)
@@ -21,6 +21,8 @@ package llama
 #cgo cuda CFLAGS: -fPIE -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
 #cgo cuda CXXFLAGS: -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
 #cgo cuda CXXFLAGS: -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
+#cgo cuda_jetpack5 LDFLAGS: -lggml_cuda_jetpack5 -L/usr/local/cuda-11/lib64
+#cgo cuda_jetpack6 LDFLAGS: -lggml_cuda_jetpack6 -L/usr/local/cuda-12/lib64
 #cgo cuda_v11 LDFLAGS: -lggml_cuda_v11 -L/usr/local/cuda-11/lib64
 #cgo cuda_v12 LDFLAGS: -lggml_cuda_v12 -L/usr/local/cuda-12/lib64
 #cgo darwin,amd64 CFLAGS: -Wno-incompatible-pointer-types-discards-qualifiers
@@ -36,8 +38,8 @@ package llama
 #cgo linux CXXFLAGS: -D_GNU_SOURCE
 #cgo linux,amd64 LDFLAGS: -L${SRCDIR}/build/Linux/amd64
 #cgo linux,amd64 LDFLAGS: -L${SRCDIR}/build/Linux/amd64
-#cgo linux,arm64 CFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA -D__ARM_FEATURE_MATMUL_INT8
-#cgo linux,arm64 CXXFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA -D__ARM_FEATURE_MATMUL_INT8
+#cgo linux,arm64 CFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA
+#cgo linux,arm64 CXXFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA
 #cgo linux,arm64 LDFLAGS: -L${SRCDIR}/build/Linux/arm64
 #cgo linux,arm64,sve CFLAGS: -march=armv8.6-a+sve
 #cgo linux,arm64,sve CXXFLAGS: -march=armv8.6-a+sve
@@ -68,6 +70,17 @@ package llama
 #include "sampling_ext.h"

 bool llamaProgressCallback(float progress, void *user_data);
+
+typedef enum {COMP_UNKNOWN,COMP_GCC,COMP_CLANG} COMPILER;
+COMPILER inline get_compiler() {
+#if defined(__clang__)
+	return COMP_CLANG;
+#elif defined(__GNUC__)
+	return COMP_GCC;
+#else
+	return UNKNOWN_COMPILER;
+#endif
+}
 */
 import "C"

@@ -77,6 +90,7 @@ import (
 	"fmt"
 	"runtime"
 	"runtime/cgo"
+	"slices"
 	"strings"
 	"unsafe"
 )
@@ -88,7 +102,38 @@ func BackendInit() {
 }

 func PrintSystemInfo() string {
-	return C.GoString(C.llama_print_system_info())
+	var compiler string
+	switch C.get_compiler() {
+	case C.COMP_UNKNOWN:
+		compiler = "cgo(unknown_compiler)"
+	case C.COMP_GCC:
+		compiler = "cgo(gcc)"
+	case C.COMP_CLANG:
+		compiler = "cgo(clang)"
+	}
+	return C.GoString(C.llama_print_system_info()) + compiler
+}
+
+func GetModelArch(modelPath string) (string, error) {
+	mp := C.CString(modelPath)
+	defer C.free(unsafe.Pointer(mp))
+
+	gguf_ctx := C.gguf_init_from_file(mp, C.struct_gguf_init_params{no_alloc: true, ctx: (**C.struct_ggml_context)(C.NULL)})
+	if gguf_ctx == nil {
+		return "", errors.New("unable to load model file")
+	}
+	defer C.gguf_free(gguf_ctx)
+
+	key := C.CString("general.architecture")
+	defer C.free(unsafe.Pointer(key))
+	arch_index := C.gguf_find_key(gguf_ctx, key)
+	if int(arch_index) < 0 {
+		return "", errors.New("unknown model architecture")
+	}
+
+	arch := C.gguf_get_val_str(gguf_ctx, arch_index)
+
+	return C.GoString(arch), nil
 }

 type ContextParams struct {
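`GetModelArch` reads the `general.architecture` key straight out of a GGUF header. A hedged usage sketch of the helper added in the hunk above, with a placeholder model path:

```go
package main

import (
	"log"

	"github.com/ollama/ollama/llama"
)

func main() {
	// Hypothetical model path; GetModelArch is the exported helper shown above.
	arch, err := llama.GetModelArch("/path/to/model.gguf")
	if err != nil {
		log.Fatal(err)
	}
	log.Printf("model architecture: %s", arch)
}
```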
@@ -138,10 +183,6 @@ func (c *Context) Model() *Model {
 	return &Model{c: C.llama_get_model(c.c)}
 }

-func (c *Context) GetLogitsIth(i int) []float32 {
-	return unsafe.Slice((*float32)(unsafe.Pointer(C.llama_get_logits_ith(c.c, C.int(i)))), c.Model().NumVocab())
-}
-
 func (c *Context) KvCacheSeqAdd(seqId int, p0 int, p1 int, delta int) {
 	C.llama_kv_cache_seq_add(c.c, C.int(seqId), C.int(p0), C.int(p1), C.int(delta))
 }
@@ -165,7 +206,12 @@ func (c *Context) GetEmbeddingsSeq(seqId int) []float32 {
 }

 func (c *Context) GetEmbeddingsIth(i int) []float32 {
-	return unsafe.Slice((*float32)(unsafe.Pointer(C.llama_get_embeddings_ith(c.c, C.int32_t(i)))), c.Model().NEmbd())
+	embeddings := unsafe.Pointer(C.llama_get_embeddings_ith(c.c, C.int32_t(i)))
+	if embeddings == nil {
+		return nil
+	}
+
+	return unsafe.Slice((*float32)(embeddings), c.Model().NEmbd())
 }

 type ModelParams struct {
@@ -186,7 +232,7 @@ func llamaProgressCallback(progress C.float, userData unsafe.Pointer) C.bool {
 	return true
 }

-func LoadModelFromFile(modelPath string, params ModelParams) *Model {
+func LoadModelFromFile(modelPath string, params ModelParams) (*Model, error) {
 	cparams := C.llama_model_default_params()
 	cparams.n_gpu_layers = C.int(params.NumGpuLayers)
 	cparams.main_gpu = C.int32_t(params.MainGpu)
@@ -216,18 +262,28 @@ func LoadModelFromFile(modelPath string, params ModelParams) *Model {
 		cparams.progress_callback_user_data = unsafe.Pointer(&handle)
 	}

-	return &Model{c: C.llama_load_model_from_file(C.CString(modelPath), cparams)}
+	m := Model{c: C.llama_load_model_from_file(C.CString(modelPath), cparams)}
+	if m.c == nil {
+		return nil, fmt.Errorf("unable to load model: %s", modelPath)
+	}
+
+	return &m, nil
 }

 func FreeModel(model *Model) {
 	C.llama_free_model(model.c)
 }

-func NewContextWithModel(model *Model, params ContextParams) *Context {
-	return &Context{
+func NewContextWithModel(model *Model, params ContextParams) (*Context, error) {
+	c := Context{
 		c:          C.llama_new_context_with_model(model.c, params.c),
 		numThreads: int(params.c.n_threads),
 	}
+	if c.c == nil {
+		return nil, errors.New("unable to create llama context")
+	}
+
+	return &c, nil
 }

 func (m *Model) NumVocab() int {
@@ -247,6 +303,9 @@ func (m *Model) ApplyLoraFromFile(context *Context, loraPath string, scale float
 	defer C.free(unsafe.Pointer(cLoraPath))

 	loraAdapter := C.llama_lora_adapter_init(m.c, cLoraPath)
+	if loraAdapter == nil {
+		return errors.New("unable to load lora")
+	}

 	err := -1
 	if loraAdapter != nil {
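With `LoadModelFromFile` and `NewContextWithModel` now returning errors, callers check them instead of receiving a possibly-nil handle. A hedged sketch of the new flow; the model path is a placeholder and the `ContextParams` value is assumed to come from the package's own constructor, which is outside the scope of this hunk:

```go
package main

import (
	"log"

	"github.com/ollama/ollama/llama"
)

func main() {
	llama.BackendInit()

	// Placeholder model path; NumGpuLayers is one of the exported ModelParams fields.
	model, err := llama.LoadModelFromFile("/path/to/model.gguf", llama.ModelParams{NumGpuLayers: 0})
	if err != nil {
		log.Fatal(err)
	}
	defer llama.FreeModel(model)

	// The zero value stands in for a properly constructed ContextParams here.
	var params llama.ContextParams
	lc, err := llama.NewContextWithModel(model, params)
	if err != nil {
		log.Fatal(err)
	}
	_ = lc
}
```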
@@ -262,18 +321,40 @@ func (m *Model) ApplyLoraFromFile(context *Context, loraPath string, scale float
 type Batch struct {
 	c         C.struct_llama_batch
 	batchSize int
+	maxSeq    int
 	embedSize int
 }

-// Creates a new batch for either word tokens if embed is 0 or
-// image embeddings if embed is specified. Batches cannot contain
-// both types at the same time
-func NewBatch(nTokens int, embed int, maxSeq int) *Batch {
-	return &Batch{
-		c:         C.llama_batch_init(C.int(nTokens), C.int(embed), C.int(maxSeq)),
-		batchSize: nTokens,
-		embedSize: embed,
+// Creates a new batch for either word tokens or image embeddings (if embedSize is non-zero).
+// Batches cannot contain both types at the same time. batchSize is the maximum number of entries
+// that can be added per sequence
+func NewBatch(batchSize int, maxSeq int, embedSize int) (*Batch, error) {
+	b := Batch{
+		c:         C.llama_batch_init(C.int(batchSize*maxSeq), C.int(embedSize), C.int(maxSeq)),
+		batchSize: batchSize,
+		maxSeq:    maxSeq,
+		embedSize: embedSize,
 	}
+
+	// Check to see if any of the allocations in llama_batch_init() failed
+	nilPointer := (embedSize == 0 && b.c.token == nil) || (embedSize != 0 && b.c.embd == nil) ||
+		b.c.pos == nil || b.c.n_seq_id == nil || b.c.seq_id == nil || b.c.logits == nil ||
+		slices.Contains(unsafe.Slice(b.c.seq_id, b.allocSize()), nil)
+
+	if nilPointer {
+		C.llama_batch_free(b.c)
+		return nil, fmt.Errorf("unable to allocate batch (batchSize=%v maxSeq=%v embedSize=%v)", batchSize, maxSeq, embedSize)
+	}
+
+	return &b, nil
+}
+
+func (b *Batch) Size() int {
+	return b.batchSize
+}
+
+func (b *Batch) allocSize() int {
+	return b.batchSize * b.maxSeq
 }

 func (b *Batch) NumTokens() int {
@@ -288,21 +369,21 @@ func (b *Batch) IsEmbedding() bool {
 // when the batch was initialized. The other argument will be ignored. Adds to the
 // batch with the given position for the given sequence ids, and optionally instructs
 // to include logits.
-func (b *Batch) Add(token int, embed []float32, pos int, seqIds []int, logits bool) {
+func (b *Batch) Add(token int, embed []float32, pos int, logits bool, seqIds ...int) {
 	if !b.IsEmbedding() {
-		unsafe.Slice(b.c.token, b.batchSize)[b.c.n_tokens] = C.llama_token(token)
+		unsafe.Slice(b.c.token, b.allocSize())[b.c.n_tokens] = C.llama_token(token)
 	} else {
-		copy(unsafe.Slice((*float32)(b.c.embd), b.batchSize*b.embedSize)[int(b.c.n_tokens)*b.embedSize:], embed)
+		copy(unsafe.Slice((*float32)(b.c.embd), b.allocSize()*b.embedSize)[int(b.c.n_tokens)*b.embedSize:], embed)
 	}
-	unsafe.Slice(b.c.pos, b.batchSize)[b.c.n_tokens] = C.llama_pos(pos)
-	unsafe.Slice(b.c.n_seq_id, b.batchSize)[b.c.n_tokens] = C.int(len(seqIds))
+	unsafe.Slice(b.c.pos, b.allocSize())[b.c.n_tokens] = C.llama_pos(pos)
+	unsafe.Slice(b.c.n_seq_id, b.allocSize())[b.c.n_tokens] = C.int(len(seqIds))

 	for i, s := range seqIds {
-		unsafe.Slice((unsafe.Slice(b.c.seq_id, b.batchSize)[b.c.n_tokens]), C.int(len(seqIds)))[i] = C.int32_t(s)
+		unsafe.Slice((unsafe.Slice(b.c.seq_id, b.allocSize())[b.c.n_tokens]), C.int(len(seqIds)))[i] = C.int32_t(s)
 	}

 	if logits {
-		unsafe.Slice(b.c.logits, b.batchSize)[b.c.n_tokens] = 1
+		unsafe.Slice(b.c.logits, b.allocSize())[b.c.n_tokens] = 1
 	}

 	b.c.n_tokens += 1
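The comments in the hunk above describe the new `NewBatch(batchSize, maxSeq, embedSize)` contract: a token batch uses `embedSize == 0`, an embedding batch a non-zero size, never both, and `Add` now takes `logits` before the variadic sequence IDs. A hedged sketch of a caller that builds a small token batch; the token values are placeholders and cleanup of the batch is elided:

```go
package main

import (
	"log"

	"github.com/ollama/ollama/llama"
)

func main() {
	// Token batch: embedSize is 0; a single sequence is tracked.
	batch, err := llama.NewBatch(512, 1, 0)
	if err != nil {
		log.Fatal(err) // one of the llama_batch_init allocations failed
	}

	// Placeholder token IDs; logits are requested only for the last position,
	// and sequence 0 is the only sequence used here.
	tokens := []int{1, 2, 3}
	for i, tok := range tokens {
		batch.Add(tok, nil, i, i == len(tokens)-1, 0)
	}
	// Freeing the batch and submitting it for decode are omitted from this sketch.
}
```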
@@ -412,71 +493,42 @@ func Quantize(infile, outfile string, ftype uint32) error {
 	return nil
 }

-// llava
+// vision processing
 type ClipContext struct {
 	c *C.struct_clip_ctx
-	m *C.struct_mllama_ctx
-	IsMllama bool
-	embedPin runtime.Pinner
-	pinned   bool
 }

-func getVisionArch(mp *C.char) (string, error) {
-	gguf_ctx := C.gguf_init_from_file(mp, C.struct_gguf_init_params{no_alloc: true, ctx: (**C.struct_ggml_context)(C.NULL)})
-	if gguf_ctx == nil {
-		return "", errors.New("unable to load vision projector")
-	}
-	defer C.gguf_free(gguf_ctx)
-
-	arch_index := C.gguf_find_key(gguf_ctx, C.CString("general.architecture"))
-	if int(arch_index) < 0 {
-		return "", errors.New("unknown vision model architecture")
-	}
-
-	arch := C.gguf_get_val_str(gguf_ctx, arch_index)
-
-	return C.GoString(arch), nil
-}
-
-func NewClipContext(modelPath string) (*ClipContext, error) {
+func NewClipContext(llamaContext *Context, modelPath string) (*ClipContext, error) {
 	mp := C.CString(modelPath)
 	defer C.free(unsafe.Pointer(mp))
-
-	arch, err := getVisionArch(mp)
-	if err != nil {
-		return nil, err
+	c := C.clip_model_load(mp, 1)
+	if c == nil {
+		return nil, fmt.Errorf("unable to load clip model: %v", modelPath)
 	}

-	var cc ClipContext
-	if arch == "clip" {
-		cc.c = C.clip_model_load(mp, 1)
-	} else if arch == "mllama" {
-		cc.m = C.mllama_model_load(mp, 1)
-		cc.IsMllama = true
-	} else {
-		return nil, fmt.Errorf("unknown vision model architecture: %s", arch)
+	projEmbedSize := int(C.clip_n_mmproj_embd(c))
+	modelEmbedSize := llamaContext.Model().NEmbd()
+	if projEmbedSize != modelEmbedSize {
+		return nil, fmt.Errorf("projector embedding size (%d) does not match model (%d)", projEmbedSize, modelEmbedSize)
 	}

-	// XXX: check embedding size?
-	return &cc, nil
+	return &ClipContext{c: c}, nil
 }

 func (c *ClipContext) Free() {
-	if c.c != nil {
-		C.clip_free(c.c)
-	}
-	if c.m != nil {
-		C.mllama_free(c.m)
-	}
+	C.clip_free(c.c)
 }

-func NewLlavaImageEmbed(llamaContext *Context, clipContext *ClipContext, data []byte) [][]float32 {
-	c := C.llava_image_embed_make_with_bytes(clipContext.c, C.int(llamaContext.numThreads), (*C.uchar)(unsafe.Pointer(&data[0])), C.int(len(data)))
+func (c *ClipContext) NewEmbed(llamaContext *Context, data []byte) ([][]float32, error) {
+	l := C.llava_image_embed_make_with_bytes(c.c, C.int(llamaContext.numThreads), (*C.uchar)(unsafe.Pointer(&data[0])), C.int(len(data)))
+	if l == nil {
+		return nil, errors.New("unable to make llava embedding from image")
+	}

-	numTokens := int(c.n_image_pos)
+	numTokens := int(l.n_image_pos)
 	numEmbed := llamaContext.Model().NEmbd()

-	s := unsafe.Slice((*float32)(c.embed), numEmbed*numTokens)
+	s := unsafe.Slice((*float32)(l.embed), numEmbed*numTokens)

 	embed := make([][]float32, numTokens)
 	rows := make([]float32, len(s))
@@ -486,51 +538,66 @@ func NewLlavaImageEmbed(llamaContext *Context, clipContext *ClipContext, data []
 		embed[i] = rows[i*numEmbed : (i+1)*numEmbed]
 	}

-	C.llava_image_embed_free(c)
+	C.llava_image_embed_free(l)

-	return embed
+	return embed, nil
 }

-func NewMllamaImageEmbed(llamaContext *Context, clipContext *ClipContext, data []byte, aspectRatioId int) [][]float32 {
+type MllamaContext struct {
+	c *C.struct_mllama_ctx
+}
+
+func NewMllamaContext(llamaContext *Context, modelPath string) (*MllamaContext, error) {
+	mp := C.CString(modelPath)
+	defer C.free(unsafe.Pointer(mp))
+	c := C.mllama_model_load(mp, 1)
+	if c == nil {
+		return nil, fmt.Errorf("unable to load mllama model: %v", modelPath)
+	}
+
+	projEmbedSize := int(C.mllama_n_embd(c))
+	modelEmbedSize := llamaContext.Model().NEmbd()
+	if projEmbedSize != modelEmbedSize {
+		return nil, fmt.Errorf("projector embedding size (%d) does not match model (%d)", projEmbedSize, modelEmbedSize)
+	}
+
+	return &MllamaContext{c: c}, nil
+}
+
+func (m *MllamaContext) Free() {
+	C.mllama_free(m.c)
+}
+
+func (m *MllamaContext) NewEmbed(llamaContext *Context, data []byte, aspectRatioId int) ([][]float32, error) {
 	img := C.mllama_image_init()
 	defer C.mllama_image_free(img)

-	C.mllama_image_load_from_data(unsafe.Pointer(&data[0]), C.int(len(data)), 560, 560, 3, 4, C.int(aspectRatioId), img)
+	ok := bool(C.mllama_image_load_from_data(unsafe.Pointer(&data[0]), C.int(len(data)), 560, 560, 3, 4, C.int(aspectRatioId), img))
+	if !ok {
-	numTokens := int(C.mllama_n_positions(clipContext.m) * C.mllama_n_tiles(clipContext.m))
+		return nil, errors.New("unable to load mllama image data")
-	numEmbed := llamaContext.Model().NEmbd()
-
-	rows := make([]float32, numEmbed*numTokens)
-	C.mllama_image_encode(clipContext.m, C.int(llamaContext.numThreads), img, (*C.float)(unsafe.Pointer(&rows[0])))
-
-	embed := make([][]float32, numTokens)
-	for i := range embed {
-		embed[i] = rows[i*numEmbed : (i+1)*numEmbed]
 	}

-	return embed
+	rows := make([]float32, m.EmbedSize(llamaContext))
+	ok = bool(C.mllama_image_encode(m.c, C.int(llamaContext.numThreads), img, (*C.float)(unsafe.Pointer(&rows[0]))))
+	if !ok {
+		return nil, errors.New("unable to make mllama embedding from image")
+	}
+
+	embed := make([][]float32, 1)
+	embed[0] = rows
+
+	return embed, nil
 }

-// This really needs to be set on a batch instead
+func (m *MllamaContext) EmbedSize(llamaContext *Context) int {
-func MllamaSetCrossAttn(llamaContext *Context, clipContext *ClipContext, embed [][]float32) {
+	numTokens := int(C.mllama_n_positions(m.c) * C.mllama_n_tiles(m.c))
-	if embed != nil {
+	numEmbed := llamaContext.Model().NEmbd()
-		if clipContext.pinned {
-			panic("Cross attention state already pinned")
-		}

-		embedData := &embed[0][0]
+	return numTokens * numEmbed
-		clipContext.embedPin.Pin(embedData)
+}
-		clipContext.pinned = true

-		C.llama_set_cross_attn_state(llamaContext.c, (*C.float)(unsafe.Pointer(embedData)))
+func (c *Context) SetCrossAttention(state bool) {
-	} else {
+	C.llama_set_cross_attention(c.c, C.bool(state))
-		C.llama_set_cross_attn_state(llamaContext.c, (*C.float)(C.NULL))
-
-		if clipContext.pinned {
-			clipContext.embedPin.Unpin()
-			clipContext.pinned = false
-		}
-	}
 }
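Taken together, this refactor splits llava (clip) and mllama handling into separate context types, validates the projector embedding size against the model, and replaces the pinned cross-attention state with a per-context boolean. A rough sketch of how a caller might drive the two paths follows; the model/context setup and the aspect-ratio value are assumptions, not part of this diff.

package main

import "github.com/ollama/ollama/llama"

// imageEmbedding produces image embeddings with whichever vision context was loaded.
// Exactly one of clip or mllama is assumed to be non-nil.
func imageEmbedding(lc *llama.Context, clip *llama.ClipContext, mllama *llama.MllamaContext, img []byte, aspectRatioID int) ([][]float32, error) {
	if mllama != nil {
		// mllama returns a single large embedding and needs cross attention
		// enabled for the batches that reference it.
		embed, err := mllama.NewEmbed(lc, img, aspectRatioID)
		if err != nil {
			return nil, err
		}
		lc.SetCrossAttention(true)
		return embed, nil
	}
	// llava/clip returns one embedding row per image token.
	return clip.NewEmbed(lc, img)
}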
@@ -558,7 +625,7 @@ type SamplingParams struct {
 	Grammar string
 }

-func NewSamplingContext(model *Model, params SamplingParams) *SamplingContext {
+func NewSamplingContext(model *Model, params SamplingParams) (*SamplingContext, error) {
 	var cparams C.struct_gpt_sampler_cparams
 	cparams.top_k = C.int32_t(params.TopK)
 	cparams.top_p = C.float(params.TopP)
@@ -581,9 +648,13 @@ func NewSamplingContext(model *Model, params SamplingParams) *SamplingContext {

 	cparams.grammar = grammar
 	context := &SamplingContext{c: C.gpt_sampler_cinit(model.c, &cparams)}
+	if context.c == nil {
+		return nil, errors.New("unable to create sampling context")
+	}
+
 	runtime.SetFinalizer(context, func(s *SamplingContext) { C.gpt_sampler_cfree(s.c) })

-	return context
+	return context, nil
 }

 func (s *SamplingContext) Reset() {
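NewSamplingContext now reports failure instead of handing back a sampler wrapping a nil pointer. A short, hedged sketch of the call-site adjustment (the surrounding names are assumptions for illustration):

package main

import "github.com/ollama/ollama/llama"

// newSampler wraps the now-fallible constructor; earlier code could assume success.
func newSampler(model *llama.Model, params llama.SamplingParams) (*llama.SamplingContext, error) {
	sc, err := llama.NewSamplingContext(model, params)
	if err != nil {
		// e.g. an invalid grammar can make the underlying sampler init fail
		return nil, err
	}
	return sc, nil
}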
3  llama/llama.h  vendored
@@ -266,6 +266,7 @@ extern "C" {

         llama_token  *  token;
         float        *  embd;
+        int32_t         n_embd;
         llama_pos    *  pos;
         int32_t      *  n_seq_id;
         llama_seq_id ** seq_id;
@@ -451,7 +452,7 @@ extern "C" {

     // TODO (jmorganca): this should most likely be passed in as part of a batch
     // and not set on the context for all batches.
-    LLAMA_API void llama_set_cross_attn_state(struct llama_context * ctx, float * cross_attn_state);
+    LLAMA_API void llama_set_cross_attention(struct llama_context * ctx, bool cross_attn_state);

     // Frees all allocated memory
     LLAMA_API void llama_free(struct llama_context * ctx);
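With llama_set_cross_attn_state replaced by the boolean llama_set_cross_attention, the Go runner can flip cross attention on only for batches that actually carry image embeddings. A sketch of that decision is below; promptInput is a stand-in for the runner's internal input type, which is an assumption for illustration.

package main

import "github.com/ollama/ollama/llama"

// promptInput stands in for the runner's input type: embed is non-nil when the
// entry is an image embedding rather than a text token.
type promptInput struct {
	token int
	embed []float32
}

// enableCrossAttention switches cross attention on only when a batch contains
// an image embedding, mirroring NeedCrossAttention in the runner's image.go.
func enableCrossAttention(lc *llama.Context, inputs []promptInput) {
	needed := false
	for _, in := range inputs {
		if in.embed != nil {
			needed = true
			break
		}
	}
	lc.SetCrossAttention(needed)
}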
2  llama/llava.cpp  vendored
@@ -435,7 +435,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
         if (n_eval > n_batch) {
             n_eval = n_batch;
         }
-        llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
+        llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), n_embd, nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
         if (llama_decode(ctx_llama, batch)) {
             LOG_ERR("%s : failed to eval\n", __func__);
             return false;
@@ -58,6 +58,8 @@ endif
 GPU_COMPILER_CUFLAGS = \
 	$(GPU_COMPILER_FPIC) \
 	$(addprefix -m,$(GPU_RUNNER_CPU_FLAGS)) \
+	-mf16c \
+	-mfma \
 	-parallel-jobs=2 \
 	-c \
 	-O3 \
@@ -77,6 +79,9 @@ GPU_COMPILER_CUFLAGS = \
 	-D_CRT_SECURE_NO_WARNINGS \
 	-D_GNU_SOURCE \
 	-D_XOPEN_SOURCE=600 \
+	-DUSE_PROF_API=1 \
+	-std=gnu++14 \
+	-x hip \
 	-mllvm=-amdgpu-early-inline-all=true \
 	-mllvm=-amdgpu-function-calls=false \
 	-Wno-expansion-to-defined \
@@ -87,6 +92,12 @@ GPU_COMPILER_CUFLAGS = \
 	-Wno-unused-result \
 	-I.

+# Workaround buggy P2P copy on some windows multi-GPU setups
+# This workaround breaks linux systems with small system RAM, so only enable on windows
+ifeq ($(OS),windows)
+	GPU_COMPILER_CUFLAGS += -DGGML_CUDA_NO_PEER_COPY=1
+endif
+
 include make/gpu.make

 # Adjust the rules from gpu.make to handle the ROCm dependencies properly
@@ -3,7 +3,7 @@
 REPO_ROOT:=$(dir $(patsubst %/,%,$(dir $(patsubst %/,%,$(dir $(abspath $(lastword $(MAKEFILE_LIST))))))))
 DST_DIR:=$(dir $(patsubst %/,%,$(dir $(abspath $(lastword $(MAKEFILE_LIST))))))

-include $(REPO_ROOT)llama/vendoring.env
+include $(REPO_ROOT)llama/vendoring

 LLAMACPP_REPO := $(REPO_ROOT)llama/vendor/

@@ -76,3 +76,9 @@ else
 CP := cp -af
 endif

+COMMON_SRCS := \
+	$(wildcard *.c) \
+	$(wildcard *.cpp)
+COMMON_HDRS := \
+	$(wildcard *.h) \
+	$(wildcard *.hpp)
|
@ -20,7 +20,7 @@ GPU_COMPILER_CFLAGS_LINUX = $(CFLAGS) -Xcompiler -fPIC -D_GNU_SOURCE
|
|||||||
GPU_COMPILER_CXXFLAGS_WIN = $(CXXFLAGS) -D_WIN32_WINNT=0x602
|
GPU_COMPILER_CXXFLAGS_WIN = $(CXXFLAGS) -D_WIN32_WINNT=0x602
|
||||||
GPU_COMPILER_CXXFLAGS_LINUX = $(CXXFLAGS) -Xcompiler -fPIC -D_GNU_SOURCE
|
GPU_COMPILER_CXXFLAGS_LINUX = $(CXXFLAGS) -Xcompiler -fPIC -D_GNU_SOURCE
|
||||||
GPU_LIBS = $(sort $(wildcard $(addsuffix *.$(SHARED_EXT)*,$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT)))))
|
GPU_LIBS = $(sort $(wildcard $(addsuffix *.$(SHARED_EXT)*,$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT)))))
|
||||||
GPU_DIST_DEPS_LIBS= $(sort $(addprefix $(DIST_LIB_DIR)/,$(notdir $(GPU_LIBS))))
|
GPU_DIST_DEPS_LIBS= $(sort $(addprefix $(DIST_GPU_RUNNER_DEPS_DIR)/,$(notdir $(GPU_LIBS))))
|
||||||
|
|
||||||
ifeq ($(OS),linux)
|
ifeq ($(OS),linux)
|
||||||
CUDA_PATH?=/usr/local/cuda
|
CUDA_PATH?=/usr/local/cuda
|
||||||
|
@ -34,13 +34,6 @@ endif
|
|||||||
GPU_RUNNER_LIBS = $(wildcard $(addsuffix .$(SHARED_EXT).*,$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT))))
|
GPU_RUNNER_LIBS = $(wildcard $(addsuffix .$(SHARED_EXT).*,$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT))))
|
||||||
DIST_GPU_RUNNER_LIB_DEPS = $(addprefix $(DIST_GPU_RUNNER_DEPS_DIR)/,$(notdir $(GPU_RUNNER_LIBS)))
|
DIST_GPU_RUNNER_LIB_DEPS = $(addprefix $(DIST_GPU_RUNNER_DEPS_DIR)/,$(notdir $(GPU_RUNNER_LIBS)))
|
||||||
|
|
||||||
COMMON_SRCS := \
|
|
||||||
$(wildcard *.c) \
|
|
||||||
$(wildcard *.cpp)
|
|
||||||
COMMON_HDRS := \
|
|
||||||
$(wildcard *.h) \
|
|
||||||
$(wildcard *.hpp)
|
|
||||||
|
|
||||||
GPU_RUNNER_SRCS := \
|
GPU_RUNNER_SRCS := \
|
||||||
ggml-cuda.cu \
|
ggml-cuda.cu \
|
||||||
$(filter-out $(wildcard ggml-cuda/fattn*.cu),$(wildcard ggml-cuda/*.cu)) \
|
$(filter-out $(wildcard ggml-cuda/fattn*.cu),$(wildcard ggml-cuda/*.cu)) \
|
||||||
@ -92,7 +85,7 @@ $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/ollama_llama_server$(EXE_EXT): $(RUNNERS
|
|||||||
GOARCH=$(ARCH) CGO_LDFLAGS="$(TARGET_CGO_LDFLAGS)" go build -buildmode=pie $(GPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(GPU_RUNNER_CPU_FLAGS) $(GPU_RUNNER_GO_TAGS)) -o $@ ./runner
|
GOARCH=$(ARCH) CGO_LDFLAGS="$(TARGET_CGO_LDFLAGS)" go build -buildmode=pie $(GPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(GPU_RUNNER_CPU_FLAGS) $(GPU_RUNNER_GO_TAGS)) -o $@ ./runner
|
||||||
$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(GPU_RUNNER_OBJS) $(DIST_GPU_RUNNER_LIB_DEPS) $(COMMON_HDRS) $(GPU_RUNNER_HDRS)
|
$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(GPU_RUNNER_OBJS) $(DIST_GPU_RUNNER_LIB_DEPS) $(COMMON_HDRS) $(GPU_RUNNER_HDRS)
|
||||||
@-mkdir -p $(dir $@)
|
@-mkdir -p $(dir $@)
|
||||||
$(CCACHE) $(GPU_COMPILER) --shared $(GPU_RUNNER_DRIVER_LIB_LINK) -L${DIST_GPU_RUNNER_DEPS_DIR} $(foreach lib, $(GPU_RUNNER_LIBS_SHORT), -l$(lib)) $(GPU_RUNNER_OBJS) -o $@
|
$(CCACHE) $(GPU_COMPILER) --shared -L$(GPU_LIB_DIR) $(GPU_RUNNER_DRIVER_LIB_LINK) -L${DIST_GPU_RUNNER_DEPS_DIR} $(foreach lib, $(GPU_RUNNER_LIBS_SHORT), -l$(lib)) $(GPU_RUNNER_OBJS) -o $@
|
||||||
|
|
||||||
# Distribution targets
|
# Distribution targets
|
||||||
$(RUNNERS_DIST_DIR)/%: $(RUNNERS_BUILD_DIR)/%
|
$(RUNNERS_DIST_DIR)/%: $(RUNNERS_BUILD_DIR)/%
|
||||||
|
@@ -12,27 +12,49 @@ kv cache once per run

 remaining is to implement the cross attention mask
 ---
- include/llama.h |   4 +
+ examples/llava/llava.cpp |   2 +-
- src/llama.cpp   | 456 ++++++++++++++++++++++++++++++++++++++++++++++--
+ include/llama.h          |   5 +
- 2 files changed, 447 insertions(+), 13 deletions(-)
+ src/llama.cpp            | 447 +++++++++++++++++++++++++++++++++++++--
+ 3 files changed, 436 insertions(+), 18 deletions(-)

+diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
+index 8558c6bd..37b2f2e2 100644
+--- a/examples/llava/llava.cpp
++++ b/examples/llava/llava.cpp
+@@ -409,7 +409,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
+         if (n_eval > n_batch) {
+             n_eval = n_batch;
+         }
+-        llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
++        llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), n_embd, nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
+         if (llama_decode(ctx_llama, batch)) {
+             LOG_ERR("%s : failed to eval\n", __func__);
+             return false;
 diff --git a/include/llama.h b/include/llama.h
-index 7cae1bbe..122e3cf1 100644
+index 7cae1bbe..aca09310 100644
 --- a/include/llama.h
 +++ b/include/llama.h
-@@ -423,6 +423,10 @@ extern "C" {
+@@ -240,6 +240,7 @@ extern "C" {
+
+         llama_token  *  token;
+         float        *  embd;
++        int32_t         n_embd;
+         llama_pos    *  pos;
+         int32_t      *  n_seq_id;
+         llama_seq_id ** seq_id;
+@@ -423,6 +424,10 @@ extern "C" {
              struct llama_model * model,
              struct llama_context_params params);

 +    // TODO (jmorganca): this should most likely be passed in as part of a batch
 +    // and not set on the context for all batches.
-+    LLAMA_API void llama_set_cross_attn_state(struct llama_context * ctx, float * cross_attn_state);
++    LLAMA_API void llama_set_cross_attention(struct llama_context * ctx, bool cross_attn_state);
 +
      // Frees all allocated memory
      LLAMA_API void llama_free(struct llama_context * ctx);

 diff --git a/src/llama.cpp b/src/llama.cpp
-index 83b80b59..b189a19a 100644
+index 83b80b59..35748488 100644
 --- a/src/llama.cpp
 +++ b/src/llama.cpp
 @@ -169,6 +169,7 @@ static std::string format(const char * fmt, ...) {
@@ -160,13 +182,23 @@ index 83b80b59..b189a19a 100644
          GGML_ABORT("fatal error");
      }
 +
-+    bool cross_attention_layer(uint32_t il) const {
++    bool cross_attention_layers(uint32_t il) const {
 +        return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end();
 +    }
  };

  static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
-@@ -2806,6 +2859,16 @@ struct llama_layer {
+@@ -2652,6 +2705,9 @@ struct llama_cparams {
+     bool offload_kqv;
+     bool flash_attn;
+     bool no_perf;
++    // TODO (jmorganca): this should most likely be passed in as part of a batch
++    // and not set on the context for all batches.
++    bool cross_attn = false;

+     enum llama_pooling_type pooling_type;

+@@ -2806,6 +2862,16 @@ struct llama_layer {
      struct ggml_tensor * ffn_down_scale;

      struct ggml_tensor * bskcn_tv;
@@ -183,25 +215,21 @@ index 83b80b59..b189a19a 100644
  };

  // very similar to llama_batch,
-@@ -3452,6 +3515,12 @@ struct llama_context {
+@@ -3452,6 +3518,8 @@ struct llama_context {
      struct ggml_tensor * inp_pos_bucket;    // I32 [n_batch|n_kv, n_batch]
      struct ggml_tensor * inp_embd_enc;      // F32 [n_embd, n_outputs_enc]
      struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch]
 +
-+    // TODO (jmorganca): this should most likely be passed in as part of a batch
-+    // and not set on the context for all batches.
-+    float * cross_attn_state = nullptr;
-+    bool cross_attn_state_first_pass = true;
 +    struct ggml_tensor * inp_cross_attn_state; // F32 [4, n_embd, 1061]
  };

  struct llama_lora_weight {
-@@ -3686,6 +3755,18 @@ static bool llama_kv_cache_init(
+@@ -3686,6 +3754,18 @@ static bool llama_kv_cache_init(
      cache.v_l.reserve(n_layer);

      for (int i = 0; i < (int) n_layer; i++) {
 +        // for cross attention layers
-+        if (model.arch == LLM_ARCH_MLLAMA && hparams.cross_attention_layer(i)) {
++        if (model.arch == LLM_ARCH_MLLAMA && hparams.cross_attention_layers(i)) {
 +            struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
 +            ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_k, 6404, hparams.n_head_kv(i));
 +            ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_v, 6404, hparams.n_head_kv(i));
@@ -215,7 +243,7 @@ index 83b80b59..b189a19a 100644
          const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
          const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();

-@@ -5460,12 +5541,14 @@ static void llm_load_hparams(
+@@ -5460,12 +5540,14 @@ static void llm_load_hparams(
      }

      // zero-out the per-layer hparams
@@ -235,7 +263,7 @@ index 83b80b59..b189a19a 100644

      // n_head_kv is optional, default to n_head
      hparams.n_head_kv_arr = hparams.n_head_arr;
-@@ -5514,7 +5597,7 @@ static void llm_load_hparams(
+@@ -5514,7 +5596,7 @@ static void llm_load_hparams(

      ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);

@@ -244,7 +272,7 @@ index 83b80b59..b189a19a 100644
      if (hparams.n_rot != hparams.n_embd_head_k) {
          throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
      }
-@@ -5554,6 +5637,16 @@ static void llm_load_hparams(
+@@ -5554,6 +5636,16 @@ static void llm_load_hparams(
          }
      }
  } break;
@@ -261,7 +289,7 @@ index 83b80b59..b189a19a 100644
      case LLM_ARCH_MINICPM:
          {
              ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-@@ -7249,6 +7342,55 @@ static bool llm_load_tensors(
+@@ -7249,6 +7341,55 @@ static bool llm_load_tensors(
      layer.rope_short = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight"), { n_embd_head_qk_rope/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
      }
  } break;
@@ -286,7 +314,7 @@ index 83b80b59..b189a19a 100644
 +
 +            auto & layer = model.layers[i];
 +
-+            if (hparams.cross_attention_layer(i)) {
++            if (hparams.cross_attention_layers(i)) {
 +                layer.cross_attn_k_norm = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_K_NORM, "weight", i), {128});
 +                layer.cross_attn_k_proj = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_K_PROJ, "weight", i), {n_embd, 1024});
 +                layer.cross_attn_o_proj = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_O_PROJ, "weight", i), {n_embd, n_embd});
@@ -317,7 +345,7 @@ index 83b80b59..b189a19a 100644
      case LLM_ARCH_GROK:
          {
              if (n_expert == 0) {
-@@ -9093,7 +9235,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
+@@ -9093,7 +9234,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam

      if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE &&
          model.hparams.n_vocab != model.vocab.id_to_token.size()) {
@@ -326,16 +354,7 @@ index 83b80b59..b189a19a 100644
      }

      if (params.vocab_only) {
-@@ -9178,7 +9320,7 @@ static struct ggml_tensor * llm_build_inp_embd(
+@@ -9193,6 +9334,21 @@ static struct ggml_tensor * llm_build_inp_embd(
-
-         inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);
-     } else {
--        lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
-+        lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
-         inpL = lctx.inp_embd;
-         ggml_set_input(lctx.inp_embd);
-     }
-@@ -9193,6 +9335,22 @@ static struct ggml_tensor * llm_build_inp_embd(
      return inpL;
  }
@@ -346,11 +365,10 @@ index 83b80b59..b189a19a 100644
 +        const llm_build_cb & cb) {
 +    const int64_t n_embd = hparams.n_embd;
 +
-+    struct ggml_tensor * inpCAS;
++    struct ggml_tensor * inpCAS = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, 1601, 4);
-+    lctx.inp_cross_attn_state = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, 1601, 4);
++    cb(inpCAS, "inp_cross_attn_state", -1);
-+    cb(lctx.inp_cross_attn_state, "inp_cross_attn_state", -1);
++    ggml_set_input(inpCAS);
-+    ggml_set_input(lctx.inp_cross_attn_state);
++    lctx.inp_cross_attn_state = inpCAS;
-+    inpCAS = lctx.inp_cross_attn_state;
 +
 +    return inpCAS;
 +}
@@ -358,7 +376,7 @@ index 83b80b59..b189a19a 100644
  static void llm_build_kv_store(
          struct ggml_context * ctx,
          const llama_hparams & hparams,
-@@ -10167,6 +10325,7 @@ struct llm_build_context {
+@@ -10167,6 +10323,7 @@ struct llm_build_context {
      lctx.inp_pos_bucket    = nullptr;
      lctx.inp_embd_enc      = nullptr;
      lctx.inp_KQ_mask_cross = nullptr;
@@ -366,7 +384,7 @@ index 83b80b59..b189a19a 100644
  }

  void free() {
-@@ -10754,6 +10913,253 @@ struct llm_build_context {
+@@ -10754,6 +10911,239 @@ struct llm_build_context {
          LLM_NORM_RMS, cb, -1);
      cb(cur, "result_norm", -1);
@@ -410,8 +428,8 @@ index 83b80b59..b189a19a 100644
 +                LLM_NORM_RMS, cb, il);
 +        cb(cur, "attn_norm", il);
 +
-+        if (hparams.cross_attention_layer(il)) {
++        if (hparams.cross_attention_layers(il)) {
-+            if (!lctx.cross_attn_state) {
++            if (!batch.embd && !cparams.cross_attn) {
 +                continue;
 +            }
 +
@@ -422,42 +440,28 @@ index 83b80b59..b189a19a 100644
 +            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
 +            cb(Qcur, "Qcur", il);
 +
-+            Qcur = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
++            Qcur = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 0, 2, 1, 3));
-+            cb(Qcur, "Qcur", il);
-+
-+            // TODO: is this required?
-+            Qcur = ggml_cont(ctx0, Qcur);
 +            cb(Qcur, "Qcur", il);
 +
 +            Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].cross_attn_q_norm, NULL, LLM_NORM_RMS, cb, il);
 +            cb(Qcur, "Qcur", il);
 +
-+            struct ggml_tensor * Kcur;
++            struct ggml_tensor * Kcur, * Vcur;
-+            if (lctx.cross_attn_state_first_pass) {
++            if (batch.embd) {
 +                Kcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_k_proj, inpCAS);
 +                cb(Kcur, "Kcur", il);
 +
 +                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, 6404);
 +                cb(Kcur, "Kcur", il);
 +
-+                Kcur = ggml_permute(ctx0, Kcur, 0, 2, 1, 3);
++                Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
-+                cb(Kcur, "Kcur", il);
-+
-+                // TODO: is this required?
-+                Kcur = ggml_cont(ctx0, Kcur);
 +                cb(Kcur, "Kcur", il);
 +
 +                Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].cross_attn_k_norm, NULL, LLM_NORM_RMS, cb, il);
 +                cb(Kcur, "Kcur", il);
 +
 +                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, kv_self.k_l[il]));
-+            } else {
-+                Kcur = ggml_view_tensor(ctx0, kv_self.k_l[il]);
-+                cb(Kcur, "Kcur (view)", il);
-+            }
 +
-+            struct ggml_tensor * Vcur;
-+            if (lctx.cross_attn_state_first_pass) {
 +                Vcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_v_proj, inpCAS);
 +                cb(Vcur, "Vcur", il);
 +
@@ -469,6 +473,9 @@ index 83b80b59..b189a19a 100644
 +
 +                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, kv_self.v_l[il]));
 +            } else {
++                Kcur = ggml_view_tensor(ctx0, kv_self.k_l[il]);
++                cb(Kcur, "Kcur (view)", il);
++
 +                Vcur = ggml_view_tensor(ctx0, kv_self.v_l[il]);
 +                cb(Vcur, "Vcur (view)", il);
 +            }
@@ -476,11 +483,8 @@ index 83b80b59..b189a19a 100644
 +            struct ggml_tensor * kq = ggml_mul_mat(ctx0, Kcur, Qcur);
 +            cb(kq, "kq", il);
 +
-+            kq = ggml_scale_inplace(ctx0, kq, 1.0f/sqrtf(float(n_embd_head)));
-+            cb(kq, "kq_scaled", il);
-+
 +            // TODO: apply causal masks
-+            struct ggml_tensor * kq_soft_max = ggml_soft_max_inplace(ctx0, kq);
++            struct ggml_tensor * kq_soft_max = ggml_soft_max_ext(ctx0, kq, nullptr, 1.f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
 +            cb(kq_soft_max, "kq_soft_max", il);
 +
 +            Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, Vcur));
@@ -570,8 +574,8 @@ index 83b80b59..b189a19a 100644
 +            cb(Kcur, "Kcur", il);
 +
 +            cur = llm_build_kv(ctx0, lctx, kv_self, gf,
 +                    model.layers[il].wo, model.layers[il].bo,
 +                    Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
 +
 +
 +            if (il == n_layer - 1) {
@@ -620,7 +624,7 @@ index 83b80b59..b189a19a 100644
          // lm_head
          cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
          cb(cur, "result_output", -1);
-@@ -16501,6 +16907,10 @@ static struct ggml_cgraph * llama_build_graph(
+@@ -16501,6 +16891,10 @@ static struct ggml_cgraph * llama_build_graph(
          {
              result = llm.build_llama();
          } break;
@@ -631,33 +635,48 @@ index 83b80b59..b189a19a 100644
      case LLM_ARCH_BAICHUAN:
          {
              result = llm.build_baichuan();
-@@ -16773,6 +17183,14 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
+@@ -16761,10 +17155,19 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
-         ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
      }

-+    // TODO (jmorganca): this might copy a lot of data on every request of a
+     if (batch.embd) {
-+    // single generation even though it doesn't change, so we should
+-        const int64_t n_embd   = hparams.n_embd;
-+    // find a way to not set this more than one time per image
+-        const int64_t n_tokens = batch.n_tokens;
-+    if (lctx.inp_cross_attn_state &&
++        if (lctx.inp_cross_attn_state && lctx.inp_cross_attn_state->buffer) {
-+        lctx.inp_cross_attn_state->buffer) {
++            ggml_backend_tensor_set(lctx.inp_cross_attn_state, batch.embd, 0, ggml_nbytes(lctx.inp_cross_attn_state));
-+        ggml_backend_tensor_set(lctx.inp_cross_attn_state, lctx.cross_attn_state, 0, hparams.n_embd * 1601 * 4 * ggml_element_size(lctx.inp_cross_attn_state));
++            // zero out inp_embd since it's not used
-+    }
++            float * inp_embd_data = (float *)lctx.inp_embd->data;
-+
++            for (int i = 0; i < ggml_nelements(lctx.inp_embd); ++i) {
-     if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
++                inp_embd_data[i] = 0.0f;
-         GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs");
++            }
-         const int64_t n_tokens = batch.n_tokens;
++        } else {
-@@ -17455,6 +17873,10 @@ static int llama_decode_internal(
++            const int64_t n_embd   = hparams.n_embd;
++            const int64_t n_tokens = batch.n_tokens;

-         llama_set_inputs(lctx, ubatch);
+-            ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
++            ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
++        }
+     }

-+        // TODO: replace with something better to find out if its
+     if (batch.pos && lctx.inp_pos) {
-+        // our first actual pass
+@@ -17345,7 +17748,7 @@ static int llama_decode_internal(
-+        lctx.cross_attn_state_first_pass = false;
+                 n_outputs = 1;
-+
+             }
-         llama_graph_compute(lctx, gf, n_threads, threadpool);

-         // update the kv ring buffer
+-    lctx.sbatch.from_batch(batch_all, n_embd,
-@@ -18648,7 +19070,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
++    lctx.sbatch.from_batch(batch_all, batch_all.n_embd,
+                            /* simple_split */ !kv_self.recurrent,
+                            /* logits_all   */ n_outputs == n_tokens_all);

+@@ -17638,7 +18041,7 @@ static int llama_encode_internal(

+     const int64_t n_embd  = hparams.n_embd;

+-    lctx.sbatch.from_batch(batch, n_embd, /* simple_split */ true, /* logits_all */ true);
++    lctx.sbatch.from_batch(batch, batch.n_embd, /* simple_split */ true, /* logits_all */ true);

+     const llama_ubatch ubatch = lctx.sbatch.split_simple(n_tokens);

+@@ -18648,7 +19051,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
      if (llama_model_has_encoder(&model)) {
          n_attn_layer *= 3;
      }
@@ -668,19 +687,7 @@ index 83b80b59..b189a19a 100644
      }

      size_t total_size_org = 0;
-@@ -19744,6 +20168,11 @@ struct llama_context * llama_new_context_with_model(
+@@ -19814,6 +20219,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
-     return ctx;
- }
-
-+void llama_set_cross_attn_state(struct llama_context * ctx, float * cross_attn_state) {
-+    ctx->cross_attn_state_first_pass = true;
-+    ctx->cross_attn_state = cross_attn_state;
-+}
-+
- void llama_free(struct llama_context * ctx) {
-     delete ctx;
- }
-@@ -19814,6 +20243,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
-
      // use what we call a normal RoPE, operating on pairs of consecutive head values
      case LLM_ARCH_LLAMA:
@@ -688,3 +695,38 @@ index 83b80b59..b189a19a 100644
      case LLM_ARCH_BAICHUAN:
      case LLM_ARCH_STARCODER:
      case LLM_ARCH_PLAMO:
+@@ -21230,6 +21636,10 @@ void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
+     ctx->cparams.causal_attn = causal_attn;
+ }
+
++void llama_set_cross_attention(struct llama_context * ctx, bool cross_attention) {
++    ctx->cparams.cross_attn = cross_attention;
++}
++
+ struct llama_batch llama_batch_get_one(
+              llama_token * tokens,
+                  int32_t   n_tokens,
+@@ -21239,6 +21649,7 @@ struct llama_batch llama_batch_get_one(
+         /*n_tokens  =*/ n_tokens,
+         /*tokens    =*/ tokens,
+         /*embd      =*/ nullptr,
++        /*n_embd    =*/ 0,
+         /*pos       =*/ nullptr,
+         /*n_seq_id  =*/ nullptr,
+         /*seq_id    =*/ nullptr,
+@@ -21254,6 +21665,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
+         /*n_tokens  =*/ 0,
+         /*tokens    =*/ nullptr,
+         /*embd      =*/ nullptr,
++        /*n_embd    =*/ 0,
+         /*pos       =*/ nullptr,
+         /*n_seq_id  =*/ nullptr,
+         /*seq_id    =*/ nullptr,
+@@ -21265,6 +21677,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
+
+     if (embd) {
+         batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
++        batch.n_embd = embd;
+     } else {
+         batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
+     }
66  llama/patches/0012-fix-deepseek-deseret-regex.patch  Normal file
@@ -0,0 +1,66 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Daniel Hiltgen <daniel@ollama.com>
+Date: Fri, 25 Oct 2024 16:25:18 -0700
+Subject: [PATCH] fix deepseek deseret regex
+
+On windows compiled with gcc the c++ regex library failed to handle
+the characters
+---
+ src/llama-vocab.cpp |  2 +-
+ src/unicode.cpp     | 21 +++++++++++++++++++++
+ 2 files changed, 22 insertions(+), 1 deletion(-)
+
+diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
+index d2f34ddd..3ef6af19 100644
+--- a/src/llama-vocab.cpp
++++ b/src/llama-vocab.cpp
+@@ -389,7 +389,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
+             case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
+                 regex_exprs = {
+                     "[\r\n]",
+-                    "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
++                    "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z\U00010400-\U0001044f𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
+                     "\\s?[!-/:-~!-/:-~‘-‟ -。]+",
+                     "\\s+$",
+                     "[一-龥ࠀ-一가-]+",
+diff --git a/src/unicode.cpp b/src/unicode.cpp
+index f4e941cd..9d78ff16 100644
+--- a/src/unicode.cpp
++++ b/src/unicode.cpp
+@@ -2,6 +2,11 @@
+ #define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
+ #endif
+
++#if defined(_WIN32)
++#define WIN32_LEAN_AND_MEAN
++#include <windows.h>
++#endif
++
+ #include "unicode.h"
+ #include "unicode-data.h"
+
+@@ -201,8 +206,24 @@ static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
+ }
+
+ static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
++#ifdef _WIN32
++    int wlen = MultiByteToWideChar(CP_UTF8, 0, s.c_str(), -1, NULL, 0);
++    if (!wlen) {
++        throw std::invalid_argument("failed to convert regex");
++    }
++    wchar_t * wbuf = (wchar_t *) malloc(wlen * sizeof(wchar_t));
++    wlen = MultiByteToWideChar(CP_UTF8, 0, s.c_str(), -1, wbuf, wlen);
++    if (!wlen) {
++        free(wbuf);
++        throw std::invalid_argument("failed to convert regex");
++    }
++    std::wstring ret = std::wstring(wbuf);
++    free(wbuf);
++    return ret;
++#else
+     std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;
+     return conv.from_bytes(s);
++#endif
+ }
+
+ static std::vector<std::string> unicode_byte_encoding_process(const std::vector<std::string> & bpe_words) {
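The patch replaces the literal Deseret characters in the DeepSeek pretokenizer regex with the escaped range \U00010400-\U0001044f so the expression survives the Windows/gcc regex handling. Purely as an illustration (using Go's regexp engine, not the C++ one the patch targets), the escaped range covers the same code points:

package main

import (
	"fmt"
	"regexp"
)

func main() {
	// U+10400..U+1044F is the Deseret block; escaping it keeps the raw
	// characters out of the pattern source.
	re := regexp.MustCompile(`[\x{10400}-\x{1044F}]+`)
	fmt.Println(re.MatchString("\U00010400\U0001044F")) // true
}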
@@ -2,7 +2,7 @@ package main

 import (
 	"errors"
-	"hash/maphash"
+	"fmt"
 	"log/slog"
 	"reflect"
 	"time"
@@ -20,14 +20,14 @@ type InputCache struct {
 	// optimize cache eviction for multiple users
 	multiUserCache bool

-	// cache of images to embeddings
-	images    []imageCache
-	imageHash maphash.Hash
-
 	lc *llama.Context
 }

-func NewInputCache(lc *llama.Context, kvSize int, numSlots int, multiUserCache bool) *InputCache {
+func NewInputCache(lc *llama.Context, kvSize int, numSlots int, multiUserCache bool) (*InputCache, error) {
+	if kvSize/numSlots < 1 {
+		return nil, fmt.Errorf("must have at least one kv cache entry per parallel sequence (kv: %v parallel: %v)", kvSize, numSlots)
+	}
+
 	slots := make([]InputCacheSlot, numSlots)

 	for i := range slots {
@@ -41,9 +41,8 @@ func NewInputCache(lc *llama.Context, kvSize int, numSlots int, multiUserCache b
 		numCtx:         kvSize / numSlots,
 		slots:          slots,
 		multiUserCache: multiUserCache,
-		images:         make([]imageCache, numSlots),
 		lc:             lc,
-	}
+	}, nil
 }

 // Locking: Operations on InputCacheSlot (including finding one
@@ -64,7 +63,7 @@ type InputCacheSlot struct {
 	lastUsed time.Time
 }

-func (c *InputCache) LoadCacheSlot(prompt []input, cachePrompt bool) (*InputCacheSlot, []input, int, error) {
+func (c *InputCache) LoadCacheSlot(prompt []input, cachePrompt bool) (*InputCacheSlot, []input, error) {
 	var slot *InputCacheSlot
 	var numPast int
 	var err error
@@ -81,7 +80,7 @@ func (c *InputCache) LoadCacheSlot(prompt []input, cachePrompt bool) (*InputCach
 		slot, numPast, err = c.findBestCacheSlot(prompt)
 	}
 	if err != nil {
-		return nil, nil, 0, err
+		return nil, nil, err
 	}

 	if !cachePrompt {
@@ -108,7 +107,7 @@ func (c *InputCache) LoadCacheSlot(prompt []input, cachePrompt bool) (*InputCach
 	prompt = prompt[numPast:]
 	slot.Inputs = slot.Inputs[:numPast]

-	return slot, prompt, numPast, nil
+	return slot, prompt, nil
 }

 func (c *InputCache) findLongestCacheSlot(prompt []input) (*InputCacheSlot, int, error) {
@@ -200,66 +199,30 @@ func countCommonPrefix(a []input, b []input) int {
 	return count
 }

-func (c *InputCache) ShiftCacheSlot(slot *InputCacheSlot, numKeep int, numDiscard int, numPast int) {
+// Frees up space in the KV cache by deleting the oldest half of history and shifting
+// the newest half into that space (saving numKeep inputs at the beginning).
+//
+// Assumes that at least 1 entry can be freed up by shifting (i.e. numKeep < numCtx)
+func (c *InputCache) ShiftCacheSlot(slot *InputCacheSlot, numKeep int) {
+	targetFree := (c.numCtx - numKeep) / 2
+	targetFree = max(targetFree, 1)
+
+	currentFree := c.numCtx - len(slot.Inputs)
+	discard := targetFree - currentFree
+
+	if discard <= 0 {
+		return
+	}
+
+	slog.Debug("context limit hit - shifting", "limit", c.numCtx, "input", len(slot.Inputs),
+		"keep", numKeep, "discard", discard)
+
 	// TODO (jessegross): KV cache removal can fail for certain types of models
-	// server.cpp doesn't handle this, though we can be more graceful
-	c.lc.KvCacheSeqRm(slot.Id, numKeep, numKeep+numDiscard)
-	c.lc.KvCacheSeqAdd(slot.Id, numKeep+numDiscard, numPast, -numDiscard)
+	c.lc.KvCacheSeqRm(slot.Id, numKeep, numKeep+discard)
+	c.lc.KvCacheSeqAdd(slot.Id, numKeep+discard, len(slot.Inputs), -discard)

-	for i := numKeep + numDiscard; i < len(slot.Inputs); i++ {
-		slot.Inputs[i-numDiscard] = slot.Inputs[i]
+	for i := numKeep + discard; i < len(slot.Inputs); i++ {
+		slot.Inputs[i-discard] = slot.Inputs[i]
 	}
-	slot.Inputs = slot.Inputs[:len(slot.Inputs)-numDiscard]
-}
-
-// Locking: Lookup and store operations on imageCache require a lock
-// to be held that serializes these with each other. Hash does not
-// require a lock nor they need to be serialized with InputCacheSlot.
-
-type imageCache struct {
-	key      uint64
-	val      [][]float32
-	lastUsed time.Time
-}
-
-func (c *InputCache) HashImage(image []byte) uint64 {
-	c.imageHash.Reset()
-	_, _ = c.imageHash.Write(image)
-	return c.imageHash.Sum64()
-}
-
-var ErrImageNotFound = errors.New("image not found in cache")
-
-func (c *InputCache) FindImage(hash uint64) ([][]float32, error) {
-	for i := range c.images {
-		if c.images[i].key == hash {
-			slog.Debug("loading image embeddings from cache", "entry", i)
-			c.images[i].lastUsed = time.Now()
-			return c.images[i].val, nil
-		}
-	}
-
-	return nil, ErrImageNotFound
-}
-
-func (c *InputCache) AddImage(hash uint64, embed [][]float32) {
-	best := time.Now()
-	var bestImage int
-
-	for i := range c.images {
-		if c.images[i].key == hash {
-			bestImage = i
-			break
-		}
-
-		if c.images[i].lastUsed.Compare(best) < 0 {
-			best = c.images[i].lastUsed
-			bestImage = i
-		}
-	}
-
-	slog.Debug("storing image embeddings in cache", "entry", bestImage, "used", c.images[bestImage].lastUsed)
-	c.images[bestImage].key = hash
-	c.images[bestImage].val = embed
-	c.images[bestImage].lastUsed = time.Now()
+	slot.Inputs = slot.Inputs[:len(slot.Inputs)-discard]
 }
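The rewritten ShiftCacheSlot derives how much history to discard from the slot itself rather than taking the count from the caller. A small standalone sketch of that arithmetic, with hypothetical numbers for illustration:

package main

import "fmt"

// discardCount mirrors the calculation in ShiftCacheSlot: free up roughly half
// of the context beyond numKeep, counting whatever space is already free.
func discardCount(numCtx, numKeep, used int) int {
	targetFree := (numCtx - numKeep) / 2
	if targetFree < 1 {
		targetFree = 1
	}
	currentFree := numCtx - used
	return targetFree - currentFree
}

func main() {
	// With a 512-entry slot, 16 kept inputs and a full cache, 248 inputs are shifted out.
	fmt.Println(discardCount(512, 16, 512))
}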
@@ -1,7 +1,6 @@
 package main

 import (
-	"reflect"
 	"testing"
 	"time"
 )
@@ -228,77 +227,3 @@ func TestFindCacheSlot(t *testing.T) {
 		})
 	}
 }
-
-func TestImageCache(t *testing.T) {
-	cache := NewInputCache(nil, 2048, 4, false)
-
-	valA := [][]float32{{0.1, 0.2}, {0.3}}
-	valB := [][]float32{{0.4}, {0.5}, {0.6}}
-	valC := [][]float32{{0.7}}
-	valD := [][]float32{{0.8}}
-	valE := [][]float32{{0.9}}
-
-	// Empty cache
-	result, err := cache.FindImage(0x5adb61d31933a946)
-	if err != ErrImageNotFound {
-		t.Errorf("found result in empty cache: result %v, err %v", result, err)
-	}
-
-	// Insert A
-	cache.AddImage(0x5adb61d31933a946, valA)
-
-	result, err = cache.FindImage(0x5adb61d31933a946)
-	if !reflect.DeepEqual(result, valA) {
-		t.Errorf("failed to find expected value: result %v, err %v", result, err)
-	}
-
-	// Insert B
-	cache.AddImage(0x011551369a34a901, valB)
-
-	result, err = cache.FindImage(0x5adb61d31933a946)
-	if !reflect.DeepEqual(result, valA) {
-		t.Errorf("failed to find expected value: result %v, err %v", result, err)
-	}
-	result, err = cache.FindImage(0x011551369a34a901)
-	if !reflect.DeepEqual(result, valB) {
-		t.Errorf("failed to find expected value: result %v, err %v", result, err)
-	}
-
-	// Replace B with C
-	cache.AddImage(0x011551369a34a901, valC)
-
-	result, err = cache.FindImage(0x5adb61d31933a946)
-	if !reflect.DeepEqual(result, valA) {
-		t.Errorf("failed to find expected value: result %v, err %v", result, err)
-	}
-	result, err = cache.FindImage(0x011551369a34a901)
-	if !reflect.DeepEqual(result, valC) {
-		t.Errorf("failed to find expected value: result %v, err %v", result, err)
-	}
-
-	// Evict A
-	cache.AddImage(0x756b218a517e7353, valB)
-	cache.AddImage(0x75e5e8d35d7e3967, valD)
-	cache.AddImage(0xd96f7f268ca0646e, valE)
-
-	result, err = cache.FindImage(0x5adb61d31933a946)
-	if reflect.DeepEqual(result, valA) {
-		t.Errorf("failed to find expected value: result %v, err %v", result, err)
-	}
-	result, err = cache.FindImage(0x756b218a517e7353)
-	if !reflect.DeepEqual(result, valB) {
-		t.Errorf("failed to find expected value: result %v, err %v", result, err)
-	}
-	result, err = cache.FindImage(0x011551369a34a901)
-	if !reflect.DeepEqual(result, valC) {
-		t.Errorf("failed to find expected value: result %v, err %v", result, err)
-	}
-	result, err = cache.FindImage(0x75e5e8d35d7e3967)
-	if !reflect.DeepEqual(result, valD) {
-		t.Errorf("failed to find expected value: result %v, err %v", result, err)
-	}
-	result, err = cache.FindImage(0xd96f7f268ca0646e)
-	if !reflect.DeepEqual(result, valE) {
-		t.Errorf("failed to find expected value: result %v, err %v", result, err)
-	}
-}
183  llama/runner/image.go  Normal file
@@ -0,0 +1,183 @@
+package main
+
+import (
+	"errors"
+	"fmt"
+	"hash/maphash"
+	"log/slog"
+	"slices"
+	"sync"
+	"time"
+
+	"github.com/ollama/ollama/llama"
+)
+
+const imageCacheSize = 4
+
+type ImageContext struct {
+	// mu is required to be held when generating embeddings or accessing the cache
+	mu sync.Mutex
+
+	clip   *llama.ClipContext
+	mllama *llama.MllamaContext
+
+	// cache of images to embeddings
+	images    []imageCache
+	imageHash maphash.Hash
+}
+
+func NewImageContext(llamaContext *llama.Context, modelPath string) (*ImageContext, error) {
+	arch, err := llama.GetModelArch(modelPath)
+	if err != nil {
+		return nil, fmt.Errorf("unable to determine vision architecture: %w (%s)", err, modelPath)
+	}
+
+	var c ImageContext
+	if arch == "clip" {
+		c.clip, err = llama.NewClipContext(llamaContext, modelPath)
+	} else if arch == "mllama" {
+		c.mllama, err = llama.NewMllamaContext(llamaContext, modelPath)
+	} else {
+		return nil, fmt.Errorf("unknown vision model architecture: %s", arch)
+	}
+
+	if err != nil {
+		return nil, err
+	}
+
+	c.images = make([]imageCache, imageCacheSize)
+
+	return &c, nil
+}
+
+func (c *ImageContext) Free(modelPath string) {
+	if c == nil {
+		return
+	}
+
+	if c.clip != nil {
+		c.clip.Free()
+	}
+	if c.mllama != nil {
+		c.mllama.Free()
+	}
+}
+
+func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte, aspectRatioId int) ([][]float32, error) {
+	if c == nil {
+		return nil, nil
+	}
+
+	if len(data) <= 0 {
+		return nil, errors.New("received zero length image")
+	}
+
+	hash := c.hashImage(data)
+
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	embed, err := c.findImage(hash)
+	if err != nil {
+		if c.mllama != nil {
+			embed, err = c.mllama.NewEmbed(llamaContext, data, aspectRatioId)
+			if err != nil {
+				return nil, err
+			}
+		} else if c.clip != nil {
+			embed, err = c.clip.NewEmbed(llamaContext, data)
+			if err != nil {
+				return nil, err
+			}
+		} else {
+			return nil, errors.New("received image but vision model not loaded")
+		}
+
+		c.addImage(hash, embed)
+	}
+
+	return embed, nil
+}
+
+func (c *ImageContext) BatchSize(configuredBatchSize int) int {
+	// If images are not supported, we don't need to allocate embedding batches
+	if c == nil {
+		return 0
+	}
+
+	// Mllama maps an image to 1 embedding token (llava creates many tokens)
+	// and doesn't support more than a single image per request.
+	// The embeddings are large (100 MB), so allocating a big batch can fail
+	// on some systems
+	if c.mllama != nil {
+		return 1
+	}
+
+	return configuredBatchSize
+}
+
+func (c *ImageContext) EmbedSize(llamaContext *llama.Context) int {
+	if c != nil && c.mllama != nil {
+		return c.mllama.EmbedSize(llamaContext)
+	} else {
+		return llamaContext.Model().NEmbd()
+	}
+}
+
+func (c *ImageContext) NeedCrossAttention(inputs ...input) bool {
+	if c == nil || c.mllama == nil {
+		return false
+	}
+
+	return slices.ContainsFunc(inputs, func(input input) bool {
+		return input.embed != nil
+	})
+}
+
+type imageCache struct {
+	key      uint64
+	val      [][]float32
+	lastUsed time.Time
+}
+
+func (c *ImageContext) hashImage(image []byte) uint64 {
+	c.imageHash.Reset()
+	_, _ = c.imageHash.Write(image)
+	return c.imageHash.Sum64()
+}
+
+var errImageNotFound = errors.New("image not found in cache")
+
+func (c *ImageContext) findImage(hash uint64) ([][]float32, error) {
for i := range c.images {
|
||||||
|
if c.images[i].key == hash {
|
||||||
|
slog.Debug("loading image embeddings from cache", "entry", i)
|
||||||
|
c.images[i].lastUsed = time.Now()
|
||||||
|
return c.images[i].val, nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil, errImageNotFound
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *ImageContext) addImage(hash uint64, embed [][]float32) {
|
||||||
|
best := time.Now()
|
||||||
|
var bestImage int
|
||||||
|
|
||||||
|
for i := range c.images {
|
||||||
|
if c.images[i].key == hash {
|
||||||
|
bestImage = i
|
||||||
|
break
|
||||||
|
}
|
||||||
|
|
||||||
|
if c.images[i].lastUsed.Compare(best) < 0 {
|
||||||
|
best = c.images[i].lastUsed
|
||||||
|
bestImage = i
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
slog.Debug("storing image embeddings in cache", "entry", bestImage, "used", c.images[bestImage].lastUsed)
|
||||||
|
c.images[bestImage].key = hash
|
||||||
|
c.images[bestImage].val = embed
|
||||||
|
c.images[bestImage].lastUsed = time.Now()
|
||||||
|
}
|
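Editor's note: the cache above is, in essence, content-addressed memoization of vision embeddings with least-recently-used replacement over a fixed number of slots. A minimal standalone sketch of the same pattern follows; the key values, the string "embedding" payload, and the slot count are illustrative only and are not part of the runner code.

package main

import (
	"fmt"
	"hash/maphash"
	"time"
)

// entry mirrors imageCache: a key, a cached value, and a last-used timestamp.
type entry struct {
	key      uint64
	val      string
	lastUsed time.Time
}

func main() {
	var h maphash.Hash
	cache := make([]entry, 4) // fixed number of slots, like imageCacheSize

	key := func(b []byte) uint64 { // same idea as hashImage
		h.Reset()
		_, _ = h.Write(b)
		return h.Sum64()
	}

	lookup := func(k uint64) (string, bool) {
		for i := range cache {
			if cache[i].key == k {
				cache[i].lastUsed = time.Now()
				return cache[i].val, true
			}
		}
		return "", false
	}

	store := func(k uint64, v string) {
		// Reuse an existing slot for this key, otherwise evict the least recently used slot.
		best, bestAt := 0, time.Now()
		for i := range cache {
			if cache[i].key == k {
				best = i
				break
			}
			if cache[i].lastUsed.Before(bestAt) {
				best, bestAt = i, cache[i].lastUsed
			}
		}
		cache[best] = entry{key: k, val: v, lastUsed: time.Now()}
	}

	img := []byte("image bytes")
	k := key(img)
	if _, ok := lookup(k); !ok {
		store(k, "expensive embedding") // computed once, on a miss
	}
	v, ok := lookup(k)
	fmt.Println(v, ok) // "expensive embedding true": a repeated image reuses the cached value
}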
80  llama/runner/image_test.go  Normal file
@@ -0,0 +1,80 @@
package main

import (
	"reflect"
	"testing"
)

func TestImageCache(t *testing.T) {
	cache := ImageContext{images: make([]imageCache, 4)}

	valA := [][]float32{{0.1, 0.2}, {0.3}}
	valB := [][]float32{{0.4}, {0.5}, {0.6}}
	valC := [][]float32{{0.7}}
	valD := [][]float32{{0.8}}
	valE := [][]float32{{0.9}}

	// Empty cache
	result, err := cache.findImage(0x5adb61d31933a946)
	if err != errImageNotFound {
		t.Errorf("found result in empty cache: result %v, err %v", result, err)
	}

	// Insert A
	cache.addImage(0x5adb61d31933a946, valA)

	result, err = cache.findImage(0x5adb61d31933a946)
	if !reflect.DeepEqual(result, valA) {
		t.Errorf("failed to find expected value: result %v, err %v", result, err)
	}

	// Insert B
	cache.addImage(0x011551369a34a901, valB)

	result, err = cache.findImage(0x5adb61d31933a946)
	if !reflect.DeepEqual(result, valA) {
		t.Errorf("failed to find expected value: result %v, err %v", result, err)
	}
	result, err = cache.findImage(0x011551369a34a901)
	if !reflect.DeepEqual(result, valB) {
		t.Errorf("failed to find expected value: result %v, err %v", result, err)
	}

	// Replace B with C
	cache.addImage(0x011551369a34a901, valC)

	result, err = cache.findImage(0x5adb61d31933a946)
	if !reflect.DeepEqual(result, valA) {
		t.Errorf("failed to find expected value: result %v, err %v", result, err)
	}
	result, err = cache.findImage(0x011551369a34a901)
	if !reflect.DeepEqual(result, valC) {
		t.Errorf("failed to find expected value: result %v, err %v", result, err)
	}

	// Evict A
	cache.addImage(0x756b218a517e7353, valB)
	cache.addImage(0x75e5e8d35d7e3967, valD)
	cache.addImage(0xd96f7f268ca0646e, valE)

	result, err = cache.findImage(0x5adb61d31933a946)
	if reflect.DeepEqual(result, valA) {
		t.Errorf("failed to find expected value: result %v, err %v", result, err)
	}
	result, err = cache.findImage(0x756b218a517e7353)
	if !reflect.DeepEqual(result, valB) {
		t.Errorf("failed to find expected value: result %v, err %v", result, err)
	}
	result, err = cache.findImage(0x011551369a34a901)
	if !reflect.DeepEqual(result, valC) {
		t.Errorf("failed to find expected value: result %v, err %v", result, err)
	}
	result, err = cache.findImage(0x75e5e8d35d7e3967)
	if !reflect.DeepEqual(result, valD) {
		t.Errorf("failed to find expected value: result %v, err %v", result, err)
	}
	result, err = cache.findImage(0xd96f7f268ca0646e)
	if !reflect.DeepEqual(result, valE) {
		t.Errorf("failed to find expected value: result %v, err %v", result, err)
	}
}
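Editor's note: one property of the hashing used by the cache above is worth calling out (this aside is not part of the diff): hash/maphash is seeded per process, so the cache keys are stable within a single runner process but not across runs, which is all this in-memory cache needs. A tiny illustration:

package main

import (
	"fmt"
	"hash/maphash"
)

func main() {
	var h maphash.Hash // the zero value picks a process-local random seed on first use

	sum := func(b []byte) uint64 {
		h.Reset() // Reset keeps the seed, so equal inputs give equal sums within this process
		_, _ = h.Write(b)
		return h.Sum64()
	}

	img := []byte("same image bytes")
	fmt.Println(sum(img) == sum(img)) // true in one process; the value differs between runs
}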
@ -34,9 +34,6 @@ type input struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type Sequence struct {
|
type Sequence struct {
|
||||||
// number of inputs evaluated
|
|
||||||
numPast int
|
|
||||||
|
|
||||||
// batch index
|
// batch index
|
||||||
iBatch int
|
iBatch int
|
||||||
|
|
||||||
@ -52,6 +49,10 @@ type Sequence struct {
|
|||||||
// input cache being used by this sequence
|
// input cache being used by this sequence
|
||||||
cache *InputCacheSlot
|
cache *InputCacheSlot
|
||||||
|
|
||||||
|
// does this sequence require cross-attention layers to be processed? - if we have seen
|
||||||
|
// an image for certain multi-modal models
|
||||||
|
crossAttention bool
|
||||||
|
|
||||||
// channel to send responses over
|
// channel to send responses over
|
||||||
responses chan string
|
responses chan string
|
||||||
|
|
||||||
@ -108,26 +109,23 @@ func (s *Server) NewSequence(prompt string, images []ImageData, params NewSequen
|
|||||||
params.numKeep = len(inputs)
|
params.numKeep = len(inputs)
|
||||||
}
|
}
|
||||||
|
|
||||||
if !params.embedding {
|
if s.model.AddBOSToken() {
|
||||||
// Subtracting 4 ensures that at least 1 input can be discarded during shift
|
params.numKeep += 1
|
||||||
params.numKeep = min(params.numKeep, s.cache.numCtx-4)
|
|
||||||
params.numKeep += s.bosToken
|
|
||||||
} else {
|
|
||||||
// Embeddings are 1 shot - just truncate to the context window, without ever shifting
|
|
||||||
params.numKeep = min(params.numKeep, s.cache.numCtx)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// truncate to fit in context window
|
// Ensure that at least 1 input can be discarded during shift
|
||||||
|
params.numKeep = min(params.numKeep, s.cache.numCtx-1)
|
||||||
|
|
||||||
if len(inputs) > s.cache.numCtx {
|
if len(inputs) > s.cache.numCtx {
|
||||||
slog.Warn("truncating input prompt", "limit", s.cache.numCtx, "prompt", len(inputs), "numKeep", params.numKeep)
|
slog.Warn("input exceeds context length", "prompt", len(inputs), "limit", s.cache.numCtx)
|
||||||
newInputs := inputs[:params.numKeep]
|
|
||||||
newInputs = append(newInputs, inputs[len(inputs)-s.cache.numCtx+params.numKeep:]...)
|
|
||||||
inputs = newInputs
|
|
||||||
}
|
}
|
||||||
|
|
||||||
var sc *llama.SamplingContext
|
var sc *llama.SamplingContext
|
||||||
if params.samplingParams != nil {
|
if params.samplingParams != nil {
|
||||||
sc = llama.NewSamplingContext(s.model, *params.samplingParams)
|
sc, err = llama.NewSamplingContext(s.model, *params.samplingParams)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
for _, input := range inputs {
|
for _, input := range inputs {
|
||||||
if input.embed == nil {
|
if input.embed == nil {
|
||||||
sc.Accept(input.token, false)
|
sc.Accept(input.token, false)
|
||||||
@ -190,16 +188,10 @@ func (s *Server) inputs(prompt string, images []ImageData) ([]input, error) {
|
|||||||
return nil, fmt.Errorf("invalid image index: %d", n)
|
return nil, fmt.Errorf("invalid image index: %d", n)
|
||||||
}
|
}
|
||||||
|
|
||||||
hash := s.cache.HashImage(images[imageIndex].Data)
|
embed, err := s.image.NewEmbed(s.lc, images[imageIndex].Data, images[imageIndex].AspectRatioID)
|
||||||
|
|
||||||
// Vision models cannot be accessed concurrently
|
|
||||||
s.clip.mu.Lock()
|
|
||||||
embed, err := s.cache.FindImage(hash)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
embed = llama.NewLlavaImageEmbed(s.lc, s.clip.cc, images[imageIndex].Data)
|
return nil, err
|
||||||
s.cache.AddImage(hash, embed)
|
|
||||||
}
|
}
|
||||||
s.clip.mu.Unlock()
|
|
||||||
|
|
||||||
for _, e := range embed {
|
for _, e := range embed {
|
||||||
inputs = append(inputs, input{embed: e})
|
inputs = append(inputs, input{embed: e})
|
||||||
@ -207,41 +199,17 @@ func (s *Server) inputs(prompt string, images []ImageData) ([]input, error) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if s.clip.cc != nil {
|
|
||||||
var embed [][]float32
|
|
||||||
|
|
||||||
if s.clip.cc.IsMllama && len(images) >= 1 {
|
|
||||||
hash := s.cache.HashImage(images[0].Data)
|
|
||||||
|
|
||||||
s.clip.mu.Lock()
|
|
||||||
var err error
|
|
||||||
embed, err = s.cache.FindImage(hash)
|
|
||||||
if err != nil {
|
|
||||||
embed = llama.NewMllamaImageEmbed(s.lc, s.clip.cc, images[0].Data, images[0].AspectRatioID)
|
|
||||||
s.cache.AddImage(hash, embed)
|
|
||||||
}
|
|
||||||
s.clip.mu.Unlock()
|
|
||||||
}
|
|
||||||
s.mu.Lock()
|
|
||||||
llama.MllamaSetCrossAttn(s.lc, s.clip.cc, embed)
|
|
||||||
s.mu.Unlock()
|
|
||||||
}
|
|
||||||
|
|
||||||
return inputs, nil
|
return inputs, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
type clip struct {
|
|
||||||
cc *llama.ClipContext
|
|
||||||
mu sync.Mutex
|
|
||||||
}
|
|
||||||
|
|
||||||
type Server struct {
|
type Server struct {
|
||||||
model *llama.Model
|
model *llama.Model
|
||||||
lc *llama.Context
|
lc *llama.Context
|
||||||
|
|
||||||
// required for image embeddings
|
// required for image embeddings
|
||||||
clip clip
|
image *ImageContext
|
||||||
|
|
||||||
|
// TODO (jmorganca): make this n_batch
|
||||||
batchSize int
|
batchSize int
|
||||||
|
|
||||||
// parallel is the number of parallel requests to handle
|
// parallel is the number of parallel requests to handle
|
||||||
@ -254,9 +222,6 @@ type Server struct {
|
|||||||
// KV cache
|
// KV cache
|
||||||
cache *InputCache
|
cache *InputCache
|
||||||
|
|
||||||
// does this model require a beginning of sequence token?
|
|
||||||
bosToken int
|
|
||||||
|
|
||||||
// next sequence for prompt processing to avoid starvation
|
// next sequence for prompt processing to avoid starvation
|
||||||
nextSeq int
|
nextSeq int
|
||||||
|
|
||||||
@ -281,18 +246,6 @@ func (s *Server) allNil() bool {
|
|||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *Server) shiftContext(seq *Sequence) {
|
|
||||||
numLeft := seq.numPast - seq.numKeep
|
|
||||||
numDiscard := numLeft / 2
|
|
||||||
|
|
||||||
slog.Debug("context limit hit - shifting", "limit", s.cache.numCtx, "numPast", seq.numPast,
|
|
||||||
"numKeep", seq.numKeep, "numLeft", numLeft, "numDiscard", numDiscard)
|
|
||||||
|
|
||||||
s.cache.ShiftCacheSlot(seq.cache, seq.numKeep, numDiscard, seq.numPast)
|
|
||||||
|
|
||||||
seq.numPast -= numDiscard
|
|
||||||
}
|
|
||||||
|
|
||||||
func flushPending(seq *Sequence) bool {
|
func flushPending(seq *Sequence) bool {
|
||||||
joined := strings.Join(seq.pendingResponses, "")
|
joined := strings.Join(seq.pendingResponses, "")
|
||||||
seq.pendingResponses = []string{}
|
seq.pendingResponses = []string{}
|
||||||
@ -327,22 +280,31 @@ func (s *Server) removeSequence(seqIndex int, reason string) {
|
|||||||
close(seq.responses)
|
close(seq.responses)
|
||||||
close(seq.embedding)
|
close(seq.embedding)
|
||||||
seq.cache.InUse = false
|
seq.cache.InUse = false
|
||||||
if s.clip.cc != nil {
|
|
||||||
llama.MllamaSetCrossAttn(s.lc, s.clip.cc, nil)
|
|
||||||
}
|
|
||||||
s.seqs[seqIndex] = nil
|
s.seqs[seqIndex] = nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *Server) run(ctx context.Context) {
|
func (s *Server) run(ctx context.Context) {
|
||||||
s.ready.Wait()
|
s.ready.Wait()
|
||||||
|
|
||||||
// logically these batches are used only within the context of processBatch
|
// Logically these batches are used only within the context of processBatch
|
||||||
// but it is better for performance to allocate them once here
|
// but it is better for performance to allocate them once here
|
||||||
tokenBatch := llama.NewBatch(s.batchSize*len(s.seqs), 0, len(s.seqs))
|
tokenBatch, err := llama.NewBatch(s.batchSize, len(s.seqs), 0)
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
defer tokenBatch.Free()
|
defer tokenBatch.Free()
|
||||||
|
|
||||||
embedBatch := llama.NewBatch(s.batchSize*len(s.seqs), s.lc.Model().NEmbd(), len(s.seqs))
|
var embedBatch *llama.Batch
|
||||||
defer embedBatch.Free()
|
embedBatchSize := s.image.BatchSize(s.batchSize)
|
||||||
|
if embedBatchSize != 0 {
|
||||||
|
embedBatch, err = llama.NewBatch(embedBatchSize, len(s.seqs), s.image.EmbedSize(s.lc))
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
defer embedBatch.Free()
|
||||||
|
} else {
|
||||||
|
embedBatch = &llama.Batch{}
|
||||||
|
}
|
||||||
|
|
||||||
for {
|
for {
|
||||||
select {
|
select {
|
||||||
@ -371,6 +333,7 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
|
|||||||
defer s.mu.Unlock()
|
defer s.mu.Unlock()
|
||||||
|
|
||||||
var batch *llama.Batch
|
var batch *llama.Batch
|
||||||
|
crossAttention := false
|
||||||
|
|
||||||
seqIdx := s.nextSeq - 1
|
seqIdx := s.nextSeq - 1
|
||||||
for range s.seqs {
|
for range s.seqs {
|
||||||
@ -382,17 +345,24 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
|
|||||||
}
|
}
|
||||||
|
|
||||||
// if past the num predict limit
|
// if past the num predict limit
|
||||||
if seq.numPredict > 0 && seq.numPredicted > seq.numPredict {
|
if seq.numPredict > 0 && seq.numPredicted >= seq.numPredict {
|
||||||
s.removeSequence(seqIdx, "limit")
|
s.removeSequence(seqIdx, "limit")
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
if seq.numPast+len(seq.inputs) > s.cache.numCtx {
|
|
||||||
s.shiftContext(seq)
|
|
||||||
}
|
|
||||||
|
|
||||||
var numInputsProcessed int
|
var numInputsProcessed int
|
||||||
|
shifted := false
|
||||||
|
|
||||||
for i, input := range seq.inputs {
|
for i, input := range seq.inputs {
|
||||||
|
if len(seq.cache.Inputs)+1 > s.cache.numCtx {
|
||||||
|
if !shifted {
|
||||||
|
s.cache.ShiftCacheSlot(seq.cache, seq.numKeep)
|
||||||
|
shifted = true
|
||||||
|
} else {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
embedding := input.embed != nil
|
embedding := input.embed != nil
|
||||||
|
|
||||||
// If we don't currently have a batch, use one of the correct type and
|
// If we don't currently have a batch, use one of the correct type and
|
||||||
@ -404,24 +374,24 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
|
|||||||
batch = tokenBatch
|
batch = tokenBatch
|
||||||
} else {
|
} else {
|
||||||
batch = embedBatch
|
batch = embedBatch
|
||||||
|
seq.crossAttention = s.image.NeedCrossAttention(input)
|
||||||
}
|
}
|
||||||
} else if embedding != batch.IsEmbedding() {
|
} else if embedding != batch.IsEmbedding() || crossAttention != seq.crossAttention {
|
||||||
s.nextSeq = seqIdx
|
s.nextSeq = seqIdx
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
|
|
||||||
// todo: make this n_batch
|
if i >= batch.Size() {
|
||||||
if i >= s.batchSize {
|
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
|
|
||||||
batch.Add(input.token, input.embed, seq.numPast, []int{seq.cache.Id}, numInputsProcessed+1 == len(seq.inputs))
|
crossAttention = seq.crossAttention
|
||||||
seq.numPast++
|
batch.Add(input.token, input.embed, len(seq.cache.Inputs), i+1 == len(seq.inputs), seq.cache.Id)
|
||||||
|
seq.cache.Inputs = append(seq.cache.Inputs, input)
|
||||||
numInputsProcessed++
|
numInputsProcessed++
|
||||||
}
|
}
|
||||||
|
|
||||||
if numInputsProcessed > 0 {
|
if numInputsProcessed > 0 {
|
||||||
seq.cache.Inputs = append(seq.cache.Inputs, seq.inputs[:numInputsProcessed]...)
|
|
||||||
seq.inputs = seq.inputs[numInputsProcessed:]
|
seq.inputs = seq.inputs[numInputsProcessed:]
|
||||||
seq.iBatch = batch.NumTokens() - 1
|
seq.iBatch = batch.NumTokens() - 1
|
||||||
}
|
}
|
||||||
@ -431,6 +401,8 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
s.lc.SetCrossAttention(crossAttention)
|
||||||
|
|
||||||
err := s.lc.Decode(batch)
|
err := s.lc.Decode(batch)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
slog.Error("failed to decode batch", "error", err)
|
slog.Error("failed to decode batch", "error", err)
|
||||||
@ -642,12 +614,15 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
|
|||||||
s.mu.Lock()
|
s.mu.Lock()
|
||||||
for i, sq := range s.seqs {
|
for i, sq := range s.seqs {
|
||||||
if sq == nil {
|
if sq == nil {
|
||||||
seq.cache, seq.inputs, seq.numPast, err = s.cache.LoadCacheSlot(seq.inputs, req.CachePrompt)
|
seq.cache, seq.inputs, err = s.cache.LoadCacheSlot(seq.inputs, req.CachePrompt)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
s.mu.Unlock()
|
s.mu.Unlock()
|
||||||
http.Error(w, fmt.Sprintf("Failed to load cache: %v", err), http.StatusInternalServerError)
|
http.Error(w, fmt.Sprintf("Failed to load cache: %v", err), http.StatusInternalServerError)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
seq.crossAttention = s.image.NeedCrossAttention(seq.cache.Inputs...)
|
||||||
|
|
||||||
s.seqs[i] = seq
|
s.seqs[i] = seq
|
||||||
s.cond.Signal()
|
s.cond.Signal()
|
||||||
break
|
break
|
||||||
@ -722,7 +697,7 @@ func (s *Server) embeddings(w http.ResponseWriter, r *http.Request) {
|
|||||||
s.mu.Lock()
|
s.mu.Lock()
|
||||||
for i, sq := range s.seqs {
|
for i, sq := range s.seqs {
|
||||||
if sq == nil {
|
if sq == nil {
|
||||||
seq.cache, seq.inputs, seq.numPast, err = s.cache.LoadCacheSlot(seq.inputs, req.CachePrompt)
|
seq.cache, seq.inputs, err = s.cache.LoadCacheSlot(seq.inputs, req.CachePrompt)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
s.mu.Unlock()
|
s.mu.Unlock()
|
||||||
http.Error(w, fmt.Sprintf("Failed to load cache: %v", err), http.StatusInternalServerError)
|
http.Error(w, fmt.Sprintf("Failed to load cache: %v", err), http.StatusInternalServerError)
|
||||||
@ -790,10 +765,17 @@ func (s *Server) loadModel(
|
|||||||
) {
|
) {
|
||||||
llama.BackendInit()
|
llama.BackendInit()
|
||||||
|
|
||||||
s.model = llama.LoadModelFromFile(mpath, params)
|
var err error
|
||||||
|
s.model, err = llama.LoadModelFromFile(mpath, params)
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
|
||||||
ctxParams := llama.NewContextParams(kvSize, s.batchSize*s.parallel, s.parallel, threads, flashAttention)
|
ctxParams := llama.NewContextParams(kvSize, s.batchSize*s.parallel, s.parallel, threads, flashAttention)
|
||||||
s.lc = llama.NewContextWithModel(s.model, ctxParams)
|
s.lc, err = llama.NewContextWithModel(s.model, ctxParams)
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
|
||||||
if lpath != "" {
|
if lpath != "" {
|
||||||
err := s.model.ApplyLoraFromFile(s.lc, lpath, 1.0, threads)
|
err := s.model.ApplyLoraFromFile(s.lc, lpath, 1.0, threads)
|
||||||
@ -802,19 +784,18 @@ func (s *Server) loadModel(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if s.model.AddBOSToken() {
|
|
||||||
s.bosToken = 1
|
|
||||||
}
|
|
||||||
|
|
||||||
if ppath != "" {
|
if ppath != "" {
|
||||||
var err error
|
var err error
|
||||||
s.clip.cc, err = llama.NewClipContext(ppath)
|
s.image, err = NewImageContext(s.lc, ppath)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
panic(err)
|
panic(err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
s.cache = NewInputCache(s.lc, kvSize, s.parallel, multiUserCache)
|
s.cache, err = NewInputCache(s.lc, kvSize, s.parallel, multiUserCache)
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
|
||||||
s.status = ServerStatusReady
|
s.status = ServerStatusReady
|
||||||
s.ready.Done()
|
s.ready.Done()
|
||||||
@ -837,14 +818,8 @@ func main() {
|
|||||||
mlock := flag.Bool("mlock", false, "force system to keep model in RAM rather than swapping or compressing")
|
mlock := flag.Bool("mlock", false, "force system to keep model in RAM rather than swapping or compressing")
|
||||||
tensorSplit := flag.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
|
tensorSplit := flag.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
|
||||||
multiUserCache := flag.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
|
multiUserCache := flag.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
|
||||||
// Expose requirements as a JSON output to stdout
|
|
||||||
requirements := flag.Bool("requirements", false, "print json requirement information")
|
requirements := flag.Bool("requirements", false, "print json requirement information")
|
||||||
|
|
||||||
// These are either ignored by llama.cpp or have no significance to us
|
|
||||||
_ = flag.Bool("embedding", false, "enable embedding vector output (default: disabled)")
|
|
||||||
_ = flag.Bool("log-disable", false, "disables logging to a file")
|
|
||||||
_ = flag.Bool("memory-f32", false, "use f32 instead of f16 for memory key+value (default: disabled) not recommended: doubles context memory required and no measurable increase in quality")
|
|
||||||
|
|
||||||
flag.Parse()
|
flag.Parse()
|
||||||
if *requirements {
|
if *requirements {
|
||||||
printRequirements(os.Stdout)
|
printRequirements(os.Stdout)
|
||||||
@ -867,7 +842,7 @@ func main() {
|
|||||||
})
|
})
|
||||||
slog.SetDefault(slog.New(handler))
|
slog.SetDefault(slog.New(handler))
|
||||||
slog.Info("starting go runner")
|
slog.Info("starting go runner")
|
||||||
slog.Debug("system info", "cpu", llama.PrintSystemInfo(), "threads", *threads)
|
slog.Info("system", "info", llama.PrintSystemInfo(), "threads", *threads)
|
||||||
|
|
||||||
server := &Server{
|
server := &Server{
|
||||||
batchSize: *batchSize,
|
batchSize: *batchSize,
|
||||||
|
40
llama/sampling_ext.cpp
vendored
40
llama/sampling_ext.cpp
vendored
@ -5,24 +5,28 @@
|
|||||||
struct gpt_sampler *gpt_sampler_cinit(
|
struct gpt_sampler *gpt_sampler_cinit(
|
||||||
const struct llama_model *model, struct gpt_sampler_cparams *params)
|
const struct llama_model *model, struct gpt_sampler_cparams *params)
|
||||||
{
|
{
|
||||||
gpt_sampler_params sparams;
|
try {
|
||||||
sparams.top_k = params->top_k;
|
gpt_sampler_params sparams;
|
||||||
sparams.top_p = params->top_p;
|
sparams.top_k = params->top_k;
|
||||||
sparams.min_p = params->min_p;
|
sparams.top_p = params->top_p;
|
||||||
sparams.tfs_z = params->tfs_z;
|
sparams.min_p = params->min_p;
|
||||||
sparams.typ_p = params->typical_p;
|
sparams.tfs_z = params->tfs_z;
|
||||||
sparams.temp = params->temp;
|
sparams.typ_p = params->typical_p;
|
||||||
sparams.penalty_last_n = params->penalty_last_n;
|
sparams.temp = params->temp;
|
||||||
sparams.penalty_repeat = params->penalty_repeat;
|
sparams.penalty_last_n = params->penalty_last_n;
|
||||||
sparams.penalty_freq = params->penalty_freq;
|
sparams.penalty_repeat = params->penalty_repeat;
|
||||||
sparams.penalty_present = params->penalty_present;
|
sparams.penalty_freq = params->penalty_freq;
|
||||||
sparams.mirostat = params->mirostat;
|
sparams.penalty_present = params->penalty_present;
|
||||||
sparams.mirostat_tau = params->mirostat_tau;
|
sparams.mirostat = params->mirostat;
|
||||||
sparams.mirostat_eta = params->mirostat_eta;
|
sparams.mirostat_tau = params->mirostat_tau;
|
||||||
sparams.penalize_nl = params->penalize_nl;
|
sparams.mirostat_eta = params->mirostat_eta;
|
||||||
sparams.seed = params->seed;
|
sparams.penalize_nl = params->penalize_nl;
|
||||||
sparams.grammar = params->grammar;
|
sparams.seed = params->seed;
|
||||||
return gpt_sampler_init(model, sparams);
|
sparams.grammar = params->grammar;
|
||||||
|
return gpt_sampler_init(model, sparams);
|
||||||
|
} catch (const std::exception & err) {
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void gpt_sampler_cfree(struct gpt_sampler *sampler)
|
void gpt_sampler_cfree(struct gpt_sampler *sampler)
|
||||||
|
21
llama/unicode.cpp
vendored
21
llama/unicode.cpp
vendored
@ -28,6 +28,11 @@
|
|||||||
#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
|
#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined(_WIN32)
|
||||||
|
#define WIN32_LEAN_AND_MEAN
|
||||||
|
#include <windows.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
#include "unicode.h"
|
#include "unicode.h"
|
||||||
#include "unicode-data.h"
|
#include "unicode-data.h"
|
||||||
|
|
||||||
@ -227,8 +232,24 @@ static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
|
static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
|
||||||
|
#ifdef _WIN32
|
||||||
|
int wlen = MultiByteToWideChar(CP_UTF8, 0, s.c_str(), -1, NULL, 0);
|
||||||
|
if (!wlen) {
|
||||||
|
throw std::invalid_argument("failed to convert regex");
|
||||||
|
}
|
||||||
|
wchar_t * wbuf = (wchar_t *) malloc(wlen * sizeof(wchar_t));
|
||||||
|
wlen = MultiByteToWideChar(CP_UTF8, 0, s.c_str(), -1, wbuf, wlen);
|
||||||
|
if (!wlen) {
|
||||||
|
free(wbuf);
|
||||||
|
throw std::invalid_argument("failed to convert regex");
|
||||||
|
}
|
||||||
|
std::wstring ret = std::wstring(wbuf);
|
||||||
|
free(wbuf);
|
||||||
|
return ret;
|
||||||
|
#else
|
||||||
std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;
|
std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;
|
||||||
return conv.from_bytes(s);
|
return conv.from_bytes(s);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
static std::vector<std::string> unicode_byte_encoding_process(const std::vector<std::string> & bpe_words) {
|
static std::vector<std::string> unicode_byte_encoding_process(const std::vector<std::string> & bpe_words) {
|
||||||
|
@ -1,11 +1,9 @@
|
|||||||
package fileutils
|
package llm
|
||||||
|
|
||||||
import "fmt"
|
import "fmt"
|
||||||
|
|
||||||
type fileType uint32
|
type fileType uint32
|
||||||
|
|
||||||
// TODO this should map over to the GGML CGO enum type
|
|
||||||
|
|
||||||
const (
|
const (
|
||||||
fileTypeF32 fileType = iota
|
fileTypeF32 fileType = iota
|
||||||
fileTypeF16
|
fileTypeF16
|
@ -1,4 +1,4 @@
|
|||||||
package fileutils
|
package llm
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"encoding/binary"
|
"encoding/binary"
|
@ -1,11 +1,10 @@
|
|||||||
package fileutils
|
package llm
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"encoding/binary"
|
"encoding/binary"
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"os"
|
|
||||||
"slices"
|
"slices"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
@ -361,7 +360,7 @@ func DecodeGGML(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
|
|||||||
}, offset, nil
|
}, offset, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload uint64) {
|
func (llm GGML) GraphSize(context, batch uint64) (kv, partialOffload, fullOffload uint64) {
|
||||||
embedding := llm.KV().EmbeddingLength()
|
embedding := llm.KV().EmbeddingLength()
|
||||||
heads := llm.KV().HeadCount()
|
heads := llm.KV().HeadCount()
|
||||||
headsKV := llm.KV().HeadCountKV()
|
headsKV := llm.KV().HeadCountKV()
|
||||||
@ -369,9 +368,12 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
|
|||||||
|
|
||||||
embeddingHeads := llm.KV().EmbeddingHeadCount()
|
embeddingHeads := llm.KV().EmbeddingHeadCount()
|
||||||
embeddingHeadsK := llm.KV().EmbeddingHeadCountK()
|
embeddingHeadsK := llm.KV().EmbeddingHeadCountK()
|
||||||
|
embeddingHeadsV := llm.KV().EmbeddingHeadCountV()
|
||||||
|
|
||||||
layers := llm.Tensors().Layers()
|
layers := llm.Tensors().Layers()
|
||||||
|
|
||||||
|
kv = 2 * context * llm.KV().BlockCount() * (embeddingHeadsK + embeddingHeadsV) * headsKV
|
||||||
|
|
||||||
switch llm.KV().Architecture() {
|
switch llm.KV().Architecture() {
|
||||||
case "llama":
|
case "llama":
|
||||||
fullOffload = max(
|
fullOffload = max(
|
||||||
@ -401,6 +403,42 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
|
|||||||
4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16),
|
4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
case "mllama":
|
||||||
|
var visionTokens, tiles uint64 = 1601, 4
|
||||||
|
|
||||||
|
if crossAttentionLayers, ok := llm.KV()["mllama.attention.cross_attention_layers"].(*array); ok {
|
||||||
|
kv = headsKV *
|
||||||
|
(embeddingHeadsK + embeddingHeadsV) * // one for K, one for V
|
||||||
|
(2* // sizeof(float16)
|
||||||
|
(llm.KV().BlockCount()-uint64(crossAttentionLayers.size))* // num non-cross attention layers
|
||||||
|
context +
|
||||||
|
4* // sizeof(float32)
|
||||||
|
uint64(crossAttentionLayers.size)* // num cross attention layers
|
||||||
|
visionTokens*
|
||||||
|
tiles)
|
||||||
|
}
|
||||||
|
|
||||||
|
fullOffload = max(
|
||||||
|
4*batch*(2+3*embedding+embeddingHeadsK*heads+context*(1+heads)),
|
||||||
|
// vocab graph
|
||||||
|
4*batch*(embedding+vocab),
|
||||||
|
)
|
||||||
|
|
||||||
|
var ropeFreqsCount uint64
|
||||||
|
if ropeFreqs, ok := llm.Tensors().Layers()["rope_freqs"]; ok {
|
||||||
|
if ropeFreqsWeights, ok := ropeFreqs["weights"]; ok {
|
||||||
|
ropeFreqsCount = ropeFreqsWeights.parameters()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
partialOffload = max(
|
||||||
|
4*(batch*
|
||||||
|
(2*embedding+1+context*(1+heads)+embeddingHeadsK*heads)+
|
||||||
|
ropeFreqsCount+
|
||||||
|
embeddingHeadsK*context*headsKV),
|
||||||
|
// vocab graph
|
||||||
|
4*batch*(embedding+vocab)+embedding*vocab*105/128,
|
||||||
|
)
|
||||||
case "gemma", "gemma2":
|
case "gemma", "gemma2":
|
||||||
fullOffload = max(
|
fullOffload = max(
|
||||||
4*batch*(embedding+vocab),
|
4*batch*(embedding+vocab),
|
||||||
@ -489,23 +527,3 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
|
|||||||
|
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// LoadModel will load a model from disk. The model must be in the GGML format.
|
|
||||||
//
|
|
||||||
// It collects array values for arrays with a size less than or equal to
|
|
||||||
// maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
|
|
||||||
// the maxArraySize is negative, all arrays are collected.
|
|
||||||
func LoadModel(model string, maxArraySize int) (*GGML, error) {
|
|
||||||
if _, err := os.Stat(model); err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
f, err := os.Open(model)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
defer f.Close()
|
|
||||||
|
|
||||||
ggml, _, err := DecodeGGML(f, maxArraySize)
|
|
||||||
return ggml, err
|
|
||||||
}
|
|
1
llm/ggml_test.go
Normal file
1
llm/ggml_test.go
Normal file
@ -0,0 +1 @@
|
|||||||
|
package llm
|
@ -1,4 +1,4 @@
|
|||||||
package fileutils
|
package llm
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
@ -1,4 +1,4 @@
|
|||||||
package runners
|
package llm
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"syscall"
|
"syscall"
|
@ -1,4 +1,4 @@
|
|||||||
package runners
|
package llm
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"syscall"
|
"syscall"
|
@ -1,4 +1,4 @@
|
|||||||
package runners
|
package llm
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"syscall"
|
"syscall"
|
@ -1,4 +1,4 @@
|
|||||||
package fileutils
|
package llm
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
@ -123,13 +123,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
|
|||||||
slog.Warn("model missing blk.0 layer size")
|
slog.Warn("model missing blk.0 layer size")
|
||||||
}
|
}
|
||||||
|
|
||||||
// fp16 k,v = sizeof(float16) * n_ctx * n_layer * (n_embd_head_k + n_embd_head_v) * n_head_kv
|
kv, graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
|
||||||
var kv uint64 = 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * (ggml.KV().EmbeddingHeadCountK() + ggml.KV().EmbeddingHeadCountV()) * ggml.KV().HeadCountKV()
|
|
||||||
|
|
||||||
// KV is proportional to the number of layers
|
|
||||||
layerSize += kv / ggml.KV().BlockCount()
|
|
||||||
|
|
||||||
graphPartialOffload, graphFullOffload = ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
|
|
||||||
if graphPartialOffload == 0 {
|
if graphPartialOffload == 0 {
|
||||||
graphPartialOffload = ggml.KV().GQA() * kv / 6
|
graphPartialOffload = ggml.KV().GQA() * kv / 6
|
||||||
}
|
}
|
||||||
@ -137,6 +131,9 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
|
|||||||
graphFullOffload = graphPartialOffload
|
graphFullOffload = graphPartialOffload
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// KV is proportional to the number of layers
|
||||||
|
layerSize += kv / ggml.KV().BlockCount()
|
||||||
|
|
||||||
// on metal there's no partial offload overhead
|
// on metal there's no partial offload overhead
|
||||||
if gpus[0].Library == "metal" {
|
if gpus[0].Library == "metal" {
|
||||||
graphPartialOffload = graphFullOffload
|
graphPartialOffload = graphFullOffload
|
||||||
@ -329,7 +326,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
|
|||||||
return estimate
|
return estimate
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m MemoryEstimate) Log() {
|
func (m MemoryEstimate) log() {
|
||||||
overhead := envconfig.GpuOverhead()
|
overhead := envconfig.GpuOverhead()
|
||||||
|
|
||||||
log := slog.With()
|
log := slog.With()
|
@ -1,4 +1,4 @@
|
|||||||
package fileutils
|
package llm
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
@ -1,4 +1,4 @@
|
|||||||
package runners
|
package llm
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bufio"
|
"bufio"
|
||||||
@ -28,11 +28,24 @@ import (
|
|||||||
"github.com/ollama/ollama/build"
|
"github.com/ollama/ollama/build"
|
||||||
"github.com/ollama/ollama/discover"
|
"github.com/ollama/ollama/discover"
|
||||||
"github.com/ollama/ollama/envconfig"
|
"github.com/ollama/ollama/envconfig"
|
||||||
"github.com/ollama/ollama/fileutils"
|
|
||||||
"github.com/ollama/ollama/format"
|
"github.com/ollama/ollama/format"
|
||||||
"github.com/ollama/ollama/llama"
|
"github.com/ollama/ollama/llama"
|
||||||
|
"github.com/ollama/ollama/runners"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
type LlamaServer interface {
|
||||||
|
Ping(ctx context.Context) error
|
||||||
|
WaitUntilRunning(ctx context.Context) error
|
||||||
|
Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error
|
||||||
|
Embedding(ctx context.Context, input string) ([]float32, error)
|
||||||
|
Tokenize(ctx context.Context, content string) ([]int, error)
|
||||||
|
Detokenize(ctx context.Context, tokens []int) (string, error)
|
||||||
|
Close() error
|
||||||
|
EstimatedVRAM() uint64 // Total VRAM across all GPUs
|
||||||
|
EstimatedTotal() uint64
|
||||||
|
EstimatedVRAMByGPU(gpuID string) uint64
|
||||||
|
}
|
||||||
|
|
||||||
// llmServer is an instance of the llama.cpp server
|
// llmServer is an instance of the llama.cpp server
|
||||||
type llmServer struct {
|
type llmServer struct {
|
||||||
port int
|
port int
|
||||||
@ -45,7 +58,7 @@ type llmServer struct {
|
|||||||
modelLock sync.Mutex // Temporary until we switch fully to Go server
|
modelLock sync.Mutex // Temporary until we switch fully to Go server
|
||||||
model *llama.Model // If non-nil, the runner is a new Go server
|
model *llama.Model // If non-nil, the runner is a new Go server
|
||||||
|
|
||||||
estimate fileutils.MemoryEstimate
|
estimate MemoryEstimate
|
||||||
totalLayers uint64
|
totalLayers uint64
|
||||||
// gpuCount int
|
// gpuCount int
|
||||||
gpus discover.GpuInfoList // Recorded just before the model loaded, free space will be incorrect
|
gpus discover.GpuInfoList // Recorded just before the model loaded, free space will be incorrect
|
||||||
@ -55,12 +68,32 @@ type llmServer struct {
|
|||||||
sem *semaphore.Weighted
|
sem *semaphore.Weighted
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// LoadModel will load a model from disk. The model must be in the GGML format.
|
||||||
|
//
|
||||||
|
// It collects array values for arrays with a size less than or equal to
|
||||||
|
// maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
|
||||||
|
// the maxArraySize is negative, all arrays are collected.
|
||||||
|
func LoadModel(model string, maxArraySize int) (*GGML, error) {
|
||||||
|
if _, err := os.Stat(model); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
f, err := os.Open(model)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer f.Close()
|
||||||
|
|
||||||
|
ggml, _, err := DecodeGGML(f, maxArraySize)
|
||||||
|
return ggml, err
|
||||||
|
}
|
||||||
|
|
||||||
// NewLlamaServer will run a server for the given GPUs
|
// NewLlamaServer will run a server for the given GPUs
|
||||||
// The gpu list must be a single family.
|
// The gpu list must be a single family.
|
||||||
func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *fileutils.GGML, adapters, projectors []string, opts api.Options, numParallel int) (LLMServer, error) {
|
func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
|
||||||
var err error
|
var err error
|
||||||
var cpuRunner string
|
var cpuRunner string
|
||||||
var estimate fileutils.MemoryEstimate
|
var estimate MemoryEstimate
|
||||||
var systemTotalMemory uint64
|
var systemTotalMemory uint64
|
||||||
var systemFreeMemory uint64
|
var systemFreeMemory uint64
|
||||||
var systemSwapFreeMemory uint64
|
var systemSwapFreeMemory uint64
|
||||||
@ -76,10 +109,10 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *fileutils.GGM
|
|||||||
gpus = discover.GetCPUInfo()
|
gpus = discover.GetCPUInfo()
|
||||||
}
|
}
|
||||||
if len(gpus) == 1 && gpus[0].Library == "cpu" {
|
if len(gpus) == 1 && gpus[0].Library == "cpu" {
|
||||||
cpuRunner = ServerForCpu()
|
cpuRunner = runners.ServerForCpu()
|
||||||
estimate = fileutils.EstimateGPULayers(gpus, ggml, projectors, opts)
|
estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
|
||||||
} else {
|
} else {
|
||||||
estimate = fileutils.EstimateGPULayers(gpus, ggml, projectors, opts)
|
estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
|
||||||
|
|
||||||
switch {
|
switch {
|
||||||
case gpus[0].Library == "metal" && estimate.VRAMSize > systemTotalMemory:
|
case gpus[0].Library == "metal" && estimate.VRAMSize > systemTotalMemory:
|
||||||
@ -88,7 +121,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *fileutils.GGM
|
|||||||
opts.NumGPU = 0
|
opts.NumGPU = 0
|
||||||
case gpus[0].Library != "metal" && estimate.Layers == 0:
|
case gpus[0].Library != "metal" && estimate.Layers == 0:
|
||||||
// Don't bother loading into the GPU if no layers can fit
|
// Don't bother loading into the GPU if no layers can fit
|
||||||
cpuRunner = ServerForCpu()
|
cpuRunner = runners.ServerForCpu()
|
||||||
gpus = discover.GetCPUInfo()
|
gpus = discover.GetCPUInfo()
|
||||||
case opts.NumGPU < 0 && estimate.Layers > 0 && gpus[0].Library != "cpu":
|
case opts.NumGPU < 0 && estimate.Layers > 0 && gpus[0].Library != "cpu":
|
||||||
opts.NumGPU = estimate.Layers
|
opts.NumGPU = estimate.Layers
|
||||||
@ -106,7 +139,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *fileutils.GGM
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
estimate.Log()
|
estimate.log()
|
||||||
|
|
||||||
// Loop through potential servers
|
// Loop through potential servers
|
||||||
finalErr := errors.New("no suitable llama servers found")
|
finalErr := errors.New("no suitable llama servers found")
|
||||||
@ -115,12 +148,12 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *fileutils.GGM
|
|||||||
return nil, errors.New("ollama supports only one lora adapter, but multiple were provided")
|
return nil, errors.New("ollama supports only one lora adapter, but multiple were provided")
|
||||||
}
|
}
|
||||||
|
|
||||||
rDir, err := Refresh(build.EmbedFS)
|
rDir, err := runners.Refresh(build.EmbedFS)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
availableServers := GetAvailableServers(rDir)
|
availableServers := runners.GetAvailableServers(rDir)
|
||||||
if len(availableServers) == 0 {
|
if len(availableServers) == 0 {
|
||||||
return nil, finalErr
|
return nil, finalErr
|
||||||
}
|
}
|
||||||
@ -128,7 +161,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *fileutils.GGM
|
|||||||
if cpuRunner != "" {
|
if cpuRunner != "" {
|
||||||
servers = []string{cpuRunner}
|
servers = []string{cpuRunner}
|
||||||
} else {
|
} else {
|
||||||
servers = ServersForGpu(gpus[0]) // All GPUs in the list are matching Library and Variant
|
servers = runners.ServersForGpu(gpus[0]) // All GPUs in the list are matching Library and Variant
|
||||||
}
|
}
|
||||||
demandLib := envconfig.LLMLibrary()
|
demandLib := envconfig.LLMLibrary()
|
||||||
if demandLib != "" {
|
if demandLib != "" {
|
||||||
@ -153,7 +186,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *fileutils.GGM
|
|||||||
"--model", model,
|
"--model", model,
|
||||||
"--ctx-size", strconv.Itoa(opts.NumCtx),
|
"--ctx-size", strconv.Itoa(opts.NumCtx),
|
||||||
"--batch-size", strconv.Itoa(opts.NumBatch),
|
"--batch-size", strconv.Itoa(opts.NumBatch),
|
||||||
"--embedding",
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if opts.NumGPU >= 0 {
|
if opts.NumGPU >= 0 {
|
||||||
@ -185,10 +217,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *fileutils.GGM
|
|||||||
params = append(params, "--threads", strconv.Itoa(defaultThreads))
|
params = append(params, "--threads", strconv.Itoa(defaultThreads))
|
||||||
}
|
}
|
||||||
|
|
||||||
if !opts.F16KV {
|
|
||||||
params = append(params, "--memory-f32")
|
|
||||||
}
|
|
||||||
|
|
||||||
flashAttnEnabled := envconfig.FlashAttention()
|
flashAttnEnabled := envconfig.FlashAttention()
|
||||||
|
|
||||||
for _, g := range gpus {
|
for _, g := range gpus {
|
||||||
@ -278,9 +306,9 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *fileutils.GGM
|
|||||||
|
|
||||||
// Note: we always put the dependency path first
|
// Note: we always put the dependency path first
|
||||||
// since this was the exact version we compiled/linked against
|
// since this was the exact version we compiled/linked against
|
||||||
if gpus[0].DependencyPath != "" {
|
if gpus[0].DependencyPath != nil {
|
||||||
// assume gpus from the same library have the same dependency path
|
// assume gpus from the same library have the same dependency path
|
||||||
libraryPaths = append([]string{gpus[0].DependencyPath}, libraryPaths...)
|
libraryPaths = append(gpus[0].DependencyPath, libraryPaths...)
|
||||||
}
|
}
|
||||||
|
|
||||||
server := filepath.Join(dir, "ollama_llama_server")
|
server := filepath.Join(dir, "ollama_llama_server")
|
||||||
@ -292,7 +320,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *fileutils.GGM
|
|||||||
_, err := os.Stat(server)
|
_, err := os.Stat(server)
|
||||||
if errors.Is(err, os.ErrNotExist) {
|
if errors.Is(err, os.ErrNotExist) {
|
||||||
slog.Warn("llama server disappeared, reinitializing payloads", "path", server, "error", err)
|
slog.Warn("llama server disappeared, reinitializing payloads", "path", server, "error", err)
|
||||||
_, err = Refresh(build.EmbedFS)
|
_, err = runners.Refresh(build.EmbedFS)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
slog.Warn("failed to reinitialize payloads", "error", err)
|
slog.Warn("failed to reinitialize payloads", "error", err)
|
||||||
return nil, err
|
return nil, err
|
||||||
@ -640,6 +668,23 @@ type completion struct {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type CompletionRequest struct {
|
||||||
|
Prompt string
|
||||||
|
Format string
|
||||||
|
Images []ImageData
|
||||||
|
Options *api.Options
|
||||||
|
}
|
||||||
|
|
||||||
|
type CompletionResponse struct {
|
||||||
|
Content string
|
||||||
|
DoneReason string
|
||||||
|
Done bool
|
||||||
|
PromptEvalCount int
|
||||||
|
PromptEvalDuration time.Duration
|
||||||
|
EvalCount int
|
||||||
|
EvalDuration time.Duration
|
||||||
|
}
|
||||||
|
|
||||||
func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error {
|
func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error {
|
||||||
if err := s.sem.Acquire(ctx, 1); err != nil {
|
if err := s.sem.Acquire(ctx, 1); err != nil {
|
||||||
slog.Error("Failed to acquire semaphore", "error", err)
|
slog.Error("Failed to acquire semaphore", "error", err)
|
||||||
@ -908,7 +953,10 @@ func (s *llmServer) Tokenize(ctx context.Context, content string) ([]int, error)
|
|||||||
if resp.StatusCode == http.StatusNotFound {
|
if resp.StatusCode == http.StatusNotFound {
|
||||||
if s.model == nil {
|
if s.model == nil {
|
||||||
slog.Debug("new runner detected, loading model for cgo tokenization")
|
slog.Debug("new runner detected, loading model for cgo tokenization")
|
||||||
m := llama.LoadModelFromFile(s.modelPath, llama.ModelParams{VocabOnly: true})
|
m, err := llama.LoadModelFromFile(s.modelPath, llama.ModelParams{VocabOnly: true})
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
s.model = m
|
s.model = m
|
||||||
}
|
}
|
||||||
return s.model.Tokenize(content, false, true)
|
return s.model.Tokenize(content, false, true)
|
||||||
@ -977,7 +1025,10 @@ func (s *llmServer) Detokenize(ctx context.Context, tokens []int) (string, error
|
|||||||
if resp.StatusCode == http.StatusNotFound {
|
if resp.StatusCode == http.StatusNotFound {
|
||||||
if s.model == nil {
|
if s.model == nil {
|
||||||
slog.Debug("new runner detected, loading model for cgo tokenization")
|
slog.Debug("new runner detected, loading model for cgo tokenization")
|
||||||
m := llama.LoadModelFromFile(s.modelPath, llama.ModelParams{VocabOnly: true})
|
m, err := llama.LoadModelFromFile(s.modelPath, llama.ModelParams{VocabOnly: true})
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
s.model = m
|
s.model = m
|
||||||
}
|
}
|
||||||
var resp string
|
var resp string
|
@ -1,4 +1,4 @@
|
|||||||
package runners
|
package llm
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
@ -440,7 +440,6 @@ func TestParseFileParameters(t *testing.T) {
|
|||||||
"num_gpu 1": {"num_gpu", "1"},
|
"num_gpu 1": {"num_gpu", "1"},
|
||||||
"main_gpu 1": {"main_gpu", "1"},
|
"main_gpu 1": {"main_gpu", "1"},
|
||||||
"low_vram true": {"low_vram", "true"},
|
"low_vram true": {"low_vram", "true"},
|
||||||
"f16_kv true": {"f16_kv", "true"},
|
|
||||||
"logits_all true": {"logits_all", "true"},
|
"logits_all true": {"logits_all", "true"},
|
||||||
"vocab_only true": {"vocab_only", "true"},
|
"vocab_only true": {"vocab_only", "true"},
|
||||||
"use_mmap true": {"use_mmap", "true"},
|
"use_mmap true": {"use_mmap", "true"},
|
||||||
|
@ -1,3 +0,0 @@
|
|||||||
# `runners`
|
|
||||||
|
|
||||||
Ollama uses a subprocess model to run one or more child processes to load the LLM. On some platforms (Linux non-containerized, MacOS) these executables are carried as payloads inside the main executable via the ../build package. Extraction and discovery of these runners at runtime is implemented in this package. This package also provides the abstraction to communicate with these subprocesses.
|
|
@ -2,7 +2,6 @@ package runners
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"compress/gzip"
|
"compress/gzip"
|
||||||
"context"
|
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
@ -16,11 +15,9 @@ import (
|
|||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
"syscall"
|
"syscall"
|
||||||
"time"
|
|
||||||
|
|
||||||
"golang.org/x/sync/errgroup"
|
"golang.org/x/sync/errgroup"
|
||||||
|
|
||||||
"github.com/ollama/ollama/api"
|
|
||||||
"github.com/ollama/ollama/discover"
|
"github.com/ollama/ollama/discover"
|
||||||
"github.com/ollama/ollama/envconfig"
|
"github.com/ollama/ollama/envconfig"
|
||||||
)
|
)
|
||||||
@ -34,36 +31,6 @@ var (
|
|||||||
runnersDir = ""
|
runnersDir = ""
|
||||||
)
|
)
|
||||||
|
|
||||||
type CompletionRequest struct {
|
|
||||||
Prompt string
|
|
||||||
Format string
|
|
||||||
Images []ImageData
|
|
||||||
Options *api.Options
|
|
||||||
}
|
|
||||||
|
|
||||||
type CompletionResponse struct {
|
|
||||||
Content string
|
|
||||||
DoneReason string
|
|
||||||
Done bool
|
|
||||||
PromptEvalCount int
|
|
||||||
PromptEvalDuration time.Duration
|
|
||||||
EvalCount int
|
|
||||||
EvalDuration time.Duration
|
|
||||||
}
|
|
||||||
|
|
||||||
type LLMServer interface {
|
|
||||||
Ping(ctx context.Context) error
|
|
||||||
WaitUntilRunning(ctx context.Context) error
|
|
||||||
Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error
|
|
||||||
Embedding(ctx context.Context, input string) ([]float32, error)
|
|
||||||
Tokenize(ctx context.Context, content string) ([]int, error)
|
|
||||||
Detokenize(ctx context.Context, tokens []int) (string, error)
|
|
||||||
Close() error
|
|
||||||
EstimatedVRAM() uint64 // Total VRAM across all GPUs
|
|
||||||
EstimatedTotal() uint64
|
|
||||||
EstimatedVRAMByGPU(gpuID string) uint64
|
|
||||||
}
|
|
||||||
|
|
||||||
// Return the location where runners are stored
|
// Return the location where runners are stored
|
||||||
// If runners are payloads, this will either extract them
|
// If runners are payloads, this will either extract them
|
||||||
// or refresh them if any have disappeared due to tmp cleaners
|
// or refresh them if any have disappeared due to tmp cleaners
|
||||||
|
@ -6,17 +6,18 @@ set -e
|
|||||||
|
|
||||||
mkdir -p dist
|
mkdir -p dist
|
||||||
|
|
||||||
|
# These require Xcode v13 or older to target MacOS v11
|
||||||
|
# If installed to an alternate location use the following to enable
|
||||||
|
# export SDKROOT=/Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk
|
||||||
|
# export DEVELOPER_DIR=/Applications/Xcode_12.5.1.app/Contents/Developer
|
||||||
|
export CGO_CFLAGS=-mmacosx-version-min=11.3
|
||||||
|
export CGO_CXXFLAGS=-mmacosx-version-min=11.3
|
||||||
|
export CGO_LDFLAGS=-mmacosx-version-min=11.3
|
||||||
|
|
||||||
for TARGETARCH in arm64 amd64; do
|
for TARGETARCH in arm64 amd64; do
|
||||||
echo "Building Go runner darwin $TARGETARCH"
|
echo "Building Go runner darwin $TARGETARCH"
|
||||||
rm -rf llama/build
|
rm -rf llama/build
|
||||||
GOOS=darwin ARCH=$TARGETARCH GOARCH=$TARGETARCH make -C llama -j 8
|
GOOS=darwin ARCH=$TARGETARCH GOARCH=$TARGETARCH make -C llama -j 8
|
||||||
# These require Xcode v13 or older to target MacOS v11
|
|
||||||
# If installed to an alternate location use the following to enable
|
|
||||||
# export SDKROOT=/Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk
|
|
||||||
# export DEVELOPER_DIR=/Applications/Xcode_12.5.1.app/Contents/Developer
|
|
||||||
export CGO_CFLAGS=-mmacosx-version-min=11.3
|
|
||||||
export CGO_CXXFLAGS=-mmacosx-version-min=11.3
|
|
||||||
export CGO_LDFLAGS=-mmacosx-version-min=11.3
|
|
||||||
CGO_ENABLED=1 GOOS=darwin GOARCH=$TARGETARCH go build -trimpath -o dist/ollama-darwin-$TARGETARCH
|
CGO_ENABLED=1 GOOS=darwin GOARCH=$TARGETARCH go build -trimpath -o dist/ollama-darwin-$TARGETARCH
|
||||||
CGO_ENABLED=1 GOOS=darwin GOARCH=$TARGETARCH go build -trimpath -cover -o dist/ollama-darwin-$TARGETARCH-cov
|
CGO_ENABLED=1 GOOS=darwin GOARCH=$TARGETARCH go build -trimpath -cover -o dist/ollama-darwin-$TARGETARCH-cov
|
||||||
done
|
done
|
||||||
|
@@ -75,7 +75,6 @@ function checkEnv() {
     } else {
         write-host "Code signing disabled - please set KEY_CONTAINERS to sign and copy ollama_inc.crt to the top of the source tree"
     }

 }

-
@@ -91,11 +90,6 @@ function buildOllama() {
     write-host "Building ollama CLI"
     & go build -trimpath -ldflags "-s -w -X=github.com/ollama/ollama/version.Version=$script:VERSION -X=github.com/ollama/ollama/server.mode=release" .
     if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-    if ("${env:KEY_CONTAINER}") {
-        & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
-            /csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} ollama.exe
-        if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-    }
     New-Item -ItemType Directory -Path .\dist\windows-${script:TARGET_ARCH}\ -Force
     cp .\ollama.exe .\dist\windows-${script:TARGET_ARCH}\
 }
@@ -106,11 +100,6 @@ function buildApp() {
     & windres -l 0 -o ollama.syso ollama.rc
     & go build -trimpath -ldflags "-s -w -H windowsgui -X=github.com/ollama/ollama/version.Version=$script:VERSION -X=github.com/ollama/ollama/server.mode=release" -o "${script:SRC_DIR}\dist\windows-${script:TARGET_ARCH}-app.exe" .
     if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-    if ("${env:KEY_CONTAINER}") {
-        & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
-            /csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} "${script:SRC_DIR}\dist\windows-${script:TARGET_ARCH}-app.exe"
-        if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-    }
 }

 function gatherDependencies() {
@@ -143,16 +132,19 @@ function gatherDependencies() {
         copy-item -path "${env:VCToolsRedistDir}\vc_redist.arm64.exe" -destination "${script:DIST_DIR}" -verbose
     }

-
     cp "${script:SRC_DIR}\app\ollama_welcome.ps1" "${script:SRC_DIR}\dist\"
+}
+
+function sign() {
     if ("${env:KEY_CONTAINER}") {
-        write-host "about to sign"
-        foreach ($file in (get-childitem "${script:DIST_DIR}\lib\ollama\cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){
-            write-host "signing $file"
-            & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
-                /csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} $file
+        write-host "Signing Ollama executables, scripts and libraries"
+        & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
+            /csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} `
+            $(get-childitem -path "${script:SRC_DIR}\dist" -r -include @('ollama_welcome.ps1')) `
+            $(get-childitem -path "${script:SRC_DIR}\dist\windows-*" -r -include @('*.exe', '*.dll'))
         if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-        }
+    } else {
+        write-host "Signing not enabled"
     }
 }

@@ -183,6 +175,7 @@ try {
         buildOllama
         buildApp
         gatherDependencies
+        sign
         buildInstaller
         distZip
     } else {

@@ -120,6 +120,78 @@ func TestGetOptimalTiledCanvas(t *testing.T) {
             TileSize:      560,
             Expected:      image.Point{1120, 1120},
         },
+        {
+            ImageSize:     image.Point{800, 600},
+            MaxImageTiles: 4,
+            TileSize:      560,
+            Expected:      image.Point{1120, 1120},
+        },
+        {
+            ImageSize:     image.Point{640, 480},
+            MaxImageTiles: 4,
+            TileSize:      560,
+            Expected:      image.Point{1120, 560},
+        },
+        {
+            ImageSize:     image.Point{320, 200},
+            MaxImageTiles: 4,
+            TileSize:      560,
+            Expected:      image.Point{560, 560},
+        },
+        {
+            ImageSize:     image.Point{1320, 200},
+            MaxImageTiles: 4,
+            TileSize:      560,
+            Expected:      image.Point{1680, 560},
+        },
+        {
+            ImageSize:     image.Point{2000, 200},
+            MaxImageTiles: 4,
+            TileSize:      560,
+            Expected:      image.Point{2240, 560},
+        },
+        {
+            ImageSize:     image.Point{10000, 200},
+            MaxImageTiles: 4,
+            TileSize:      560,
+            Expected:      image.Point{2240, 560},
+        },
+        {
+            ImageSize:     image.Point{480, 640},
+            MaxImageTiles: 4,
+            TileSize:      560,
+            Expected:      image.Point{560, 1120},
+        },
+        {
+            ImageSize:     image.Point{200, 320},
+            MaxImageTiles: 4,
+            TileSize:      560,
+            Expected:      image.Point{560, 560},
+        },
+        {
+            ImageSize:     image.Point{200, 1320},
+            MaxImageTiles: 4,
+            TileSize:      560,
+            Expected:      image.Point{560, 1680},
+        },
+        {
+            ImageSize:     image.Point{200, 2000},
+            MaxImageTiles: 4,
+            TileSize:      560,
+            Expected:      image.Point{560, 2240},
+        },
+        {
+            ImageSize:     image.Point{200, 10000},
+            MaxImageTiles: 4,
+            TileSize:      560,
+            Expected:      image.Point{560, 2240},
+        },
+        {
+            ImageSize:     image.Point{10000, 10000},
+            MaxImageTiles: 4,
+            TileSize:      560,
+            Expected:      image.Point{1120, 1120},
+        },
     }

     for _, c := range cases {

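The new canvas cases above all use a 560 px tile and a budget of four tiles, and every expected canvas works out to a whole number of tiles that stays within that budget. As a quick illustration of that property only (not part of the test file; the names below are hypothetical), a minimal standalone Go check could look like this:

// Illustrative only: checks the property encoded by the new test cases above,
// namely that expected canvases are whole tiles and fit the tile budget.
package main

import "fmt"

type canvasCase struct {
	ExpectedW, ExpectedH    int
	MaxImageTiles, TileSize int
}

func main() {
	cases := []canvasCase{
		{1120, 1120, 4, 560},
		{1120, 560, 4, 560},
		{560, 560, 4, 560},
		{1680, 560, 4, 560},
		{2240, 560, 4, 560},
		{560, 1120, 4, 560},
		{560, 1680, 4, 560},
		{560, 2240, 4, 560},
	}
	for _, c := range cases {
		tilesW := c.ExpectedW / c.TileSize
		tilesH := c.ExpectedH / c.TileSize
		ok := c.ExpectedW%c.TileSize == 0 &&
			c.ExpectedH%c.TileSize == 0 &&
			tilesW*tilesH <= c.MaxImageTiles
		// e.g. "1680x560 -> 3 tiles, within budget: true"
		fmt.Printf("%dx%d -> %d tiles, within budget: %v\n", c.ExpectedW, c.ExpectedH, tilesW*tilesH, ok)
	}
}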
@@ -25,9 +25,9 @@ import (
     "github.com/ollama/ollama/api"
     "github.com/ollama/ollama/auth"
     "github.com/ollama/ollama/envconfig"
-    "github.com/ollama/ollama/fileutils"
     "github.com/ollama/ollama/format"
     "github.com/ollama/ollama/llama"
+    "github.com/ollama/ollama/llm"
     "github.com/ollama/ollama/parser"
     "github.com/ollama/ollama/template"
     "github.com/ollama/ollama/types/errtypes"
@@ -91,7 +91,7 @@ func (m *Model) CheckCapabilities(caps ...Capability) error {
         defer f.Close()

         // TODO(mxyng): decode the GGML into model to avoid doing this multiple times
-        ggml, _, err := fileutils.DecodeGGML(f, 0)
+        ggml, _, err := llm.DecodeGGML(f, 0)
         if err != nil {
             slog.Error("couldn't decode ggml", "error", err)
             continue
@@ -431,7 +431,7 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
             baseLayer.MediaType == "application/vnd.ollama.image.model" &&
             baseLayer.GGML != nil &&
             baseLayer.GGML.Name() == "gguf" {
-            want, err := fileutils.ParseFileType(quantization)
+            want, err := llm.ParseFileType(quantization)
             if err != nil {
                 return err
             }
@@ -467,7 +467,7 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
                 return err
             }

-            ggml, _, err := fileutils.DecodeGGML(temp, 0)
+            ggml, _, err := llm.DecodeGGML(temp, 0)
             if err != nil {
                 return err
             }
@@ -690,7 +690,8 @@ func CopyModel(src, dst model.Name) error {
 }

 func deleteUnusedLayers(deleteMap map[string]struct{}) error {
-    manifests, err := Manifests()
+    // Ignore corrupt manifests to avoid blocking deletion of layers that are freshly orphaned
+    manifests, err := Manifests(true)
     if err != nil {
         return err
     }
@@ -853,8 +854,8 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
     manifest, _, err := GetManifest(mp)
     if errors.Is(err, os.ErrNotExist) {
         // noop
-    } else if err != nil && !errors.Is(err, os.ErrNotExist) {
-        return err
+    } else if err != nil {
+        slog.Warn("pulling model with bad existing manifest", "name", name, "error", err)
     } else {
         for _, l := range manifest.Layers {
             deleteMap[l.Digest] = struct{}{}

@@ -106,7 +106,8 @@ func (l *Layer) Remove() error {
         return nil
     }

-    ms, err := Manifests()
+    // Ignore corrupt manifests to avoid blocking deletion of layers that are freshly orphaned
+    ms, err := Manifests(true)
     if err != nil {
         return err
     }

@@ -123,7 +123,7 @@ func WriteManifest(name model.Name, config Layer, layers []Layer) error {
     return json.NewEncoder(f).Encode(m)
 }

-func Manifests() (map[model.Name]*Manifest, error) {
+func Manifests(continueOnError bool) (map[model.Name]*Manifest, error) {
     manifests, err := GetManifestPath()
     if err != nil {
         return nil, err
@@ -145,22 +145,29 @@ func Manifests() (map[model.Name]*Manifest, error) {
         if !fi.IsDir() {
             rel, err := filepath.Rel(manifests, match)
             if err != nil {
+                if !continueOnError {
+                    return nil, fmt.Errorf("%s %w", match, err)
+                }
                 slog.Warn("bad filepath", "path", match, "error", err)
                 continue
             }

             n := model.ParseNameFromFilepath(rel)
             if !n.IsValid() {
+                if !continueOnError {
+                    return nil, fmt.Errorf("%s %w", rel, err)
+                }
                 slog.Warn("bad manifest name", "path", rel)
                 continue
             }

             m, err := ParseNamedManifest(n)
-            if syntax := &(json.SyntaxError{}); errors.As(err, &syntax) {
+            if err != nil {
+                if !continueOnError {
+                    return nil, fmt.Errorf("%s %w", n, err)
+                }
                 slog.Warn("bad manifest", "name", n, "error", err)
                 continue
-            } else if err != nil {
-                return nil, fmt.Errorf("%s: %w", n, err)
             }

             ms[n] = m

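The Manifests signature change above threads a continueOnError flag through manifest scanning: lenient callers log and skip corrupt entries, while strict callers fail fast on the first bad file. Below is a minimal standalone sketch of that pattern, assuming nothing about ollama's internals; scanManifests, the manifest struct, and the "manifests" directory are hypothetical stand-ins rather than the project's actual API:

// Illustrative only: a standalone sketch of the continueOnError scanning pattern.
package main

import (
	"encoding/json"
	"fmt"
	"log/slog"
	"os"
	"path/filepath"
)

type manifest struct {
	SchemaVersion int `json:"schemaVersion"`
}

// scanManifests decodes every .json file under dir. With continueOnError set,
// bad entries are logged and skipped; otherwise the first error aborts the scan.
func scanManifests(dir string, continueOnError bool) (map[string]manifest, error) {
	out := make(map[string]manifest)
	matches, err := filepath.Glob(filepath.Join(dir, "*.json"))
	if err != nil {
		return nil, err
	}
	for _, match := range matches {
		b, err := os.ReadFile(match)
		if err != nil {
			if !continueOnError {
				return nil, fmt.Errorf("%s: %w", match, err)
			}
			slog.Warn("unreadable manifest", "path", match, "error", err)
			continue
		}
		var m manifest
		if err := json.Unmarshal(b, &m); err != nil {
			if !continueOnError {
				return nil, fmt.Errorf("%s: %w", match, err)
			}
			slog.Warn("bad manifest", "path", match, "error", err)
			continue
		}
		out[match] = m
	}
	return out, nil
}

func main() {
	// Lenient scan, as in listing models or deleting unused layers.
	if ms, err := scanManifests("manifests", true); err == nil {
		fmt.Println("usable manifests:", len(ms))
	}
	// Strict scan, as in the pre-prune check: any corrupt file aborts the operation.
	if _, err := scanManifests("manifests", false); err != nil {
		fmt.Println("skipping prune:", err)
	}
}

This mirrors the split visible in the diff, where listing and deletion paths call Manifests(true) while the prune path in Serve probes with Manifests(false) first.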
@@ -112,7 +112,7 @@ func TestManifests(t *testing.T) {
         createManifest(t, d, p)
     }

-    ms, err := Manifests()
+    ms, err := Manifests(true)
     if err != nil {
         t.Fatal(err)
     }

@@ -18,7 +18,7 @@ import (

     "github.com/ollama/ollama/api"
     "github.com/ollama/ollama/convert"
-    "github.com/ollama/ollama/fileutils"
+    "github.com/ollama/ollama/llm"
     "github.com/ollama/ollama/template"
     "github.com/ollama/ollama/types/model"
 )
@@ -27,7 +27,7 @@ var intermediateBlobs map[string]string = make(map[string]string)

 type layerGGML struct {
     Layer
-    *fileutils.GGML
+    *llm.GGML
 }

 func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) {
@@ -67,7 +67,7 @@ func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressRe
     }
     defer blob.Close()

-    ggml, _, err := fileutils.DecodeGGML(blob, 0)
+    ggml, _, err := llm.DecodeGGML(blob, 0)
     if err != nil {
         return nil, err
     }
@@ -112,7 +112,7 @@ func parseFromZipFile(_ context.Context, command string, baseLayers []*layerGGML

     switch command {
     case "adapter":
-        var baseModel *fileutils.GGML
+        var baseModel *llm.GGML
         for _, l := range baseLayers {
             if l.GGML != nil {
                 baseModel = l.GGML
@@ -150,7 +150,7 @@ func parseFromZipFile(_ context.Context, command string, baseLayers []*layerGGML
     }
     defer bin.Close()

-    ggml, _, err := fileutils.DecodeGGML(bin, 0)
+    ggml, _, err := llm.DecodeGGML(bin, 0)
     if err != nil {
         return nil, err
     }
@@ -184,7 +184,7 @@ func parseFromFile(ctx context.Context, command string, baseLayers []*layerGGML,

     var offset int64
     for offset < stat.Size() {
-        ggml, n, err := fileutils.DecodeGGML(file, 0)
+        ggml, n, err := llm.DecodeGGML(file, 0)
         if errors.Is(err, io.EOF) {
             break
         } else if err != nil {
@@ -263,7 +263,7 @@ func detectContentType(r io.Reader) (string, error) {
         return "", err
     }

-    if contentType := fileutils.DetectGGMLType(b.Bytes()); contentType != "" {
+    if contentType := llm.DetectGGMLType(b.Bytes()); contentType != "" {
         return contentType, nil
     }

@@ -13,7 +13,7 @@ import (
     "github.com/google/go-cmp/cmp"

     "github.com/ollama/ollama/api"
-    "github.com/ollama/ollama/fileutils"
+    "github.com/ollama/ollama/llm"
     "github.com/ollama/ollama/template"
 )

@@ -147,7 +147,7 @@ func TestParseFromFileFromLayer(t *testing.T) {
         t.Fatalf("failed to open file: %v", err)
     }
     defer file.Close()
-    if err := fileutils.WriteGGUF(file, fileutils.KV{"general.architecture": "gemma"}, []fileutils.Tensor{}); err != nil {
+    if err := llm.WriteGGUF(file, llm.KV{"general.architecture": "gemma"}, []llm.Tensor{}); err != nil {
         t.Fatalf("failed to write gguf: %v", err)
     }

@@ -200,7 +200,7 @@ func TestParseLayerFromCopy(t *testing.T) {
     defer file2.Close()

     for range 5 {
-        if err := fileutils.WriteGGUF(file2, fileutils.KV{"general.architecture": "gemma"}, []fileutils.Tensor{}); err != nil {
+        if err := llm.WriteGGUF(file2, llm.KV{"general.architecture": "gemma"}, []llm.Tensor{}); err != nil {
             t.Fatalf("failed to write gguf: %v", err)
         }
     }

@@ -10,7 +10,7 @@ import (
     "strings"

     "github.com/ollama/ollama/api"
-    "github.com/ollama/ollama/runners"
+    "github.com/ollama/ollama/llm"
     "github.com/ollama/ollama/server/imageproc"
     "github.com/ollama/ollama/template"
 )
@@ -22,11 +22,21 @@ var errTooManyImages = errors.New("vision model only supports a single image per
 // chatPrompt accepts a list of messages and returns the prompt and images that should be used for the next chat turn.
 // chatPrompt truncates any messages that exceed the context window of the model, making sure to always include 1) the
 // latest message and 2) system messages
-func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.Options, msgs []api.Message, tools []api.Tool) (prompt string, images []runners.ImageData, _ error) {
+func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.Options, msgs []api.Message, tools []api.Tool) (prompt string, images []llm.ImageData, _ error) {
     var system []api.Message

     isMllama := checkMllamaModelFamily(m)

+    var imageNumTokens int
+    // TODO: Ideally we would compute this from the projector metadata but some pieces are implementation dependent
+    if isMllama {
+        // Our mllama implementation packs all of the embeddings into a single token
+        imageNumTokens = 1
+    } else {
+        // Clip images are represented as 768 tokens, each an embedding
+        imageNumTokens = 768
+    }
+
     n := len(msgs) - 1
     // in reverse, find all messages that fit into context window
     for i := n; i >= 0; i-- {
@@ -59,9 +69,7 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
         ctxLen := len(s)
         if m.ProjectorPaths != nil {
             for _, m := range msgs[i:] {
-                // images are represented as 768 sized embeddings
-                // TODO: get embedding length from project metadata
-                ctxLen += 768 * len(m.Images)
+                ctxLen += imageNumTokens * len(m.Images)
             }
         }

@@ -75,11 +83,16 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.

     currMsgIdx := n

-    if isMllama {
-        lastMsgIdx := len(msgs) - 1
-        for i := lastMsgIdx; i >= currMsgIdx; i-- {
-            if len(msgs[i].Images) > 0 {
-                data, aspectRatioID, err := imageproc.Preprocess(msgs[i].Images[0])
+    for cnt, msg := range msgs[currMsgIdx:] {
+        prefix := ""
+        imgPrompt := ""
+        prompt := msg.Content
+
+        for _, i := range msg.Images {
+            var imgData llm.ImageData
+
+            if isMllama {
+                data, aspectRatioID, err := imageproc.Preprocess(i)
                 if err != nil {
                     return "", nil, err
                 }
@@ -90,37 +103,30 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
                     return "", nil, err
                 }

-                imgData := runners.ImageData{
+                imgData = llm.ImageData{
+                    ID:            len(images),
                     Data:          buf.Bytes(),
                     AspectRatioID: aspectRatioID,
                 }
-                msgs[i].Content = strings.TrimSpace("<|image|>" + msgs[i].Content)
-                images = append(images, imgData)
-                break
-            }
-        }
-    } else {
-        for cnt, msg := range msgs[currMsgIdx:] {
-            prefix := ""
-            prompt := msg.Content
-            for _, i := range msg.Images {
-                imgData := runners.ImageData{
+                imgPrompt = "<|image|>"
+            } else {
+                imgData = llm.ImageData{
                     ID:   len(images),
                     Data: i,
                 }
-                imgTag := fmt.Sprintf("[img-%d]", imgData.ID)
-                if !strings.Contains(prompt, "[img]") {
-                    prefix += imgTag
-                } else {
-                    prompt = strings.Replace(prompt, "[img]", imgTag, 1)
-                }
-
-                images = append(images, imgData)
+                imgPrompt = " "
             }
-            msgs[currMsgIdx+cnt].Content = strings.TrimSpace(prefix + " " + prompt)
+
+            imgTag := fmt.Sprintf("[img-%d]", imgData.ID)
+            if !strings.Contains(prompt, "[img]") {
+                prefix += imgTag
+            } else {
+                prompt = strings.Replace(prompt, "[img]", imgTag, 1)
+            }
+
+            images = append(images, imgData)
         }
+        msgs[currMsgIdx+cnt].Content = strings.TrimSpace(prefix + imgPrompt + prompt)
     }

     // truncate any messages that do not fit into the context window

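The rewritten chatPrompt loop above tags every image with an [img-N] reference, replacing a literal [img] placeholder in the message when one is present and otherwise prefixing the tag, and for mllama it also inserts the <|image|> marker. The following self-contained sketch mirrors only that string-rewriting step under those assumptions; rewriteContent and its parameters are hypothetical, and the real code also accumulates llm.ImageData entries alongside the tags:

// Illustrative only: the [img-N] tagging step of the new chatPrompt loop.
package main

import (
	"fmt"
	"strings"
)

// rewriteContent tags each of a message's images with [img-N]. For mllama the
// per-image marker is "<|image|>"; for CLIP-style models it is just a space.
func rewriteContent(content string, numImages, startID int, isMllama bool) string {
	prefix := ""
	imgPrompt := ""
	prompt := content
	for n := 0; n < numImages; n++ {
		if isMllama {
			imgPrompt = "<|image|>"
		} else {
			imgPrompt = " "
		}
		imgTag := fmt.Sprintf("[img-%d]", startID+n)
		if !strings.Contains(prompt, "[img]") {
			prefix += imgTag
		} else {
			prompt = strings.Replace(prompt, "[img]", imgTag, 1)
		}
	}
	return strings.TrimSpace(prefix + imgPrompt + prompt)
}

func main() {
	// Matches the updated test expectation for a single mllama image.
	fmt.Println(rewriteContent("How many hotdogs are in this image?", 1, 0, true))
	// Output: [img-0]<|image|>How many hotdogs are in this image?

	// A [img] placeholder is replaced in place for non-mllama models.
	fmt.Println(rewriteContent("Compare [img] with the reference.", 1, 0, false))
	// Output: Compare [img-0] with the reference.
}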
@@ -249,7 +249,7 @@ func TestChatPrompt(t *testing.T) {
                 {Role: "user", Content: "How many hotdogs are in this image?", Images: []api.ImageData{imgBuf}},
             },
             expect: expect{
-                prompt:        "<|image|>How many hotdogs are in this image? ",
+                prompt:        "[img-0]<|image|>How many hotdogs are in this image? ",
                 images:        [][]byte{imgBuf},
                 aspectRatioID: 1,
             },
@@ -264,7 +264,7 @@ func TestChatPrompt(t *testing.T) {
                 {Role: "user", Content: "A test. And a thumping good one at that, I'd wager.", Images: []api.ImageData{imgBuf}},
             },
             expect: expect{
-                prompt:        "You're a test, Harry! I-I'm a what? <|image|>A test. And a thumping good one at that, I'd wager. ",
+                prompt:        "You're a test, Harry! I-I'm a what? [img-0]<|image|>A test. And a thumping good one at that, I'd wager. ",
                 images:        [][]byte{imgBuf},
                 aspectRatioID: 1,
             },
@@ -279,8 +279,8 @@ func TestChatPrompt(t *testing.T) {
                 {Role: "user", Content: "A test. And a thumping good one at that, I'd wager.", Images: []api.ImageData{imgBuf2}},
             },
             expect: expect{
-                prompt:        "You're a test, Harry! I-I'm a what? <|image|>A test. And a thumping good one at that, I'd wager. ",
-                images:        [][]byte{imgBuf2},
+                prompt:        "[img-0]<|image|>You're a test, Harry! I-I'm a what? [img-1]<|image|>A test. And a thumping good one at that, I'd wager. ",
+                images:        [][]byte{imgBuf, imgBuf2},
                 aspectRatioID: 1,
             },
         },
@@ -294,7 +294,7 @@ func TestChatPrompt(t *testing.T) {
                 {Role: "user", Content: "Which ones have mustard?"},
             },
             expect: expect{
-                prompt:        "<|image|>How many hotdogs are in this image? There are four hotdogs. Which ones have mustard? ",
+                prompt:        "[img-0]<|image|>How many hotdogs are in this image? There are four hotdogs. Which ones have mustard? ",
                 images:        [][]byte{imgBuf},
                 aspectRatioID: 1,
             },

@@ -4,6 +4,7 @@ import (
     "bytes"
     "cmp"
     "context"
+    "encoding/binary"
     "encoding/json"
     "errors"
     "fmt"
@@ -29,10 +30,11 @@ import (
     "github.com/ollama/ollama/build"
     "github.com/ollama/ollama/discover"
     "github.com/ollama/ollama/envconfig"
-    "github.com/ollama/ollama/fileutils"
+    "github.com/ollama/ollama/llm"
     "github.com/ollama/ollama/openai"
     "github.com/ollama/ollama/parser"
     "github.com/ollama/ollama/runners"
+    "github.com/ollama/ollama/server/imageproc"
     "github.com/ollama/ollama/template"
     "github.com/ollama/ollama/types/errtypes"
     "github.com/ollama/ollama/types/model"
@@ -78,7 +80,7 @@ func modelOptions(model *Model, requestOpts map[string]interface{}) (api.Options

 // scheduleRunner schedules a runner after validating inputs such as capabilities and model options.
 // It returns the allocated runner, model instance, and consolidated options if successful and error otherwise.
-func (s *Server) scheduleRunner(ctx context.Context, name string, caps []Capability, requestOpts map[string]any, keepAlive *api.Duration) (runners.LLMServer, *Model, *api.Options, error) {
+func (s *Server) scheduleRunner(ctx context.Context, name string, caps []Capability, requestOpts map[string]any, keepAlive *api.Duration) (llm.LlamaServer, *Model, *api.Options, error) {
     if name == "" {
         return nil, nil, nil, fmt.Errorf("model %w", errRequired)
     }
@@ -187,9 +189,26 @@ func (s *Server) GenerateHandler(c *gin.Context) {
         return
     }

-    images := make([]runners.ImageData, len(req.Images))
+    images := make([]llm.ImageData, len(req.Images))
     for i := range req.Images {
-        images[i] = runners.ImageData{ID: i, Data: req.Images[i]}
+        if isMllama {
+            data, aspectRatioID, err := imageproc.Preprocess(req.Images[i])
+            if err != nil {
+                c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "error processing image"})
+                return
+            }
+
+            buf := new(bytes.Buffer)
+            err = binary.Write(buf, binary.LittleEndian, data)
+            if err != nil {
+                c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "error processing image"})
+                return
+            }
+
+            images[i] = llm.ImageData{ID: i, Data: buf.Bytes(), AspectRatioID: aspectRatioID}
+        } else {
+            images[i] = llm.ImageData{ID: i, Data: req.Images[i]}
+        }
     }

     prompt := req.Prompt
@@ -220,11 +239,11 @@ func (s *Server) GenerateHandler(c *gin.Context) {
         }

         for _, i := range images {
+            imgPrompt := ""
             if isMllama {
-                msgs = append(msgs, api.Message{Role: "user", Content: "<|image|>"})
-            } else {
-                msgs = append(msgs, api.Message{Role: "user", Content: fmt.Sprintf("[img-%d]", i.ID)})
+                imgPrompt = "<|image|>"
             }
+            msgs = append(msgs, api.Message{Role: "user", Content: fmt.Sprintf("[img-%d]"+imgPrompt, i.ID)})
         }

         values.Messages = append(msgs, api.Message{Role: "user", Content: req.Prompt})
@@ -248,19 +267,19 @@ func (s *Server) GenerateHandler(c *gin.Context) {
         prompt = b.String()
     }

-    slog.Debug("generate request", "prompt", prompt, "images", images)
+    slog.Debug("generate request", "images", len(images), "prompt", prompt)

     ch := make(chan any)
     go func() {
         // TODO (jmorganca): avoid building the response twice both here and below
         var sb strings.Builder
         defer close(ch)
-        if err := r.Completion(c.Request.Context(), runners.CompletionRequest{
+        if err := r.Completion(c.Request.Context(), llm.CompletionRequest{
             Prompt:  prompt,
             Images:  images,
             Format:  req.Format,
             Options: opts,
-        }, func(cr runners.CompletionResponse) {
+        }, func(cr llm.CompletionResponse) {
             res := api.GenerateResponse{
                 Model:     req.Model,
                 CreatedAt: time.Now().UTC(),
@@ -603,7 +622,7 @@ func (s *Server) PushHandler(c *gin.Context) {
 }

 func checkNameExists(name model.Name) error {
-    names, err := Manifests()
+    names, err := Manifests(true)
     if err != nil {
         return err
     }
@@ -639,7 +658,7 @@ func (s *Server) CreateHandler(c *gin.Context) {
     }

     if r.Path == "" && r.Modelfile == "" {
-        c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "path or fileutils are required"})
+        c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "path or modelfile are required"})
         return
     }

@@ -647,7 +666,7 @@ func (s *Server) CreateHandler(c *gin.Context) {
     if r.Path != "" && r.Modelfile == "" {
         f, err := os.Open(r.Path)
         if err != nil {
-            c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("error reading fileutils: %s", err)})
+            c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("error reading modelfile: %s", err)})
             return
         }
         defer f.Close()
@@ -851,12 +870,12 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
     return resp, nil
 }

-func getKVData(digest string, verbose bool) (fileutils.KV, error) {
+func getKVData(digest string, verbose bool) (llm.KV, error) {
     maxArraySize := 0
     if verbose {
         maxArraySize = -1
     }
-    kvData, err := fileutils.LoadModel(digest, maxArraySize)
+    kvData, err := llm.LoadModel(digest, maxArraySize)
     if err != nil {
         return nil, err
     }
@@ -875,7 +894,7 @@ func getKVData(digest string, verbose bool) (fileutils.KV, error) {
 }

 func (s *Server) ListHandler(c *gin.Context) {
-    ms, err := Manifests()
+    ms, err := Manifests(true)
     if err != nil {
         c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
         return
@@ -1192,18 +1211,22 @@ func Serve(ln net.Listener) error {
     }

     if !envconfig.NoPrune() {
-        // clean up unused layers and manifests
-        if err := PruneLayers(); err != nil {
-            return err
-        }
+        if _, err := Manifests(false); err != nil {
+            slog.Warn("corrupt manifests detected, skipping prune operation. Re-pull or delete to clear", "error", err)
+        } else {
+            // clean up unused layers and manifests
+            if err := PruneLayers(); err != nil {
+                return err
+            }

         manifestsPath, err := GetManifestPath()
         if err != nil {
             return err
         }

         if err := PruneDirectory(manifestsPath); err != nil {
             return err
+            }
         }
     }

@@ -1436,12 +1459,12 @@ func (s *Server) ChatHandler(c *gin.Context) {
     ch := make(chan any)
     go func() {
         defer close(ch)
-        if err := r.Completion(c.Request.Context(), runners.CompletionRequest{
+        if err := r.Completion(c.Request.Context(), llm.CompletionRequest{
             Prompt:  prompt,
             Images:  images,
             Format:  req.Format,
             Options: opts,
-        }, func(r runners.CompletionResponse) {
+        }, func(r llm.CompletionResponse) {
             res := api.ChatResponse{
                 Model:     req.Model,
                 CreatedAt: time.Now().UTC(),

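For mllama requests, the GenerateHandler change above preprocesses each image, serializes the resulting float data little-endian, and records the aspect-ratio ID on the image payload handed to the runner. A standalone sketch of that packing step is below; preprocess is a hypothetical stand-in for imageproc.Preprocess, and imageData mirrors only the fields visible in the diff:

// Illustrative only: the mllama image-packing step under stated assumptions.
package main

import (
	"bytes"
	"encoding/binary"
	"fmt"
)

type imageData struct {
	ID            int
	Data          []byte
	AspectRatioID int
}

// preprocess is a placeholder: real preprocessing resizes/tiles the image and
// reports which aspect-ratio bucket was used.
func preprocess(raw []byte) ([]float32, int, error) {
	return []float32{0.1, 0.2, 0.3}, 1, nil
}

func packImage(id int, raw []byte, isMllama bool) (imageData, error) {
	if !isMllama {
		// Non-mllama models receive the raw encoded image bytes unchanged.
		return imageData{ID: id, Data: raw}, nil
	}
	data, aspectRatioID, err := preprocess(raw)
	if err != nil {
		return imageData{}, err
	}
	// The preprocessed float tensor is serialized little-endian before being
	// handed to the runner, matching the binary.Write call in the diff.
	buf := new(bytes.Buffer)
	if err := binary.Write(buf, binary.LittleEndian, data); err != nil {
		return imageData{}, err
	}
	return imageData{ID: id, Data: buf.Bytes(), AspectRatioID: aspectRatioID}, nil
}

func main() {
	img, err := packImage(0, []byte{0xff, 0xd8}, true)
	if err != nil {
		panic(err)
	}
	fmt.Println(img.ID, len(img.Data), img.AspectRatioID) // 0 12 1
}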
@@ -16,12 +16,12 @@ import (
     "github.com/gin-gonic/gin"

     "github.com/ollama/ollama/api"
-    "github.com/ollama/ollama/fileutils"
+    "github.com/ollama/ollama/llm"
 )

 var stream bool = false

-func createBinFile(t *testing.T, kv map[string]any, ti []fileutils.Tensor) string {
+func createBinFile(t *testing.T, kv map[string]any, ti []llm.Tensor) string {
     t.Helper()

     f, err := os.CreateTemp(t.TempDir(), "")
@@ -30,7 +30,7 @@ func createBinFile(t *testing.T, kv map[string]any, ti []fileutils.Tensor) strin
     }
     defer f.Close()

-    if err := fileutils.WriteGGUF(f, kv, ti); err != nil {
+    if err := llm.WriteGGUF(f, kv, ti); err != nil {
         t.Fatal(err)
     }

@@ -581,7 +581,7 @@ func TestCreateDetectTemplate(t *testing.T) {
     t.Run("matched", func(t *testing.T) {
         w := createRequest(t, s.CreateHandler, api.CreateRequest{
             Name: "test",
-            Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, fileutils.KV{
+            Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, llm.KV{
                 "tokenizer.chat_template": "{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
             }, nil)),
             Stream: &stream,

@@ -16,19 +16,18 @@ import (

     "github.com/ollama/ollama/api"
     "github.com/ollama/ollama/discover"
-    "github.com/ollama/ollama/fileutils"
-    "github.com/ollama/ollama/runners"
+    "github.com/ollama/ollama/llm"
 )

 type mockRunner struct {
-    runners.LLMServer
+    llm.LlamaServer

     // CompletionRequest is only valid until the next call to Completion
-    runners.CompletionRequest
-    runners.CompletionResponse
+    llm.CompletionRequest
+    llm.CompletionResponse
 }

-func (m *mockRunner) Completion(_ context.Context, r runners.CompletionRequest, fn func(r runners.CompletionResponse)) error {
+func (m *mockRunner) Completion(_ context.Context, r llm.CompletionRequest, fn func(r llm.CompletionResponse)) error {
     m.CompletionRequest = r
     fn(m.CompletionResponse)
     return nil
@@ -42,8 +41,8 @@ func (mockRunner) Tokenize(_ context.Context, s string) (tokens []int, err error
     return
 }

-func newMockServer(mock *mockRunner) func(discover.GpuInfoList, string, *fileutils.GGML, []string, []string, api.Options, int) (runners.LLMServer, error) {
-    return func(gpus discover.GpuInfoList, model string, ggml *fileutils.GGML, projectors, system []string, opts api.Options, numParallel int) (runners.LLMServer, error) {
+func newMockServer(mock *mockRunner) func(discover.GpuInfoList, string, *llm.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) {
+    return func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, projectors, system []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
         return mock, nil
     }
 }
@@ -52,7 +51,7 @@ func TestGenerateChat(t *testing.T) {
     gin.SetMode(gin.TestMode)

     mock := mockRunner{
-        CompletionResponse: runners.CompletionResponse{
+        CompletionResponse: llm.CompletionResponse{
             Done:            true,
             DoneReason:      "stop",
             PromptEvalCount: 1,
@@ -73,7 +72,7 @@ func TestGenerateChat(t *testing.T) {
         getGpuFn:     discover.GetGPUInfo,
         getCpuFn:     discover.GetCPUInfo,
         reschedDelay: 250 * time.Millisecond,
-        loadFn: func(req *LlmRequest, ggml *fileutils.GGML, gpus discover.GpuInfoList, numParallel int) {
+        loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) {
             // add small delay to simulate loading
             time.Sleep(time.Millisecond)
             req.successCh <- &runnerRef{
@@ -92,7 +91,7 @@ func TestGenerateChat(t *testing.T) {
 {{- if .System }}System: {{ .System }} {{ end }}
 {{- if .Prompt }}User: {{ .Prompt }} {{ end }}
 {{- if .Response }}Assistant: {{ .Response }} {{ end }}"""
-`, createBinFile(t, fileutils.KV{
+`, createBinFile(t, llm.KV{
             "general.architecture": "llama",
             "llama.block_count":    uint32(1),
             "llama.context_length": uint32(8192),
@@ -102,7 +101,7 @@ func TestGenerateChat(t *testing.T) {
             "tokenizer.ggml.tokens":     []string{""},
             "tokenizer.ggml.scores":     []float32{0},
             "tokenizer.ggml.token_type": []int32{0},
-        }, []fileutils.Tensor{
+        }, []llm.Tensor{
             {Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
             {Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
             {Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
@@ -147,10 +146,10 @@ func TestGenerateChat(t *testing.T) {
     t.Run("missing capabilities chat", func(t *testing.T) {
         w := createRequest(t, s.CreateHandler, api.CreateRequest{
             Model: "bert",
-            Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, fileutils.KV{
+            Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, llm.KV{
                 "general.architecture": "bert",
                 "bert.pooling_type":    uint32(0),
-            }, []fileutils.Tensor{})),
+            }, []llm.Tensor{})),
             Stream: &stream,
         })

@@ -350,7 +349,7 @@ func TestGenerate(t *testing.T) {
     gin.SetMode(gin.TestMode)

     mock := mockRunner{
-        CompletionResponse: runners.CompletionResponse{
+        CompletionResponse: llm.CompletionResponse{
             Done:            true,
             DoneReason:      "stop",
             PromptEvalCount: 1,
@@ -371,7 +370,7 @@ func TestGenerate(t *testing.T) {
         getGpuFn:     discover.GetGPUInfo,
         getCpuFn:     discover.GetCPUInfo,
         reschedDelay: 250 * time.Millisecond,
-        loadFn: func(req *LlmRequest, ggml *fileutils.GGML, gpus discover.GpuInfoList, numParallel int) {
+        loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) {
             // add small delay to simulate loading
             time.Sleep(time.Millisecond)
             req.successCh <- &runnerRef{
@@ -390,7 +389,7 @@ func TestGenerate(t *testing.T) {
 {{- if .System }}System: {{ .System }} {{ end }}
 {{- if .Prompt }}User: {{ .Prompt }} {{ end }}
 {{- if .Response }}Assistant: {{ .Response }} {{ end }}"""
-`, createBinFile(t, fileutils.KV{
+`, createBinFile(t, llm.KV{
             "general.architecture": "llama",
             "llama.block_count":    uint32(1),
             "llama.context_length": uint32(8192),
@@ -400,7 +399,7 @@ func TestGenerate(t *testing.T) {
             "tokenizer.ggml.tokens":     []string{""},
             "tokenizer.ggml.scores":     []float32{0},
             "tokenizer.ggml.token_type": []int32{0},
-        }, []fileutils.Tensor{
+        }, []llm.Tensor{
             {Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
             {Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
             {Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
@@ -445,10 +444,10 @@ func TestGenerate(t *testing.T) {
     t.Run("missing capabilities generate", func(t *testing.T) {
         w := createRequest(t, s.CreateHandler, api.CreateRequest{
             Model: "bert",
-            Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, fileutils.KV{
+            Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, llm.KV{
                 "general.architecture": "bert",
                 "bert.pooling_type":    uint32(0),
-            }, []fileutils.Tensor{})),
+            }, []llm.Tensor{})),
             Stream: &stream,
         })

@@ -16,7 +16,7 @@ import (
     "testing"

     "github.com/ollama/ollama/api"
-    "github.com/ollama/ollama/fileutils"
+    "github.com/ollama/ollama/llm"
     "github.com/ollama/ollama/openai"
     "github.com/ollama/ollama/parser"
     "github.com/ollama/ollama/types/model"
@@ -83,14 +83,14 @@ func Test_Routes(t *testing.T) {
     fname := createTestFile(t, "ollama-model")

     r := strings.NewReader(fmt.Sprintf("FROM %s\nPARAMETER seed 42\nPARAMETER top_p 0.9\nPARAMETER stop foo\nPARAMETER stop bar", fname))
-    fileutils, err := parser.ParseFile(r)
+    modelfile, err := parser.ParseFile(r)
     if err != nil {
         t.Fatalf("failed to parse file: %v", err)
     }
     fn := func(resp api.ProgressResponse) {
         t.Logf("Status: %s", resp.Status)
     }
-    err = CreateModel(context.TODO(), model.ParseName(name), "", "", fileutils, fn)
+    err = CreateModel(context.TODO(), model.ParseName(name), "", "", modelfile, fn)
     if err != nil {
         t.Fatalf("failed to create model: %v", err)
     }
@@ -561,8 +561,8 @@ func TestShow(t *testing.T) {
         Name: "show-model",
         Modelfile: fmt.Sprintf(
             "FROM %s\nFROM %s",
-            createBinFile(t, fileutils.KV{"general.architecture": "test"}, nil),
-            createBinFile(t, fileutils.KV{"general.type": "projector", "general.architecture": "clip"}, nil),
+            createBinFile(t, llm.KV{"general.architecture": "test"}, nil),
+            createBinFile(t, llm.KV{"general.type": "projector", "general.architecture": "clip"}, nil),
         ),
     })