diff --git a/.gitattributes b/.gitattributes index f7192096..f1c8bcb4 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,2 +1,3 @@ llm/ext_server/* linguist-vendored -* text eol=lf +* text=auto +*.go text eol=lf diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index f0c6db5d..9c1e3e13 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -31,7 +31,7 @@ jobs: security set-keychain-settings -lut 3600 build.keychain - uses: actions/setup-go@v5 with: - go-version: "stable" + go-version-file: go.mod cache: true - name: Build Darwin env: @@ -87,7 +87,7 @@ jobs: write-host "plugin installed" - uses: actions/setup-go@v5 with: - go-version: "stable" + go-version-file: go.mod cache: true - run: go get ./... - run: | @@ -141,7 +141,7 @@ jobs: write-host "plugin installed" - uses: actions/setup-go@v5 with: - go-version: "stable" + go-version-file: go.mod cache: true - name: 'Install ROCm' run: | @@ -187,6 +187,13 @@ jobs: generate-windows-cuda: environment: release runs-on: windows + strategy: + matrix: + cuda: + - version: "11" + url: 'https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe' + - version: "12" + url: 'https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_551.61_windows.exe' env: KEY_CONTAINER: ${{ vars.KEY_CONTAINER }} steps: @@ -218,13 +225,13 @@ jobs: write-host "plugin installed" - uses: actions/setup-go@v5 with: - go-version: "stable" + go-version-file: go.mod cache: true - - name: 'Install CUDA' + - name: 'Install CUDA ${{ matrix.cuda.version }}' run: | $ErrorActionPreference = "Stop" write-host "downloading CUDA Installer" - Invoke-WebRequest -Uri "https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe" -OutFile "${env:RUNNER_TEMP}\cuda-install.exe" + Invoke-WebRequest -Uri "${{ matrix.cuda.url }}" -OutFile "${env:RUNNER_TEMP}\cuda-install.exe" write-host "Installing CUDA" Start-Process "${env:RUNNER_TEMP}\cuda-install.exe" -ArgumentList '-s' -NoNewWindow -Wait write-host "Completed CUDA" @@ -256,15 +263,16 @@ jobs: cp "${NVIDIA_DIR}\cublasLt64_*.dll" "dist\deps\" - uses: actions/upload-artifact@v4 with: - name: generate-windows-cuda + name: generate-windows-cuda-${{ matrix.cuda.version }} path: | llm/build/**/bin/* dist/windows-amd64/** - uses: actions/upload-artifact@v4 with: - name: windows-cuda-deps + name: windows-cuda-deps-${{ matrix.cuda.version }} path: dist/deps/* + # Import the prior generation steps and build the final windows assets build-windows: environment: release @@ -306,7 +314,7 @@ jobs: write-host "plugin installed" - uses: actions/setup-go@v5 with: - go-version: "stable" + go-version-file: go.mod cache: true - run: go get - uses: actions/download-artifact@v4 @@ -314,10 +322,16 @@ jobs: name: generate-windows-cpu - uses: actions/download-artifact@v4 with: - name: generate-windows-cuda + name: generate-windows-cuda-11 - uses: actions/download-artifact@v4 with: - name: windows-cuda-deps + name: generate-windows-cuda-12 + - uses: actions/download-artifact@v4 + with: + name: windows-cuda-deps-11 + - uses: actions/download-artifact@v4 + with: + name: windows-cuda-deps-12 - uses: actions/download-artifact@v4 with: name: windows-rocm-deps @@ -363,7 +377,6 @@ jobs: - run: | ./scripts/build_linux.sh ./scripts/build_docker.sh - mv dist/deps/* dist/ - uses: actions/upload-artifact@v4 with: name: dist-linux-amd64 @@ -459,7 +472,10 @@ jobs: merge-multiple: true - run: | ls -lh dist/ - 
(cd dist; sha256sum * > sha256sum.txt) + (cd dist; find . -type f | xargs sha256sum > ../sha256sum.txt) + mv sha256sum.txt dist/ + mv dist/linux-???64 . + mv dist/linux-amd64-rocm . cat dist/sha256sum.txt - name: Create or update Release run: | diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index a57d45fd..3d58fa3e 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -63,7 +63,7 @@ jobs: - uses: actions/checkout@v4 - uses: actions/setup-go@v5 with: - go-version: "stable" + go-version-file: go.mod cache: true - run: go get ./... - run: | @@ -163,7 +163,7 @@ jobs: - uses: actions/checkout@v4 - uses: actions/setup-go@v5 with: - go-version: "stable" + go-version-file: go.mod cache: true - name: 'Install ROCm' run: | @@ -200,7 +200,7 @@ jobs: - uses: actions/checkout@v4 - uses: actions/setup-go@v5 with: - go-version: "stable" + go-version-file: go.mod cache: true - name: 'Install CUDA' run: | @@ -255,7 +255,7 @@ jobs: submodules: recursive - uses: actions/setup-go@v5 with: - go-version: "stable" + go-version-file: go.mod cache: false - run: | case ${{ matrix.arch }} in @@ -297,7 +297,7 @@ jobs: submodules: recursive - uses: actions/setup-go@v5 with: - go-version: "stable" + go-version-file: go.mod cache: true - run: | case ${{ matrix.arch }} in diff --git a/.golangci.yaml b/.golangci.yaml index c2c8b52b..c9c9f620 100644 --- a/.golangci.yaml +++ b/.golangci.yaml @@ -24,7 +24,6 @@ linters: - nosprintfhostport - staticcheck - tenv - - testifylint - unconvert - unused - usestdlibvars diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000..7f12a0fc --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,37 @@ +# Contributing to Ollama + +Thank you for your interest in contributing to Ollama! Here are a few guidelines to help get you started. + +## Set up + +See the [development documentation](./docs/development.md) for instructions on how to build and run Ollama locally. + +## Pull requests + +### Ideal issues + +* [Bugs](https://github.com/ollama/ollama/issues?q=is%3Aissue+is%3Aopen+label%3Abug): issues where Ollama stops working or where it results in an unexpected error. +* [Performance](https://github.com/ollama/ollama/issues?q=is%3Aissue+is%3Aopen+label%3Aperformance): issues to make Ollama faster at model inference, downloading or uploading. +* [Security](https://github.com/ollama/ollama/blob/main/SECURITY.md): issues that could lead to a security vulnerability. As mentioned in [SECURITY.md](https://github.com/ollama/ollama/blob/main/SECURITY.md), please do not disclose security vulnerabilities publicly. + +### Issues that are harder to review + +* New features: new features (e.g. API fields, environment variables) add surface area to Ollama and make it harder to maintain in the long run as they cannot be removed without potentially breaking users in the future. +* Refactoring: large code improvements are important, but can be harder or take longer to review and merge. +* Documentation: small updates to fill in or correct missing documentation are helpful; however, large documentation additions can be hard to maintain over time. + +### Issues that may not be accepted + +* Changes that break backwards compatibility in Ollama's API (including the OpenAI-compatible API) +* Changes that add significant friction to the user experience +* Changes that create a large future maintenance burden for maintainers and contributors + +### Best practices + +* Commit messages: please leave both a title and a description in your commit messages.
The title should be a short summary of the changes, with a leading word that explains the section of the code being changed (e.g. `api: fix parsing of prompt field`). In the description, add 2-3 short sentences that explain more about the change and its impact. +* Tests: please add test coverage to changes where possible. +* Minimize dependencies: avoid adding new dependencies unless absolutely necessary. + +## Need help? + +If you need help with anything, feel free to reach out to us on our [Discord server](https://discord.gg/ollama). diff --git a/Dockerfile b/Dockerfile index c8efdd8a..c46477b4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,9 @@ ARG GOLANG_VERSION=1.22.5 ARG CMAKE_VERSION=3.22.1 -# this CUDA_VERSION corresponds with the one specified in docs/gpu.md -ARG CUDA_VERSION=11.3.1 +ARG CUDA_VERSION_11=11.3.1 +ARG CUDA_V11_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86" +ARG CUDA_VERSION_12=12.4.0 +ARG CUDA_V12_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a" ARG ROCM_VERSION=6.1.2 # Copy the minimal context we need to run the generate scripts @@ -10,7 +12,7 @@ COPY .git .git COPY .gitmodules .gitmodules COPY llm llm -FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION-devel-centos7 AS cuda-build-amd64 +FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION_11-devel-centos7 AS cuda-11-build-amd64 ARG CMAKE_VERSION COPY ./scripts/rh_linux_deps.sh / RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh @@ -18,9 +20,34 @@ ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH COPY --from=llm-code / /go/src/github.com/ollama/ollama/ WORKDIR /go/src/github.com/ollama/ollama/llm/generate ARG CGO_CFLAGS -RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh +ARG CUDA_V11_ARCHITECTURES +ENV GOARCH amd64 +RUN --mount=type=cache,target=/root/.ccache \ + OLLAMA_SKIP_STATIC_GENERATE=1 \ + OLLAMA_SKIP_CPU_GENERATE=1 \ + CMAKE_CUDA_ARCHITECTURES="${CUDA_V11_ARCHITECTURES}" \ + CUDA_VARIANT="_v11" \ + bash gen_linux.sh -FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION-devel-rockylinux8 AS cuda-build-arm64 +FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION_12-devel-centos7 AS cuda-12-build-amd64 +ARG CMAKE_VERSION +COPY ./scripts/rh_linux_deps.sh / +RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh +ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH +COPY --from=llm-code / /go/src/github.com/ollama/ollama/ +WORKDIR /go/src/github.com/ollama/ollama/llm/generate +ARG CGO_CFLAGS +ARG CUDA_V12_ARCHITECTURES +ENV GOARCH amd64 +RUN --mount=type=cache,target=/root/.ccache \ + OLLAMA_SKIP_STATIC_GENERATE=1 \ + OLLAMA_SKIP_CPU_GENERATE=1 \ + CMAKE_CUDA_ARCHITECTURES="${CUDA_V12_ARCHITECTURES}" \ + CUDA_VARIANT="_v12" \ + OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on" \ + bash gen_linux.sh + +FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_11-devel-rockylinux8 AS cuda-11-build-server-arm64 ARG CMAKE_VERSION COPY ./scripts/rh_linux_deps.sh / RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh @@ -28,7 +55,32 @@ ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH COPY --from=llm-code / /go/src/github.com/ollama/ollama/ WORKDIR /go/src/github.com/ollama/ollama/llm/generate ARG CGO_CFLAGS -RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh +ARG CUDA_V11_ARCHITECTURES +ENV GOARCH arm64 +RUN OLLAMA_SKIP_STATIC_GENERATE=1 \ + OLLAMA_SKIP_CPU_GENERATE=1 \ + CMAKE_CUDA_ARCHITECTURES="${CUDA_V11_ARCHITECTURES}" \ + CUDA_VARIANT="_v11" \ + bash gen_linux.sh + +FROM --platform=linux/arm64
nvidia/cuda:$CUDA_VERSION_12-devel-rockylinux8 AS cuda-12-build-server-arm64 +ARG CMAKE_VERSION +COPY ./scripts/rh_linux_deps.sh / +RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh +ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH +COPY --from=llm-code / /go/src/github.com/ollama/ollama/ +WORKDIR /go/src/github.com/ollama/ollama/llm/generate +ARG CGO_CFLAGS +ARG CUDA_V12_ARCHITECTURES +ENV GOARCH arm64 +RUN --mount=type=cache,target=/root/.ccache \ + OLLAMA_SKIP_STATIC_GENERATE=1 \ + OLLAMA_SKIP_CPU_GENERATE=1 \ + CMAKE_CUDA_ARCHITECTURES="${CUDA_V12_ARCHITECTURES}" \ + CUDA_VARIANT="_v12" \ + OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on" \ + bash gen_linux.sh + FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS rocm-build-amd64 ARG CMAKE_VERSION @@ -40,15 +92,11 @@ COPY --from=llm-code / /go/src/github.com/ollama/ollama/ WORKDIR /go/src/github.com/ollama/ollama/llm/generate ARG CGO_CFLAGS ARG AMDGPU_TARGETS -RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh -RUN mkdir /tmp/scratch && \ - for dep in $(zcat /go/src/github.com/ollama/ollama/llm/build/linux/x86_64/rocm*/bin/deps.txt.gz) ; do \ - cp ${dep} /tmp/scratch/ || exit 1 ; \ - done && \ - (cd /opt/rocm/lib && tar cf - rocblas/library) | (cd /tmp/scratch/ && tar xf - ) && \ - mkdir -p /go/src/github.com/ollama/ollama/dist/deps/ && \ - (cd /tmp/scratch/ && tar czvf /go/src/github.com/ollama/ollama/dist/deps/ollama-linux-amd64-rocm.tgz . ) - +ENV GOARCH amd64 +RUN --mount=type=cache,target=/root/.ccache \ + OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh +RUN mkdir -p ../../dist/linux-amd64-rocm/lib/ollama && \ + (cd /opt/rocm/lib && tar cf - rocblas/library) | (cd ../../dist/linux-amd64-rocm/lib/ollama && tar xf - ) FROM --platform=linux/amd64 centos:7 AS cpu-builder-amd64 ARG CMAKE_VERSION @@ -59,16 +107,21 @@ ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH COPY --from=llm-code / /go/src/github.com/ollama/ollama/ ARG OLLAMA_CUSTOM_CPU_DEFS ARG CGO_CFLAGS +ENV GOARCH amd64 WORKDIR /go/src/github.com/ollama/ollama/llm/generate FROM --platform=linux/amd64 cpu-builder-amd64 AS static-build-amd64 -RUN OLLAMA_CPU_TARGET="static" sh gen_linux.sh +RUN --mount=type=cache,target=/root/.ccache \ + OLLAMA_CPU_TARGET="static" bash gen_linux.sh FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu-build-amd64 -RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh +RUN --mount=type=cache,target=/root/.ccache \ + OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" bash gen_linux.sh FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx-build-amd64 -RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx" sh gen_linux.sh +RUN --mount=type=cache,target=/root/.ccache \ + OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx" bash gen_linux.sh FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx2-build-amd64 -RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx2" sh gen_linux.sh +RUN --mount=type=cache,target=/root/.ccache \ + OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx2" bash gen_linux.sh FROM --platform=linux/arm64 rockylinux:8 AS cpu-builder-arm64 ARG CMAKE_VERSION @@ -79,12 +132,15 @@ ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH COPY --from=llm-code / /go/src/github.com/ollama/ollama/ ARG OLLAMA_CUSTOM_CPU_DEFS ARG CGO_CFLAGS +ENV GOARCH arm64 WORKDIR /go/src/github.com/ollama/ollama/llm/generate FROM --platform=linux/arm64 cpu-builder-arm64 AS static-build-arm64 -RUN OLLAMA_CPU_TARGET="static" sh 
gen_linux.sh +RUN --mount=type=cache,target=/root/.ccache \ + OLLAMA_CPU_TARGET="static" bash gen_linux.sh FROM --platform=linux/arm64 cpu-builder-arm64 AS cpu-build-arm64 -RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh +RUN --mount=type=cache,target=/root/.ccache \ + OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" bash gen_linux.sh # Intermediate stage used for ./scripts/build_linux.sh @@ -95,12 +151,16 @@ COPY . . COPY --from=static-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ -COPY --from=cuda-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/ +COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/ +COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/ COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ -COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/deps/ ./dist/deps/ ARG GOFLAGS ARG CGO_CFLAGS -RUN go build -trimpath . +RUN --mount=type=cache,target=/root/.ccache \ + go build -trimpath -o dist/linux-amd64/bin/ollama . # Intermediate stage used for ./scripts/build_linux.sh FROM --platform=linux/arm64 cpu-build-arm64 AS build-arm64 @@ -109,23 +169,36 @@ ARG GOLANG_VERSION WORKDIR /go/src/github.com/ollama/ollama COPY . . COPY --from=static-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ -COPY --from=cuda-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ +COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=cuda-12-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ +COPY --from=cuda-12-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ ARG GOFLAGS ARG CGO_CFLAGS -RUN go build -trimpath . +RUN --mount=type=cache,target=/root/.ccache \ + go build -trimpath -o dist/linux-arm64/bin/ollama . 
+ +# Strip out ROCm dependencies to keep the primary image lean +FROM --platform=linux/amd64 ubuntu:22.04 as amd64-libs-without-rocm +COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /scratch/ +RUN cd /scratch/ollama/ && rm -rf rocblas libamd* libdrm* libroc* libhip* libhsa* # Runtime stages FROM --platform=linux/amd64 ubuntu:22.04 as runtime-amd64 +COPY --from=amd64-libs-without-rocm /scratch/ /lib/ RUN apt-get update && apt-get install -y ca-certificates -COPY --from=build-amd64 /go/src/github.com/ollama/ollama/ollama /bin/ollama +COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/ + FROM --platform=linux/arm64 ubuntu:22.04 as runtime-arm64 +COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/ RUN apt-get update && apt-get install -y ca-certificates -COPY --from=build-arm64 /go/src/github.com/ollama/ollama/ollama /bin/ollama +COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/bin/ /bin/ # Radeon images are much larger so we keep it distinct from the CPU/CUDA image FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete as runtime-rocm RUN update-pciids -COPY --from=build-amd64 /go/src/github.com/ollama/ollama/ollama /bin/ollama +COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/ +RUN ln -s /opt/rocm/lib /lib/ollama EXPOSE 11434 ENV OLLAMA_HOST 0.0.0.0 diff --git a/README.md b/README.md index 7c606e1c..aae92e6c 100644 --- a/README.md +++ b/README.md @@ -325,6 +325,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [tlm](https://github.com/yusufcanb/tlm) - [podman-ollama](https://github.com/ericcurtin/podman-ollama) - [gollama](https://github.com/sammcj/gollama) +- [Ollama eBook Summary](https://github.com/cognitivetech/ollama-ebook-summary/) ### Database diff --git a/api/client.go b/api/client.go index bbdf8202..2528fb21 100644 --- a/api/client.go +++ b/api/client.go @@ -298,7 +298,7 @@ func (c *Client) List(ctx context.Context) (*ListResponse, error) { return &lr, nil } -// List running models. +// ListRunning lists running models. func (c *Client) ListRunning(ctx context.Context) (*ProcessResponse, error) { var lr ProcessResponse if err := c.do(ctx, http.MethodGet, "/api/ps", nil, &lr); err != nil { @@ -333,7 +333,7 @@ func (c *Client) Show(ctx context.Context, req *ShowRequest) (*ShowResponse, err return &resp, nil } -// Hearbeat checks if the server has started and is responsive; if yes, it +// Heartbeat checks if the server has started and is responsive; if yes, it // returns nil, otherwise an error. 
func (c *Client) Heartbeat(ctx context.Context) error { if err := c.do(ctx, http.MethodHead, "/", nil, nil); err != nil { diff --git a/api/types.go b/api/types.go index c2529652..2f5a9424 100644 --- a/api/types.go +++ b/api/types.go @@ -231,7 +231,6 @@ type Options struct { // Runner options which must be set when the model is loaded into memory type Runner struct { - UseNUMA bool `json:"numa,omitempty"` NumCtx int `json:"num_ctx,omitempty"` NumBatch int `json:"num_batch,omitempty"` NumGPU int `json:"num_gpu,omitempty"` @@ -505,7 +504,7 @@ func (opts *Options) FromMap(m map[string]interface{}) error { for key, val := range m { opt, ok := jsonOpts[key] if !ok { - slog.Warn("invalid option provided", "option", opt.Name) + slog.Warn("invalid option provided", "option", key) continue } @@ -615,7 +614,6 @@ func DefaultOptions() Options { F16KV: true, UseMLock: false, UseMMap: nil, - UseNUMA: false, }, } } diff --git a/app/ollama.iss b/app/ollama.iss index dc6178f7..bce0a337 100644 --- a/app/ollama.iss +++ b/app/ollama.iss @@ -87,20 +87,11 @@ DialogFontSize=12 [Files] Source: ".\app.exe"; DestDir: "{app}"; DestName: "{#MyAppExeName}" ; Flags: ignoreversion 64bit -Source: "..\ollama.exe"; DestDir: "{app}"; Flags: ignoreversion 64bit -Source: "..\dist\windows-{#ARCH}\ollama_runners\*"; DestDir: "{app}\ollama_runners"; Flags: ignoreversion 64bit recursesubdirs +Source: "..\ollama.exe"; DestDir: "{app}\bin"; Flags: ignoreversion 64bit +Source: "..\dist\windows-{#ARCH}\lib\ollama\runners\*"; DestDir: "{app}\lib\ollama\runners"; Flags: ignoreversion 64bit recursesubdirs Source: "..\dist\ollama_welcome.ps1"; DestDir: "{app}"; Flags: ignoreversion Source: ".\assets\app.ico"; DestDir: "{app}"; Flags: ignoreversion -#if DirExists("..\dist\windows-amd64\cuda") - Source: "..\dist\windows-amd64\cuda\*"; DestDir: "{app}\cuda\"; Flags: ignoreversion recursesubdirs -#endif -#if DirExists("..\dist\windows-amd64\oneapi") - Source: "..\dist\windows-amd64\oneapi\*"; DestDir: "{app}\oneapi\"; Flags: ignoreversion recursesubdirs -#endif -#if DirExists("..\dist\windows-amd64\rocm") - Source: "..\dist\windows-amd64\rocm\*"; DestDir: "{app}\rocm\"; Flags: ignoreversion recursesubdirs -#endif - +Source: "..\dist\windows-amd64\lib\ollama\*"; DestDir: "{app}\lib\ollama\"; Flags: ignoreversion recursesubdirs [Icons] Name: "{group}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilename: "{app}\app.ico" @@ -108,7 +99,7 @@ Name: "{userstartup}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilen Name: "{userprograms}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilename: "{app}\app.ico" [Run] -Filename: "{cmd}"; Parameters: "/C set PATH={app};%PATH% & ""{app}\{#MyAppExeName}"""; Flags: postinstall nowait runhidden +Filename: "{cmd}"; Parameters: "/C set PATH={app}\bin;%PATH% & ""{app}\{#MyAppExeName}"""; Flags: postinstall nowait runhidden [UninstallRun] ; Filename: "{cmd}"; Parameters: "/C ""taskkill /im ''{#MyAppExeName}'' /f /t"; Flags: runhidden @@ -143,8 +134,8 @@ SetupAppRunningError=Another Ollama installer is running.%n%nPlease cancel or fi [Registry] Root: HKCU; Subkey: "Environment"; \ - ValueType: expandsz; ValueName: "Path"; ValueData: "{olddata};{app}"; \ - Check: NeedsAddPath('{app}') + ValueType: expandsz; ValueName: "Path"; ValueData: "{olddata};{app}\bin"; \ + Check: NeedsAddPath('{app}\bin') [Code] diff --git a/app/tray/wintray/menus.go b/app/tray/wintray/menus.go index 9cb3b893..59624444 100644 --- a/app/tray/wintray/menus.go +++ b/app/tray/wintray/menus.go @@ -11,12 +11,12 @@ import 
( ) const ( - updatAvailableMenuID = 1 - updateMenuID = updatAvailableMenuID + 1 - separatorMenuID = updateMenuID + 1 - diagLogsMenuID = separatorMenuID + 1 - diagSeparatorMenuID = diagLogsMenuID + 1 - quitMenuID = diagSeparatorMenuID + 1 + updateAvailableMenuID = 1 + updateMenuID = updateAvailableMenuID + 1 + separatorMenuID = updateMenuID + 1 + diagLogsMenuID = separatorMenuID + 1 + diagSeparatorMenuID = diagLogsMenuID + 1 + quitMenuID = diagSeparatorMenuID + 1 ) func (t *winTray) initMenus() error { @@ -35,7 +35,7 @@ func (t *winTray) initMenus() error { func (t *winTray) UpdateAvailable(ver string) error { if !t.updateNotified { slog.Debug("updating menu and sending notification for new update") - if err := t.addOrUpdateMenuItem(updatAvailableMenuID, 0, updateAvailableMenuTitle, true); err != nil { + if err := t.addOrUpdateMenuItem(updateAvailableMenuID, 0, updateAvailableMenuTitle, true); err != nil { return fmt.Errorf("unable to create menu entries %w", err) } if err := t.addOrUpdateMenuItem(updateMenuID, 0, updateMenutTitle, false); err != nil { diff --git a/app/tray/wintray/tray.go b/app/tray/wintray/tray.go index ccd087a1..6f827893 100644 --- a/app/tray/wintray/tray.go +++ b/app/tray/wintray/tray.go @@ -11,6 +11,7 @@ import ( "path/filepath" "sort" "sync" + "syscall" "unsafe" "golang.org/x/sys/windows" @@ -433,7 +434,12 @@ func (t *winTray) setIcon(src string) error { t.muNID.Lock() defer t.muNID.Unlock() t.nid.Icon = h - t.nid.Flags |= NIF_ICON + t.nid.Flags |= NIF_ICON | NIF_TIP + if toolTipUTF16, err := syscall.UTF16FromString(commontray.ToolTip); err == nil { + copy(t.nid.Tip[:], toolTipUTF16) + } else { + return err + } t.nid.Size = uint32(unsafe.Sizeof(*t.nid)) return t.nid.modify() diff --git a/app/tray/wintray/w32api.go b/app/tray/wintray/w32api.go index a1e0381d..7c7c0ac8 100644 --- a/app/tray/wintray/w32api.go +++ b/app/tray/wintray/w32api.go @@ -61,6 +61,7 @@ const ( MIIM_SUBMENU = 0x00000004 MIM_APPLYTOSUBMENUS = 0x80000000 NIF_ICON = 0x00000002 + NIF_TIP = 0x00000004 NIF_INFO = 0x00000010 NIF_MESSAGE = 0x00000001 SW_HIDE = 0 diff --git a/cmd/cmd.go b/cmd/cmd.go index d47db65b..a8a02605 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -22,6 +22,7 @@ import ( "runtime" "slices" "strings" + "sync/atomic" "syscall" "time" @@ -78,6 +79,7 @@ func CreateHandler(cmd *cobra.Command, args []string) error { status := "transferring model data" spinner := progress.NewSpinner(status) p.Add(status, spinner) + defer p.Stop() for i := range modelfile.Commands { switch modelfile.Commands[i].Name { @@ -112,7 +114,7 @@ func CreateHandler(cmd *cobra.Command, args []string) error { path = tempfile } - digest, err := createBlob(cmd, client, path) + digest, err := createBlob(cmd, client, path, spinner) if err != nil { return err } @@ -221,6 +223,14 @@ func tempZipFiles(path string) (string, error) { } files = append(files, js...) + // bert models require a nested config.json + // TODO(mxyng): merge this with the glob above + js, err = glob(filepath.Join(path, "**/*.json"), "text/plain") + if err != nil { + return "", err + } + files = append(files, js...) 
+ if tks, _ := glob(filepath.Join(path, "tokenizer.model"), "application/octet-stream"); len(tks) > 0 { // add tokenizer.model if it exists, tokenizer.json is automatically picked up by the previous glob // tokenizer.model might be a unresolved git lfs reference; error if it is @@ -250,6 +260,11 @@ func tempZipFiles(path string) (string, error) { return "", err } + zfi.Name, err = filepath.Rel(path, file) + if err != nil { + return "", err + } + zf, err := zipfile.CreateHeader(zfi) if err != nil { return "", err @@ -263,13 +278,20 @@ func tempZipFiles(path string) (string, error) { return tempfile.Name(), nil } -func createBlob(cmd *cobra.Command, client *api.Client, path string) (string, error) { +func createBlob(cmd *cobra.Command, client *api.Client, path string, spinner *progress.Spinner) (string, error) { bin, err := os.Open(path) if err != nil { return "", err } defer bin.Close() + // Get file info to retrieve the size + fileInfo, err := bin.Stat() + if err != nil { + return "", err + } + fileSize := fileInfo.Size() + hash := sha256.New() if _, err := io.Copy(hash, bin); err != nil { return "", err @@ -279,13 +301,43 @@ func createBlob(cmd *cobra.Command, client *api.Client, path string) (string, er return "", err } + var pw progressWriter + status := "transferring model data 0%" + spinner.SetMessage(status) + + done := make(chan struct{}) + defer close(done) + + go func() { + ticker := time.NewTicker(60 * time.Millisecond) + defer ticker.Stop() + for { + select { + case <-ticker.C: + spinner.SetMessage(fmt.Sprintf("transferring model data %d%%", int(100*pw.n.Load()/fileSize))) + case <-done: + spinner.SetMessage("transferring model data 100%") + return + } + } + }() + digest := fmt.Sprintf("sha256:%x", hash.Sum(nil)) - if err = client.CreateBlob(cmd.Context(), digest, bin); err != nil { + if err = client.CreateBlob(cmd.Context(), digest, io.TeeReader(bin, &pw)); err != nil { return "", err } return digest, nil } +type progressWriter struct { + n atomic.Int64 +} + +func (w *progressWriter) Write(p []byte) (n int, err error) { + w.n.Add(int64(len(p))) + return len(p), nil +} + func RunHandler(cmd *cobra.Command, args []string) error { interactive := true @@ -1086,7 +1138,7 @@ func generate(cmd *cobra.Command, opts runOptions) error { return nil } -func RunServer(cmd *cobra.Command, _ []string) error { +func RunServer(_ *cobra.Command, _ []string) error { if err := initializeKeypair(); err != nil { return err } diff --git a/convert/convert.go b/convert/convert.go index b9461e4f..5a314cdd 100644 --- a/convert/convert.go +++ b/convert/convert.go @@ -7,6 +7,7 @@ import ( "io" "io/fs" "log/slog" + "strings" "github.com/ollama/ollama/llm" ) @@ -27,6 +28,10 @@ func (Parameters) KV(t *Tokenizer) llm.KV { "tokenizer.ggml.token_type": t.Vocabulary.Types, } + if len(t.Merges) > 0 { + kv["tokenizer.ggml.merges"] = t.Merges + } + if t.Template != "" { kv["tokenizer.chat_template"] = t.Template } @@ -54,14 +59,20 @@ type Converter interface { KV(*Tokenizer) llm.KV // Tensors maps input tensors to LLM tensors. Model specific modifications can be done here. Tensors([]Tensor) []llm.Tensor + // Replacements returns a list of string pairs to replace in tensor names. 
+ // See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details + Replacements() []string - // tensorName returns the LLM tensor name for a specific input name - tensorName(string) string // specialTokenTypes returns any special token types the model uses specialTokenTypes() []string + // writeFile writes the model to the provided io.WriteSeeker writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error } +type moreParser interface { + parseMore(fs.FS) error +} + // Convert writes an Ollama compatible model to the provided io.WriteSeeker based on configurations // and files it finds in the input path. // Supported input model formats include safetensors. @@ -89,6 +100,12 @@ func Convert(fsys fs.FS, ws io.WriteSeeker) error { conv = &mixtral{} case "GemmaForCausalLM": conv = &gemma{} + case "Gemma2ForCausalLM": + conv = &gemma2{} + case "Phi3ForCausalLM": + conv = &phi3{} + case "BertModel": + conv = &bert{} default: return errors.New("unsupported architecture") } @@ -97,6 +114,12 @@ func Convert(fsys fs.FS, ws io.WriteSeeker) error { return err } + if t, ok := conv.(moreParser); ok { + if err := t.parseMore(fsys); err != nil { + return err + } + } + t, err := parseTokenizer(fsys, conv.specialTokenTypes()) if err != nil { return err @@ -113,7 +136,7 @@ func Convert(fsys fs.FS, ws io.WriteSeeker) error { slog.Debug("vocabulary", "size", len(t.Vocabulary.Tokens)) } - ts, err := parseTensors(fsys) + ts, err := parseTensors(fsys, strings.NewReplacer(conv.Replacements()...)) if err != nil { return err } diff --git a/convert/convert_bert.go b/convert/convert_bert.go new file mode 100644 index 00000000..6e7d59fe --- /dev/null +++ b/convert/convert_bert.go @@ -0,0 +1,174 @@ +package convert + +import ( + "cmp" + "encoding/json" + "io/fs" + "path/filepath" + "slices" + "strings" + + "github.com/ollama/ollama/llm" +) + +type bert struct { + Parameters + NLayers uint32 `json:"n_layers"` + NumHiddenLayers uint32 `json:"num_hidden_layers"` + NLayer uint32 `json:"n_layer"` + MaxPositionEmbeddings uint32 `json:"max_position_embeddings"` + NCtx uint32 `json:"n_ctx"` + HiddenSize uint32 `json:"hidden_size"` + NEmbd uint32 `json:"n_embd"` + IntermediateSize uint32 `json:"intermediate_size"` + NInner uint32 `json:"n_inner"` + NumAttentionHeads uint32 `json:"num_attention_heads"` + NHead uint32 `json:"n_head"` + NumKeyValueHeads uint32 `json:"num_key_value_heads"` + LayerNormEPS float32 `json:"layer_norm_eps"` + LayerNormEpsilon float32 `json:"layer_norm_epsilon"` + NormEpsilon float32 `json:"norm_epsilon"` + + PoolingType uint32 +} + +var ( + _ Converter = (*bert)(nil) + _ moreParser = (*bert)(nil) +) + +func (p *bert) parseMore(fsys fs.FS) error { + bts, err := fs.ReadFile(fsys, "modules.json") + if err != nil { + return err + } + + var modules []struct { + Type string `json:"type"` + Path string `json:"path"` + } + + if err := json.Unmarshal(bts, &modules); err != nil { + return err + } + + var pooling string + for _, m := range modules { + if m.Type == "sentence_transformers.models.Pooling" { + pooling = m.Path + break + } + } + + if pooling != "" { + bts, err := fs.ReadFile(fsys, filepath.Join(pooling, "config.json")) + if err != nil { + return err + } + + var pc struct { + PoolingModeCLSToken bool `json:"pooling_mode_cls_token"` + PoolingModeMeanTokens bool `json:"pooling_mode_mean_tokens"` + } + + if err := json.Unmarshal(bts, &pc); err != nil { + return err + } + + if pc.PoolingModeMeanTokens { + p.PoolingType = 1 + } else if pc.PoolingModeCLSToken { + p.PoolingType = 2 + } + } + + return nil 
+} + +func (p *bert) KV(t *Tokenizer) llm.KV { + kv := p.Parameters.KV(t) + kv["general.architecture"] = "bert" + kv["bert.attention.causal"] = false + kv["bert.pooling_type"] = p.PoolingType + + kv["bert.block_count"] = cmp.Or(p.NLayers, p.NumHiddenLayers, p.NLayer) + + if contextLength := cmp.Or(p.MaxPositionEmbeddings, p.NCtx); contextLength > 0 { + kv["bert.context_length"] = contextLength + } + + if embeddingLength := cmp.Or(p.HiddenSize, p.NEmbd); embeddingLength > 0 { + kv["bert.embedding_length"] = cmp.Or(p.HiddenSize, p.NEmbd) + } + + if feedForwardLength := cmp.Or(p.IntermediateSize, p.NInner); feedForwardLength > 0 { + kv["bert.feed_forward_length"] = cmp.Or(p.IntermediateSize, p.NInner) + } + + if headCount := cmp.Or(p.NumAttentionHeads, p.NHead); headCount > 0 { + kv["bert.attention.head_count"] = cmp.Or(p.NumAttentionHeads, p.NHead) + } + + if layerNormEpsilon := cmp.Or(p.LayerNormEPS, p.LayerNormEpsilon, p.NormEpsilon); layerNormEpsilon > 0 { + kv["bert.attention.layer_norm_epsilon"] = layerNormEpsilon + } + + kv["tokenizer.ggml.model"] = "bert" + kv["tokenizer.ggml.token_type_count"] = uint32(2) + + // convert to phantom space tokens + for i, e := range t.Tokens { + if strings.HasPrefix(e, "[") && strings.HasSuffix(e, "]") { + // noop + } else if strings.HasPrefix(e, "##") { + t.Tokens[i] = e[2:] + } else { + t.Tokens[i] = "\u2581" + e + } + } + + kv["tokenizer.ggml.tokens"] = t.Tokens + + return kv +} + +func (p *bert) Tensors(ts []Tensor) []llm.Tensor { + var out []llm.Tensor + for _, t := range ts { + if slices.Contains([]string{ + "embeddings.position_ids", + "pooler.dense.weight", + "pooler.dense.bias", + }, t.Name()) { + continue + } + + out = append(out, llm.Tensor{ + Name: t.Name(), + Kind: t.Kind(), + Shape: t.Shape(), + WriterTo: t, + }) + } + + return out +} + +func (bert) Replacements() []string { + return []string{ + "encoder.layer", "blk", + "encoder.layers", "blk", + "embeddings.word_embeddings", "token_embd", + "embeddings.token_type_embeddings", "token_types", + "embeddings.LayerNorm", "token_embd_norm", + "embeddings.position_embeddings", "position_embd", + "attention.self.query", "attn_q", + "attention.self.key", "attn_k", + "attention.self.value", "attn_v", + "attention.output.dense", "attn_output", + "attention.output.LayerNorm", "attn_output_norm", + "intermediate.dense", "ffn_up", + "output.dense", "ffn_down", + "output.LayerNorm", "layer_output_norm", + } +} diff --git a/convert/convert_gemma.go b/convert/convert_gemma.go index 9213e157..c4316808 100644 --- a/convert/convert_gemma.go +++ b/convert/convert_gemma.go @@ -26,7 +26,6 @@ var _ Converter = (*gemma)(nil) func (p *gemma) KV(t *Tokenizer) llm.KV { kv := p.Parameters.KV(t) kv["general.architecture"] = "gemma" - kv["general.name"] = "gemma" kv["gemma.context_length"] = p.MaxPositionEmbeddings kv["gemma.embedding_length"] = p.HiddenSize kv["gemma.block_count"] = p.HiddenLayers @@ -44,15 +43,14 @@ func (p *gemma) KV(t *Tokenizer) llm.KV { } func (p *gemma) Tensors(ts []Tensor) []llm.Tensor { - var out []llm.Tensor + out := make([]llm.Tensor, 0, len(ts)) for _, t := range ts { - name := p.tensorName(t.Name()) - if strings.HasSuffix(name, "_norm.weight") { + if strings.HasSuffix(t.Name(), "_norm.weight") { t.SetRepacker(p.addOne) } out = append(out, llm.Tensor{ - Name: name, + Name: t.Name(), Kind: t.Kind(), Shape: t.Shape(), WriterTo: t, @@ -62,8 +60,8 @@ func (p *gemma) Tensors(ts []Tensor) []llm.Tensor { return out } -func (p *gemma) tensorName(n string) string { - return strings.NewReplacer( 
+func (p *gemma) Replacements() []string { + return []string{ "model.embed_tokens", "token_embd", "model.norm", "output_norm", "model.layers", "blk", @@ -76,8 +74,7 @@ func (p *gemma) tensorName(n string) string { "mlp.down_proj", "ffn_down", "mlp.up_proj", "ffn_up", "post_attention_layernorm", "ffn_norm", - "block_sparse_moe.gate", "ffn_inp", - ).Replace(n) + } } func (*gemma) addOne(_ string, data []float32, shape []uint64) ([]float32, error) { diff --git a/convert/convert_gemma2.go b/convert/convert_gemma2.go new file mode 100644 index 00000000..084f9c52 --- /dev/null +++ b/convert/convert_gemma2.go @@ -0,0 +1,43 @@ +package convert + +import ( + "github.com/ollama/ollama/llm" +) + +type gemma2 struct { + gemma + SlidingWindow uint32 `json:"sliding_window"` + AttentionLogitSoftcap float32 `json:"attn_logit_softcapping"` + FinalLogitSoftcap float32 `json:"final_logit_softcapping"` +} + +func (p *gemma2) KV(t *Tokenizer) llm.KV { + kv := p.Parameters.KV(t) + kv["general.architecture"] = "gemma2" + kv["gemma2.context_length"] = p.MaxPositionEmbeddings + kv["gemma2.embedding_length"] = p.HiddenSize + kv["gemma2.block_count"] = p.HiddenLayers + kv["gemma2.feed_forward_length"] = p.IntermediateSize + kv["gemma2.attention.head_count"] = p.NumAttentionHeads + kv["gemma2.attention.head_count_kv"] = p.NumKeyValueHeads + kv["gemma2.attention.layer_norm_rms_epsilon"] = p.RMSNormEPS + kv["gemma2.attention.key_length"] = p.HeadDim + kv["gemma2.attention.value_length"] = p.HeadDim + kv["gemma2.attention.sliding_window"] = p.SlidingWindow + kv["gemma2.attn_logit_softcapping"] = p.AttentionLogitSoftcap + kv["gemma2.final_logit_softcapping"] = p.FinalLogitSoftcap + kv["tokenizer.ggml.eot_token_id"] = uint32(107) + kv["tokenizer.ggml.middle_token_id"] = uint32(68) + kv["tokenizer.ggml.prefix_token_id"] = uint32(67) + kv["tokenizer.ggml.suffix_token_id"] = uint32(69) + return kv +} + +func (p *gemma2) Replacements() []string { + return append( + p.gemma.Replacements(), + "post_attention_layernorm", "post_attention_norm", + "pre_feedforward_layernorm", "ffn_norm", + "post_feedforward_layernorm", "post_ffw_norm", + ) +} diff --git a/convert/convert_llama.go b/convert/convert_llama.go index 0383a85e..27f924fb 100644 --- a/convert/convert_llama.go +++ b/convert/convert_llama.go @@ -3,6 +3,7 @@ package convert import ( "cmp" "fmt" + "math" "strings" "github.com/pdevine/tensor" @@ -27,8 +28,14 @@ type llama struct { NumKeyValueHeads uint32 `json:"num_key_value_heads"` RopeTheta float32 `json:"rope_theta"` RopeScaling struct { - Type string `json:"type"` - Factor float32 `json:"factor"` + Type string `json:"type"` + RopeType string `json:"rope_type"` + Factor float32 `json:"factor"` + LowFrequencyFactor float32 `json:"low_freq_factor"` + HighFrequencyFactor float32 `json:"high_freq_factor"` + OriginalMaxPositionalEmbeddings uint32 `json:"original_max_positional_embeddings"` + + factors ropeFactor } `json:"rope_scaling"` RMSNormEPS float32 `json:"rms_norm_eps"` LayerNormEPS float32 `json:"layer_norm_eps"` @@ -42,7 +49,6 @@ var _ Converter = (*llama)(nil) func (p *llama) KV(t *Tokenizer) llm.KV { kv := p.Parameters.KV(t) kv["general.architecture"] = "llama" - kv["general.name"] = "llama" kv["llama.vocab_size"] = p.VocabSize kv["llama.block_count"] = cmp.Or(p.NLayers, p.NumHiddenLayers, p.NLayer) @@ -71,6 +77,27 @@ func (p *llama) KV(t *Tokenizer) llm.KV { if p.RopeScaling.Type == "linear" { kv["llama.rope.scaling.type"] = p.RopeScaling.Type kv["llama.rope.scaling.factor"] = p.RopeScaling.Factor + } else if 
p.RopeScaling.RopeType == "llama3" { + dim := p.HiddenSize / p.NumAttentionHeads + for i := uint32(0); i < dim; i += 2 { + factor := cmp.Or(p.RopeScaling.Factor, 8.0) + factorLow := cmp.Or(p.RopeScaling.LowFrequencyFactor, 1.0) + factorHigh := cmp.Or(p.RopeScaling.HighFrequencyFactor, 4.0) + + original := cmp.Or(p.RopeScaling.OriginalMaxPositionalEmbeddings, 8192) + lambdaLow := float32(original) / factorLow + lambdaHigh := float32(original) / factorHigh + + lambda := 2 * math.Pi * math.Pow(float64(p.RopeTheta), float64(i)/float64(dim)) + if lambda < float64(lambdaHigh) { + p.RopeScaling.factors = append(p.RopeScaling.factors, 1.0) + } else if lambda > float64(lambdaLow) { + p.RopeScaling.factors = append(p.RopeScaling.factors, factor) + } else { + smooth := (float32(original)/float32(lambda) - factorLow) / (factorHigh - factorLow) + p.RopeScaling.factors = append(p.RopeScaling.factors, 1.0/((1-smooth)/factor+smooth)) + } + } } if p.NumKeyValueHeads > 0 { @@ -90,24 +117,29 @@ func (p *llama) KV(t *Tokenizer) llm.KV { kv["llama.attention.value_length"] = p.HeadDim } - if len(t.Merges) > 0 { - kv["tokenizer.ggml.merges"] = t.Merges - } - return kv } func (p *llama) Tensors(ts []Tensor) []llm.Tensor { var out []llm.Tensor + + if p.RopeScaling.factors != nil { + out = append(out, llm.Tensor{ + Name: "rope_freqs.weight", + Kind: 0, + Shape: []uint64{uint64(len(p.RopeScaling.factors))}, + WriterTo: p.RopeScaling.factors, + }) + } + for _, t := range ts { - name := p.tensorName(t.Name()) - if strings.HasSuffix(name, "attn_q.weight") || - strings.HasSuffix(name, "attn_k.weight") { + if strings.HasSuffix(t.Name(), "attn_q.weight") || + strings.HasSuffix(t.Name(), "attn_k.weight") { t.SetRepacker(p.repack) } out = append(out, llm.Tensor{ - Name: name, + Name: t.Name(), Kind: t.Kind(), Shape: t.Shape(), WriterTo: t, @@ -117,8 +149,8 @@ func (p *llama) Tensors(ts []Tensor) []llm.Tensor { return out } -func (p *llama) tensorName(n string) string { - return strings.NewReplacer( +func (p *llama) Replacements() []string { + return []string{ "lm_head", "output", "model.embed_tokens", "token_embd", "model.norm", "output_norm", @@ -132,9 +164,7 @@ func (p *llama) tensorName(n string) string { "mlp.down_proj", "ffn_down", "mlp.up_proj", "ffn_up", "post_attention_layernorm", "ffn_norm", - // mixtral - "block_sparse_moe.gate", "ffn_gate_inp", - ).Replace(n) + } } func (p *llama) repack(name string, data []float32, shape []uint64) ([]float32, error) { @@ -144,9 +174,9 @@ func (p *llama) repack(name string, data []float32, shape []uint64) ([]float32, } var heads uint32 - if strings.HasSuffix(name, "q_proj.weight") { + if strings.HasSuffix(name, "attn_q.weight") { heads = p.NumAttentionHeads - } else if strings.HasSuffix(name, "k_proj.weight") { + } else if strings.HasSuffix(name, "attn_k.weight") { heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads) } else { return nil, fmt.Errorf("unknown tensor for repack: %s", name) diff --git a/convert/convert_mixtral.go b/convert/convert_mixtral.go index 3263a27b..97a86b30 100644 --- a/convert/convert_mixtral.go +++ b/convert/convert_mixtral.go @@ -15,8 +15,6 @@ type mixtral struct { NumExpertsPerToken uint32 `json:"num_experts_per_tok"` } -var _ Converter = (*mixtral)(nil) - func (p *mixtral) KV(t *Tokenizer) llm.KV { kv := p.llama.KV(t) @@ -72,6 +70,13 @@ func (p *mixtral) Tensors(ts []Tensor) []llm.Tensor { return append(out, p.llama.Tensors(ts)...) 
} +func (p *mixtral) Replacements() []string { + return append( + p.llama.Replacements(), + "block_sparse_moe.gate", "ffn_gate_inp", + ) +} + type experts []Tensor func (e experts) WriteTo(w io.Writer) (int64, error) { diff --git a/convert/convert_phi3.go b/convert/convert_phi3.go new file mode 100644 index 00000000..64d3d012 --- /dev/null +++ b/convert/convert_phi3.go @@ -0,0 +1,123 @@ +package convert + +import ( + "cmp" + "encoding/binary" + "io" + "math" + "strings" + "sync" + + "github.com/ollama/ollama/llm" +) + +type phi3 struct { + Parameters + NumHiddenLayers uint32 `json:"num_hidden_layers"` + NLayers uint32 `json:"n_layers"` + HiddenSize uint32 `json:"hidden_size"` + NEmbd uint32 `json:"n_embd"` + IntermediateSize uint32 `json:"intermediate_size"` + NumAttentionHeads uint32 `json:"num_attention_heads"` + NHead uint32 `json:"n_head"` + NumKeyValueHeads uint32 `json:"num_key_value_heads"` + NHeadKV uint32 `json:"n_head_kv"` + RopeTheta float32 `json:"rope_theta"` + RopeScaling struct { + Type string `json:"type"` + LongFactor ropeFactor `json:"long_factor"` + ShortFactor ropeFactor `json:"short_factor"` + } `json:"rope_scaling"` + RMSNormEPS float32 `json:"rms_norm_eps"` + NPositions uint32 `json:"n_positions"` + MaxPositionEmbeddings uint32 `json:"max_position_embeddings"` + OriginalMaxPositionEmbeddings uint32 `json:"original_max_position_embeddings"` + SlidingWindow uint32 `json:"sliding_window"` +} + +var _ Converter = (*phi3)(nil) + +func (p *phi3) KV(t *Tokenizer) llm.KV { + kv := p.Parameters.KV(t) + kv["general.architecture"] = "phi3" + kv["phi3.context_length"] = p.MaxPositionEmbeddings + kv["phi3.embedding_length"] = cmp.Or(p.HiddenSize, p.NEmbd) + kv["phi3.feed_forward_length"] = p.IntermediateSize + kv["phi3.block_count"] = cmp.Or(p.NumHiddenLayers, p.NLayers) + kv["phi3.attention.head_count"] = cmp.Or(p.NumAttentionHeads, p.NHead) + kv["phi3.attention.head_count_kv"] = cmp.Or(p.NumKeyValueHeads, p.NHeadKV) + kv["phi3.attention.layer_norm_rms_epsilon"] = p.RMSNormEPS + kv["phi3.rope.dimension_count"] = p.HiddenSize / cmp.Or(p.NumAttentionHeads, p.NHead) + kv["phi3.rope.freq_base"] = p.RopeTheta + kv["phi3.rope.scaling.original_context_length"] = p.OriginalMaxPositionEmbeddings + kv["phi3.attention.sliding_window"] = p.SlidingWindow + + scale := float64(p.MaxPositionEmbeddings) / float64(p.OriginalMaxPositionEmbeddings) + + switch p.RopeScaling.Type { + case "": + // no scaling + case "su", "longrope": + kv["phi3.rope.scaling.attn_factor"] = float32(max(math.Sqrt(1+math.Log(scale)/math.Log(float64(p.OriginalMaxPositionEmbeddings))), 1.0)) + case "yarn": + kv["phi3.rope.scaling.attn_factor"] = float32(max(0.1*math.Log(scale)+1.0, 1.0)) + default: + panic("unknown rope scaling type") + } + + return kv +} + +func (p *phi3) Tensors(ts []Tensor) []llm.Tensor { + var addRopeFactors sync.Once + + out := make([]llm.Tensor, 0, len(ts)+2) + for _, t := range ts { + if strings.HasPrefix(t.Name(), "blk.0.") { + addRopeFactors.Do(func() { + out = append(out, llm.Tensor{ + Name: "rope_factors_long.weight", + Kind: 0, + Shape: []uint64{uint64(len(p.RopeScaling.LongFactor))}, + WriterTo: p.RopeScaling.LongFactor, + }, llm.Tensor{ + Name: "rope_factors_short.weight", + Kind: 0, + Shape: []uint64{uint64(len(p.RopeScaling.ShortFactor))}, + WriterTo: p.RopeScaling.ShortFactor, + }) + }) + } + + out = append(out, llm.Tensor{ + Name: t.Name(), + Kind: t.Kind(), + Shape: t.Shape(), + WriterTo: t, + }) + } + + return out +} + +func (p *phi3) Replacements() []string { + return []string{ + 
"lm_head", "output", + "model.embed_tokens", "token_embd", + "model.norm", "output_norm", + "model.layers", "blk", + "input_layernorm", "attn_norm", + "self_attn.qkv_proj", "attn_qkv", + "self_attn.o_proj", "attn_output", + "mlp.down_proj", "ffn_down", + "mlp.gate_up_proj", "ffn_up", + "post_attention_layernorm", "ffn_norm", + } +} + +type ropeFactor []float32 + +func (r ropeFactor) WriteTo(w io.Writer) (int64, error) { + err := binary.Write(w, binary.LittleEndian, r) + return 0, err +} diff --git a/convert/convert_test.go b/convert/convert_test.go index 88f38494..64b7df3b 100644 --- a/convert/convert_test.go +++ b/convert/convert_test.go @@ -62,9 +62,14 @@ func TestMain(m *testing.M) { func TestConvertFull(t *testing.T) { cases := []string{ "Meta-Llama-3-8B-Instruct", + "Meta-Llama-3.1-8B-Instruct", "Mistral-7B-Instruct-v0.2", "Mixtral-8x7B-Instruct-v0.1", "gemma-2b-it", + // microsoft/Phi-3-mini-128-instruct@d548c233192db00165d842bf8edff054bb3212f8 + "Phi-3-mini-128k-instruct", + "all-MiniLM-L6-v2", + "gemma-2-9b-it", } for i := range cases { diff --git a/convert/reader.go b/convert/reader.go index ce95208e..5bba0406 100644 --- a/convert/reader.go +++ b/convert/reader.go @@ -35,7 +35,9 @@ const ( ) func (t tensorBase) Kind() uint32 { - if strings.HasSuffix(t.name, ".block_sparse_moe.gate.weight") { + if strings.HasSuffix(t.name, ".ffn_gate_inp.weight") || + t.name == "token_types.weight" { + // these tensors are always F32 return 0 } @@ -55,10 +57,10 @@ func (t *tensorBase) SetRepacker(fn repacker) { type repacker func(string, []float32, []uint64) ([]float32, error) -func parseTensors(fsys fs.FS) ([]Tensor, error) { +func parseTensors(fsys fs.FS, replacer *strings.Replacer) ([]Tensor, error) { patterns := []struct { Pattern string - Func func(fs.FS, ...string) ([]Tensor, error) + Func func(fs.FS, *strings.Replacer, ...string) ([]Tensor, error) }{ {"model-*-of-*.safetensors", parseSafetensors}, {"model.safetensors", parseSafetensors}, @@ -74,7 +76,7 @@ func parseTensors(fsys fs.FS) ([]Tensor, error) { } if len(matches) > 0 { - return pattern.Func(fsys, matches...) + return pattern.Func(fsys, replacer, matches...) 
} } diff --git a/convert/reader_safetensors.go b/convert/reader_safetensors.go index 42f902a5..32a362cd 100644 --- a/convert/reader_safetensors.go +++ b/convert/reader_safetensors.go @@ -8,6 +8,7 @@ import ( "io" "io/fs" "slices" + "strings" "github.com/d4l3k/go-bfloat16" "github.com/x448/float16" @@ -20,7 +21,7 @@ type safetensorMetadata struct { Offsets []int64 `json:"data_offsets"` } -func parseSafetensors(fsys fs.FS, ps ...string) ([]Tensor, error) { +func parseSafetensors(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]Tensor, error) { var ts []Tensor for _, p := range ps { f, err := fsys.Open(p) @@ -56,7 +57,7 @@ func parseSafetensors(fsys fs.FS, ps ...string) ([]Tensor, error) { offset: safetensorsPad(n, value.Offsets[0]), size: safetensorsPad(n, value.Offsets[1]) - safetensorsPad(n, value.Offsets[0]), tensorBase: &tensorBase{ - name: key, + name: replacer.Replace(key), shape: value.Shape, }, }) diff --git a/convert/reader_torch.go b/convert/reader_torch.go index 531996bf..1b3e1c9f 100644 --- a/convert/reader_torch.go +++ b/convert/reader_torch.go @@ -3,12 +3,13 @@ package convert import ( "io" "io/fs" + "strings" "github.com/nlpodyssey/gopickle/pytorch" "github.com/nlpodyssey/gopickle/types" ) -func parseTorch(fsys fs.FS, ps ...string) ([]Tensor, error) { +func parseTorch(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]Tensor, error) { var ts []Tensor for _, p := range ps { pt, err := pytorch.Load(p) @@ -27,7 +28,7 @@ func parseTorch(fsys fs.FS, ps ...string) ([]Tensor, error) { ts = append(ts, torch{ storage: t.(*pytorch.Tensor).Source, tensorBase: &tensorBase{ - name: k.(string), + name: replacer.Replace(k.(string)), shape: shape, }, }) diff --git a/convert/testdata/Meta-Llama-3.1-8B-Instruct.json b/convert/testdata/Meta-Llama-3.1-8B-Instruct.json new file mode 100644 index 00000000..ad7cd20a --- /dev/null +++ b/convert/testdata/Meta-Llama-3.1-8B-Instruct.json @@ -0,0 +1,3 @@ +{ + "rope_freqs.weight": "80fd5efb2f729381785b293a091a268cfeceb0079167f6ece9b07070e662b222" +} diff --git a/convert/testdata/Phi-3-mini-128k-instruct.json b/convert/testdata/Phi-3-mini-128k-instruct.json new file mode 100644 index 00000000..19296f5a --- /dev/null +++ b/convert/testdata/Phi-3-mini-128k-instruct.json @@ -0,0 +1,225 @@ +{ + "general.architecture": "phi3", + "general.file_type": "1", + "general.quantization_version": "2", + "phi3.block_count": "32", + "phi3.context_length": "131072", + "phi3.embedding_length": "3072", + "phi3.feed_forward_length": "8192", + "phi3.rope.scaling.original_context_length": "4096", + "phi3.rope.dimension_count": "96", + "phi3.rope.freq_base": "10000", + "phi3.rope.scaling.attn_factor": "1.1902381", + "phi3.attention.head_count": "32", + "phi3.attention.head_count_kv": "32", + "phi3.attention.layer_norm_rms_epsilon": "1e-05", + "phi3.attention.sliding_window": "262144", + "tokenizer.ggml.model": "llama", + "tokenizer.ggml.pre": "default", + "tokenizer.ggml.add_bos_token": "false", + "tokenizer.ggml.add_eos_token": "false", + "tokenizer.ggml.bos_token_id": "1", + "tokenizer.ggml.eos_token_id": "32000", + "tokenizer.ggml.unknown_token_id": "0", + "tokenizer.ggml.padding_token_id": "32000", + "tokenizer.ggml.scores": "6e37bcde2adc7e350e87c496eddd7a2124329c1dc66c5bf3ad3997253e4f7a62", + "tokenizer.ggml.token_type": "b6ecf55ec64ee67d87750bdb8d757a2c58bf78377e9f4219f5689a6c4dea57ce", + "tokenizer.ggml.tokens": "d168da3ddd3eee820916945fcb9baf24dd3cde42f606cffa2d19e7c8a8743918", + "blk.0.attn_norm.weight": 
"216aeb2c9e0c271f899e1ef2a63cceeb8f41e97642e84fada54b1d3c1c11cf25", + "blk.0.attn_output.weight": "b597d56f7188ffc1fafc273fadc59d41738cffd677ae98c61a62c3285b3a3099", + "blk.0.attn_qkv.weight": "d28a6b44e13f59be5483e4be2bedb544e346168d720aca27f47d1a5a722be91e", + "blk.0.ffn_down.weight": "4a691370e5a61fcbbf540fbcbf4c0f1d15dec0364528c0e916d0744f6262b63b", + "blk.0.ffn_norm.weight": "0c00af2b4a3128bec64a0cbb1084b042fdbe13d9ad0d03bd577f9449dfead338", + "blk.0.ffn_up.weight": "b32b52f790c1c083bfb8a3126dc1111cfeeb28dc8c584a930a1e5334cb176bf4", + "blk.1.attn_norm.weight": "68748011503c6c029e8e69a84a8e5a89338f378769627b6dbf7f93d715c292e1", + "blk.1.attn_output.weight": "2267344add13b048ca59e4377c86dc512be8046a57156901fa32a20fa74e4ee0", + "blk.1.attn_qkv.weight": "9109d2e3d7a2eacfda5226587b8be124a3bf44b972da7ebb17aa15795897eacc", + "blk.1.ffn_down.weight": "d675df4df4dd039c0c339ad6445d39eddd2004db6bf35bed6314c7497245a633", + "blk.1.ffn_norm.weight": "3b5767ae977bc8baaa06b06efdbea193b6b3ba605ce76d77a76ce317e935500c", + "blk.1.ffn_up.weight": "80dfd6d9d234b00334c89b8e0a02f81899c2efd377321c34ba5ba51a5f61b5ff", + "blk.2.attn_norm.weight": "6a6743b057e5088f145bc179e92c9bfb41163e7295d7b81c62e23dd89d2b59c4", + "blk.2.attn_output.weight": "bc5491ea54e0db81462d7d9b7d25cbdda380c2db8de041bd1c4ab7b76a1d19c3", + "blk.2.attn_qkv.weight": "a61287a9852e2f5aca9c100b471d98398b2913a3497c743de3c70ec9ddd7087f", + "blk.2.ffn_down.weight": "4fddcc382c8dceeab027fe43d8d44e67edb5e8ce4b9a1b7f773c87770380ade1", + "blk.2.ffn_norm.weight": "07e05f82b3f63f711db3b684ca79aed25c0657917e66f88af47348a82065c227", + "blk.2.ffn_up.weight": "4835a682ef1826c12df01ae7663fc45f9c82bc8e64b665f13fb7da8e201ec0fb", + "blk.3.attn_norm.weight": "f22aba7c03999ba7136f39cda747a39715e498699dc1716cd97fc5dfc58d1b1c", + "blk.3.attn_output.weight": "53b579855366fd786c5126b2b30aac4d583ca7bda56833c4865f5cadb5c18c6d", + "blk.3.attn_qkv.weight": "bb56aba78158123140fcea59c69ac562ca208f6d3086819417cdad8c50f333ad", + "blk.3.ffn_down.weight": "97280897a7cd86db2830c004bccc5bc094f50e293baded0189159a2019145a6e", + "blk.3.ffn_norm.weight": "10a8c99f8b57a960e8e0a1133c4a26f9148403d1b9bff2eff114917de996f3b5", + "blk.3.ffn_up.weight": "7324046c915e75d621b2043597a245a428d8eea31869135e6257a861491d8dcc", + "blk.4.attn_norm.weight": "507d8e164de94646edbfe33def8e8fbf7c9a6ee3fbaedb5000f72d9f51ec5e36", + "blk.4.attn_output.weight": "bbb3429e6efa98c150e0fdbf48c16180cbf0d0cbc1b3c253c6c319d78f4593a2", + "blk.4.attn_qkv.weight": "b95ee5be0786d3901273d806c339fe6c20e6bfffd2a20672a9f56af80921e8ab", + "blk.4.ffn_down.weight": "806bbf91df92a5a22bd5aa1ffb7fc2869f7293ffc7704771c290ecc583b27975", + "blk.4.ffn_norm.weight": "cfc2930a81df7aee3a5e7f726a15c1182233e868bf0d9d37f6b6ae6d8c15c234", + "blk.4.ffn_up.weight": "c3390c69533de2c8424e8069323ccc5d0c4543111535da04cf2c7d26745576aa", + "blk.5.attn_norm.weight": "0d71c4fbcefabbd021569442853d2fe90668b19409ae2805a718a829ca60beab", + "blk.5.attn_output.weight": "10ebd93629112bf2df5c30dd0953a4a5e9020306768283181ed426934d47e14f", + "blk.5.attn_qkv.weight": "5cb05633369f12d4b00e0ff787736bd846856682115720ebc6cce05270c334f6", + "blk.5.ffn_down.weight": "e28bcc5094212eafc7476dbc5b7a520d25b79578cbf4229d698e2655956a80ad", + "blk.5.ffn_norm.weight": "b6f2c4cf9f34bb4d59989f96165c14a67dc1e266ad0a6d0fcc49f1add929e6ff", + "blk.5.ffn_up.weight": "0f9ef99423cc07ebedc0e9cfa95809f2d7108d910bb4ef97ebc0b0309c440750", + "blk.6.attn_norm.weight": "b3edcc47a42218234f7564d7470611b49401a41ae8cd42123f86557c69f5d7f2", + "blk.6.attn_output.weight": 
"eb9b7d257b388bb5b8fe0515e5c6873317239cb94cda236e4b6ada2a6c57c65c", + "blk.6.attn_qkv.weight": "eb968081f478c52f07bd9c2761741e982dba33cc4eeadeea3557d391b9ac2106", + "blk.6.ffn_down.weight": "1b8588bb7463206290322695577dcfced300895d6e6f4b26966c53a9ae2f0f84", + "blk.6.ffn_norm.weight": "1219c04b7770983c77814200eefe743f46d15328ea2b12711e44f8103eab08d3", + "blk.6.ffn_up.weight": "197ef287239fec47c55677f0fbb66eaf0644f775bc382de843971730721394f6", + "blk.7.attn_norm.weight": "b630ad08c80d564ed1c024384818e9fd3f22a36cd7a14aa96e7e2759a8285099", + "blk.7.attn_output.weight": "970255aa750828a47d6b9d399f9612b5bf25aefe7dadbcba41fc416d0d4067c1", + "blk.7.attn_qkv.weight": "ebb157c880293e6de8d629f263ba8853ed1dbdc02c311d43432bb8cfbb310739", + "blk.7.ffn_down.weight": "24bcd4db4cba844c89f878b81843c373dbbc0675e889d32c5b12e63384a7b670", + "blk.7.ffn_norm.weight": "b9c6f71001808ee873ce7db8056e4b53fb4cccec8b7f0f312899b575fae39d39", + "blk.7.ffn_up.weight": "979f1828d227455c26015a2a11afe9dd05f2bb97a8ba6b38c8dab3f50e627401", + "blk.8.attn_norm.weight": "4e8e347e3775010b7112ee630f2f4f2383be7ff64e6ca6154b9b22566552eaa6", + "blk.8.attn_output.weight": "65a44babf44a435a1829945211b3168f9ec78ac3cb7a049a733e93d11f0d6659", + "blk.8.attn_qkv.weight": "343ed07671da400b040812a4058482fa38284b5d9af9becfed07417fe26ce747", + "blk.8.ffn_down.weight": "7fb7e073e3c2c503c4e9d60efa0988fed7398d900cc003695fe3fffd3e188b82", + "blk.8.ffn_norm.weight": "b07c1f655d8593e3892a2cf73f8a0c19ce8e5cb613fafbe7cbd430da8ce4c57d", + "blk.8.ffn_up.weight": "8b26e14de54b3fdc2e2d3ea41720f9d9c236a93688c3b7fd7bf43f5fbb327c9b", + "blk.9.attn_norm.weight": "46394d408a8e316916177e6aa261de32e137a82d729c0b1800b072f0c38c39b6", + "blk.9.attn_output.weight": "d57f3d46107947a7073373a0b35d6ecf7759b5df15406f4a3590a60666af6b16", + "blk.9.attn_qkv.weight": "14bb8ace8c5453148f4b536e9f4279c813f31136716947256f5cca333448639c", + "blk.9.ffn_down.weight": "2b8d98e2b5ed68338f6e4de43bf7de0c4858cc69103cd5177725f7444eec7694", + "blk.9.ffn_norm.weight": "41a499dfd418cc4c6b8c12313f673f7e2cd4a3f9c4065eb6c4feb5eed02fb542", + "blk.9.ffn_up.weight": "143aab7533a64b17fbe201490a6f674bc7f0bd370c094500b2e100419073d1c2", + "blk.10.attn_norm.weight": "ebb670aafd36816a794347287269d8f1a5b19c1e3c0a1e38023bc19fdba9b073", + "blk.10.attn_output.weight": "b5d65bbc0ed5e49fdd9d754bc18163cd042a285024d0cf6f954c503bc8c877cb", + "blk.10.attn_qkv.weight": "f06b15bac88da798fa34a62b03eaac0dbe8b846020516603c387541f2d8dd672", + "blk.10.ffn_down.weight": "fb091fcd1b4de25d1bea94d1755e255cb02914a030d23e3a234e57b8d46bde6e", + "blk.10.ffn_norm.weight": "eb347bdf9c40414af87e13a8e72e40b31f004b50f7cb366f1a219ced60a61355", + "blk.10.ffn_up.weight": "ed2d52fc881a173f404fe8a1067862c9856d6c3e0d2e90a330a7aa394e3f84d1", + "blk.11.attn_norm.weight": "64e252603cf010a0e502ca39fdf8d0a196a79aec67c0d2bb9213fc0cb80c47d4", + "blk.11.attn_output.weight": "228e33e21c69f52efc74fdfc831bc9af271e44b2a29a3dced1d64e667ce36eb5", + "blk.11.attn_qkv.weight": "ab9ce6d4ef9e42ee0da3f20a7708a3bbc5e79e967b05fa86ba946a05e2eb63eb", + "blk.11.ffn_down.weight": "0ca133b7835c98dc77c25d64e4eb7873778bdb5e4d22d8b80f920f46865b43bd", + "blk.11.ffn_norm.weight": "02455741a0dfd161c79aa1ecc381901721f229fdcda5615622a629631fb61cfd", + "blk.11.ffn_up.weight": "9fecdcc099fbb8e23c6b1ea9294702a027f4a58d265543ec5e7be79b8f63b354", + "blk.12.attn_norm.weight": "783bb459911b1b3609a9b2bdfe272f1670add73b5471da738e07ac47e2e07dfd", + "blk.12.attn_output.weight": "1e1a914c9e48b857206ac5a1f7cead994bc1ea91d5d4fff8c834d73f2e38ef5d", + "blk.12.attn_qkv.weight": 
"5953e7185ccb87fb4dae8f9426ec86315d4c7794326e8ab59b3a95d4af2189f0", + "blk.12.ffn_down.weight": "a3eecf0f394f86e2cfb48a5940a5c50ca86d71883b2f79fcc642a935fabce0d4", + "blk.12.ffn_norm.weight": "0a4272e41373c23bd72f10d2d82930aa3a1480aac75832bfbf01cebf0b86b6a4", + "blk.12.ffn_up.weight": "06f42776de3a7ceac3025f26a7a8bd20e062233cce2bdaa2183470dc4b30b87d", + "blk.13.attn_norm.weight": "5915da60fb03e201fa649faba780e5fdf1c761c262b206e5415cf83181f65780", + "blk.13.attn_output.weight": "4dbf6eab074fa3835fd32bd631a8208e511037d5056d2fd3015735cca7674ef7", + "blk.13.attn_qkv.weight": "d3d8339a1c4782d9e73d77fdebe154d3c5b83ac40c9175b3e91a4977d08f876b", + "blk.13.ffn_down.weight": "de6772b46a55e1fd42b007637dfbf68b6598e5d5b61622da0935002e1e192d3a", + "blk.13.ffn_norm.weight": "5a640ea3b8c7be49c95a58a2327e10d8e8d9d142504bde5c8091613e5b961d7a", + "blk.13.ffn_up.weight": "f35e3545e4bd3531b2e843b5efd31dee0c13c807ee6386e65473ba67bbec30d0", + "blk.14.attn_norm.weight": "9b34986450b7c98b4927e81e61a816f9e84b1addc7c14926402100037aad6678", + "blk.14.attn_output.weight": "155d52efb23d366016d861a251d4d1f4a0c13699188c50d50dba016a0d8bfcd9", + "blk.14.attn_qkv.weight": "8e1415084e1f33c73a777f19e752489f4dd312cca047733e5ea643cd4a955e04", + "blk.14.ffn_down.weight": "a2a142226b94baa01ccb65bdea2b7418e49085c1d9c3c63e544e3112c58a25da", + "blk.14.ffn_norm.weight": "8aecfd9b0ae6affaea31a80c5c9a4a14b31deaa0db7bd8f6da2a64d23447921c", + "blk.14.ffn_up.weight": "0c1407237b8c1bd02f193346b5681926fe698a5055eac6a7450451b0f991707c", + "blk.15.attn_norm.weight": "e037bd19880bfa83d983200fb0c7866f8ad16c3ff5cc4b4f3a37ca7373870ff6", + "blk.15.attn_output.weight": "045fe4fc95cc129a1b92771b179c11b12845c4c088786c607f17bd98857e68e1", + "blk.15.attn_qkv.weight": "7621b7559705cab1d4dea1c69f76dbf9dc1c8837a203b656f484703b9c1b70ce", + "blk.15.ffn_down.weight": "7e5ac20e290bc60761e1cd972354fde225b7fa861048d44d9a0dd9b046d55f58", + "blk.15.ffn_norm.weight": "b6d830d88f1db1825687973c8c2b1a24c6fa84f07af8d0e3ef9c86009baca0b2", + "blk.15.ffn_up.weight": "dcda0957cd04fc45476774dba2bbf9aa89d6b05d5ca7b10ae6f73ad2c49b1cd3", + "blk.16.attn_norm.weight": "4ee9b70ba15cb2a08240f93990e90f5068c48fceb481f8e2186bec8b7214eb3f", + "blk.16.attn_output.weight": "315cfe5536658d2498192b2980eade15b2c9a4ff220e4011911457b1727fa103", + "blk.16.attn_qkv.weight": "3c8122e3ad637583b9dcde8ff3a323267d3014bb1f0f9771e5322260ca9ecc8d", + "blk.16.ffn_down.weight": "3b5fbebd5ee2b86cad96fb8a9b45a8770d08f82c1c8b74d7061e866f7020a18d", + "blk.16.ffn_norm.weight": "ffab69f20bda372de6e5878f0539163e2fc6ba113621ded95705fc3b1465c9f0", + "blk.16.ffn_up.weight": "0935ea3d258da42d6258406365f39f58ddaabfe97ea5977580db3635188f24a1", + "blk.17.attn_norm.weight": "f030441733f3d147b4a06a1eb4aeb8465c7c24d9c53bf4c48fe7e134d3629803", + "blk.17.attn_output.weight": "07a955ef09e8dc766ac0df647d0b2c69f23c4c69a7137654b4aad80303ed0eda", + "blk.17.attn_qkv.weight": "1c10688061e21e2fe12ad0cb54bf03895c1f83c3b0df743a42f548b52cbca1b2", + "blk.17.ffn_down.weight": "ebb9cc9836f41d88fdae2aa9a4355514e4edaec8d1577ffeb947a35204e77f52", + "blk.17.ffn_norm.weight": "50aff44f6528b13db5389f2ddcdb7676244947610bd7ffbff3f881c968c2a0d4", + "blk.17.ffn_up.weight": "d716537949582be33bde6b02e38f5a70081c9642a9fb05a61312126718b8d148", + "blk.18.attn_norm.weight": "0ea695c4e53d637902f46663a6ee42adc493c36794476acc7dbddaa05b13840d", + "blk.18.attn_output.weight": "5fd35b500221a612eb4f4bddf0e9b6b7db4d7733032a75f8802fb2d884647c2e", + "blk.18.attn_qkv.weight": "b0da37fd030fe69581f990bf23bfd35467a1bbe558af6de7c0924f6b72e92317", + 
"blk.18.ffn_down.weight": "b355c33f44b328f4bb977567de8f7544db4b005d7a8fbded658518ecf3c5a153", + "blk.18.ffn_norm.weight": "58b3fe9094079989a86e0387143259e1cc35952d24dc3df290c4ba6df44f5c51", + "blk.18.ffn_up.weight": "2ce530954c342c30ed2ead5353f931960bfae1d278868504c0efb973560fabbe", + "blk.19.attn_norm.weight": "533e9aed66feea8f0392aa81f9e293240e1f009a5334253915fb60c2749b615d", + "blk.19.attn_output.weight": "84f2d00f98a4113a779d3b5d1c3e7c914eb47784d3ab13b290367c124c2994aa", + "blk.19.attn_qkv.weight": "fbe6b9f53b07fa7537d3b3d452d20a9bc666f9fd41ec2091dd28bc2f70fc668f", + "blk.19.ffn_down.weight": "b30199e098c8bb3f890183d8b18471e80b62b604729b277ad62488dd71e1206b", + "blk.19.ffn_norm.weight": "c81373e41cd340b7badb19f9517c77c4250b4eb9a02dc758b8b49b652487d7ff", + "blk.19.ffn_up.weight": "5a5cb083ca7725720e3a890f7fa46354760e8007a8188849a092e305694a75e3", + "blk.20.attn_norm.weight": "4953091b4477e354357a8e743ba0a1900633e52f1599ee082a0c9b0b2b5cd978", + "blk.20.attn_output.weight": "62d54f7749cd6856097b2632066a322b0296df915fe66f382c5b5981be0d4f23", + "blk.20.attn_qkv.weight": "406de9e35b0729ebe902d7a47905cc7fb29a921431ed35dbef0c03e5690a1329", + "blk.20.ffn_down.weight": "62fb678b0d1261e19a4903a2b347d67afcc8acff01feb33a687a35a2d1e6f9a5", + "blk.20.ffn_norm.weight": "cd9d36b7e71e55c8925b97bb09c28219f182626bcff094878ae39c3db887a14b", + "blk.20.ffn_up.weight": "b9276771d79d3e932e73ccc520c3f8476342b9ef312ed2ee1e0da822e6e3ad18", + "blk.21.attn_norm.weight": "66d8c8a35e13ce9c2a0e75b670150e2c31484a55c2316df46075312196178ed3", + "blk.21.attn_output.weight": "12ab46c9382648f9b3350fdd92a6be6352743d62d6b520d7e2024e0c838588f5", + "blk.21.attn_qkv.weight": "a7909676ee1675ca23cd29a5fdd226df8dd9d68f94c6c9bbb51dd9fd38504008", + "blk.21.ffn_down.weight": "6fb317279c6542e82f97d5a12a60fac1bd0fa0405154f9fbe265e2fe39bd49cc", + "blk.21.ffn_norm.weight": "c0f703eb3ff161b5ba4490d87d8684b8a6c47a8f433e12f418333b9db439010a", + "blk.21.ffn_up.weight": "6dbdb80ef0c35e364bbce12d40d5e74c7963c7b55d58d9579567a07ffce7b863", + "blk.22.attn_norm.weight": "f94237433bf03d675cb2f655b81ca91a1ce2447bc6b00b13d6b0ccfe2d411eff", + "blk.22.attn_output.weight": "e821f95995ce497c01e63ca64f737713b1b65f11df1903e51d444aa516f33f71", + "blk.22.attn_qkv.weight": "1b0f717c73afb5eb4c82a1708c4e85c969e8a2a8770d9ddb78b1870a2d8a781e", + "blk.22.ffn_down.weight": "0f33f7a3cdc685484be99aa0c03642b0b20850a27d1fddbe054b13a9382f3ccb", + "blk.22.ffn_norm.weight": "9df285cf211ddd7df2b36a50489af574755c7d4d98b29a05cd04566ae613c8dc", + "blk.22.ffn_up.weight": "63ac300e1efb34041dd0136cf43ea622fac6f0caccce1cd9262f5e08d2cf179c", + "blk.23.attn_norm.weight": "5f72d9e88689b4027b28f5f8f26cd3abb03635ceea7ec98a4c91a9fc691f6707", + "blk.23.attn_output.weight": "6ecf04ff61125c5fc768f8656497152149373daf321ee9c957e8f7245a1184d1", + "blk.23.attn_qkv.weight": "a9d9978806724c2959f2cf386c233831f08e1e933dbf2b32665e788d9d512ea4", + "blk.23.ffn_down.weight": "72c7d17886a3da17fa0daa456aa5e877b2ef5b8b403182b870d9ca5ca9c70347", + "blk.23.ffn_norm.weight": "971e4b712e3025a13419b5b57d674b5e4ab7f18f74b57b9afc4671623da90c4b", + "blk.23.ffn_up.weight": "df2b5c7dbd5834545b815073af0c7355b065124e6d6f0fee78d8fa5b2076dc3e", + "blk.24.attn_norm.weight": "c41957c4a79ad3b16f6e11daec1c7f530b9f3f4b618e1e4367c3b67787ac4ab6", + "blk.24.attn_output.weight": "ef7d61f5fc88ac6f31bf60cb5f4d2d6b8df42d38825807112361a7224b0dee3b", + "blk.24.attn_qkv.weight": "3e6a58fe7d49c90bb6971efbad3371c32256881173ea5aee4b0c296cb206490f", + "blk.24.ffn_down.weight": 
"f43619144047de42fed81dfa495f1815d3cb771330e574043e2b67620819292c", + "blk.24.ffn_norm.weight": "5501d4a2a98c8ca6b42e77b53b221dbc08f530f6a067256d787534ec6fe028bd", + "blk.24.ffn_up.weight": "d64c8b0e509e2b1118f6000176f8956cacecdbb200c7e95ed93fb78b6e26c84a", + "blk.25.attn_norm.weight": "502fa3c302d371f61c5791f4615b73018ffb1daa09b6499b227116581244c5d4", + "blk.25.attn_output.weight": "ad8391d4e9c980856f2547aa945b2b6a407a6382158dc1ddd4f08d94ecc24be6", + "blk.25.attn_qkv.weight": "42e8983780d4a01a02c54ad23d4df21eea437f119a10af5a9c12a76a42d308c1", + "blk.25.ffn_down.weight": "302dd010d4e0ab4eeaee89090409ea0dddeeeed3236415eb8f97c942497eea91", + "blk.25.ffn_norm.weight": "fb34c1ee5bca96986c08834df0a0c047ba041c1123ac1f563e9d64312bf82d6a", + "blk.25.ffn_up.weight": "10739a8de156816d93c92b935386540bfa976bdbef204f0312960f6fc657582f", + "blk.26.attn_norm.weight": "7036c711609128c4e55968ff3681d3043338879a5737efd6c2ac9e1a2a61f1a0", + "blk.26.attn_output.weight": "db5db45dead5cb911fa01da59832f121b7c18b2d167bf53741c40819f24d346c", + "blk.26.attn_qkv.weight": "cae34c6b7f82ed14348d5ed30a79919c383737c1694a9cb9c0de609d3b0c1d0a", + "blk.26.ffn_down.weight": "491ec3a4da9b4f49f8ebc6be658ce397a9b801ae9fb35e82177e47808c65e5d0", + "blk.26.ffn_norm.weight": "fd7059d75d7f0e5288511ddeeb0f772eb3cae3ccfe4226b877015834edc3c386", + "blk.26.ffn_up.weight": "ea1ee1274c56458ce056d2205e5bb6e5422ce4cb0ad58006b8141749b97a0c39", + "blk.27.attn_norm.weight": "cc362c9a937609265052cd38544af17a1a7448cea086d4c801139e1fc865832d", + "blk.27.attn_output.weight": "ba757a81dabde9cb1b069d1bb616fe79649a1724f756567ec61caed1304fe6cf", + "blk.27.attn_qkv.weight": "1ab8d7d02d87756c12c2275636823aa5ede3d683178225c4cac4bd892c319bd4", + "blk.27.ffn_down.weight": "deb1c711c8a66acf4dcd2d088e1548f8e08f296f755e4067d6557fa55afde88c", + "blk.27.ffn_norm.weight": "fc6242d8cb8a4a37a8ddb7e41e7e60a63d4a89edf36acb35df052f10b9c91ece", + "blk.27.ffn_up.weight": "8df39b09c4801f343aca78f2918a1f6db78c8c55e591eda4c69eadb74c26e180", + "blk.28.attn_norm.weight": "75b539308f77e3cefdc6d98484d8b5cbf0538f0c2869a77b7373a145a18bc850", + "blk.28.attn_output.weight": "ae128940eb60a6d2e121762ef4b3e9dcf9eb3e105b249507fa7f12de0e19822c", + "blk.28.attn_qkv.weight": "bdda781c288e9326c240e33905f8e621b6a2ad902e620739d34f93fcd6f933de", + "blk.28.ffn_down.weight": "f1d6e6d1c286b1138bfd7e53fe477f399ae93bc2c04e35416f84218ed7247965", + "blk.28.ffn_norm.weight": "3f837ce82c8b9bde0d61d08b6f5fe5574886ea5328dbdc53f2929f18da8b4087", + "blk.28.ffn_up.weight": "2af027002e31d1b6cfedbdb30a2b9d7213f3aa691167c353913adfd48fda31e4", + "blk.29.attn_norm.weight": "61e8003b5329462ffe0fe172f2b160260de006aed858332d49d75504b6b6aa7a", + "blk.29.attn_output.weight": "ca44542a72a37476dc73dbdcc01f5b7497cb3ebc4ea230a55c9634ccd8e56ad4", + "blk.29.attn_qkv.weight": "abb3d9d6abe57872ae3daa51935d43264093ded5ce63b49d1e280ee5758be0e4", + "blk.29.ffn_down.weight": "6764b895fce881df097489c263446f0106de36217997660c15984b3ee22a5a06", + "blk.29.ffn_norm.weight": "89e03e9a33fc0e6e31ba9f0c2bd7c5734a118c5602bb90148793e08a80e8d0ae", + "blk.29.ffn_up.weight": "fa7ad57a84954f4121653152efed1a871d8adb20a1ea9086e3e849ce359d7d2e", + "blk.30.attn_norm.weight": "91a697aca1e42af54f806a20211031c3369e8d0bd58df1b0147fe24954e1f5a4", + "blk.30.attn_output.weight": "36063fcf766c89ac75be56f688cc63cefe5f2c733fbf4378ea9956ad386fa148", + "blk.30.attn_qkv.weight": "2cacd1161f1121a2c0b979930134f4666f73fb8d7237b3b0659ae091b15955a6", + "blk.30.ffn_down.weight": "9f3fcb6217100595850c05dc98f9ab2a263afdb6ab28df2fcb08aeff512057d7", + 
"blk.30.ffn_norm.weight": "6c600bc1fc7de39d4f8917b81fc7d1d5ed2a9b56492234c13a4bd6028c30d880", + "blk.30.ffn_up.weight": "73cabd1bb011956b2689ea3338bb76642ef3a57c197377d666d2ab5f56317668", + "blk.31.attn_norm.weight": "72d3e1cc771380645fa75a899858c95f39857a4f3f1ed60fe1578df383b8bc53", + "blk.31.attn_output.weight": "40089cdd29994dc19a1d89fa15902a89cfeca3540f12dc9bf4d00ef82506e456", + "blk.31.attn_qkv.weight": "1d0bb40e9258071ae14290a53c619a8e331dda07354d2a02ef45766c029ae5e4", + "blk.31.ffn_down.weight": "8defa0e06335b793fa8be03883f0a322d6c5b33f52c69c943c35c60d16e42c0a", + "blk.31.ffn_norm.weight": "33c55d9d0c496ccfb130361fe131649346e098abaaac39c0519507e5d846721d", + "blk.31.ffn_up.weight": "599f6503f61c692c1f82001973d35119f9688db5e6be9d9c298411491c93f09b", + "output.weight": "14b8dc662bfa3308ebb2e102c562d8e52c15670e538f20f3216a9c310ca9dd41", + "output_norm.weight": "7f2294ba94ce65681df6c7ddd8698799199b9d77dc83c10bdad5c3999f0fdb82", + "rope_factors_long.weight": "e34d378664e354652c38f47d10dafb0498ccc2fb042d39ff7fef768146fff22b", + "rope_factors_short.weight": "9379146a4988f373d362fe47b06c75e7fe7c54aa4dc9558758df79b7a87471fd", + "token_embd.weight": "19a03c1fb5ac0baee93b0a7d8b0f26e9a9b011e229b694afc50ebfc13d84f8bf" +} diff --git a/convert/testdata/all-MiniLM-L6-v2.json b/convert/testdata/all-MiniLM-L6-v2.json new file mode 100644 index 00000000..15c8f039 --- /dev/null +++ b/convert/testdata/all-MiniLM-L6-v2.json @@ -0,0 +1,124 @@ +{ + "general.architecture": "bert", + "general.file_type": "1", + "general.quantization_version": "2", + "bert.attention.causal": "false", + "bert.attention.head_count": "12", + "bert.attention.layer_norm_epsilon": "1e-12", + "bert.block_count": "6", + "bert.context_length": "512", + "bert.embedding_length": "384", + "bert.feed_forward_length": "1536", + "bert.pooling_type": "1", + "tokenizer.ggml.model": "bert", + "tokenizer.ggml.padding_token_id": "0", + "tokenizer.ggml.unknown_token_id": "100", + "tokenizer.ggml.cls_token_id": "101", + "tokenizer.ggml.seperator_token_id": "102", + "tokenizer.ggml.mask_token_id": "103", + "tokenizer.ggml.token_type_count": "2", + "tokenizer.ggml.scores": "6db964fe67338aca57790481a390121ff3dd643eebe49f7dd308029ad99abb6f", + "tokenizer.ggml.token_type": "98d247c5404b6b18f05f133b92dd56edf6efefefac326794b00d7b351f6c5aa1", + "tokenizer.ggml.tokens": "9efe405e229a45ff9916f54c475d151d2200cd2ab0006f347abfb069cf096c86", + "token_embd.weight": "8c1ee80a9ea4f65aa385ba30112010068af3d209bebc6e149d3d4589c2cd0a5a", + "position_embd.weight": "6c516f0b1c4e2388ab90394dd80ad69e4e4509b890982fc3408108ae66210eb6", + "token_types.weight": "f879f8e422ed211948f28b560d3c5e17aae7993f063b51196a28cf5c0fb3da21", + "token_embd_norm.weight": "75076e095d717aab96f8b6beeee503c27940d9a76f2b891a0e3de72f8a6043e4", + "token_embd_norm.bias": "298735285ffe944e1bf03e5d35c7280326b85cf121bde9874f1af5dc51ab939d", + "blk.0.attn_q.weight": "ab0923ce4c1549175112dcdfcc860fe30137f991e03ea6857fb5993670adaf6c", + "blk.0.attn_q.bias": "a3ec29551dabf976e1d34256b8ab5ab7b758f3ed9742c3cafdbd984d5441df62", + "blk.0.attn_k.weight": "4c1038a6d035c3e9ffed7fa672b614627814752503755fbad0cfb76a41ad71ba", + "blk.0.attn_k.bias": "e0363930eb588d91816aa3d230bb03b6e2551c165117b80b8d60397413819ef9", + "blk.0.attn_v.weight": "425e2e53e3f00ce98d29c3e6a161eb55d3e6ae0d96fdb9f6242d1c4fd6eef4b3", + "blk.0.attn_v.bias": "6579173a1e65ee124fbd0bd53cbdca4225515b4f2c5f18fb1bfd000f5978f9bb", + "blk.0.attn_output.weight": "a6d70a08cd7164de5d12af65d86d657c3db35aaecde778b2b3fda9193c4c9802", + 
"blk.0.attn_output.bias": "2b8d12c4f9a9c5bfaa29c597839568f6e0525cb41eeaf64ddeb6bd84dfeb9701", + "blk.0.attn_output_norm.weight": "bbe6e502a473228b525aeed26cc31b7db123ad63bdc5a6eebac6ea70b8b51d62", + "blk.0.attn_output_norm.bias": "36eaacaf0007c5c62daea97aab0115390c0682914f78482e37eb76885f4b7a50", + "blk.0.ffn_up.weight": "24654561c76ce387d125759ba843f06b904ef721fcceaeff6ccc62180a48e874", + "blk.0.ffn_up.bias": "fd3f0126aa1d95768fa60eb6f4ab8a2763cfcb7e5405f35b92353031d86f4d34", + "blk.0.ffn_down.weight": "97a829763a6a5bf3329ceb4d39c424ba4787d61653a5b0bbd1f84782e4d4e0ca", + "blk.0.ffn_down.bias": "7aa980c30ae8b4ee7f69df28808dbf5c431f56ccc4a80340f644a0419f16c054", + "blk.0.layer_output_norm.weight": "ef30dad4c2a083ae1ff5039a2a6cda60ecc89bf1e486a6f8c0d15f50589603f8", + "blk.0.layer_output_norm.bias": "8b1b77e67568b1bce43fc476de1b177c53ff688d66beb66995e8eb3dc290da8a", + "blk.1.attn_q.weight": "284331622a1f6f9b87ccee4f652bd66a394ca493c4d93be4d1844e4f6159ad10", + "blk.1.attn_q.bias": "e24ebd4860330e08f6bfdd077a82db0bee33f4c8846cf1db26327a34754c7069", + "blk.1.attn_k.weight": "729dd0d555544b5bd0f7580b3c8b384256b974605f0e7487b95f295aa032997d", + "blk.1.attn_k.bias": "2aa51a828a858f35473f54477583fea54ce2ccc34ea60fbd1d228fbe9bca827f", + "blk.1.attn_v.weight": "6be304671cc311d5ca5c103f2b51467ee800c589bc5b8101e09ff5aed1f68c21", + "blk.1.attn_v.bias": "43bcbab78a8819e07f723bc9e5b737b71e87a7594f15234e882b63e327a64199", + "blk.1.attn_output.weight": "15ec8a1a12b26c9976445308a09f748ab0e4bef0f583d13ab08c3129f8738d73", + "blk.1.attn_output.bias": "dac2146f4baa6ed16f6c0dc7443831fb7ec79bedcceafd80d1a4b628a1bb072d", + "blk.1.attn_output_norm.weight": "d2151eb33bffac536787a4c9a5d2b31c7a80b17c4611877842a3cce2cd6e98d8", + "blk.1.attn_output_norm.bias": "31e1b779716dafb855d2cf5631ee168a0ccf372eb9c6ea6091f66fa97a9b9d2d", + "blk.1.ffn_up.weight": "a57547fc3fc3b77406f5cdcb0c87af9bc184701f175c39c1f35297826fce3cc7", + "blk.1.ffn_up.bias": "123be6d541d086202913c75d878c54d59a749f3af7b58f7ef9eb9e7c62a24c9a", + "blk.1.ffn_down.weight": "cfdb79788377e5cbded8790cd41b9e66c397ecab75474071fcd7cf32d30f9613", + "blk.1.ffn_down.bias": "bcb58315519a573097960891c9ae41cf4c685ab78c3e0e77471471758a7eae88", + "blk.1.layer_output_norm.weight": "819b554271452bfb1d84c2603b90377b2e41a0ac1e3aa8b417ccf9dce63375bd", + "blk.1.layer_output_norm.bias": "47a3433ac27f5ce8947fb38dd491f3706df4ef6adb0ddf74612bf0f54b19e164", + "blk.2.attn_q.weight": "1557a9ea852b1880551f7290e00aded4f35e6c4180fdcbed1b0039bf805f639e", + "blk.2.attn_q.bias": "c3bfe5f3066f655fd36b055530997b59ff33ef013563aaeb3cb8ff07dabd59a9", + "blk.2.attn_k.weight": "cfd08eb69c61ae2f9f14f9b7ff5c5394ca264b1a9f3d48156677f90dd1766289", + "blk.2.attn_k.bias": "9b839bc0e79974a0b3f5d1895972bc6f5c9a1bc16052e1af786e6a530758152d", + "blk.2.attn_v.weight": "02b26b1208480eaeeb00e7b4cf8b690006ca14759357fc44ed4a2a8924ead993", + "blk.2.attn_v.bias": "e7e6f0089fded1659a867ab736c220d9653ea7da6b1b94baf5c8d30a748b63ab", + "blk.2.attn_output.weight": "a1db121c7d33806b349cadd050300a57db49fdc91224fd07c9ac43bf4299dc79", + "blk.2.attn_output.bias": "7675128b6a92555cd955c820311e91e9417d31f48848f45d047b4100c62148b3", + "blk.2.attn_output_norm.weight": "5b4595e0fbcba67a700c4331adf746d2fba3546364a4db5607ae241947bb1a21", + "blk.2.attn_output_norm.bias": "7b8e16826ea30e5a2ba0b02e0095a901775981a296e98819625320e983060d08", + "blk.2.ffn_up.weight": "a0d815d946ac07a65095c4ae4df77b818845e6d97795c7d82f55e689d944db59", + "blk.2.ffn_up.bias": "ce37c0a4174d6bf773ded7bd016ede627ad3bdb8bc99b9992a18dc8e8898f252", + 
"blk.2.ffn_down.weight": "f6231d2a25426fbd45b9f1160aa484220eb227ceef0348c4a6a6de890606e5ef", + "blk.2.ffn_down.bias": "429e00556e8dc63a785238b309b9d83738500c1ef6d736fe6526ad88ea496d27", + "blk.2.layer_output_norm.weight": "651457a573adf3f7dd9ee5dfe1c8e89389e94443993aab77ec6a0b05aa621e35", + "blk.2.layer_output_norm.bias": "41fbbeda7fd89b0cef5f945ae44011c316982390401d6f75ba8c6d365e185247", + "blk.3.attn_q.weight": "95a43f32949d2cb8d22815bb27a44abfc6665ba96221af817dfe058cb6ca72c6", + "blk.3.attn_q.bias": "f4e34385e75d8108b6b3bd336106e2133a8c9be0cc343dfe5dc48c32a823c7cb", + "blk.3.attn_k.weight": "6b892da6a17d4d3265265a15f695864a31813ee8c8e710ae9bc9e1adbc6c9a18", + "blk.3.attn_k.bias": "40b8067b641a56014cee42548240aa8930820958b1933004892b5f04fbaef39e", + "blk.3.attn_v.weight": "9fcd5922319dd2a461082a5ce040c1dfe65d87d70ca6547dd0b46eeecc3eeb2b", + "blk.3.attn_v.bias": "b528c56212e66931fdbe267ac327a9c2f87cd03baff3ea719e30afe681da15f1", + "blk.3.attn_output.weight": "e3b178c1b03981e75510e0d277af23ea59cc404b5394e61bd32291825719b502", + "blk.3.attn_output.bias": "712c84d39a6a5a9c06a09da8fd9939ba0d5525524a4bba61ea4de09b48f45cae", + "blk.3.attn_output_norm.weight": "d1ffac88e675592ff72f8a617be32b4a381d443b2f8f2645dbe44a1e5745aac0", + "blk.3.attn_output_norm.bias": "ea31a1c73146234c50e0e43f485c458413714867b8e2703af66482f7db2d6c40", + "blk.3.ffn_up.weight": "4ef4f3b9a1ea6ab2ef2eb6e8b008e06a44790d099d97482a05a51e39a29afac0", + "blk.3.ffn_up.bias": "06a4296dda16f452675c51f108079fe7722552d6521c737d97734943818b9a2b", + "blk.3.ffn_down.weight": "f114b2bebe392c7d80433bb880c6730293aa4561b0b0370dcdaf7472daebd847", + "blk.3.ffn_down.bias": "2c8e67831d28a3bf613fc7912ae3259b63d72abcaf4d30efd8800758400158de", + "blk.3.layer_output_norm.weight": "a1dfeb7b5a51dd56447312ca41e2ad2f361a3ea12ddc355127f5f4219fb0a482", + "blk.3.layer_output_norm.bias": "1ed630021b25c6c6fc93fd32988b9907df966d4982a93081f639aac3044618ab", + "blk.4.attn_q.weight": "b5fae4c1f9a5f33a2a2e816ac0c01c25f422e4efdd59ef1ed93da2610e5370fc", + "blk.4.attn_q.bias": "c2e376524ea98ac3b10d9eee19ecb1b1e261fa5149efe0232844c923dfb428fb", + "blk.4.attn_k.weight": "a4632f5ebf9321d9d08f9112a4e5dda2efe5671df4a4e67fee24845f5b14af16", + "blk.4.attn_k.bias": "a9a02ffb8b8b4f6dfe487a7e0341f1d5318c9d2b793a688f34cb1b22fc66ef60", + "blk.4.attn_v.weight": "10ad8deb81d9fa093b1e5c0f24ea82aa7df43e6aca49e260fcbea56eab8cc86a", + "blk.4.attn_v.bias": "7326813e181e021130bd33ac136293fcffccce2d1d8cb59041e5b13a8cceacf6", + "blk.4.attn_output.weight": "c92573088c7437c2b3cda51490e152c27fb19e5468df591eabba5a49d5398d44", + "blk.4.attn_output.bias": "14e10b419e5859af1eb685af5c330aee67048cd704dcead9217840c6f5393222", + "blk.4.attn_output_norm.weight": "02b6831c0e0fb0edbc579a92812a1dd972cb15d14fcd382d4427c5a7b300ac44", + "blk.4.attn_output_norm.bias": "7eed5cd503bb6bb6ceb1bc8b07cc077903a4f14fb8b9d6cdf39644815ecf1374", + "blk.4.ffn_up.weight": "8d0c91d62e74d6431321116a37cf3339e630bd50ba164d3304fc4fe8dd831223", + "blk.4.ffn_up.bias": "d325f07f73c005a273c484c7be8e7abb4d6e8a5c4fd093f5869133b97629d017", + "blk.4.ffn_down.weight": "7ba7bd81143f40537b84f938e403e19f30e4928625eb371de052b9025beb4d21", + "blk.4.ffn_down.bias": "2853d9c2a75288214a4bf4907dc19d04d01926f4913d302b1aa7bdbfcce0f7a1", + "blk.4.layer_output_norm.weight": "a4ed1885fa77b90fed5300c355ef0aa0c876a8c747151d9d790939d464d57d4f", + "blk.4.layer_output_norm.bias": "62142a81e813a9e636333b2b805d6bc3b17c5e7cd4b15adce1ada6bc9a32563c", + "blk.5.attn_q.weight": "afc1dff080a72c3daad01384b1448d476aaf789871017c8ff8e144788887995d", + 
"blk.5.attn_q.bias": "748a820371c1d4f872c84545b36358d239c35bf6c99e2812c237d88c3292763b", + "blk.5.attn_k.weight": "59e30c1ed8acd2cbb01de5f62e7804015b9ecf98ba157d98cab016344639eda5", + "blk.5.attn_k.bias": "f839520078f9e589496e982e86d0126c7aa14196047339abffcf49a696229f77", + "blk.5.attn_v.weight": "3e21fb874e21b90308e1f46af034a3c32d3eba1628d62ae5f2246d6af5818923", + "blk.5.attn_v.bias": "5cd4852bf95c1444d10d756750f6bf49f842c0b39e9953c7f408bb67c325ac8c", + "blk.5.attn_output.weight": "636ce6a7752895f204b9d01ba0aedd9a294f908b42f372c22a16d9dd590d7471", + "blk.5.attn_output.bias": "82d924d4b0d2b94f2bbff91619216d6967a3541ce9b1531a6a60457a67b5d219", + "blk.5.attn_output_norm.weight": "5e7bd0a8d3396080f3360d7c4700bf094a06216431bd014c4479eef72ecf4271", + "blk.5.attn_output_norm.bias": "66c6de5edda5466d029c6753780be81ccd4218bf8bc00680000e0f06856ab712", + "blk.5.ffn_up.weight": "5bbf6e7ea380e216e33f8bee06d25f2265359d3876a300e92bc6e41d48e33430", + "blk.5.ffn_up.bias": "9d795388bb36fb33ad3a37fea3ccb4937838e02800a608fb47d363cd06b47370", + "blk.5.ffn_down.weight": "2fd628974e7f075479dd227b46fbd48ae8d3ca34d735b36f391ac06410730368", + "blk.5.ffn_down.bias": "cd213ba9eaa75fa541648097fbe9c96e58077e6c3ad6ad2fb1f21f8350f44291", + "blk.5.layer_output_norm.weight": "159a9df41d15b7022d136f86a2a2631c4635f9816e957472217077b522bcf52a", + "blk.5.layer_output_norm.bias": "24c1f27ffd1eb4e5be7e3a2909943e6f0980635d761fa1efdd0c19645da23766" +} diff --git a/convert/testdata/gemma-2-9b-it.json b/convert/testdata/gemma-2-9b-it.json new file mode 100644 index 00000000..90cdbee4 --- /dev/null +++ b/convert/testdata/gemma-2-9b-it.json @@ -0,0 +1,6 @@ +{ + "general.architecture": "gemma2", + "gemma2.attention.sliding_window": "4096", + "gemma2.attn_logit_softcapping": "50", + "gemma2.final_logit_softcapping": "30" +} diff --git a/convert/tokenizer.go b/convert/tokenizer.go index 0d42a6d8..653df6d2 100644 --- a/convert/tokenizer.go +++ b/convert/tokenizer.go @@ -1,7 +1,6 @@ package convert import ( - "cmp" "crypto/sha256" "encoding/hex" "encoding/json" @@ -11,6 +10,8 @@ import ( "log/slog" "os" "slices" + + "golang.org/x/exp/maps" ) const ( @@ -184,32 +185,32 @@ func parseVocabularyFromTokenizer(fsys fs.FS) (*Vocabulary, error) { return nil, err } - var tokens []token + tokens := make(map[int]token, len(t.Model.Vocab)) for k, v := range t.Model.Vocab { - tokens = append(tokens, token{ + tokens[v] = token{ ID: v, Content: k, - }) + } } - for _, t := range t.AddedTokens { - t.UserDefined = true - tokens = append(tokens, t) + for _, token := range t.AddedTokens { + token.UserDefined = true + tokens[token.ID] = token } - slices.SortFunc(tokens, func(i, j token) int { - return cmp.Compare(i.ID, j.ID) - }) + keys := maps.Keys(tokens) + slices.Sort(keys) v := Vocabulary{Model: "gpt2"} - for _, t := range tokens { - v.Tokens = append(v.Tokens, t.Content) - v.Scores = append(v.Scores, float32(t.ID)) + for _, k := range keys { + token := tokens[k] + v.Tokens = append(v.Tokens, token.Content) + v.Scores = append(v.Scores, float32(token.ID)) switch { - case t.Special: + case token.Special: v.Types = append(v.Types, tokenTypeControl) - case t.UserDefined: + case token.UserDefined: v.Types = append(v.Types, tokenTypeUserDefined) default: v.Types = append(v.Types, tokenTypeNormal) diff --git a/convert/tokenizer_spm.go b/convert/tokenizer_spm.go index babf702c..5e506087 100644 --- a/convert/tokenizer_spm.go +++ b/convert/tokenizer_spm.go @@ -15,6 +15,11 @@ import ( ) func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) { + ast, err := 
parseAdditionalSpecialTokens(fsys) + if err != nil { + return nil, err + } + bts, err := fs.ReadFile(fsys, "tokenizer.model") if err != nil { return nil, err @@ -37,7 +42,12 @@ func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) { sentencepiece.ModelProto_SentencePiece_BYTE: v.Types = append(v.Types, int32(t)) default: - v.Types = append(v.Types, int32(sentencepiece.ModelProto_SentencePiece_NORMAL)) + tt := int32(sentencepiece.ModelProto_SentencePiece_NORMAL) + if slices.Contains(ast, piece.GetPiece()) { + tt = int32(sentencepiece.ModelProto_SentencePiece_CONTROL) + } + + v.Types = append(v.Types, tt) } } @@ -81,3 +91,23 @@ func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) { return &v, nil } + +func parseAdditionalSpecialTokens(fsys fs.FS) ([]string, error) { + f, err := fsys.Open("special_tokens_map.json") + if errors.Is(err, os.ErrNotExist) { + return nil, nil + } else if err != nil { + return nil, err + } + defer f.Close() + + var m struct { + AdditionalSpecialTokens []string `json:"additional_special_tokens"` + } + + if err := json.NewDecoder(f).Decode(&m); err != nil { + return nil, err + } + + return m.AdditionalSpecialTokens, nil +} diff --git a/docs/api.md b/docs/api.md index c0202ef3..aed2b69f 100644 --- a/docs/api.md +++ b/docs/api.md @@ -669,7 +669,7 @@ curl http://localhost:11434/api/chat -d '{ ``` curl http://localhost:11434/api/chat -d '{ - "model": "mistral", + "model": "llama3.1", "messages": [ { "role": "user", @@ -708,7 +708,7 @@ curl http://localhost:11434/api/chat -d '{ ```json { - "model": "mistral:7b-instruct-v0.3-q4_K_M", + "model": "llama3.1", "created_at": "2024-07-22T20:33:28.123648Z", "message": { "role": "assistant", @@ -1175,7 +1175,10 @@ curl http://localhost:11434/api/embed -d '{ "embeddings": [[ 0.010071029, -0.0017594862, 0.05007221, 0.04692972, 0.054916814, 0.008599704, 0.105441414, -0.025878139, 0.12958129, 0.031952348 - ]] + ]], + "total_duration": 14143917, + "load_duration": 1019500, + "prompt_eval_count": 8 } ``` diff --git a/docs/import.md b/docs/import.md index f34f09ac..82ea9ba5 100644 --- a/docs/import.md +++ b/docs/import.md @@ -16,7 +16,9 @@ If the model being imported is one of these architectures, it can be imported di - LlamaForCausalLM - MistralForCausalLM + - MixtralForCausalLM - GemmaForCausalLM + - Phi3ForCausalLM ```dockerfile FROM /path/to/safetensors/directory diff --git a/docs/linux.md b/docs/linux.md index ec730656..d1d5892c 100644 --- a/docs/linux.md +++ b/docs/linux.md @@ -20,13 +20,12 @@ GPU. ## Manual install -### Download the `ollama` binary +### Download `ollama` -Ollama is distributed as a self-contained binary. 
Download it to a directory in your PATH: +Download and extract the Linux package: ```bash -sudo curl -L https://ollama.com/download/ollama-linux-amd64 -o /usr/bin/ollama -sudo chmod +x /usr/bin/ollama +curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz | sudo tar zx -C /usr ``` ### Adding Ollama as a startup service (recommended) @@ -96,8 +95,7 @@ curl -fsSL https://ollama.com/install.sh | sh Or by downloading the ollama binary: ```bash -sudo curl -L https://ollama.com/download/ollama-linux-amd64 -o /usr/bin/ollama -sudo chmod +x /usr/bin/ollama +curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz | sudo tar zx -C /usr ``` ## Installing specific versions diff --git a/docs/openai.md b/docs/openai.md index 7b3a3f31..75d2c595 100644 --- a/docs/openai.md +++ b/docs/openai.md @@ -182,7 +182,6 @@ curl http://localhost:11434/v1/embeddings \ - [x] Reproducible outputs - [x] Vision - [x] Tools (streaming support coming soon) -- [ ] Vision - [ ] Logprobs #### Supported request fields diff --git a/docs/template.md b/docs/template.md index f6ce06ba..1d7104de 100644 --- a/docs/template.md +++ b/docs/template.md @@ -112,15 +112,9 @@ Keep the following tips and best practices in mind when working with Go template ChatML is a popular template format. It can be used for models such as Databrick's DBRX, Intel's Neural Chat, and Microsoft's Orca 2. ```gotmpl -{{- if .System }}<|im_start|>system -{{ .System }}<|im_end|> -{{ end }} {{- range .Messages }}<|im_start|>{{ .Role }} {{ .Content }}<|im_end|> {{ end }}<|im_start|>assistant -{{ else }} -{{ if .System }}<|im_start|>system -{{ .System }}<|im_end|> ``` ### Example Tools diff --git a/envconfig/config.go b/envconfig/config.go index b82b773d..7e45a4f5 100644 --- a/envconfig/config.go +++ b/envconfig/config.go @@ -174,7 +174,7 @@ func RunnersDir() (p string) { defer func() { if p == "" { - slog.Error("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama_runners'") + slog.Error("unable to locate llm runner directory. 
Set OLLAMA_RUNNERS_DIR to the location of 'ollama/runners'")
 		}
 	}()
@@ -190,17 +190,17 @@ func RunnersDir() (p string) {
 	}
 
 	var paths []string
-	for _, root := range []string{filepath.Dir(exe), cwd} {
+	for _, root := range []string{filepath.Dir(exe), filepath.Join(filepath.Dir(exe), ".."), cwd} {
 		paths = append(paths,
 			root,
-			filepath.Join(root, "windows-"+runtime.GOARCH),
-			filepath.Join(root, "dist", "windows-"+runtime.GOARCH),
+			filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH),
+			filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH),
 		)
 	}
 
 	// Try a few variations to improve developer experience when building from source in the local tree
 	for _, path := range paths {
-		candidate := filepath.Join(path, "ollama_runners")
+		candidate := filepath.Join(path, "lib", "ollama", "runners")
 		if _, err := os.Stat(candidate); err == nil {
 			p = candidate
 			break
diff --git a/go.mod b/go.mod
index 2e0c6614..6e437c73 100644
--- a/go.mod
+++ b/go.mod
@@ -1,6 +1,6 @@
 module github.com/ollama/ollama
 
-go 1.22.0
+go 1.22.5
 
 require (
 	github.com/containerd/console v1.0.3
diff --git a/gpu/amd_common.go b/gpu/amd_common.go
index 2839cb7c..72d204f7 100644
--- a/gpu/amd_common.go
+++ b/gpu/amd_common.go
@@ -54,7 +54,7 @@ func commonAMDValidateLibDir() (string, error) {
 	// Installer payload location if we're running the installed binary
 	exe, err := os.Executable()
 	if err == nil {
-		rocmTargetDir := filepath.Join(filepath.Dir(exe), "rocm")
+		rocmTargetDir := filepath.Join(filepath.Dir(exe), "..", "lib", "ollama")
 		if rocmLibUsable(rocmTargetDir) {
 			slog.Debug("detected ROCM next to ollama executable " + rocmTargetDir)
 			return rocmTargetDir, nil
diff --git a/gpu/amd_windows.go b/gpu/amd_windows.go
index edabeb43..a0ae7c96 100644
--- a/gpu/amd_windows.go
+++ b/gpu/amd_windows.go
@@ -153,7 +153,7 @@ func AMDValidateLibDir() (string, error) {
 	// Installer payload (if we're running from some other location)
 	localAppData := os.Getenv("LOCALAPPDATA")
 	appDir := filepath.Join(localAppData, "Programs", "Ollama")
-	rocmTargetDir := filepath.Join(appDir, "rocm")
+	rocmTargetDir := filepath.Join(appDir, "..", "lib", "ollama")
 	if rocmLibUsable(rocmTargetDir) {
 		slog.Debug("detected ollama installed ROCm at " + rocmTargetDir)
 		return rocmTargetDir, nil
diff --git a/gpu/assets.go b/gpu/assets.go
index a35b6630..6d62d0dc 100644
--- a/gpu/assets.go
+++ b/gpu/assets.go
@@ -49,13 +49,9 @@ func PayloadsDir() (string, error) {
 		}
 
 		// Track our pid so we can clean up orphaned tmpdirs
-		pidFilePath := filepath.Join(tmpDir, "ollama.pid")
-		pidFile, err := os.OpenFile(pidFilePath, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, os.ModePerm)
-		if err != nil {
-			return "", err
-		}
-		if _, err := pidFile.Write([]byte(strconv.Itoa(os.Getpid()))); err != nil {
-			return "", err
+		n := filepath.Join(tmpDir, "ollama.pid")
+		if err := os.WriteFile(n, []byte(strconv.Itoa(os.Getpid())), 0o644); err != nil {
+			return "", fmt.Errorf("failed to write pid file %s: %w", n, err)
 		}
 
 		// We create a distinct subdirectory for payloads within the tmpdir
@@ -67,37 +63,44 @@ func PayloadsDir() (string, error) {
 
 // Best effort to clean up prior tmpdirs
 func cleanupTmpDirs() {
-	dirs, err := filepath.Glob(filepath.Join(os.TempDir(), "ollama*"))
+	matches, err := filepath.Glob(filepath.Join(os.TempDir(), "ollama*", "ollama.pid"))
 	if err != nil {
 		return
 	}
-	for _, d := range dirs {
-		info, err := os.Stat(d)
-		if err != nil || !info.IsDir() {
+
+	for _, match := range matches {
+		raw, err := os.ReadFile(match)
+		if errors.Is(err, os.ErrNotExist) {
+			slog.Debug("not an ollama runtime
directory, skipping", "path", match) continue - } - raw, err := os.ReadFile(filepath.Join(d, "ollama.pid")) - if err != nil { - slog.Warn("failed to read ollama.pid", "path", d, "error", err) - // No pid, ignore this tmpdir + } else if err != nil { + slog.Warn("could not read ollama.pid, skipping", "path", match, "error", err) continue } pid, err := strconv.Atoi(string(raw)) if err != nil { - slog.Warn("failed to parse pid", "path", d, "error", err) + slog.Warn("invalid pid, skipping", "path", match, "error", err) continue } - proc, err := os.FindProcess(pid) - if err == nil && !errors.Is(proc.Signal(syscall.Signal(0)), os.ErrProcessDone) { - slog.Warn("found running ollama", "pid", pid, "path", d) - // Another running ollama, ignore this tmpdir + p, err := os.FindProcess(pid) + if err == nil && !errors.Is(p.Signal(syscall.Signal(0)), os.ErrProcessDone) { + slog.Warn("process still running, skipping", "pid", pid, "path", match) continue } - if err := os.Remove(d); err != nil { - slog.Warn("unable to cleanup stale tmpdir", "path", d, "error", err) + if err := os.Remove(match); err != nil { + slog.Warn("could not cleanup stale pidfile", "path", match, "error", err) + } + + runners := filepath.Join(filepath.Dir(match), "runners") + if err := os.RemoveAll(runners); err != nil { + slog.Warn("could not cleanup stale runners", "path", runners, "error", err) + } + + if err := os.Remove(filepath.Dir(match)); err != nil { + slog.Warn("could not cleanup stale tmpdir", "path", filepath.Dir(match), "error", err) } } } diff --git a/gpu/cpu_common.go b/gpu/cpu_common.go index 63e88f25..34edcdc5 100644 --- a/gpu/cpu_common.go +++ b/gpu/cpu_common.go @@ -1,6 +1,11 @@ package gpu import ( + "os" + "path/filepath" + "runtime" + "strings" + "golang.org/x/sys/cpu" ) @@ -14,3 +19,19 @@ func GetCPUCapability() CPUCapability { // else LCD return CPUCapabilityNone } + +func IsNUMA() bool { + if runtime.GOOS != "linux" { + // numa support in llama.cpp is linux only + return false + } + ids := map[string]interface{}{} + packageIds, _ := filepath.Glob("/sys/devices/system/cpu/cpu*/topology/physical_package_id") + for _, packageId := range packageIds { + id, err := os.ReadFile(packageId) + if err == nil { + ids[strings.TrimSpace(string(id))] = struct{}{} + } + } + return len(ids) > 1 +} diff --git a/gpu/cuda_common.go b/gpu/cuda_common.go index c90a644c..827cc9b4 100644 --- a/gpu/cuda_common.go +++ b/gpu/cuda_common.go @@ -4,9 +4,17 @@ package gpu import ( "log/slog" + "os" + "regexp" + "runtime" + "strconv" "strings" ) +// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed. +// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices. +var CudaTegra string = os.Getenv("JETSON_JETPACK") + func cudaGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) { ids := []string{} for _, info := range gpuInfo { @@ -19,3 +27,38 @@ func cudaGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) { } return "CUDA_VISIBLE_DEVICES", strings.Join(ids, ",") } + +func cudaVariant(gpuInfo CudaGPUInfo) string { + if runtime.GOARCH == "arm64" && runtime.GOOS == "linux" { + if CudaTegra != "" { + ver := strings.Split(CudaTegra, ".") + if len(ver) > 0 { + return "jetpack" + ver[0] + } + } else if data, err := os.ReadFile("/etc/nv_tegra_release"); err == nil { + r := regexp.MustCompile(` R(\d+) `) + m := r.FindSubmatch(data) + if len(m) != 2 { + slog.Info("Unexpected format for /etc/nv_tegra_release. 
Set JETSON_JETPACK to select version") + } else { + if l4t, err := strconv.Atoi(string(m[1])); err == nil { + // Note: mapping from L4t -> JP is inconsistent (can't just subtract 30) + // https://developer.nvidia.com/embedded/jetpack-archive + switch l4t { + case 35: + return "jetpack5" + case 36: + return "jetpack6" + default: + slog.Info("unsupported L4T version", "nv_tegra_release", string(data)) + } + } + } + } + } + + if gpuInfo.computeMajor < 6 || gpuInfo.DriverMajor < 12 { + return "v11" + } + return "v12" +} diff --git a/gpu/gpu.go b/gpu/gpu.go index 7ae8fbec..72d237a6 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -64,10 +64,6 @@ var RocmComputeMin = 9 // TODO find a better way to detect iGPU instead of minimum memory const IGPUMemLimit = 1 * format.GibiByte // 512G is what they typically report, so anything less than 1G must be iGPU -// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed. -// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices. -var CudaTegra string = os.Getenv("JETSON_JETPACK") - // Note: gpuMutex must already be held func initCudaHandles() *cudaHandles { // TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing @@ -215,7 +211,7 @@ func GetGPUInfo() GpuInfoList { GpuInfo: GpuInfo{ memInfo: mem, Library: "cpu", - Variant: cpuCapability, + Variant: cpuCapability.String(), ID: "0", }, }, @@ -229,11 +225,7 @@ func GetGPUInfo() GpuInfoList { return GpuInfoList{cpus[0].GpuInfo} } - // On windows we bundle the nvidia library one level above the runner dir - depPath := "" - if runtime.GOOS == "windows" && envconfig.RunnersDir() != "" { - depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir()), "cuda") - } + depPath := LibraryDir() // Load ALL libraries cHandles = initCudaHandles() @@ -269,11 +261,23 @@ func GetGPUInfo() GpuInfoList { gpuInfo.FreeMemory = uint64(memInfo.free) gpuInfo.ID = C.GoString(&memInfo.gpu_id[0]) gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor) + gpuInfo.computeMajor = int(memInfo.major) + gpuInfo.computeMinor = int(memInfo.minor) gpuInfo.MinimumMemory = cudaMinimumMemory - gpuInfo.DependencyPath = depPath + variant := cudaVariant(gpuInfo) + if depPath != "" { + gpuInfo.DependencyPath = depPath + // Check for variant specific directory + if variant != "" { + if _, err := os.Stat(filepath.Join(depPath, "cuda_"+variant)); err == nil { + gpuInfo.DependencyPath = filepath.Join(depPath, "cuda_"+variant) + } + } + } gpuInfo.Name = C.GoString(&memInfo.gpu_name[0]) gpuInfo.DriverMajor = driverMajor gpuInfo.DriverMinor = driverMinor + gpuInfo.Variant = variant // query the management library as well so we can record any skew between the two // which represents overhead on the GPU we must set aside on subsequent updates @@ -305,38 +309,34 @@ func GetGPUInfo() GpuInfoList { // Intel if envconfig.IntelGPU() { oHandles = initOneAPIHandles() - // On windows we bundle the oneapi library one level above the runner dir - depPath = "" - if runtime.GOOS == "windows" && envconfig.RunnersDir() != "" { - depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir()), "oneapi") - } - - for d := range oHandles.oneapi.num_drivers { - if oHandles.oneapi == nil { - // shouldn't happen - slog.Warn("nil oneapi handle with driver count", "count", int(oHandles.oneapi.num_drivers)) - continue - } - devCount := C.oneapi_get_device_count(*oHandles.oneapi, C.int(d)) - for i := range devCount { - gpuInfo := OneapiGPUInfo{ - GpuInfo: GpuInfo{ - Library: 
"oneapi", - }, - driverIndex: int(d), - gpuIndex: int(i), + if oHandles != nil && oHandles.oneapi != nil { + for d := range oHandles.oneapi.num_drivers { + if oHandles.oneapi == nil { + // shouldn't happen + slog.Warn("nil oneapi handle with driver count", "count", int(oHandles.oneapi.num_drivers)) + continue + } + devCount := C.oneapi_get_device_count(*oHandles.oneapi, C.int(d)) + for i := range devCount { + gpuInfo := OneapiGPUInfo{ + GpuInfo: GpuInfo{ + Library: "oneapi", + }, + driverIndex: int(d), + gpuIndex: int(i), + } + // TODO - split bootstrapping from updating free memory + C.oneapi_check_vram(*oHandles.oneapi, C.int(d), i, &memInfo) + // TODO - convert this to MinimumMemory based on testing... + var totalFreeMem float64 = float64(memInfo.free) * 0.95 // work-around: leave some reserve vram for mkl lib used in ggml-sycl backend. + memInfo.free = C.uint64_t(totalFreeMem) + gpuInfo.TotalMemory = uint64(memInfo.total) + gpuInfo.FreeMemory = uint64(memInfo.free) + gpuInfo.ID = C.GoString(&memInfo.gpu_id[0]) + gpuInfo.Name = C.GoString(&memInfo.gpu_name[0]) + gpuInfo.DependencyPath = depPath + oneapiGPUs = append(oneapiGPUs, gpuInfo) } - // TODO - split bootstrapping from updating free memory - C.oneapi_check_vram(*oHandles.oneapi, C.int(d), i, &memInfo) - // TODO - convert this to MinimumMemory based on testing... - var totalFreeMem float64 = float64(memInfo.free) * 0.95 // work-around: leave some reserve vram for mkl lib used in ggml-sycl backend. - memInfo.free = C.uint64_t(totalFreeMem) - gpuInfo.TotalMemory = uint64(memInfo.total) - gpuInfo.FreeMemory = uint64(memInfo.free) - gpuInfo.ID = C.GoString(&memInfo.gpu_id[0]) - gpuInfo.Name = C.GoString(&memInfo.gpu_name[0]) - gpuInfo.DependencyPath = depPath - oneapiGPUs = append(oneapiGPUs, gpuInfo) } } } @@ -464,10 +464,12 @@ func GetGPUInfo() GpuInfoList { func FindGPULibs(baseLibName string, defaultPatterns []string) []string { // Multiple GPU libraries may exist, and some may not work, so keep trying until we exhaust them var ldPaths []string - var patterns []string gpuLibPaths := []string{} slog.Debug("Searching for GPU library", "name", baseLibName) + // Start with our bundled libraries + patterns := []string{filepath.Join(LibraryDir(), baseLibName)} + switch runtime.GOOS { case "windows": ldPaths = strings.Split(os.Getenv("PATH"), ";") @@ -476,13 +478,14 @@ func FindGPULibs(baseLibName string, defaultPatterns []string) []string { default: return gpuLibPaths } - // Start with whatever we find in the PATH/LD_LIBRARY_PATH + + // Then with whatever we find in the PATH/LD_LIBRARY_PATH for _, ldPath := range ldPaths { d, err := filepath.Abs(ldPath) if err != nil { continue } - patterns = append(patterns, filepath.Join(d, baseLibName+"*")) + patterns = append(patterns, filepath.Join(d, baseLibName)) } patterns = append(patterns, defaultPatterns...) 
slog.Debug("gpu library search", "globs", patterns) @@ -638,3 +641,31 @@ func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) { return "", "" } } + +func LibraryDir() string { + // On Windows/linux we bundle the dependencies at the same level as the executable + appExe, err := os.Executable() + if err != nil { + slog.Warn("failed to lookup executable path", "error", err) + } + cwd, err := os.Getwd() + if err != nil { + slog.Warn("failed to lookup working directory", "error", err) + } + // Scan for any of our dependeices, and pick first match + for _, root := range []string{filepath.Dir(appExe), filepath.Join(filepath.Dir(appExe), ".."), cwd} { + libDep := filepath.Join("lib", "ollama") + if _, err := os.Stat(filepath.Join(root, libDep)); err == nil { + return filepath.Join(root, libDep) + } + // Developer mode, local build + if _, err := os.Stat(filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH, libDep)); err == nil { + return filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH, libDep) + } + if _, err := os.Stat(filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH, libDep)); err == nil { + return filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH, libDep) + } + } + slog.Warn("unable to locate gpu dependency libraries") + return "" +} diff --git a/gpu/gpu_darwin.go b/gpu/gpu_darwin.go index 9d9fd84e..417b48df 100644 --- a/gpu/gpu_darwin.go +++ b/gpu/gpu_darwin.go @@ -25,7 +25,7 @@ func GetGPUInfo() GpuInfoList { return []GpuInfo{ { Library: "cpu", - Variant: GetCPUCapability(), + Variant: GetCPUCapability().String(), memInfo: mem, }, } @@ -48,7 +48,7 @@ func GetCPUInfo() GpuInfoList { return []GpuInfo{ { Library: "cpu", - Variant: GetCPUCapability(), + Variant: GetCPUCapability().String(), memInfo: mem, }, } diff --git a/gpu/gpu_linux.go b/gpu/gpu_linux.go index d6d2675c..d4d20bc4 100644 --- a/gpu/gpu_linux.go +++ b/gpu/gpu_linux.go @@ -47,7 +47,7 @@ var ( CudartMgmtName = "libcudart.so*" NvcudaMgmtName = "libcuda.so*" NvmlMgmtName = "" // not currently wired on linux - OneapiMgmtName = "libze_intel_gpu.so" + OneapiMgmtName = "libze_intel_gpu.so*" ) func GetCPUMem() (memInfo, error) { diff --git a/gpu/types.go b/gpu/types.go index 8d22b06b..4cbbeb84 100644 --- a/gpu/types.go +++ b/gpu/types.go @@ -19,7 +19,7 @@ type GpuInfo struct { Library string `json:"library,omitempty"` // Optional variant to select (e.g. 
versions, cpu feature flags) - Variant CPUCapability `json:"variant"` + Variant string `json:"variant"` // MinimumMemory represents the minimum memory required to use the GPU MinimumMemory uint64 `json:"-"` @@ -53,8 +53,10 @@ type CPUInfo struct { type CudaGPUInfo struct { GpuInfo - OSOverhead uint64 // Memory overhead between the driver library and management library - index int //nolint:unused,nolintlint + OSOverhead uint64 // Memory overhead between the driver library and management library + index int //nolint:unused,nolintlint + computeMajor int //nolint:unused,nolintlint + computeMinor int //nolint:unused,nolintlint } type CudaGPUInfoList []CudaGPUInfo @@ -81,8 +83,8 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList { for _, info := range l { found := false requested := info.Library - if info.Variant != CPUCapabilityNone { - requested += "_" + info.Variant.String() + if info.Variant != CPUCapabilityNone.String() { + requested += "_" + info.Variant } for i, lib := range libs { if lib == requested { @@ -105,6 +107,7 @@ func (l GpuInfoList) LogDetails() { slog.Info("inference compute", "id", g.ID, "library", g.Library, + "variant", g.Variant, "compute", g.Compute, "driver", fmt.Sprintf("%d.%d", g.DriverMajor, g.DriverMinor), "name", g.Name, diff --git a/integration/concurrency_test.go b/integration/concurrency_test.go index 81d0b587..42e9d074 100644 --- a/integration/concurrency_test.go +++ b/integration/concurrency_test.go @@ -5,6 +5,7 @@ package integration import ( "context" "log/slog" + "os" "strconv" "sync" "testing" @@ -13,7 +14,6 @@ import ( "github.com/stretchr/testify/require" "github.com/ollama/ollama/api" - "github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/format" ) @@ -41,8 +41,8 @@ func TestMultiModelConcurrency(t *testing.T) { }, } resp = [2][]string{ - []string{"sunlight"}, - []string{"england", "english", "massachusetts", "pilgrims", "british"}, + {"sunlight"}, + {"england", "english", "massachusetts", "pilgrims", "british"}, } ) var wg sync.WaitGroup @@ -71,12 +71,11 @@ func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) { reqLimit := len(req) iterLimit := 5 - vram := os.Getenv("OLLAMA_MAX_VRAM") // TODO - discover actual VRAM - if vram != "" { - max, err := strconv.ParseUint(vram, 10, 64) + if s := os.Getenv("OLLAMA_MAX_VRAM"); s != "" { + maxVram, err := strconv.ParseUint(s, 10, 64) require.NoError(t, err) // Don't hammer on small VRAM cards... 
- if max < 4*1024*1024*1024 { + if maxVram < 4*format.GibiByte { reqLimit = min(reqLimit, 2) iterLimit = 2 } @@ -233,12 +232,12 @@ func TestMultiModelStress(t *testing.T) { consumed := uint64(256 * format.MebiByte) // Assume some baseline usage for i := 0; i < len(req); i++ { // Always get at least 2 models, but dont' overshoot VRAM too much or we'll take too long - if i > 1 && consumed > vram { - slog.Info("achieved target vram exhaustion", "count", i, "vram", format.HumanBytes2(vram), "models", format.HumanBytes2(consumed)) + if i > 1 && consumed > maxVram { + slog.Info("achieved target vram exhaustion", "count", i, "vram", format.HumanBytes2(maxVram), "models", format.HumanBytes2(consumed)) break } consumed += chosenModels[i].size - slog.Info("target vram", "count", i, "vram", format.HumanBytes2(vram), "models", format.HumanBytes2(consumed)) + slog.Info("target vram", "count", i, "vram", format.HumanBytes2(maxVram), "models", format.HumanBytes2(consumed)) wg.Add(1) go func(i int) { diff --git a/integration/embed_test.go b/integration/embed_test.go index 10333d5d..4a68af68 100644 --- a/integration/embed_test.go +++ b/integration/embed_test.go @@ -70,8 +70,8 @@ func TestAllMiniLMEmbed(t *testing.T) { t.Fatalf("expected 0.010071031, got %.8f", res.Embeddings[0][0]) } - if res.PromptEvalCount != 8 { - t.Fatalf("expected 8 prompt tokens, got %d", res.PromptEvalCount) + if res.PromptEvalCount != 6 { + t.Fatalf("expected 6 prompt tokens, got %d", res.PromptEvalCount) } } @@ -102,8 +102,8 @@ func TestAllMiniLMBatchEmbed(t *testing.T) { t.Fatalf("expected 0.010071031 and -0.009802706, got %.8f and %.8f", res.Embeddings[0][0], res.Embeddings[1][0]) } - if res.PromptEvalCount != 16 { - t.Fatalf("expected 16 prompt tokens, got %d", res.PromptEvalCount) + if res.PromptEvalCount != 12 { + t.Fatalf("expected 12 prompt tokens, got %d", res.PromptEvalCount) } } diff --git a/integration/llm_test.go b/integration/llm_test.go index 4952b072..398e0a03 100644 --- a/integration/llm_test.go +++ b/integration/llm_test.go @@ -35,8 +35,8 @@ var ( }, } resp = [2][]string{ - []string{"sunlight"}, - []string{"england", "english", "massachusetts", "pilgrims"}, + {"sunlight"}, + {"england", "english", "massachusetts", "pilgrims"}, } ) diff --git a/integration/max_queue_test.go b/integration/max_queue_test.go index b06197e1..ec9e085a 100644 --- a/integration/max_queue_test.go +++ b/integration/max_queue_test.go @@ -29,7 +29,7 @@ func TestMaxQueue(t *testing.T) { // Also note that by default Darwin can't sustain > ~128 connections without adjusting limits threadCount := 32 if maxQueue := envconfig.MaxQueue(); maxQueue != 0 { - threadCount = maxQueue + threadCount = int(maxQueue) } else { t.Setenv("OLLAMA_MAX_QUEUE", strconv.Itoa(threadCount)) } diff --git a/integration/utils_test.go b/integration/utils_test.go index c2b27ee9..a6010995 100644 --- a/integration/utils_test.go +++ b/integration/utils_test.go @@ -334,10 +334,10 @@ func GenerateRequests() ([]api.GenerateRequest, [][]string) { }, }, [][]string{ - []string{"sunlight"}, - []string{"soil", "organic", "earth", "black", "tan"}, - []string{"england", "english", "massachusetts", "pilgrims", "british"}, - []string{"fourth", "july", "declaration", "independence"}, - []string{"nitrogen", "oxygen", "carbon", "dioxide"}, + {"sunlight"}, + {"soil", "organic", "earth", "black", "tan"}, + {"england", "english", "massachusetts", "pilgrims", "british"}, + {"fourth", "july", "declaration", "independence"}, + {"nitrogen", "oxygen", "carbon", "dioxide"}, } } diff --git 
a/llm/ext_server/CMakeLists.txt b/llm/ext_server/CMakeLists.txt index bfc97c63..90fd0ef2 100644 --- a/llm/ext_server/CMakeLists.txt +++ b/llm/ext_server/CMakeLists.txt @@ -1,12 +1,13 @@ set(TARGET ollama_llama_server) option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON) +set(LLAMA_SERVER_LDFLAGS $ENV{LLAMA_SERVER_LDFLAGS}) include_directories(${CMAKE_CURRENT_SOURCE_DIR}) add_executable(${TARGET} server.cpp utils.hpp json.hpp httplib.h) install(TARGETS ${TARGET} RUNTIME) target_compile_definitions(${TARGET} PRIVATE SERVER_VERBOSE=$ ) -target_link_libraries(${TARGET} PRIVATE ggml llama common llava ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE ggml llama common llava ${CMAKE_THREAD_LIBS_INIT} ${LLAMA_SERVER_LDFLAGS}) if (WIN32) TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32) endif() diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp index d72bb1b1..8e08b850 100644 --- a/llm/ext_server/server.cpp +++ b/llm/ext_server/server.cpp @@ -44,6 +44,7 @@ #include #endif +#include #include #include #include @@ -402,7 +403,9 @@ struct llama_server_context } } - std::tie(model, ctx) = llama_init_from_gpt_params(params); + auto init_result = llama_init_from_gpt_params(params); + model = init_result.model; + ctx = init_result.context; if (model == nullptr) { LOG_ERROR("unable to load model", {{"model", params.model}}); @@ -1221,7 +1224,6 @@ struct llama_server_context res.result_json = json { {"embedding", std::vector(embd, embd + n_embd)}, - {"timings", slot.get_formated_timings()}, }; } } @@ -1427,7 +1429,13 @@ struct llama_server_context switch (task.type) { case TASK_TYPE_COMPLETION: { - server_slot *slot = prefix_slot(task.data["prompt"]); + server_slot *slot = nullptr; + if (task.embedding_mode) { + // Embedding seq_id (aka slot id) must always be <= token length, so always use slot 0 + slot = slots[0].available() ? 
&slots[0] : nullptr; + } else { + slot = prefix_slot(task.data["prompt"]); + } if (slot == nullptr) { // if no slot is available, we defer this task for processing later @@ -2420,7 +2428,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g invalid_param = true; break; } - params.lora_adapter.emplace_back(argv[i], 1.0f); + params.lora_adapters.push_back({ + std::string(argv[i]), + 1.0, + }); params.use_mmap = false; } else if (arg == "--lora-scaled") @@ -2436,7 +2447,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g invalid_param = true; break; } - params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i])); + params.lora_adapters.push_back({ + lora_adapter, + std::stof(argv[i]) + }); params.use_mmap = false; } else if (arg == "-v" || arg == "--verbose") @@ -3184,37 +3198,17 @@ int main(int argc, char **argv) { prompt = ""; } - if (prompt.size() == 1) { - prompt = prompt[0]; - } - // create and queue the task - json responses; - { - const int id_task = llama.queue_tasks.get_new_id(); - llama.queue_results.add_waiting_task_id(id_task); - llama.request_completion(id_task, {{"prompt", prompt}}, true, -1); + const int task_id = llama.queue_tasks.get_new_id(); + llama.queue_results.add_waiting_task_id(task_id); + llama.request_completion(task_id, {{"prompt", prompt}}, true, -1); - // get the result - task_result result = llama.queue_results.recv(id_task); - llama.queue_results.remove_waiting_task_id(id_task); - if (result.error) { - return res.set_content(result.result_json.dump(), "application/json; charset=utf-8"); - } + // get the result + task_result result = llama.queue_results.recv(task_id); + llama.queue_results.remove_waiting_task_id(task_id); - responses = result.result_json.value("results", std::vector{result.result_json}); - json embeddings = json::array(); - - int prompt_n = 0; - for (auto & elem : responses) { - embeddings.push_back(elem.at("embedding")); - prompt_n += elem.at("timings").at("prompt_n").get(); - } - - // send the result - json embedding_res = json{{"embedding", embeddings}, {"prompt_n", prompt_n}}; - return res.set_content(embedding_res.dump(), "application/json; charset=utf-8"); - } + // send the result + return res.set_content(result.result_json.dump(), "application/json; charset=utf-8"); }); // GG: if I put the main loop inside a thread, it crashes on the first request when build in Debug!? 
diff --git a/llm/generate/gen_common.sh b/llm/generate/gen_common.sh index da1b0688..40115936 100644 --- a/llm/generate/gen_common.sh +++ b/llm/generate/gen_common.sh @@ -9,11 +9,14 @@ init_vars() { ARCH="arm64" ;; *) - ARCH=$(uname -m | sed -e "s/aarch64/arm64/g") + echo "GOARCH must be set" + echo "this script is meant to be run from within go generate" + exit 1 + ;; esac LLAMACPP_DIR=../llama.cpp - CMAKE_DEFS="" + CMAKE_DEFS="-DCMAKE_SKIP_RPATH=on" CMAKE_TARGETS="--target ollama_llama_server" if echo "${CGO_CFLAGS}" | grep -- '-g' >/dev/null; then CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on -DLLAMA_SERVER_VERBOSE=on ${CMAKE_DEFS}" @@ -27,6 +30,7 @@ init_vars() { WHOLE_ARCHIVE="-Wl,-force_load" NO_WHOLE_ARCHIVE="" GCC_ARCH="-arch ${ARCH}" + DIST_BASE=../../dist/darwin-${GOARCH}/ ;; "Linux") LIB_EXT="so" @@ -35,6 +39,7 @@ init_vars() { # Cross compiling not supported on linux - Use docker GCC_ARCH="" + DIST_BASE=../../dist/linux-${GOARCH}/ ;; *) ;; @@ -42,6 +47,7 @@ init_vars() { if [ -z "${CMAKE_CUDA_ARCHITECTURES}" ] ; then CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80" fi + GZIP=$(which pigz 2>/dev/null || echo "gzip") } git_module_setup() { @@ -85,26 +91,36 @@ build() { compress() { echo "Compressing payloads to reduce overall binary size..." - pids="" rm -rf ${BUILD_DIR}/bin/*.gz for f in ${BUILD_DIR}/bin/* ; do - gzip -n --best -f ${f} & - pids+=" $!" + ${GZIP} -n --best -f ${f} & + compress_pids+=" $!" done # check for lib directory if [ -d ${BUILD_DIR}/lib ]; then for f in ${BUILD_DIR}/lib/* ; do - gzip -n --best -f ${f} & - pids+=" $!" + ${GZIP} -n --best -f ${f} & + compress_pids+=" $!" done fi echo - for pid in ${pids}; do +} + +wait_for_compress() { + for pid in ${compress_pids}; do wait $pid done echo "Finished compression" } +install() { + echo "Installing libraries to bin dir ${BUILD_DIR}/bin/" + for lib in $(find ${BUILD_DIR} -name \*.${LIB_EXT}); do + rm -f "${BUILD_DIR}/bin/$(basename ${lib})" + cp -af "${lib}" "${BUILD_DIR}/bin/" + done +} + # Keep the local tree clean after we're done with the build cleanup() { (cd ${LLAMACPP_DIR}/ && git checkout CMakeLists.txt) diff --git a/llm/generate/gen_darwin.sh b/llm/generate/gen_darwin.sh index 6c0b62cb..f22c0f8e 100755 --- a/llm/generate/gen_darwin.sh +++ b/llm/generate/gen_darwin.sh @@ -6,6 +6,7 @@ set -ex set -o pipefail +compress_pids="" echo "Starting darwin generate script" source $(dirname $0)/gen_common.sh init_vars @@ -98,4 +99,5 @@ case "${GOARCH}" in esac cleanup +wait_for_compress echo "go generate completed. 
LLM runners: $(cd ${BUILD_DIR}/..; echo *)" diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh index db2c6c30..6927dda8 100755 --- a/llm/generate/gen_linux.sh +++ b/llm/generate/gen_linux.sh @@ -13,6 +13,7 @@ set -ex set -o pipefail +compress_pids="" # See https://llvm.org/docs/AMDGPUUsage.html#processors for reference amdGPUs() { @@ -51,7 +52,7 @@ if [ -z "${CUDACXX}" ]; then export CUDACXX=$(command -v nvcc) fi fi -COMMON_CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off" +COMMON_CMAKE_DEFS="-DCMAKE_SKIP_RPATH=on -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off" source $(dirname $0)/gen_common.sh init_vars git_module_setup @@ -77,10 +78,11 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then if [ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]; then init_vars echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\"" - CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}" + CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}" BUILD_DIR="../build/linux/${ARCH}/cpu" echo "Building custom CPU" build + install compress else # Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512 @@ -93,7 +95,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then # -DGGML_AVX512_VBMI -- 2018 Intel Cannon Lake # -DGGML_AVX512_VNNI -- 2021 Intel Alder Lake - COMMON_CPU_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_OPENMP=off" + COMMON_CPU_DEFS="-DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_OPENMP=off" if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu" ]; then # # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta) @@ -103,6 +105,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then BUILD_DIR="../build/linux/${ARCH}/cpu" echo "Building LCD CPU" build + install compress fi @@ -120,6 +123,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then BUILD_DIR="../build/linux/${ARCH}/cpu_avx" echo "Building AVX CPU" build + install compress fi @@ -133,6 +137,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then BUILD_DIR="../build/linux/${ARCH}/cpu_avx2" echo "Building AVX2 CPU" build + install compress fi fi @@ -160,7 +165,7 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then echo "CUDA libraries detected - building dynamic CUDA library" init_vars CUDA_MAJOR=$(ls "${CUDA_LIB_DIR}"/libcudart.so.* | head -1 | cut -f3 -d. 
|| true) - if [ -n "${CUDA_MAJOR}" ]; then + if [ -n "${CUDA_MAJOR}" -a -z "${CUDA_VARIANT}" ]; then CUDA_VARIANT=_v${CUDA_MAJOR} fi if [ "${ARCH}" == "arm64" ]; then @@ -178,29 +183,19 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${OLLAMA_CUSTOM_CUDA_DEFS}" echo "Building custom CUDA GPU" else - CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_FLAGS=-t8 -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}" + CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}" fi - CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS}" + export CUDAFLAGS="-t8" + CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS} -DGGML_STATIC=off" BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}" - EXTRA_LIBS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda" + export LLAMA_SERVER_LDFLAGS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda" + CUDA_DIST_DIR="${CUDA_DIST_DIR:-${DIST_BASE}/lib/ollama}" build - - # Carry the CUDA libs as payloads to help reduce dependency burden on users - # - # TODO - in the future we may shift to packaging these separately and conditionally - # downloading them in the install script. - DEPS="$(ldd ${BUILD_DIR}/bin/ollama_llama_server )" - for lib in libcudart.so libcublas.so libcublasLt.so ; do - DEP=$(echo "${DEPS}" | grep ${lib} | cut -f1 -d' ' | xargs || true) - if [ -n "${DEP}" -a -e "${CUDA_LIB_DIR}/${DEP}" ]; then - cp "${CUDA_LIB_DIR}/${DEP}" "${BUILD_DIR}/bin/" - elif [ -e "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" ]; then - cp "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" "${BUILD_DIR}/bin/" - elif [ -e "${CUDART_LIB_DIR}/${lib}" ]; then - cp -d ${CUDART_LIB_DIR}/${lib}* "${BUILD_DIR}/bin/" - else - cp -d "${CUDA_LIB_DIR}/${lib}*" "${BUILD_DIR}/bin/" - fi + install + echo "Installing CUDA dependencies in ${CUDA_DIST_DIR}" + mkdir -p "${CUDA_DIST_DIR}" + for lib in ${CUDA_LIB_DIR}/libcudart.so* ${CUDA_LIB_DIR}/libcublas.so* ${CUDA_LIB_DIR}/libcublasLt.so* ; do + cp -a "${lib}" "${CUDA_DIST_DIR}" done compress @@ -218,21 +213,24 @@ if [ -z "${OLLAMA_SKIP_ONEAPI_GENERATE}" -a -d "${ONEAPI_ROOT}" ]; then CC=icx CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL=ON -DGGML_SYCL_F16=OFF" BUILD_DIR="../build/linux/${ARCH}/oneapi" - EXTRA_LIBS="-fsycl -Wl,-rpath,${ONEAPI_ROOT}/compiler/latest/lib,-rpath,${ONEAPI_ROOT}/mkl/latest/lib,-rpath,${ONEAPI_ROOT}/tbb/latest/lib,-rpath,${ONEAPI_ROOT}/compiler/latest/opt/oclfpga/linux64/lib -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb" + ONEAPI_DIST_DIR="${DIST_BASE}/lib/ollama" + export LLAMA_SERVER_LDFLAGS="-fsycl -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb" DEBUG_FLAGS="" # icx compiles with -O0 if we pass -g, so we must remove it build # copy oneAPI dependencies + mkdir -p "${ONEAPI_DIST_DIR}" for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e sycl -e mkl -e tbb); do - cp "${dep}" "${BUILD_DIR}/bin/" + cp -a "${dep}" "${ONEAPI_DIST_DIR}" done - cp "${ONEAPI_ROOT}/compiler/latest/lib/libOpenCL.so" "${BUILD_DIR}/bin/" - cp "${ONEAPI_ROOT}/compiler/latest/lib/libimf.so" "${BUILD_DIR}/bin/" - cp "${ONEAPI_ROOT}/compiler/latest/lib/libintlc.so.5" "${BUILD_DIR}/bin/" - cp "${ONEAPI_ROOT}/compiler/latest/lib/libirng.so" "${BUILD_DIR}/bin/" - cp 
"${ONEAPI_ROOT}/compiler/latest/lib/libpi_level_zero.so" "${BUILD_DIR}/bin/" - cp "${ONEAPI_ROOT}/compiler/latest/lib/libsvml.so" "${BUILD_DIR}/bin/" - cp "${ONEAPI_ROOT}/compiler/latest/lib/libur_loader.so.0" "${BUILD_DIR}/bin/" + cp "${ONEAPI_ROOT}/compiler/latest/lib/libOpenCL.so" "${ONEAPI_DIST_DIR}" + cp "${ONEAPI_ROOT}/compiler/latest/lib/libimf.so" "${ONEAPI_DIST_DIR}" + cp "${ONEAPI_ROOT}/compiler/latest/lib/libintlc.so.5" "${ONEAPI_DIST_DIR}" + cp "${ONEAPI_ROOT}/compiler/latest/lib/libirng.so" "${ONEAPI_DIST_DIR}" + cp "${ONEAPI_ROOT}/compiler/latest/lib/libpi_level_zero.so" "${ONEAPI_DIST_DIR}" + cp "${ONEAPI_ROOT}/compiler/latest/lib/libsvml.so" "${ONEAPI_DIST_DIR}" + cp "${ONEAPI_ROOT}/compiler/latest/lib/libur_loader.so.0" "${ONEAPI_DIST_DIR}" + install compress fi @@ -262,23 +260,22 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then echo "Building custom ROCM GPU" fi BUILD_DIR="../build/linux/${ARCH}/rocm${ROCM_VARIANT}" - EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,\$ORIGIN/../../rocm/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu" + # ROCm dependencies are too large to fit into a unified bundle + ROCM_DIST_DIR="${DIST_BASE}/../linux-${GOARCH}-rocm/lib/ollama" + # TODO figure out how to disable runpath (rpath) + # export CMAKE_HIP_FLAGS="-fno-rtlib-add-rpath" # doesn't work + export LLAMA_SERVER_LDFLAGS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu" build - # Record the ROCM dependencies - rm -f "${BUILD_DIR}/bin/deps.txt" - touch "${BUILD_DIR}/bin/deps.txt" - for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e rocm -e amdgpu -e libtinfo ); do - echo "${dep}" >> "${BUILD_DIR}/bin/deps.txt" + # copy the ROCM dependencies + mkdir -p "${ROCM_DIST_DIR}" + for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -v "${ARCH}/rocm${ROCM_VARIANT}" | grep -e rocm -e amdgpu -e libtinfo ); do + cp -a "${dep}"* "${ROCM_DIST_DIR}" done - # bomb out if for some reason we didn't get a few deps - if [ $(cat "${BUILD_DIR}/bin/deps.txt" | wc -l ) -lt 8 ] ; then - cat "${BUILD_DIR}/bin/deps.txt" - echo "ERROR: deps file short" - exit 1 - fi + install compress fi cleanup +wait_for_compress echo "go generate completed. 
LLM runners: $(cd ${BUILD_DIR}/..; echo *)" diff --git a/llm/generate/gen_windows.ps1 b/llm/generate/gen_windows.ps1 index d8bce92d..cbdfd09f 100644 --- a/llm/generate/gen_windows.ps1 +++ b/llm/generate/gen_windows.ps1 @@ -35,7 +35,7 @@ function init_vars { ) $script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on") $script:ARCH = $Env:PROCESSOR_ARCHITECTURE.ToLower() - $script:DIST_BASE = "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_runners" + $script:DIST_BASE = "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\runners" md "$script:DIST_BASE" -ea 0 > $null if ($env:CGO_CFLAGS -contains "-g") { $script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on", "-DCMAKE_BUILD_TYPE=RelWithDebInfo") @@ -117,7 +117,7 @@ function build { if ($cmakeDefs -contains "-G") { $extra=@("-j8") } else { - $extra= @("--", "/p:CL_MPcount=8") + $extra= @("--", "/maxCpuCount:8") } write-host "building with: cmake --build $script:buildDir --config $script:config $($script:cmakeTargets | ForEach-Object { `"--target`", $_ }) $extra" & cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ }) $extra @@ -261,7 +261,7 @@ function build_cuda() { if ((-not "${env:OLLAMA_SKIP_CUDA_GENERATE}") -and ("${script:CUDA_LIB_DIR}")) { # Then build cuda as a dynamically loaded library $nvcc = "$script:CUDA_LIB_DIR\nvcc.exe" - $script:CUDA_VERSION=(get-item ($nvcc | split-path | split-path)).Basename + $script:CUDA_VERSION=((get-item ($nvcc | split-path | split-path)).Basename -Split "\.")[0] if ($null -ne $script:CUDA_VERSION) { $script:CUDA_VARIANT="_"+$script:CUDA_VERSION } @@ -273,9 +273,9 @@ function build_cuda() { "-DGGML_CUDA=ON", "-DGGML_AVX=on", "-DGGML_AVX2=off", - "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", - "-DCMAKE_CUDA_FLAGS=-t8", - "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}" + "-DCMAKE_CUDA_FLAGS=-t6", + "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}", + "-DCMAKE_CUDA_COMPILER_TOOLKIT_ROOT=$env:CUDA_PATH" ) if ($null -ne $env:OLLAMA_CUSTOM_CUDA_DEFS) { write-host "OLLAMA_CUSTOM_CUDA_DEFS=`"${env:OLLAMA_CUSTOM_CUDA_DEFS}`"" @@ -286,12 +286,11 @@ function build_cuda() { sign install - rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" - md "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" -ea 0 > $null - write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" - cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" - cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" - cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" + md "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" -ea 0 > $null + write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" } else { write-host "Skipping CUDA generation step" } @@ -325,18 +324,17 @@ function build_oneapi() { sign install - rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - md 
"${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" -ea 0 > $null - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" + md "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" -ea 0 > $null + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" } else { Write-Host "Skipping oneAPI generation step" } @@ -386,12 +384,11 @@ function build_rocm() { sign install - rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\" - md "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\rocblas\library\" -ea 0 > $null - cp "${env:HIP_PATH}\bin\hipblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\" - cp "${env:HIP_PATH}\bin\rocblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\" + md "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\rocblas\library\" -ea 0 > $null + cp "${env:HIP_PATH}\bin\hipblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:HIP_PATH}\bin\rocblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" # amdhip64.dll dependency comes from the driver and must be installed on the host to use AMD GPUs - cp "${env:HIP_PATH}\bin\rocblas\library\*" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\rocblas\library\" + cp 
"${env:HIP_PATH}\bin\rocblas\library\*" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\rocblas\library\" } else { write-host "Skipping ROCm generation step" } diff --git a/llm/ggml.go b/llm/ggml.go index d7f2eef7..4c68adf9 100644 --- a/llm/ggml.go +++ b/llm/ggml.go @@ -157,6 +157,14 @@ type Tensor struct { io.WriterTo `json:"-"` } +func (t Tensor) block() (n int) { + if _, err := fmt.Sscanf(t.Name, "blk.%d.", &n); err != nil { + return -1 + } + + return +} + func (t Tensor) blockSize() uint64 { switch t.Kind { case 0, 1, 24, 25, 26, 27, 28, 30: // F32, F16, I8, I16, I32, I64, F64, BF16 diff --git a/llm/gguf.go b/llm/gguf.go index 98158313..2e6bc542 100644 --- a/llm/gguf.go +++ b/llm/gguf.go @@ -532,15 +532,14 @@ func WriteGGUF(ws io.WriteSeeker, kv KV, ts []Tensor) error { } } - slices.SortFunc(ts, func(a, b Tensor) int { - var i, j int - if n, err := fmt.Sscanf(a.Name, "blk.%d", &i); err != nil || n != 1 { - return cmp.Compare(a.Name, b.Name) - } else if n, err := fmt.Sscanf(b.Name, "blk.%d", &j); err != nil || n != 1 { - return cmp.Compare(a.Name, b.Name) + slices.SortStableFunc(ts, func(a, b Tensor) int { + if i, j := a.block(), b.block(); i < 0 && j > 0 { + return 1 + } else if i > 0 && j < 0 { + return -1 + } else { + return cmp.Compare(i, j) } - - return cmp.Compare(i, j) }) var s uint64 diff --git a/llm/llama.cpp b/llm/llama.cpp index 6eeaeba1..1e6f6554 160000 --- a/llm/llama.cpp +++ b/llm/llama.cpp @@ -1 +1 @@ -Subproject commit 6eeaeba126ff701f3e8f79f246805b7023709972 +Subproject commit 1e6f6554aa11fa10160a5fda689e736c3c34169f diff --git a/llm/memory_test.go b/llm/memory_test.go index 6cf0119f..ffb14286 100644 --- a/llm/memory_test.go +++ b/llm/memory_test.go @@ -33,7 +33,6 @@ func TestEstimateGPULayers(t *testing.T) { assert.Len(t, tensors, inputLayerCount+1) err = WriteGGUF(f, KV{ "general.architecture": "llama", - "general.name": "name", "llama.context_length": uint32(32), "llama.embedding_length": uint32(4096), "llama.block_count": uint32(inputLayerCount), diff --git a/llm/patches/08-pooling.diff b/llm/patches/08-pooling.diff deleted file mode 100644 index 2e4fe11e..00000000 --- a/llm/patches/08-pooling.diff +++ /dev/null @@ -1,60 +0,0 @@ -diff --git a/src/llama.cpp b/src/llama.cpp -index 721b8f4e..cfe7ac40 100644 ---- a/src/llama.cpp -+++ b/src/llama.cpp -@@ -8420,14 +8420,14 @@ struct llm_build_context { - } - - struct ggml_tensor * build_inp_mean() { -- lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens); -+ lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, cparams.n_seq_max); - cb(lctx.inp_mean, "inp_mean", -1); - ggml_set_input(lctx.inp_mean); - return lctx.inp_mean; - } - - struct ggml_tensor * build_inp_cls() { -- lctx.inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); -+ lctx.inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, cparams.n_seq_max); - cb(lctx.inp_cls, "inp_cls", -1); - ggml_set_input(lctx.inp_cls); - return lctx.inp_cls; -@@ -13847,19 +13847,16 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer)); - - float * data = (float *) lctx.inp_mean->data; -- memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean)); -+ memset(lctx.inp_mean->data, 0, n_tokens * cparams.n_seq_max * ggml_element_size(lctx.inp_mean)); - - std::vector sum(n_tokens, 0); - for (int i = 0; i < n_tokens; ++i) { - const llama_seq_id seq_id = batch.seq_id[i][0]; -- -- GGML_ASSERT(seq_id < n_tokens && "seq_id 
cannot be larger than n_tokens with pooling_type == MEAN"); -- - sum[seq_id] += 1; - } - -- std::vector div(n_tokens, 0.0f); -- for (int i = 0; i < n_tokens; ++i) { -+ std::vector div(cparams.n_seq_max, 0.0f); -+ for (uint32_t i = 0; i < cparams.n_seq_max; ++i) { - const uint64_t s = sum[i]; - if (s > 0) { - div[i] = 1.0f/float(s); -@@ -13879,14 +13876,11 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer)); - - uint32_t * data = (uint32_t *) lctx.inp_cls->data; -- memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls)); -+ memset(lctx.inp_cls->data, 0, cparams.n_seq_max * ggml_element_size(lctx.inp_cls)); - - for (int i = 0; i < n_tokens; ++i) { - const llama_seq_id seq_id = batch.seq_id[i][0]; - const llama_pos pos = batch.pos[i]; -- -- GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS"); -- - if (pos == 0) { - data[seq_id] = i; - } diff --git a/llm/patches/09-lora.diff b/llm/patches/09-lora.diff index 10c66d1d..21958476 100644 --- a/llm/patches/09-lora.diff +++ b/llm/patches/09-lora.diff @@ -1,40 +1,32 @@ diff --git a/common/common.cpp b/common/common.cpp -index dbb724fb..c26fe6ee 100644 +index 2e8374d5..70d0afde 100644 --- a/common/common.cpp +++ b/common/common.cpp -@@ -2087,14 +2087,27 @@ std::tuple llama_init_from_gpt_par - for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) { - const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]); - float lora_scale = std::get<1>(params.lora_adapter[i]); -+ -+ // try to load as gguf - auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str()); - if (adapter == nullptr) { -- fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); +@@ -2110,9 +2110,21 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) { + loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str()); + if (loaded_la.adapter == nullptr) { + fprintf(stderr, "%s: error: failed to apply lora adapter '%s'\n", __func__, la.path.c_str()); - llama_free(lctx); - llama_free_model(model); -- return std::make_tuple(nullptr, nullptr); -+ fprintf(stderr, "%s: error: failed to apply lora adapter, trying ggla\n", __func__); +- return iparams; + + // if that fails, try loading as ggla for compatibility + int err = llama_model_apply_lora_from_file(model, -+ lora_adapter.c_str(), -+ lora_scale, ++ la.path.c_str(), ++ la.scale, + nullptr, + params.n_threads); + if (err != 0) { + fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); + llama_free(lctx); + llama_free_model(model); -+ return std::make_tuple(nullptr, nullptr); ++ return iparams; ++ } else { ++ break; + } -+ } else { -+ llama_lora_adapter_set(lctx, adapter, lora_scale); } -- llama_lora_adapter_set(lctx, adapter, lora_scale); + iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters } - - if (params.ignore_eos) { diff --git a/include/llama.h b/include/llama.h index 93fd77ca..b0fb37a6 100644 --- a/include/llama.h @@ -355,4 +347,4 @@ index 80a0dd0f..9d7b0e17 100644 + return 1; + } +} -\ No newline at end of file +\ No newline at end of file \ No newline at end of file diff --git a/llm/patches/10-params.diff b/llm/patches/10-params.diff deleted file mode 100644 index 56699b8e..00000000 --- a/llm/patches/10-params.diff +++ /dev/null @@ -1,20 +0,0 @@ -diff --git a/src/llama.cpp b/src/llama.cpp -index a207451f..fba6b175 100644 ---- a/src/llama.cpp -+++ b/src/llama.cpp 
-@@ -4969,6 +4969,7 @@ static void llm_load_hparams( - hparams.attn_soft_cap = true; - - switch (hparams.n_layer) { -+ case 26: model.type = e_model::MODEL_2B; break; - case 42: model.type = e_model::MODEL_9B; break; - case 46: model.type = e_model::MODEL_27B; break; - default: model.type = e_model::MODEL_UNKNOWN; -@@ -11736,6 +11737,7 @@ struct llm_build_context { - - // ref: https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e - switch (model.type) { -+ case e_model::MODEL_2B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break; - case e_model::MODEL_9B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break; - case e_model::MODEL_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); break; - default: GGML_ABORT("fatal error"); diff --git a/llm/payload.go b/llm/payload.go index b402e1f2..963b3295 100644 --- a/llm/payload.go +++ b/llm/payload.go @@ -82,8 +82,8 @@ func serversForGpu(info gpu.GpuInfo) []string { // glob workDir for files that start with ollama_ availableServers := getAvailableServers() requested := info.Library - if info.Variant != gpu.CPUCapabilityNone { - requested += "_" + info.Variant.String() + if info.Variant != gpu.CPUCapabilityNone.String() { + requested += "_" + info.Variant } servers := []string{} diff --git a/llm/server.go b/llm/server.go index 7abc3bd7..9347a458 100644 --- a/llm/server.go +++ b/llm/server.go @@ -33,7 +33,7 @@ type LlamaServer interface { Ping(ctx context.Context) error WaitUntilRunning(ctx context.Context) error Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error - Embed(ctx context.Context, input []string) (*EmbedResponse, error) + Embedding(ctx context.Context, input string) ([]float32, error) Tokenize(ctx context.Context, content string) ([]int, error) Detokenize(ctx context.Context, tokens []int) (string, error) Close() error @@ -44,11 +44,12 @@ type LlamaServer interface { // llmServer is an instance of the llama.cpp server type llmServer struct { - port int - cmd *exec.Cmd - done chan error // Channel to signal when the process exits - status *StatusWriter - options api.Options + port int + cmd *exec.Cmd + done chan error // Channel to signal when the process exits + status *StatusWriter + options api.Options + numParallel int estimate MemoryEstimate totalLayers uint64 @@ -124,8 +125,9 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr } } - // On linux, over-allocating CPU memory will almost always result in an error - if runtime.GOOS == "linux" { + // On linux and windows, over-allocating CPU memory will almost always result in an error + // Darwin has fully dynamic swap so has no direct concept of free swap space + if runtime.GOOS != "darwin" { systemMemoryRequired := estimate.TotalSize - estimate.VRAMSize available := systemFreeMemory + systemSwapFreeMemory if systemMemoryRequired > available { @@ -256,8 +258,14 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr params = append(params, "--mlock") } - if opts.UseNUMA { - params = append(params, "--numa") + if gpu.IsNUMA() { + numaMode := "distribute" + if runtime.GOOS == "linux" { + if _, err := exec.LookPath("numactl"); err == nil { + numaMode = "numactl" + } + } + params = append(params, "--numa", numaMode) } params = append(params, "--parallel", strconv.Itoa(numParallel)) @@ -298,20 +306,18 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr if runtime.GOOS 
== "windows" { pathEnv = "PATH" } - // prepend the server directory to LD_LIBRARY_PATH/PATH and the parent dir for common dependencies - libraryPaths := []string{dir, filepath.Dir(dir)} + // Start with the server directory for the LD_LIBRARY_PATH/PATH + libraryPaths := []string{dir} if libraryPath, ok := os.LookupEnv(pathEnv); ok { - // Append our runner directory to the path - // This will favor system libraries over our bundled library dependencies + // favor our bundled library dependencies over system libraries libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...) } // Note: we always put the dependency path first - // since this was the exact version we verified for AMD GPUs - // and we favor what the user had in their path + // since this was the exact version we compiled/linked against if gpus[0].DependencyPath != "" { - // TODO refine for multi-gpu support + // assume gpus from the same library have the same dependency path libraryPaths = append([]string{gpus[0].DependencyPath}, libraryPaths...) } @@ -337,6 +343,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr status: NewStatusWriter(os.Stderr), options: opts, estimate: estimate, + numParallel: numParallel, sem: semaphore.NewWeighted(int64(numParallel)), totalLayers: ggml.KV().BlockCount() + 1, gpus: gpus, @@ -874,16 +881,15 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu return nil } -type EmbedRequest struct { - Content []string `json:"content"` +type EmbeddingRequest struct { + Content string `json:"content"` } -type EmbedResponse struct { - Embedding [][]float32 `json:"embedding"` - PromptEvalCount int `json:"prompt_n"` +type EmbeddingResponse struct { + Embedding []float32 `json:"embedding"` } -func (s *llmServer) Embed(ctx context.Context, input []string) (*EmbedResponse, error) { +func (s *llmServer) Embedding(ctx context.Context, input string) ([]float32, error) { if err := s.sem.Acquire(ctx, 1); err != nil { slog.Error("Failed to acquire semaphore", "error", err) return nil, err @@ -898,18 +904,18 @@ func (s *llmServer) Embed(ctx context.Context, input []string) (*EmbedResponse, return nil, fmt.Errorf("unexpected server status: %s", status.ToString()) } - data, err := json.Marshal(EmbedRequest{Content: input}) + data, err := json.Marshal(EmbeddingRequest{Content: input}) if err != nil { return nil, fmt.Errorf("error marshaling embed data: %w", err) } - req, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("http://127.0.0.1:%d/embedding", s.port), bytes.NewBuffer(data)) + r, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("http://127.0.0.1:%d/embedding", s.port), bytes.NewBuffer(data)) if err != nil { return nil, fmt.Errorf("error creating embed request: %w", err) } - req.Header.Set("Content-Type", "application/json") + r.Header.Set("Content-Type", "application/json") - resp, err := http.DefaultClient.Do(req) + resp, err := http.DefaultClient.Do(r) if err != nil { return nil, fmt.Errorf("do embedding request: %w", err) } @@ -925,12 +931,12 @@ func (s *llmServer) Embed(ctx context.Context, input []string) (*EmbedResponse, return nil, fmt.Errorf("%s", body) } - var e EmbedResponse + var e EmbeddingResponse if err := json.Unmarshal(body, &e); err != nil { return nil, fmt.Errorf("unmarshal tokenize response: %w", err) } - return &e, nil + return e.Embedding, nil } type TokenizeRequest struct { diff --git a/llm/status.go b/llm/status.go index d9f36115..604fe9e0 100644 --- a/llm/status.go +++ 
b/llm/status.go @@ -26,6 +26,7 @@ var errorPrefixes = []string{ "cudaMalloc failed", "\"ERR\"", "error loading model", + "GGML_ASSERT", } func (w *StatusWriter) Write(b []byte) (int, error) { diff --git a/openai/openai_test.go b/openai/openai_test.go index e08a96c9..c7e9f384 100644 --- a/openai/openai_test.go +++ b/openai/openai_test.go @@ -7,27 +7,22 @@ import ( "io" "net/http" "net/http/httptest" + "reflect" "strings" "testing" "time" "github.com/gin-gonic/gin" - "github.com/stretchr/testify/assert" "github.com/ollama/ollama/api" ) const ( - prefix = `data:image/jpeg;base64,` - image = `iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNk+A8AAQUBAScY42YAAAAASUVORK5CYII=` - imageURL = prefix + image + prefix = `data:image/jpeg;base64,` + image = `iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNk+A8AAQUBAScY42YAAAAASUVORK5CYII=` ) -func prepareRequest(req *http.Request, body any) { - bodyBytes, _ := json.Marshal(body) - req.Body = io.NopCloser(bytes.NewReader(bodyBytes)) - req.Header.Set("Content-Type", "application/json") -} +var False = false func captureRequestMiddleware(capturedRequest any) gin.HandlerFunc { return func(c *gin.Context) { @@ -43,134 +38,136 @@ func captureRequestMiddleware(capturedRequest any) gin.HandlerFunc { func TestChatMiddleware(t *testing.T) { type testCase struct { - Name string - Setup func(t *testing.T, req *http.Request) - Expected func(t *testing.T, req *api.ChatRequest, resp *httptest.ResponseRecorder) + name string + body string + req api.ChatRequest + err ErrorResponse } var capturedRequest *api.ChatRequest testCases := []testCase{ { - Name: "chat handler", - Setup: func(t *testing.T, req *http.Request) { - body := ChatCompletionRequest{ - Model: "test-model", - Messages: []Message{{Role: "user", Content: "Hello"}}, - } - prepareRequest(req, body) - }, - Expected: func(t *testing.T, req *api.ChatRequest, resp *httptest.ResponseRecorder) { - if resp.Code != http.StatusOK { - t.Fatalf("expected 200, got %d", resp.Code) - } - - if req.Messages[0].Role != "user" { - t.Fatalf("expected 'user', got %s", req.Messages[0].Role) - } - - if req.Messages[0].Content != "Hello" { - t.Fatalf("expected 'Hello', got %s", req.Messages[0].Content) - } + name: "chat handler", + body: `{ + "model": "test-model", + "messages": [ + {"role": "user", "content": "Hello"} + ] + }`, + req: api.ChatRequest{ + Model: "test-model", + Messages: []api.Message{ + { + Role: "user", + Content: "Hello", + }, + }, + Options: map[string]any{ + "temperature": 1.0, + "top_p": 1.0, + }, + Stream: &False, }, }, { - Name: "chat handler with image content", - Setup: func(t *testing.T, req *http.Request) { - body := ChatCompletionRequest{ - Model: "test-model", - Messages: []Message{ - { - Role: "user", Content: []map[string]any{ - {"type": "text", "text": "Hello"}, - {"type": "image_url", "image_url": map[string]string{"url": imageURL}}, + name: "chat handler with image content", + body: `{ + "model": "test-model", + "messages": [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "Hello" + }, + { + "type": "image_url", + "image_url": { + "url": "` + prefix + image + `" + } + } + ] + } + ] + }`, + req: api.ChatRequest{ + Model: "test-model", + Messages: []api.Message{ + { + Role: "user", + Content: "Hello", + }, + { + Role: "user", + Images: []api.ImageData{ + func() []byte { + img, _ := base64.StdEncoding.DecodeString(image) + return img + }(), + }, + }, + }, + Options: map[string]any{ + "temperature": 1.0, + "top_p": 1.0, + }, + Stream: &False, + }, + }, + { + 
name: "chat handler with tools", + body: `{ + "model": "test-model", + "messages": [ + {"role": "user", "content": "What's the weather like in Paris Today?"}, + {"role": "assistant", "tool_calls": [{"id": "id", "type": "function", "function": {"name": "get_current_weather", "arguments": "{\"location\": \"Paris, France\", \"format\": \"celsius\"}"}}]} + ] + }`, + req: api.ChatRequest{ + Model: "test-model", + Messages: []api.Message{ + { + Role: "user", + Content: "What's the weather like in Paris Today?", + }, + { + Role: "assistant", + ToolCalls: []api.ToolCall{ + { + Function: api.ToolCallFunction{ + Name: "get_current_weather", + Arguments: map[string]interface{}{ + "location": "Paris, France", + "format": "celsius", + }, + }, }, }, }, - } - prepareRequest(req, body) - }, - Expected: func(t *testing.T, req *api.ChatRequest, resp *httptest.ResponseRecorder) { - if resp.Code != http.StatusOK { - t.Fatalf("expected 200, got %d", resp.Code) - } - - if req.Messages[0].Role != "user" { - t.Fatalf("expected 'user', got %s", req.Messages[0].Role) - } - - if req.Messages[0].Content != "Hello" { - t.Fatalf("expected 'Hello', got %s", req.Messages[0].Content) - } - - img, _ := base64.StdEncoding.DecodeString(imageURL[len(prefix):]) - - if req.Messages[1].Role != "user" { - t.Fatalf("expected 'user', got %s", req.Messages[1].Role) - } - - if !bytes.Equal(req.Messages[1].Images[0], img) { - t.Fatalf("expected image encoding, got %s", req.Messages[1].Images[0]) - } + }, + Options: map[string]any{ + "temperature": 1.0, + "top_p": 1.0, + }, + Stream: &False, }, }, + { - Name: "chat handler with tools", - Setup: func(t *testing.T, req *http.Request) { - body := ChatCompletionRequest{ - Model: "test-model", - Messages: []Message{ - {Role: "user", Content: "What's the weather like in Paris Today?"}, - {Role: "assistant", ToolCalls: []ToolCall{{ - ID: "id", - Type: "function", - Function: struct { - Name string `json:"name"` - Arguments string `json:"arguments"` - }{ - Name: "get_current_weather", - Arguments: "{\"location\": \"Paris, France\", \"format\": \"celsius\"}", - }, - }}}, - }, - } - prepareRequest(req, body) - }, - Expected: func(t *testing.T, req *api.ChatRequest, resp *httptest.ResponseRecorder) { - if resp.Code != 200 { - t.Fatalf("expected 200, got %d", resp.Code) - } - - if req.Messages[0].Content != "What's the weather like in Paris Today?" 
{ - t.Fatalf("expected What's the weather like in Paris Today?, got %s", req.Messages[0].Content) - } - - if req.Messages[1].ToolCalls[0].Function.Arguments["location"] != "Paris, France" { - t.Fatalf("expected 'Paris, France', got %v", req.Messages[1].ToolCalls[0].Function.Arguments["location"]) - } - - if req.Messages[1].ToolCalls[0].Function.Arguments["format"] != "celsius" { - t.Fatalf("expected celsius, got %v", req.Messages[1].ToolCalls[0].Function.Arguments["format"]) - } - }, - }, - { - Name: "chat handler error forwarding", - Setup: func(t *testing.T, req *http.Request) { - body := ChatCompletionRequest{ - Model: "test-model", - Messages: []Message{{Role: "user", Content: 2}}, - } - prepareRequest(req, body) - }, - Expected: func(t *testing.T, req *api.ChatRequest, resp *httptest.ResponseRecorder) { - if resp.Code != http.StatusBadRequest { - t.Fatalf("expected 400, got %d", resp.Code) - } - - if !strings.Contains(resp.Body.String(), "invalid message content type") { - t.Fatalf("error was not forwarded") - } + name: "chat handler error forwarding", + body: `{ + "model": "test-model", + "messages": [ + {"role": "user", "content": 2} + ] + }`, + err: ErrorResponse{ + Error: Error{ + Message: "invalid message content type: float64", + Type: "invalid_request_error", + }, }, }, } @@ -185,16 +182,26 @@ func TestChatMiddleware(t *testing.T) { router.Handle(http.MethodPost, "/api/chat", endpoint) for _, tc := range testCases { - t.Run(tc.Name, func(t *testing.T) { - req, _ := http.NewRequest(http.MethodPost, "/api/chat", nil) - - tc.Setup(t, req) + t.Run(tc.name, func(t *testing.T) { + req, _ := http.NewRequest(http.MethodPost, "/api/chat", strings.NewReader(tc.body)) + req.Header.Set("Content-Type", "application/json") resp := httptest.NewRecorder() router.ServeHTTP(resp, req) - tc.Expected(t, capturedRequest, resp) + var errResp ErrorResponse + if resp.Code != http.StatusOK { + if err := json.Unmarshal(resp.Body.Bytes(), &errResp); err != nil { + t.Fatal(err) + } + } + if capturedRequest != nil && !reflect.DeepEqual(tc.req, *capturedRequest) { + t.Fatal("requests did not match") + } + if !reflect.DeepEqual(tc.err, errResp) { + t.Fatal("errors did not match") + } capturedRequest = nil }) } @@ -202,71 +209,52 @@ func TestChatMiddleware(t *testing.T) { func TestCompletionsMiddleware(t *testing.T) { type testCase struct { - Name string - Setup func(t *testing.T, req *http.Request) - Expected func(t *testing.T, req *api.GenerateRequest, resp *httptest.ResponseRecorder) + name string + body string + req api.GenerateRequest + err ErrorResponse } var capturedRequest *api.GenerateRequest testCases := []testCase{ { - Name: "completions handler", - Setup: func(t *testing.T, req *http.Request) { - temp := float32(0.8) - body := CompletionRequest{ - Model: "test-model", - Prompt: "Hello", - Temperature: &temp, - Stop: []string{"\n", "stop"}, - Suffix: "suffix", - } - prepareRequest(req, body) - }, - Expected: func(t *testing.T, req *api.GenerateRequest, resp *httptest.ResponseRecorder) { - if req.Prompt != "Hello" { - t.Fatalf("expected 'Hello', got %s", req.Prompt) - } - - if req.Options["temperature"] != 1.6 { - t.Fatalf("expected 1.6, got %f", req.Options["temperature"]) - } - - stopTokens, ok := req.Options["stop"].([]any) - - if !ok { - t.Fatalf("expected stop tokens to be a list") - } - - if stopTokens[0] != "\n" || stopTokens[1] != "stop" { - t.Fatalf("expected ['\\n', 'stop'], got %v", stopTokens) - } - - if req.Suffix != "suffix" { - t.Fatalf("expected 'suffix', got %s", req.Suffix) - } + 
name: "completions handler", + body: `{ + "model": "test-model", + "prompt": "Hello", + "temperature": 0.8, + "stop": ["\n", "stop"], + "suffix": "suffix" + }`, + req: api.GenerateRequest{ + Model: "test-model", + Prompt: "Hello", + Options: map[string]any{ + "frequency_penalty": 0.0, + "presence_penalty": 0.0, + "temperature": 1.6, + "top_p": 1.0, + "stop": []any{"\n", "stop"}, + }, + Suffix: "suffix", + Stream: &False, }, }, { - Name: "completions handler error forwarding", - Setup: func(t *testing.T, req *http.Request) { - body := CompletionRequest{ - Model: "test-model", - Prompt: "Hello", - Temperature: nil, - Stop: []int{1, 2}, - Suffix: "suffix", - } - prepareRequest(req, body) - }, - Expected: func(t *testing.T, req *api.GenerateRequest, resp *httptest.ResponseRecorder) { - if resp.Code != http.StatusBadRequest { - t.Fatalf("expected 400, got %d", resp.Code) - } - - if !strings.Contains(resp.Body.String(), "invalid type for 'stop' field") { - t.Fatalf("error was not forwarded") - } + name: "completions handler error forwarding", + body: `{ + "model": "test-model", + "prompt": "Hello", + "temperature": null, + "stop": [1, 2], + "suffix": "suffix" + }`, + err: ErrorResponse{ + Error: Error{ + Message: "invalid type for 'stop' field: float64", + Type: "invalid_request_error", + }, }, }, } @@ -281,15 +269,27 @@ func TestCompletionsMiddleware(t *testing.T) { router.Handle(http.MethodPost, "/api/generate", endpoint) for _, tc := range testCases { - t.Run(tc.Name, func(t *testing.T) { - req, _ := http.NewRequest(http.MethodPost, "/api/generate", nil) - - tc.Setup(t, req) + t.Run(tc.name, func(t *testing.T) { + req, _ := http.NewRequest(http.MethodPost, "/api/generate", strings.NewReader(tc.body)) + req.Header.Set("Content-Type", "application/json") resp := httptest.NewRecorder() router.ServeHTTP(resp, req) - tc.Expected(t, capturedRequest, resp) + var errResp ErrorResponse + if resp.Code != http.StatusOK { + if err := json.Unmarshal(resp.Body.Bytes(), &errResp); err != nil { + t.Fatal(err) + } + } + + if capturedRequest != nil && !reflect.DeepEqual(tc.req, *capturedRequest) { + t.Fatal("requests did not match") + } + + if !reflect.DeepEqual(tc.err, errResp) { + t.Fatal("errors did not match") + } capturedRequest = nil }) @@ -298,78 +298,47 @@ func TestCompletionsMiddleware(t *testing.T) { func TestEmbeddingsMiddleware(t *testing.T) { type testCase struct { - Name string - Setup func(t *testing.T, req *http.Request) - Expected func(t *testing.T, req *api.EmbedRequest, resp *httptest.ResponseRecorder) + name string + body string + req api.EmbedRequest + err ErrorResponse } var capturedRequest *api.EmbedRequest testCases := []testCase{ { - Name: "embed handler single input", - Setup: func(t *testing.T, req *http.Request) { - body := EmbedRequest{ - Input: "Hello", - Model: "test-model", - } - prepareRequest(req, body) - }, - Expected: func(t *testing.T, req *api.EmbedRequest, resp *httptest.ResponseRecorder) { - if req.Input != "Hello" { - t.Fatalf("expected 'Hello', got %s", req.Input) - } - - if req.Model != "test-model" { - t.Fatalf("expected 'test-model', got %s", req.Model) - } + name: "embed handler single input", + body: `{ + "input": "Hello", + "model": "test-model" + }`, + req: api.EmbedRequest{ + Input: "Hello", + Model: "test-model", }, }, { - Name: "embed handler batch input", - Setup: func(t *testing.T, req *http.Request) { - body := EmbedRequest{ - Input: []string{"Hello", "World"}, - Model: "test-model", - } - prepareRequest(req, body) - }, - Expected: func(t *testing.T, req 
*api.EmbedRequest, resp *httptest.ResponseRecorder) { - input, ok := req.Input.([]any) - - if !ok { - t.Fatalf("expected input to be a list") - } - - if input[0].(string) != "Hello" { - t.Fatalf("expected 'Hello', got %s", input[0]) - } - - if input[1].(string) != "World" { - t.Fatalf("expected 'World', got %s", input[1]) - } - - if req.Model != "test-model" { - t.Fatalf("expected 'test-model', got %s", req.Model) - } + name: "embed handler batch input", + body: `{ + "input": ["Hello", "World"], + "model": "test-model" + }`, + req: api.EmbedRequest{ + Input: []any{"Hello", "World"}, + Model: "test-model", }, }, { - Name: "embed handler error forwarding", - Setup: func(t *testing.T, req *http.Request) { - body := EmbedRequest{ - Model: "test-model", - } - prepareRequest(req, body) - }, - Expected: func(t *testing.T, req *api.EmbedRequest, resp *httptest.ResponseRecorder) { - if resp.Code != http.StatusBadRequest { - t.Fatalf("expected 400, got %d", resp.Code) - } - - if !strings.Contains(resp.Body.String(), "invalid input") { - t.Fatalf("error was not forwarded") - } + name: "embed handler error forwarding", + body: `{ + "model": "test-model" + }`, + err: ErrorResponse{ + Error: Error{ + Message: "invalid input", + Type: "invalid_request_error", + }, }, }, } @@ -384,116 +353,167 @@ func TestEmbeddingsMiddleware(t *testing.T) { router.Handle(http.MethodPost, "/api/embed", endpoint) for _, tc := range testCases { - t.Run(tc.Name, func(t *testing.T) { - req, _ := http.NewRequest(http.MethodPost, "/api/embed", nil) - - tc.Setup(t, req) + t.Run(tc.name, func(t *testing.T) { + req, _ := http.NewRequest(http.MethodPost, "/api/embed", strings.NewReader(tc.body)) + req.Header.Set("Content-Type", "application/json") resp := httptest.NewRecorder() router.ServeHTTP(resp, req) - tc.Expected(t, capturedRequest, resp) + var errResp ErrorResponse + if resp.Code != http.StatusOK { + if err := json.Unmarshal(resp.Body.Bytes(), &errResp); err != nil { + t.Fatal(err) + } + } + + if capturedRequest != nil && !reflect.DeepEqual(tc.req, *capturedRequest) { + t.Fatal("requests did not match") + } + + if !reflect.DeepEqual(tc.err, errResp) { + t.Fatal("errors did not match") + } capturedRequest = nil }) } } -func TestMiddlewareResponses(t *testing.T) { +func TestListMiddleware(t *testing.T) { type testCase struct { - Name string - Method string - Path string - TestPath string - Handler func() gin.HandlerFunc - Endpoint func(c *gin.Context) - Setup func(t *testing.T, req *http.Request) - Expected func(t *testing.T, resp *httptest.ResponseRecorder) + name string + endpoint func(c *gin.Context) + resp string } testCases := []testCase{ { - Name: "list handler", - Method: http.MethodGet, - Path: "/api/tags", - TestPath: "/api/tags", - Handler: ListMiddleware, - Endpoint: func(c *gin.Context) { + name: "list handler", + endpoint: func(c *gin.Context) { c.JSON(http.StatusOK, api.ListResponse{ Models: []api.ListModelResponse{ { - Name: "Test Model", + Name: "test-model", + ModifiedAt: time.Unix(int64(1686935002), 0).UTC(), }, }, }) }, - Expected: func(t *testing.T, resp *httptest.ResponseRecorder) { - var listResp ListCompletion - if err := json.NewDecoder(resp.Body).Decode(&listResp); err != nil { - t.Fatal(err) - } - - if listResp.Object != "list" { - t.Fatalf("expected list, got %s", listResp.Object) - } - - if len(listResp.Data) != 1 { - t.Fatalf("expected 1, got %d", len(listResp.Data)) - } - - if listResp.Data[0].Id != "Test Model" { - t.Fatalf("expected Test Model, got %s", listResp.Data[0].Id) - } - }, + resp: `{ + 
"object": "list", + "data": [ + { + "id": "test-model", + "object": "model", + "created": 1686935002, + "owned_by": "library" + } + ] + }`, }, { - Name: "retrieve model", - Method: http.MethodGet, - Path: "/api/show/:model", - TestPath: "/api/show/test-model", - Handler: RetrieveMiddleware, - Endpoint: func(c *gin.Context) { - c.JSON(http.StatusOK, api.ShowResponse{ - ModifiedAt: time.Date(2024, 6, 17, 13, 45, 0, 0, time.UTC), - }) - }, - Expected: func(t *testing.T, resp *httptest.ResponseRecorder) { - var retrieveResp Model - if err := json.NewDecoder(resp.Body).Decode(&retrieveResp); err != nil { - t.Fatal(err) - } - - if retrieveResp.Object != "model" { - t.Fatalf("Expected object to be model, got %s", retrieveResp.Object) - } - - if retrieveResp.Id != "test-model" { - t.Fatalf("Expected id to be test-model, got %s", retrieveResp.Id) - } + name: "list handler empty output", + endpoint: func(c *gin.Context) { + c.JSON(http.StatusOK, api.ListResponse{}) }, + resp: `{ + "object": "list", + "data": null + }`, }, } gin.SetMode(gin.TestMode) - router := gin.New() for _, tc := range testCases { - t.Run(tc.Name, func(t *testing.T) { - router = gin.New() - router.Use(tc.Handler()) - router.Handle(tc.Method, tc.Path, tc.Endpoint) - req, _ := http.NewRequest(tc.Method, tc.TestPath, nil) + router := gin.New() + router.Use(ListMiddleware()) + router.Handle(http.MethodGet, "/api/tags", tc.endpoint) + req, _ := http.NewRequest(http.MethodGet, "/api/tags", nil) - if tc.Setup != nil { - tc.Setup(t, req) - } + resp := httptest.NewRecorder() + router.ServeHTTP(resp, req) - resp := httptest.NewRecorder() - router.ServeHTTP(resp, req) + var expected, actual map[string]any + err := json.Unmarshal([]byte(tc.resp), &expected) + if err != nil { + t.Fatalf("failed to unmarshal expected response: %v", err) + } - assert.Equal(t, http.StatusOK, resp.Code) + err = json.Unmarshal(resp.Body.Bytes(), &actual) + if err != nil { + t.Fatalf("failed to unmarshal actual response: %v", err) + } - tc.Expected(t, resp) - }) + if !reflect.DeepEqual(expected, actual) { + t.Errorf("responses did not match\nExpected: %+v\nActual: %+v", expected, actual) + } + } +} + +func TestRetrieveMiddleware(t *testing.T) { + type testCase struct { + name string + endpoint func(c *gin.Context) + resp string + } + + testCases := []testCase{ + { + name: "retrieve handler", + endpoint: func(c *gin.Context) { + c.JSON(http.StatusOK, api.ShowResponse{ + ModifiedAt: time.Unix(int64(1686935002), 0).UTC(), + }) + }, + resp: `{ + "id":"test-model", + "object":"model", + "created":1686935002, + "owned_by":"library"} + `, + }, + { + name: "retrieve handler error forwarding", + endpoint: func(c *gin.Context) { + c.JSON(http.StatusBadRequest, gin.H{"error": "model not found"}) + }, + resp: `{ + "error": { + "code": null, + "message": "model not found", + "param": null, + "type": "api_error" + } + }`, + }, + } + + gin.SetMode(gin.TestMode) + + for _, tc := range testCases { + router := gin.New() + router.Use(RetrieveMiddleware()) + router.Handle(http.MethodGet, "/api/show/:model", tc.endpoint) + req, _ := http.NewRequest(http.MethodGet, "/api/show/test-model", nil) + + resp := httptest.NewRecorder() + router.ServeHTTP(resp, req) + + var expected, actual map[string]any + err := json.Unmarshal([]byte(tc.resp), &expected) + if err != nil { + t.Fatalf("failed to unmarshal expected response: %v", err) + } + + err = json.Unmarshal(resp.Body.Bytes(), &actual) + if err != nil { + t.Fatalf("failed to unmarshal actual response: %v", err) + } + + if 
!reflect.DeepEqual(expected, actual) { + t.Errorf("responses did not match\nExpected: %+v\nActual: %+v", expected, actual) + } } } diff --git a/progress/spinner.go b/progress/spinner.go index 02f3f9fb..e39a45ee 100644 --- a/progress/spinner.go +++ b/progress/spinner.go @@ -3,11 +3,12 @@ package progress import ( "fmt" "strings" + "sync/atomic" "time" ) type Spinner struct { - message string + message atomic.Value messageWidth int parts []string @@ -21,20 +22,25 @@ type Spinner struct { func NewSpinner(message string) *Spinner { s := &Spinner{ - message: message, parts: []string{ "⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏", }, started: time.Now(), } + s.SetMessage(message) go s.start() return s } +func (s *Spinner) SetMessage(message string) { + s.message.Store(message) +} + func (s *Spinner) String() string { var sb strings.Builder - if len(s.message) > 0 { - message := strings.TrimSpace(s.message) + + if message, ok := s.message.Load().(string); ok && len(message) > 0 { + message := strings.TrimSpace(message) if s.messageWidth > 0 && len(message) > s.messageWidth { message = message[:s.messageWidth] } diff --git a/readline/buffer.go b/readline/buffer.go index 68573d40..d91fe0a9 100644 --- a/readline/buffer.go +++ b/readline/buffer.go @@ -62,7 +62,7 @@ func (b *Buffer) MoveLeft() { rLength := runewidth.RuneWidth(r) if b.DisplayPos%b.LineWidth == 0 { - fmt.Printf(CursorUp + CursorBOL + cursorRightN(b.Width)) + fmt.Print(CursorUp + CursorBOL + CursorRightN(b.Width)) if rLength == 2 { fmt.Print(CursorLeft) } @@ -74,7 +74,7 @@ func (b *Buffer) MoveLeft() { fmt.Print(CursorLeft) } } else { - fmt.Print(cursorLeftN(rLength)) + fmt.Print(CursorLeftN(rLength)) } b.Pos -= 1 @@ -115,15 +115,15 @@ func (b *Buffer) MoveRight() { b.DisplayPos += rLength if b.DisplayPos%b.LineWidth == 0 { - fmt.Printf(CursorDown + CursorBOL + cursorRightN(len(b.Prompt.prompt()))) + fmt.Print(CursorDown + CursorBOL + CursorRightN(len(b.Prompt.prompt()))) } else if (b.DisplayPos-rLength)%b.LineWidth == b.LineWidth-1 && hasSpace { - fmt.Printf(CursorDown + CursorBOL + cursorRightN(len(b.Prompt.prompt())+rLength)) + fmt.Print(CursorDown + CursorBOL + CursorRightN(len(b.Prompt.prompt())+rLength)) b.DisplayPos += 1 } else if b.LineHasSpace.Size() > 0 && b.DisplayPos%b.LineWidth == b.LineWidth-1 && hasSpace { - fmt.Printf(CursorDown + CursorBOL + cursorRightN(len(b.Prompt.prompt()))) + fmt.Print(CursorDown + CursorBOL + CursorRightN(len(b.Prompt.prompt()))) b.DisplayPos += 1 } else { - fmt.Print(cursorRightN(rLength)) + fmt.Print(CursorRightN(rLength)) } } } @@ -154,7 +154,7 @@ func (b *Buffer) MoveToStart() { fmt.Print(CursorUp) } } - fmt.Printf(CursorBOL + cursorRightN(len(b.Prompt.prompt()))) + fmt.Print(CursorBOL + CursorRightN(len(b.Prompt.prompt()))) b.Pos = 0 b.DisplayPos = 0 } @@ -169,9 +169,9 @@ func (b *Buffer) MoveToEnd() { fmt.Print(CursorDown) } remainder := b.DisplaySize() % b.LineWidth - fmt.Printf(CursorBOL + cursorRightN(len(b.Prompt.prompt())+remainder)) + fmt.Print(CursorBOL + CursorRightN(len(b.Prompt.prompt())+remainder)) } else { - fmt.Print(cursorRightN(b.DisplaySize() - b.DisplayPos)) + fmt.Print(CursorRightN(b.DisplaySize() - b.DisplayPos)) } b.Pos = b.Buf.Size() @@ -286,8 +286,7 @@ func (b *Buffer) drawRemaining() { remLength := runewidth.StringWidth(remainingText) if len(currLine) > 0 { - fmt.Printf(ClearToEOL + currLine) - fmt.Print(cursorLeftN(currLineSpace)) + fmt.Print(ClearToEOL + currLine + CursorLeftN(currLineSpace)) } else { fmt.Print(ClearToEOL) } @@ -301,9 +300,9 @@ func (b *Buffer) 
drawRemaining() { } if (b.DisplayPos+currLineSpace)%b.LineWidth == 0 && currLine == remainingText { - fmt.Print(cursorRightN(currLineSpace)) + fmt.Print(CursorRightN(currLineSpace)) fmt.Printf("\n%s", b.Prompt.AltPrompt) - fmt.Printf(CursorUp + CursorBOL + cursorRightN(b.Width-currLineSpace)) + fmt.Print(CursorUp + CursorBOL + CursorRightN(b.Width-currLineSpace)) } // render the other lines @@ -333,9 +332,7 @@ func (b *Buffer) drawRemaining() { lineLength += runewidth.RuneWidth(c) fmt.Printf("%c", c) } - fmt.Print(ClearToEOL) - fmt.Print(cursorUpN(totalLines)) - fmt.Printf(CursorBOL + cursorRightN(b.Width-currLineSpace)) + fmt.Print(ClearToEOL + CursorUpN(totalLines) + CursorBOL + CursorRightN(b.Width-currLineSpace)) hasSpace := b.GetLineSpacing(b.DisplayPos / b.LineWidth) @@ -357,8 +354,7 @@ func (b *Buffer) Remove() { if b.DisplayPos%b.LineWidth == 0 { // if the user backspaces over the word boundary, do this magic to clear the line // and move to the end of the previous line - fmt.Printf(CursorBOL + ClearToEOL) - fmt.Printf(CursorUp + CursorBOL + cursorRightN(b.Width)) + fmt.Print(CursorBOL + ClearToEOL + CursorUp + CursorBOL + CursorRightN(b.Width)) if b.DisplaySize()%b.LineWidth < (b.DisplaySize()-rLength)%b.LineWidth { b.LineHasSpace.Remove(b.DisplayPos/b.LineWidth - 1) @@ -370,24 +366,23 @@ func (b *Buffer) Remove() { } if rLength == 2 { - fmt.Print(CursorLeft + " " + cursorLeftN(2)) + fmt.Print(CursorLeft + " " + CursorLeftN(2)) } else { fmt.Print(" " + CursorLeft) } } else if (b.DisplayPos-rLength)%b.LineWidth == 0 && hasSpace { - fmt.Printf(CursorBOL + ClearToEOL) - fmt.Printf(CursorUp + CursorBOL + cursorRightN(b.Width)) + fmt.Print(CursorBOL + ClearToEOL + CursorUp + CursorBOL + CursorRightN(b.Width)) if b.Pos == b.Buf.Size() { b.LineHasSpace.Remove(b.DisplayPos/b.LineWidth - 1) } b.DisplayPos -= 1 } else { - fmt.Print(cursorLeftN(rLength)) + fmt.Print(CursorLeftN(rLength)) for range rLength { fmt.Print(" ") } - fmt.Print(cursorLeftN(rLength)) + fmt.Print(CursorLeftN(rLength)) } var eraseExtraLine bool @@ -405,9 +400,9 @@ func (b *Buffer) Remove() { // are trailing characters which go over the line width boundary if eraseExtraLine { remainingLines := (b.DisplaySize() - b.DisplayPos) / b.LineWidth - fmt.Printf(cursorDownN(remainingLines+1) + CursorBOL + ClearToEOL) + fmt.Print(CursorDownN(remainingLines+1) + CursorBOL + ClearToEOL) place := b.DisplayPos % b.LineWidth - fmt.Printf(cursorUpN(remainingLines+1) + cursorRightN(place+len(b.Prompt.prompt()))) + fmt.Print(CursorUpN(remainingLines+1) + CursorRightN(place+len(b.Prompt.prompt()))) } } } @@ -422,9 +417,9 @@ func (b *Buffer) Delete() { if b.DisplaySize()%b.LineWidth == 0 { if b.DisplayPos != b.DisplaySize() { remainingLines := (b.DisplaySize() - b.DisplayPos) / b.LineWidth - fmt.Printf(cursorDownN(remainingLines) + CursorBOL + ClearToEOL) + fmt.Print(CursorDownN(remainingLines) + CursorBOL + ClearToEOL) place := b.DisplayPos % b.LineWidth - fmt.Printf(cursorUpN(remainingLines) + cursorRightN(place+len(b.Prompt.prompt()))) + fmt.Print(CursorUpN(remainingLines) + CursorRightN(place+len(b.Prompt.prompt()))) } } } @@ -471,17 +466,17 @@ func (b *Buffer) DeleteWord() { } func (b *Buffer) ClearScreen() { - fmt.Printf(ClearScreen + CursorReset + b.Prompt.prompt()) + fmt.Print(ClearScreen + CursorReset + b.Prompt.prompt()) if b.IsEmpty() { ph := b.Prompt.placeholder() - fmt.Printf(ColorGrey + ph + cursorLeftN(len(ph)) + ColorDefault) + fmt.Print(ColorGrey + ph + CursorLeftN(len(ph)) + ColorDefault) } else { currPos := b.DisplayPos 
currIndex := b.Pos b.Pos = 0 b.DisplayPos = 0 b.drawRemaining() - fmt.Printf(CursorReset + cursorRightN(len(b.Prompt.prompt()))) + fmt.Print(CursorReset + CursorRightN(len(b.Prompt.prompt()))) if currPos > 0 { targetLine := currPos / b.LineWidth if targetLine > 0 { @@ -491,10 +486,10 @@ func (b *Buffer) ClearScreen() { } remainder := currPos % b.LineWidth if remainder > 0 { - fmt.Print(cursorRightN(remainder)) + fmt.Print(CursorRightN(remainder)) } if currPos%b.LineWidth == 0 { - fmt.Printf(CursorBOL + b.Prompt.AltPrompt) + fmt.Print(CursorBOL + b.Prompt.AltPrompt) } } b.Pos = currIndex @@ -513,13 +508,13 @@ func (b *Buffer) Replace(r []rune) { b.Buf.Clear() - fmt.Printf(CursorBOL + ClearToEOL) + fmt.Print(CursorBOL + ClearToEOL) for range lineNums { fmt.Print(CursorUp + CursorBOL + ClearToEOL) } - fmt.Printf(CursorBOL + b.Prompt.prompt()) + fmt.Print(CursorBOL + b.Prompt.prompt()) for _, c := range r { b.Add(c) @@ -545,19 +540,3 @@ func (b *Buffer) StringNM(n, m int) string { } return s } - -func cursorLeftN(n int) string { - return fmt.Sprintf(CursorLeftN, n) -} - -func cursorRightN(n int) string { - return fmt.Sprintf(CursorRightN, n) -} - -func cursorUpN(n int) string { - return fmt.Sprintf(CursorUpN, n) -} - -func cursorDownN(n int) string { - return fmt.Sprintf(CursorDownN, n) -} diff --git a/readline/readline.go b/readline/readline.go index e90a5e01..1c14fe10 100644 --- a/readline/readline.go +++ b/readline/readline.go @@ -98,7 +98,7 @@ func (i *Instance) Readline() (string, error) { showPlaceholder := !i.Pasting || i.Prompt.UseAlt if buf.IsEmpty() && showPlaceholder { ph := i.Prompt.placeholder() - fmt.Printf(ColorGrey + ph + fmt.Sprintf(CursorLeftN, len(ph)) + ColorDefault) + fmt.Print(ColorGrey + ph + CursorLeftN(len(ph)) + ColorDefault) } r, err := i.Terminal.Read() diff --git a/readline/types.go b/readline/types.go index 3b88588f..e136d996 100644 --- a/readline/types.go +++ b/readline/types.go @@ -1,5 +1,7 @@ package readline +import "strconv" + const ( CharNull = 0 CharLineStart = 1 @@ -41,34 +43,49 @@ const ( ) const ( - CursorUp = "\033[1A" - CursorDown = "\033[1B" - CursorRight = "\033[1C" - CursorLeft = "\033[1D" + Esc = "\x1b" - CursorSave = "\033[s" - CursorRestore = "\033[u" + CursorSave = Esc + "[s" + CursorRestore = Esc + "[u" - CursorUpN = "\033[%dA" - CursorDownN = "\033[%dB" - CursorRightN = "\033[%dC" - CursorLeftN = "\033[%dD" + CursorEOL = Esc + "[E" + CursorBOL = Esc + "[1G" + CursorHide = Esc + "[?25l" + CursorShow = Esc + "[?25h" - CursorEOL = "\033[E" - CursorBOL = "\033[1G" - CursorHide = "\033[?25l" - CursorShow = "\033[?25h" + ClearToEOL = Esc + "[K" + ClearLine = Esc + "[2K" + ClearScreen = Esc + "[2J" + CursorReset = Esc + "[0;0f" - ClearToEOL = "\033[K" - ClearLine = "\033[2K" - ClearScreen = "\033[2J" - CursorReset = "\033[0;0f" + ColorGrey = Esc + "[38;5;245m" + ColorDefault = Esc + "[0m" - ColorGrey = "\033[38;5;245m" - ColorDefault = "\033[0m" + StartBracketedPaste = Esc + "[?2004h" + EndBracketedPaste = Esc + "[?2004l" +) - StartBracketedPaste = "\033[?2004h" - EndBracketedPaste = "\033[?2004l" +func CursorUpN(n int) string { + return Esc + "[" + strconv.Itoa(n) + "A" +} + +func CursorDownN(n int) string { + return Esc + "[" + strconv.Itoa(n) + "B" +} + +func CursorRightN(n int) string { + return Esc + "[" + strconv.Itoa(n) + "C" +} + +func CursorLeftN(n int) string { + return Esc + "[" + strconv.Itoa(n) + "D" +} + +var ( + CursorUp = CursorUpN(1) + CursorDown = CursorDownN(1) + CursorRight = CursorRightN(1) + CursorLeft = CursorLeftN(1) ) const ( 
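A note on the readline/types.go change above: the old printf-style constants (e.g. "\033[%dD") forced every caller through fmt.Sprintf, and several call sites passed assembled strings straight to fmt.Printf, which misparses any '%' byte in the payload (readline/buffer.go had exactly this with fmt.Printf(ClearToEOL + currLine), where currLine is user text). Below is a minimal, standalone sketch of the hazard and the new helper shape; the demo string is illustrative, not from the repo.

package main

import (
	"fmt"
	"strconv"
)

const Esc = "\x1b"

// CursorLeftN mirrors the helper shape introduced in readline/types.go:
// the count is spliced in with strconv, leaving no format verbs in the
// returned string.
func CursorLeftN(n int) string {
	return Esc + "[" + strconv.Itoa(n) + "D"
}

func main() {
	// Any user text echoed to the terminal can contain a '%'.
	s := "100% done" + CursorLeftN(2)

	// Old pattern: the assembled string is used as a Printf format, so
	// "% d" is parsed as a verb and renders as "100%!d(MISSING)one".
	fmt.Printf(s) //nolint:govet
	fmt.Println()

	// New pattern: fmt.Print writes the bytes verbatim, which is why the
	// diff converts these call sites from Printf to Print.
	fmt.Print(s)
	fmt.Println()
}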
diff --git a/scripts/build_linux.sh b/scripts/build_linux.sh index 27c4ff1f..6cb0d0cd 100755 --- a/scripts/build_linux.sh +++ b/scripts/build_linux.sh @@ -4,6 +4,7 @@ set -eu export VERSION=${VERSION:-$(git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")} export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$VERSION\" \"-X=github.com/ollama/ollama/server.mode=release\"'" +GZIP=$(which pigz 2>/dev/null || echo "gzip") BUILD_ARCH=${BUILD_ARCH:-"amd64 arm64"} export AMDGPU_TARGETS=${AMDGPU_TARGETS:=""} @@ -21,11 +22,16 @@ for TARGETARCH in ${BUILD_ARCH}; do -t builder:$TARGETARCH \ . docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH - docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/ollama ./dist/ollama-linux-$TARGETARCH - - if [ "$TARGETARCH" = "amd64" ]; then - docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/deps/ ./dist/ + rm -rf ./dist/linux-$TARGETARCH + docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/linux-$TARGETARCH ./dist + if echo ${TARGETARCH} | grep "amd64" > /dev/null; then + docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/linux-$TARGETARCH-rocm ./dist fi - docker rm builder-$TARGETARCH + echo "Compressing final linux bundle..." + rm -f ./dist/ollama-linux-$TARGETARCH.tgz + (cd dist/linux-$TARGETARCH && tar cf - . | ${GZIP} --best > ../ollama-linux-$TARGETARCH.tgz ) + if [ -d dist/linux-$TARGETARCH-rocm ]; then + (cd dist/linux-$TARGETARCH-rocm && tar cf - . | ${GZIP} --best > ../ollama-linux-$TARGETARCH-rocm.tgz ) + fi done diff --git a/scripts/build_windows.ps1 b/scripts/build_windows.ps1 index edc73759..9cebf1f4 100644 --- a/scripts/build_windows.ps1 +++ b/scripts/build_windows.ps1 @@ -7,6 +7,7 @@ $ErrorActionPreference = "Stop" function checkEnv() { + $script:ARCH = $Env:PROCESSOR_ARCHITECTURE.ToLower() $script:TARGET_ARCH=$Env:PROCESSOR_ARCHITECTURE.ToLower() Write-host "Building for ${script:TARGET_ARCH}" write-host "Locating required tools and paths" @@ -15,26 +16,23 @@ function checkEnv() { $MSVC_INSTALL=(Get-CimInstance MSFT_VSInstance -Namespace root/cimv2/vs)[0].InstallLocation $env:VCToolsRedistDir=(get-item "${MSVC_INSTALL}\VC\Redist\MSVC\*")[0] } - # Try to find the CUDA dir - if ($null -eq $env:NVIDIA_DIR) { + # Locate CUDA versions + # Note: this assumes every version found will be built + $cudaList=(get-item "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v*\bin\" -ea 'silentlycontinue') + if ($cudaList.length -eq 0) { $d=(get-command -ea 'silentlycontinue' nvcc).path - if ($d -ne $null) { - $script:NVIDIA_DIR=($d| split-path -parent) - } else { - $cudaList=(get-item "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v*\bin\" -ea 'silentlycontinue') - if ($cudaList.length > 0) { - $script:NVIDIA_DIR=$cudaList[0] - } + if ($null -ne $d) { + $script:CUDA_DIRS=@($d| split-path -parent) } } else { - $script:NVIDIA_DIR=$env:NVIDIA_DIR + $script:CUDA_DIRS=$cudaList } $script:INNO_SETUP_DIR=(get-item "C:\Program Files*\Inno Setup*\")[0] $script:DEPS_DIR="${script:SRC_DIR}\dist\windows-${script:TARGET_ARCH}" $env:CGO_ENABLED="1" - echo "Checking version" + Write-Output "Checking version" if (!$env:VERSION) { $data=(git describe --tags --first-parent --abbrev=7 --long --dirty --always) $pattern="v(.+)" @@ -71,7 +69,48 @@ function checkEnv() { function buildOllama() { write-host "Building ollama CLI" if ($null -eq ${env:OLLAMA_SKIP_GENERATE}) { - & go generate ./... 
+        Remove-Item -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}"
+
+        # TODO - consider trying to parallelize this with Start-ThreadJob, but env vars can't be used to toggle
+        # which targets to build
+
+        # Start by skipping CUDA to build everything else
+        pwsh -Command { $env:OLLAMA_SKIP_CUDA_GENERATE="1"; & go generate ./... }
+        if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
+
+        # Then skip everything else and build all the CUDA variants
+        foreach ($env:CUDA_LIB_DIR in $script:CUDA_DIRS) {
+            write-host "Building CUDA ${env:CUDA_LIB_DIR}"
+
+            if ($env:CUDA_LIB_DIR.Contains("v12")) {
+                pwsh -Command {
+                    $env:OLLAMA_SKIP_CUDA_GENERATE=""
+                    $env:OLLAMA_SKIP_STATIC_GENERATE="1"
+                    $env:OLLAMA_SKIP_CPU_GENERATE="1"
+                    $env:OLLAMA_SKIP_ONEAPI_GENERATE="1"
+                    $env:OLLAMA_SKIP_ROCM_GENERATE="1"
+                    $env:CMAKE_CUDA_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a"
+                    $env:OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on"
+                    $env:CUDA_PATH=split-path -path $env:CUDA_LIB_DIR -parent
+                    $env:PATH="$env:CUDA_LIB_DIR;$env:PATH"
+                    & go generate ./...
+                }
+            } else {
+                pwsh -Command {
+                    $env:OLLAMA_SKIP_CUDA_GENERATE=""
+                    $env:OLLAMA_SKIP_STATIC_GENERATE="1"
+                    $env:OLLAMA_SKIP_CPU_GENERATE="1"
+                    $env:OLLAMA_SKIP_ONEAPI_GENERATE="1"
+                    $env:OLLAMA_SKIP_ROCM_GENERATE="1"
+                    $env:CMAKE_CUDA_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86"
+                    $env:OLLAMA_CUSTOM_CUDA_DEFS=""
+                    $env:CUDA_PATH=split-path -path $env:CUDA_LIB_DIR -parent
+                    $env:PATH="$env:CUDA_LIB_DIR;$env:PATH"
+                    & go generate ./...
+                }
+            }
+            if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
+        }
         if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
     } else {
         write-host "Skipping generate step with OLLAMA_SKIP_GENERATE set"
@@ -83,8 +122,8 @@ function buildOllama() {
             /csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} ollama.exe
         if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
     }
-    New-Item -ItemType Directory -Path .\dist\windows-${script:TARGET_ARCH}\ -Force
-    cp .\ollama.exe .\dist\windows-${script:TARGET_ARCH}\
+    New-Item -ItemType Directory -Path .\dist\windows-${script:TARGET_ARCH}\bin\ -Force
+    cp .\ollama.exe .\dist\windows-${script:TARGET_ARCH}\bin\
 }
 
 function buildApp() {
@@ -103,22 +142,22 @@ function buildApp() {
 function gatherDependencies() {
     write-host "Gathering runtime dependencies"
     cd "${script:SRC_DIR}"
-    md "${script:DEPS_DIR}\ollama_runners" -ea 0 > $null
+    md "${script:DEPS_DIR}\lib\ollama" -ea 0 > $null
 
     # TODO - this varies based on host build system and MSVC version - drive from dumpbin output
     # currently works for Win11 + MSVC 2019 + Cuda V11
-    cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140*.dll" "${script:DEPS_DIR}\ollama_runners\"
-    cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\ollama_runners\"
-    cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\ollama_runners\"
+    cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140*.dll" "${script:DEPS_DIR}\lib\ollama\"
+    cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\lib\ollama\"
+    cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\lib\ollama\"
     foreach ($part in $("runtime", "stdio", "filesystem", "math", "convert", "heap", "string", "time", "locale", "environment")) {
-        cp "$env:VCToolsRedistDir\..\..\..\Tools\Llvm\x64\bin\api-ms-win-crt-${part}*.dll" "${script:DEPS_DIR}\ollama_runners\"
+        cp "$env:VCToolsRedistDir\..\..\..\Tools\Llvm\x64\bin\api-ms-win-crt-${part}*.dll" "${script:DEPS_DIR}\lib\ollama\"
     }
     cp "${script:SRC_DIR}\app\ollama_welcome.ps1" "${script:SRC_DIR}\dist\"
     if ("${env:KEY_CONTAINER}") {
         write-host "about to sign"
-        foreach ($file in (get-childitem "${script:DEPS_DIR}\cuda\cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){
+        foreach ($file in (get-childitem "${script:DEPS_DIR}\lib\ollama\cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){
             write-host "signing $file"
             & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
                 /csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} $file
diff --git a/scripts/install.sh b/scripts/install.sh
index aa8b3e5e..25f57565 100644
--- a/scripts/install.sh
+++ b/scripts/install.sh
@@ -63,16 +63,36 @@ if [ -n "$NEEDS" ]; then
     exit 1
 fi
 
-status "Downloading ollama..."
-curl --fail --show-error --location --progress-bar -o $TEMP_DIR/ollama "https://ollama.com/download/ollama-linux-${ARCH}${VER_PARAM}"
-
 for BINDIR in /usr/local/bin /usr/bin /bin; do
     echo $PATH | grep -q $BINDIR && break || continue
 done
+OLLAMA_INSTALL_DIR=$(dirname ${BINDIR})
 
-status "Installing ollama to $BINDIR..."
+status "Installing ollama to $OLLAMA_INSTALL_DIR"
 $SUDO install -o0 -g0 -m755 -d $BINDIR
-$SUDO install -o0 -g0 -m755 $TEMP_DIR/ollama $BINDIR/ollama
+$SUDO install -o0 -g0 -m755 -d "$OLLAMA_INSTALL_DIR"
+if curl -I --silent --fail --location "https://ollama.com/download/ollama-linux-${ARCH}.tgz${VER_PARAM}" >/dev/null ; then
+    status "Downloading Linux ${ARCH} bundle"
+    curl --fail --show-error --location --progress-bar \
+        "https://ollama.com/download/ollama-linux-${ARCH}.tgz${VER_PARAM}" | \
+        $SUDO tar -xzf - -C "$OLLAMA_INSTALL_DIR"
+    BUNDLE=1
+    if [ "$OLLAMA_INSTALL_DIR/bin/ollama" != "$BINDIR/ollama" ] ; then
+        status "Making ollama accessible in the PATH in $BINDIR"
+        $SUDO ln -sf "$OLLAMA_INSTALL_DIR/bin/ollama" "$BINDIR/ollama"
+    fi
+else
+    status "Downloading Linux ${ARCH} CLI"
+    curl --fail --show-error --location --progress-bar -o "$TEMP_DIR/ollama" \
+        "https://ollama.com/download/ollama-linux-${ARCH}${VER_PARAM}"
+    $SUDO install -o0 -g0 -m755 $TEMP_DIR/ollama $OLLAMA_INSTALL_DIR/ollama
+    BUNDLE=0
+    if [ "$OLLAMA_INSTALL_DIR/ollama" != "$BINDIR/ollama" ] ; then
+        status "Making ollama accessible in the PATH in $BINDIR"
+        $SUDO ln -sf "$OLLAMA_INSTALL_DIR/ollama" "$BINDIR/ollama"
+    fi
+fi
+
 
 install_success() {
     status 'The Ollama API is now available at 127.0.0.1:11434.'
@@ -178,6 +198,16 @@ if ! check_gpu lspci nvidia && ! check_gpu lshw nvidia && ! check_gpu lspci amdg
 fi
 
 if check_gpu lspci amdgpu || check_gpu lshw amdgpu; then
+    if [ $BUNDLE -ne 0 ]; then
+        status "Downloading Linux ROCm ${ARCH} bundle"
+        curl --fail --show-error --location --progress-bar \
+            "https://ollama.com/download/ollama-linux-${ARCH}-rocm.tgz${VER_PARAM}" | \
+            $SUDO tar -xzf - -C "$OLLAMA_INSTALL_DIR"
+
+        install_success
+        status "AMD GPU ready."
+ exit 0 + fi # Look for pre-existing ROCm v6 before downloading the dependencies for search in "${HIP_PATH:-''}" "${ROCM_PATH:-''}" "/opt/rocm" "/usr/lib64"; do if [ -n "${search}" ] && [ -e "${search}/libhipblas.so.2" -o -e "${search}/lib/libhipblas.so.2" ]; then @@ -209,15 +239,15 @@ install_cuda_driver_yum() { case $PACKAGE_MANAGER in yum) $SUDO $PACKAGE_MANAGER -y install yum-utils - if curl -I --silent --fail --location "https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo" >/dev/null ; then - $SUDO $PACKAGE_MANAGER-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo + if curl -I --silent --fail --location "https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m | sed -e 's/aarch64/sbsa/')/cuda-$1$2.repo" >/dev/null ; then + $SUDO $PACKAGE_MANAGER-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m | sed -e 's/aarch64/sbsa/')/cuda-$1$2.repo else error $CUDA_REPO_ERR_MSG fi ;; dnf) - if curl -I --silent --fail --location "https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo" >/dev/null ; then - $SUDO $PACKAGE_MANAGER config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo + if curl -I --silent --fail --location "https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m | sed -e 's/aarch64/sbsa/')/cuda-$1$2.repo" >/dev/null ; then + $SUDO $PACKAGE_MANAGER config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m | sed -e 's/aarch64/sbsa/')/cuda-$1$2.repo else error $CUDA_REPO_ERR_MSG fi @@ -245,8 +275,8 @@ install_cuda_driver_yum() { # ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#debian install_cuda_driver_apt() { status 'Installing NVIDIA repository...' 
- if curl -I --silent --fail --location "https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-keyring_1.1-1_all.deb" >/dev/null ; then - curl -fsSL -o $TEMP_DIR/cuda-keyring.deb https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-keyring_1.1-1_all.deb + if curl -I --silent --fail --location "https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m | sed -e 's/aarch64/sbsa/')/cuda-keyring_1.1-1_all.deb" >/dev/null ; then + curl -fsSL -o $TEMP_DIR/cuda-keyring.deb https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m | sed -e 's/aarch64/sbsa/')/cuda-keyring_1.1-1_all.deb else error $CUDA_REPO_ERR_MSG fi diff --git a/scripts/rh_linux_deps.sh b/scripts/rh_linux_deps.sh index 81648d68..b4c9afd6 100644 --- a/scripts/rh_linux_deps.sh +++ b/scripts/rh_linux_deps.sh @@ -3,6 +3,7 @@ # Script for common Dockerfile dependency installation in redhat linux based images set -ex +set -o pipefail MACHINE=$(uname -m) if grep -i "centos" /etc/system-release >/dev/null; then @@ -29,7 +30,7 @@ if grep -i "centos" /etc/system-release >/dev/null; then dnf install -y rh-git227-git ln -s /opt/rh/rh-git227/root/usr/bin/git /usr/local/bin/git fi - dnf install -y devtoolset-10-gcc devtoolset-10-gcc-c++ + dnf install -y devtoolset-10-gcc devtoolset-10-gcc-c++ pigz elif grep -i "rocky" /etc/system-release >/dev/null; then # Temporary workaround until rocky 8 AppStream ships GCC 10.4 (10.3 is incompatible with NVCC) cat << EOF > /etc/yum.repos.d/Rocky-Vault.repo @@ -43,12 +44,21 @@ gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-rockyofficial EOF dnf install -y git \ gcc-toolset-10-gcc-10.2.1-8.2.el8 \ - gcc-toolset-10-gcc-c++-10.2.1-8.2.el8 + gcc-toolset-10-gcc-c++-10.2.1-8.2.el8 \ + pigz else echo "ERROR Unexpected distro" exit 1 fi +if [ "${MACHINE}" = "x86_64" ] ; then + curl -s -L https://github.com/ccache/ccache/releases/download/v4.10.2/ccache-4.10.2-linux-x86_64.tar.xz | tar -Jx -C /tmp --strip-components 1 && \ + mv /tmp/ccache /usr/local/bin/ +else + yum -y install epel-release + yum install -y ccache +fi + if [ -n "${CMAKE_VERSION}" ]; then curl -s -L https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz | tar -zx -C /usr --strip-components 1 fi diff --git a/server/download.go b/server/download.go index a903d96f..02f7ae88 100644 --- a/server/download.go +++ b/server/download.go @@ -94,7 +94,7 @@ func (p *blobDownloadPart) UnmarshalJSON(b []byte) error { } const ( - numDownloadParts = 64 + numDownloadParts = 16 minDownloadPartSize int64 = 100 * format.MegaByte maxDownloadPartSize int64 = 1000 * format.MegaByte ) @@ -216,6 +216,7 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *regis return err } defer file.Close() + setSparse(file) _ = file.Truncate(b.Total) @@ -232,7 +233,7 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *regis newOpts.CheckRedirect = func(req *http.Request, via []*http.Request) error { if len(via) > 10 { - return errors.New("maxium redirects exceeded (10) for directURL") + return errors.New("maximum redirects exceeded (10) for directURL") } // if the hostname is the same, allow the redirect diff --git a/server/images.go b/server/images.go index 81357f3c..8b3a67cf 100644 --- a/server/images.go +++ b/server/images.go @@ -215,25 +215,20 @@ func GetManifest(mp ModelPath) (*Manifest, string, error) { return nil, "", err } - if _, err = os.Stat(fp); err != nil { - return nil, "", err - } - - var 
manifest *Manifest - - bts, err := os.ReadFile(fp) + f, err := os.Open(fp) if err != nil { - return nil, "", fmt.Errorf("couldn't open file '%s'", fp) + return nil, "", err } + defer f.Close() - shaSum := sha256.Sum256(bts) - shaStr := hex.EncodeToString(shaSum[:]) + sha256sum := sha256.New() - if err := json.Unmarshal(bts, &manifest); err != nil { + var manifest Manifest + if err := json.NewDecoder(io.TeeReader(f, sha256sum)).Decode(&manifest); err != nil { return nil, "", err } - return manifest, shaStr, nil + return &manifest, hex.EncodeToString(sha256sum.Sum(nil)), nil } func GetModel(name string) (*Model, error) { @@ -250,19 +245,21 @@ func GetModel(name string) (*Model, error) { Template: template.DefaultTemplate, } - filename, err := GetBlobsPath(manifest.Config.Digest) - if err != nil { - return nil, err - } + if manifest.Config.Digest != "" { + filename, err := GetBlobsPath(manifest.Config.Digest) + if err != nil { + return nil, err + } - configFile, err := os.Open(filename) - if err != nil { - return nil, err - } - defer configFile.Close() + configFile, err := os.Open(filename) + if err != nil { + return nil, err + } + defer configFile.Close() - if err := json.NewDecoder(configFile).Decode(&model.Config); err != nil { - return nil, err + if err := json.NewDecoder(configFile).Decode(&model.Config); err != nil { + return nil, err + } } for _, layer := range manifest.Layers { @@ -371,7 +368,7 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio var messages []*api.Message parameters := make(map[string]any) - var layers []*Layer + var layers []Layer for _, c := range modelfile.Commands { mediatype := fmt.Sprintf("application/vnd.ollama.image.%s", c.Name) @@ -497,7 +494,7 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio if c.Name != "license" { // replace - layers = slices.DeleteFunc(layers, func(layer *Layer) bool { + layers = slices.DeleteFunc(layers, func(layer Layer) bool { if layer.MediaType != mediatype { return false } @@ -543,7 +540,7 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio } var err2 error - layers = slices.DeleteFunc(layers, func(layer *Layer) bool { + layers = slices.DeleteFunc(layers, func(layer Layer) bool { switch layer.MediaType { case "application/vnd.ollama.image.message": // if there are new messages, remove the inherited ones @@ -623,12 +620,12 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio return err } - layer, err := NewLayer(&b, "application/vnd.docker.container.image.v1+json") + configLayer, err := NewLayer(&b, "application/vnd.docker.container.image.v1+json") if err != nil { return err } - for _, layer := range append(layers, layer) { + for _, layer := range append(layers, configLayer) { if layer.status != "" { fn(api.ProgressResponse{Status: layer.status}) } @@ -637,7 +634,7 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio old, _ := ParseNamedManifest(name) fn(api.ProgressResponse{Status: "writing manifest"}) - if err := WriteManifest(name, layer, layers); err != nil { + if err := WriteManifest(name, configLayer, layers); err != nil { return err } @@ -690,44 +687,18 @@ func CopyModel(src, dst model.Name) error { return err } -func deleteUnusedLayers(skipModelPath *ModelPath, deleteMap map[string]struct{}) error { - fp, err := GetManifestPath() +func deleteUnusedLayers(deleteMap map[string]struct{}) error { + manifests, err := Manifests() if err != nil { return err } - walkFunc := 
func(path string, info os.FileInfo, _ error) error {
-		if info.IsDir() {
-			return nil
-		}
-
-		dir, file := filepath.Split(path)
-		dir = strings.Trim(strings.TrimPrefix(dir, fp), string(os.PathSeparator))
-		tag := strings.Join([]string{dir, file}, ":")
-		fmp := ParseModelPath(tag)
-
-		// skip the manifest we're trying to delete
-		if skipModelPath != nil && skipModelPath.GetFullTagname() == fmp.GetFullTagname() {
-			return nil
-		}
-
-		// save (i.e. delete from the deleteMap) any files used in other manifests
-		manifest, _, err := GetManifest(fmp)
-		if err != nil {
-			//nolint:nilerr
-			return nil
-		}
-
+	for _, manifest := range manifests {
 		for _, layer := range manifest.Layers {
 			delete(deleteMap, layer.Digest)
 		}
 
 		delete(deleteMap, manifest.Config.Digest)
-		return nil
-	}
-
-	if err := filepath.Walk(fp, walkFunc); err != nil {
-		return err
 	}
 
 	// only delete the files which are still in the deleteMap
@@ -780,9 +751,9 @@ func PruneLayers() error {
 
 	slog.Info(fmt.Sprintf("total blobs: %d", len(deleteMap)))
 
-	err = deleteUnusedLayers(nil, deleteMap)
-	if err != nil {
-		return err
+	if err := deleteUnusedLayers(deleteMap); err != nil {
+		slog.Error(fmt.Sprintf("couldn't remove unused layers: %v", err))
+		return nil
 	}
 
 	slog.Info(fmt.Sprintf("total unused blobs removed: %d", len(deleteMap)))
@@ -837,9 +808,11 @@ func PushModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
 		return err
 	}
 
-	var layers []*Layer
+	var layers []Layer
 	layers = append(layers, manifest.Layers...)
-	layers = append(layers, manifest.Config)
+	if manifest.Config.Digest != "" {
+		layers = append(layers, manifest.Config)
+	}
 
 	for _, layer := range layers {
 		if err := uploadBlob(ctx, mp, layer, regOpts, fn); err != nil {
@@ -873,23 +846,18 @@ func PushModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
 func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn func(api.ProgressResponse)) error {
 	mp := ParseModelPath(name)
 
-	var manifest *Manifest
-	var err error
-	var noprune string
-
 	// build deleteMap to prune unused layers
 	deleteMap := make(map[string]struct{})
-
-	if !envconfig.NoPrune() {
-		manifest, _, err = GetManifest(mp)
-		if err != nil && !errors.Is(err, os.ErrNotExist) {
-			return err
+	manifest, _, err := GetManifest(mp)
+	if errors.Is(err, os.ErrNotExist) {
+		// noop
+	} else if err != nil {
+		return err
+	} else {
+		for _, l := range manifest.Layers {
+			deleteMap[l.Digest] = struct{}{}
 		}
-
-		if manifest != nil {
-			for _, l := range manifest.Layers {
-				deleteMap[l.Digest] = struct{}{}
-			}
+		if manifest.Config.Digest != "" {
 			deleteMap[manifest.Config.Digest] = struct{}{}
 		}
 	}
@@ -905,9 +873,11 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
 		return fmt.Errorf("pull model manifest: %s", err)
 	}
 
-	var layers []*Layer
+	var layers []Layer
 	layers = append(layers, manifest.Layers...)
- layers = append(layers, manifest.Config) + if manifest.Config.Digest != "" { + layers = append(layers, manifest.Config) + } skipVerify := make(map[string]bool) for _, layer := range layers { @@ -967,11 +937,10 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu return err } - if noprune == "" { - fn(api.ProgressResponse{Status: "removing any unused layers"}) - err = deleteUnusedLayers(nil, deleteMap) - if err != nil { - return err + if !envconfig.NoPrune() && len(deleteMap) > 0 { + fn(api.ProgressResponse{Status: "removing unused layers"}) + if err := deleteUnusedLayers(deleteMap); err != nil { + fn(api.ProgressResponse{Status: fmt.Sprintf("couldn't remove unused layers: %v", err)}) } } @@ -991,12 +960,12 @@ func pullModelManifest(ctx context.Context, mp ModelPath, regOpts *registryOptio } defer resp.Body.Close() - var m *Manifest + var m Manifest if err := json.NewDecoder(resp.Body).Decode(&m); err != nil { return nil, err } - return m, err + return &m, err } // GetSHA256Digest returns the SHA256 hash of a given buffer and returns it, and the size of buffer diff --git a/server/layer.go b/server/layer.go index cc6709d2..0bdee72b 100644 --- a/server/layer.go +++ b/server/layer.go @@ -2,6 +2,7 @@ package server import ( "crypto/sha256" + "errors" "fmt" "io" "os" @@ -15,15 +16,15 @@ type Layer struct { status string } -func NewLayer(r io.Reader, mediatype string) (*Layer, error) { +func NewLayer(r io.Reader, mediatype string) (Layer, error) { blobs, err := GetBlobsPath("") if err != nil { - return nil, err + return Layer{}, err } temp, err := os.CreateTemp(blobs, "sha256-") if err != nil { - return nil, err + return Layer{}, err } defer temp.Close() defer os.Remove(temp.Name()) @@ -31,28 +32,31 @@ func NewLayer(r io.Reader, mediatype string) (*Layer, error) { sha256sum := sha256.New() n, err := io.Copy(io.MultiWriter(temp, sha256sum), r) if err != nil { - return nil, err + return Layer{}, err } if err := temp.Close(); err != nil { - return nil, err + return Layer{}, err } digest := fmt.Sprintf("sha256:%x", sha256sum.Sum(nil)) blob, err := GetBlobsPath(digest) if err != nil { - return nil, err + return Layer{}, err } status := "using existing layer" if _, err := os.Stat(blob); err != nil { status = "creating new layer" if err := os.Rename(temp.Name(), blob); err != nil { - return nil, err + return Layer{}, err + } + if err := os.Chmod(blob, 0o644); err != nil { + return Layer{}, err } } - return &Layer{ + return Layer{ MediaType: mediatype, Digest: digest, Size: n, @@ -60,18 +64,22 @@ func NewLayer(r io.Reader, mediatype string) (*Layer, error) { }, nil } -func NewLayerFromLayer(digest, mediatype, from string) (*Layer, error) { +func NewLayerFromLayer(digest, mediatype, from string) (Layer, error) { + if digest == "" { + return Layer{}, errors.New("creating new layer from layer with empty digest") + } + blob, err := GetBlobsPath(digest) if err != nil { - return nil, err + return Layer{}, err } fi, err := os.Stat(blob) if err != nil { - return nil, err + return Layer{}, err } - return &Layer{ + return Layer{ MediaType: mediatype, Digest: digest, Size: fi.Size(), @@ -81,6 +89,10 @@ func NewLayerFromLayer(digest, mediatype, from string) (*Layer, error) { } func (l *Layer) Open() (io.ReadSeekCloser, error) { + if l.Digest == "" { + return nil, errors.New("opening layer with empty digest") + } + blob, err := GetBlobsPath(l.Digest) if err != nil { return nil, err @@ -90,6 +102,10 @@ func (l *Layer) Open() (io.ReadSeekCloser, error) { } func (l *Layer) Remove() error 
{ + if l.Digest == "" { + return nil + } + ms, err := Manifests() if err != nil { return err diff --git a/server/manifest.go b/server/manifest.go index b8df11ef..6b04753f 100644 --- a/server/manifest.go +++ b/server/manifest.go @@ -5,6 +5,7 @@ import ( "encoding/hex" "encoding/json" "errors" + "fmt" "io" "log/slog" "os" @@ -14,10 +15,10 @@ import ( ) type Manifest struct { - SchemaVersion int `json:"schemaVersion"` - MediaType string `json:"mediaType"` - Config *Layer `json:"config"` - Layers []*Layer `json:"layers"` + SchemaVersion int `json:"schemaVersion"` + MediaType string `json:"mediaType"` + Config Layer `json:"config"` + Layers []Layer `json:"layers"` filepath string fi os.FileInfo @@ -47,10 +48,12 @@ func (m *Manifest) Remove() error { func (m *Manifest) RemoveLayers() error { for _, layer := range append(m.Layers, m.Config) { - if err := layer.Remove(); errors.Is(err, os.ErrNotExist) { - slog.Debug("layer does not exist", "digest", layer.Digest) - } else if err != nil { - return err + if layer.Digest != "" { + if err := layer.Remove(); errors.Is(err, os.ErrNotExist) { + slog.Debug("layer does not exist", "digest", layer.Digest) + } else if err != nil { + return err + } } } @@ -93,7 +96,7 @@ func ParseNamedManifest(n model.Name) (*Manifest, error) { return &m, nil } -func WriteManifest(name model.Name, config *Layer, layers []*Layer) error { +func WriteManifest(name model.Name, config Layer, layers []Layer) error { manifests, err := GetManifestPath() if err != nil { return err @@ -148,14 +151,16 @@ func Manifests() (map[model.Name]*Manifest, error) { n := model.ParseNameFromFilepath(rel) if !n.IsValid() { - slog.Warn("bad manifest name", "path", rel, "error", err) + slog.Warn("bad manifest name", "path", rel) continue } m, err := ParseNamedManifest(n) - if err != nil { + if syntax := &(json.SyntaxError{}); errors.As(err, &syntax) { slog.Warn("bad manifest", "name", n, "error", err) continue + } else if err != nil { + return nil, fmt.Errorf("%s: %w", n, err) } ms[n] = m diff --git a/server/model.go b/server/model.go index f2946a0b..b17bf0e3 100644 --- a/server/model.go +++ b/server/model.go @@ -26,7 +26,7 @@ import ( var intermediateBlobs map[string]string = make(map[string]string) type layerGGML struct { - *Layer + Layer *llm.GGML } @@ -176,9 +176,20 @@ func parseFromFile(ctx context.Context, file *os.File, digest string, fn func(ap mediatype = "application/vnd.ollama.image.projector" } - layer, err := NewLayer(io.NewSectionReader(file, offset, n), mediatype) - if err != nil { - return nil, err + var layer Layer + if digest != "" && n == stat.Size() && offset == 0 { + layer, err = NewLayerFromLayer(digest, mediatype, file.Name()) + if err != nil { + slog.Debug("could not create new layer from layer", "error", err) + } + } + + // Fallback to creating layer from file copy (either NewLayerFromLayer failed, or digest empty/n != stat.Size()) + if layer.Digest == "" { + layer, err = NewLayer(io.NewSectionReader(file, offset, n), mediatype) + if err != nil { + return nil, err + } } layers = append(layers, &layerGGML{layer, ggml}) diff --git a/server/model_test.go b/server/model_test.go index aa214d3d..63fc408d 100644 --- a/server/model_test.go +++ b/server/model_test.go @@ -2,8 +2,10 @@ package server import ( "bytes" + "context" "encoding/json" "fmt" + "io" "os" "path/filepath" "testing" @@ -11,6 +13,7 @@ import ( "github.com/google/go-cmp/cmp" "github.com/ollama/ollama/api" + "github.com/ollama/ollama/llm" "github.com/ollama/ollama/template" ) @@ -133,3 +136,82 @@ The temperature in 
San Francisco, CA is 70°F and in Toronto, Canada is 20°C.`, }) } } + +func TestParseFromFileFromLayer(t *testing.T) { + tempModels := t.TempDir() + + file, err := os.CreateTemp(tempModels, "") + if err != nil { + t.Fatalf("failed to open file: %v", err) + } + defer file.Close() + if err := llm.WriteGGUF(file, llm.KV{"general.architecture": "gemma"}, []llm.Tensor{}); err != nil { + t.Fatalf("failed to write gguf: %v", err) + } + + if _, err := file.Seek(0, io.SeekStart); err != nil { + t.Fatalf("failed to seek to start: %v", err) + } + + layers, err := parseFromFile(context.Background(), file, "", func(api.ProgressResponse) {}) + if err != nil { + t.Fatalf("failed to parse from file: %v", err) + } + + if len(layers) != 1 { + t.Fatalf("got %d != want 1", len(layers)) + } + + if _, err := file.Seek(0, io.SeekStart); err != nil { + t.Fatalf("failed to seek to start: %v", err) + } + + layers2, err := parseFromFile(context.Background(), file, layers[0].Digest, func(api.ProgressResponse) {}) + if err != nil { + t.Fatalf("failed to parse from file: %v", err) + } + if len(layers2) != 1 { + t.Fatalf("got %d != want 1", len(layers2)) + } + + if layers[0].Digest != layers2[0].Digest { + t.Fatalf("got %s != want %s", layers[0].Digest, layers2[0].Digest) + } + + if layers[0].Size != layers2[0].Size { + t.Fatalf("got %d != want %d", layers[0].Size, layers2[0].Size) + } + + if layers[0].MediaType != layers2[0].MediaType { + t.Fatalf("got %v != want %v", layers[0].MediaType, layers2[0].MediaType) + } +} + +func TestParseLayerFromCopy(t *testing.T) { + tempModels := t.TempDir() + + file2, err := os.CreateTemp(tempModels, "") + if err != nil { + t.Fatalf("failed to open file: %v", err) + } + defer file2.Close() + + for range 5 { + if err := llm.WriteGGUF(file2, llm.KV{"general.architecture": "gemma"}, []llm.Tensor{}); err != nil { + t.Fatalf("failed to write gguf: %v", err) + } + } + + if _, err := file2.Seek(0, io.SeekStart); err != nil { + t.Fatalf("failed to seek to start: %v", err) + } + + layers, err := parseFromFile(context.Background(), file2, "", func(api.ProgressResponse) {}) + if err != nil { + t.Fatalf("failed to parse from file: %v", err) + } + + if len(layers) != 5 { + t.Fatalf("got %d != want 5", len(layers)) + } +} diff --git a/server/routes.go b/server/routes.go index e56272c7..eecb9965 100644 --- a/server/routes.go +++ b/server/routes.go @@ -23,6 +23,7 @@ import ( "github.com/gin-contrib/cors" "github.com/gin-gonic/gin" + "golang.org/x/sync/errgroup" "github.com/ollama/ollama/api" "github.com/ollama/ollama/envconfig" @@ -323,13 +324,10 @@ func (s *Server) EmbedHandler(c *gin.Context) { input = append(input, v.(string)) } default: - c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "invalid input type"}) - return - } - - if len(input) == 0 { - c.JSON(http.StatusOK, api.EmbedResponse{Model: req.Model, Embeddings: [][]float32{}}) - return + if req.Input != nil { + c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "invalid input type"}) + return + } } r, m, opts, err := s.scheduleRunner(c.Request.Context(), req.Model, []Capability{}, req.Options, req.KeepAlive) @@ -340,12 +338,18 @@ func (s *Server) EmbedHandler(c *gin.Context) { checkpointLoaded := time.Now() + if len(input) == 0 { + c.JSON(http.StatusOK, api.EmbedResponse{Model: req.Model, Embeddings: [][]float32{}}) + return + } + kvData, err := getKVData(m.ModelPath, false) if err != nil { c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) return } + var count int for i, s := range input { tokens, err := 
r.Tokenize(c.Request.Context(), s)
 		if err != nil {
@@ -368,25 +372,36 @@
 			}
 		}
 
+		count += len(tokens)
+
 		input[i] = s
 	}
-	embeddings, err := r.Embed(c.Request.Context(), input)
-	if err != nil {
-		slog.Error("embedding generation failed", "error", err)
-		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to generate embedding"})
-		return
+
+	var g errgroup.Group
+	embeddings := make([][]float32, len(input))
+	for i, text := range input {
+		g.Go(func() error {
+			embedding, err := r.Embedding(c.Request.Context(), text)
+			if err != nil {
+				return err
+			}
+			embeddings[i] = normalize(embedding)
+			return nil
+		})
 	}
 
-	for i, e := range embeddings.Embedding {
-		embeddings.Embedding[i] = normalize(e)
+	if err := g.Wait(); err != nil {
+		slog.Error("embedding generation failed", "error", err)
+		c.JSON(http.StatusInternalServerError, gin.H{"error": fmt.Sprintf("failed to generate embeddings: %v", err)})
+		return
 	}
 
 	resp := api.EmbedResponse{
 		Model:           req.Model,
-		Embeddings:      embeddings.Embedding,
+		Embeddings:      embeddings,
 		TotalDuration:   time.Since(checkpointStart),
 		LoadDuration:    checkpointLoaded.Sub(checkpointStart),
-		PromptEvalCount: embeddings.PromptEvalCount,
+		PromptEvalCount: count,
 	}
 	c.JSON(http.StatusOK, resp)
 }
@@ -430,21 +445,20 @@ func (s *Server) EmbeddingsHandler(c *gin.Context) {
 		return
 	}
 
-	embeddings, err := r.Embed(c.Request.Context(), []string{req.Prompt})
+	embedding, err := r.Embedding(c.Request.Context(), req.Prompt)
 	if err != nil {
 		slog.Info(fmt.Sprintf("embedding generation failed: %v", err))
 		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to generate embedding"})
 		return
 	}
 
-	embedding := make([]float64, len(embeddings.Embedding[0]))
-
-	for i, v := range embeddings.Embedding[0] {
-		embedding[i] = float64(v)
+	var e []float64
+	for _, v := range embedding {
+		e = append(e, float64(v))
 	}
 
 	resp := api.EmbeddingResponse{
-		Embedding: embedding,
+		Embedding: e,
 	}
 	c.JSON(http.StatusOK, resp)
 }
@@ -824,17 +838,20 @@ func (s *Server) ListModelsHandler(c *gin.Context) {
 	models := []api.ListModelResponse{}
 	for n, m := range ms {
-		f, err := m.Config.Open()
-		if err != nil {
-			slog.Warn("bad manifest filepath", "name", n, "error", err)
-			continue
-		}
-		defer f.Close()
-
 		var cf ConfigV2
-		if err := json.NewDecoder(f).Decode(&cf); err != nil {
-			slog.Warn("bad manifest config", "name", n, "error", err)
-			continue
+
+		if m.Config.Digest != "" {
+			f, err := m.Config.Open()
+			if err != nil {
+				slog.Warn("bad manifest filepath", "name", n, "error", err)
+				continue
+			}
+			defer f.Close()
+
+			if err := json.NewDecoder(f).Decode(&cf); err != nil {
+				slog.Warn("bad manifest config", "name", n, "error", err)
+				continue
+			}
 		}
 
 		// tag should never be masked
diff --git a/server/routes_create_test.go b/server/routes_create_test.go
index 9fd7f8cd..4de07b25 100644
--- a/server/routes_create_test.go
+++ b/server/routes_create_test.go
@@ -2,6 +2,7 @@ package server
 
 import (
 	"bytes"
+	"cmp"
 	"encoding/json"
 	"fmt"
 	"io"
@@ -53,6 +54,8 @@ func (t *responseRecorder) CloseNotify() <-chan bool {
 func createRequest(t *testing.T, fn func(*gin.Context), body any) *httptest.ResponseRecorder {
 	t.Helper()
 
+	// if OLLAMA_MODELS is not set, set it to the temp directory
+	t.Setenv("OLLAMA_MODELS", cmp.Or(os.Getenv("OLLAMA_MODELS"), t.TempDir()))
 	w := NewRecorder()
 	c, _ := gin.CreateTestContext(w)
diff --git a/server/routes_delete_test.go b/server/routes_delete_test.go
index 1c950d66..82fac9f5 100644
--- a/server/routes_delete_test.go
+++
b/server/routes_delete_test.go @@ -98,7 +98,7 @@ func TestDeleteDuplicateLayers(t *testing.T) { } // create a manifest with duplicate layers - if err := WriteManifest(n, config, []*Layer{config}); err != nil { + if err := WriteManifest(n, config, []Layer{config}); err != nil { t.Fatal(err) } diff --git a/server/routes_test.go b/server/routes_test.go index ef7248ef..242875d6 100644 --- a/server/routes_test.go +++ b/server/routes_test.go @@ -272,76 +272,6 @@ func Test_Routes(t *testing.T) { assert.Equal(t, "library", retrieveResp.OwnedBy) }, }, - { - Name: "Embed Handler Empty Input", - Method: http.MethodPost, - Path: "/api/embed", - Setup: func(t *testing.T, req *http.Request) { - embedReq := api.EmbedRequest{ - Model: "t-bone", - Input: "", - } - jsonData, err := json.Marshal(embedReq) - require.NoError(t, err) - req.Body = io.NopCloser(bytes.NewReader(jsonData)) - }, - Expected: func(t *testing.T, resp *http.Response) { - contentType := resp.Header.Get("Content-Type") - if contentType != "application/json; charset=utf-8" { - t.Fatalf("expected content type application/json; charset=utf-8, got %s", contentType) - } - body, err := io.ReadAll(resp.Body) - if err != nil { - t.Fatal(err) - } - - var embedResp api.EmbedResponse - err = json.Unmarshal(body, &embedResp) - if err != nil { - t.Fatal(err) - } - - if embedResp.Model != "t-bone" { - t.Fatalf("expected model t-bone, got %s", embedResp.Model) - } - - if embedResp.Embeddings == nil { - t.Fatalf("expected embeddings to not be nil, got %v", embedResp.Embeddings) - } - - if len(embedResp.Embeddings) != 0 { - t.Fatalf("expected embeddings to be empty, got %v", embedResp.Embeddings) - } - }, - }, - { - Name: "Embed Handler Invalid Input", - Method: http.MethodPost, - Path: "/api/embed", - Setup: func(t *testing.T, req *http.Request) { - embedReq := api.EmbedRequest{ - Model: "t-bone", - Input: 2, - } - jsonData, err := json.Marshal(embedReq) - require.NoError(t, err) - req.Body = io.NopCloser(bytes.NewReader(jsonData)) - }, - Expected: func(t *testing.T, resp *http.Response) { - contentType := resp.Header.Get("Content-Type") - if contentType != "application/json; charset=utf-8" { - t.Fatalf("expected content type application/json; charset=utf-8, got %s", contentType) - } - _, err := io.ReadAll(resp.Body) - if err != nil { - t.Fatal(err) - } - - if resp.StatusCode != http.StatusBadRequest { - t.Fatalf("expected status code 400, got %d", resp.StatusCode) - } - }, - }, } t.Setenv("OLLAMA_MODELS", t.TempDir()) diff --git a/server/sched.go b/server/sched.go index c378865b..58071bf0 100644 --- a/server/sched.go +++ b/server/sched.go @@ -193,6 +193,11 @@ func (s *Scheduler) processPending(ctx context.Context) { break } + // Embedding models should always be loaded with parallel=1 + if pending.model.CheckCapabilities(CapabilityCompletion) != nil { + numParallel = 1 + } + // Evaluate if the model will fit in the available system memory, or if we should unload a model first if len(gpus) == 1 && gpus[0].Library == "cpu" { // simplifying assumption of defaultParallel when in CPU mode @@ -418,7 +423,7 @@ func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, // some older models are not compatible with newer versions of llama.cpp // show a generalized compatibility error until there is a better way to // check for model compatibility - if errors.Is(llm.ErrUnsupportedFormat, err) || strings.Contains(err.Error(), "failed to load model") { + if errors.Is(err, llm.ErrUnsupportedFormat) || strings.Contains(err.Error(), "failed to load 
model") { err = fmt.Errorf("%v: this model may be incompatible with your version of Ollama. If you previously pulled this model, try updating it by running `ollama pull %s`", err, req.model.ShortName) } slog.Info("NewLlamaServer failed", "model", req.model.ModelPath, "error", err) @@ -734,7 +739,10 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoL // If multiple Libraries are detected, pick the Library which loads the most layers for the model func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList { - *numParallel = 1 + if *numParallel <= 0 { + *numParallel = 1 + req.opts.NumCtx = req.origNumCtx + } byLibrary := gpus.ByLibrary() if len(byLibrary) <= 1 { return gpus diff --git a/server/sched_test.go b/server/sched_test.go index c8717430..fb049574 100644 --- a/server/sched_test.go +++ b/server/sched_test.go @@ -117,7 +117,6 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est require.NoError(t, llm.WriteGGUF(f, llm.KV{ "general.architecture": "llama", - "general.name": "name", "llama.context_length": uint32(32), "llama.embedding_length": uint32(4096), "llama.block_count": uint32(1), @@ -708,8 +707,8 @@ type mockLlm struct { pingResp error waitResp error completionResp error - embedResp *llm.EmbedResponse - embedRespErr error + embeddingResp []float32 + embeddingRespErr error tokenizeResp []int tokenizeRespErr error detokenizeResp string @@ -727,8 +726,8 @@ func (s *mockLlm) Completion(ctx context.Context, req llm.CompletionRequest, fn return s.completionResp } -func (s *mockLlm) Embed(ctx context.Context, input []string) (*llm.EmbedResponse, error) { - return s.embedResp, s.embedRespErr +func (s *mockLlm) Embedding(ctx context.Context, input string) ([]float32, error) { + return s.embeddingResp, s.embeddingRespErr } func (s *mockLlm) Tokenize(ctx context.Context, content string) ([]int, error) { diff --git a/server/sparse_common.go b/server/sparse_common.go new file mode 100644 index 00000000..c88b2da0 --- /dev/null +++ b/server/sparse_common.go @@ -0,0 +1,8 @@ +//go:build !windows + +package server + +import "os" + +func setSparse(*os.File) { +} diff --git a/server/sparse_windows.go b/server/sparse_windows.go new file mode 100644 index 00000000..f21cbbda --- /dev/null +++ b/server/sparse_windows.go @@ -0,0 +1,17 @@ +package server + +import ( + "os" + + "golang.org/x/sys/windows" +) + +func setSparse(file *os.File) { + // exFat (and other FS types) don't support sparse files, so ignore errors + windows.DeviceIoControl( //nolint:errcheck + windows.Handle(file.Fd()), windows.FSCTL_SET_SPARSE, + nil, 0, + nil, 0, + nil, nil, + ) +} diff --git a/server/upload.go b/server/upload.go index b5a244ea..020e8955 100644 --- a/server/upload.go +++ b/server/upload.go @@ -26,7 +26,7 @@ import ( var blobUploadManager sync.Map type blobUpload struct { - *Layer + Layer Total int64 Completed atomic.Int64 @@ -45,7 +45,7 @@ type blobUpload struct { } const ( - numUploadParts = 64 + numUploadParts = 16 minUploadPartSize int64 = 100 * format.MegaByte maxUploadPartSize int64 = 1000 * format.MegaByte ) @@ -362,7 +362,7 @@ func (p *progressWriter) Rollback() { p.written = 0 } -func uploadBlob(ctx context.Context, mp ModelPath, layer *Layer, opts *registryOptions, fn func(api.ProgressResponse)) error { +func uploadBlob(ctx context.Context, mp ModelPath, layer Layer, opts *registryOptions, fn func(api.ProgressResponse)) error { requestURL := mp.BaseURL() requestURL = 
requestURL.JoinPath("v2", mp.GetNamespaceRepository(), "blobs", layer.Digest) diff --git a/types/model/name.go b/types/model/name.go index 018cb2f5..75b35ef7 100644 --- a/types/model/name.go +++ b/types/model/name.go @@ -219,7 +219,7 @@ func (n Name) String() string { return b.String() } -// DisplayShort returns a short string version of the name. +// DisplayShortest returns a short string version of the name. func (n Name) DisplayShortest() string { var sb strings.Builder
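For context on the server/routes.go EmbedHandler change above: the runner API moves from a single batch Embed call to one Embedding call per input, fanned out with errgroup, with each goroutine writing only its own slice slot so no lock is needed and the first failure surfaces from g.Wait. Below is a condensed, runnable sketch of that pattern; the stub runner and the embedAll name are assumptions for illustration, not the actual llm interface.

package main

import (
	"context"
	"fmt"

	"golang.org/x/sync/errgroup"
)

// stubRunner stands in for the scheduled llama runner; only the
// Embedding method signature mirrors the one used by EmbedHandler.
type stubRunner struct{}

func (stubRunner) Embedding(ctx context.Context, input string) ([]float32, error) {
	return []float32{float32(len(input))}, nil
}

// embedAll shows the fan-out: one goroutine per input, each writing
// only out[i], so the slice needs no mutex; g.Wait returns the first
// error from any goroutine.
func embedAll(ctx context.Context, r stubRunner, inputs []string) ([][]float32, error) {
	var g errgroup.Group
	out := make([][]float32, len(inputs))
	for i, text := range inputs {
		g.Go(func() error { // i and text are per-iteration under Go 1.22+ loop semantics
			e, err := r.Embedding(ctx, text)
			if err != nil {
				return err
			}
			out[i] = e
			return nil
		})
	}
	if err := g.Wait(); err != nil {
		return nil, err
	}
	return out, nil
}

func main() {
	out, err := embedAll(context.Background(), stubRunner{}, []string{"a", "bb", "ccc"})
	fmt.Println(out, err) // [[1] [2] [3]] <nil>
}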