Compare commits
jessegross ... main
29 commits

| SHA1 |
| --- |
| 67691e410d |
| d7eb05b936 |
| 636a743c2b |
| df011054fa |
| ac07160c8d |
| 6606e4243c |
| 65973ceb64 |
| bebef1e50d |
| d48c1c5a44 |
| 36a8372b28 |
| 4e94227b5d |
| 479d551766 |
| 76b2b723b2 |
| b8d77cdeab |
| c2e8cbaa14 |
| 771fab1dd8 |
| 3a5239e6bf |
| 3d25e7bf8c |
| 1618700c5a |
| b111aa5a91 |
| 9e83e550e1 |
| fc2a0715df |
| 3020d2dc58 |
| a909417602 |
| 6cd566872b |
| 9d71bcc3e2 |
| a4c70fe157 |
| 34a75102f7 |
| 4157d1f7b6 |
.github/workflows/test.yaml (2 changes)

@@ -281,7 +281,7 @@ jobs:
         shell: bash
       - uses: golangci/golangci-lint-action@v6
         with:
-          args: --timeout 8m0s -v
+          args: --timeout 10m0s -v
   test:
     strategy:
       matrix:
Dockerfile (72 changes)
@@ -5,6 +5,8 @@ ARG CUDA_V11_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86"
 ARG CUDA_VERSION_12=12.4.0
 ARG CUDA_V12_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a"
 ARG ROCM_VERSION=6.1.2
+ARG JETPACK_6=r36.2.0
+ARG JETPACK_5=r35.4.1

 ### To create a local image for building linux binaries on mac or windows with efficient incremental builds
 #
@@ -13,7 +15,7 @@ ARG ROCM_VERSION=6.1.2
 #
 ### Then incremental builds will be much faster in this container
 #
-# make -C llama -j 10 && go build -trimpath -o dist/linux-amd64/ollama .
+# make -j 10 && go build -trimpath -o dist/linux-amd64/ollama .
 #
 FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS unified-builder-amd64
 ARG CMAKE_VERSION
@@ -76,9 +78,9 @@ ARG CUDA_V12_ARCHITECTURES
 ARG OLLAMA_FAST_BUILD
 RUN --mount=type=cache,target=/root/.ccache \
     if grep "^flags" /proc/cpuinfo|grep avx>/dev/null; then \
-        make -C llama -j $(expr $(nproc) / 2 ) ; \
+        make -j $(expr $(nproc) / 2 ) ; \
     else \
-        make -C llama -j 5 ; \
+        make -j 5 ; \
     fi

 FROM --platform=linux/arm64 unified-builder-arm64 AS runners-arm64
@@ -90,7 +92,46 @@ ARG CUDA_V11_ARCHITECTURES
 ARG CUDA_V12_ARCHITECTURES
 ARG OLLAMA_FAST_BUILD
 RUN --mount=type=cache,target=/root/.ccache \
-    make -C llama -j 8
+    make -j 5

+# Jetsons need to be built in discrete stages
+FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_5} AS runners-jetpack5-arm64
+ARG GOLANG_VERSION
+RUN apt-get update && apt-get install -y git curl ccache && \
+    curl -s -L https://dl.google.com/go/go${GOLANG_VERSION}.linux-arm64.tar.gz | tar xz -C /usr/local && \
+    ln -s /usr/local/go/bin/go /usr/local/bin/go && \
+    ln -s /usr/local/go/bin/gofmt /usr/local/bin/gofmt && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+WORKDIR /go/src/github.com/ollama/ollama/
+COPY . .
+ARG CGO_CFLAGS
+ENV GOARCH arm64
+RUN --mount=type=cache,target=/root/.ccache \
+    make -j 5 cuda_v11 \
+        CUDA_ARCHITECTURES="72;87" \
+        GPU_RUNNER_VARIANT=_jetpack5 \
+        CGO_EXTRA_LDFLAGS_LINUX=-L/usr/local/cuda/lib64/stubs \
+        DIST_LIB_DIR=/go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack5/lib/ollama \
+        DIST_GPU_RUNNER_DEPS_DIR=/go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack5/lib/ollama/cuda_jetpack5
+
+FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_6} AS runners-jetpack6-arm64
+ARG GOLANG_VERSION
+RUN apt-get update && apt-get install -y git curl ccache && \
+    curl -s -L https://dl.google.com/go/go${GOLANG_VERSION}.linux-arm64.tar.gz | tar xz -C /usr/local && \
+    ln -s /usr/local/go/bin/go /usr/local/bin/go && \
+    ln -s /usr/local/go/bin/gofmt /usr/local/bin/gofmt && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+WORKDIR /go/src/github.com/ollama/ollama/
+COPY . .
+ARG CGO_CFLAGS
+ENV GOARCH arm64
+RUN --mount=type=cache,target=/root/.ccache \
+    make -j 5 cuda_v12 \
+        CUDA_ARCHITECTURES="87" \
+        GPU_RUNNER_VARIANT=_jetpack6 \
+        CGO_EXTRA_LDFLAGS_LINUX=-L/usr/local/cuda/lib64/stubs \
+        DIST_LIB_DIR=/go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack6/lib/ollama \
+        DIST_GPU_RUNNER_DEPS_DIR=/go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack6/lib/ollama/cuda_jetpack6
+

 # Intermediate stages used for ./scripts/build_linux.sh
@@ -134,12 +175,20 @@ FROM --platform=linux/arm64 builder-arm64 AS build-arm64
 COPY . .
 COPY --from=runners-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
 COPY --from=runners-arm64 /go/src/github.com/ollama/ollama/build/ build/
+COPY --from=runners-jetpack5-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
+COPY --from=runners-jetpack5-arm64 /go/src/github.com/ollama/ollama/build/ build/
+COPY --from=runners-jetpack6-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
+COPY --from=runners-jetpack6-arm64 /go/src/github.com/ollama/ollama/build/ build/
 ARG GOFLAGS
 ARG CGO_CFLAGS
 RUN --mount=type=cache,target=/root/.ccache \
     go build -trimpath -o dist/linux-arm64/bin/ollama .
 RUN cd dist/linux-$GOARCH && \
     tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz
+RUN cd dist/linux-$GOARCH-jetpack5 && \
+    tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH-jetpack5.tgz
+RUN cd dist/linux-$GOARCH-jetpack6 && \
+    tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH-jetpack6.tgz

 FROM --platform=linux/amd64 scratch AS dist-amd64
 COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz /
@@ -180,16 +229,23 @@ RUN rm -rf \
 FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-amd64
 RUN apt-get update && \
     apt-get install -y ca-certificates && \
-    rm -rf /var/lib/apt/lists/*
+    apt-get clean && rm -rf /var/lib/apt/lists/*
 COPY --from=container-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
 COPY --from=runners-cuda-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/

 FROM --platform=linux/arm64 ubuntu:22.04 AS runtime-arm64
+COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack5/lib/ /lib/
+COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack6/lib/ /lib/
 RUN apt-get update && \
     apt-get install -y ca-certificates && \
-    rm -rf /var/lib/apt/lists/*
+    apt-get clean && rm -rf /var/lib/apt/lists/*
 COPY --from=container-build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/bin/ /bin/
-COPY --from=runners-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
+COPY --from=cpu-build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
+COPY --from=cuda-11-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
+COPY --from=cuda-12-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
+COPY --from=cuda-build-jetpack5-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
+COPY --from=cuda-build-jetpack6-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/


 # ROCm libraries larger so we keep it distinct from the CPU/CUDA image
 FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-rocm
@@ -198,7 +254,7 @@ FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-rocm
 COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64-rocm/lib/ /lib/
 RUN apt-get update && \
     apt-get install -y ca-certificates && \
-    rm -rf /var/lib/apt/lists/*
+    apt-get clean && rm -rf /var/lib/apt/lists/*
 COPY --from=container-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
 COPY --from=runners-rocm-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/

README.md

@@ -48,9 +48,11 @@ Ollama supports a list of models available on [ollama.com/library](https://ollama.com/library)
 Here are some example models that can be downloaded:

 | Model              | Parameters | Size  | Download                         |
-| ------------------ | ---------- | ----- | ------------------------------ |
+| ------------------ | ---------- | ----- | -------------------------------- |
 | Llama 3.2          | 3B         | 2.0GB | `ollama run llama3.2`            |
 | Llama 3.2          | 1B         | 1.3GB | `ollama run llama3.2:1b`         |
+| Llama 3.2 Vision   | 11B        | 7.9GB | `ollama run llama3.2-vision`     |
+| Llama 3.2 Vision   | 90B        | 55GB  | `ollama run llama3.2-vision:90b` |
 | Llama 3.1          | 8B         | 4.7GB | `ollama run llama3.1`            |
 | Llama 3.1          | 70B        | 40GB  | `ollama run llama3.1:70b`        |
 | Llama 3.1          | 405B       | 231GB | `ollama run llama3.1:405b`       |
@@ -331,6 +333,8 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [ARGO](https://github.com/xark-argo/argo) (Locally download and run Ollama and Huggingface models with RAG on Mac/Windows/Linux)
 - [G1](https://github.com/bklieger-groq/g1) (Prototype of using prompting strategies to improve the LLM's reasoning through o1-like reasoning chains.)
 - [Ollama App](https://github.com/JHubi1/ollama-app) (Modern and easy-to-use multi-platform client for Ollama)
+- [Hexabot](https://github.com/hexastack/hexabot) (A conversational AI builder)
+- [Reddit Rate](https://github.com/rapidarchitect/reddit_analyzer) (Search and Rate Reddit topics with a weighted summation)

 ### Terminal

@@ -357,6 +361,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Ollama eBook Summary](https://github.com/cognitivetech/ollama-ebook-summary/)
 - [Ollama Mixture of Experts (MOE) in 50 lines of code](https://github.com/rapidarchitect/ollama_moe)
 - [vim-intelligence-bridge](https://github.com/pepo-ec/vim-intelligence-bridge) Simple interaction of "Ollama" with the Vim editor
+- [aichat](https://github.com/sigoden/aichat) All-in-one LLM CLI tool featuring Shell Assistant, Chat-REPL, RAG, AI tools & agents, with access to OpenAI, Claude, Gemini, Ollama, Groq, and more.

 ### Apple Vision Pro
 - [Enchanted](https://github.com/AugustDev/enchanted)
@@ -413,6 +418,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Ollama PHP](https://github.com/ArdaGnsrn/ollama-php)
 - [Agents-Flex for Java](https://github.com/agents-flex/agents-flex) with [example](https://github.com/agents-flex/agents-flex/tree/main/agents-flex-llm/agents-flex-llm-ollama/src/test/java/com/agentsflex/llm/ollama)
 - [Ollama for Swift](https://github.com/mattt/ollama-swift)
+- [GoLamify](https://github.com/prasad89/golamify)

 ### Mobile

@@ -450,6 +456,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Discord-Ollama Chat Bot](https://github.com/kevinthedang/discord-ollama) (Generalized TypeScript Discord Bot w/ Tuning Documentation)
 - [Discord AI chat/moderation bot](https://github.com/rapmd73/Companion) Chat/moderation bot written in python. Uses Ollama to create personalities.
 - [Headless Ollama](https://github.com/nischalj10/headless-ollama) (Scripts to automatically install ollama client & models on any OS for apps that depends on ollama server)
+- [Local AI Helper](https://github.com/ivostoykov/localAI) (Chrome and Firefox extensions that enable interactions with the active tab and customisable API endpoints. Includes secure storage for user prompts.)
 - [vnc-lm](https://github.com/jk011ru/vnc-lm) (A containerized Discord bot with support for attachments and web links)
 - [LSP-AI](https://github.com/SilasMarvin/lsp-ai) (Open-source language server for AI-powered functionality)
 - [QodeAssist](https://github.com/Palm1r/QodeAssist) (AI-powered coding assistant plugin for Qt Creator)
api/client.go

@@ -55,7 +55,7 @@ func checkError(resp *http.Response, body []byte) error {

 // ClientFromEnvironment creates a new [Client] using configuration from the
 // environment variable OLLAMA_HOST, which points to the network host and
-// port on which the ollama service is listenting. The format of this variable
+// port on which the ollama service is listening. The format of this variable
 // is:
 //
 //	<scheme>://<host>:<port>
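For reference, a minimal Go sketch (not part of this diff) of how the `OLLAMA_HOST` convention documented above is typically consumed via `api.ClientFromEnvironment`; the host value and the `Heartbeat` reachability check are illustrative assumptions.

```go
package main

import (
	"context"
	"log"
	"os"

	"github.com/ollama/ollama/api"
)

func main() {
	// OLLAMA_HOST follows <scheme>://<host>:<port>; when unset, the client
	// falls back to the default local server address.
	os.Setenv("OLLAMA_HOST", "http://127.0.0.1:11434")

	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	// Simple reachability check against the host resolved from OLLAMA_HOST.
	if err := client.Heartbeat(context.Background()); err != nil {
		log.Fatal(err)
	}
	log.Println("connected")
}
```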
api/types.go (15 changes)
@@ -12,7 +12,7 @@ import (
 	"time"
 )

-// StatusError is an error with and HTTP status code.
+// StatusError is an error with an HTTP status code and message.
 type StatusError struct {
 	StatusCode int
 	Status     string
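An illustrative sketch of how a caller might surface the `StatusError` fields shown above; exactly how the client wraps this error, and the `List` response fields used here, are assumptions rather than part of the diff.

```go
package example

import (
	"context"
	"errors"
	"fmt"

	"github.com/ollama/ollama/api"
)

// listModels prints locally available models, distinguishing an HTTP-level
// failure (surfaced as api.StatusError) from transport errors such as a
// refused connection. The unwrapping behavior is an assumption here.
func listModels(ctx context.Context, client *api.Client) error {
	resp, err := client.List(ctx)
	if err != nil {
		var serr api.StatusError
		if errors.As(err, &serr) {
			return fmt.Errorf("ollama returned %d (%s)", serr.StatusCode, serr.Status)
		}
		return err
	}
	for _, m := range resp.Models {
		fmt.Println(m.Name)
	}
	return nil
}
```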
@@ -57,7 +57,7 @@ type GenerateRequest struct {
 	Template string `json:"template"`

 	// Context is the context parameter returned from a previous call to
-	// Generate call. It can be used to keep a short conversational memory.
+	// [Client.Generate]. It can be used to keep a short conversational memory.
 	Context []int `json:"context,omitempty"`

 	// Stream specifies whether the response is streaming; it is true by default.
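To make the reworded comment concrete, here is an illustrative sketch (not part of the diff) of the round trip it describes: the `Context` slice from one `Generate` response is passed back in the next request to keep a short conversational memory. Model name and prompts are placeholders.

```go
package example

import (
	"context"
	"fmt"

	"github.com/ollama/ollama/api"
)

// generateWithMemory carries the Context slice returned by one Generate call
// forward into the next request.
func generateWithMemory(ctx context.Context, client *api.Client) error {
	var memory []int
	stream := false

	ask := func(prompt string) error {
		req := &api.GenerateRequest{
			Model:   "llama3.2", // placeholder model name
			Prompt:  prompt,
			Context: memory, // nil on the first turn
			Stream:  &stream,
		}
		return client.Generate(ctx, req, func(resp api.GenerateResponse) error {
			fmt.Println(resp.Response)
			memory = resp.Context // carry forward for the next turn
			return nil
		})
	}

	if err := ask("My name is Ada. Please remember that."); err != nil {
		return err
	}
	return ask("What is my name?")
}
```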
@@ -90,14 +90,14 @@ type ChatRequest struct {
 	// Messages is the messages of the chat - can be used to keep a chat memory.
 	Messages []Message `json:"messages"`

-	// Stream enable streaming of returned response; true by default.
+	// Stream enables streaming of returned responses; true by default.
 	Stream *bool `json:"stream,omitempty"`

 	// Format is the format to return the response in (e.g. "json").
 	Format string `json:"format"`

 	// KeepAlive controls how long the model will stay loaded into memory
-	// followin the request.
+	// following the request.
 	KeepAlive *Duration `json:"keep_alive,omitempty"`

 	// Tools is an optional list of tools the model has access to.
@@ -203,8 +203,8 @@ type Metrics struct {
 	EvalDuration time.Duration `json:"eval_duration,omitempty"`
 }

-// Options specified in [GenerateRequest], if you add a new option here add it
-// to the API docs also.
+// Options specified in [GenerateRequest]. If you add a new option here, also
+// add it to the API docs.
 type Options struct {
 	Runner

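As a hedged illustration of how these `Options` reach the server in practice, the sketch below builds a request whose `Options` map keys mirror the struct's JSON tags; the specific keys and values are examples, not part of the diff.

```go
package example

import "github.com/ollama/ollama/api"

// buildRequest sketches per-request option overrides: the Options map keys
// mirror the JSON tags of the Options/Runner fields (e.g. "temperature",
// "num_ctx"). Values here are illustrative.
func buildRequest() *api.GenerateRequest {
	stream := false
	return &api.GenerateRequest{
		Model:  "llama3.2",
		Prompt: "Write a haiku about GPUs.",
		Stream: &stream,
		Options: map[string]interface{}{
			"temperature": 0.2,
			"num_ctx":     4096,
			// "f16_kv" is deprecated (see the Runner hunk below) and is ignored.
		},
	}
}
```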
@@ -236,7 +236,7 @@ type Runner struct {
 	NumGPU    int   `json:"num_gpu,omitempty"`
 	MainGPU   int   `json:"main_gpu,omitempty"`
 	LowVRAM   bool  `json:"low_vram,omitempty"`
-	F16KV     bool  `json:"f16_kv,omitempty"`
+	F16KV     bool  `json:"f16_kv,omitempty"` // Deprecated: This option is ignored
 	LogitsAll bool  `json:"logits_all,omitempty"`
 	VocabOnly bool  `json:"vocab_only,omitempty"`
 	UseMMap   *bool `json:"use_mmap,omitempty"`
@@ -613,7 +613,6 @@ func DefaultOptions() Options {
 			NumGPU:    -1, // -1 here indicates that NumGPU should be set dynamically
 			NumThread: 0,  // let the runtime decide
 			LowVRAM:   false,
-			F16KV:     true,
 			UseMLock:  false,
 			UseMMap:   nil,
 		},
@@ -136,7 +136,7 @@ Type: filesandordirs; Name: "{%TEMP}\ollama*"
 Type: filesandordirs; Name: "{%LOCALAPPDATA}\Programs\Ollama"

 [Messages]
-WizardReady=Ollama Windows Preview
+WizardReady=Ollama
 ReadyLabel1=%nLet's get you up and running with your own large language models.
 SetupAppRunningError=Another Ollama installer is running.%n%nPlease cancel or finish the other installer, then click OK to continue with this install, or Cancel to exit.

@@ -800,9 +800,9 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
 		case "parameters":
 			fmt.Println(resp.Parameters)
 		case "system":
-			fmt.Println(resp.System)
+			fmt.Print(resp.System)
 		case "template":
-			fmt.Println(resp.Template)
+			fmt.Print(resp.Template)
 		}

 		return nil
@@ -350,7 +350,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 				return nil, err
 			}
 		}
-		gpuInfo.DependencyPath = libDir
+		gpuInfo.DependencyPath = []string{libDir}

 		if gfxOverride == "" {
 			// Only load supported list once
@@ -111,7 +111,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 				UnreliableFreeMemory: true,

 				ID:             strconv.Itoa(i), // TODO this is probably wrong if we specify visible devices
-				DependencyPath: libDir,
+				DependencyPath: []string{libDir},
 				MinimumMemory:  rocmMinimumMemory,
 				Name:           name,
 				Compute:        gfx,
@@ -240,7 +240,7 @@ func GetGPUInfo() GpuInfoList {
 				Library:        "cpu",
 				Variant:        cpuCapability.String(),
 				ID:             "0",
-				DependencyPath: depPath,
+				DependencyPath: []string{depPath},
 			},
 			CPUs: details,
 		},
@@ -293,11 +293,11 @@ func GetGPUInfo() GpuInfoList {
 			gpuInfo.DriverMinor = driverMinor
 			variant := cudaVariant(gpuInfo)
 			if depPath != "" {
-				gpuInfo.DependencyPath = depPath
+				gpuInfo.DependencyPath = []string{depPath}
 				// Check for variant specific directory
 				if variant != "" {
 					if _, err := os.Stat(filepath.Join(depPath, "cuda_"+variant)); err == nil {
-						gpuInfo.DependencyPath = filepath.Join(depPath, "cuda_"+variant)
+						gpuInfo.DependencyPath = []string{filepath.Join(depPath, "cuda_"+variant), depPath}
 					}
 				}
 			}
@@ -370,7 +370,7 @@ func GetGPUInfo() GpuInfoList {
 			gpuInfo.FreeMemory = uint64(memInfo.free)
 			gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
 			gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
-			gpuInfo.DependencyPath = depPath
+			gpuInfo.DependencyPath = []string{depPath}
 			oneapiGPUs = append(oneapiGPUs, gpuInfo)
 		}
 	}
@@ -4,6 +4,7 @@
 #include "gpu_info_nvcuda.h"

 void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
+  LOG(resp->ch.verbose, "initializing %s\n", nvcuda_lib_path);
   CUresult ret;
   resp->err = NULL;
   resp->num_devices = 0;
@@ -57,8 +58,10 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
       resp->cudaErr = -1;
       return;
     }
+    LOG(resp->ch.verbose, "dlsym: %s - %p\n", l[i].s, *l[i].p);
   }

+  LOG(resp->ch.verbose, "calling cuInit\n");
   ret = (*resp->ch.cuInit)(0);
   if (ret != CUDA_SUCCESS) {
     LOG(resp->ch.verbose, "cuInit err: %d\n", ret);
|
|||||||
resp->ch.driver_minor = 0;
|
resp->ch.driver_minor = 0;
|
||||||
|
|
||||||
// Report driver version if we're in verbose mode, ignore errors
|
// Report driver version if we're in verbose mode, ignore errors
|
||||||
|
LOG(resp->ch.verbose, "calling cuDriverGetVersion\n");
|
||||||
ret = (*resp->ch.cuDriverGetVersion)(&version);
|
ret = (*resp->ch.cuDriverGetVersion)(&version);
|
||||||
if (ret != CUDA_SUCCESS) {
|
if (ret != CUDA_SUCCESS) {
|
||||||
LOG(resp->ch.verbose, "cuDriverGetVersion failed: %d\n", ret);
|
LOG(resp->ch.verbose, "cuDriverGetVersion failed: %d\n", ret);
|
||||||
} else {
|
} else {
|
||||||
|
LOG(resp->ch.verbose, "raw version 0x%x\n", version);
|
||||||
resp->ch.driver_major = version / 1000;
|
resp->ch.driver_major = version / 1000;
|
||||||
resp->ch.driver_minor = (version - (resp->ch.driver_major * 1000)) / 10;
|
resp->ch.driver_minor = (version - (resp->ch.driver_major * 1000)) / 10;
|
||||||
LOG(resp->ch.verbose, "CUDA driver version: %d.%d\n", resp->ch.driver_major, resp->ch.driver_minor);
|
LOG(resp->ch.verbose, "CUDA driver version: %d.%d\n", resp->ch.driver_major, resp->ch.driver_minor);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
LOG(resp->ch.verbose, "calling cuDeviceGetCount\n");
|
||||||
ret = (*resp->ch.cuDeviceGetCount)(&resp->num_devices);
|
ret = (*resp->ch.cuDeviceGetCount)(&resp->num_devices);
|
||||||
if (ret != CUDA_SUCCESS) {
|
if (ret != CUDA_SUCCESS) {
|
||||||
LOG(resp->ch.verbose, "cuDeviceGetCount err: %d\n", ret);
|
LOG(resp->ch.verbose, "cuDeviceGetCount err: %d\n", ret);
|
||||||
@@ -94,6 +100,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
     resp->cudaErr = ret;
     return;
   }
+  LOG(resp->ch.verbose, "device count %d\n", resp->num_devices);
 }

 const int buflen = 256;
@@ -25,7 +25,7 @@ type GpuInfo struct { // TODO better name maybe "InferenceProcessor"?
 	MinimumMemory uint64 `json:"-"`

 	// Any extra PATH/LD_LIBRARY_PATH dependencies required for the Library to operate properly
-	DependencyPath string `json:"lib_path,omitempty"`
+	DependencyPath []string `json:"lib_path,omitempty"`

 	// Extra environment variables specific to the GPU as list of [key,value]
 	EnvWorkarounds [][2]string `json:"envs,omitempty"`
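A hypothetical sketch (not from this change set) of why `DependencyPath` becomes a slice: a variant-specific library directory can be listed ahead of the generic one and the whole list joined into an `LD_LIBRARY_PATH`-style value. The helper name and package placement are assumptions for illustration only.

```go
package gpu // illustrative package clause; the helper would live next to GpuInfo

import (
	"os"
	"strings"
)

// libraryPath joins the (now multi-valued) DependencyPath entries with an
// existing search path into a single LD_LIBRARY_PATH/PATH-style value, keeping
// the variant-specific directory ahead of the generic one.
func libraryPath(info GpuInfo, existing string) string {
	parts := append([]string{}, info.DependencyPath...)
	if existing != "" {
		parts = append(parts, existing)
	}
	return strings.Join(parts, string(os.PathListSeparator))
}
```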
@@ -355,7 +355,6 @@ curl http://localhost:11434/api/generate -d '{
     "num_gpu": 1,
     "main_gpu": 0,
     "low_vram": false,
-    "f16_kv": true,
     "vocab_only": false,
     "use_mmap": true,
     "use_mlock": false,
@@ -108,7 +108,7 @@ Custom CPU settings are not currently supported in the new Go server build but w

 #### Containerized Linux Build

-If you have Docker available, you can build linux binaries with `OLLAMA_NEW_RUNNERS=1 ./scripts/build_linux.sh` which has the CUDA and ROCm dependencies included. The resulting binary is placed in `./dist`
+If you have Docker available, you can build linux binaries with `./scripts/build_linux.sh` which has the CUDA and ROCm dependencies included. The resulting binary is placed in `./dist`

 ### Windows

@@ -32,7 +32,7 @@ ollama run my-model

 Ollama supports importing adapters based on several different model architectures including:

-  * Llama (including Llama 2, Llama 3, and Llama 3.1);
+  * Llama (including Llama 2, Llama 3, Llama 3.1, and Llama 3.2);
   * Mistral (including Mistral 1, Mistral 2, and Mixtral); and
   * Gemma (including Gemma 1 and Gemma 2)

@@ -67,14 +67,12 @@ ollama run my-model

 Ollama supports importing models for several different architectures including:

-  * Llama (including Llama 2, Llama 3, and Llama 3.1);
+  * Llama (including Llama 2, Llama 3, Llama 3.1, and Llama 3.2);
   * Mistral (including Mistral 1, Mistral 2, and Mixtral);
   * Gemma (including Gemma 1 and Gemma 2); and
   * Phi3

-This includes importing foundation models as well as any fine tuned models which which have been _fused_ with a foundation model.
+This includes importing foundation models as well as any fine tuned models which have been _fused_ with a foundation model.

 ## Importing a GGUF based model or adapter

 If you have a GGUF based model or adapter it is possible to import it into Ollama. You can obtain a GGUF model or adapter by:
@@ -120,7 +120,7 @@ FROM <model directory>
 The model directory should contain the Safetensors weights for a supported architecture.

 Currently supported model architectures:
-  * Llama (including Llama 2, Llama 3, and Llama 3.1)
+  * Llama (including Llama 2, Llama 3, Llama 3.1, and Llama 3.2)
   * Mistral (including Mistral 1, Mistral 2, and Mixtral)
   * Gemma (including Gemma 1 and Gemma 2)
   * Phi3
@@ -95,7 +95,9 @@ If none of those resolve the problem, gather additional information and file an

 On linux, AMD GPU access typically requires `video` and/or `render` group membership to access the `/dev/kfd` device. If permissions are not set up correctly, Ollama will detect this and report an error in the server log.

-When running in a container, in some Linux distributions and container runtimes, the ollama process may be unable to access the GPU. Use `ls -ld /dev/kfd /dev/dri /dev/dri/*` on the host system to determine the group assignments on your system, and pass additional `--group-add ...` arguments to the container so it can access the required devices.
+When running in a container, in some Linux distributions and container runtimes, the ollama process may be unable to access the GPU. Use `ls -lnd /dev/kfd /dev/dri /dev/dri/*` on the host system to determine the **numeric** group IDs on your system, and pass additional `--group-add ...` arguments to the container so it can access the required devices. For example, in the following output `crw-rw---- 1 0 44 226, 0 Sep 16 16:55 /dev/dri/card0` the group ID column is `44`
+
+If Ollama initially works on the GPU in a docker container, but then switches to running on CPU after some period of time with errors in the server log reporting GPU discovery failures, this can be resolved by disabling systemd cgroup management in Docker. Edit `/etc/docker/daemon.json` on the host and add `"exec-opts": ["native.cgroupdriver=cgroupfs"]` to the docker configuration.

 If you are experiencing problems getting Ollama to correctly discover or use your GPU for inference, the following may help isolate the failure.
 - `AMD_LOG_LEVEL=3` Enable info log levels in the AMD HIP/ROCm libraries. This can help show more detailed error codes that can help troubleshoot problems
@@ -10,7 +10,7 @@ This sounds like a typical censored response, but even llama2-uncensored gives a

 So let's figure out how we can use **LangChain** with Ollama to ask our question to the actual document, the Odyssey by Homer, using Python.

-Let's start by asking a simple question that we can get an answer to from the **Llama2** model using **Ollama**. First, we need to install the **LangChain** package:
+Let's start by asking a simple question that we can get an answer to from the **Llama3** model using **Ollama**. First, we need to install the **LangChain** package:

 `pip install langchain_community`

llama/base64.hpp (new file, 392 lines, vendored)

@@ -0,0 +1,392 @@
/*
This is free and unencumbered software released into the public domain.

Anyone is free to copy, modify, publish, use, compile, sell, or
distribute this software, either in source code form or as a compiled
binary, for any purpose, commercial or non-commercial, and by any
means.

In jurisdictions that recognize copyright laws, the author or authors
of this software dedicate any and all copyright interest in the
software to the public domain. We make this dedication for the benefit
of the public at large and to the detriment of our heirs and
successors. We intend this dedication to be an overt act of
relinquishment in perpetuity of all present and future rights to this
software under copyright law.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.

For more information, please refer to <http://unlicense.org>
*/

#ifndef PUBLIC_DOMAIN_BASE64_HPP_
#define PUBLIC_DOMAIN_BASE64_HPP_

#include <cstdint>
#include <iterator>
#include <stdexcept>
#include <string>

class base64_error : public std::runtime_error
{
public:
    using std::runtime_error::runtime_error;
};

class base64
{
public:
    enum class alphabet
    {
        /** the alphabet is detected automatically */
        auto_,
        /** the standard base64 alphabet is used */
        standard,
        /** like `standard` except that the characters `+` and `/` are replaced by `-` and `_` respectively*/
        url_filename_safe
    };

    enum class decoding_behavior
    {
        /** if the input is not padded, the remaining bits are ignored */
        moderate,
        /** if a padding character is encounter decoding is finished */
        loose
    };

    /**
     Encodes all the elements from `in_begin` to `in_end` to `out`.

     @warning The source and destination cannot overlap. The destination must be able to hold at least
     `required_encode_size(std::distance(in_begin, in_end))`, otherwise the behavior depends on the output iterator.

     @tparam Input_iterator the source; the returned elements are cast to `std::uint8_t` and should not be greater than
     8 bits
     @tparam Output_iterator the destination; the elements written to it are from the type `char`
     @param in_begin the beginning of the source
     @param in_end the ending of the source
     @param out the destination iterator
     @param alphabet which alphabet should be used
     @returns the iterator to the next element past the last element copied
     @throws see `Input_iterator` and `Output_iterator`
    */
    template<typename Input_iterator, typename Output_iterator>
    static Output_iterator encode(Input_iterator in_begin, Input_iterator in_end, Output_iterator out,
                                  alphabet alphabet = alphabet::standard)
    {
        constexpr auto pad = '=';
        const char* alpha  = alphabet == alphabet::url_filename_safe
                                ? "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"
                                : "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

        while (in_begin != in_end) {
            std::uint8_t i0 = 0, i1 = 0, i2 = 0;

            // first character
            i0 = static_cast<std::uint8_t>(*in_begin);
            ++in_begin;

            *out = alpha[i0 >> 2 & 0x3f];
            ++out;

            // part of first character and second
            if (in_begin != in_end) {
                i1 = static_cast<std::uint8_t>(*in_begin);
                ++in_begin;

                *out = alpha[((i0 & 0x3) << 4) | (i1 >> 4 & 0x0f)];
                ++out;
            } else {
                *out = alpha[(i0 & 0x3) << 4];
                ++out;

                // last padding
                *out = pad;
                ++out;

                // last padding
                *out = pad;
                ++out;

                break;
            }

            // part of second character and third
            if (in_begin != in_end) {
                i2 = static_cast<std::uint8_t>(*in_begin);
                ++in_begin;

                *out = alpha[((i1 & 0xf) << 2) | (i2 >> 6 & 0x03)];
                ++out;
            } else {
                *out = alpha[(i1 & 0xf) << 2];
                ++out;

                // last padding
                *out = pad;
                ++out;

                break;
            }

            // rest of third
            *out = alpha[i2 & 0x3f];
            ++out;
        }

        return out;
    }
    /**
     Encodes a string.

     @param str the string that should be encoded
     @param alphabet which alphabet should be used
     @returns the encoded base64 string
     @throws see base64::encode()
    */
    static std::string encode(const std::string& str, alphabet alphabet = alphabet::standard)
    {
        std::string result;

        result.reserve(required_encode_size(str.length()) + 1);

        encode(str.begin(), str.end(), std::back_inserter(result), alphabet);

        return result;
    }
    /**
     Encodes a char array.

     @param buffer the char array
     @param size the size of the array
     @param alphabet which alphabet should be used
     @returns the encoded string
    */
    static std::string encode(const char* buffer, std::size_t size, alphabet alphabet = alphabet::standard)
    {
        std::string result;

        result.reserve(required_encode_size(size) + 1);

        encode(buffer, buffer + size, std::back_inserter(result), alphabet);

        return result;
    }
    /**
     Decodes all the elements from `in_begin` to `in_end` to `out`. `in_begin` may point to the same location as `out`,
     in other words: inplace decoding is possible.

     @warning The destination must be able to hold at least `required_decode_size(std::distance(in_begin, in_end))`,
     otherwise the behavior depends on the output iterator.

     @tparam Input_iterator the source; the returned elements are cast to `char`
     @tparam Output_iterator the destination; the elements written to it are from the type `std::uint8_t`
     @param in_begin the beginning of the source
     @param in_end the ending of the source
     @param out the destination iterator
     @param alphabet which alphabet should be used
     @param behavior the behavior when an error was detected
     @returns the iterator to the next element past the last element copied
     @throws base64_error depending on the set behavior
     @throws see `Input_iterator` and `Output_iterator`
    */
    template<typename Input_iterator, typename Output_iterator>
    static Output_iterator decode(Input_iterator in_begin, Input_iterator in_end, Output_iterator out,
                                  alphabet alphabet = alphabet::auto_,
                                  decoding_behavior behavior = decoding_behavior::moderate)
    {
        //constexpr auto pad = '=';
        std::uint8_t last = 0;
        auto bits         = 0;

        while (in_begin != in_end) {
            auto c = *in_begin;
            ++in_begin;

            if (c == '=') {
                break;
            }

            auto part = _base64_value(alphabet, c);

            // enough bits for one byte
            if (bits + 6 >= 8) {
                *out = (last << (8 - bits)) | (part >> (bits - 2));
                ++out;

                bits -= 2;
            } else {
                bits += 6;
            }

            last = part;
        }

        // check padding
        if (behavior != decoding_behavior::loose) {
            while (in_begin != in_end) {
                auto c = *in_begin;
                ++in_begin;

                if (c != '=') {
                    throw base64_error("invalid base64 character.");
                }
            }
        }

        return out;
    }
    /**
     Decodes a string.

     @param str the base64 encoded string
     @param alphabet which alphabet should be used
     @param behavior the behavior when an error was detected
     @returns the decoded string
     @throws see base64::decode()
    */
    static std::string decode(const std::string& str, alphabet alphabet = alphabet::auto_,
                              decoding_behavior behavior = decoding_behavior::moderate)
    {
        std::string result;

        result.reserve(max_decode_size(str.length()));

        decode(str.begin(), str.end(), std::back_inserter(result), alphabet, behavior);

        return result;
    }
    /**
     Decodes a string.

     @param buffer the base64 encoded buffer
     @param size the size of the buffer
     @param alphabet which alphabet should be used
     @param behavior the behavior when an error was detected
     @returns the decoded string
     @throws see base64::decode()
    */
    static std::string decode(const char* buffer, std::size_t size, alphabet alphabet = alphabet::auto_,
                              decoding_behavior behavior = decoding_behavior::moderate)
    {
        std::string result;

        result.reserve(max_decode_size(size));

        decode(buffer, buffer + size, std::back_inserter(result), alphabet, behavior);

        return result;
    }
    /**
     Decodes a string inplace.

     @param[in,out] str the base64 encoded string
     @param alphabet which alphabet should be used
     @param behavior the behavior when an error was detected
     @throws base64::decode_inplace()
    */
    static void decode_inplace(std::string& str, alphabet alphabet = alphabet::auto_,
                               decoding_behavior behavior = decoding_behavior::moderate)
    {
        str.resize(decode(str.begin(), str.end(), str.begin(), alphabet, behavior) - str.begin());
    }
    /**
     Decodes a char array inplace.

     @param[in,out] str the string array
     @param size the length of the array
     @param alphabet which alphabet should be used
     @param behavior the behavior when an error was detected
     @returns the pointer to the next element past the last element decoded
     @throws base64::decode_inplace()
    */
    static char* decode_inplace(char* str, std::size_t size, alphabet alphabet = alphabet::auto_,
                                decoding_behavior behavior = decoding_behavior::moderate)
    {
        return decode(str, str + size, str, alphabet, behavior);
    }
    /**
     Returns the required decoding size for a given size. The value is calculated with the following formula:

     $$
     \lceil \frac{size}{4} \rceil \cdot 3
     $$

     @param size the size of the encoded input
     @returns the size of the resulting decoded buffer; this the absolute maximum
    */
    static std::size_t max_decode_size(std::size_t size) noexcept
    {
        return (size / 4 + (size % 4 ? 1 : 0)) * 3;
    }
    /**
     Returns the required encoding size for a given size. The value is calculated with the following formula:

     $$
     \lceil \frac{size}{3} \rceil \cdot 4
     $$

     @param size the size of the decoded input
     @returns the size of the resulting encoded buffer
    */
    static std::size_t required_encode_size(std::size_t size) noexcept
    {
        return (size / 3 + (size % 3 ? 1 : 0)) * 4;
    }

private:
    static std::uint8_t _base64_value(alphabet& alphabet, char c)
    {
        if (c >= 'A' && c <= 'Z') {
            return c - 'A';
        } else if (c >= 'a' && c <= 'z') {
            return c - 'a' + 26;
        } else if (c >= '0' && c <= '9') {
            return c - '0' + 52;
        }

        // comes down to alphabet
        if (alphabet == alphabet::standard) {
            if (c == '+') {
                return 62;
            } else if (c == '/') {
                return 63;
            }
        } else if (alphabet == alphabet::url_filename_safe) {
            if (c == '-') {
                return 62;
            } else if (c == '_') {
                return 63;
            }
        } // auto detect
        else {
            if (c == '+') {
                alphabet = alphabet::standard;

                return 62;
            } else if (c == '/') {
                alphabet = alphabet::standard;

                return 63;
            } else if (c == '-') {
                alphabet = alphabet::url_filename_safe;

                return 62;
            } else if (c == '_') {
                alphabet = alphabet::url_filename_safe;

                return 63;
            }
        }

        throw base64_error("invalid base64 character.");
    }
};

#endif // !PUBLIC_DOMAIN_BASE64_HPP_
llama/common.cpp (new file, 2092 lines, vendored)

File diff suppressed because it is too large.
llama/common.h (new file, 581 lines, vendored)

@@ -0,0 +1,581 @@
/**
 * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
 *
 * MIT License
 *
 * Copyright (c) 2023-2024 The ggml authors
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

// Various helper functions and utilities

#pragma once

#include "llama.h"

#include <string>
#include <vector>
#include <sstream>

#ifdef _WIN32
#define DIRECTORY_SEPARATOR '\\'
#else
#define DIRECTORY_SEPARATOR '/'
#endif // _WIN32

#define die(msg)          do { fputs("error: " msg "\n", stderr);                exit(1); } while (0)
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)

#define print_build_info() do {                                                                     \
    fprintf(stderr, "%s: build = %d (%s)\n",      __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);      \
    fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);    \
} while(0)

#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"

struct llama_lora_adapter_info {
    std::string path;
    float scale;
};

struct llama_lora_adapter_container : llama_lora_adapter_info {
    struct llama_lora_adapter * adapter;
};

// build info
extern int LLAMA_BUILD_NUMBER;
extern char const * LLAMA_COMMIT;
extern char const * LLAMA_COMPILER;
extern char const * LLAMA_BUILD_TARGET;

struct llama_control_vector_load_info;

//
// CPU utils
//

struct cpu_params {
    int      n_threads                   = -1;
    bool     cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
    bool     mask_valid                  = false;   // Default: any CPU
    enum ggml_sched_priority priority    = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
    bool     strict_cpu                  = false;   // Use strict CPU placement
    uint32_t poll                        = 50;      // Polling (busywait) level (0 - no polling, 100 - mostly polling)
};

int32_t cpu_get_num_physical_cores();
int32_t cpu_get_num_math();

//
// Common params
//

enum llama_example {
    LLAMA_EXAMPLE_COMMON,
    LLAMA_EXAMPLE_SPECULATIVE,
    LLAMA_EXAMPLE_MAIN,
    LLAMA_EXAMPLE_INFILL,
    LLAMA_EXAMPLE_EMBEDDING,
    LLAMA_EXAMPLE_PERPLEXITY,
    LLAMA_EXAMPLE_RETRIEVAL,
    LLAMA_EXAMPLE_PASSKEY,
    LLAMA_EXAMPLE_IMATRIX,
    LLAMA_EXAMPLE_BENCH,
    LLAMA_EXAMPLE_SERVER,
    LLAMA_EXAMPLE_CVECTOR_GENERATOR,
    LLAMA_EXAMPLE_EXPORT_LORA,
    LLAMA_EXAMPLE_LLAVA,
    LLAMA_EXAMPLE_LOOKUP,
    LLAMA_EXAMPLE_PARALLEL,

    LLAMA_EXAMPLE_COUNT,
};

enum gpt_sampler_type {
    GPT_SAMPLER_TYPE_NONE        = 0,
    GPT_SAMPLER_TYPE_TOP_K       = 1,
    GPT_SAMPLER_TYPE_TOP_P       = 2,
    GPT_SAMPLER_TYPE_MIN_P       = 3,
    GPT_SAMPLER_TYPE_TFS_Z       = 4,
    GPT_SAMPLER_TYPE_TYPICAL_P   = 5,
    GPT_SAMPLER_TYPE_TEMPERATURE = 6,
};

// dimensionality reduction methods, used by cvector-generator
enum dimre_method {
    DIMRE_METHOD_PCA,
    DIMRE_METHOD_MEAN,
};

// sampler parameters
struct gpt_sampler_params {
    uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler

    int32_t n_prev            = 64;    // number of previous tokens to remember
    int32_t n_probs           = 0;     // if greater than 0, output the probabilities of top n_probs tokens.
    int32_t min_keep          = 0;     // 0 = disabled, otherwise samplers should return at least min_keep tokens
    int32_t top_k             = 40;    // <= 0 to use vocab size
    float   top_p             = 0.95f; // 1.0 = disabled
    float   min_p             = 0.05f; // 0.0 = disabled
    float   tfs_z             = 1.00f; // 1.0 = disabled
    float   typ_p             = 1.00f; // typical_p, 1.0 = disabled
    float   temp              = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
    float   dynatemp_range    = 0.00f; // 0.0 = disabled
    float   dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
    int32_t penalty_last_n    = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
    float   penalty_repeat    = 1.00f; // 1.0 = disabled
    float   penalty_freq      = 0.00f; // 0.0 = disabled
    float   penalty_present   = 0.00f; // 0.0 = disabled
    int32_t mirostat          = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
    float   mirostat_tau      = 5.00f; // target entropy
    float   mirostat_eta      = 0.10f; // learning rate
    bool    penalize_nl       = false; // consider newlines as a repeatable token
    bool    ignore_eos        = false;
    bool    no_perf           = false; // disable performance metrics

    std::vector<enum gpt_sampler_type> samplers = {
        GPT_SAMPLER_TYPE_TOP_K,
        GPT_SAMPLER_TYPE_TFS_Z,
        GPT_SAMPLER_TYPE_TYPICAL_P,
        GPT_SAMPLER_TYPE_TOP_P,
        GPT_SAMPLER_TYPE_MIN_P,
        GPT_SAMPLER_TYPE_TEMPERATURE
    };

    std::string grammar; // optional BNF-like grammar to constrain sampling

    std::vector<llama_logit_bias> logit_bias; // logit biases to apply

    // print the parameters into a string
    std::string print() const;
};

struct gpt_params {
    int32_t n_predict          =    -1; // new tokens to predict
    int32_t n_ctx              =     0; // context size
    int32_t n_batch            =  2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
    int32_t n_ubatch           =   512; // physical batch size for prompt processing (must be >=32 to use BLAS)
    int32_t n_keep             =     0; // number of tokens to keep from initial prompt
    int32_t n_draft            =     5; // number of tokens to draft during speculative decoding
    int32_t n_chunks           =    -1; // max number of chunks to process (-1 = unlimited)
    int32_t n_parallel         =     1; // number of parallel sequences to decode
    int32_t n_sequences        =     1; // number of sequences to decode
    float   p_split            =  0.1f; // speculative decoding split probability
    int32_t n_gpu_layers       =    -1; // number of layers to store in VRAM (-1 - use default)
    int32_t n_gpu_layers_draft =    -1; // number of layers to store in VRAM for the draft model (-1 - use default)
    int32_t main_gpu           =     0; // the GPU that is used for scratch and small tensors
|
||||||
|
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
|
||||||
|
int32_t grp_attn_n = 1; // group-attention factor
|
||||||
|
int32_t grp_attn_w = 512; // group-attention width
|
||||||
|
int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
|
||||||
|
float rope_freq_base = 0.0f; // RoPE base frequency
|
||||||
|
float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
|
||||||
|
float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
|
||||||
|
float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
|
||||||
|
float yarn_beta_fast = 32.0f; // YaRN low correction dim
|
||||||
|
float yarn_beta_slow = 1.0f; // YaRN high correction dim
|
||||||
|
int32_t yarn_orig_ctx = 0; // YaRN original context length
|
||||||
|
float defrag_thold = -1.0f; // KV cache defragmentation threshold
|
||||||
|
|
||||||
|
struct cpu_params cpuparams;
|
||||||
|
struct cpu_params cpuparams_batch;
|
||||||
|
struct cpu_params draft_cpuparams;
|
||||||
|
struct cpu_params draft_cpuparams_batch;
|
||||||
|
|
||||||
|
ggml_backend_sched_eval_callback cb_eval = nullptr;
|
||||||
|
void * cb_eval_user_data = nullptr;
|
||||||
|
|
||||||
|
ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
|
||||||
|
|
||||||
|
enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
|
||||||
|
enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
|
||||||
|
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
|
||||||
|
enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
|
||||||
|
|
||||||
|
struct gpt_sampler_params sparams;
|
||||||
|
|
||||||
|
std::string model = ""; // model path // NOLINT
|
||||||
|
std::string model_draft = ""; // draft model for speculative decoding // NOLINT
|
||||||
|
std::string model_alias = "unknown"; // model alias // NOLINT
|
||||||
|
std::string model_url = ""; // model url to download // NOLINT
|
||||||
|
std::string hf_token = ""; // HF token // NOLINT
|
||||||
|
std::string hf_repo = ""; // HF repo // NOLINT
|
||||||
|
std::string hf_file = ""; // HF file // NOLINT
|
||||||
|
std::string prompt = ""; // NOLINT
|
||||||
|
std::string prompt_file = ""; // store the external prompt file name // NOLINT
|
||||||
|
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
|
||||||
|
std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
|
||||||
|
std::string input_suffix = ""; // string to suffix user inputs with // NOLINT
|
||||||
|
std::string logdir = ""; // directory in which to save YAML log files // NOLINT
|
||||||
|
std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
|
||||||
|
std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
|
||||||
|
std::string logits_file = ""; // file for saving *all* logits // NOLINT
|
||||||
|
std::string rpc_servers = ""; // comma separated list of RPC servers // NOLINT
|
||||||
|
|
||||||
|
std::vector<std::string> in_files; // all input files
|
||||||
|
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
|
||||||
|
std::vector<llama_model_kv_override> kv_overrides;
|
||||||
|
|
||||||
|
bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
|
||||||
|
std::vector<llama_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
|
||||||
|
|
||||||
|
std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
|
||||||
|
|
||||||
|
int32_t verbosity = 0;
|
||||||
|
int32_t control_vector_layer_start = -1; // layer range for control vector
|
||||||
|
int32_t control_vector_layer_end = -1; // layer range for control vector
|
||||||
|
|
||||||
|
int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
|
||||||
|
int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
|
||||||
|
// (which is more convenient to use for plotting)
|
||||||
|
//
|
||||||
|
bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
|
||||||
|
size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
|
||||||
|
|
||||||
|
bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
|
||||||
|
size_t winogrande_tasks = 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
|
||||||
|
|
||||||
|
bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
|
||||||
|
size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
|
||||||
|
|
||||||
|
bool kl_divergence = false; // compute KL divergence
|
||||||
|
|
||||||
|
bool usage = false; // print usage
|
||||||
|
bool use_color = false; // use color to distinguish generations and inputs
|
||||||
|
bool special = false; // enable special token output
|
||||||
|
bool interactive = false; // interactive mode
|
||||||
|
bool interactive_first = false; // wait for user input immediately
|
||||||
|
bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
|
||||||
|
bool prompt_cache_all = false; // save user input and generations to prompt cache
|
||||||
|
bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
|
||||||
|
|
||||||
|
bool escape = true; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
|
||||||
|
bool multiline_input = false; // reverse the usage of `\`
|
||||||
|
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
|
||||||
|
bool cont_batching = true; // insert new sequences for decoding on-the-fly
|
||||||
|
bool flash_attn = false; // flash attention
|
||||||
|
bool no_perf = false; // disable performance metrics
|
||||||
|
bool ctx_shift = true; // context shift on inifinite text generation
|
||||||
|
|
||||||
|
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
|
||||||
|
bool logits_all = false; // return logits for all tokens in the batch
|
||||||
|
bool use_mmap = true; // use mmap for faster loads
|
||||||
|
bool use_mlock = false; // use mlock to keep model in memory
|
||||||
|
bool verbose_prompt = false; // print prompt tokens before generation
|
||||||
|
bool display_prompt = true; // print prompt before generation
|
||||||
|
bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
|
||||||
|
bool no_kv_offload = false; // disable KV offloading
|
||||||
|
bool warmup = true; // warmup run
|
||||||
|
bool check_tensors = false; // validate tensor data
|
||||||
|
|
||||||
|
std::string cache_type_k = "f16"; // KV cache data type for the K
|
||||||
|
std::string cache_type_v = "f16"; // KV cache data type for the V
|
||||||
|
|
||||||
|
// multimodal models (see examples/llava)
|
||||||
|
std::string mmproj = ""; // path to multimodal projector // NOLINT
|
||||||
|
std::vector<std::string> image; // path to image file(s)
|
||||||
|
|
||||||
|
// embedding
|
||||||
|
bool embedding = false; // get only sentence embedding
|
||||||
|
int32_t embd_normalize = 2; // normalisation for embendings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
|
||||||
|
std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
|
||||||
|
std::string embd_sep = "\n"; // separator of embendings
|
||||||
|
bool reranking = false; // enable reranking support on server
|
||||||
|
|
||||||
|
// server params
|
||||||
|
int32_t port = 8080; // server listens on this network port
|
||||||
|
int32_t timeout_read = 600; // http read timeout in seconds
|
||||||
|
int32_t timeout_write = timeout_read; // http write timeout in seconds
|
||||||
|
int n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
|
||||||
|
|
||||||
|
std::string hostname = "127.0.0.1";
|
||||||
|
std::string public_path = ""; // NOLINT
|
||||||
|
std::string chat_template = ""; // NOLINT
|
||||||
|
std::string system_prompt = ""; // NOLINT
|
||||||
|
bool enable_chat_template = true;
|
||||||
|
|
||||||
|
std::vector<std::string> api_keys;
|
||||||
|
|
||||||
|
std::string ssl_file_key = ""; // NOLINT
|
||||||
|
std::string ssl_file_cert = ""; // NOLINT
|
||||||
|
|
||||||
|
bool endpoint_slots = true;
|
||||||
|
bool endpoint_metrics = false;
|
||||||
|
|
||||||
|
bool log_json = false;
|
||||||
|
|
||||||
|
std::string slot_save_path;
|
||||||
|
|
||||||
|
float slot_prompt_similarity = 0.5f;
|
||||||
|
|
||||||
|
// batched-bench params
|
||||||
|
bool is_pp_shared = false;
|
||||||
|
|
||||||
|
std::vector<int32_t> n_pp;
|
||||||
|
std::vector<int32_t> n_tg;
|
||||||
|
std::vector<int32_t> n_pl;
|
||||||
|
|
||||||
|
// retrieval params
|
||||||
|
std::vector<std::string> context_files; // context files to embed
|
||||||
|
|
||||||
|
int32_t chunk_size = 64; // chunk size for context embedding
|
||||||
|
|
||||||
|
std::string chunk_separator = "\n"; // chunk separator for context embedding
|
||||||
|
|
||||||
|
// passkey params
|
||||||
|
int32_t n_junk = 250; // number of times to repeat the junk text
|
||||||
|
int32_t i_pos = -1; // position of the passkey in the junk text
|
||||||
|
|
||||||
|
// imatrix params
|
||||||
|
std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file
|
||||||
|
|
||||||
|
int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
|
||||||
|
int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
|
||||||
|
int32_t i_chunk = 0; // start processing from this chunk
|
||||||
|
|
||||||
|
bool process_output = false; // collect data for the output tensor
|
||||||
|
bool compute_ppl = true; // whether to compute perplexity
|
||||||
|
|
||||||
|
// cvector-generator params
|
||||||
|
int n_pca_batch = 100;
|
||||||
|
int n_pca_iterations = 1000;
|
||||||
|
dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
|
||||||
|
std::string cvector_outfile = "control_vector.gguf";
|
||||||
|
std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
|
||||||
|
std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
|
||||||
|
|
||||||
|
bool spm_infill = false; // suffix/prefix/middle pattern for infill
|
||||||
|
|
||||||
|
std::string lora_outfile = "ggml-lora-merged-f16.gguf";
|
||||||
|
|
||||||
|
// batched-bench params
|
||||||
|
bool batched_bench_output_jsonl = false;
|
||||||
|
};
|
||||||
|
|
||||||
|
// call once at the start of a program if it uses libcommon
|
||||||
|
// initializes the logging system and prints info about the build
|
||||||
|
void gpt_init();
|
||||||
|
|
||||||
|
std::string gpt_params_get_system_info(const gpt_params & params);
|
||||||
|
|
||||||
|
bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]);
|
||||||
|
bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
|
||||||
|
void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr);
|
||||||
|
bool set_process_priority(enum ggml_sched_priority prio);
|
||||||
|
|
||||||
|
//
|
||||||
|
// String utils
|
||||||
|
//
|
||||||
|
|
||||||
|
std::vector<std::string> string_split(std::string input, char separator);
|
||||||
|
|
||||||
|
std::string string_strip(const std::string & str);
|
||||||
|
std::string string_get_sortable_timestamp();
|
||||||
|
|
||||||
|
void string_replace_all(std::string & s, const std::string & search, const std::string & replace);
|
||||||
|
|
||||||
|
template<class T>
|
||||||
|
static std::vector<T> string_split(const std::string & str, char delim) {
|
||||||
|
std::vector<T> values;
|
||||||
|
std::istringstream str_stream(str);
|
||||||
|
std::string token;
|
||||||
|
while (std::getline(str_stream, token, delim)) {
|
||||||
|
T value;
|
||||||
|
std::istringstream token_stream(token);
|
||||||
|
token_stream >> value;
|
||||||
|
values.push_back(value);
|
||||||
|
}
|
||||||
|
return values;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
|
||||||
|
void string_process_escapes(std::string & input);
|
||||||
|
|
||||||
|
std::string string_from(bool value);
|
||||||
|
std::string string_from(const std::vector<int> & values);
|
||||||
|
std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
|
||||||
|
std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);
|
||||||
|
|
||||||
|
//
|
||||||
|
// Filesystem utils
|
||||||
|
//
|
||||||
|
|
||||||
|
bool fs_validate_filename(const std::string & filename);
|
||||||
|
bool fs_create_directory_with_parents(const std::string & path);
|
||||||
|
|
||||||
|
std::string fs_get_cache_directory();
|
||||||
|
std::string fs_get_cache_file(const std::string & filename);
|
||||||
|
|
||||||
|
//
|
||||||
|
// Model utils
|
||||||
|
//
|
||||||
|
|
||||||
|
struct llama_init_result {
|
||||||
|
struct llama_model * model = nullptr;
|
||||||
|
struct llama_context * context = nullptr;
|
||||||
|
std::vector<llama_lora_adapter_container> lora_adapters;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct llama_init_result llama_init_from_gpt_params(gpt_params & params);
|
||||||
|
|
||||||
|
struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
|
||||||
|
struct llama_context_params llama_context_params_from_gpt_params (const gpt_params & params);
|
||||||
|
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
|
||||||
|
|
||||||
|
struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
|
||||||
|
struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
|
||||||
|
|
||||||
|
// clear LoRA adapters from context, then apply new list of adapters
|
||||||
|
void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters);
|
||||||
|
|
||||||
|
// Batch utils
|
||||||
|
|
||||||
|
void llama_batch_clear(struct llama_batch & batch);
|
||||||
|
|
||||||
|
void llama_batch_add(
|
||||||
|
struct llama_batch & batch,
|
||||||
|
llama_token id,
|
||||||
|
llama_pos pos,
|
||||||
|
const std::vector<llama_seq_id> & seq_ids,
|
||||||
|
bool logits);
|
||||||
|
|
||||||
|
//
|
||||||
|
// Vocab utils
|
||||||
|
//
|
||||||
|
|
||||||
|
// tokenizes a string into a vector of tokens
|
||||||
|
// should work similar to Python's `tokenizer.encode`
|
||||||
|
std::vector<llama_token> llama_tokenize(
|
||||||
|
const struct llama_context * ctx,
|
||||||
|
const std::string & text,
|
||||||
|
bool add_special,
|
||||||
|
bool parse_special = false);
|
||||||
|
|
||||||
|
std::vector<llama_token> llama_tokenize(
|
||||||
|
const struct llama_model * model,
|
||||||
|
const std::string & text,
|
||||||
|
bool add_special,
|
||||||
|
bool parse_special = false);
|
||||||
|
|
||||||
|
// tokenizes a token into a piece, optionally renders special/control tokens
|
||||||
|
// should work similar to Python's `tokenizer.id_to_piece`
|
||||||
|
std::string llama_token_to_piece(
|
||||||
|
const struct llama_context * ctx,
|
||||||
|
llama_token token,
|
||||||
|
bool special = true);
|
||||||
|
|
||||||
|
// detokenizes a vector of tokens into a string
|
||||||
|
// should work similar to Python's `tokenizer.decode`
|
||||||
|
// optionally renders special/control tokens
|
||||||
|
std::string llama_detokenize(
|
||||||
|
llama_context * ctx,
|
||||||
|
const std::vector<llama_token> & tokens,
|
||||||
|
bool special = true);
|
||||||
|
|
||||||
|
//
|
||||||
|
// Chat template utils
|
||||||
|
//
|
||||||
|
|
||||||
|
// same with llama_chat_message, but uses std::string
|
||||||
|
struct llama_chat_msg {
|
||||||
|
std::string role;
|
||||||
|
std::string content;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
|
||||||
|
bool llama_chat_verify_template(const std::string & tmpl);
|
||||||
|
|
||||||
|
// CPP wrapper for llama_chat_apply_template
|
||||||
|
// If the built-in template is not supported, we default to chatml
|
||||||
|
// If the custom "tmpl" is not supported, we throw an error
|
||||||
|
std::string llama_chat_apply_template(const struct llama_model * model,
|
||||||
|
const std::string & tmpl,
|
||||||
|
const std::vector<llama_chat_msg> & chat,
|
||||||
|
bool add_ass);
|
||||||
|
|
||||||
|
// Format single message, while taking into account the position of that message in chat history
|
||||||
|
std::string llama_chat_format_single(const struct llama_model * model,
|
||||||
|
const std::string & tmpl,
|
||||||
|
const std::vector<llama_chat_msg> & past_msg,
|
||||||
|
const llama_chat_msg & new_msg,
|
||||||
|
bool add_ass);
|
||||||
|
|
||||||
|
// Returns an example of formatted chat
|
||||||
|
std::string llama_chat_format_example(const struct llama_model * model,
|
||||||
|
const std::string & tmpl);
|
||||||
|
|
||||||
|
//
|
||||||
|
// KV cache utils
|
||||||
|
//
|
||||||
|
|
||||||
|
// Dump the KV cache view with the number of sequences per cell.
|
||||||
|
void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
|
||||||
|
|
||||||
|
// Dump the KV cache view showing individual sequences in each cell (long output).
|
||||||
|
void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
|
||||||
|
|
||||||
|
//
|
||||||
|
// Embedding utils
|
||||||
|
//
|
||||||
|
|
||||||
|
void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);
|
||||||
|
|
||||||
|
float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);
|
||||||
|
|
||||||
|
//
|
||||||
|
// Control vector utils
|
||||||
|
//
|
||||||
|
|
||||||
|
struct llama_control_vector_data {
|
||||||
|
int n_embd;
|
||||||
|
|
||||||
|
// stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
|
||||||
|
std::vector<float> data;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct llama_control_vector_load_info {
|
||||||
|
float strength;
|
||||||
|
|
||||||
|
std::string fname;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Load control vectors, scale each by strength, and add them together.
|
||||||
|
// On error, returns {-1, empty}
|
||||||
|
llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos);
|
||||||
|
|
||||||
|
//
|
||||||
|
// Split utils
|
||||||
|
//
|
||||||
|
|
||||||
|
static const char * const LLM_KV_SPLIT_NO = "split.no";
|
||||||
|
static const char * const LLM_KV_SPLIT_COUNT = "split.count";
|
||||||
|
static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
|
||||||
|
|
||||||
|
//
|
||||||
|
// YAML utils
|
||||||
|
//
|
||||||
|
|
||||||
|
void yaml_dump_vector_float (FILE * stream, const char * prop_name, const std::vector<float> & data);
|
||||||
|
void yaml_dump_vector_int (FILE * stream, const char * prop_name, const std::vector<int> & data);
|
||||||
|
void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);
|
||||||
|
|
||||||
|
void yaml_dump_non_result_info(
|
||||||
|
FILE * stream, const gpt_params & params, const llama_context * lctx,
|
||||||
|
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
|
1071
llama/json-schema-to-grammar.cpp
vendored
Normal file
1071
llama/json-schema-to-grammar.cpp
vendored
Normal file
File diff suppressed because it is too large
Load Diff
34
llama/json-schema-to-grammar.h
vendored
Normal file
34
llama/json-schema-to-grammar.h
vendored
Normal file
@ -0,0 +1,34 @@
|
|||||||
|
/**
|
||||||
|
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
||||||
|
*
|
||||||
|
* MIT License
|
||||||
|
*
|
||||||
|
* Copyright (c) 2023-2024 The ggml authors
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
* of this software and associated documentation files (the "Software"), to deal
|
||||||
|
* in the Software without restriction, including without limitation the rights
|
||||||
|
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the Software is
|
||||||
|
* furnished to do so, subject to the following conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be included in all
|
||||||
|
* copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
* SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "ggml.h"
|
||||||
|
// Change JSON_ASSERT from assert() to GGML_ASSERT:
|
||||||
|
#define JSON_ASSERT GGML_ASSERT
|
||||||
|
#include "json.hpp"
|
||||||
|
|
||||||
|
std::string json_schema_to_grammar(const nlohmann::ordered_json& schema);
|
24766
llama/json.hpp
vendored
Normal file
24766
llama/json.hpp
vendored
Normal file
File diff suppressed because it is too large
Load Diff
152
llama/llama.go
152
llama/llama.go
@ -21,6 +21,8 @@ package llama
|
|||||||
#cgo cuda CFLAGS: -fPIE -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
|
#cgo cuda CFLAGS: -fPIE -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
|
||||||
#cgo cuda CXXFLAGS: -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
|
#cgo cuda CXXFLAGS: -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
|
||||||
#cgo cuda CXXFLAGS: -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
|
#cgo cuda CXXFLAGS: -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
|
||||||
|
#cgo cuda_jetpack5 LDFLAGS: -lggml_cuda_jetpack5 -L/usr/local/cuda-11/lib64
|
||||||
|
#cgo cuda_jetpack6 LDFLAGS: -lggml_cuda_jetpack6 -L/usr/local/cuda-12/lib64
|
||||||
#cgo cuda_v11 LDFLAGS: -lggml_cuda_v11 -L/usr/local/cuda-11/lib64
|
#cgo cuda_v11 LDFLAGS: -lggml_cuda_v11 -L/usr/local/cuda-11/lib64
|
||||||
#cgo cuda_v12 LDFLAGS: -lggml_cuda_v12 -L/usr/local/cuda-12/lib64
|
#cgo cuda_v12 LDFLAGS: -lggml_cuda_v12 -L/usr/local/cuda-12/lib64
|
||||||
#cgo darwin,amd64 CFLAGS: -Wno-incompatible-pointer-types-discards-qualifiers
|
#cgo darwin,amd64 CFLAGS: -Wno-incompatible-pointer-types-discards-qualifiers
|
||||||
@ -36,8 +38,8 @@ package llama
|
|||||||
#cgo linux CXXFLAGS: -D_GNU_SOURCE
|
#cgo linux CXXFLAGS: -D_GNU_SOURCE
|
||||||
#cgo linux,amd64 LDFLAGS: -L${SRCDIR}/build/Linux/amd64
|
#cgo linux,amd64 LDFLAGS: -L${SRCDIR}/build/Linux/amd64
|
||||||
#cgo linux,amd64 LDFLAGS: -L${SRCDIR}/build/Linux/amd64
|
#cgo linux,amd64 LDFLAGS: -L${SRCDIR}/build/Linux/amd64
|
||||||
#cgo linux,arm64 CFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA -D__ARM_FEATURE_MATMUL_INT8
|
#cgo linux,arm64 CFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA
|
||||||
#cgo linux,arm64 CXXFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA -D__ARM_FEATURE_MATMUL_INT8
|
#cgo linux,arm64 CXXFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA
|
||||||
#cgo linux,arm64 LDFLAGS: -L${SRCDIR}/build/Linux/arm64
|
#cgo linux,arm64 LDFLAGS: -L${SRCDIR}/build/Linux/arm64
|
||||||
#cgo linux,arm64,sve CFLAGS: -march=armv8.6-a+sve
|
#cgo linux,arm64,sve CFLAGS: -march=armv8.6-a+sve
|
||||||
#cgo linux,arm64,sve CXXFLAGS: -march=armv8.6-a+sve
|
#cgo linux,arm64,sve CXXFLAGS: -march=armv8.6-a+sve
|
||||||
@ -65,6 +67,7 @@ package llama
|
|||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
#include "llava.h"
|
#include "llava.h"
|
||||||
#include "mllama.h"
|
#include "mllama.h"
|
||||||
|
#include "sampling_ext.h"
|
||||||
|
|
||||||
bool llamaProgressCallback(float progress, void *user_data);
|
bool llamaProgressCallback(float progress, void *user_data);
|
||||||
|
|
||||||
@ -85,7 +88,6 @@ import (
|
|||||||
_ "embed"
|
_ "embed"
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"math"
|
|
||||||
"runtime"
|
"runtime"
|
||||||
"runtime/cgo"
|
"runtime/cgo"
|
||||||
"slices"
|
"slices"
|
||||||
@ -181,15 +183,6 @@ func (c *Context) Model() *Model {
|
|||||||
return &Model{c: C.llama_get_model(c.c)}
|
return &Model{c: C.llama_get_model(c.c)}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Context) GetLogitsIth(i int) ([]float32, error) {
|
|
||||||
logits := (*float32)(unsafe.Pointer(C.llama_get_logits_ith(c.c, C.int(i))))
|
|
||||||
if logits == nil {
|
|
||||||
return nil, errors.New("unable to get logits")
|
|
||||||
}
|
|
||||||
|
|
||||||
return unsafe.Slice(logits, c.Model().NumVocab()), nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (c *Context) KvCacheSeqAdd(seqId int, p0 int, p1 int, delta int) {
|
func (c *Context) KvCacheSeqAdd(seqId int, p0 int, p1 int, delta int) {
|
||||||
C.llama_kv_cache_seq_add(c.c, C.int(seqId), C.int(p0), C.int(p1), C.int(delta))
|
C.llama_kv_cache_seq_add(c.c, C.int(seqId), C.int(p0), C.int(p1), C.int(delta))
|
||||||
}
|
}
|
||||||
@ -608,6 +601,11 @@ func (c *Context) SetCrossAttention(state bool) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// sampling
|
// sampling
|
||||||
|
// TODO: this is a temporary wrapper to allow calling C++ code from CGo
|
||||||
|
type SamplingContext struct {
|
||||||
|
c *C.struct_gpt_sampler
|
||||||
|
}
|
||||||
|
|
||||||
type SamplingParams struct {
|
type SamplingParams struct {
|
||||||
TopK int
|
TopK int
|
||||||
TopP float32
|
TopP float32
|
||||||
@ -627,120 +625,46 @@ type SamplingParams struct {
|
|||||||
Grammar string
|
Grammar string
|
||||||
}
|
}
|
||||||
|
|
||||||
type SamplingContext struct {
|
|
||||||
chain *C.struct_llama_sampler
|
|
||||||
grammar *C.struct_llama_sampler
|
|
||||||
}
|
|
||||||
|
|
||||||
func NewSamplingContext(model *Model, params SamplingParams) (*SamplingContext, error) {
|
func NewSamplingContext(model *Model, params SamplingParams) (*SamplingContext, error) {
|
||||||
var s SamplingContext
|
var cparams C.struct_gpt_sampler_cparams
|
||||||
runtime.SetFinalizer(&s, func(s *SamplingContext) { s.free() })
|
cparams.top_k = C.int32_t(params.TopK)
|
||||||
|
cparams.top_p = C.float(params.TopP)
|
||||||
sparams := C.llama_sampler_chain_default_params()
|
cparams.min_p = C.float(params.MinP)
|
||||||
s.chain = C.llama_sampler_chain_init(sparams)
|
cparams.tfs_z = C.float(params.TfsZ)
|
||||||
|
cparams.typical_p = C.float(params.TypicalP)
|
||||||
|
cparams.temp = C.float(params.Temp)
|
||||||
|
cparams.penalty_last_n = C.int32_t(params.RepeatLastN)
|
||||||
|
cparams.penalty_repeat = C.float(params.PenaltyRepeat)
|
||||||
|
cparams.penalty_freq = C.float(params.PenaltyFreq)
|
||||||
|
cparams.penalty_present = C.float(params.PenaltyFreq)
|
||||||
|
cparams.mirostat = C.int32_t(params.Mirostat)
|
||||||
|
cparams.mirostat_tau = C.float(params.MirostatTau)
|
||||||
|
cparams.mirostat_eta = C.float(params.MirostatEta)
|
||||||
|
cparams.penalize_nl = C.bool(params.PenalizeNl)
|
||||||
|
cparams.seed = C.uint32_t(params.Seed)
|
||||||
|
|
||||||
grammar := C.CString(params.Grammar)
|
grammar := C.CString(params.Grammar)
|
||||||
defer C.free(unsafe.Pointer(grammar))
|
defer C.free(unsafe.Pointer(grammar))
|
||||||
root := C.CString("root")
|
|
||||||
defer C.free(unsafe.Pointer(root))
|
|
||||||
s.grammar = C.llama_sampler_init_grammar(model.c, grammar, root)
|
|
||||||
|
|
||||||
C.llama_sampler_chain_add(s.chain,
|
cparams.grammar = grammar
|
||||||
C.llama_sampler_init_penalties(
|
context := &SamplingContext{c: C.gpt_sampler_cinit(model.c, &cparams)}
|
||||||
C.llama_n_vocab(model.c),
|
if context.c == nil {
|
||||||
C.llama_token_eos(model.c),
|
return nil, errors.New("unable to create sampling context")
|
||||||
C.llama_token_nl(model.c),
|
|
||||||
C.int32_t(params.RepeatLastN),
|
|
||||||
C.float(params.PenaltyRepeat),
|
|
||||||
C.float(params.PenaltyFreq),
|
|
||||||
C.float(params.PenaltyPresent),
|
|
||||||
C.bool(params.PenalizeNl),
|
|
||||||
false))
|
|
||||||
|
|
||||||
if params.Temp > 0 {
|
|
||||||
switch params.Mirostat {
|
|
||||||
case 0:
|
|
||||||
C.llama_sampler_chain_add(s.chain, C.llama_sampler_init_top_k(C.int32_t(params.TopK)))
|
|
||||||
C.llama_sampler_chain_add(s.chain, C.llama_sampler_init_tail_free(C.float(params.TfsZ), 0))
|
|
||||||
C.llama_sampler_chain_add(s.chain, C.llama_sampler_init_typical(C.float(params.TypicalP), 0))
|
|
||||||
C.llama_sampler_chain_add(s.chain, C.llama_sampler_init_top_p(C.float(params.TopP), 0))
|
|
||||||
C.llama_sampler_chain_add(s.chain, C.llama_sampler_init_min_p(C.float(params.MinP), 0))
|
|
||||||
C.llama_sampler_chain_add(s.chain, C.llama_sampler_init_temp(C.float(params.Temp)))
|
|
||||||
|
|
||||||
C.llama_sampler_chain_add(s.chain, C.llama_sampler_init_softmax())
|
|
||||||
C.llama_sampler_chain_add(s.chain, C.llama_sampler_init_dist(C.uint32_t(params.Seed)))
|
|
||||||
case 1:
|
|
||||||
C.llama_sampler_chain_add(s.chain, C.llama_sampler_init_temp(C.float(params.Temp)))
|
|
||||||
C.llama_sampler_chain_add(s.chain, C.llama_sampler_init_mirostat(C.llama_n_vocab(model.c),
|
|
||||||
C.uint32_t(params.Seed), C.float(params.MirostatTau), C.float(params.MirostatEta), 100))
|
|
||||||
case 2:
|
|
||||||
C.llama_sampler_chain_add(s.chain, C.llama_sampler_init_temp(C.float(params.Temp)))
|
|
||||||
C.llama_sampler_chain_add(s.chain, C.llama_sampler_init_mirostat_v2(C.uint32_t(params.Seed),
|
|
||||||
C.float(params.MirostatTau), C.float(params.MirostatEta)))
|
|
||||||
default:
|
|
||||||
return nil, fmt.Errorf("sampling: unknown mirostat version: %v", params.Mirostat)
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
C.llama_sampler_chain_add(s.chain, C.llama_sampler_init_greedy())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return &s, nil
|
runtime.SetFinalizer(context, func(s *SamplingContext) { C.gpt_sampler_cfree(s.c) })
|
||||||
|
|
||||||
|
return context, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *SamplingContext) Sample(llamaContext *Context, idx int) (int, error) {
|
func (s *SamplingContext) Reset() {
|
||||||
logits, err := llamaContext.GetLogitsIth(idx)
|
C.gpt_sampler_creset(s.c)
|
||||||
if err != nil {
|
|
||||||
return 0, err
|
|
||||||
}
|
}
|
||||||
|
|
||||||
numVocab := llamaContext.Model().NumVocab()
|
func (s *SamplingContext) Sample(llamaContext *Context, idx int) int {
|
||||||
|
return int(C.gpt_sampler_csample(s.c, llamaContext.c, C.int(idx)))
|
||||||
tokenData := make([]C.llama_token_data, numVocab)
|
|
||||||
var tokenDataPin runtime.Pinner
|
|
||||||
tokenDataPin.Pin(&tokenData[0])
|
|
||||||
defer tokenDataPin.Unpin()
|
|
||||||
|
|
||||||
for i := range tokenData {
|
|
||||||
tokenData[i] = C.llama_token_data{id: C.llama_token(i), logit: C.float(logits[i])}
|
|
||||||
}
|
|
||||||
tokenDataArray := C.llama_token_data_array{data: &tokenData[0], size: C.size_t(len(tokenData)), selected: -1}
|
|
||||||
|
|
||||||
C.llama_sampler_apply(s.chain, &tokenDataArray)
|
|
||||||
|
|
||||||
id := tokenData[tokenDataArray.selected].id
|
|
||||||
|
|
||||||
// Check if the selected token is allowed by the grammar
|
|
||||||
// If it is allowed then return it, otherwise evaluate the grammar on all
|
|
||||||
// tokens and resample (slow)
|
|
||||||
tokenData[0] = C.llama_token_data{id: id, logit: 1}
|
|
||||||
tokenDataArray = C.llama_token_data_array{data: &tokenData[0], size: 1, selected: -1}
|
|
||||||
|
|
||||||
C.llama_sampler_apply(s.grammar, &tokenDataArray)
|
|
||||||
if !math.IsInf(float64(tokenData[0].logit), -1) {
|
|
||||||
return int(id), nil
|
|
||||||
}
|
|
||||||
|
|
||||||
for i := range tokenData {
|
|
||||||
tokenData[i] = C.llama_token_data{id: C.llama_token(i), logit: C.float(logits[i])}
|
|
||||||
}
|
|
||||||
tokenDataArray = C.llama_token_data_array{data: &tokenData[0], size: C.size_t(len(tokenData)), selected: -1}
|
|
||||||
|
|
||||||
C.llama_sampler_apply(s.grammar, &tokenDataArray)
|
|
||||||
C.llama_sampler_apply(s.chain, &tokenDataArray)
|
|
||||||
|
|
||||||
return int(tokenData[tokenDataArray.selected].id), nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *SamplingContext) Accept(id int, applyGrammar bool) {
|
func (s *SamplingContext) Accept(id int, applyGrammar bool) {
|
||||||
if applyGrammar {
|
C.gpt_sampler_caccept(s.c, C.llama_token(id), C.bool(applyGrammar))
|
||||||
C.llama_sampler_accept(s.grammar, C.llama_token(id))
|
|
||||||
}
|
|
||||||
C.llama_sampler_accept(s.chain, C.llama_token(id))
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *SamplingContext) free() {
|
|
||||||
if s != nil {
|
|
||||||
C.llama_sampler_free(s.grammar)
|
|
||||||
C.llama_sampler_free(s.chain)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
427
llama/log.cpp
vendored
Normal file
427
llama/log.cpp
vendored
Normal file
@ -0,0 +1,427 @@
|
|||||||
|
/**
|
||||||
|
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
||||||
|
*
|
||||||
|
* MIT License
|
||||||
|
*
|
||||||
|
* Copyright (c) 2023-2024 The ggml authors
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
* of this software and associated documentation files (the "Software"), to deal
|
||||||
|
* in the Software without restriction, including without limitation the rights
|
||||||
|
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the Software is
|
||||||
|
* furnished to do so, subject to the following conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be included in all
|
||||||
|
* copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
* SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "log.h"
|
||||||
|
|
||||||
|
#include <condition_variable>
|
||||||
|
#include <cstdarg>
|
||||||
|
#include <cstdio>
|
||||||
|
#include <mutex>
|
||||||
|
#include <sstream>
|
||||||
|
#include <thread>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
int gpt_log_verbosity_thold = LOG_DEFAULT_LLAMA;
|
||||||
|
|
||||||
|
void gpt_log_set_verbosity_thold(int verbosity) {
|
||||||
|
gpt_log_verbosity_thold = verbosity;
|
||||||
|
}
|
||||||
|
|
||||||
|
#define LOG_COL_DEFAULT "\033[0m"
|
||||||
|
#define LOG_COL_BOLD "\033[1m"
|
||||||
|
#define LOG_COL_RED "\033[31m"
|
||||||
|
#define LOG_COL_GREEN "\033[32m"
|
||||||
|
#define LOG_COL_YELLOW "\033[33m"
|
||||||
|
#define LOG_COL_BLUE "\033[34m"
|
||||||
|
#define LOG_COL_MAGENTA "\033[35m"
|
||||||
|
#define LOG_COL_CYAN "\033[36m"
|
||||||
|
#define LOG_COL_WHITE "\033[37m"
|
||||||
|
|
||||||
|
static int64_t t_us() {
|
||||||
|
return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
|
||||||
|
}
|
||||||
|
|
||||||
|
// colors
|
||||||
|
enum gpt_log_col : int {
|
||||||
|
GPT_LOG_COL_DEFAULT = 0,
|
||||||
|
GPT_LOG_COL_BOLD,
|
||||||
|
GPT_LOG_COL_RED,
|
||||||
|
GPT_LOG_COL_GREEN,
|
||||||
|
GPT_LOG_COL_YELLOW,
|
||||||
|
GPT_LOG_COL_BLUE,
|
||||||
|
GPT_LOG_COL_MAGENTA,
|
||||||
|
GPT_LOG_COL_CYAN,
|
||||||
|
GPT_LOG_COL_WHITE,
|
||||||
|
};
|
||||||
|
|
||||||
|
// disable colors by default
|
||||||
|
static std::vector<const char *> g_col = {
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
};
|
||||||
|
|
||||||
|
struct gpt_log_entry {
|
||||||
|
enum ggml_log_level level;
|
||||||
|
|
||||||
|
bool prefix;
|
||||||
|
|
||||||
|
int64_t timestamp;
|
||||||
|
|
||||||
|
std::vector<char> msg;
|
||||||
|
|
||||||
|
// signals the worker thread to stop
|
||||||
|
bool is_end;
|
||||||
|
|
||||||
|
void print(FILE * file = nullptr) const {
|
||||||
|
FILE * fcur = file;
|
||||||
|
if (!fcur) {
|
||||||
|
// stderr displays DBG messages only when their verbosity level is not higher than the threshold
|
||||||
|
// these messages will still be logged to a file
|
||||||
|
if (level == GGML_LOG_LEVEL_DEBUG && gpt_log_verbosity_thold < LOG_DEFAULT_DEBUG) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
fcur = stdout;
|
||||||
|
|
||||||
|
if (level != GGML_LOG_LEVEL_NONE) {
|
||||||
|
fcur = stderr;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (level != GGML_LOG_LEVEL_NONE && level != GGML_LOG_LEVEL_CONT && prefix) {
|
||||||
|
if (timestamp) {
|
||||||
|
// [M.s.ms.us]
|
||||||
|
fprintf(fcur, "%s%d.%02d.%03d.%03d%s ",
|
||||||
|
g_col[GPT_LOG_COL_BLUE],
|
||||||
|
(int) (timestamp / 1000000 / 60),
|
||||||
|
(int) (timestamp / 1000000 % 60),
|
||||||
|
(int) (timestamp / 1000 % 1000),
|
||||||
|
(int) (timestamp % 1000),
|
||||||
|
g_col[GPT_LOG_COL_DEFAULT]);
|
||||||
|
}
|
||||||
|
|
||||||
|
switch (level) {
|
||||||
|
case GGML_LOG_LEVEL_INFO: fprintf(fcur, "%sI %s", g_col[GPT_LOG_COL_GREEN], g_col[GPT_LOG_COL_DEFAULT]); break;
|
||||||
|
case GGML_LOG_LEVEL_WARN: fprintf(fcur, "%sW %s", g_col[GPT_LOG_COL_MAGENTA], "" ); break;
|
||||||
|
case GGML_LOG_LEVEL_ERROR: fprintf(fcur, "%sE %s", g_col[GPT_LOG_COL_RED], "" ); break;
|
||||||
|
case GGML_LOG_LEVEL_DEBUG: fprintf(fcur, "%sD %s", g_col[GPT_LOG_COL_YELLOW], "" ); break;
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fprintf(fcur, "%s", msg.data());
|
||||||
|
|
||||||
|
if (level == GGML_LOG_LEVEL_WARN || level == GGML_LOG_LEVEL_ERROR || level == GGML_LOG_LEVEL_DEBUG) {
|
||||||
|
fprintf(fcur, "%s", g_col[GPT_LOG_COL_DEFAULT]);
|
||||||
|
}
|
||||||
|
|
||||||
|
fflush(fcur);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct gpt_log {
|
||||||
|
// default capacity - will be expanded if needed
|
||||||
|
gpt_log() : gpt_log(256) {}
|
||||||
|
|
||||||
|
gpt_log(size_t capacity) {
|
||||||
|
file = nullptr;
|
||||||
|
prefix = false;
|
||||||
|
timestamps = false;
|
||||||
|
running = false;
|
||||||
|
t_start = t_us();
|
||||||
|
|
||||||
|
// initial message size - will be expanded if longer messages arrive
|
||||||
|
entries.resize(capacity);
|
||||||
|
for (auto & entry : entries) {
|
||||||
|
entry.msg.resize(256);
|
||||||
|
}
|
||||||
|
|
||||||
|
head = 0;
|
||||||
|
tail = 0;
|
||||||
|
|
||||||
|
resume();
|
||||||
|
}
|
||||||
|
|
||||||
|
~gpt_log() {
|
||||||
|
pause();
|
||||||
|
if (file) {
|
||||||
|
fclose(file);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
std::mutex mtx;
|
||||||
|
std::thread thrd;
|
||||||
|
std::condition_variable cv;
|
||||||
|
|
||||||
|
FILE * file;
|
||||||
|
|
||||||
|
bool prefix;
|
||||||
|
bool timestamps;
|
||||||
|
bool running;
|
||||||
|
|
||||||
|
int64_t t_start;
|
||||||
|
|
||||||
|
// ring buffer of entries
|
||||||
|
std::vector<gpt_log_entry> entries;
|
||||||
|
size_t head;
|
||||||
|
size_t tail;
|
||||||
|
|
||||||
|
// worker thread copies into this
|
||||||
|
gpt_log_entry cur;
|
||||||
|
|
||||||
|
public:
|
||||||
|
void add(enum ggml_log_level level, const char * fmt, va_list args) {
|
||||||
|
std::lock_guard<std::mutex> lock(mtx);
|
||||||
|
|
||||||
|
if (!running) {
|
||||||
|
// discard messages while the worker thread is paused
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto & entry = entries[tail];
|
||||||
|
|
||||||
|
{
|
||||||
|
// cannot use args twice, so make a copy in case we need to expand the buffer
|
||||||
|
va_list args_copy;
|
||||||
|
va_copy(args_copy, args);
|
||||||
|
|
||||||
|
#if 1
|
||||||
|
const size_t n = vsnprintf(entry.msg.data(), entry.msg.size(), fmt, args);
|
||||||
|
if (n >= entry.msg.size()) {
|
||||||
|
entry.msg.resize(n + 1);
|
||||||
|
vsnprintf(entry.msg.data(), entry.msg.size(), fmt, args_copy);
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
// hack for bolding arguments
|
||||||
|
|
||||||
|
std::stringstream ss;
|
||||||
|
for (int i = 0; fmt[i] != 0; i++) {
|
||||||
|
if (fmt[i] == '%') {
|
||||||
|
ss << LOG_COL_BOLD;
|
||||||
|
while (fmt[i] != ' ' && fmt[i] != ')' && fmt[i] != ']' && fmt[i] != 0) ss << fmt[i++];
|
||||||
|
ss << LOG_COL_DEFAULT;
|
||||||
|
if (fmt[i] == 0) break;
|
||||||
|
}
|
||||||
|
ss << fmt[i];
|
||||||
|
}
|
||||||
|
const size_t n = vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args);
|
||||||
|
if (n >= entry.msg.size()) {
|
||||||
|
entry.msg.resize(n + 1);
|
||||||
|
vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args_copy);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
entry.level = level;
|
||||||
|
entry.prefix = prefix;
|
||||||
|
entry.timestamp = 0;
|
||||||
|
if (timestamps) {
|
||||||
|
entry.timestamp = t_us() - t_start;
|
||||||
|
}
|
||||||
|
entry.is_end = false;
|
||||||
|
|
||||||
|
tail = (tail + 1) % entries.size();
|
||||||
|
if (tail == head) {
|
||||||
|
// expand the buffer
|
||||||
|
std::vector<gpt_log_entry> new_entries(2*entries.size());
|
||||||
|
|
||||||
|
size_t new_tail = 0;
|
||||||
|
|
||||||
|
do {
|
||||||
|
new_entries[new_tail] = std::move(entries[head]);
|
||||||
|
|
||||||
|
head = (head + 1) % entries.size();
|
||||||
|
new_tail = (new_tail + 1);
|
||||||
|
} while (head != tail);
|
||||||
|
|
||||||
|
head = 0;
|
||||||
|
tail = new_tail;
|
||||||
|
|
||||||
|
for (size_t i = tail; i < new_entries.size(); i++) {
|
||||||
|
new_entries[i].msg.resize(256);
|
||||||
|
}
|
||||||
|
|
||||||
|
entries = std::move(new_entries);
|
||||||
|
}
|
||||||
|
|
||||||
|
cv.notify_one();
|
||||||
|
}
|
||||||
|
|
||||||
|
void resume() {
|
||||||
|
std::lock_guard<std::mutex> lock(mtx);
|
||||||
|
|
||||||
|
if (running) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
running = true;
|
||||||
|
|
||||||
|
thrd = std::thread([this]() {
|
||||||
|
while (true) {
|
||||||
|
{
|
||||||
|
std::unique_lock<std::mutex> lock(mtx);
|
||||||
|
cv.wait(lock, [this]() { return head != tail; });
|
||||||
|
|
||||||
|
cur = entries[head];
|
||||||
|
|
||||||
|
head = (head + 1) % entries.size();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (cur.is_end) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
cur.print(); // stdout and stderr
|
||||||
|
|
||||||
|
if (file) {
|
||||||
|
cur.print(file);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
void pause() {
|
||||||
|
{
|
||||||
|
std::lock_guard<std::mutex> lock(mtx);
|
||||||
|
|
||||||
|
if (!running) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
running = false;
|
||||||
|
|
||||||
|
// push an entry to signal the worker thread to stop
|
||||||
|
{
|
||||||
|
auto & entry = entries[tail];
|
||||||
|
entry.is_end = true;
|
||||||
|
|
||||||
|
tail = (tail + 1) % entries.size();
|
||||||
|
}
|
||||||
|
|
||||||
|
cv.notify_one();
|
||||||
|
}
|
||||||
|
|
||||||
|
thrd.join();
|
||||||
|
}
|
||||||
|
|
||||||
|
void set_file(const char * path) {
|
||||||
|
pause();
|
||||||
|
|
||||||
|
if (file) {
|
||||||
|
fclose(file);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (path) {
|
||||||
|
file = fopen(path, "w");
|
||||||
|
} else {
|
||||||
|
file = nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
resume();
|
||||||
|
}
|
||||||
|
|
||||||
|
void set_colors(bool colors) {
|
||||||
|
pause();
|
||||||
|
|
||||||
|
if (colors) {
|
||||||
|
g_col[GPT_LOG_COL_DEFAULT] = LOG_COL_DEFAULT;
|
||||||
|
g_col[GPT_LOG_COL_BOLD] = LOG_COL_BOLD;
|
||||||
|
g_col[GPT_LOG_COL_RED] = LOG_COL_RED;
|
||||||
|
g_col[GPT_LOG_COL_GREEN] = LOG_COL_GREEN;
|
||||||
|
g_col[GPT_LOG_COL_YELLOW] = LOG_COL_YELLOW;
|
||||||
|
g_col[GPT_LOG_COL_BLUE] = LOG_COL_BLUE;
|
||||||
|
g_col[GPT_LOG_COL_MAGENTA] = LOG_COL_MAGENTA;
|
||||||
|
g_col[GPT_LOG_COL_CYAN] = LOG_COL_CYAN;
|
||||||
|
g_col[GPT_LOG_COL_WHITE] = LOG_COL_WHITE;
|
||||||
|
} else {
|
||||||
|
for (size_t i = 0; i < g_col.size(); i++) {
|
||||||
|
g_col[i] = "";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
resume();
|
||||||
|
}
|
||||||
|
|
||||||
|
void set_prefix(bool prefix) {
|
||||||
|
std::lock_guard<std::mutex> lock(mtx);
|
||||||
|
|
||||||
|
this->prefix = prefix;
|
||||||
|
}
|
||||||
|
|
||||||
|
void set_timestamps(bool timestamps) {
|
||||||
|
std::lock_guard<std::mutex> lock(mtx);
|
||||||
|
|
||||||
|
this->timestamps = timestamps;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
//
|
||||||
|
// public API
|
||||||
|
//
|
||||||
|
|
||||||
|
struct gpt_log * gpt_log_init() {
|
||||||
|
return new gpt_log;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct gpt_log * gpt_log_main() {
|
||||||
|
static struct gpt_log log;
|
||||||
|
|
||||||
|
return &log;
|
||||||
|
}
|
||||||
|
|
||||||
|
void gpt_log_pause(struct gpt_log * log) {
|
||||||
|
log->pause();
|
||||||
|
}
|
||||||
|
|
||||||
|
void gpt_log_resume(struct gpt_log * log) {
|
||||||
|
log->resume();
|
||||||
|
}
|
||||||
|
|
||||||
|
void gpt_log_free(struct gpt_log * log) {
|
||||||
|
delete log;
|
||||||
|
}
|
||||||
|
|
||||||
|
void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * fmt, ...) {
|
||||||
|
va_list args;
|
||||||
|
va_start(args, fmt);
|
||||||
|
log->add(level, fmt, args);
|
||||||
|
va_end(args);
|
||||||
|
}
|
||||||
|
|
||||||
|
void gpt_log_set_file(struct gpt_log * log, const char * file) {
|
||||||
|
log->set_file(file);
|
||||||
|
}
|
||||||
|
|
||||||
|
void gpt_log_set_colors(struct gpt_log * log, bool colors) {
|
||||||
|
log->set_colors(colors);
|
||||||
|
}
|
||||||
|
|
||||||
|
void gpt_log_set_prefix(struct gpt_log * log, bool prefix) {
|
||||||
|
log->set_prefix(prefix);
|
||||||
|
}
|
||||||
|
|
||||||
|
void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps) {
|
||||||
|
log->set_timestamps(timestamps);
|
||||||
|
}
|
118
llama/log.h
vendored
Normal file
118
llama/log.h
vendored
Normal file
@ -0,0 +1,118 @@
|
|||||||
|
/**
|
||||||
|
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
||||||
|
*
|
||||||
|
* MIT License
|
||||||
|
*
|
||||||
|
* Copyright (c) 2023-2024 The ggml authors
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
* of this software and associated documentation files (the "Software"), to deal
|
||||||
|
* in the Software without restriction, including without limitation the rights
|
||||||
|
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the Software is
|
||||||
|
* furnished to do so, subject to the following conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be included in all
|
||||||
|
* copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
* SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "ggml.h" // for ggml_log_level
|
||||||
|
|
||||||
|
#ifndef __GNUC__
|
||||||
|
# define LOG_ATTRIBUTE_FORMAT(...)
|
||||||
|
#elif defined(__MINGW32__)
|
||||||
|
# define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
|
||||||
|
#else
|
||||||
|
# define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define LOG_DEFAULT_DEBUG 1
|
||||||
|
#define LOG_DEFAULT_LLAMA 0
|
||||||
|
|
||||||
|
// needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity lower
|
// set via gpt_log_set_verbosity()
extern int gpt_log_verbosity_thold;

void gpt_log_set_verbosity_thold(int verbosity); // not thread-safe

// the gpt_log uses an internal worker thread to print/write log messages
// when the worker thread is paused, incoming log messages are discarded
struct gpt_log;

struct gpt_log * gpt_log_init();
struct gpt_log * gpt_log_main(); // singleton, automatically destroys itself on exit
void gpt_log_pause (struct gpt_log * log); // pause the worker thread, not thread-safe
void gpt_log_resume(struct gpt_log * log); // resume the worker thread, not thread-safe
void gpt_log_free  (struct gpt_log * log);

LOG_ATTRIBUTE_FORMAT(3, 4)
void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * fmt, ...);

// defaults: file = NULL, colors = false, prefix = false, timestamps = false
//
// regular log output:
//
// ggml_backend_metal_log_allocated_size: allocated buffer, size = 6695.84 MiB, ( 6695.91 / 21845.34)
// llm_load_tensors: ggml ctx size = 0.27 MiB
// llm_load_tensors: offloading 32 repeating layers to GPU
// llm_load_tensors: offloading non-repeating layers to GPU
//
// with prefix = true, timestamps = true, the log output will look like this:
//
// 0.00.035.060 D ggml_backend_metal_log_allocated_size: allocated buffer, size = 6695.84 MiB, ( 6695.91 / 21845.34)
// 0.00.035.064 I llm_load_tensors: ggml ctx size = 0.27 MiB
// 0.00.090.578 I llm_load_tensors: offloading 32 repeating layers to GPU
// 0.00.090.579 I llm_load_tensors: offloading non-repeating layers to GPU
//
// I - info    (stdout, V = 0)
// W - warning (stderr, V = 0)
// E - error   (stderr, V = 0)
// D - debug   (stderr, V = LOG_DEFAULT_DEBUG)
//

void gpt_log_set_file      (struct gpt_log * log, const char * file); // not thread-safe
void gpt_log_set_colors    (struct gpt_log * log, bool colors);       // not thread-safe
void gpt_log_set_prefix    (struct gpt_log * log, bool prefix);       // whether to output prefix to each log
void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps);   // whether to output timestamps in the prefix

// helper macros for logging
// use these to avoid computing log arguments if the verbosity of the log is higher than the threshold
//
// for example:
//
// LOG_DBG("this is a debug message: %d\n", expensive_function());
//
// this will avoid calling expensive_function() if LOG_DEFAULT_DEBUG > gpt_log_verbosity_thold
//

#define LOG_TMPL(level, verbosity, ...) \
    do { \
        if ((verbosity) <= gpt_log_verbosity_thold) { \
            gpt_log_add(gpt_log_main(), (level), __VA_ARGS__); \
        } \
    } while (0)

#define LOG(...)             LOG_TMPL(GGML_LOG_LEVEL_NONE, 0, __VA_ARGS__)
#define LOGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_NONE, verbosity, __VA_ARGS__)

#define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO,  0, __VA_ARGS__)
#define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN,  0, __VA_ARGS__)
#define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, 0, __VA_ARGS__)
#define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, LOG_DEFAULT_DEBUG, __VA_ARGS__)
#define LOG_CNT(...) LOG_TMPL(GGML_LOG_LEVEL_CONT,  0, __VA_ARGS__)

#define LOG_INFV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_INFO,  verbosity, __VA_ARGS__)
#define LOG_WRNV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_WARN,  verbosity, __VA_ARGS__)
#define LOG_ERRV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, verbosity, __VA_ARGS__)
#define LOG_DBGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, verbosity, __VA_ARGS__)
#define LOG_CNTV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_CONT,  verbosity, __VA_ARGS__)
@@ -58,6 +58,8 @@ endif
 GPU_COMPILER_CUFLAGS = \
 	$(GPU_COMPILER_FPIC) \
 	$(addprefix -m,$(GPU_RUNNER_CPU_FLAGS)) \
+	-mf16c \
+	-mfma \
 	-parallel-jobs=2 \
 	-c \
 	-O3 \
@@ -77,6 +79,9 @@ GPU_COMPILER_CUFLAGS = \
 	-D_CRT_SECURE_NO_WARNINGS \
 	-D_GNU_SOURCE \
 	-D_XOPEN_SOURCE=600 \
+	-DUSE_PROF_API=1 \
+	-std=gnu++14 \
+	-x hip \
 	-mllvm=-amdgpu-early-inline-all=true \
 	-mllvm=-amdgpu-function-calls=false \
 	-Wno-expansion-to-defined \
@@ -87,6 +92,12 @@ GPU_COMPILER_CUFLAGS = \
 	-Wno-unused-result \
 	-I.
 
+# Workaround buggy P2P copy on some windows multi-GPU setups
+# This workaround breaks linux systems with small system RAM, so only enable on windows
+ifeq ($(OS),windows)
+	GPU_COMPILER_CUFLAGS += -DGGML_CUDA_NO_PEER_COPY=1
+endif
+
 include make/gpu.make
 
 # Adjust the rules from gpu.make to handle the ROCm dependencies properly
@@ -47,8 +47,8 @@ create-patches: $(LLAMACPP_REPO)
 	git -C $(LLAMACPP_REPO) format-patch --no-signature --no-numbered --zero-commit -o $(LLAMACPP_PATCH_DIR) $(LLAMACPP_BASE_COMMIT)
 
 # Vendoring template logic
-EXCLUDED_FILES=sgemm.cpp sgemm.h stb_image.h json.hpp llama_darwin.c base64.hpp
-OLLAMA_NATIVE_FILES=mllama.cpp mllama.h llama_darwin.c
+EXCLUDED_FILES=sgemm.cpp sgemm.h sampling_ext.cpp sampling_ext.h stb_image.h json.hpp llama_darwin.c base64.hpp
+OLLAMA_NATIVE_FILES=mllama.cpp mllama.h llama_darwin.c sampling_ext.cpp sampling_ext.h
 define vendor_file
 $(strip $(addprefix $(2),$(notdir $1))) : $(addprefix $(LLAMACPP_REPO),$(1))
 ifneq ($$(filter-out $(EXCLUDED_FILES),$(notdir $1)),)
@@ -149,7 +149,20 @@ LAVA_FILES= \
 	examples/llava/clip.h \
 	examples/llava/llava.cpp \
 	examples/llava/llava.h \
+	common/log.h \
+	common/log.cpp \
 	common/stb_image.h
+# These files are mostly used by the llava code
+# and shouldn't be necessary once we use clip.cpp directly
+LAVA_FILES+= \
+	common/common.cpp \
+	common/common.h \
+	common/sampling.cpp \
+	common/sampling.h \
+	common/json.hpp \
+	common/json-schema-to-grammar.cpp \
+	common/json-schema-to-grammar.h \
+	common/base64.hpp
 $(foreach name,$(LAVA_FILES),$(eval $(call vendor_file,$(name),$(DST_DIR))))
 
 $(DST_DIR)build-info.cpp:
@@ -20,7 +20,7 @@ GPU_COMPILER_CFLAGS_LINUX = $(CFLAGS) -Xcompiler -fPIC -D_GNU_SOURCE
 GPU_COMPILER_CXXFLAGS_WIN = $(CXXFLAGS) -D_WIN32_WINNT=0x602
 GPU_COMPILER_CXXFLAGS_LINUX = $(CXXFLAGS) -Xcompiler -fPIC -D_GNU_SOURCE
 GPU_LIBS = $(sort $(wildcard $(addsuffix *.$(SHARED_EXT)*,$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT)))))
-GPU_DIST_DEPS_LIBS= $(sort $(addprefix $(DIST_LIB_DIR)/,$(notdir $(GPU_LIBS))))
+GPU_DIST_DEPS_LIBS= $(sort $(addprefix $(DIST_GPU_RUNNER_DEPS_DIR)/,$(notdir $(GPU_LIBS))))
 
 ifeq ($(OS),linux)
 	CUDA_PATH?=/usr/local/cuda
@@ -85,7 +85,7 @@ $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/ollama_llama_server$(EXE_EXT): $(RUNNERS
 	GOARCH=$(ARCH) CGO_LDFLAGS="$(TARGET_CGO_LDFLAGS)" go build -buildmode=pie $(GPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(GPU_RUNNER_CPU_FLAGS) $(GPU_RUNNER_GO_TAGS)) -o $@ ./runner
 $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(GPU_RUNNER_OBJS) $(DIST_GPU_RUNNER_LIB_DEPS) $(COMMON_HDRS) $(GPU_RUNNER_HDRS)
 	@-mkdir -p $(dir $@)
-	$(CCACHE) $(GPU_COMPILER) --shared $(GPU_RUNNER_DRIVER_LIB_LINK) -L${DIST_GPU_RUNNER_DEPS_DIR} $(foreach lib, $(GPU_RUNNER_LIBS_SHORT), -l$(lib)) $(GPU_RUNNER_OBJS) -o $@
+	$(CCACHE) $(GPU_COMPILER) --shared -L$(GPU_LIB_DIR) $(GPU_RUNNER_DRIVER_LIB_LINK) -L${DIST_GPU_RUNNER_DEPS_DIR} $(foreach lib, $(GPU_RUNNER_LIBS_SHORT), -l$(lib)) $(GPU_RUNNER_OBJS) -o $@
 
 # Distribution targets
 $(RUNNERS_DIST_DIR)/%: $(RUNNERS_BUILD_DIR)/%
@@ -2,6 +2,7 @@ package main
 
 import (
 	"errors"
+	"fmt"
 	"log/slog"
 	"reflect"
 	"time"
@@ -22,7 +23,11 @@ type InputCache struct {
 	lc *llama.Context
 }
 
-func NewInputCache(lc *llama.Context, kvSize int, numSlots int, multiUserCache bool) *InputCache {
+func NewInputCache(lc *llama.Context, kvSize int, numSlots int, multiUserCache bool) (*InputCache, error) {
+	if kvSize/numSlots < 1 {
+		return nil, fmt.Errorf("must have at least one kv cache entry per parallel sequence (kv: %v parallel: %v)", kvSize, numSlots)
+	}
+
 	slots := make([]InputCacheSlot, numSlots)
 
 	for i := range slots {
@@ -37,7 +42,7 @@ func NewInputCache(lc *llama.Context, kvSize int, numSlots int, multiUserCache b
 		slots:          slots,
 		multiUserCache: multiUserCache,
 		lc:             lc,
-	}
+	}, nil
 }
 
 // Locking: Operations on InputCacheSlot (including finding one
@@ -58,7 +63,7 @@ type InputCacheSlot struct {
 	lastUsed time.Time
 }
 
-func (c *InputCache) LoadCacheSlot(prompt []input, cachePrompt bool) (*InputCacheSlot, []input, int, error) {
+func (c *InputCache) LoadCacheSlot(prompt []input, cachePrompt bool) (*InputCacheSlot, []input, error) {
 	var slot *InputCacheSlot
 	var numPast int
 	var err error
@@ -75,7 +80,7 @@ func (c *InputCache) LoadCacheSlot(prompt []input, cachePrompt bool) (*InputCach
 		slot, numPast, err = c.findBestCacheSlot(prompt)
 	}
 	if err != nil {
-		return nil, nil, 0, err
+		return nil, nil, err
 	}
 
 	if !cachePrompt {
@@ -102,7 +107,7 @@ func (c *InputCache) LoadCacheSlot(prompt []input, cachePrompt bool) (*InputCach
 	prompt = prompt[numPast:]
 	slot.Inputs = slot.Inputs[:numPast]
 
-	return slot, prompt, numPast, nil
+	return slot, prompt, nil
 }
 
 func (c *InputCache) findLongestCacheSlot(prompt []input) (*InputCacheSlot, int, error) {
@@ -194,14 +199,30 @@ func countCommonPrefix(a []input, b []input) int {
 	return count
 }
 
-func (c *InputCache) ShiftCacheSlot(slot *InputCacheSlot, numKeep int, numDiscard int, numPast int) {
-	// TODO (jessegross): KV cache removal can fail for certain types of models
-	// server.cpp doesn't handle this, though we can be more graceful
-	c.lc.KvCacheSeqRm(slot.Id, numKeep, numKeep+numDiscard)
-	c.lc.KvCacheSeqAdd(slot.Id, numKeep+numDiscard, numPast, -numDiscard)
+// Frees up space in the KV cache by deleting the oldest half of history and shifting
+// the newest half into that space (saving numKeep inputs at the beginning).
+//
+// Assumes that at least 1 entry can be freed up by shifting (i.e. numKeep < numCtx)
+func (c *InputCache) ShiftCacheSlot(slot *InputCacheSlot, numKeep int) {
+	targetFree := (c.numCtx - numKeep) / 2
+	targetFree = max(targetFree, 1)
 
-	for i := numKeep + numDiscard; i < len(slot.Inputs); i++ {
-		slot.Inputs[i-numDiscard] = slot.Inputs[i]
+	currentFree := c.numCtx - len(slot.Inputs)
+	discard := targetFree - currentFree
+
+	if discard <= 0 {
+		return
 	}
-	slot.Inputs = slot.Inputs[:len(slot.Inputs)-numDiscard]
+
+	slog.Debug("context limit hit - shifting", "limit", c.numCtx, "input", len(slot.Inputs),
+		"keep", numKeep, "discard", discard)
+
+	// TODO (jessegross): KV cache removal can fail for certain types of models
+	c.lc.KvCacheSeqRm(slot.Id, numKeep, numKeep+discard)
+	c.lc.KvCacheSeqAdd(slot.Id, numKeep+discard, len(slot.Inputs), -discard)
+
+	for i := numKeep + discard; i < len(slot.Inputs); i++ {
+		slot.Inputs[i-discard] = slot.Inputs[i]
+	}
+	slot.Inputs = slot.Inputs[:len(slot.Inputs)-discard]
 }
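To make the arithmetic in the reworked ShiftCacheSlot easier to follow, here is a minimal, self-contained Go sketch of the same discard calculation. It is an illustration only, not the runner's actual code, and the numCtx, numKeep, and used values in main are hypothetical.

package main

import "fmt"

// discardCount mirrors the shift math above: aim to free half of the
// non-kept context (at least 1 entry), minus whatever is already free.
func discardCount(numCtx, numKeep, used int) int {
	targetFree := (numCtx - numKeep) / 2
	if targetFree < 1 {
		targetFree = 1
	}
	currentFree := numCtx - used
	discard := targetFree - currentFree
	if discard < 0 {
		return 0
	}
	return discard
}

func main() {
	// With a 2048-entry cache, 5 kept inputs, and a full slot,
	// roughly half of the remaining history gets discarded.
	fmt.Println(discardCount(2048, 5, 2048)) // 1021
}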
@@ -68,6 +68,10 @@ func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte, aspect
 		return nil, nil
 	}
 
+	if len(data) <= 0 {
+		return nil, errors.New("received zero length image")
+	}
+
 	hash := c.hashImage(data)
 
 	c.mu.Lock()
@@ -34,9 +34,6 @@ type input struct {
 }
 
 type Sequence struct {
-	// number of inputs evaluated
-	numPast int
-
 	// batch index
 	iBatch int
 
@@ -112,21 +109,15 @@ func (s *Server) NewSequence(prompt string, images []ImageData, params NewSequen
 		params.numKeep = len(inputs)
 	}
 
-	if !params.embedding {
-		// Subtracting 4 ensures that at least 1 input can be discarded during shift
-		params.numKeep = min(params.numKeep, s.cache.numCtx-4)
-		params.numKeep += s.bosToken
-	} else {
-		// Embeddings are 1 shot - just truncate to the context window, without ever shifting
-		params.numKeep = min(params.numKeep, s.cache.numCtx)
+	if s.model.AddBOSToken() {
+		params.numKeep += 1
 	}
 
-	// truncate to fit in context window
+	// Ensure that at least 1 input can be discarded during shift
+	params.numKeep = min(params.numKeep, s.cache.numCtx-1)
+
 	if len(inputs) > s.cache.numCtx {
-		slog.Warn("truncating input prompt", "limit", s.cache.numCtx, "prompt", len(inputs), "numKeep", params.numKeep)
-		newInputs := inputs[:params.numKeep]
-		newInputs = append(newInputs, inputs[len(inputs)-s.cache.numCtx+params.numKeep:]...)
-		inputs = newInputs
+		slog.Warn("input exceeds context length", "prompt", len(inputs), "limit", s.cache.numCtx)
 	}
 
 	var sc *llama.SamplingContext
@@ -231,9 +222,6 @@ type Server struct {
 	// KV cache
 	cache *InputCache
 
-	// does this model require a beginning of sequence token?
-	bosToken int
-
 	// next sequence for prompt processing to avoid starvation
 	nextSeq int
 
@@ -258,18 +246,6 @@ func (s *Server) allNil() bool {
 	return true
 }
 
-func (s *Server) shiftContext(seq *Sequence) {
-	numLeft := seq.numPast - seq.numKeep
-	numDiscard := numLeft / 2
-
-	slog.Debug("context limit hit - shifting", "limit", s.cache.numCtx, "numPast", seq.numPast,
-		"numKeep", seq.numKeep, "numLeft", numLeft, "numDiscard", numDiscard)
-
-	s.cache.ShiftCacheSlot(seq.cache, seq.numKeep, numDiscard, seq.numPast)
-
-	seq.numPast -= numDiscard
-}
-
 func flushPending(seq *Sequence) bool {
 	joined := strings.Join(seq.pendingResponses, "")
 	seq.pendingResponses = []string{}
@@ -369,17 +345,24 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 		}
 
 		// if past the num predict limit
-		if seq.numPredict > 0 && seq.numPredicted > seq.numPredict {
+		if seq.numPredict > 0 && seq.numPredicted >= seq.numPredict {
 			s.removeSequence(seqIdx, "limit")
 			continue
 		}
 
-		if seq.numPast+len(seq.inputs) > s.cache.numCtx {
-			s.shiftContext(seq)
-		}
+		var numInputsProcessed int
+		shifted := false
 
-		var numInputsProcessed int
-		for i, input := range seq.inputs {
+		for i, input := range seq.inputs {
+			if len(seq.cache.Inputs)+1 > s.cache.numCtx {
+				if !shifted {
+					s.cache.ShiftCacheSlot(seq.cache, seq.numKeep)
+					shifted = true
+				} else {
+					break
+				}
+			}
+
 			embedding := input.embed != nil
 
 			// If we don't currently have a batch, use one of the correct type and
@@ -403,13 +386,12 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 			}
 
 			crossAttention = seq.crossAttention
-			batch.Add(input.token, input.embed, seq.numPast, numInputsProcessed+1 == len(seq.inputs), seq.cache.Id)
-			seq.numPast++
+			batch.Add(input.token, input.embed, len(seq.cache.Inputs), i+1 == len(seq.inputs), seq.cache.Id)
+			seq.cache.Inputs = append(seq.cache.Inputs, input)
 			numInputsProcessed++
 		}
 
 		if numInputsProcessed > 0 {
-			seq.cache.Inputs = append(seq.cache.Inputs, seq.inputs[:numInputsProcessed]...)
 			seq.inputs = seq.inputs[numInputsProcessed:]
 			seq.iBatch = batch.NumTokens() - 1
 		}
@@ -455,12 +437,7 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 		}
 
 		// sample a token
-		token, err := seq.samplingCtx.Sample(s.lc, seq.iBatch)
-		if err != nil {
-			slog.Error("failed to sample token", "error", err)
-			s.removeSequence(i, "error")
-			continue
-		}
+		token := seq.samplingCtx.Sample(s.lc, seq.iBatch)
 		seq.samplingCtx.Accept(token, true)
 		piece := s.model.TokenToPiece(token)
 
@@ -637,7 +614,7 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
 	s.mu.Lock()
 	for i, sq := range s.seqs {
 		if sq == nil {
-			seq.cache, seq.inputs, seq.numPast, err = s.cache.LoadCacheSlot(seq.inputs, req.CachePrompt)
+			seq.cache, seq.inputs, err = s.cache.LoadCacheSlot(seq.inputs, req.CachePrompt)
 			if err != nil {
 				s.mu.Unlock()
 				http.Error(w, fmt.Sprintf("Failed to load cache: %v", err), http.StatusInternalServerError)
@@ -720,7 +697,7 @@ func (s *Server) embeddings(w http.ResponseWriter, r *http.Request) {
 	s.mu.Lock()
 	for i, sq := range s.seqs {
 		if sq == nil {
-			seq.cache, seq.inputs, seq.numPast, err = s.cache.LoadCacheSlot(seq.inputs, req.CachePrompt)
+			seq.cache, seq.inputs, err = s.cache.LoadCacheSlot(seq.inputs, req.CachePrompt)
 			if err != nil {
 				s.mu.Unlock()
 				http.Error(w, fmt.Sprintf("Failed to load cache: %v", err), http.StatusInternalServerError)
@@ -807,10 +784,6 @@ func (s *Server) loadModel(
 		}
 	}
 
-	if s.model.AddBOSToken() {
-		s.bosToken = 1
-	}
-
 	if ppath != "" {
 		var err error
 		s.image, err = NewImageContext(s.lc, ppath)
@@ -819,7 +792,10 @@ func (s *Server) loadModel(
 		}
 	}
 
-	s.cache = NewInputCache(s.lc, kvSize, s.parallel, multiUserCache)
+	s.cache, err = NewInputCache(s.lc, kvSize, s.parallel, multiUserCache)
+	if err != nil {
+		panic(err)
+	}
 
 	s.status = ServerStatusReady
 	s.ready.Done()
@@ -842,14 +818,8 @@ func main() {
 	mlock := flag.Bool("mlock", false, "force system to keep model in RAM rather than swapping or compressing")
 	tensorSplit := flag.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
 	multiUserCache := flag.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
-	// Expose requirements as a JSON output to stdout
 	requirements := flag.Bool("requirements", false, "print json requirement information")
-
-	// These are either ignored by llama.cpp or have no significance to us
-	_ = flag.Bool("embedding", false, "enable embedding vector output (default: disabled)")
-	_ = flag.Bool("log-disable", false, "disables logging to a file")
-	_ = flag.Bool("memory-f32", false, "use f32 instead of f16 for memory key+value (default: disabled) not recommended: doubles context memory required and no measurable increase in quality")
 
 	flag.Parse()
 	if *requirements {
 		printRequirements(os.Stdout)
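The reworked processBatch loop above drives context shifting off the length of the cache slot's Inputs rather than a separate numPast counter, and it shifts at most once per pass, leaving whatever does not fit for the next batch. A rough, standalone Go sketch of that control flow follows; the slot type, the inline shift, and the values in main are stand-ins for illustration, not the runner's real types or KV-cache calls.

package main

import "fmt"

type slot struct{ inputs []int }

// fillBatch adds pending inputs to a batch until the context is full,
// shifting the cache at most once per pass, as in the loop above.
func fillBatch(s *slot, pending []int, numCtx, numKeep int) (batched []int, rest []int) {
	shifted := false
	for i, tok := range pending {
		if len(s.inputs)+1 > numCtx {
			if !shifted {
				// stand-in for ShiftCacheSlot: drop half of the non-kept history
				discard := (numCtx - numKeep) / 2
				s.inputs = append(s.inputs[:numKeep], s.inputs[numKeep+discard:]...)
				shifted = true
			} else {
				// already shifted once; leave the remainder for the next pass
				return batched, pending[i:]
			}
		}
		s.inputs = append(s.inputs, tok)
		batched = append(batched, tok)
	}
	return batched, nil
}

func main() {
	s := &slot{inputs: []int{1, 2, 3, 4, 5, 6, 7}}
	got, rest := fillBatch(s, []int{8, 9, 10, 11, 12}, 8, 1)
	fmt.Println(len(got), len(rest), len(s.inputs))
}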
llama/sampling.cpp (vendored, new file, 484 lines)
@@ -0,0 +1,484 @@
/**
 * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
 *
 * MIT License
 *
 * Copyright (c) 2023-2024 The ggml authors
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "sampling.h"

#include "common.h"

#include <cmath>
#include <unordered_map>

// the ring buffer works similarly to std::deque, but with a fixed capacity
// TODO: deduplicate with llama-impl.h
template<typename T>
struct ring_buffer {
    ring_buffer(size_t cap) : capacity(cap), data(cap) {}

    T & front() {
        if (sz == 0) {
            throw std::runtime_error("ring buffer is empty");
        }
        return data[first];
    }

    const T & front() const {
        if (sz == 0) {
            throw std::runtime_error("ring buffer is empty");
        }
        return data[first];
    }

    T & back() {
        if (sz == 0) {
            throw std::runtime_error("ring buffer is empty");
        }
        return data[pos];
    }

    const T & back() const {
        if (sz == 0) {
            throw std::runtime_error("ring buffer is empty");
        }
        return data[pos];
    }

    void push_back(const T & value) {
        if (sz == capacity) {
            // advance the start when buffer is full
            first = (first + 1) % capacity;
        } else {
            sz++;
        }
        data[pos] = value;
        pos = (pos + 1) % capacity;
    }

    T pop_front() {
        if (sz == 0) {
            throw std::runtime_error("ring buffer is empty");
        }
        T value = data[first];
        first = (first + 1) % capacity;
        sz--;
        return value;
    }

    const T & rat(size_t i) const {
        if (i >= sz) {
            throw std::runtime_error("ring buffer: index out of bounds");
        }
        return data[(first + sz - i - 1) % capacity];
    }

    std::vector<T> to_vector() const {
        std::vector<T> result;
        result.reserve(sz);
        for (size_t i = 0; i < sz; i++) {
            result.push_back(data[(first + i) % capacity]);
        }
        return result;
    }

    void clear() {
        // here only reset the status of the buffer
        sz = 0;
        first = 0;
        pos = 0;
    }

    bool empty() const {
        return sz == 0;
    }

    size_t size() const {
        return sz;
    }

    size_t capacity = 0;
    size_t sz = 0;
    size_t first = 0;
    size_t pos = 0;
    std::vector<T> data;
};

struct gpt_sampler {
    gpt_sampler_params params;

    struct llama_sampler * grmr;
    struct llama_sampler * chain;

    ring_buffer<llama_token> prev;

    std::vector<llama_token_data> cur;

    llama_token_data_array cur_p;

    void set_logits(struct llama_context * ctx, int idx) {
        const auto * logits = llama_get_logits_ith(ctx, idx);

        const int n_vocab = llama_n_vocab(llama_get_model(ctx));

        cur.resize(n_vocab);

        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
            cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
        }

        cur_p = { cur.data(), cur.size(), -1, false };
    }
};

std::string gpt_sampler_params::print() const {
    char result[1024];

    snprintf(result, sizeof(result),
            "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
            "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
            "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
            penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
            top_k, tfs_z, top_p, min_p, typ_p, temp,
            mirostat, mirostat_eta, mirostat_tau);

    return std::string(result);
}

struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) {
    llama_sampler_chain_params lparams = llama_sampler_chain_default_params();

    lparams.no_perf = params.no_perf;

    auto * result = new gpt_sampler {
        /* .params = */ params,
        /* .grmr   = */ llama_sampler_init_grammar(model, params.grammar.c_str(), "root"),
        /* .chain  = */ llama_sampler_chain_init(lparams),
        /* .prev   = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
        /* .cur    = */ {},
        /* .cur_p  = */ {},
    };

    llama_sampler_chain_add(result->chain,
            llama_sampler_init_logit_bias(
                llama_n_vocab(model),
                params.logit_bias.size(),
                params.logit_bias.data()));

    llama_sampler_chain_add(result->chain,
            llama_sampler_init_penalties(
                llama_n_vocab  (model),
                llama_token_eos(model),
                llama_token_nl (model),
                params.penalty_last_n,
                params.penalty_repeat,
                params.penalty_freq,
                params.penalty_present,
                params.penalize_nl,
                params.ignore_eos));

    if (params.temp > 0.0f) {
        if (params.mirostat == 0) {
            for (const auto & cnstr : params.samplers) {
                switch (cnstr) {
                    case GPT_SAMPLER_TYPE_TOP_K:
                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_k    (params.top_k));
                        break;
                    case GPT_SAMPLER_TYPE_TOP_P:
                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_p    (params.top_p, params.min_keep));
                        break;
                    case GPT_SAMPLER_TYPE_MIN_P:
                        llama_sampler_chain_add(result->chain, llama_sampler_init_min_p    (params.min_p, params.min_keep));
                        break;
                    case GPT_SAMPLER_TYPE_TFS_Z:
                        llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep));
                        break;
                    case GPT_SAMPLER_TYPE_TYPICAL_P:
                        llama_sampler_chain_add(result->chain, llama_sampler_init_typical  (params.typ_p, params.min_keep));
                        break;
                    case GPT_SAMPLER_TYPE_TEMPERATURE:
                        llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
                        break;
                    default:
                        GGML_ASSERT(false && "unknown sampler type");
                }
            }
            llama_sampler_chain_add(result->chain, llama_sampler_init_softmax());
            llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
        } else if (params.mirostat == 1) {
            llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
            llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_n_vocab(model), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
        } else if (params.mirostat == 2) {
            llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
            llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
        } else {
            GGML_ASSERT(false && "unknown mirostat version");
        }
    } else {
        if (params.n_probs > 0) {
            // some use cases require to sample greedily, but still obtain the probabilities of the top tokens
            // ref: https://github.com/ggerganov/llama.cpp/pull/9605
            //
            // the following will not produce exactly the same probs as applyging softmax to the full vocabulary, but
            // it is much faster, since we avoid sorting all tokens and should give a good approximation
            llama_sampler_chain_add(result->chain, llama_sampler_init_top_k(params.n_probs));
            llama_sampler_chain_add(result->chain, llama_sampler_init_softmax());
        }
        llama_sampler_chain_add(result->chain, llama_sampler_init_greedy());
    }

    return result;
}

void gpt_sampler_free(struct gpt_sampler * gsmpl) {
    if (gsmpl) {
        llama_sampler_free(gsmpl->grmr);

        llama_sampler_free(gsmpl->chain);

        delete gsmpl;
    }
}

void gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool accept_grammar) {
    if (accept_grammar) {
        llama_sampler_accept(gsmpl->grmr, token);
    }

    llama_sampler_accept(gsmpl->chain, token);

    gsmpl->prev.push_back(token);
}

void gpt_sampler_reset(struct gpt_sampler * gsmpl) {
    llama_sampler_reset(gsmpl->grmr);

    llama_sampler_reset(gsmpl->chain);
}

struct gpt_sampler * gpt_sampler_clone(gpt_sampler * gsmpl) {
    return new gpt_sampler {
        /* .params = */ gsmpl->params,
        /* .grmr   = */ llama_sampler_clone(gsmpl->grmr),
        /* .chain  = */ llama_sampler_clone(gsmpl->chain),
        /* .prev   = */ gsmpl->prev,
        /* .cur    = */ gsmpl->cur,
        /* .cur_p  = */ gsmpl->cur_p,
    };
}

void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * gsmpl) {
    // TODO: measure grammar performance

    if (gsmpl) {
        llama_perf_sampler_print(gsmpl->chain);
    }
    if (ctx) {
        llama_perf_context_print(ctx);
    }
}

llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
    gsmpl->set_logits(ctx, idx);

    auto & grmr  = gsmpl->grmr;
    auto & chain = gsmpl->chain;
    auto & cur_p = gsmpl->cur_p; // initialized by set_logits

    if (grammar_first) {
        llama_sampler_apply(grmr, &cur_p);
    }

    llama_sampler_apply(chain, &cur_p);

    GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");

    const llama_token id = cur_p.data[cur_p.selected].id;

    if (grammar_first) {
        return id;
    }

    // check if it the sampled token fits the grammar
    {
        llama_token_data       single_token_data       = { id, 1.0f, 0.0f };
        llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };

        llama_sampler_apply(grmr, &single_token_data_array);

        const bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
        if (is_valid) {
            return id;
        }
    }

    // resampling:
    // if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain
    gsmpl->set_logits(ctx, idx);

    llama_sampler_apply(grmr,  &cur_p);
    llama_sampler_apply(chain, &cur_p);

    GGML_ASSERT(cur_p.selected != -1 && "no selected token during re-sampling - check your sampling configuration");

    return cur_p.data[cur_p.selected].id;
}

uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl) {
    return llama_sampler_get_seed(gsmpl->chain);
}

// helpers

llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl) {
    return &gsmpl->cur_p;
}

llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl) {
    return gsmpl->prev.rat(0);
}

std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) {
    std::string result = "logits ";

    for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
        const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
        result += std::string("-> ") + llama_sampler_name(smpl) + " ";
    }

    return result;
}

std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx_main, int n) {
    n = std::min(n, (int) gsmpl->prev.size());

    if (n <= 0) {
        return "";
    }

    std::string result;
    result.reserve(8*n); // 8 is the average length of a token [citation needed], TODO: compute this from the vocab

    for (int i = n - 1; i >= 0; i--) {
        const llama_token id = gsmpl->prev.rat(i);

        GGML_ASSERT(id != LLAMA_TOKEN_NULL && "null token in the sampling history - should not happen");

        result += llama_token_to_piece(ctx_main, id);
    }

    return result;
}

char gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr) {
    switch (cnstr) {
        case GPT_SAMPLER_TYPE_TOP_K:       return 'k';
        case GPT_SAMPLER_TYPE_TFS_Z:       return 'f';
        case GPT_SAMPLER_TYPE_TYPICAL_P:   return 'y';
        case GPT_SAMPLER_TYPE_TOP_P:       return 'p';
        case GPT_SAMPLER_TYPE_MIN_P:       return 'm';
        case GPT_SAMPLER_TYPE_TEMPERATURE: return 't';
        default : return '?';
    }
}

std::string gpt_sampler_type_to_str(enum gpt_sampler_type cnstr) {
    switch (cnstr) {
        case GPT_SAMPLER_TYPE_TOP_K:       return "top_k";
        case GPT_SAMPLER_TYPE_TFS_Z:       return "tfs_z";
        case GPT_SAMPLER_TYPE_TYPICAL_P:   return "typ_p";
        case GPT_SAMPLER_TYPE_TOP_P:       return "top_p";
        case GPT_SAMPLER_TYPE_MIN_P:       return "min_p";
        case GPT_SAMPLER_TYPE_TEMPERATURE: return "temperature";
        default : return "";
    }
}

std::vector<gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
    std::unordered_map<std::string, gpt_sampler_type> sampler_canonical_name_map {
        { "top_k",       GPT_SAMPLER_TYPE_TOP_K },
        { "top_p",       GPT_SAMPLER_TYPE_TOP_P },
        { "typ_p",       GPT_SAMPLER_TYPE_TYPICAL_P },
        { "min_p",       GPT_SAMPLER_TYPE_MIN_P },
        { "tfs_z",       GPT_SAMPLER_TYPE_TFS_Z },
        { "temperature", GPT_SAMPLER_TYPE_TEMPERATURE },
    };

    // since samplers names are written multiple ways
    // make it ready for both system names and input names
    std::unordered_map<std::string, gpt_sampler_type> sampler_alt_name_map {
        { "top-k",       GPT_SAMPLER_TYPE_TOP_K },
        { "top-p",       GPT_SAMPLER_TYPE_TOP_P },
        { "nucleus",     GPT_SAMPLER_TYPE_TOP_P },
        { "typical-p",   GPT_SAMPLER_TYPE_TYPICAL_P },
        { "typical",     GPT_SAMPLER_TYPE_TYPICAL_P },
        { "typ-p",       GPT_SAMPLER_TYPE_TYPICAL_P },
        { "typ",         GPT_SAMPLER_TYPE_TYPICAL_P },
        { "min-p",       GPT_SAMPLER_TYPE_MIN_P },
        { "tfs-z",       GPT_SAMPLER_TYPE_TFS_Z },
        { "tfs",         GPT_SAMPLER_TYPE_TFS_Z },
        { "temp",        GPT_SAMPLER_TYPE_TEMPERATURE },
    };

    std::vector<gpt_sampler_type> samplers;
    samplers.reserve(names.size());

    for (const auto & name : names) {
        auto sampler = sampler_canonical_name_map.find(name);
        if (sampler != sampler_canonical_name_map.end()) {
            samplers.push_back(sampler->second);
        } else {
            if (allow_alt_names) {
                sampler = sampler_alt_name_map.find(name);
                if (sampler != sampler_alt_name_map.end()) {
                    samplers.push_back(sampler->second);
                }
            }
        }
    }

    return samplers;
}

std::vector<gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & chars) {
    std::unordered_map<char, gpt_sampler_type> sampler_name_map = {
        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_K),       GPT_SAMPLER_TYPE_TOP_K },
        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TFS_Z),       GPT_SAMPLER_TYPE_TFS_Z },
        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TYPICAL_P),   GPT_SAMPLER_TYPE_TYPICAL_P },
        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_P),       GPT_SAMPLER_TYPE_TOP_P },
        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_MIN_P),       GPT_SAMPLER_TYPE_MIN_P },
        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TEMPERATURE), GPT_SAMPLER_TYPE_TEMPERATURE }
    };

    std::vector<gpt_sampler_type> samplers;
    samplers.reserve(chars.size());

    for (const auto & c : chars) {
        const auto sampler = sampler_name_map.find(c);
        if (sampler != sampler_name_map.end()) {
            samplers.push_back(sampler->second);
        }
    }

    return samplers;
}

llama/sampling.h (vendored, new file, 109 lines)
@@ -0,0 +1,109 @@
/**
 * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
 *
 * MIT License
 *
 * Copyright (c) 2023-2024 The ggml authors
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include "llama.h"

#include "common.h"

#include <string>
#include <vector>

// gpt_sampler extends llama_sampler with additional functionality:
//
//  - grammar support
//  - custom sampler logic based on the parameters
//  - history of the last accepted tokens
//  - performance metrics
//
// This goal is to have a common implementation of the sampling logic shared across the examples.
// For example, depending on the temperature, the sampling chain can be very simple (greedy) or more
// complex (top-k, top-p, etc).
//
// Another example is related to the grammar. In general, the grammar constraints applied on the full
// vocabulary can be very taxing. To improve performance, the grammar can be applied only to the sampled
// token in order to verify if it fits the grammar. And only if the token doesn't fit the grammar, the
// grammar constraints are applied to the full vocabulary and the token is resampled.
//
// The gpt_sampler also maintains a container with the last accepted tokens. In the future, this can
// be moved into the core llama library.
//
// For convenience, the gpt_sampler also maintains a container with the current candidate tokens.
// This can be used to access the probabilities of the rest of the non-sampled tokens.
//
// TODO: measure grammar performance
//

struct gpt_sampler;

// llama_sampler API overloads

struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params);

void gpt_sampler_free(struct gpt_sampler * gsmpl);

// if accept_grammar is true, the token is accepted both by the sampling chain and the grammar
void                 gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool accept_grammar);
void                 gpt_sampler_reset (struct gpt_sampler * gsmpl);
struct gpt_sampler * gpt_sampler_clone (struct gpt_sampler * gsmpl);

// arguments can be nullptr to skip printing
void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * gsmpl);

// extended sampling implementation:
//
// - set logits
// - apply the configured sampler chain
// - check if the token fits the grammar (if any)
// - if not: resample by first applying the grammar constraints and then sampling again (slower path)
//
// if grammar_first is true, the grammar is applied before the samplers (slower)
// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
//
llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);

uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl);

// helpers

// access the internal list of current candidate tokens
llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl);

// get the last accepted token
llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl);

// print the sampler chain into a string
std::string gpt_sampler_print(const struct gpt_sampler * gsmpl);

// get a string representation of the last accepted tokens
std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx, int n);

char        gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr);
std::string gpt_sampler_type_to_str(enum gpt_sampler_type cnstr);

std::vector<enum gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
std::vector<enum gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & chars);

llama/sampling_ext.cpp (vendored, new file, 56 lines)
@@ -0,0 +1,56 @@
// TODO: this is a temporary wrapper to allow calling C++ code from CGo
#include "sampling.h"
#include "sampling_ext.h"

struct gpt_sampler *gpt_sampler_cinit(
    const struct llama_model *model, struct gpt_sampler_cparams *params)
{
    try {
        gpt_sampler_params sparams;
        sparams.top_k = params->top_k;
        sparams.top_p = params->top_p;
        sparams.min_p = params->min_p;
        sparams.tfs_z = params->tfs_z;
        sparams.typ_p = params->typical_p;
        sparams.temp = params->temp;
        sparams.penalty_last_n = params->penalty_last_n;
        sparams.penalty_repeat = params->penalty_repeat;
        sparams.penalty_freq = params->penalty_freq;
        sparams.penalty_present = params->penalty_present;
        sparams.mirostat = params->mirostat;
        sparams.mirostat_tau = params->mirostat_tau;
        sparams.mirostat_eta = params->mirostat_eta;
        sparams.penalize_nl = params->penalize_nl;
        sparams.seed = params->seed;
        sparams.grammar = params->grammar;
        return gpt_sampler_init(model, sparams);
    } catch (const std::exception & err) {
        return nullptr;
    }
}

void gpt_sampler_cfree(struct gpt_sampler *sampler)
{
    gpt_sampler_free(sampler);
}

void gpt_sampler_creset(struct gpt_sampler *sampler)
{
    gpt_sampler_reset(sampler);
}

llama_token gpt_sampler_csample(
    struct gpt_sampler *sampler,
    struct llama_context *ctx_main,
    int idx)
{
    return gpt_sampler_sample(sampler, ctx_main, idx);
}

void gpt_sampler_caccept(
    struct gpt_sampler *sampler,
    llama_token id,
    bool apply_grammar)
{
    gpt_sampler_accept(sampler, id, apply_grammar);
}

llama/sampling_ext.h (vendored, new file, 54 lines)
@@ -0,0 +1,54 @@
// TODO: this is a temporary wrapper to allow calling C++ code from CGo
#ifndef GPT_SAMPLER_EXT_H
#define GPT_SAMPLER_EXT_H

#ifdef __cplusplus
extern "C"
{
#endif

// Forward declaration to avoid include of "sampling.h" which has c++
// includes
struct gpt_sampler;

struct gpt_sampler_cparams
{
    int32_t top_k;
    float top_p;
    float min_p;
    float tfs_z;
    float typical_p;
    float temp;
    int32_t penalty_last_n;
    float penalty_repeat;
    float penalty_freq;
    float penalty_present;
    int32_t mirostat;
    float mirostat_tau;
    float mirostat_eta;
    bool penalize_nl;
    uint32_t seed;
    char *grammar;
};

struct gpt_sampler *gpt_sampler_cinit(
    const struct llama_model *model,
    struct gpt_sampler_cparams *params);
void gpt_sampler_cfree(struct gpt_sampler *sampler);
void gpt_sampler_creset(struct gpt_sampler *sampler);

llama_token gpt_sampler_csample(
    struct gpt_sampler *sampler,
    struct llama_context *ctx_main,
    int idx);

void gpt_sampler_caccept(
    struct gpt_sampler *sampler,
    llama_token id,
    bool apply_grammar);

#ifdef __cplusplus
}
#endif

#endif // GPT_SAMPLER_EXT_H
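The vendored sampling code above describes a lazy grammar strategy: sample with the regular chain first, check only the sampled token against the grammar, and fall back to a full grammar-constrained resample only when that single check fails. Here is a hedged, generic Go sketch of that control flow; the token type and the callback signatures are illustrative stand-ins, not the CGo bindings or the llama.cpp API.

package main

import "fmt"

type token int

// sampleWithGrammar mirrors the lazy grammar check: the grammar is only
// applied to the full candidate set when the first sampled token is invalid.
func sampleWithGrammar(
	sample func(candidates []token) token, // sampler chain
	valid func(t token) bool, // cheap single-token grammar check
	constrain func(candidates []token) []token, // full grammar filter (expensive)
	candidates []token,
) token {
	id := sample(candidates)
	if valid(id) {
		return id // fast path: no full-vocabulary grammar pass
	}
	// slow path: filter all candidates through the grammar, then resample
	return sample(constrain(candidates))
}

func main() {
	cands := []token{1, 2, 3, 4}
	pick := sampleWithGrammar(
		func(c []token) token { return c[0] },
		func(t token) bool { return t%2 == 0 },
		func(c []token) []token {
			var out []token
			for _, t := range c {
				if t%2 == 0 {
					out = append(out, t)
				}
			}
			return out
		},
		cands,
	)
	fmt.Println(pick) // 2
}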
@@ -186,7 +186,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 		"--model", model,
 		"--ctx-size", strconv.Itoa(opts.NumCtx),
 		"--batch-size", strconv.Itoa(opts.NumBatch),
-		"--embedding",
 	}
 
 	if opts.NumGPU >= 0 {
@@ -218,10 +217,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 		params = append(params, "--threads", strconv.Itoa(defaultThreads))
 	}
 
-	if !opts.F16KV {
-		params = append(params, "--memory-f32")
-	}
-
 	flashAttnEnabled := envconfig.FlashAttention()
 
 	for _, g := range gpus {
@@ -311,9 +306,9 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 
 	// Note: we always put the dependency path first
 	// since this was the exact version we compiled/linked against
-	if gpus[0].DependencyPath != "" {
+	if gpus[0].DependencyPath != nil {
 		// assume gpus from the same library have the same dependency path
-		libraryPaths = append([]string{gpus[0].DependencyPath}, libraryPaths...)
+		libraryPaths = append(gpus[0].DependencyPath, libraryPaths...)
 	}
 
 	server := filepath.Join(dir, "ollama_llama_server")
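The DependencyPath change above implies the field is now a list of directories rather than a single string, with its entries prepended to the runner's library search paths. A small, hedged Go sketch of that prepend-and-join step follows; the example paths and the use of the OS list separator are assumptions for illustration, not Ollama's actual environment handling.

package main

import (
	"fmt"
	"path/filepath"
	"strings"
)

// prependDeps puts the GPU dependency directories ahead of any existing
// search paths, mirroring append(gpus[0].DependencyPath, libraryPaths...).
func prependDeps(depDirs, libraryPaths []string) string {
	paths := append(append([]string{}, depDirs...), libraryPaths...)
	return strings.Join(paths, string(filepath.ListSeparator))
}

func main() {
	// hypothetical paths
	fmt.Println(prependDeps(
		[]string{"/opt/ollama/lib/cuda_v12"},
		[]string{"/usr/lib"},
	))
}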
@@ -440,7 +440,6 @@ func TestParseFileParameters(t *testing.T) {
 		"num_gpu 1": {"num_gpu", "1"},
 		"main_gpu 1": {"main_gpu", "1"},
 		"low_vram true": {"low_vram", "true"},
-		"f16_kv true": {"f16_kv", "true"},
 		"logits_all true": {"logits_all", "true"},
 		"vocab_only true": {"vocab_only", "true"},
 		"use_mmap true": {"use_mmap", "true"},
@@ -6,10 +6,6 @@ set -e
 
 mkdir -p dist
 
-for TARGETARCH in arm64 amd64; do
-echo "Building Go runner darwin $TARGETARCH"
-rm -rf llama/build
-GOOS=darwin ARCH=$TARGETARCH GOARCH=$TARGETARCH make -C llama -j 8
 # These require Xcode v13 or older to target MacOS v11
 # If installed to an alternate location use the following to enable
 # export SDKROOT=/Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk
@@ -17,6 +13,11 @@ for TARGETARCH in arm64 amd64; do
 export CGO_CFLAGS=-mmacosx-version-min=11.3
 export CGO_CXXFLAGS=-mmacosx-version-min=11.3
 export CGO_LDFLAGS=-mmacosx-version-min=11.3
+
+for TARGETARCH in arm64 amd64; do
+    echo "Building Go runner darwin $TARGETARCH"
+    rm -rf llama/build
+    GOOS=darwin ARCH=$TARGETARCH GOARCH=$TARGETARCH make -C llama -j 8
     CGO_ENABLED=1 GOOS=darwin GOARCH=$TARGETARCH go build -trimpath -o dist/ollama-darwin-$TARGETARCH
     CGO_ENABLED=1 GOOS=darwin GOARCH=$TARGETARCH go build -trimpath -cover -o dist/ollama-darwin-$TARGETARCH-cov
 done
@ -690,7 +690,8 @@ func CopyModel(src, dst model.Name) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func deleteUnusedLayers(deleteMap map[string]struct{}) error {
|
func deleteUnusedLayers(deleteMap map[string]struct{}) error {
|
||||||
manifests, err := Manifests()
|
// Ignore corrupt manifests to avoid blocking deletion of layers that are freshly orphaned
|
||||||
|
manifests, err := Manifests(true)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
@ -853,8 +854,8 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
|
|||||||
manifest, _, err := GetManifest(mp)
|
manifest, _, err := GetManifest(mp)
|
||||||
if errors.Is(err, os.ErrNotExist) {
|
if errors.Is(err, os.ErrNotExist) {
|
||||||
// noop
|
// noop
|
||||||
} else if err != nil && !errors.Is(err, os.ErrNotExist) {
|
} else if err != nil {
|
||||||
return err
|
slog.Warn("pulling model with bad existing manifest", "name", name, "error", err)
|
||||||
} else {
|
} else {
|
||||||
for _, l := range manifest.Layers {
|
for _, l := range manifest.Layers {
|
||||||
deleteMap[l.Digest] = struct{}{}
|
deleteMap[l.Digest] = struct{}{}
|
||||||
|
@@ -106,7 +106,8 @@ func (l *Layer) Remove() error {
         return nil
     }

-    ms, err := Manifests()
+    // Ignore corrupt manifests to avoid blocking deletion of layers that are freshly orphaned
+    ms, err := Manifests(true)
     if err != nil {
         return err
     }
@@ -123,7 +123,7 @@ func WriteManifest(name model.Name, config Layer, layers []Layer) error {
     return json.NewEncoder(f).Encode(m)
 }

-func Manifests() (map[model.Name]*Manifest, error) {
+func Manifests(continueOnError bool) (map[model.Name]*Manifest, error) {
     manifests, err := GetManifestPath()
     if err != nil {
         return nil, err
|
|||||||
if !fi.IsDir() {
|
if !fi.IsDir() {
|
||||||
rel, err := filepath.Rel(manifests, match)
|
rel, err := filepath.Rel(manifests, match)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
if !continueOnError {
|
||||||
|
return nil, fmt.Errorf("%s %w", match, err)
|
||||||
|
}
|
||||||
slog.Warn("bad filepath", "path", match, "error", err)
|
slog.Warn("bad filepath", "path", match, "error", err)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
n := model.ParseNameFromFilepath(rel)
|
n := model.ParseNameFromFilepath(rel)
|
||||||
if !n.IsValid() {
|
if !n.IsValid() {
|
||||||
|
if !continueOnError {
|
||||||
|
return nil, fmt.Errorf("%s %w", rel, err)
|
||||||
|
}
|
||||||
slog.Warn("bad manifest name", "path", rel)
|
slog.Warn("bad manifest name", "path", rel)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
m, err := ParseNamedManifest(n)
|
m, err := ParseNamedManifest(n)
|
||||||
if syntax := &(json.SyntaxError{}); errors.As(err, &syntax) {
|
if err != nil {
|
||||||
|
if !continueOnError {
|
||||||
|
return nil, fmt.Errorf("%s %w", n, err)
|
||||||
|
}
|
||||||
slog.Warn("bad manifest", "name", n, "error", err)
|
slog.Warn("bad manifest", "name", n, "error", err)
|
||||||
continue
|
continue
|
||||||
} else if err != nil {
|
|
||||||
return nil, fmt.Errorf("%s: %w", n, err)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ms[n] = m
|
ms[n] = m
|
||||||
|
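Note on the Manifests change above: the new continueOnError parameter decides whether a corrupt manifest is skipped with a warning (true, used by the list/copy/delete paths) or returned as an error (false, used before pruning). A standalone sketch of that pattern under those assumptions — parseManifest and the in-memory map are stand-ins for ParseNamedManifest and the on-disk manifest directory:

package main

import (
    "encoding/json"
    "fmt"
    "log/slog"
)

// manifest is a simplified stand-in for the server's Manifest type.
type manifest struct {
    Layers []string `json:"layers"`
}

// parseManifest is a hypothetical stand-in for ParseNamedManifest.
func parseManifest(raw []byte) (*manifest, error) {
    var m manifest
    if err := json.Unmarshal(raw, &m); err != nil {
        return nil, err
    }
    return &m, nil
}

// manifests walks a set of raw manifest blobs. With continueOnError set,
// corrupt entries are logged and skipped; otherwise the first failure is
// returned to the caller untouched.
func manifests(raw map[string][]byte, continueOnError bool) (map[string]*manifest, error) {
    ms := make(map[string]*manifest)
    for name, blob := range raw {
        m, err := parseManifest(blob)
        if err != nil {
            if !continueOnError {
                return nil, fmt.Errorf("%s %w", name, err)
            }
            slog.Warn("bad manifest", "name", name, "error", err)
            continue
        }
        ms[name] = m
    }
    return ms, nil
}

func main() {
    raw := map[string][]byte{
        "good": []byte(`{"layers":["sha256:abc"]}`),
        "bad":  []byte(`{not json`),
    }

    ms, _ := manifests(raw, true) // tolerant: list, copy, and delete paths
    fmt.Println(len(ms))          // 1: the corrupt entry is skipped

    _, err := manifests(raw, false) // strict: run before pruning
    fmt.Println(err != nil)         // true: corruption surfaces as an error
}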
@@ -112,7 +112,7 @@ func TestManifests(t *testing.T) {
         createManifest(t, d, p)
     }

-    ms, err := Manifests()
+    ms, err := Manifests(true)
     if err != nil {
         t.Fatal(err)
     }
@@ -27,6 +27,16 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.

     isMllama := checkMllamaModelFamily(m)

+    var imageNumTokens int
+    // TODO: Ideally we would compute this from the projector metadata but some pieces are implementation dependent
+    if isMllama {
+        // Our mllama implementation packs all of the embeddings into a single token
+        imageNumTokens = 1
+    } else {
+        // Clip images are represented as 768 tokens, each an embedding
+        imageNumTokens = 768
+    }
+
     n := len(msgs) - 1
     // in reverse, find all messages that fit into context window
     for i := n; i >= 0; i-- {
@@ -59,9 +69,7 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
         ctxLen := len(s)
         if m.ProjectorPaths != nil {
             for _, m := range msgs[i:] {
-                // images are represented as 768 sized embeddings
-                // TODO: get embedding length from project metadata
-                ctxLen += 768 * len(m.Images)
+                ctxLen += imageNumTokens * len(m.Images)
             }
         }

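Note on the chatPrompt hunks above: the per-image token budget used for context-window truncation is now chosen per model family instead of being fixed at 768. A minimal sketch of that estimate with simplified types — message, TokenCount, and estimateContextLen are illustrative stand-ins, not the real api package:

package main

import "fmt"

// message is a simplified stand-in for api.Message.
type message struct {
    TokenCount int // tokens in the rendered text of this message
    Images     int // number of attached images
}

// estimateContextLen mirrors the window check in chatPrompt: text tokens plus
// a per-image budget that depends on the model family.
func estimateContextLen(msgs []message, isMllama bool) int {
    // The original notes this should eventually come from projector metadata.
    imageNumTokens := 768 // CLIP images are represented as 768 embedding tokens
    if isMllama {
        imageNumTokens = 1 // mllama packs all image embeddings into a single token
    }

    ctxLen := 0
    for _, m := range msgs {
        ctxLen += m.TokenCount + imageNumTokens*m.Images
    }
    return ctxLen
}

func main() {
    msgs := []message{{TokenCount: 120, Images: 1}, {TokenCount: 40}}
    fmt.Println(estimateContextLen(msgs, false)) // 928
    fmt.Println(estimateContextLen(msgs, true))  // 161
}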
@@ -622,7 +622,7 @@ func (s *Server) PushHandler(c *gin.Context) {
 }

 func checkNameExists(name model.Name) error {
-    names, err := Manifests()
+    names, err := Manifests(true)
     if err != nil {
         return err
     }
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (s *Server) ListHandler(c *gin.Context) {
|
func (s *Server) ListHandler(c *gin.Context) {
|
||||||
ms, err := Manifests()
|
ms, err := Manifests(true)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||||
return
|
return
|
||||||
@ -1211,6 +1211,9 @@ func Serve(ln net.Listener) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if !envconfig.NoPrune() {
|
if !envconfig.NoPrune() {
|
||||||
|
if _, err := Manifests(false); err != nil {
|
||||||
|
slog.Warn("corrupt manifests detected, skipping prune operation. Re-pull or delete to clear", "error", err)
|
||||||
|
} else {
|
||||||
// clean up unused layers and manifests
|
// clean up unused layers and manifests
|
||||||
if err := PruneLayers(); err != nil {
|
if err := PruneLayers(); err != nil {
|
||||||
return err
|
return err
|
||||||
@ -1225,6 +1228,7 @@ func Serve(ln net.Listener) error {
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
ctx, done := context.WithCancel(context.Background())
|
ctx, done := context.WithCancel(context.Background())
|
||||||
schedCtx, schedDone := context.WithCancel(ctx)
|
schedCtx, schedDone := context.WithCancel(ctx)
|
||||||
|
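Note on the Serve hunks above: pruning now runs only after a strict manifest scan succeeds, so a corrupt manifest can no longer cause layers it still references to be pruned at startup. A standalone sketch of that guard — scanManifests, pruneLayers, and maybePrune are hypothetical stand-ins for Manifests(false), PruneLayers, and the startup path:

package main

import (
    "errors"
    "fmt"
    "log/slog"
)

// scanManifests is a hypothetical stand-in for Manifests(false): a strict
// scan that fails on the first corrupt manifest it finds.
func scanManifests() error {
    return errors.New("example manifest: unexpected end of JSON input")
}

// pruneLayers is a hypothetical stand-in for PruneLayers.
func pruneLayers() error {
    fmt.Println("pruning unused layers")
    return nil
}

// maybePrune mirrors the startup guard: only prune when every manifest
// parses, otherwise warn and leave the blob store untouched.
func maybePrune(noPrune bool) error {
    if noPrune {
        return nil
    }
    if err := scanManifests(); err != nil {
        slog.Warn("corrupt manifests detected, skipping prune operation. Re-pull or delete to clear", "error", err)
        return nil
    }
    return pruneLayers()
}

func main() {
    _ = maybePrune(false) // logs the warning and skips pruning in this sketch
}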
@@ -130,11 +130,11 @@ func (s *Scheduler) processPending(ctx context.Context) {
                 continue
             }
             numParallel := int(envconfig.NumParallel())
-            // TODO (jmorganca): multimodal models don't support parallel yet
+            // TODO (jmorganca): mllama doesn't support parallel yet
             // see https://github.com/ollama/ollama/issues/4165
-            if len(pending.model.ProjectorPaths) > 0 && numParallel != 1 {
+            if checkMllamaModelFamily(pending.model) && numParallel != 1 {
                 numParallel = 1
-                slog.Warn("multimodal models don't support parallel requests yet")
+                slog.Warn("mllama doesn't support parallel requests yet")
             }

             for {
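Note on the scheduler hunk above: the single-slot restriction now applies only to mllama rather than to every model with a projector. A minimal sketch of that selection — isMllama and effectiveParallel are illustrative stand-ins for checkMllamaModelFamily and the scheduler's numParallel adjustment:

package main

import (
    "fmt"
    "log/slog"
)

// isMllama is a hypothetical stand-in for checkMllamaModelFamily.
func isMllama(modelFamilies []string) bool {
    for _, f := range modelFamilies {
        if f == "mllama" {
            return true
        }
    }
    return false
}

// effectiveParallel mirrors the scheduler's adjustment: mllama is pinned to a
// single slot (see https://github.com/ollama/ollama/issues/4165), while other
// models keep the configured parallelism.
func effectiveParallel(requested int, modelFamilies []string) int {
    if isMllama(modelFamilies) && requested != 1 {
        slog.Warn("mllama doesn't support parallel requests yet")
        return 1
    }
    return requested
}

func main() {
    fmt.Println(effectiveParallel(4, []string{"mllama"})) // 1
    fmt.Println(effectiveParallel(4, []string{"clip"}))   // 4
}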