From 7d787ba90d07ab0991eee393e62cf5fbe421542f Mon Sep 17 00:00:00 2001
From: Sam
Date: Thu, 14 Nov 2024 07:00:43 +1100
Subject: [PATCH] fix(docs): update FA FAQ wording slightly

refactor: only allow setting K and V cache types together
---
 cmd/cmd.go             |  3 +--
 docs/faq.md            |  7 +++----
 envconfig/config.go    |  9 +++------
 llama/llama.go         |  6 +++---
 llama/runner/runner.go | 12 +++++-------
 llm/memory.go          |  6 ++----
 llm/memory_test.go     |  3 +--
 llm/server.go          | 19 +++++++++----------
 8 files changed, 27 insertions(+), 38 deletions(-)

diff --git a/cmd/cmd.go b/cmd/cmd.go
index d3f8fd56..066ea067 100644
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -1482,8 +1482,7 @@ func NewCLI() *cobra.Command {
 				envVars["OLLAMA_SCHED_SPREAD"],
 				envVars["OLLAMA_TMPDIR"],
 				envVars["OLLAMA_FLASH_ATTENTION"],
-				envVars["OLLAMA_CACHE_TYPE_K"],
-				envVars["OLLAMA_CACHE_TYPE_V"],
+				envVars["OLLAMA_KV_CACHE_TYPE"],
 				envVars["OLLAMA_LLM_LIBRARY"],
 				envVars["OLLAMA_GPU_OVERHEAD"],
 				envVars["OLLAMA_LOAD_TIMEOUT"],
diff --git a/docs/faq.md b/docs/faq.md
index 14022a43..2d3cd1a8 100644
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -291,16 +291,15 @@ Installing multiple GPUs of the same brand can be a great way to increase your a
 
 Flash Attention is a feature of most (but not all) modern models that can significantly reduce memory usage as the context size grows. To enable Flash Attention, set the `OLLAMA_FLASH_ATTENTION` environment variable to `1` when starting the Ollama server.
 
-> Note: If you're using an uncommon quantization type with CUDA, you may benefit from build Ollama with `LLAMA_CUDA_FA_ALL_QUANTS=1` to make llama.cpp build all flash attention quantization types.
+> Note: If you're an advanced user running an uncommon quantization type with CUDA, you may benefit from building Ollama with `GGML_CUDA_FA_ALL_QUANTS=1` passed to the llama.cpp build, which enables FA for all combinations of quantization types. More information on this can be found in [llama.cpp](https://github.com/ggerganov/llama.cpp/blob/fb4a0ec0833c71cff5a1a367ba375447ce6106eb/ggml/src/ggml-cuda/fattn-common.cuh#L575).
 
 ## How can I set the quantization type for the K/V cache?
 
 The K/V context cache can be quantized to significantly reduce memory usage when Flash Attention is enabled.
 
-To use quantized K/V cache with Ollama you can set the following environment variables:
+To use a quantized K/V cache with Ollama, you can set the following environment variable:
 
-- `OLLAMA_CACHE_TYPE_K` - The quantization type for the key cache. Default is `f16`.
-- `OLLAMA_CACHE_TYPE_V` - The quantization type for the value cache. Default is `f16`.
+- `OLLAMA_KV_CACHE_TYPE` - The quantization type for the K/V cache. Default is `f16`.
 
 > Note: Currently this is a global option - meaning all models will run with the specified quantization type.
 
diff --git a/envconfig/config.go b/envconfig/config.go
index 73271f86..027608f7 100644
--- a/envconfig/config.go
+++ b/envconfig/config.go
@@ -153,10 +153,8 @@ var (
 	Debug = Bool("OLLAMA_DEBUG")
 	// FlashAttention enables the experimental flash attention feature.
 	FlashAttention = Bool("OLLAMA_FLASH_ATTENTION")
-	// CacheTypeK is the quantization type for the K/V cache keys.
-	CacheTypeK = String("OLLAMA_CACHE_TYPE_K")
-	// CacheTypeV is the quantization type for the K/V cache values.
-	CacheTypeV = String("OLLAMA_CACHE_TYPE_V")
+	// KvCacheType is the quantization type for the K/V cache.
+	KvCacheType = String("OLLAMA_KV_CACHE_TYPE")
 	// NoHistory disables readline history.
 	NoHistory = Bool("OLLAMA_NOHISTORY")
 	// NoPrune disables pruning of model blobs on startup.
@@ -238,8 +236,7 @@ func AsMap() map[string]EnvVar {
 	ret := map[string]EnvVar{
 		"OLLAMA_DEBUG":           {"OLLAMA_DEBUG", Debug(), "Show additional debug information (e.g. OLLAMA_DEBUG=1)"},
 		"OLLAMA_FLASH_ATTENTION": {"OLLAMA_FLASH_ATTENTION", FlashAttention(), "Enabled flash attention"},
-		"OLLAMA_CACHE_TYPE_K":    {"OLLAMA_CACHE_TYPE_K", CacheTypeK(), "Type of cache for keys (default: f16)"},
-		"OLLAMA_CACHE_TYPE_V":    {"OLLAMA_CACHE_TYPE_V", CacheTypeV(), "Type of cache for values (default: f16)"},
+		"OLLAMA_KV_CACHE_TYPE":   {"OLLAMA_KV_CACHE_TYPE", KvCacheType(), "Quantization type for the K/V cache (default: f16)"},
 		"OLLAMA_GPU_OVERHEAD":    {"OLLAMA_GPU_OVERHEAD", GpuOverhead(), "Reserve a portion of VRAM per GPU (bytes)"},
 		"OLLAMA_HOST":            {"OLLAMA_HOST", Host(), "IP Address for the ollama server (default 127.0.0.1:11434)"},
 		"OLLAMA_KEEP_ALIVE":      {"OLLAMA_KEEP_ALIVE", KeepAlive(), "The duration that models stay loaded in memory (default \"5m\")"},
diff --git a/llama/llama.go b/llama/llama.go
index ec9fe0b3..04cab77c 100644
--- a/llama/llama.go
+++ b/llama/llama.go
@@ -140,7 +140,7 @@ type ContextParams struct {
 	c C.struct_llama_context_params
 }
 
-func NewContextParams(numCtx int, batchSize int, numSeqMax int, threads int, flashAttention bool, cacheTypeK string, cacheTypeV string) ContextParams {
+func NewContextParams(numCtx int, batchSize int, numSeqMax int, threads int, flashAttention bool, kvCacheType string) ContextParams {
 	params := C.llama_context_default_params()
 	params.n_ctx = C.uint(numCtx)
 	params.n_batch = C.uint(batchSize)
@@ -149,8 +149,8 @@ func NewContextParams(numCtx int, batchSize int, numSeqMax int, threads int, fla
 	params.n_threads_batch = params.n_threads
 	params.embeddings = C.bool(true)
 	params.flash_attn = C.bool(flashAttention)
-	params.type_k = KvCacheTypeFromStr(cacheTypeK)
-	params.type_v = KvCacheTypeFromStr(cacheTypeV)
+	params.type_k = KvCacheTypeFromStr(kvCacheType)
+	params.type_v = KvCacheTypeFromStr(kvCacheType)
 
 	return ContextParams{c: params}
 }
diff --git a/llama/runner/runner.go b/llama/runner/runner.go
index bb434046..3c289ced 100644
--- a/llama/runner/runner.go
+++ b/llama/runner/runner.go
@@ -471,7 +471,7 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 			// the last one generated wasn't submitted to Decode
 			// - Remove any stop sequences that we stripped out
 			// - If truncateStop removed a portion of a token, drop that
-			// - As defense-in-depth, if truncatedToken didn't find a stop token
+			// - As defence-in-depth, if truncatedToken didn't find a stop token
 			// remove the extra one that we added to the cache len
 			tokenLen := len(seq.cache.Inputs) + 1
 			tokenLen -= origLen - newLen
@@ -762,8 +762,7 @@ func (s *Server) loadModel(
 	flashAttention bool,
 	threads int,
 	multiUserCache bool,
-	cacheTypeK string,
-	cacheTypeV string,
+	kvCacheType string,
 ) {
 	llama.BackendInit()
 
@@ -773,7 +772,7 @@ func (s *Server) loadModel(
 		panic(err)
 	}
 
-	ctxParams := llama.NewContextParams(kvSize, s.batchSize*s.parallel, s.parallel, threads, flashAttention, cacheTypeK, cacheTypeV)
+	ctxParams := llama.NewContextParams(kvSize, s.batchSize*s.parallel, s.parallel, threads, flashAttention, kvCacheType)
 	s.lc, err = llama.NewContextWithModel(s.model, ctxParams)
 	if err != nil {
 		panic(err)
@@ -821,8 +820,7 @@ func main() {
 	tensorSplit := flag.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
 	multiUserCache := flag.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
 	requirements := flag.Bool("requirements", false, "print json requirement information")
flag.Bool("requirements", false, "print json requirement information") - cacheTypeK := flag.String("cache-type-k", "f16", "quantization type for key in cache (default: f16)") - cacheTypeV := flag.String("cache-type-v", "f16", "quantization type for value in cache (default: f16)") + kvCacheType := flag.String("kv-cache-type", "f16", "quantization type for KV cache (default: f16)") flag.Parse() if *requirements { @@ -878,7 +876,7 @@ func main() { } server.ready.Add(1) - go server.loadModel(params, *mpath, *lpath, *ppath, *kvSize, *flashAttention, *threads, *multiUserCache, *cacheTypeK, *cacheTypeV) + go server.loadModel(params, *mpath, *lpath, *ppath, *kvSize, *flashAttention, *threads, *multiUserCache, *kvCacheType) server.cond = sync.NewCond(&server.mu) diff --git a/llm/memory.go b/llm/memory.go index 8eb99ca6..ff1e09cc 100644 --- a/llm/memory.go +++ b/llm/memory.go @@ -129,10 +129,8 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string, isEmbeddingModel = true } - // Estimate the memory required for K and V caches separately as they can have different quantization types - kSize := estimateKvCacheSize(envconfig.CacheTypeK(), uint64(opts.NumCtx), ggml.KV().BlockCount(), ggml.KV().EmbeddingHeadCountK(), ggml.KV().HeadCountKV(), isEmbeddingModel) - vSize := estimateKvCacheSize(envconfig.CacheTypeV(), uint64(opts.NumCtx), ggml.KV().BlockCount(), ggml.KV().EmbeddingHeadCountV(), ggml.KV().HeadCountKV(), isEmbeddingModel) - kv := kSize + vSize + // Estimate the memory required for KV cache quantisation + kv := estimateKvCacheSize(envconfig.KvCacheType(), uint64(opts.NumCtx), ggml.KV().BlockCount(), ggml.KV().EmbeddingHeadCountK(), ggml.KV().HeadCountKV(), isEmbeddingModel) * 2 // KV is proportional to the number of layers layerSize += kv / ggml.KV().BlockCount() diff --git a/llm/memory_test.go b/llm/memory_test.go index b0780b48..73ee7915 100644 --- a/llm/memory_test.go +++ b/llm/memory_test.go @@ -15,8 +15,7 @@ import ( func TestEstimateGPULayers(t *testing.T) { t.Setenv("OLLAMA_DEBUG", "1") - t.Setenv("OLLAMA_CACHE_TYPE_K", "") - t.Setenv("OLLAMA_CACHE_TYPE_V", "") + t.Setenv("OLLAMA_KV_CACHE_TYPE", "") modelName := "dummy" f, err := os.CreateTemp(t.TempDir(), modelName) diff --git a/llm/server.go b/llm/server.go index 413a19c2..98b3c07a 100644 --- a/llm/server.go +++ b/llm/server.go @@ -218,11 +218,13 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter params = append(params, "--threads", strconv.Itoa(defaultThreads)) } + // isEmbeddingModel checks for common GGML attributes that help distinguish most embedding models from normal models. 
 	isEmbeddingModel := false
 	if _, ok := ggml.KV()[fmt.Sprintf("%s.pooling_type", ggml.KV().Architecture())]; ok {
 		isEmbeddingModel = true
 	}
 
+	// setCacheTypeParam validates the requested cache type and appends the corresponding runner parameter
 	setCacheTypeParam := func(paramName, cacheType string) {
 		if cacheType == "" {
 			return
@@ -245,9 +247,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 		slog.Debug("Setting cache type", "param", paramName, "type", cacheType)
 	}
 
-	// Define cacheTypeK and cacheTypeV
-	cacheTypeK := envconfig.CacheTypeK()
-	cacheTypeV := envconfig.CacheTypeV()
+	kvCacheType := envconfig.KvCacheType()
 
 	// Set cache types only if they are not empty
 	supportsFlashAttention := func(ggml *GGML) bool {
@@ -255,12 +255,12 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 		headCountV := ggml.KV().EmbeddingHeadCountV()
 
 		if headCountK == 0 || headCountV == 0 {
-			slog.Debug("Model is missing embedding head count for K or V")
+			slog.Debug("Model is missing embedding head count for K or V, does not support flash attention")
 			return false
 		}
 
 		if headCountK != headCountV {
-			slog.Debug("Embedding head count K does not equal V", "K", headCountK, "V", headCountV)
+			slog.Debug("Embedding head count K does not equal V, does not support flash attention", "K", headCountK, "V", headCountV)
 			return false
 		}
@@ -291,14 +291,13 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 		params = append(params, "--flash-attn")
 		slog.Info("Enabling flash attention")
 
-		setCacheTypeParam("--cache-type-k", cacheTypeK)
-		setCacheTypeParam("--cache-type-v", cacheTypeV)
+		setCacheTypeParam("--kv-cache-type", kvCacheType)
 	} else {
 		slog.Info("Flash attention not enabled")
 		quantizedCacheTypes := []string{"q8_0", "q5_1", "q5_0", "iq4_nl", "q4_1", "q4_0"}
-		if !isEmbeddingModel && (cacheTypeK != "" || cacheTypeV != "") {
-			if slices.Contains(quantizedCacheTypes, cacheTypeK) || slices.Contains(quantizedCacheTypes, cacheTypeV) {
-				slog.Warn("Quantized cache types require flash attention. Using default cache types.")
+		if !isEmbeddingModel && (kvCacheType != "") {
+			if slices.Contains(quantizedCacheTypes, kvCacheType) {
+				slog.Warn("Quantized cache types require flash attention. Falling back to the default cache type.")
 			}
 		}
 	}
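
Not part of the patch: a minimal, self-contained Go sketch of the behaviour the change above standardises. The model-shape constants and the bytesPerElement helper are made-up illustrations (they are not Ollama's estimateKvCacheSize or its GGUF metadata); the only two ideas taken from the diff are that a single OLLAMA_KV_CACHE_TYPE value is applied to both the K and the V cache, and that the size estimate is therefore computed once and doubled.

package main

import (
	"fmt"
	"os"
)

// bytesPerElement returns an approximate storage cost per cached element for a
// few llama.cpp cache types (hypothetical helper, for illustration only).
func bytesPerElement(cacheType string) float64 {
	switch cacheType {
	case "f32":
		return 4.0
	case "f16":
		return 2.0
	case "q8_0":
		return 1.0 + 1.0/16 // ~8.5 bits per element
	case "q4_0":
		return 0.5 + 1.0/16 // ~4.5 bits per element
	default:
		return 2.0 // treat unknown values as the f16 default
	}
}

func main() {
	// One environment variable now selects the type for both K and V.
	kvCacheType := os.Getenv("OLLAMA_KV_CACHE_TYPE")
	if kvCacheType == "" {
		kvCacheType = "f16"
	}

	// Assumed model shape, standing in for the GGUF metadata the real
	// estimator reads (context length, layer count, head size, K/V heads).
	const (
		numCtx      = 8192
		blockCount  = 32
		headDimK    = 128
		headCountKV = 8
	)

	// Elements held in one cache: a vector of headDimK*headCountKV values
	// per token per layer. K and V are the same size, hence the final *2.
	elems := float64(numCtx) * float64(blockCount) * float64(headDimK) * float64(headCountKV)
	kvBytes := elems * bytesPerElement(kvCacheType) * 2

	fmt.Printf("OLLAMA_KV_CACHE_TYPE=%s -> ~%.0f MiB for the K/V cache\n",
		kvCacheType, kvBytes/(1024*1024))
}

Assuming the sketch is saved as kvsketch.go, running it with OLLAMA_KV_CACHE_TYPE set to q8_0 prints an estimate a little over half the f16 figure, which is the kind of memory saving the FAQ section above refers to.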