From 74a158a79ec0eec3ce6b0b5db4d6ae91a58025e1 Mon Sep 17 00:00:00 2001
From: jmorganca
Date: Thu, 6 Jun 2024 23:55:47 -0700
Subject: [PATCH] add patches

---
 llama/patches/{01-cuda.patch => 01-cuda.diff} |  0
 llama/patches/02-default-pretokenizer.diff    | 32 +++++++++++++
 llama/patches/03-metal.diff                   | 45 +++++++++++++++++++
 llama/patches/04-qwen2.diff                   | 13 ++++++
 4 files changed, 90 insertions(+)
 rename llama/patches/{01-cuda.patch => 01-cuda.diff} (100%)
 create mode 100644 llama/patches/02-default-pretokenizer.diff
 create mode 100644 llama/patches/03-metal.diff
 create mode 100644 llama/patches/04-qwen2.diff

diff --git a/llama/patches/01-cuda.patch b/llama/patches/01-cuda.diff
similarity index 100%
rename from llama/patches/01-cuda.patch
rename to llama/patches/01-cuda.diff
diff --git a/llama/patches/02-default-pretokenizer.diff b/llama/patches/02-default-pretokenizer.diff
new file mode 100644
index 00000000..27c8aabc
--- /dev/null
+++ b/llama/patches/02-default-pretokenizer.diff
@@ -0,0 +1,32 @@
+diff --git a/llama.cpp b/llama.cpp
+index 40d2ec2c..74f3ee9c 100644
+--- a/llama.cpp
++++ b/llama.cpp
+@@ -4642,16 +4642,7 @@ static void llm_load_vocab(
+ 
+     // for now, only BPE models have pre-tokenizers
+     if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
+-        if (tokenizer_pre.empty()) {
+-            LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
+-            LLAMA_LOG_WARN("%s: \n", __func__);
+-            LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
+-            LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED! \n", __func__);
+-            LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL \n", __func__);
+-            LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
+-            LLAMA_LOG_WARN("%s: \n", __func__);
+-            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+-        } else if (
++        if (
+                 tokenizer_pre == "default") {
+             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+         } else if (
+@@ -4703,7 +4694,8 @@ static void llm_load_vocab(
+                 tokenizer_pre == "smaug-bpe") {
+             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
+         } else {
+-            throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
++            LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
++            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+         }
+     } else {
+         vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
diff --git a/llama/patches/03-metal.diff b/llama/patches/03-metal.diff
new file mode 100644
index 00000000..f8fa7db7
--- /dev/null
+++ b/llama/patches/03-metal.diff
@@ -0,0 +1,45 @@
+diff --git a/ggml-metal.m b/ggml-metal.m
+index 0207b787..b5e9884b 100644
+--- a/ggml-metal.m
++++ b/ggml-metal.m
+@@ -1396,27 +1396,23 @@ static enum ggml_status ggml_metal_graph_compute(
+                         // to the matrix-vector kernel
+                         int ne11_mm_min = 1;
+ 
+-#if 0
+                         // the numbers below are measured on M2 Ultra for 7B and 13B models
+                         // these numbers do not translate to other devices or model sizes
+                         // TODO: need to find a better approach
+-                        if ([ctx->device.name isEqualToString:@"Apple M2 Ultra"]) {
+-                            switch (src0t) {
+-                                case GGML_TYPE_F16: ne11_mm_min = 2; break;
+-                                case GGML_TYPE_Q8_0: ne11_mm_min = 7; break;
+-                                case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
+-                                case GGML_TYPE_Q3_K: ne11_mm_min = 7; break;
+-                                case GGML_TYPE_Q4_0:
+-                                case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
+-                                case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
+-                                case GGML_TYPE_Q5_0: // not tested yet
+-                                case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
+-                                case GGML_TYPE_Q5_K: ne11_mm_min = 7; break;
+-                                case GGML_TYPE_Q6_K: ne11_mm_min = 7; break;
+-                                default: ne11_mm_min = 1; break;
+-                            }
++                        switch (src0t) {
++                            case GGML_TYPE_F16: ne11_mm_min = 2; break;
++                            case GGML_TYPE_Q8_0: ne11_mm_min = 7; break;
++                            case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
++                            case GGML_TYPE_Q3_K: ne11_mm_min = 7; break;
++                            case GGML_TYPE_Q4_0:
++                            case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
++                            case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
++                            case GGML_TYPE_Q5_0: // not tested yet
++                            case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
++                            case GGML_TYPE_Q5_K: ne11_mm_min = 7; break;
++                            case GGML_TYPE_Q6_K: ne11_mm_min = 7; break;
++                            default: ne11_mm_min = 1; break;
+                         }
+-#endif
+ 
+                         // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
+                         // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
diff --git a/llama/patches/04-qwen2.diff b/llama/patches/04-qwen2.diff
new file mode 100644
index 00000000..d7b0c155
--- /dev/null
+++ b/llama/patches/04-qwen2.diff
@@ -0,0 +1,13 @@
+diff --git a/llama.cpp b/llama.cpp
+index 40d2ec2c..f34eb79a 100644
+--- a/llama.cpp
++++ b/llama.cpp
+@@ -6943,7 +6943,7 @@ static struct ggml_tensor * llm_build_kqv(
+         struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
+         cb(kq, "kq", il);
+ 
+-        if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) {
++        if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX || model.arch == LLM_ARCH_QWEN2) {
+             // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
+             // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
+             ggml_mul_mat_set_prec(kq, GGML_PREC_F32);