forked from third-party-mirrors/ollama
add patches
This commit is contained in:
parent
8f79a2e86a
commit
74a158a79e
32
llama/patches/02-default-pretokenizer.diff
Normal file
32
llama/patches/02-default-pretokenizer.diff
Normal file
@ -0,0 +1,32 @@
|
||||
diff --git a/llama.cpp b/llama.cpp
|
||||
index 40d2ec2c..74f3ee9c 100644
|
||||
--- a/llama.cpp
|
||||
+++ b/llama.cpp
|
||||
@@ -4642,16 +4642,7 @@ static void llm_load_vocab(
|
||||
|
||||
// for now, only BPE models have pre-tokenizers
|
||||
if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
|
||||
- if (tokenizer_pre.empty()) {
|
||||
- LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
|
||||
- LLAMA_LOG_WARN("%s: \n", __func__);
|
||||
- LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
|
||||
- LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED! \n", __func__);
|
||||
- LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL \n", __func__);
|
||||
- LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
|
||||
- LLAMA_LOG_WARN("%s: \n", __func__);
|
||||
- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
||||
- } else if (
|
||||
+ if (
|
||||
tokenizer_pre == "default") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
||||
} else if (
|
||||
@@ -4703,7 +4694,8 @@ static void llm_load_vocab(
|
||||
tokenizer_pre == "smaug-bpe") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
|
||||
} else {
|
||||
- throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
|
||||
+ LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
|
||||
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
||||
}
|
||||
} else {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
45
llama/patches/03-metal.diff
Normal file
45
llama/patches/03-metal.diff
Normal file
@ -0,0 +1,45 @@
|
||||
diff --git a/ggml-metal.m b/ggml-metal.m
|
||||
index 0207b787..b5e9884b 100644
|
||||
--- a/ggml-metal.m
|
||||
+++ b/ggml-metal.m
|
||||
@@ -1396,27 +1396,23 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||
// to the matrix-vector kernel
|
||||
int ne11_mm_min = 1;
|
||||
|
||||
-#if 0
|
||||
// the numbers below are measured on M2 Ultra for 7B and 13B models
|
||||
// these numbers do not translate to other devices or model sizes
|
||||
// TODO: need to find a better approach
|
||||
- if ([ctx->device.name isEqualToString:@"Apple M2 Ultra"]) {
|
||||
- switch (src0t) {
|
||||
- case GGML_TYPE_F16: ne11_mm_min = 2; break;
|
||||
- case GGML_TYPE_Q8_0: ne11_mm_min = 7; break;
|
||||
- case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
|
||||
- case GGML_TYPE_Q3_K: ne11_mm_min = 7; break;
|
||||
- case GGML_TYPE_Q4_0:
|
||||
- case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
|
||||
- case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
|
||||
- case GGML_TYPE_Q5_0: // not tested yet
|
||||
- case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
|
||||
- case GGML_TYPE_Q5_K: ne11_mm_min = 7; break;
|
||||
- case GGML_TYPE_Q6_K: ne11_mm_min = 7; break;
|
||||
- default: ne11_mm_min = 1; break;
|
||||
- }
|
||||
+ switch (src0t) {
|
||||
+ case GGML_TYPE_F16: ne11_mm_min = 2; break;
|
||||
+ case GGML_TYPE_Q8_0: ne11_mm_min = 7; break;
|
||||
+ case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
|
||||
+ case GGML_TYPE_Q3_K: ne11_mm_min = 7; break;
|
||||
+ case GGML_TYPE_Q4_0:
|
||||
+ case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
|
||||
+ case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
|
||||
+ case GGML_TYPE_Q5_0: // not tested yet
|
||||
+ case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
|
||||
+ case GGML_TYPE_Q5_K: ne11_mm_min = 7; break;
|
||||
+ case GGML_TYPE_Q6_K: ne11_mm_min = 7; break;
|
||||
+ default: ne11_mm_min = 1; break;
|
||||
}
|
||||
-#endif
|
||||
|
||||
// for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
|
||||
// AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
|
13
llama/patches/04-qwen2.diff
Normal file
13
llama/patches/04-qwen2.diff
Normal file
@ -0,0 +1,13 @@
|
||||
diff --git a/llama.cpp b/llama.cpp
|
||||
index 40d2ec2c..f34eb79a 100644
|
||||
--- a/llama.cpp
|
||||
+++ b/llama.cpp
|
||||
@@ -6943,7 +6943,7 @@ static struct ggml_tensor * llm_build_kqv(
|
||||
struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
|
||||
cb(kq, "kq", il);
|
||||
|
||||
- if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) {
|
||||
+ if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX || model.arch == LLM_ARCH_QWEN2) {
|
||||
// for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
|
||||
// ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
|
||||
ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
|
Loading…
x
Reference in New Issue
Block a user