diff --git a/llama/build-info.cpp b/llama/build-info.cpp
index 63732571..d4ddbffc 100644
--- a/llama/build-info.cpp
+++ b/llama/build-info.cpp
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/clip.cpp b/llama/clip.cpp
index 2039bdc8..d8a85004 100644
--- a/llama/clip.cpp
+++ b/llama/clip.cpp
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/clip.h b/llama/clip.h
index 8665ad6a..84a51470 100644
--- a/llama/clip.h
+++ b/llama/clip.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/common.cpp b/llama/common.cpp
index f542c129..cfda8854 100644
--- a/llama/common.cpp
+++ b/llama/common.cpp
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
@@ -710,14 +710,24 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
     }
     if (arg == "--lora") {
         CHECK_ARG
-        params.lora_adapter.emplace_back(argv[i], 1.0f);
+        params.lora_adapters.push_back({
+            std::string(argv[i]),
+            1.0,
+        });
         return true;
     }
     if (arg == "--lora-scaled") {
         CHECK_ARG
-        const char* lora_adapter = argv[i];
+        std::string lora_adapter = argv[i];
         CHECK_ARG
-        params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
+        params.lora_adapters.push_back({
+            lora_adapter,
+            std::stof(argv[i]),
+        });
+        return true;
+    }
+    if (arg == "--lora-init-without-apply") {
+        params.lora_init_without_apply = true;
         return true;
     }
     if (arg == "--control-vector") {
@@ -1660,7 +1670,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "server",      "       --host HOST",            "ip address to listen (default: %s)", params.hostname.c_str() });
     options.push_back({ "server",      "       --port PORT",            "port to listen (default: %d)", params.port });
     options.push_back({ "server",      "       --path PATH",            "path to serve static files from (default: %s)", params.public_path.c_str() });
-    options.push_back({ "server",      "       --embedding(s)",         "enable embedding endpoint (default: %s)", params.embedding ? "enabled" : "disabled" });
+    options.push_back({ "server",      "       --embedding(s)",         "restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled" });
     options.push_back({ "server",      "       --api-key KEY",          "API key to use for authentication (default: none)" });
     options.push_back({ "server",      "       --api-key-file FNAME",   "path to file containing API keys (default: none)" });
     options.push_back({ "server",      "       --ssl-key-file FNAME",   "path to file a PEM-encoded SSL private key" });
@@ -1680,6 +1690,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
                                                                         "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
     options.push_back({ "server",      "-sps,  --slot-prompt-similarity SIMILARITY",
                                                                         "how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity });
+    options.push_back({ "server",      "       --lora-init-without-apply",     "load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"});
 
 #ifndef LOG_DISABLE_LOGS
     options.push_back({ "logging" });
@@ -2065,8 +2076,8 @@ std::string fs_get_cache_file(const std::string & filename) {
 //
 // Model utils
 //
-
-std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
+struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
+    llama_init_result iparams;
     auto mparams = llama_model_params_from_gpt_params(params);
 
     llama_model * model = nullptr;
@@ -2081,7 +2092,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
 
     if (model == NULL) {
         fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
-        return std::make_tuple(nullptr, nullptr);
+        return iparams;
     }
 
     auto cparams = llama_context_params_from_gpt_params(params);
@@ -2090,7 +2101,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
     if (lctx == NULL) {
         fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
         llama_free_model(model);
-        return std::make_tuple(nullptr, nullptr);
+        return iparams;
     }
 
     if (!params.control_vectors.empty()) {
@@ -2101,7 +2112,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         if (cvec.n_embd == -1) {
             llama_free(lctx);
             llama_free_model(model);
-            return std::make_tuple(nullptr, nullptr);
+            return iparams;
         }
 
         int err = llama_control_vector_apply(lctx,
@@ -2113,34 +2124,38 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         if (err) {
             llama_free(lctx);
             llama_free_model(model);
-            return std::make_tuple(nullptr, nullptr);
+            return iparams;
         }
     }
 
-    for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
-        const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
-        float lora_scale = std::get<1>(params.lora_adapter[i]);
-
-        // try to load as gguf
-        auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str());
-        if (adapter == nullptr) {
-            fprintf(stderr, "%s: error: failed to apply lora adapter, trying ggla\n", __func__);
+    // load and optionally apply lora adapters
+    for (auto & la : params.lora_adapters) {
+        llama_lora_adapter_container loaded_la;
+        loaded_la.path = la.path;
+        loaded_la.scale = la.scale;
+        loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
+        if (loaded_la.adapter == nullptr) {
+            fprintf(stderr, "%s: error: failed to load lora adapter '%s', trying ggla\n", __func__, la.path.c_str());
 
             // if that fails, try loading as ggla for compatibility
             int err = llama_model_apply_lora_from_file(model,
-                                                    lora_adapter.c_str(),
-                                                    lora_scale,
+                                                    la.path.c_str(),
+                                                    la.scale,
                                                     nullptr,
                                                     params.n_threads);
             if (err != 0) {
                 fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
                 llama_free(lctx);
                 llama_free_model(model);
-                return std::make_tuple(nullptr, nullptr);
+                return iparams;
+            } else {
+                break;
             }
-        } else {
-            llama_lora_adapter_set(lctx, adapter, lora_scale);
         }
+        iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
+    }
+    if (!params.lora_init_without_apply) {
+        llama_lora_adapters_apply(lctx, iparams.lora_adapters);
     }
 
     if (params.ignore_eos) {
@@ -2174,7 +2189,18 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         llama_reset_timings(lctx);
     }
 
-    return std::make_tuple(model, lctx);
+    iparams.model   = model;
+    iparams.context = lctx;
+    return iparams;
+}
+
+void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters) {
+    llama_lora_adapter_clear(ctx);
+    for (auto & la : lora_adapters) {
+        if (la.scale != 0.0f) {
+            llama_lora_adapter_set(ctx, la.adapter, la.scale);
+        }
+    }
 }
 
 struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {
@@ -3199,19 +3225,18 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
     }
 
     fprintf(stream, "lora:\n");
-    for (std::tuple<std::string, float> la : params.lora_adapter) {
-        if (std::get<1>(la) != 1.0f) {
-            continue;
+    for (auto & la : params.lora_adapters) {
+        if (la.scale == 1.0f) {
+            fprintf(stream, "  - %s\n", la.path.c_str());
         }
-        fprintf(stream, "  - %s\n", std::get<0>(la).c_str());
     }
     fprintf(stream, "lora_scaled:\n");
-    for (std::tuple<std::string, float> la : params.lora_adapter) {
-        if (std::get<1>(la) == 1.0f) {
-            continue;
+    for (auto & la : params.lora_adapters) {
+        if (la.scale != 1.0f) {
+            fprintf(stream, "  - %s: %f\n", la.path.c_str(), la.scale);
         }
-        fprintf(stream, "  - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
     }
+    fprintf(stream, "lora_init_without_apply: %s # default: false\n", params.lora_init_without_apply ? "true" : "false");
     fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
     fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
     fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
diff --git a/llama/common.h b/llama/common.h
index 181c412c..b25dc42e 100644
--- a/llama/common.h
+++ b/llama/common.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
@@ -59,6 +59,15 @@
 
 #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
 
+struct llama_lora_adapter_info {
+    std::string path;
+    float scale;
+};
+
+struct llama_lora_adapter_container : llama_lora_adapter_info {
+    struct llama_lora_adapter * adapter;
+};
+
 // build info
 extern int LLAMA_BUILD_NUMBER;
 extern char const * LLAMA_COMMIT;
@@ -152,8 +161,8 @@ struct gpt_params {
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;
 
-    // TODO: avoid tuple, use struct
-    std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
+    bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply adapters later via llama_lora_adapters_apply)
+    std::vector<llama_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
 
     std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
 
@@ -334,8 +343,13 @@ std::string fs_get_cache_file(const std::string & filename);
 // Model utils
 //
 
-// TODO: avoid tuplue, use struct
-std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params);
+struct llama_init_result {
+    struct llama_model   * model   = nullptr;
+    struct llama_context * context = nullptr;
+    std::vector<llama_lora_adapter_container> lora_adapters;
+};
+
+struct llama_init_result    llama_init_from_gpt_params(gpt_params & params);
 
 struct llama_model_params   llama_model_params_from_gpt_params  (const gpt_params & params);
 struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
@@ -343,6 +357,9 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
 struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
 struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
 
+// clear LoRA adapters from context, then apply new list of adapters
+void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters);
+
 // Batch utils
 
 void llama_batch_clear(struct llama_batch & batch);
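
For downstream callers, migrating off the old tuple return is mechanical: read the named members instead of destructuring. A hedged before/after sketch (variable names are illustrative):

    // before: std::tie(model, ctx) = llama_init_from_gpt_params(params);
    llama_init_result llama_init = llama_init_from_gpt_params(params);
    llama_model   * model = llama_init.model;
    llama_context * ctx   = llama_init.context;
    if (model == nullptr || ctx == nullptr) {
        // load failed: the struct comes back with both pointers null
    }
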
diff --git a/llama/ggml-aarch64.c b/llama/ggml-aarch64.c
index c2189c02..ea751c58 100644
--- a/llama/ggml-aarch64.c
+++ b/llama/ggml-aarch64.c
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
@@ -410,8 +410,8 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(blocklen);
 
 #if defined(__ARM_FEATURE_SVE)
-    if (svcntw() == 8) {
-        GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
+    if (ggml_sve_cnt_b == QK8_0) {
+        GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
                     "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
     }
 #endif
@@ -522,8 +522,8 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(blocklen);
 
 #if defined(__ARM_FEATURE_SVE)
-    if (svcntw() == 8) {
-        GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
+    if (ggml_sve_cnt_b == QK8_0) {
+        GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
                     "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
     }
 #endif
@@ -640,7 +640,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(blocklen);
 
 #if defined(__ARM_FEATURE_SVE) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
-    if (svcntw() == 8) {
+    if (ggml_sve_cnt_b == QK8_0) {
         const void * b_ptr = vx;
         const void * a_ptr = vy;
         float * res_ptr = s;
@@ -706,12 +706,12 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
         return;
     }
     else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
-        GGML_ASSERT((ggml_cpu_has_sve() && (svcntw() == 8)) &&
+        GGML_ASSERT((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
                     "__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
                     "performance");
     }
     else if (ggml_cpu_has_neon()) {
-        GGML_ASSERT(((ggml_cpu_has_sve() && (svcntw() == 8)) || ggml_cpu_has_matmul_int8()) &&
+        GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) || ggml_cpu_has_matmul_int8()) &&
                     "__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
                     "quantization format for optimal performance");
     }
@@ -771,8 +771,8 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(blocklen);
 
 #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
-    if (svcntw() == 8) {
-        GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
+    if (ggml_sve_cnt_b == QK8_0) {
+        GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
                     "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
     }
 #endif
@@ -1292,8 +1292,8 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(blocklen);
 
 #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
-    if (svcntw() == 8) {
-        GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
+    if (ggml_sve_cnt_b == QK8_0) {
+        GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
                     "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
     }
 #endif
@@ -1754,7 +1754,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(blocklen);
 
 #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
-    if (svcntw() == 8) {
+    if (ggml_sve_cnt_b == QK8_0) {
         const void * b_ptr = vx;
         const void * a_ptr = vy;
         float * res_ptr = s;
@@ -2165,12 +2165,12 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
         return;
     }
     else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
-        GGML_ASSERT((ggml_cpu_has_sve() && (svcntw() == 8)) &&
+        GGML_ASSERT((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
                     "__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
                     "performance");
     }
     else if (ggml_cpu_has_neon()) {
-        GGML_ASSERT(((ggml_cpu_has_sve() && (svcntw() == 8)) || ggml_cpu_has_matmul_int8()) &&
+        GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) || ggml_cpu_has_matmul_int8()) &&
                     "__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
                     "quantization format for optimal performance");
     }
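
All of the ggml-aarch64.c hunks make the same substitution: the runtime intrinsic svcntw() (32-bit lanes per SVE register) is replaced by the cached global ggml_sve_cnt_b (register width in bytes, initialized elsewhere), so the vector length is queried once instead of inside every GEMV/GEMM call. Since QK8_0 is 32, the two tests pick out the same hardware:

    // sketch of the equivalence, assuming ggml_sve_cnt_b caches svcntb():
    //   svcntw() == 8 words  <=>  svcntb() == 32 bytes  <=>  256-bit SVE,
    //   and QK8_0 == 32, so (ggml_sve_cnt_b == QK8_0) tests the same condition
    #if defined(__ARM_FEATURE_SVE)
    #include <arm_sve.h>
    static inline bool sve_is_256bit(void) {
        return svcntb() == 32; // bytes per SVE register
    }
    #endif
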
diff --git a/llama/ggml-aarch64.h b/llama/ggml-aarch64.h
index f00fde74..d8f7fd3a 100644
--- a/llama/ggml-aarch64.h
+++ b/llama/ggml-aarch64.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-alloc.c b/llama/ggml-alloc.c
index ca84d2e9..0bb174f4 100644
--- a/llama/ggml-alloc.c
+++ b/llama/ggml-alloc.c
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-alloc.h b/llama/ggml-alloc.h
index 676c9695..8c8cb4eb 100644
--- a/llama/ggml-alloc.h
+++ b/llama/ggml-alloc.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-backend-impl.h b/llama/ggml-backend-impl.h
index c44e5b0f..e3a3d335 100644
--- a/llama/ggml-backend-impl.h
+++ b/llama/ggml-backend-impl.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-backend.c b/llama/ggml-backend.c
index ca846cdb..bfc6e10a 100644
--- a/llama/ggml-backend.c
+++ b/llama/ggml-backend.c
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-backend.h b/llama/ggml-backend.h
index 7950571d..7cf3d939 100644
--- a/llama/ggml-backend.h
+++ b/llama/ggml-backend.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-common.h b/llama/ggml-common.h
index 8ff58bfa..80c80024 100644
--- a/llama/ggml-common.h
+++ b/llama/ggml-common.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda.cu b/llama/ggml-cuda.cu
index a3341229..8b310ae4 100644
--- a/llama/ggml-cuda.cu
+++ b/llama/ggml-cuda.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
@@ -156,7 +156,22 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
     }
     return res;
 #else
+
+#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
+    cudaError_t err;
+    if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr)
+    {
+        err = cudaMallocManaged(ptr, size);
+    }
+    else
+    {
+        err = cudaMalloc(ptr, size);
+    }
+    return err;
+#else
     return cudaMalloc(ptr, size);
+#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
+
 #endif
 }
 
@@ -1516,7 +1531,7 @@ static void ggml_cuda_op_mul_mat(
         }
 
         // If src0 is on a temporary compute buffers (partial offloading) there may be some padding that needs to be cleared:
-        if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) {
+        if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_is_quantized(src0->type) && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) {
             const int64_t nbytes_data    = ggml_row_size(src0->type, (dev[id].row_high - dev[id].row_low)*ne00);
             const int64_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
             CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data , 0, nbytes_padding, stream));
@@ -1915,10 +1930,9 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
 static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer);
 
-    bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16)
+    bool use_dequantize_mul_mat_vec = ggml_cuda_dmmv_type_supported(src0->type)
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
-        && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src0->ne[0] >= GGML_CUDA_DMMV_X*2
-        && src1->ne[1] == 1;
+        && src0->ne[0] % (GGML_CUDA_DMMV_X*2) == 0 && src1->ne[1] == 1;
     bool          use_mul_mat_vec_q =  ggml_is_quantized(src0->type)
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
         && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
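
Three separate fixes land in ggml-cuda.cu above: pure CUDA builds (the new guard excludes HIPBLAS and MUSA) can opt into cudaMallocManaged via the GGML_CUDA_ENABLE_UNIFIED_MEMORY environment variable; the padding memset in ggml_cuda_op_mul_mat is restricted to quantized src0 tensors; and the dmmv dispatch now defers to the ggml_cuda_dmmv_type_supported whitelist (see the dmmv.cu hunks below). For the environment variable, only its presence matters — the value is never inspected (getenv(...) != nullptr). A usage sketch, assuming a POSIX environment:

    // export GGML_CUDA_ENABLE_UNIFIED_MEMORY=1 before launching, or in-process:
    #include <cstdlib>
    int main() {
        setenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY", "1", 1); // before the first device allocation
        // ... then initialize the backend / load the model as usual
    }
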
diff --git a/llama/ggml-cuda.h b/llama/ggml-cuda.h
index fce52bf9..d5616d8b 100644
--- a/llama/ggml-cuda.h
+++ b/llama/ggml-cuda.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/acc.cu b/llama/ggml-cuda/acc.cu
index 0f55c157..cb7eb35a 100644
--- a/llama/ggml-cuda/acc.cu
+++ b/llama/ggml-cuda/acc.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/acc.cuh b/llama/ggml-cuda/acc.cuh
index 519c95c8..126a4d6d 100644
--- a/llama/ggml-cuda/acc.cuh
+++ b/llama/ggml-cuda/acc.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/alibi.cu b/llama/ggml-cuda/alibi.cu
index 35d276b5..1ee61656 100644
--- a/llama/ggml-cuda/alibi.cu
+++ b/llama/ggml-cuda/alibi.cu
@@ -1,3 +1,29 @@
+/**
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023-2024 The ggml authors
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 /**
  * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
  *
diff --git a/llama/ggml-cuda/alibi.cuh b/llama/ggml-cuda/alibi.cuh
index 0d6a3440..98a724a8 100644
--- a/llama/ggml-cuda/alibi.cuh
+++ b/llama/ggml-cuda/alibi.cuh
@@ -1,3 +1,29 @@
+/**
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023-2024 The ggml authors
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 /**
  * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
  *
diff --git a/llama/ggml-cuda/arange.cu b/llama/ggml-cuda/arange.cu
index 514c146e..8f9b86e3 100644
--- a/llama/ggml-cuda/arange.cu
+++ b/llama/ggml-cuda/arange.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/arange.cuh b/llama/ggml-cuda/arange.cuh
index f1d8acc2..10b0a6fc 100644
--- a/llama/ggml-cuda/arange.cuh
+++ b/llama/ggml-cuda/arange.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/argsort.cu b/llama/ggml-cuda/argsort.cu
index 1987e87f..796633c6 100644
--- a/llama/ggml-cuda/argsort.cu
+++ b/llama/ggml-cuda/argsort.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/argsort.cuh b/llama/ggml-cuda/argsort.cuh
index 9189815c..75f434c2 100644
--- a/llama/ggml-cuda/argsort.cuh
+++ b/llama/ggml-cuda/argsort.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/binbcast.cu b/llama/ggml-cuda/binbcast.cu
index df396eb2..c8262f3d 100644
--- a/llama/ggml-cuda/binbcast.cu
+++ b/llama/ggml-cuda/binbcast.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/binbcast.cuh b/llama/ggml-cuda/binbcast.cuh
index e6a48196..74348c78 100644
--- a/llama/ggml-cuda/binbcast.cuh
+++ b/llama/ggml-cuda/binbcast.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/clamp.cu b/llama/ggml-cuda/clamp.cu
index 844cb913..cf0a60fd 100644
--- a/llama/ggml-cuda/clamp.cu
+++ b/llama/ggml-cuda/clamp.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/clamp.cuh b/llama/ggml-cuda/clamp.cuh
index 2d25cb00..c7b6a414 100644
--- a/llama/ggml-cuda/clamp.cuh
+++ b/llama/ggml-cuda/clamp.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/common.cuh b/llama/ggml-cuda/common.cuh
index dbd07204..37079ab6 100644
--- a/llama/ggml-cuda/common.cuh
+++ b/llama/ggml-cuda/common.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
@@ -53,255 +53,11 @@
 #include <vector>
 
 #if defined(GGML_USE_HIPBLAS)
-#include <hip/hip_runtime.h>
-#include <hipblas/hipblas.h>
-#include <hip/hip_fp16.h>
-#ifdef __HIP_PLATFORM_AMD__
-// for rocblas_initialize()
-#include "rocblas/rocblas.h"
-#endif // __HIP_PLATFORM_AMD__
-#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
-#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
-#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
-#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
-#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
-#define CUBLAS_OP_N HIPBLAS_OP_N
-#define CUBLAS_OP_T HIPBLAS_OP_T
-#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
-#define CUBLAS_TF32_TENSOR_OP_MATH 0
-#define CUDA_R_16F  HIPBLAS_R_16F
-#define CUDA_R_32F  HIPBLAS_R_32F
-#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
-#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
-#define cublasCreate hipblasCreate
-#define cublasDestroy hipblasDestroy
-#define cublasGemmEx hipblasGemmEx
-#define cublasGemmBatchedEx hipblasGemmBatchedEx
-#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
-#define cublasHandle_t hipblasHandle_t
-#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
-#define cublasSetStream hipblasSetStream
-#define cublasSgemm hipblasSgemm
-#define cublasStatus_t hipblasStatus_t
-#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
-#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
-#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
-#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
-#define cudaDeviceProp hipDeviceProp_t
-#define cudaDeviceSynchronize hipDeviceSynchronize
-#define cudaError_t hipError_t
-#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
-#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
-#define cudaEventCreateWithFlags hipEventCreateWithFlags
-#define cudaEventDisableTiming hipEventDisableTiming
-#define cudaEventRecord hipEventRecord
-#define cudaEventSynchronize hipEventSynchronize
-#define cudaEvent_t hipEvent_t
-#define cudaEventDestroy hipEventDestroy
-#define cudaFree hipFree
-#define cudaFreeHost hipHostFree
-#define cudaGetDevice hipGetDevice
-#define cudaGetDeviceCount hipGetDeviceCount
-#define cudaGetDeviceProperties hipGetDeviceProperties
-#define cudaGetErrorString hipGetErrorString
-#define cudaGetLastError hipGetLastError
-#define cudaHostRegister hipHostRegister
-#define cudaHostRegisterPortable hipHostRegisterPortable
-#define cudaHostRegisterReadOnly hipHostRegisterReadOnly
-#define cudaHostUnregister hipHostUnregister
-#define cudaLaunchHostFunc hipLaunchHostFunc
-#define cudaMalloc hipMalloc
-#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
-#define cudaMemcpy hipMemcpy
-#define cudaMemcpyAsync hipMemcpyAsync
-#define cudaMemcpyPeerAsync hipMemcpyPeerAsync
-#define cudaMemcpy2DAsync hipMemcpy2DAsync
-#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
-#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
-#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
-#define cudaMemcpyKind hipMemcpyKind
-#define cudaMemset hipMemset
-#define cudaMemsetAsync hipMemsetAsync
-#define cudaMemGetInfo hipMemGetInfo
-#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
-#define cudaSetDevice hipSetDevice
-#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
-#define cudaStreamDestroy hipStreamDestroy
-#define cudaStreamFireAndForget hipStreamFireAndForget
-#define cudaStreamNonBlocking hipStreamNonBlocking
-#define cudaStreamPerThread hipStreamPerThread
-#define cudaStreamSynchronize hipStreamSynchronize
-#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
-#define cudaStream_t hipStream_t
-#define cudaSuccess hipSuccess
-#define __trap() do { abort(); __builtin_unreachable(); } while(0)
-#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
-#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
-#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED
-#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE
-#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH
-#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR
-#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
-#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
-#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
+#include "vendors/hip.h"
 #elif defined(GGML_USE_MUSA)
-#include <musa_runtime.h>
-#include <musa.h>
-#include <mublas.h>
-#include <musa_fp16.h>
-// XXX: Keep the following order the same as hipBLAS
-// #define CUBLAS_COMPUTE_16F MUBLAS_COMPUTE_16F
-// #define CUBLAS_COMPUTE_32F MUBLAS_COMPUTE_32F
-#define CUBLAS_COMPUTE_32F_FAST_16F MUBLAS_COMPUTE_32F_FAST_16F
-#define CUBLAS_GEMM_DEFAULT MUBLAS_GEMM_DEFAULT
-#define CUBLAS_GEMM_DEFAULT_TENSOR_OP MUBLAS_GEMM_DEFAULT
-#define CUBLAS_OP_N MUBLAS_OP_N
-#define CUBLAS_OP_T MUBLAS_OP_T
-#define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS
-// #define CUBLAS_TF32_TENSOR_OP_MATH 0
-#define CUDA_R_16F  MUSA_R_16F
-#define CUDA_R_32F  MUSA_R_32F
-// #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
-// #define cublasComputeType_t mublasComputeType_t
-#define cublasCreate mublasCreate
-#define cublasDestroy mublasDestroy
-#define cublasGemmEx mublasGemmEx
-#define cublasGemmBatchedEx mublasGemmBatchedEx
-#define cublasGemmStridedBatchedEx mublasGemmStridedBatchedEx
-#define cublasHandle_t mublasHandle_t
-// #define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
-#define cublasSetMathMode mublasSetMathMode
-#define cublasSetStream mublasSetStream
-#define cublasSgemm mublasSgemm
-#define cublasStatus_t mublasStatus_t
-#define cudaDataType_t musaDataType_t //deprecated, new hipblasDatatype not in 5.6
-#define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer
-#define cudaDeviceDisablePeerAccess musaDeviceDisablePeerAccess
-#define cudaDeviceEnablePeerAccess musaDeviceEnablePeerAccess
-#define cudaDeviceProp musaDeviceProp
-#define cudaDeviceSynchronize musaDeviceSynchronize
-#define cudaError_t musaError_t
-#define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled
-#define cudaErrorPeerAccessNotEnabled musaErrorPeerAccessNotEnabled
-#define cudaEventCreateWithFlags musaEventCreateWithFlags
-#define cudaEventDisableTiming musaEventDisableTiming
-#define cudaEventRecord musaEventRecord
-#define cudaEventSynchronize musaEventSynchronize
-#define cudaEvent_t musaEvent_t
-#define cudaEventDestroy musaEventDestroy
-#define cudaFree musaFree
-#define cudaFreeHost musaFreeHost
-#define cudaGetDevice musaGetDevice
-#define cudaGetDeviceCount musaGetDeviceCount
-#define cudaGetDeviceProperties musaGetDeviceProperties
-#define cudaGetErrorString musaGetErrorString
-#define cudaGetLastError musaGetLastError
-#define cudaHostRegister musaHostRegister
-#define cudaHostRegisterPortable musaHostRegisterPortable
-#define cudaHostRegisterReadOnly musaHostRegisterReadOnly
-#define cudaHostUnregister musaHostUnregister
-#define cudaLaunchHostFunc musaLaunchHostFunc
-#define cudaMalloc musaMalloc
-#define cudaMallocHost musaMallocHost
-#define cudaMemcpy musaMemcpy
-#define cudaMemcpyAsync musaMemcpyAsync
-#define cudaMemcpyPeerAsync musaMemcpyPeerAsync
-#define cudaMemcpy2DAsync musaMemcpy2DAsync
-#define cudaMemcpyDeviceToDevice musaMemcpyDeviceToDevice
-#define cudaMemcpyDeviceToHost musaMemcpyDeviceToHost
-#define cudaMemcpyHostToDevice musaMemcpyHostToDevice
-#define cudaMemcpyKind musaMemcpyKind
-#define cudaMemset musaMemset
-#define cudaMemsetAsync musaMemsetAsync
-#define cudaMemGetInfo musaMemGetInfo
-#define cudaOccupancyMaxPotentialBlockSize musaOccupancyMaxPotentialBlockSize
-#define cudaSetDevice musaSetDevice
-#define cudaStreamCreateWithFlags musaStreamCreateWithFlags
-#define cudaStreamDestroy musaStreamDestroy
-#define cudaStreamFireAndForget musaStreamFireAndForget
-#define cudaStreamNonBlocking musaStreamNonBlocking
-#define cudaStreamPerThread musaStreamPerThread
-#define cudaStreamSynchronize musaStreamSynchronize
-#define cudaStreamWaitEvent musaStreamWaitEvent
-#define cudaStream_t musaStream_t
-#define cudaSuccess musaSuccess
-
-// XXX: Other CUDA => MUSA mapping
-#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE MU_MEM_ACCESS_FLAGS_PROT_READWRITE
-#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED MU_MEM_ALLOC_GRANULARITY_RECOMMENDED
-#define CU_MEM_ALLOCATION_TYPE_PINNED MU_MEM_ALLOCATION_TYPE_PINNED
-#define CU_MEM_LOCATION_TYPE_DEVICE MU_MEM_LOCATION_TYPE_DEVICE
-#define CUdevice MUdevice
-#define CUdeviceptr MUdeviceptr
-#define CUmemAccessDesc MUmemAccessDesc
-#define CUmemAllocationProp MUmemAllocationProp
-#define CUmemGenericAllocationHandle MUmemGenericAllocationHandle
-#define cuDeviceGet muDeviceGet
-#define cuDeviceGetAttribute muDeviceGetAttribute
-#define cuMemAddressFree muMemAddressFree
-#define cuMemAddressReserve muMemAddressReserve
-#define cuMemCreate muMemCreate
-#define cuMemGetAllocationGranularity muMemGetAllocationGranularity
-#define cuMemMap muMemMap
-#define cuMemRelease muMemRelease
-#define cuMemSetAccess muMemSetAccess
-#define cuMemUnmap muMemUnmap
-#define cudaFuncAttributeMaxDynamicSharedMemorySize musaFuncAttributeMaxDynamicSharedMemorySize
-#define cudaFuncSetAttribute musaFuncSetAttribute
-#define cudaMemcpy3DPeerParms musaMemcpy3DPeerParms
-#define make_cudaExtent make_musaExtent
-#define make_cudaPitchedPtr make_musaPitchedPtr
-
-// XXX: USE_CUDA_GRAPH
-#define CUDA_SUCCESS MUSA_SUCCESS
-#define CUresult MUresult
-#define cuGetErrorString muGetErrorString
-#define cudaErrorGraphExecUpdateFailure musaErrorGraphExecUpdateFailure
-#define cudaErrorInvalidDeviceFunction musaErrorInvalidDeviceFunction
-#define cudaGraphDestroy musaGraphDestroy
-#define cudaGraphExecDestroy musaGraphExecDestroy
-#define cudaGraphExec_t musaGraphExec_t
-#define cudaGraphExecUpdate musaGraphExecUpdate
-#define cudaGraphExecUpdateResultInfo musaGraphExecUpdateResult
-#define cudaGraphGetNodes musaGraphGetNodes
-#define cudaGraphInstantiate musaGraphInstantiate
-#define cudaGraphKernelNodeGetParams musaGraphKernelNodeGetParams
-#define cudaGraphKernelNodeSetParams musaGraphKernelNodeSetParams
-#define cudaGraphLaunch musaGraphLaunch
-#define cudaGraphNodeGetType musaGraphNodeGetType
-#define cudaGraphNode_t musaGraphNode_t
-#define cudaGraphNodeType musaGraphNodeType
-#define cudaGraphNodeTypeKernel musaGraphNodeTypeKernel
-#define cudaGraph_t musaGraph_t
-#define cudaKernelNodeParams musaKernelNodeParams
-#define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed
-#define cudaStreamEndCapture musaStreamEndCapture
-
-// XXX: cuBLAS => muBLAS mapping
-#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED MU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
-#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_MATH_MODE_DEFAULT
-#define CUBLAS_COMPUTE_16F CUDA_R_16F
-#define CUBLAS_COMPUTE_32F CUDA_R_32F
-#define cublasComputeType_t cudaDataType_t
-
-// XXX: Clang builtins mapping
-#define __vsub4   __vsub4_musa
-#define __vcmpeq4 __vcmpeq4_musa
-#define __vcmpne4 __vcmpne4_musa
+#include "vendors/musa.h"
 #else
-#include <cuda_runtime.h>
-#include <cuda.h>
-#include <cublas_v2.h>
-#include <cuda_fp16.h>
-
-#if CUDART_VERSION < 11020
-#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
-#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
-#define CUBLAS_COMPUTE_16F CUDA_R_16F
-#define CUBLAS_COMPUTE_32F CUDA_R_32F
-#define cublasComputeType_t cudaDataType_t
-#endif // CUDART_VERSION < 11020
-
+#include "vendors/cuda.h"
 #endif // defined(GGML_USE_HIPBLAS)
 
 #define STRINGIZE_IMPL(...) #__VA_ARGS__
@@ -344,11 +100,7 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in
 
 #if CUDART_VERSION >= 12000 || defined(GGML_USE_MUSA)
     static const char * cublas_get_error_str(const cublasStatus_t err) {
-#ifndef GGML_USE_MUSA
         return cublasGetStatusString(err);
-#else
-        return mublasStatus_to_string(err);
-#endif // GGML_USE_MUSA
     }
 #else
     static const char * cublas_get_error_str(const cublasStatus_t err) {
@@ -390,129 +142,7 @@ typedef half2 dfloat2;
 #else
 typedef float dfloat; // dequantize float
 typedef float2 dfloat2;
-#endif //GGML_CUDA_F16
-
-#if defined(GGML_USE_MUSA)
-#ifndef __has_builtin
-    #define __has_builtin(x) 0
-#endif
-
-typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
-
-static __device__ __forceinline__ int __vsub4_musa(const int a, const int b) {
-    return __vsubss4(a, b);
-}
-
-static __device__ __forceinline__ unsigned int __vcmpeq4_musa(unsigned int a, unsigned int b) {
-    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
-    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
-    unsigned int c;
-    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
-#pragma unroll
-    for (int i = 0; i < 4; ++i) {
-        vc[i] = va[i] == vb[i] ? 0xff : 0x00;
-    }
-    return c;
-}
-
-static __device__ __forceinline__ unsigned int __vcmpne4_musa(unsigned int a, unsigned int b) {
-    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
-    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
-    unsigned int c;
-    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
-#pragma unroll
-    for (int i = 0; i < 4; ++i) {
-        vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
-    }
-    return c;
-}
-#endif // defined(GGML_USE_MUSA)
-
-#if defined(GGML_USE_HIPBLAS)
-#define __CUDA_ARCH__ 1300
-
-#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
-    defined(__gfx1150__) || defined(__gfx1151__)
-#define RDNA3
-#endif
-
-#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
-    defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
-#define RDNA2
-#endif
-
-#if defined(__gfx1010__) || defined(__gfx1012__)
-#define RDNA1
-#endif
-
-#ifndef __has_builtin
-    #define __has_builtin(x) 0
-#endif
-
-typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
-typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
-static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
-    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
-    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
-#if __has_builtin(__builtin_elementwise_sub_sat)
-    const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
-    return reinterpret_cast<const int &>(c);
-#else
-    int8x4_t c;
-    int16_t tmp;
-#pragma unroll
-    for (int i = 0; i < 4; i++) {
-        tmp = va[i] - vb[i];
-        if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
-        if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
-        c[i] = tmp;
-    }
-    return reinterpret_cast<int &>(c);
-#endif // __has_builtin(__builtin_elementwise_sub_sat)
-}
-
-static __device__ __forceinline__ int __vsub4(const int a, const int b) {
-    return __vsubss4(a, b);
-}
-
-static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigned int b) {
-    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
-    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
-    unsigned int c;
-    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
-#pragma unroll
-    for (int i = 0; i < 4; ++i) {
-        vc[i] = va[i] == vb[i] ? 0xff : 0x00;
-    }
-    return c;
-}
-
-static __device__ __forceinline__ unsigned int __vcmpne4(unsigned int a, unsigned int b) {
-    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
-    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
-    unsigned int c;
-    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
-#pragma unroll
-    for (int i = 0; i < 4; ++i) {
-        vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
-    }
-    return c;
-}
-
-#if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
-// __shfl_xor() for half2 was added in ROCm 5.6
-static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int width) {
-    typedef union half2_b32 {
-        half2 val;
-        int   b32;
-    } half2_b32_t;
-    half2_b32_t tmp;
-    tmp.val = var;
-    tmp.b32 = __shfl_xor(tmp.b32, laneMask, width);
-    return tmp.val;
-}
-#endif // defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
-#endif // defined(GGML_USE_HIPBLAS)
+#endif // GGML_CUDA_F16
 
 #if (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
 #define FP16_AVAILABLE
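
The common.cuh hunks are a pure relocation: the HIP, MUSA, and CUDA compatibility defines and device shims move into per-vendor headers under llama/ggml-cuda/vendors/, which this diff includes but does not show. The MUSA special case in cublas_get_error_str disappears for presumably the same reason (the mublasStatus_to_string mapping now lives in vendors/musa.h). Inferred shape of the CUDA header, reconstructed from the block removed above:

    // vendors/cuda.h (presumed contents, not part of this diff)
    #include <cuda_runtime.h>
    #include <cuda.h>
    #include <cublas_v2.h>
    #include <cuda_fp16.h>

    #if CUDART_VERSION < 11020
    #define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
    #define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
    #define CUBLAS_COMPUTE_16F CUDA_R_16F
    #define CUBLAS_COMPUTE_32F CUDA_R_32F
    #define cublasComputeType_t cudaDataType_t
    #endif // CUDART_VERSION < 11020
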
diff --git a/llama/ggml-cuda/concat.cu b/llama/ggml-cuda/concat.cu
index e77a1c44..c73b3311 100644
--- a/llama/ggml-cuda/concat.cu
+++ b/llama/ggml-cuda/concat.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/concat.cuh b/llama/ggml-cuda/concat.cuh
index f2010440..5869b398 100644
--- a/llama/ggml-cuda/concat.cuh
+++ b/llama/ggml-cuda/concat.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/conv-transpose-1d.cu b/llama/ggml-cuda/conv-transpose-1d.cu
index 0117a6b7..4124f559 100644
--- a/llama/ggml-cuda/conv-transpose-1d.cu
+++ b/llama/ggml-cuda/conv-transpose-1d.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/conv-transpose-1d.cuh b/llama/ggml-cuda/conv-transpose-1d.cuh
index 90ed15d0..41990dde 100644
--- a/llama/ggml-cuda/conv-transpose-1d.cuh
+++ b/llama/ggml-cuda/conv-transpose-1d.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/convert.cu b/llama/ggml-cuda/convert.cu
index 44a18e53..52d27ea3 100644
--- a/llama/ggml-cuda/convert.cu
+++ b/llama/ggml-cuda/convert.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/convert.cuh b/llama/ggml-cuda/convert.cuh
index a72f0206..40e15280 100644
--- a/llama/ggml-cuda/convert.cuh
+++ b/llama/ggml-cuda/convert.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/cpy.cu b/llama/ggml-cuda/cpy.cu
index d5024659..a4ee847b 100644
--- a/llama/ggml-cuda/cpy.cu
+++ b/llama/ggml-cuda/cpy.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/cpy.cuh b/llama/ggml-cuda/cpy.cuh
index 9907eb3e..96400051 100644
--- a/llama/ggml-cuda/cpy.cuh
+++ b/llama/ggml-cuda/cpy.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/dequantize.cuh b/llama/ggml-cuda/dequantize.cuh
index 4baf3f59..60f0b66e 100644
--- a/llama/ggml-cuda/dequantize.cuh
+++ b/llama/ggml-cuda/dequantize.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/diagmask.cu b/llama/ggml-cuda/diagmask.cu
index 14dbb972..1a6bcbc9 100644
--- a/llama/ggml-cuda/diagmask.cu
+++ b/llama/ggml-cuda/diagmask.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/diagmask.cuh b/llama/ggml-cuda/diagmask.cuh
index 1ec8e9ba..34f76fe2 100644
--- a/llama/ggml-cuda/diagmask.cuh
+++ b/llama/ggml-cuda/diagmask.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/dmmv.cu b/llama/ggml-cuda/dmmv.cu
index feb9bf80..a606493d 100644
--- a/llama/ggml-cuda/dmmv.cu
+++ b/llama/ggml-cuda/dmmv.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
@@ -526,7 +526,7 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
 }
 
 static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
     const dim3 block_nums(block_num_y, 1, 1);
@@ -536,7 +536,7 @@ static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y,
 }
 
 static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -545,7 +545,7 @@ static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y,
 }
 
 static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -554,7 +554,7 @@ static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y,
 }
 
 static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -563,7 +563,7 @@ static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y,
 }
 
 static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -614,7 +614,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
 }
 
 static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -698,3 +698,12 @@ void ggml_cuda_op_dequantize_mul_mat_vec(
     GGML_UNUSED(src1_ncols);
     GGML_UNUSED(src1_padded_row_size);
 }
+
+bool ggml_cuda_dmmv_type_supported(ggml_type src0_type) {
+    return src0_type == GGML_TYPE_Q4_0 || src0_type == GGML_TYPE_Q4_1 ||
+        src0_type == GGML_TYPE_Q5_0 || src0_type == GGML_TYPE_Q5_1 ||
+        src0_type == GGML_TYPE_Q8_0 || src0_type == GGML_TYPE_Q2_K ||
+        src0_type == GGML_TYPE_Q3_K || src0_type == GGML_TYPE_Q4_K ||
+        src0_type == GGML_TYPE_Q5_K || src0_type == GGML_TYPE_Q6_K ||
+        src0_type == GGML_TYPE_F16;
+}
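
Two coordinated changes in dmmv.cu: the launchers now require ncols to be a multiple of GGML_CUDA_DMMV_X*2, matching the kernel's two-values-per-step dequantization, and the new ggml_cuda_dmmv_type_supported helper gives the dispatcher an explicit whitelist in place of the old ggml_is_quantized(...) || F16 test, which also admitted quantized types the kernel has no path for (e.g. the IQ formats). With the default GGML_CUDA_DMMV_X of 32, the dmmv path now needs column counts divisible by 64. An illustrative predicate mirroring the dispatch conditions in the ggml-cuda.cu hunk above:

    // sketch of the conditions ggml_cuda_mul_mat now checks before taking dmmv
    static bool can_use_dmmv(const ggml_tensor * src0, const ggml_tensor * src1,
                             const ggml_tensor * dst) {
        return ggml_cuda_dmmv_type_supported(src0->type)
            && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
            && src0->ne[0] % (GGML_CUDA_DMMV_X*2) == 0 && src1->ne[1] == 1;
    }
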
diff --git a/llama/ggml-cuda/dmmv.cuh b/llama/ggml-cuda/dmmv.cuh
index be2b3fa6..c68aa1d1 100644
--- a/llama/ggml-cuda/dmmv.cuh
+++ b/llama/ggml-cuda/dmmv.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
@@ -42,3 +42,5 @@ void ggml_cuda_op_dequantize_mul_mat_vec(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
     const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
     const int64_t src1_padded_row_size, cudaStream_t stream);
+
+bool ggml_cuda_dmmv_type_supported(ggml_type src0_type);
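
[Note] The declaration added above exposes the new dmmv capability probe to dispatch code, so a caller can check whether src0's quantization type has an instantiated dmmv kernel before committing to that path. A hedged caller-side sketch (the use_dmmv wrapper is hypothetical; only ggml_cuda_dmmv_type_supported and GGML_CUDA_DMMV_X come from the vendored sources):

    #include "ggml-cuda/common.cuh"
    #include "ggml-cuda/dmmv.cuh"

    // Hypothetical dispatch helper: a tensor qualifies for the dmmv path only
    // if its type has a kernel and its column count satisfies the new asserts.
    static bool use_dmmv(ggml_type src0_type, int64_t ne00) {
        return ggml_cuda_dmmv_type_supported(src0_type) &&
               ne00 % (GGML_CUDA_DMMV_X*2) == 0;
    }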
diff --git a/llama/ggml-cuda/fattn-common.cuh b/llama/ggml-cuda/fattn-common.cuh
index cba14ae2..ac1937a9 100644
--- a/llama/ggml-cuda/fattn-common.cuh
+++ b/llama/ggml-cuda/fattn-common.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/fattn-tile-f16.cu b/llama/ggml-cuda/fattn-tile-f16.cu
index a4fc2127..4718d05e 100644
--- a/llama/ggml-cuda/fattn-tile-f16.cu
+++ b/llama/ggml-cuda/fattn-tile-f16.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/fattn-tile-f16.cuh b/llama/ggml-cuda/fattn-tile-f16.cuh
index c48c863d..fd9d0e22 100644
--- a/llama/ggml-cuda/fattn-tile-f16.cuh
+++ b/llama/ggml-cuda/fattn-tile-f16.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/fattn-tile-f32.cu b/llama/ggml-cuda/fattn-tile-f32.cu
index 49c1ec56..7c8b975c 100644
--- a/llama/ggml-cuda/fattn-tile-f32.cu
+++ b/llama/ggml-cuda/fattn-tile-f32.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/fattn-tile-f32.cuh b/llama/ggml-cuda/fattn-tile-f32.cuh
index 87c48525..67ae64e7 100644
--- a/llama/ggml-cuda/fattn-tile-f32.cuh
+++ b/llama/ggml-cuda/fattn-tile-f32.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/fattn-vec-f16.cuh b/llama/ggml-cuda/fattn-vec-f16.cuh
index 496535c1..b89d7200 100644
--- a/llama/ggml-cuda/fattn-vec-f16.cuh
+++ b/llama/ggml-cuda/fattn-vec-f16.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/fattn-vec-f32.cuh b/llama/ggml-cuda/fattn-vec-f32.cuh
index 1517ac72..143d1a28 100644
--- a/llama/ggml-cuda/fattn-vec-f32.cuh
+++ b/llama/ggml-cuda/fattn-vec-f32.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/fattn-wmma-f16.cuh b/llama/ggml-cuda/fattn-wmma-f16.cuh
index ce74f71d..1b5701bd 100644
--- a/llama/ggml-cuda/fattn-wmma-f16.cuh
+++ b/llama/ggml-cuda/fattn-wmma-f16.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/fattn.cu b/llama/ggml-cuda/fattn.cu
index 511e19d4..dc0652d1 100644
--- a/llama/ggml-cuda/fattn.cu
+++ b/llama/ggml-cuda/fattn.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/fattn.cuh b/llama/ggml-cuda/fattn.cuh
index e04eefbc..fab654c7 100644
--- a/llama/ggml-cuda/fattn.cuh
+++ b/llama/ggml-cuda/fattn.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/getrows.cu b/llama/ggml-cuda/getrows.cu
index 87b09d8b..a6c96599 100644
--- a/llama/ggml-cuda/getrows.cu
+++ b/llama/ggml-cuda/getrows.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/getrows.cuh b/llama/ggml-cuda/getrows.cuh
index 0700d3a6..3173d5ce 100644
--- a/llama/ggml-cuda/getrows.cuh
+++ b/llama/ggml-cuda/getrows.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/im2col.cu b/llama/ggml-cuda/im2col.cu
index 574e641b..3ee4b485 100644
--- a/llama/ggml-cuda/im2col.cu
+++ b/llama/ggml-cuda/im2col.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/im2col.cuh b/llama/ggml-cuda/im2col.cuh
index ca3d91f0..592125e1 100644
--- a/llama/ggml-cuda/im2col.cuh
+++ b/llama/ggml-cuda/im2col.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/mma.cuh b/llama/ggml-cuda/mma.cuh
index 2e7fff79..552e5082 100644
--- a/llama/ggml-cuda/mma.cuh
+++ b/llama/ggml-cuda/mma.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/mmq.cu b/llama/ggml-cuda/mmq.cu
index a5046bf1..2ad38cfd 100644
--- a/llama/ggml-cuda/mmq.cu
+++ b/llama/ggml-cuda/mmq.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/mmq.cuh b/llama/ggml-cuda/mmq.cuh
index ab18ee1f..fc6ff807 100644
--- a/llama/ggml-cuda/mmq.cuh
+++ b/llama/ggml-cuda/mmq.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/mmvq.cu b/llama/ggml-cuda/mmvq.cu
index f693109a..d6ffc0ec 100644
--- a/llama/ggml-cuda/mmvq.cu
+++ b/llama/ggml-cuda/mmvq.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/mmvq.cuh b/llama/ggml-cuda/mmvq.cuh
index c76123b1..5d311870 100644
--- a/llama/ggml-cuda/mmvq.cuh
+++ b/llama/ggml-cuda/mmvq.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/norm.cu b/llama/ggml-cuda/norm.cu
index f27c597f..dafbaf3f 100644
--- a/llama/ggml-cuda/norm.cu
+++ b/llama/ggml-cuda/norm.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
@@ -168,8 +168,7 @@ static void norm_f32_cuda(const float * x, float * dst, const int ncols, const i
     }
 }
 
-static void group_norm_f32_cuda(const float * x, float * dst, const int num_groups, const int group_size, const int ne_elements, cudaStream_t stream) {
-    static const float eps = 1e-6f;
+static void group_norm_f32_cuda(const float * x, float * dst, const int num_groups, const float eps, const int group_size, const int ne_elements, cudaStream_t stream) {
     if (group_size < 1024) {
         const dim3 block_dims(WARP_SIZE, 1, 1);
         group_norm_f32<WARP_SIZE><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
@@ -222,8 +221,12 @@ void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
     int num_groups = dst->op_params[0];
+
+    float eps;
+    memcpy(&eps, dst->op_params + 1, sizeof(float));
+
     int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
-    group_norm_f32_cuda(src0_d, dst_d, num_groups * src0->ne[3], group_size, ggml_nelements(src0), stream);
+    group_norm_f32_cuda(src0_d, dst_d, num_groups * src0->ne[3], eps, group_size, ggml_nelements(src0), stream);
 }
 
 void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
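
[Note] With these hunks, group_norm_f32_cuda stops hard-coding eps = 1e-6f and instead receives the epsilon stored on the op itself: op_params[0] holds the group count as a plain int32, and the float epsilon is bit-copied into the next int32 slot, which is why the op reads it back with memcpy rather than a cast (a cast would convert the integer value instead of reinterpreting the bits). A standalone sketch of that round trip (illustrative; the producer side lives in ggml's graph-building code, not in this file):

    #include <cstdint>
    #include <cstring>
    #include <cstdio>

    int main() {
        int32_t op_params[2];
        op_params[0] = 4;                              // num_groups, read directly
        const float eps_in = 1e-6f;
        memcpy(&op_params[1], &eps_in, sizeof(float)); // producer: bit-copy the float

        float eps_out;
        memcpy(&eps_out, op_params + 1, sizeof(float)); // consumer: as in the hunk above
        printf("num_groups=%d eps=%g\n", op_params[0], eps_out);
        return 0;
    }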
diff --git a/llama/ggml-cuda/norm.cuh b/llama/ggml-cuda/norm.cuh
index cd20016a..9584aab0 100644
--- a/llama/ggml-cuda/norm.cuh
+++ b/llama/ggml-cuda/norm.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/pad.cu b/llama/ggml-cuda/pad.cu
index 38abb23e..7ddbe97c 100644
--- a/llama/ggml-cuda/pad.cu
+++ b/llama/ggml-cuda/pad.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/pad.cuh b/llama/ggml-cuda/pad.cuh
index 33b5f1b6..79a760e4 100644
--- a/llama/ggml-cuda/pad.cuh
+++ b/llama/ggml-cuda/pad.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/pool2d.cu b/llama/ggml-cuda/pool2d.cu
index f14bdd35..6fe947b5 100644
--- a/llama/ggml-cuda/pool2d.cu
+++ b/llama/ggml-cuda/pool2d.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/pool2d.cuh b/llama/ggml-cuda/pool2d.cuh
index 3a680462..b24f68bb 100644
--- a/llama/ggml-cuda/pool2d.cuh
+++ b/llama/ggml-cuda/pool2d.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/quantize.cu b/llama/ggml-cuda/quantize.cu
index 6c5b6f9f..3e310e97 100644
--- a/llama/ggml-cuda/quantize.cu
+++ b/llama/ggml-cuda/quantize.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/quantize.cuh b/llama/ggml-cuda/quantize.cuh
index f533e30e..4c046dd8 100644
--- a/llama/ggml-cuda/quantize.cuh
+++ b/llama/ggml-cuda/quantize.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/rope.cu b/llama/ggml-cuda/rope.cu
index 5046697c..11f0d459 100644
--- a/llama/ggml-cuda/rope.cu
+++ b/llama/ggml-cuda/rope.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/rope.cuh b/llama/ggml-cuda/rope.cuh
index aa34b1df..1f3b1c0b 100644
--- a/llama/ggml-cuda/rope.cuh
+++ b/llama/ggml-cuda/rope.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/scale.cu b/llama/ggml-cuda/scale.cu
index e2d849e0..41b72f6e 100644
--- a/llama/ggml-cuda/scale.cu
+++ b/llama/ggml-cuda/scale.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/scale.cuh b/llama/ggml-cuda/scale.cuh
index 4c0dc83f..508f8a11 100644
--- a/llama/ggml-cuda/scale.cuh
+++ b/llama/ggml-cuda/scale.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/softmax.cu b/llama/ggml-cuda/softmax.cu
index db94d7de..5793a3d0 100644
--- a/llama/ggml-cuda/softmax.cu
+++ b/llama/ggml-cuda/softmax.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/softmax.cuh b/llama/ggml-cuda/softmax.cuh
index ac4e2914..08f99632 100644
--- a/llama/ggml-cuda/softmax.cuh
+++ b/llama/ggml-cuda/softmax.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/sumrows.cu b/llama/ggml-cuda/sumrows.cu
index a6b8f720..4c55bdcb 100644
--- a/llama/ggml-cuda/sumrows.cu
+++ b/llama/ggml-cuda/sumrows.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/sumrows.cuh b/llama/ggml-cuda/sumrows.cuh
index 9b8c9cd6..b3b5cd3f 100644
--- a/llama/ggml-cuda/sumrows.cuh
+++ b/llama/ggml-cuda/sumrows.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu
index 05196989..7b9acc47 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu
index fd02735b..6d2f5b2b 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu
index 5fdcd8e4..036e25a4 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu
index e032d0b3..cdc846c2 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu
index 6c89d944..42b9553e 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu
index b5326ec7..0d79f2ad 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu
index c654b9d9..73e0683c 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu
index 3eeed729..e21dc472 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu
index 4c8b8e7e..4307b7ef 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu
index ed93bda8..8782139f 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu
index dd7a6ed9..c01dc3d5 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu
index f13cbabb..2c39ee12 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu
index c50660d2..ffdb9752 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu
index a32ba4e0..48566bc0 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu
index 117c686d..adb54260 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu
index 83b169e4..6ba20855 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu
index 44883202..e7fa59e8 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu
index ea964906..e3b85fa4 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu
index 488ff9a6..308dbaaa 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu
index 1a0449a2..2a804981 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu
index b1a2723e..ce1a7ab9 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu
index 74f18b63..42549ea2 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu
index d6350bec..a211f509 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu
index 5ecc0c48..724cf117 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu
index 641d6a04..92e6c590 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu
index 7615d691..b53d0b40 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu
index c5755ff3..0e37890a 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu
index 375f370e..725904ab 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu
index 555eba19..283daffd 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu
index 29982a4c..0a4b699b 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu
index cd8b538a..5f00fac6 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu
index a102886a..03e828b4 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu
index 700c84a8..c0ba74e4 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu
index acf305d4..7c355e7e 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu
index c29b8262..e589492c 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu
index 5b96efad..dc25793c 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu
index 2c4a76ad..7c1b8672 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu
index 6a4a424c..5692cf32 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu
index 949cba5f..3a6d82a4 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu
index 7e360e14..824a9077 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu
index afb4d80e..0f8bb517 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu
index aa39aefb..0677d0a0 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu
index 78bc0019..ae606891 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu
index 35f772f1..224235f8 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu
index 6afb111e..32a2a026 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu
index 03a69b8c..0aae0de5 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu
index 59ad9cd8..4254ae4d 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu
index cd84c81e..4c70e71f 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu
index 6ef8b30f..a90d0f09 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu
index 1cdf9601..75a6b6dc 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu
index 092b6757..387046e6 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu
index 5fd20888..b90165a8 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu
index 7fd85f46..c9ab4302 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu
index 39d5f402..05786903 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu
index 5dd34807..3ae05fe7 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu
index 8fa2a892..98738e83 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu
index 74a935f6..2a4b87ea 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu
index 9c336952..97867cd0 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu
index c1691913..4885065f 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu
index ddb6f5c4..6be68bd4 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu
index 460e0501..eccf3e84 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu
index 788346ed..3400e197 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu
index dfb2a12d..60061afb 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu
index 4b9848d5..4b1a250c 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu
index 141a3c0d..03c09c1e 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu
index 5e9736b8..68d9cb38 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu
index 6027c480..df587ef4 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu
index d766d427..161acbb9 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu
index 3af17ada..5f262c61 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu
index 28ce6f86..90327e26 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu
index 5dc4609e..cb67631e 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu
index bd97d45e..e79cac28 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu
index 7d0b363e..12a2e0ab 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu
index 92ee4c0f..3a83cddc 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu
index 51fde074..b902e3dc 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu
index 235e3872..aaba5d45 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu
index dc3715d7..785200ed 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu
index a5b4241f..cbbf0726 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu
index 9a2fe54a..2e902550 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu
index 5d8153e2..3f24e03c 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu
index 73102eaf..38f42660 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu
index 2f1a60bc..4850c544 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu
index 5c2395be..10a82b72 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu
index e038d84f..ad160596 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu
index 832789fa..ba59ca88 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu
index c5b27e37..41fc980a 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu b/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu
index 2f34c8fb..0fdb4990 100644
--- a/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu
+++ b/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu b/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu
index f443658e..b48eb117 100644
--- a/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu
+++ b/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu b/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu
index 3e1304de..263d0428 100644
--- a/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu
+++ b/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu b/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu
index d7c6d597..e2a0252d 100644
--- a/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu
+++ b/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu b/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu
index 6bc3dc3f..e477e099 100644
--- a/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu
+++ b/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/mmq-instance-iq1_s.cu b/llama/ggml-cuda/template-instances/mmq-instance-iq1_s.cu
index 7b484e65..f6be840f 100644
--- a/llama/ggml-cuda/template-instances/mmq-instance-iq1_s.cu
+++ b/llama/ggml-cuda/template-instances/mmq-instance-iq1_s.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/mmq-instance-iq2_s.cu b/llama/ggml-cuda/template-instances/mmq-instance-iq2_s.cu
index 445791db..5e1a54bc 100644
--- a/llama/ggml-cuda/template-instances/mmq-instance-iq2_s.cu
+++ b/llama/ggml-cuda/template-instances/mmq-instance-iq2_s.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu b/llama/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu
index 4f7eb4ba..a2c15684 100644
--- a/llama/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu
+++ b/llama/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu b/llama/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu
index bb1a3adb..c255311c 100644
--- a/llama/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu
+++ b/llama/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/mmq-instance-iq3_s.cu b/llama/ggml-cuda/template-instances/mmq-instance-iq3_s.cu
index 01affe46..192de6e6 100644
--- a/llama/ggml-cuda/template-instances/mmq-instance-iq3_s.cu
+++ b/llama/ggml-cuda/template-instances/mmq-instance-iq3_s.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu b/llama/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu
index badd19cf..bdf93bd6 100644
--- a/llama/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu
+++ b/llama/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/mmq-instance-iq4_nl.cu b/llama/ggml-cuda/template-instances/mmq-instance-iq4_nl.cu
index e79360f9..d72ed1e6 100644
--- a/llama/ggml-cuda/template-instances/mmq-instance-iq4_nl.cu
+++ b/llama/ggml-cuda/template-instances/mmq-instance-iq4_nl.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/mmq-instance-iq4_xs.cu b/llama/ggml-cuda/template-instances/mmq-instance-iq4_xs.cu
index fa75948f..479303ce 100644
--- a/llama/ggml-cuda/template-instances/mmq-instance-iq4_xs.cu
+++ b/llama/ggml-cuda/template-instances/mmq-instance-iq4_xs.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/mmq-instance-q2_k.cu b/llama/ggml-cuda/template-instances/mmq-instance-q2_k.cu
index cb3d2b14..fff6937a 100644
--- a/llama/ggml-cuda/template-instances/mmq-instance-q2_k.cu
+++ b/llama/ggml-cuda/template-instances/mmq-instance-q2_k.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/mmq-instance-q3_k.cu b/llama/ggml-cuda/template-instances/mmq-instance-q3_k.cu
index 3afd2877..7fcee96a 100644
--- a/llama/ggml-cuda/template-instances/mmq-instance-q3_k.cu
+++ b/llama/ggml-cuda/template-instances/mmq-instance-q3_k.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/mmq-instance-q4_0.cu b/llama/ggml-cuda/template-instances/mmq-instance-q4_0.cu
index e6fcb3d5..326d23ce 100644
--- a/llama/ggml-cuda/template-instances/mmq-instance-q4_0.cu
+++ b/llama/ggml-cuda/template-instances/mmq-instance-q4_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/mmq-instance-q4_1.cu b/llama/ggml-cuda/template-instances/mmq-instance-q4_1.cu
index e8c23dae..da40b380 100644
--- a/llama/ggml-cuda/template-instances/mmq-instance-q4_1.cu
+++ b/llama/ggml-cuda/template-instances/mmq-instance-q4_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/mmq-instance-q4_k.cu b/llama/ggml-cuda/template-instances/mmq-instance-q4_k.cu
index 1b106850..41e76496 100644
--- a/llama/ggml-cuda/template-instances/mmq-instance-q4_k.cu
+++ b/llama/ggml-cuda/template-instances/mmq-instance-q4_k.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/mmq-instance-q5_0.cu b/llama/ggml-cuda/template-instances/mmq-instance-q5_0.cu
index d17d2636..dc835624 100644
--- a/llama/ggml-cuda/template-instances/mmq-instance-q5_0.cu
+++ b/llama/ggml-cuda/template-instances/mmq-instance-q5_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/mmq-instance-q5_1.cu b/llama/ggml-cuda/template-instances/mmq-instance-q5_1.cu
index e0f6b4ad..63622b42 100644
--- a/llama/ggml-cuda/template-instances/mmq-instance-q5_1.cu
+++ b/llama/ggml-cuda/template-instances/mmq-instance-q5_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/mmq-instance-q5_k.cu b/llama/ggml-cuda/template-instances/mmq-instance-q5_k.cu
index cc50ae8d..ec34a691 100644
--- a/llama/ggml-cuda/template-instances/mmq-instance-q5_k.cu
+++ b/llama/ggml-cuda/template-instances/mmq-instance-q5_k.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/mmq-instance-q6_k.cu b/llama/ggml-cuda/template-instances/mmq-instance-q6_k.cu
index 66cd6c91..baf377ae 100644
--- a/llama/ggml-cuda/template-instances/mmq-instance-q6_k.cu
+++ b/llama/ggml-cuda/template-instances/mmq-instance-q6_k.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/mmq-instance-q8_0.cu b/llama/ggml-cuda/template-instances/mmq-instance-q8_0.cu
index ac2f5322..e5077fc1 100644
--- a/llama/ggml-cuda/template-instances/mmq-instance-q8_0.cu
+++ b/llama/ggml-cuda/template-instances/mmq-instance-q8_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/tsembd.cu b/llama/ggml-cuda/tsembd.cu
index 3feed02b..b295567e 100644
--- a/llama/ggml-cuda/tsembd.cu
+++ b/llama/ggml-cuda/tsembd.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/tsembd.cuh b/llama/ggml-cuda/tsembd.cuh
index cbfd942e..11334f64 100644
--- a/llama/ggml-cuda/tsembd.cuh
+++ b/llama/ggml-cuda/tsembd.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/unary.cu b/llama/ggml-cuda/unary.cu
index db9fa38d..c23e6774 100644
--- a/llama/ggml-cuda/unary.cu
+++ b/llama/ggml-cuda/unary.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/unary.cuh b/llama/ggml-cuda/unary.cuh
index 3d4a675b..ab78ec6a 100644
--- a/llama/ggml-cuda/unary.cuh
+++ b/llama/ggml-cuda/unary.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/upscale.cu b/llama/ggml-cuda/upscale.cu
index 4e5e614f..49ebe347 100644
--- a/llama/ggml-cuda/upscale.cu
+++ b/llama/ggml-cuda/upscale.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/upscale.cuh b/llama/ggml-cuda/upscale.cuh
index e3951934..73fa0d37 100644
--- a/llama/ggml-cuda/upscale.cuh
+++ b/llama/ggml-cuda/upscale.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/vecdotq.cuh b/llama/ggml-cuda/vecdotq.cuh
index 97360639..b0cf4b1a 100644
--- a/llama/ggml-cuda/vecdotq.cuh
+++ b/llama/ggml-cuda/vecdotq.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-impl.h b/llama/ggml-impl.h
index 80ca886d..467c4cc7 100644
--- a/llama/ggml-impl.h
+++ b/llama/ggml-impl.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
@@ -106,8 +106,9 @@ static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
 /**
  * Converts float32 to brain16.
  *
- * This function is binary identical to AMD Zen4 VCVTNEPS2BF16.
- * Subnormals shall be flushed to zero, and NANs will be quiet.
+ * This is binary identical with Google Brain float conversion.
+ * Floats shall round to nearest even, and NANs shall be quiet.
+ * Subnormals aren't flushed to zero, except perhaps when used.
  * This code should vectorize nicely if using modern compilers.
  */
 static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
@@ -121,10 +122,6 @@ static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
         h.bits = (u.i >> 16) | 64; /* force to quiet */
         return h;
     }
-    if (!(u.i & 0x7f800000)) { /* subnormal */
-        h.bits = (u.i & 0x80000000) >> 16; /* flush to zero */
-        return h;
-    }
     h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
     return h;
 }
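
Note: the hunk above drops the subnormal flush, so fp32->bf16 now matches Google Brain rounding end to end: NaNs are forced quiet, and everything else rounds to nearest even by adding 0x7fff plus the lowest kept mantissa bit before truncating to the top 16 bits. A self-contained sketch of the resulting behaviour (a standalone restatement, not patch code):

    #include <cstdint>
    #include <cstring>

    // Round-to-nearest-even float32 -> bfloat16, mirroring the code after
    // this hunk: NaNs are forced quiet, subnormals pass through unchanged.
    static inline uint16_t fp32_to_bf16_rne(float s) {
        uint32_t u;
        std::memcpy(&u, &s, sizeof(u));
        if ((u & 0x7fffffff) > 0x7f800000) {   // NaN
            return (uint16_t)((u >> 16) | 64); // set a mantissa bit: quiet
        }
        // bias of 0x7fff plus the LSB of the kept half makes ties round to even
        return (uint16_t)((u + (0x7fff + ((u >> 16) & 1))) >> 16);
    }
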
@@ -172,6 +169,7 @@ extern "C" {
 
 #if defined(__ARM_FEATURE_SVE)
 #include <arm_sve.h>
+#include <sys/prctl.h>
 #endif
 
 // 16-bit float
diff --git a/llama/ggml-metal-darwin_arm64.m b/llama/ggml-metal-darwin_arm64.m
index 67b638ac..8d705b9c 100644
--- a/llama/ggml-metal-darwin_arm64.m
+++ b/llama/ggml-metal-darwin_arm64.m
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
@@ -2251,10 +2251,8 @@ static enum ggml_status ggml_metal_graph_compute(
                         GGML_ASSERT(ne00 % 4 == 0);
                         GGML_ASSERT(ggml_is_contiguous(src0));
 
-                        //float eps;
-                        //memcpy(&eps, dst->op_params, sizeof(float));
-
-                        const float eps = 1e-6f; // TODO: temporarily hardcoded
+                        float eps;
+                        memcpy(&eps, dst->op_params + 1, sizeof(float));
 
                         const int32_t n_groups = ((int32_t *) dst->op_params)[0];
 
diff --git a/llama/ggml-metal.h b/llama/ggml-metal.h
index be606ecd..906ca235 100644
--- a/llama/ggml-metal.h
+++ b/llama/ggml-metal.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-metal.metal b/llama/ggml-metal.metal
index 287ff1ce..5e363c19 100644
--- a/llama/ggml-metal.metal
+++ b/llama/ggml-metal.metal
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-quants.c b/llama/ggml-quants.c
index 81d64d19..93552afd 100644
--- a/llama/ggml-quants.c
+++ b/llama/ggml-quants.c
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
@@ -3844,7 +3844,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     float sumf = 0;
 
 #if defined(__ARM_FEATURE_SVE)
-    if (svcntb() == QK8_0) {
+    if (ggml_sve_cnt_b == QK8_0) {
         const svbool_t ptrueh = svptrue_pat_b8(SV_VL16);
         const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh);
 
@@ -5329,7 +5329,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     float sumf = 0;
 
 #if defined(__ARM_FEATURE_SVE)
-    if (svcntb() == QK8_0) {
+    if (ggml_sve_cnt_b == QK8_0) {
         svfloat32_t sumv0 = svdup_n_f32(0.0f);
         svfloat32_t sumv1 = svdup_n_f32(0.0f);
 
@@ -6475,22 +6475,22 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
             // compute mask for subtraction
             vuint8m1_t qh_m0 = __riscv_vand_vx_u8m1(vqh, m, vl);
             vbool8_t vmask_0 = __riscv_vmseq_vx_u8m1_b8(qh_m0, 0, vl);
-            vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_m(vmask_0, q3_0, 0x4, vl);
+            vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_mu(vmask_0, q3_0, q3_0, 0x4, vl);
             m <<= 1;
 
             vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
             vbool8_t vmask_1 = __riscv_vmseq_vx_u8m1_b8(qh_m1, 0, vl);
-            vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_m(vmask_1, q3_1, 0x4, vl);
+            vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_mu(vmask_1, q3_1, q3_1, 0x4, vl);
             m <<= 1;
 
             vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
             vbool8_t vmask_2 = __riscv_vmseq_vx_u8m1_b8(qh_m2, 0, vl);
-            vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_m(vmask_2, q3_2, 0x4, vl);
+            vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_mu(vmask_2, q3_2, q3_2, 0x4, vl);
             m <<= 1;
 
             vuint8m1_t qh_m3 = __riscv_vand_vx_u8m1(vqh, m, vl);
             vbool8_t vmask_3 = __riscv_vmseq_vx_u8m1_b8(qh_m3, 0, vl);
-            vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_m(vmask_3, q3_3, 0x4, vl);
+            vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_mu(vmask_3, q3_3, q3_3, 0x4, vl);
             m <<= 1;
 
             // load Q8 and take product with Q3
@@ -7746,13 +7746,13 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
             vint8m1_t q5_a = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q5_x, 0x0F, vl));
             vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
             vbool8_t vmask_1 = __riscv_vmsne_vx_u8m1_b8(qh_m1, 0, vl);
-            vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_m(vmask_1, q5_a, 16, vl);
+            vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_mu(vmask_1, q5_a, q5_a, 16, vl);
             m <<= 1;
 
             vint8m1_t q5_l = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q5_x, 0x04, vl));
             vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
             vbool8_t vmask_2 = __riscv_vmsne_vx_u8m1_b8(qh_m2, 0, vl);
-            vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_m(vmask_2, q5_l, 16, vl);
+            vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_mu(vmask_2, q5_l, q5_l, 16, vl);
             m <<= 1;
 
             vint16m2_t v0 = __riscv_vwmul_vv_i16m2(q5_m1, q8_y1, vl);
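
Note: the _m -> _mu renames in the two hunks above track the current RVV intrinsics spec, where the mask policy is spelled out in the name: _mu ("mask, undisturbed") takes an explicit merge operand for inactive lanes, and passing the source vector itself reproduces the intended merge behaviour. A minimal sketch, assuming an RVV 1.0 intrinsics toolchain:

    #include <riscv_vector.h>

    // out[i] = mask[i] ? v[i] - 4 : v[i]
    // Inactive lanes are copied from the second argument (the merge
    // operand), so passing v keeps them unchanged.
    vint8m1_t sub4_where_set(vbool8_t mask, vint8m1_t v, size_t vl) {
        return __riscv_vsub_vx_i8m1_mu(mask, v, v, 4, vl);
    }
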
diff --git a/llama/ggml-quants.h b/llama/ggml-quants.h
index 39ece43c..26d494e9 100644
--- a/llama/ggml-quants.h
+++ b/llama/ggml-quants.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
@@ -153,6 +153,10 @@ void iq2xs_free_impl(enum ggml_type type);
 void iq3xs_init_impl(int grid_size);
 void iq3xs_free_impl(int grid_size);
 
+#if defined(__ARM_FEATURE_SVE)
+extern int ggml_sve_cnt_b;
+#endif
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/llama/ggml.c b/llama/ggml.c
index e7822f91..d4f84e51 100644
--- a/llama/ggml.c
+++ b/llama/ggml.c
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
@@ -63,6 +63,9 @@
 #include <unistd.h>
 #endif
 
+#if defined(__ARM_FEATURE_SVE)
+int ggml_sve_cnt_b = 0;
+#endif
 #if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
 #undef GGML_USE_LLAMAFILE
 #endif
@@ -167,7 +170,51 @@ typedef pthread_t ggml_thread_t;
 
 #include <sys/wait.h>
 
-#if defined(__linux__)
+#if defined(__ANDROID__)
+#include <unwind.h>
+#include <dlfcn.h>
+#include <stdio.h>
+
+struct backtrace_state {
+    void ** current;
+    void ** end;
+};
+
+static _Unwind_Reason_Code unwind_callback(struct _Unwind_Context* context, void* arg) {
+    struct backtrace_state * state = (struct backtrace_state *)arg;
+    uintptr_t pc = _Unwind_GetIP(context);
+    if (pc) {
+        if (state->current == state->end) {
+            return _URC_END_OF_STACK;
+        } else {
+            *state->current++ = (void*)pc;
+        }
+    }
+    return _URC_NO_REASON;
+}
+
+static void ggml_print_backtrace_symbols(void) {
+    const int max = 100;
+    void* buffer[max];
+
+    struct backtrace_state state = {buffer, buffer + max};
+    _Unwind_Backtrace(unwind_callback, &state);
+
+    int count = state.current - buffer;
+
+    for (int idx = 0; idx < count; ++idx) {
+        const void * addr = buffer[idx];
+        const char * symbol = "";
+
+        Dl_info info;
+        if (dladdr(addr, &info) && info.dli_sname) {
+            symbol = info.dli_sname;
+        }
+
+        fprintf(stderr, "%d: %p %s\n", idx, addr, symbol);
+    }
+}
+#elif defined(__linux__) && defined(__GLIBC__)
 #include <execinfo.h>
 static void ggml_print_backtrace_symbols(void) {
     void * trace[100];
@@ -462,9 +509,16 @@ void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) {
     }
 }
 
+void ggml_fp32_to_bf16_row_ref(const float * x, ggml_bf16_t * y, int64_t n) {
+    for (int i = 0; i < n; i++) {
+        y[i] = ggml_compute_fp32_to_bf16(x[i]);
+    }
+}
+
 void ggml_fp32_to_bf16_row(const float * x, ggml_bf16_t * y, int64_t n) {
   int i = 0;
 #if defined(__AVX512BF16__)
+  // subnormals are flushed to zero on this platform
   for (; i + 32 <= n; i += 32) {
         _mm512_storeu_si512(
             (__m512i *)(y + i),
@@ -944,7 +998,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .is_quantized             = false,
         .to_float                 = (ggml_to_float_t) ggml_bf16_to_fp32_row,
         .from_float               = (ggml_from_float_t) ggml_fp32_to_bf16_row,
-        .from_float_ref           = (ggml_from_float_t) ggml_fp32_to_bf16_row,
+        .from_float_ref           = (ggml_from_float_t) ggml_fp32_to_bf16_row_ref,
         .vec_dot                  = (ggml_vec_dot_t) ggml_vec_dot_bf16,
         .vec_dot_type             = GGML_TYPE_BF16,
         .nrows                    = 1,
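
Note: these two hunks pair up. The scalar ggml_fp32_to_bf16_row_ref becomes the from_float_ref entry because the vectorized row converter is not bit-stable everywhere; per the comment added above, the AVX512BF16 path flushes subnormal inputs to zero. A hedged illustration of the difference, using only the declarations this patch adds to ggml.h:

    #include "ggml.h"

    void bf16_row_paths_may_differ(void) {
        float x = 1e-40f;                      // subnormal in fp32
        ggml_bf16_t a, b;
        ggml_fp32_to_bf16_row_ref(&x, &a, 1);  // round-to-nearest-even, kept
        ggml_fp32_to_bf16_row(&x, &b, 1);      // flushed to zero on AVX512BF16
        // a.bits != b.bits on AVX512BF16 hardware, which is why the
        // reference traits entry (and ggml_quantize_chunk, further down)
        // now use the _ref variant.
    }
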
@@ -2284,7 +2338,7 @@ inline static void ggml_vec_abs_f32  (const int n, float * y, const float * x) {
 inline static void ggml_vec_sgn_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
 inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
 inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]);  }
-inline static void ggml_vec_elu_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
+inline static void ggml_vec_elu_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }
 inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
 inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
 inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }
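
Note: the ELU change above is an accuracy fix, not a behaviour change. expm1f(x) computes e^x - 1 without the catastrophic cancellation that expf(x) - 1 suffers for small |x|. A quick demonstration:

    #include <cmath>
    #include <cstdio>

    int main() {
        float x = 1e-8f;
        // expf(x) rounds to exactly 1.0f, so the subtraction returns 0
        printf("expf(x)-1 = %g\n", expf(x) - 1.0f);  // 0
        printf("expm1f(x) = %g\n", expm1f(x));       // ~1e-08
    }
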
@@ -3533,6 +3587,12 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
 
     GGML_ASSERT_ALIGNED(ctx->mem_buffer);
 
+#if defined(__ARM_FEATURE_SVE)
+    if (!ggml_sve_cnt_b) {
+        ggml_sve_cnt_b = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
+    }
+#endif
+
     GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
 
     ggml_critical_section_end();
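
Note: this initializer backs the svcntb() -> ggml_sve_cnt_b replacement in the ggml-quants.c hunks above: the SVE vector length is read once per process instead of queried in every dot-product call. prctl(PR_SVE_GET_VL) returns the thread's vector-length control word, and masking with PR_SVE_VL_LEN_MASK keeps the length in bytes (e.g. 32 on a 256-bit implementation). A standalone sketch, with the error check that the __ARM_FEATURE_SVE-guarded patch code can omit:

    #include <sys/prctl.h>

    // Linux-only; requires a kernel with SVE support. Returns 0 if the
    // query fails (prctl yields -1 when SVE is unavailable).
    static int sve_vl_bytes(void) {
        int r = prctl(PR_SVE_GET_VL);
        return r < 0 ? 0 : (r & PR_SVE_VL_LEN_MASK);
    }
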
@@ -5340,6 +5400,7 @@ static struct ggml_tensor * ggml_group_norm_impl(
     struct ggml_context * ctx,
     struct ggml_tensor * a,
     int n_groups,
+    float eps,
     bool inplace) {
 
     bool is_node = false;
@@ -5350,7 +5411,8 @@ static struct ggml_tensor * ggml_group_norm_impl(
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
-    result->op_params[0] = n_groups;
+    ggml_set_op_params_i32(result, 0, n_groups);
+    ggml_set_op_params_f32(result, 1, eps);
 
     result->op = GGML_OP_GROUP_NORM;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
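
Note: op_params is a small int32_t array on the tensor, so the float lands in slot 1 as raw bits; this is the same layout the Metal hunk earlier reads back with memcpy(&eps, dst->op_params + 1, sizeof(float)), and the CPU kernel below reads the same way. A minimal sketch of the packing ("params" stands in for the tensor field):

    #include <cstdint>
    #include <cstring>

    int32_t params[2];

    void pack(int32_t n_groups, float eps) {
        params[0] = n_groups;                          // slot 0: as in op_params[0]
        std::memcpy(&params[1], &eps, sizeof(float));  // slot 1: eps bit pattern
    }

    float read_eps(void) {
        float eps;
        std::memcpy(&eps, params + 1, sizeof(float));  // mirrors the kernels' read
        return eps;
    }
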
@@ -5362,15 +5424,17 @@ static struct ggml_tensor * ggml_group_norm_impl(
 struct ggml_tensor * ggml_group_norm(
     struct ggml_context * ctx,
     struct ggml_tensor * a,
-    int n_groups) {
-    return ggml_group_norm_impl(ctx, a, n_groups, false);
+    int n_groups,
+    float eps) {
+    return ggml_group_norm_impl(ctx, a, n_groups, eps, false);
 }
 
 struct ggml_tensor * ggml_group_norm_inplace(
     struct ggml_context * ctx,
     struct ggml_tensor * a,
-    int n_groups) {
-    return ggml_group_norm_impl(ctx, a, n_groups, true);
+    int n_groups,
+    float eps) {
+    return ggml_group_norm_impl(ctx, a, n_groups, eps, true);
 }
 
 // ggml_mul_mat
@@ -12061,10 +12125,11 @@ static void ggml_compute_forward_group_norm_f32(
 
     GGML_TENSOR_UNARY_OP_LOCALS
 
-    const float eps = 1e-6f; // TODO: make this a parameter
-
     // TODO: optimize
 
+    float eps;
+    memcpy(&eps, dst->op_params + 1, sizeof(float));
+
     int n_channels = src0->ne[2];
     int n_groups = dst->op_params[0];
     int n_channels_per_group = (n_channels + n_groups - 1) / n_groups;
@@ -20632,7 +20697,7 @@ size_t ggml_quantize_chunk(
         case GGML_TYPE_BF16:
             {
                 size_t elemsize = sizeof(ggml_bf16_t);
-                ggml_fp32_to_bf16_row(src + start, (ggml_bf16_t *)dst + start, n);
+                ggml_fp32_to_bf16_row_ref(src + start, (ggml_bf16_t *)dst + start, n);
                 result = n * elemsize;
             } break;
         case GGML_TYPE_F32:
diff --git a/llama/ggml.h b/llama/ggml.h
index f5821853..3e87205b 100644
--- a/llama/ggml.h
+++ b/llama/ggml.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
@@ -375,6 +375,7 @@ extern "C" {
     GGML_API ggml_bf16_t ggml_fp32_to_bf16(float);
     GGML_API float       ggml_bf16_to_fp32(ggml_bf16_t);  // consider just doing << 16
     GGML_API void        ggml_bf16_to_fp32_row(const ggml_bf16_t *, float *, int64_t);
+    GGML_API void        ggml_fp32_to_bf16_row_ref(const float *, ggml_bf16_t *, int64_t);
     GGML_API void        ggml_fp32_to_bf16_row(const float *, ggml_bf16_t *, int64_t);
 
     struct ggml_object;
@@ -1165,16 +1166,17 @@ extern "C" {
 
     // group normalize along ne0*ne1*n_groups
     // used in stable-diffusion
-    // TODO: eps is hardcoded to 1e-6 for now
     GGML_API struct ggml_tensor * ggml_group_norm(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            int                   n_groups);
+            int                   n_groups,
+            float                 eps);
 
     GGML_API struct ggml_tensor * ggml_group_norm_inplace(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            int                   n_groups);
+            int                   n_groups,
+            float                 eps);
 
     // a - x
     // b - dy
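
Note: a hypothetical call site after this signature change; passing 1e-6f reproduces the value that used to be hardcoded (ctx and x come from elsewhere, and n_groups=32 is just an example value):

    #include "ggml.h"

    struct ggml_tensor * group_norm_like_before(struct ggml_context * ctx,
                                                struct ggml_tensor  * x) {
        return ggml_group_norm(ctx, x, /*n_groups=*/32, /*eps=*/1e-6f);
    }
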
@@ -1481,7 +1483,6 @@ extern "C" {
     // if mode & 2 == 1, GPT-NeoX style
     //
     // b is an int32 vector with size a->ne[2], it contains the positions
-    // c is freq factors (e.g. phi3-128k), (optional)
     GGML_API struct ggml_tensor * ggml_rope(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
@@ -1498,6 +1499,7 @@ extern "C" {
             int                   mode);
 
     // custom RoPE
+    // c is freq factors (e.g. phi3-128k), (optional)
     GGML_API struct ggml_tensor * ggml_rope_ext(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
diff --git a/llama/grammar-parser.cpp b/llama/grammar-parser.cpp
index ebfb3198..b20310d0 100644
--- a/llama/grammar-parser.cpp
+++ b/llama/grammar-parser.cpp
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/grammar-parser.h b/llama/grammar-parser.h
index 9a24cad8..1c4e3aff 100644
--- a/llama/grammar-parser.h
+++ b/llama/grammar-parser.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/json-schema-to-grammar.cpp b/llama/json-schema-to-grammar.cpp
index e78c57ab..df453bac 100644
--- a/llama/json-schema-to-grammar.cpp
+++ b/llama/json-schema-to-grammar.cpp
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/json-schema-to-grammar.h b/llama/json-schema-to-grammar.h
index d3311b70..0b2fb0d7 100644
--- a/llama/json-schema-to-grammar.h
+++ b/llama/json-schema-to-grammar.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/llama-grammar.cpp b/llama/llama-grammar.cpp
index e5e67c7b..422578cc 100644
--- a/llama/llama-grammar.cpp
+++ b/llama/llama-grammar.cpp
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/llama-grammar.h b/llama/llama-grammar.h
index 17f6f88a..8d4d342a 100644
--- a/llama/llama-grammar.h
+++ b/llama/llama-grammar.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/llama-impl.h b/llama/llama-impl.h
index 322307c7..f34ff1d8 100644
--- a/llama/llama-impl.h
+++ b/llama/llama-impl.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/llama-sampling.cpp b/llama/llama-sampling.cpp
index 935547c2..fdb3f079 100644
--- a/llama/llama-sampling.cpp
+++ b/llama/llama-sampling.cpp
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/llama-sampling.h b/llama/llama-sampling.h
index 89b8d33a..9c44e9b8 100644
--- a/llama/llama-sampling.h
+++ b/llama/llama-sampling.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/llama-vocab.cpp b/llama/llama-vocab.cpp
index a40a9259..154ab4dc 100644
--- a/llama/llama-vocab.cpp
+++ b/llama/llama-vocab.cpp
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
@@ -842,6 +842,9 @@ struct llm_tokenizer_ugm {
      * the best tokenization.
     */
     void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
+        // get current size of output (for reversal later)
+        size_t output_size = output.size();
+
         // normalize the input first
         std::string normalized;
         normalize(text, &normalized);
@@ -921,7 +924,7 @@ struct llm_tokenizer_ugm {
         }
 
         // reverse the output since we added tokens starting from the end of the input
-        std::reverse(output.begin(), output.end());
+        std::reverse(output.begin() + output_size, output.end());
     }
 
 private:
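
Note: the fix above matters when tokenize() is called with a non-empty output vector. Tokens are appended back-to-front during the backtrack over best tokenizations, so only the newly appended suffix may be reversed. A small illustration of the difference:

    #include <algorithm>
    #include <vector>

    int main() {
        std::vector<int> output = {1, 2, 3};        // tokens from earlier pieces
        size_t output_size = output.size();         // boundary before this call
        output.insert(output.end(), {30, 20, 10});  // new tokens, back-to-front
        std::reverse(output.begin() + output_size, output.end());
        // output: {1, 2, 3, 10, 20, 30}
        // a full reverse would give {10, 20, 30, 3, 2, 1}, scrambling the
        // tokens that were already in place
    }
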
@@ -1470,7 +1473,8 @@ llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, lla
 bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token) {
     return token != -1 && (
         token == llama_token_eos_impl(vocab) ||
-        token == llama_token_eot_impl(vocab)
+        token == llama_token_eot_impl(vocab) ||
+        token == llama_token_eom_impl(vocab)
     );
 }
 
@@ -1526,6 +1530,10 @@ llama_token llama_token_eot_impl(const struct llama_vocab & vocab) {
     return vocab.special_eot_id;
 }
 
+llama_token llama_token_eom_impl(const struct llama_vocab & vocab) {
+    return vocab.special_eom_id;
+}
+
 int32_t llama_tokenize_impl(
     const struct llama_vocab & vocab,
                   const char * text,
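
Note: plumbing the new EOM id through llama_token_is_eog (two hunks up) means callers stop on <|eom_id|>, the end-of-message token Llama 3.1 emits before tool calls, with no changes on their side. A hypothetical decode loop showing the effect (sample_next and emit are placeholders, not patch code):

    for (;;) {
        llama_token tok = sample_next();            // placeholder sampler
        if (llama_token_is_eog(model, tok)) break;  // eos, eot, or now eom
        emit(tok);                                  // placeholder output
    }
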
diff --git a/llama/llama-vocab.h b/llama/llama-vocab.h
index 84826366..2b7f5d0c 100644
--- a/llama/llama-vocab.h
+++ b/llama/llama-vocab.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
@@ -71,6 +71,7 @@ struct llama_vocab {
     id special_suffix_id = -1;
     id special_middle_id = -1;
     id special_eot_id    = -1; // TODO: move above after "eos_id", and here add "file separator" token
+    id special_eom_id    = -1;
 
     // tokenizer flags
     bool tokenizer_add_space_prefix = false;
@@ -127,6 +128,7 @@ llama_token llama_token_prefix_impl(const struct llama_vocab & vocab);
 llama_token llama_token_middle_impl(const struct llama_vocab & vocab);
 llama_token llama_token_suffix_impl(const struct llama_vocab & vocab);
 llama_token llama_token_eot_impl   (const struct llama_vocab & vocab);
+llama_token llama_token_eom_impl   (const struct llama_vocab & vocab);
 
 int32_t llama_tokenize_impl(
         const struct llama_vocab & vocab,
diff --git a/llama/llama.cpp b/llama/llama.cpp
index b95ed228..0281c556 100644
--- a/llama/llama.cpp
+++ b/llama/llama.cpp
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
@@ -148,17 +148,14 @@ static std::string trim(const std::string & str) {
 }
 
 static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
-    std::string result;
-    for (size_t pos = 0; ; pos += search.length()) {
-        auto new_pos = s.find(search, pos);
-        if (new_pos == std::string::npos) {
-            result += s.substr(pos, s.size() - pos);
-            break;
-        }
-        result += s.substr(pos, new_pos - pos) + replace;
-        pos = new_pos;
+    if (search.empty()) {
+        return; // Avoid infinite loop if 'search' is an empty string
+    }
+    size_t pos = 0;
+    while ((pos = s.find(search, pos)) != std::string::npos) {
+        s.replace(pos, search.length(), replace);
+        pos += replace.length();
     }
-    s = std::move(result);
 }
 
 static bool is_float_close(float a, float b, float abs_tol) {
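
Note: the rewritten replace_all works in place and, crucially, advances pos past the text it just inserted, so it terminates even when replace contains search; the empty-search guard closes the remaining infinite-loop case. Tracing the loop by hand for one such call:

    #include <cstdio>
    #include <string>

    int main() {
        std::string s = "a.b.c";
        // replace_all(s, ".", "..") unrolled:
        size_t pos = 0;
        while ((pos = s.find(".", pos)) != std::string::npos) {
            s.replace(pos, 1, "..");
            pos += 2;  // skip the inserted ".." -- without this, the first
                       // "." of the replacement would match again forever
        }
        printf("%s\n", s.c_str());  // a..b..c
    }
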
@@ -388,6 +385,7 @@ enum llm_kv {
     LLM_KV_TOKENIZER_SUFFIX_ID,
     LLM_KV_TOKENIZER_MIDDLE_ID,
     LLM_KV_TOKENIZER_EOT_ID,
+    LLM_KV_TOKENIZER_EOM_ID,
 
     LLM_KV_ADAPTER_TYPE,
     LLM_KV_ADAPTER_LORA_ALPHA,
@@ -485,6 +483,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_TOKENIZER_SUFFIX_ID,            "tokenizer.ggml.suffix_token_id"          },
     { LLM_KV_TOKENIZER_MIDDLE_ID,            "tokenizer.ggml.middle_token_id"          },
     { LLM_KV_TOKENIZER_EOT_ID,               "tokenizer.ggml.eot_token_id"             },
+    { LLM_KV_TOKENIZER_EOM_ID,               "tokenizer.ggml.eom_token_id"             },
 
     { LLM_KV_ADAPTER_TYPE,                  "adapter.type"       },
     { LLM_KV_ADAPTER_LORA_ALPHA,            "adapter.lora.alpha" },
@@ -4995,6 +4994,7 @@ static void llm_load_hparams(
                 hparams.attn_soft_cap = true;
 
                 switch (hparams.n_layer) {
+                    case 26: model.type = e_model::MODEL_2B; break;
                     case 42: model.type = e_model::MODEL_9B; break;
                     case 46: model.type = e_model::MODEL_27B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
@@ -5603,6 +5603,7 @@ static void llm_load_vocab(
             { LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_suffix_id },
             { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_middle_id },
             { LLM_KV_TOKENIZER_EOT_ID,    vocab.special_eot_id    },
+            { LLM_KV_TOKENIZER_EOM_ID,    vocab.special_eom_id    },
         };
 
         for (const auto & it : special_token_types) {
@@ -5655,6 +5656,17 @@ static void llm_load_vocab(
                 }
             }
         }
+
+        // find EOM token: "<|eom_id|>"
+        //
+        // TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOM_ID
+        //       for now, we apply this workaround to find the EOM token based on its text
+        if (vocab.special_eom_id == -1) {
+            const auto & t = vocab.token_to_id.find("<|eom_id|>");
+            if (t != vocab.token_to_id.end()) {
+                vocab.special_eom_id = t->second;
+            }
+        }
     }
 
     // build special tokens cache
@@ -11754,6 +11766,7 @@ struct llm_build_context {
 
                 // ref: https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e
                 switch (model.type) {
+                    case e_model::MODEL_2B:
                     case e_model::MODEL_9B:  Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));   break;
                     case e_model::MODEL_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); break;
                     default: GGML_ABORT("fatal error");
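
Note: with 26 layers now mapping to MODEL_2B, Gemma 2 2B falls through to the 9B scaling rule, 1/sqrt(n_embd_head_k), rather than the 27B's 1/sqrt(n_embd/n_head). A worked value, assuming the 2B's published head size of 256 (verify against the model's metadata):

    #include <cmath>
    #include <cstdio>

    int main() {
        const float n_embd_head_k = 256.0f;           // assumed for Gemma 2 2B
        printf("%g\n", 1.0f / sqrtf(n_embd_head_k));  // 0.0625
    }
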
diff --git a/llama/llama.h b/llama/llama.h
index 469bf75e..c624a688 100644
--- a/llama/llama.h
+++ b/llama/llama.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/llama_darwin.c b/llama/llama_darwin.c
index 7d2d98c6..8164a759 100644
--- a/llama/llama_darwin.c
+++ b/llama/llama_darwin.c
@@ -1,3 +1,29 @@
+/**
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023-2024 The ggml authors
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 /**
  * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
  *
diff --git a/llama/llava.cpp b/llama/llava.cpp
index d94196ec..d54175b3 100644
--- a/llama/llava.cpp
+++ b/llama/llava.cpp
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/llava.h b/llama/llava.h
index 61dde037..8d1ed60f 100644
--- a/llama/llava.h
+++ b/llama/llava.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/log.h b/llama/log.h
index 67e92545..e600939c 100644
--- a/llama/log.h
+++ b/llama/log.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/patches/09-lora.diff b/llama/patches/09-lora.diff
index 10c66d1d..21958476 100644
--- a/llama/patches/09-lora.diff
+++ b/llama/patches/09-lora.diff
@@ -1,40 +1,32 @@
 diff --git a/common/common.cpp b/common/common.cpp
-index dbb724fb..c26fe6ee 100644
+index 2e8374d5..70d0afde 100644
 --- a/common/common.cpp
 +++ b/common/common.cpp
-@@ -2087,14 +2087,27 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
-     for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
-         const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
-         float lora_scale = std::get<1>(params.lora_adapter[i]);
-+
-+        // try to load as gguf
-         auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str());
-         if (adapter == nullptr) {
--            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
+@@ -2110,9 +2110,21 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
+         loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
+         if (loaded_la.adapter == nullptr) {
+             fprintf(stderr, "%s: error: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
 -            llama_free(lctx);
 -            llama_free_model(model);
--            return std::make_tuple(nullptr, nullptr);
-+            fprintf(stderr, "%s: error: failed to apply lora adapter, trying ggla\n", __func__);
+-            return iparams;
 +
 +            // if that fails, try loading as ggla for compatibility
 +            int err = llama_model_apply_lora_from_file(model,
-+                                                    lora_adapter.c_str(),
-+                                                    lora_scale,
++                                                    la.path.c_str(),
++                                                    la.scale,
 +                                                    nullptr,
 +                                                    params.n_threads);
 +            if (err != 0) {
 +                fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
 +                llama_free(lctx);
 +                llama_free_model(model);
-+                return std::make_tuple(nullptr, nullptr);
++                return iparams;
++            } else {
++                break;
 +            }
-+        } else {
-+            llama_lora_adapter_set(lctx, adapter, lora_scale);
          }
--        llama_lora_adapter_set(lctx, adapter, lora_scale);
+         iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
      }
- 
-     if (params.ignore_eos) {
 diff --git a/include/llama.h b/include/llama.h
 index 93fd77ca..b0fb37a6 100644
 --- a/include/llama.h
@@ -355,4 +347,4 @@ index 80a0dd0f..9d7b0e17 100644
 +        return 1;
 +    }
 +}
-\ No newline at end of file
+\ No newline at end of file
\ No newline at end of file
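
Note: the regenerated 09-lora.diff keeps the same fallback shape against the new lora_adapters API. Reduced to its skeleton (identifiers taken from the nested diff above; error paths trimmed):

    // Try the GGUF LoRA loader first; fall back to the legacy GGLA loader
    // that this patch preserves for compatibility.
    loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
    if (loaded_la.adapter == nullptr) {
        int err = llama_model_apply_lora_from_file(model, la.path.c_str(),
                                                   la.scale, nullptr,
                                                   params.n_threads);
        if (err != 0) {
            // give up: free lctx and model, return iparams
        }
    }
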
diff --git a/llama/sampling.cpp b/llama/sampling.cpp
index 1985ac2f..4c947815 100644
--- a/llama/sampling.cpp
+++ b/llama/sampling.cpp
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/sampling.h b/llama/sampling.h
index 30b4134f..b1c82a5c 100644
--- a/llama/sampling.h
+++ b/llama/sampling.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/stb_image.h b/llama/stb_image.h
index ed9badad..4f5c44a5 100644
--- a/llama/stb_image.h
+++ b/llama/stb_image.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/unicode-data.cpp b/llama/unicode-data.cpp
index ae01e5c4..7e89db07 100644
--- a/llama/unicode-data.cpp
+++ b/llama/unicode-data.cpp
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/unicode-data.h b/llama/unicode-data.h
index 3abb9c74..9cd3d5f9 100644
--- a/llama/unicode-data.h
+++ b/llama/unicode-data.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/unicode.cpp b/llama/unicode.cpp
index 774a5210..b7bdf334 100644
--- a/llama/unicode.cpp
+++ b/llama/unicode.cpp
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/unicode.h b/llama/unicode.h
index 1850ceeb..2ea5e168 100644
--- a/llama/unicode.h
+++ b/llama/unicode.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *