From 80db43b7b468665c141cf9450d218a837e83d96d Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Tue, 6 Aug 2024 16:50:34 -0700 Subject: [PATCH] Bump llama sync to 1e6f65 --- llama/build-info.cpp | 2 +- llama/clip.cpp | 2 +- llama/clip.h | 2 +- llama/common.cpp | 91 +++-- llama/common.h | 27 +- llama/ggml-aarch64.c | 30 +- llama/ggml-aarch64.h | 2 +- llama/ggml-alloc.c | 2 +- llama/ggml-alloc.h | 2 +- llama/ggml-backend-impl.h | 2 +- llama/ggml-backend.c | 2 +- llama/ggml-backend.h | 2 +- llama/ggml-common.h | 2 +- llama/ggml-cuda.cu | 24 +- llama/ggml-cuda.h | 2 +- llama/ggml-cuda/acc.cu | 2 +- llama/ggml-cuda/acc.cuh | 2 +- llama/ggml-cuda/alibi.cu | 26 ++ llama/ggml-cuda/alibi.cuh | 26 ++ llama/ggml-cuda/arange.cu | 2 +- llama/ggml-cuda/arange.cuh | 2 +- llama/ggml-cuda/argsort.cu | 2 +- llama/ggml-cuda/argsort.cuh | 2 +- llama/ggml-cuda/binbcast.cu | 2 +- llama/ggml-cuda/binbcast.cuh | 2 +- llama/ggml-cuda/clamp.cu | 2 +- llama/ggml-cuda/clamp.cuh | 2 +- llama/ggml-cuda/common.cuh | 380 +----------------- llama/ggml-cuda/concat.cu | 2 +- llama/ggml-cuda/concat.cuh | 2 +- llama/ggml-cuda/conv-transpose-1d.cu | 2 +- llama/ggml-cuda/conv-transpose-1d.cuh | 2 +- llama/ggml-cuda/convert.cu | 2 +- llama/ggml-cuda/convert.cuh | 2 +- llama/ggml-cuda/cpy.cu | 2 +- llama/ggml-cuda/cpy.cuh | 2 +- llama/ggml-cuda/dequantize.cuh | 2 +- llama/ggml-cuda/diagmask.cu | 2 +- llama/ggml-cuda/diagmask.cuh | 2 +- llama/ggml-cuda/dmmv.cu | 23 +- llama/ggml-cuda/dmmv.cuh | 4 +- llama/ggml-cuda/fattn-common.cuh | 2 +- llama/ggml-cuda/fattn-tile-f16.cu | 2 +- llama/ggml-cuda/fattn-tile-f16.cuh | 2 +- llama/ggml-cuda/fattn-tile-f32.cu | 2 +- llama/ggml-cuda/fattn-tile-f32.cuh | 2 +- llama/ggml-cuda/fattn-vec-f16.cuh | 2 +- llama/ggml-cuda/fattn-vec-f32.cuh | 2 +- llama/ggml-cuda/fattn-wmma-f16.cuh | 2 +- llama/ggml-cuda/fattn.cu | 2 +- llama/ggml-cuda/fattn.cuh | 2 +- llama/ggml-cuda/getrows.cu | 2 +- llama/ggml-cuda/getrows.cuh | 2 +- llama/ggml-cuda/im2col.cu | 2 +- llama/ggml-cuda/im2col.cuh | 2 +- llama/ggml-cuda/mma.cuh | 2 +- llama/ggml-cuda/mmq.cu | 2 +- llama/ggml-cuda/mmq.cuh | 2 +- llama/ggml-cuda/mmvq.cu | 2 +- llama/ggml-cuda/mmvq.cuh | 2 +- llama/ggml-cuda/norm.cu | 11 +- llama/ggml-cuda/norm.cuh | 2 +- llama/ggml-cuda/pad.cu | 2 +- llama/ggml-cuda/pad.cuh | 2 +- llama/ggml-cuda/pool2d.cu | 2 +- llama/ggml-cuda/pool2d.cuh | 2 +- llama/ggml-cuda/quantize.cu | 2 +- llama/ggml-cuda/quantize.cuh | 2 +- llama/ggml-cuda/rope.cu | 2 +- llama/ggml-cuda/rope.cuh | 2 +- llama/ggml-cuda/scale.cu | 2 +- llama/ggml-cuda/scale.cuh | 2 +- llama/ggml-cuda/softmax.cu | 2 +- llama/ggml-cuda/softmax.cuh | 2 +- llama/ggml-cuda/sumrows.cu | 2 +- llama/ggml-cuda/sumrows.cuh | 2 +- .../fattn-vec-f16-instance-hs128-f16-f16.cu | 2 +- .../fattn-vec-f16-instance-hs128-f16-q4_0.cu | 2 +- .../fattn-vec-f16-instance-hs128-f16-q4_1.cu | 2 +- .../fattn-vec-f16-instance-hs128-f16-q5_0.cu | 2 +- .../fattn-vec-f16-instance-hs128-f16-q5_1.cu | 2 +- .../fattn-vec-f16-instance-hs128-f16-q8_0.cu | 2 +- .../fattn-vec-f16-instance-hs128-q4_0-f16.cu | 2 +- .../fattn-vec-f16-instance-hs128-q4_0-q4_0.cu | 2 +- .../fattn-vec-f16-instance-hs128-q4_0-q4_1.cu | 2 +- .../fattn-vec-f16-instance-hs128-q4_0-q5_0.cu | 2 +- .../fattn-vec-f16-instance-hs128-q4_0-q5_1.cu | 2 +- .../fattn-vec-f16-instance-hs128-q4_0-q8_0.cu | 2 +- .../fattn-vec-f16-instance-hs128-q4_1-f16.cu | 2 +- .../fattn-vec-f16-instance-hs128-q4_1-q4_0.cu | 2 +- .../fattn-vec-f16-instance-hs128-q4_1-q4_1.cu | 2 +- .../fattn-vec-f16-instance-hs128-q4_1-q5_0.cu | 2 +- 
.../fattn-vec-f16-instance-hs128-q4_1-q5_1.cu | 2 +- .../fattn-vec-f16-instance-hs128-q4_1-q8_0.cu | 2 +- .../fattn-vec-f16-instance-hs128-q5_0-f16.cu | 2 +- .../fattn-vec-f16-instance-hs128-q5_0-q4_0.cu | 2 +- .../fattn-vec-f16-instance-hs128-q5_0-q4_1.cu | 2 +- .../fattn-vec-f16-instance-hs128-q5_0-q5_0.cu | 2 +- .../fattn-vec-f16-instance-hs128-q5_0-q5_1.cu | 2 +- .../fattn-vec-f16-instance-hs128-q5_0-q8_0.cu | 2 +- .../fattn-vec-f16-instance-hs128-q5_1-f16.cu | 2 +- .../fattn-vec-f16-instance-hs128-q5_1-q4_0.cu | 2 +- .../fattn-vec-f16-instance-hs128-q5_1-q4_1.cu | 2 +- .../fattn-vec-f16-instance-hs128-q5_1-q5_0.cu | 2 +- .../fattn-vec-f16-instance-hs128-q5_1-q5_1.cu | 2 +- .../fattn-vec-f16-instance-hs128-q5_1-q8_0.cu | 2 +- .../fattn-vec-f16-instance-hs128-q8_0-f16.cu | 2 +- .../fattn-vec-f16-instance-hs128-q8_0-q4_0.cu | 2 +- .../fattn-vec-f16-instance-hs128-q8_0-q4_1.cu | 2 +- .../fattn-vec-f16-instance-hs128-q8_0-q5_0.cu | 2 +- .../fattn-vec-f16-instance-hs128-q8_0-q5_1.cu | 2 +- .../fattn-vec-f16-instance-hs128-q8_0-q8_0.cu | 2 +- .../fattn-vec-f16-instance-hs256-f16-f16.cu | 2 +- .../fattn-vec-f16-instance-hs64-f16-f16.cu | 2 +- .../fattn-vec-f16-instance-hs64-f16-q4_0.cu | 2 +- .../fattn-vec-f16-instance-hs64-f16-q4_1.cu | 2 +- .../fattn-vec-f16-instance-hs64-f16-q5_0.cu | 2 +- .../fattn-vec-f16-instance-hs64-f16-q5_1.cu | 2 +- .../fattn-vec-f16-instance-hs64-f16-q8_0.cu | 2 +- .../fattn-vec-f32-instance-hs128-f16-f16.cu | 2 +- .../fattn-vec-f32-instance-hs128-f16-q4_0.cu | 2 +- .../fattn-vec-f32-instance-hs128-f16-q4_1.cu | 2 +- .../fattn-vec-f32-instance-hs128-f16-q5_0.cu | 2 +- .../fattn-vec-f32-instance-hs128-f16-q5_1.cu | 2 +- .../fattn-vec-f32-instance-hs128-f16-q8_0.cu | 2 +- .../fattn-vec-f32-instance-hs128-q4_0-f16.cu | 2 +- .../fattn-vec-f32-instance-hs128-q4_0-q4_0.cu | 2 +- .../fattn-vec-f32-instance-hs128-q4_0-q4_1.cu | 2 +- .../fattn-vec-f32-instance-hs128-q4_0-q5_0.cu | 2 +- .../fattn-vec-f32-instance-hs128-q4_0-q5_1.cu | 2 +- .../fattn-vec-f32-instance-hs128-q4_0-q8_0.cu | 2 +- .../fattn-vec-f32-instance-hs128-q4_1-f16.cu | 2 +- .../fattn-vec-f32-instance-hs128-q4_1-q4_0.cu | 2 +- .../fattn-vec-f32-instance-hs128-q4_1-q4_1.cu | 2 +- .../fattn-vec-f32-instance-hs128-q4_1-q5_0.cu | 2 +- .../fattn-vec-f32-instance-hs128-q4_1-q5_1.cu | 2 +- .../fattn-vec-f32-instance-hs128-q4_1-q8_0.cu | 2 +- .../fattn-vec-f32-instance-hs128-q5_0-f16.cu | 2 +- .../fattn-vec-f32-instance-hs128-q5_0-q4_0.cu | 2 +- .../fattn-vec-f32-instance-hs128-q5_0-q4_1.cu | 2 +- .../fattn-vec-f32-instance-hs128-q5_0-q5_0.cu | 2 +- .../fattn-vec-f32-instance-hs128-q5_0-q5_1.cu | 2 +- .../fattn-vec-f32-instance-hs128-q5_0-q8_0.cu | 2 +- .../fattn-vec-f32-instance-hs128-q5_1-f16.cu | 2 +- .../fattn-vec-f32-instance-hs128-q5_1-q4_0.cu | 2 +- .../fattn-vec-f32-instance-hs128-q5_1-q4_1.cu | 2 +- .../fattn-vec-f32-instance-hs128-q5_1-q5_0.cu | 2 +- .../fattn-vec-f32-instance-hs128-q5_1-q5_1.cu | 2 +- .../fattn-vec-f32-instance-hs128-q5_1-q8_0.cu | 2 +- .../fattn-vec-f32-instance-hs128-q8_0-f16.cu | 2 +- .../fattn-vec-f32-instance-hs128-q8_0-q4_0.cu | 2 +- .../fattn-vec-f32-instance-hs128-q8_0-q4_1.cu | 2 +- .../fattn-vec-f32-instance-hs128-q8_0-q5_0.cu | 2 +- .../fattn-vec-f32-instance-hs128-q8_0-q5_1.cu | 2 +- .../fattn-vec-f32-instance-hs128-q8_0-q8_0.cu | 2 +- .../fattn-vec-f32-instance-hs256-f16-f16.cu | 2 +- .../fattn-vec-f32-instance-hs64-f16-f16.cu | 2 +- .../fattn-vec-f32-instance-hs64-f16-q4_0.cu | 2 +- .../fattn-vec-f32-instance-hs64-f16-q4_1.cu | 2 +- 
.../fattn-vec-f32-instance-hs64-f16-q5_0.cu | 2 +- .../fattn-vec-f32-instance-hs64-f16-q5_1.cu | 2 +- .../fattn-vec-f32-instance-hs64-f16-q8_0.cu | 2 +- .../fattn-wmma-f16-instance-kqfloat-cpb16.cu | 2 +- .../fattn-wmma-f16-instance-kqfloat-cpb32.cu | 2 +- .../fattn-wmma-f16-instance-kqhalf-cpb16.cu | 2 +- .../fattn-wmma-f16-instance-kqhalf-cpb32.cu | 2 +- .../fattn-wmma-f16-instance-kqhalf-cpb8.cu | 2 +- .../template-instances/mmq-instance-iq1_s.cu | 2 +- .../template-instances/mmq-instance-iq2_s.cu | 2 +- .../template-instances/mmq-instance-iq2_xs.cu | 2 +- .../mmq-instance-iq2_xxs.cu | 2 +- .../template-instances/mmq-instance-iq3_s.cu | 2 +- .../mmq-instance-iq3_xxs.cu | 2 +- .../template-instances/mmq-instance-iq4_nl.cu | 2 +- .../template-instances/mmq-instance-iq4_xs.cu | 2 +- .../template-instances/mmq-instance-q2_k.cu | 2 +- .../template-instances/mmq-instance-q3_k.cu | 2 +- .../template-instances/mmq-instance-q4_0.cu | 2 +- .../template-instances/mmq-instance-q4_1.cu | 2 +- .../template-instances/mmq-instance-q4_k.cu | 2 +- .../template-instances/mmq-instance-q5_0.cu | 2 +- .../template-instances/mmq-instance-q5_1.cu | 2 +- .../template-instances/mmq-instance-q5_k.cu | 2 +- .../template-instances/mmq-instance-q6_k.cu | 2 +- .../template-instances/mmq-instance-q8_0.cu | 2 +- llama/ggml-cuda/tsembd.cu | 2 +- llama/ggml-cuda/tsembd.cuh | 2 +- llama/ggml-cuda/unary.cu | 2 +- llama/ggml-cuda/unary.cuh | 2 +- llama/ggml-cuda/upscale.cu | 2 +- llama/ggml-cuda/upscale.cuh | 2 +- llama/ggml-cuda/vecdotq.cuh | 2 +- llama/ggml-impl.h | 12 +- llama/ggml-metal-darwin_arm64.m | 8 +- llama/ggml-metal.h | 2 +- llama/ggml-metal.metal | 2 +- llama/ggml-quants.c | 18 +- llama/ggml-quants.h | 6 +- llama/ggml.c | 89 +++- llama/ggml.h | 12 +- llama/grammar-parser.cpp | 2 +- llama/grammar-parser.h | 2 +- llama/json-schema-to-grammar.cpp | 2 +- llama/json-schema-to-grammar.h | 2 +- llama/llama-grammar.cpp | 2 +- llama/llama-grammar.h | 2 +- llama/llama-impl.h | 2 +- llama/llama-sampling.cpp | 2 +- llama/llama-sampling.h | 2 +- llama/llama-vocab.cpp | 14 +- llama/llama-vocab.h | 4 +- llama/llama.cpp | 35 +- llama/llama.h | 2 +- llama/llama_darwin.c | 26 ++ llama/llava.cpp | 2 +- llama/llava.h | 2 +- llama/log.h | 2 +- llama/patches/09-lora.diff | 34 +- llama/sampling.cpp | 2 +- llama/sampling.h | 2 +- llama/stb_image.h | 2 +- llama/unicode-data.cpp | 2 +- llama/unicode-data.h | 2 +- llama/unicode.cpp | 2 +- llama/unicode.h | 2 +- 225 files changed, 584 insertions(+), 724 deletions(-) diff --git a/llama/build-info.cpp b/llama/build-info.cpp index 63732571..d4ddbffc 100644 --- a/llama/build-info.cpp +++ b/llama/build-info.cpp @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/clip.cpp b/llama/clip.cpp index 2039bdc8..d8a85004 100644 --- a/llama/clip.cpp +++ b/llama/clip.cpp @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/clip.h b/llama/clip.h index 8665ad6a..84a51470 100644 --- a/llama/clip.h +++ b/llama/clip.h @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git 
a/llama/common.cpp b/llama/common.cpp index f542c129..cfda8854 100644 --- a/llama/common.cpp +++ b/llama/common.cpp @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * @@ -710,14 +710,24 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa } if (arg == "--lora") { CHECK_ARG - params.lora_adapter.emplace_back(argv[i], 1.0f); + params.lora_adapters.push_back({ + std::string(argv[i]), + 1.0, + }); return true; } if (arg == "--lora-scaled") { CHECK_ARG - const char* lora_adapter = argv[i]; + std::string lora_adapter = argv[i]; CHECK_ARG - params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i])); + params.lora_adapters.push_back({ + lora_adapter, + std::stof(argv[i]), + }); + return true; + } + if (arg == "--lora-init-without-apply") { + params.lora_init_without_apply = true; return true; } if (arg == "--control-vector") { @@ -1660,7 +1670,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "server", " --host HOST", "ip address to listen (default: %s)", params.hostname.c_str() }); options.push_back({ "server", " --port PORT", "port to listen (default: %d)", params.port }); options.push_back({ "server", " --path PATH", "path to serve static files from (default: %s)", params.public_path.c_str() }); - options.push_back({ "server", " --embedding(s)", "enable embedding endpoint (default: %s)", params.embedding ? "enabled" : "disabled" }); + options.push_back({ "server", " --embedding(s)", "restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled" }); options.push_back({ "server", " --api-key KEY", "API key to use for authentication (default: none)" }); options.push_back({ "server", " --api-key-file FNAME", "path to file containing API keys (default: none)" }); options.push_back({ "server", " --ssl-key-file FNAME", "path to file a PEM-encoded SSL private key" }); @@ -1680,6 +1690,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" }); options.push_back({ "server", "-sps, --slot-prompt-similarity SIMILARITY", "how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity }); + options.push_back({ "server", " --lora-init-without-apply", "load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? 
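A minimal usage sketch for the three LoRA flags parsed above, assuming a stock llama.cpp example binary (the binary name and adapter file names here are hypothetical; the flags and their argument order come from the parsing code in this hunk):

    // hypothetical invocation exercising the flags added above:
    //   llama-cli -m model.gguf \
    //     --lora adapter_a.gguf \                 (implicit scale 1.0)
    //     --lora-scaled adapter_b.gguf 0.5 \      (path, then explicit scale)
    //     --lora-init-without-apply               (load only; apply later)
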
"enabled" : "disabled"}); #ifndef LOG_DISABLE_LOGS options.push_back({ "logging" }); @@ -2065,8 +2076,8 @@ std::string fs_get_cache_file(const std::string & filename) { // // Model utils // - -std::tuple llama_init_from_gpt_params(gpt_params & params) { +struct llama_init_result llama_init_from_gpt_params(gpt_params & params) { + llama_init_result iparams; auto mparams = llama_model_params_from_gpt_params(params); llama_model * model = nullptr; @@ -2081,7 +2092,7 @@ std::tuple llama_init_from_gpt_par if (model == NULL) { fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); - return std::make_tuple(nullptr, nullptr); + return iparams; } auto cparams = llama_context_params_from_gpt_params(params); @@ -2090,7 +2101,7 @@ std::tuple llama_init_from_gpt_par if (lctx == NULL) { fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str()); llama_free_model(model); - return std::make_tuple(nullptr, nullptr); + return iparams; } if (!params.control_vectors.empty()) { @@ -2101,7 +2112,7 @@ std::tuple llama_init_from_gpt_par if (cvec.n_embd == -1) { llama_free(lctx); llama_free_model(model); - return std::make_tuple(nullptr, nullptr); + return iparams; } int err = llama_control_vector_apply(lctx, @@ -2113,34 +2124,38 @@ std::tuple llama_init_from_gpt_par if (err) { llama_free(lctx); llama_free_model(model); - return std::make_tuple(nullptr, nullptr); + return iparams; } } - for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) { - const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]); - float lora_scale = std::get<1>(params.lora_adapter[i]); - - // try to load as gguf - auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str()); - if (adapter == nullptr) { - fprintf(stderr, "%s: error: failed to apply lora adapter, trying ggla\n", __func__); + // load and optionally apply lora adapters + for (auto & la : params.lora_adapters) { + llama_lora_adapter_container loaded_la; + loaded_la.path = la.path; + loaded_la.scale = la.scale; + loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str()); + if (loaded_la.adapter == nullptr) { + fprintf(stderr, "%s: error: failed to apply lora adapter '%s'\n", __func__, la.path.c_str()); // if that fails, try loading as ggla for compatibility int err = llama_model_apply_lora_from_file(model, - lora_adapter.c_str(), - lora_scale, + la.path.c_str(), + la.scale, nullptr, params.n_threads); if (err != 0) { fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); llama_free(lctx); llama_free_model(model); - return std::make_tuple(nullptr, nullptr); + return iparams; + } else { + break; } - } else { - llama_lora_adapter_set(lctx, adapter, lora_scale); } + iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters + } + if (!params.lora_init_without_apply) { + llama_lora_adapters_apply(lctx, iparams.lora_adapters); } if (params.ignore_eos) { @@ -2174,7 +2189,18 @@ std::tuple llama_init_from_gpt_par llama_reset_timings(lctx); } - return std::make_tuple(model, lctx); + iparams.model = model; + iparams.context = lctx; + return iparams; +} + +void llama_lora_adapters_apply(struct llama_context * ctx, std::vector & lora_adapters) { + llama_lora_adapter_clear(ctx); + for (auto & la : lora_adapters) { + if (la.scale != 0.0f) { + llama_lora_adapter_set(ctx, la.adapter, la.scale); + } + } } struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) { @@ -3199,19 +3225,18 @@ void 
yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l } fprintf(stream, "lora:\n"); - for (std::tuple la : params.lora_adapter) { - if (std::get<1>(la) != 1.0f) { - continue; + for (auto & la : params.lora_adapters) { + if (la.scale == 1.0f) { + fprintf(stream, " - %s\n", la.path.c_str()); } - fprintf(stream, " - %s\n", std::get<0>(la).c_str()); } fprintf(stream, "lora_scaled:\n"); - for (std::tuple la : params.lora_adapter) { - if (std::get<1>(la) == 1.0f) { - continue; + for (auto & la : params.lora_adapters) { + if (la.scale != 1.0f) { + fprintf(stream, " - %s: %f\n", la.path.c_str(), la.scale); } - fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la)); } + fprintf(stream, "lora_init_without_apply: %s # default: false\n", params.lora_init_without_apply ? "true" : "false"); fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu); fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep); fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat); diff --git a/llama/common.h b/llama/common.h index 181c412c..b25dc42e 100644 --- a/llama/common.h +++ b/llama/common.h @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * @@ -59,6 +59,15 @@ #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf" +struct llama_lora_adapter_info { + std::string path; + float scale; +}; + +struct llama_lora_adapter_container : llama_lora_adapter_info { + struct llama_lora_adapter * adapter; +}; + // build info extern int LLAMA_BUILD_NUMBER; extern char const * LLAMA_COMMIT; @@ -152,8 +161,8 @@ struct gpt_params { std::vector antiprompt; // strings upon which more user input is prompted (a.k.a. 
reverse prompts) std::vector kv_overrides; - // TODO: avoid tuple, use struct - std::vector> lora_adapter; // lora adapter path with user defined scale + bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply) + std::vector lora_adapters; // lora adapter path with user defined scale std::vector control_vectors; // control vector with user defined scale @@ -334,8 +343,13 @@ std::string fs_get_cache_file(const std::string & filename); // Model utils // -// TODO: avoid tuplue, use struct -std::tuple llama_init_from_gpt_params(gpt_params & params); +struct llama_init_result { + struct llama_model * model = nullptr; + struct llama_context * context = nullptr; + std::vector lora_adapters; +}; + +struct llama_init_result llama_init_from_gpt_params(gpt_params & params); struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params); struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params); @@ -343,6 +357,9 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params); struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params); +// clear LoRA adapters from context, then apply new list of adapters +void llama_lora_adapters_apply(struct llama_context * ctx, std::vector & lora_adapters); + // Batch utils void llama_batch_clear(struct llama_batch & batch); diff --git a/llama/ggml-aarch64.c b/llama/ggml-aarch64.c index c2189c02..ea751c58 100644 --- a/llama/ggml-aarch64.c +++ b/llama/ggml-aarch64.c @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * @@ -410,8 +410,8 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * UNUSED(blocklen); #if defined(__ARM_FEATURE_SVE) - if (svcntw() == 8) { - GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) && + if (ggml_sve_cnt_b == QK8_0) { + GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) && "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance"); } #endif @@ -522,8 +522,8 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void * UNUSED(blocklen); #if defined(__ARM_FEATURE_SVE) - if (svcntw() == 8) { - GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) && + if (ggml_sve_cnt_b == QK8_0) { + GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) && "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance"); } #endif @@ -640,7 +640,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * UNUSED(blocklen); #if defined(__ARM_FEATURE_SVE) && ! ((defined(_MSC_VER)) && ! 
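The llama_lora_adapters_apply() helper declared above is what enables deferred or re-scaled application; a hedged sketch of toggling an adapter at runtime (the llama_init/lctx handles continue the naming from the earlier sketch and are hypothetical):

    // adapters were loaded (path + scale recorded) but possibly not applied,
    // if --lora-init-without-apply was given
    std::vector<llama_lora_adapter_container> & loaded = llama_init.lora_adapters;
    loaded[0].scale = 0.0f;                  // scale 0 => skipped on apply
    llama_lora_adapters_apply(lctx, loaded); // clears, then re-sets adapters with non-zero scale
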
defined(__clang__)) - if (svcntw() == 8) { + if (ggml_sve_cnt_b == QK8_0) { const void * b_ptr = vx; const void * a_ptr = vy; float * res_ptr = s; @@ -706,12 +706,12 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * return; } else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) { - GGML_ASSERT((ggml_cpu_has_sve() && (svcntw() == 8)) && + GGML_ASSERT((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) && "__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal " "performance"); } else if (ggml_cpu_has_neon()) { - GGML_ASSERT(((ggml_cpu_has_sve() && (svcntw() == 8)) || ggml_cpu_has_matmul_int8()) && + GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) || ggml_cpu_has_matmul_int8()) && "__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 " "quantization format for optimal performance"); } @@ -771,8 +771,8 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * UNUSED(blocklen); #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) - if (svcntw() == 8) { - GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) && + if (ggml_sve_cnt_b == QK8_0) { + GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) && "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance"); } #endif @@ -1292,8 +1292,8 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void * UNUSED(blocklen); #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) - if (svcntw() == 8) { - GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) && + if (ggml_sve_cnt_b == QK8_0) { + GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) && "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance"); } #endif @@ -1754,7 +1754,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * UNUSED(blocklen); #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! 
defined(__clang__)) - if (svcntw() == 8) { + if (ggml_sve_cnt_b == QK8_0) { const void * b_ptr = vx; const void * a_ptr = vy; float * res_ptr = s; @@ -2165,12 +2165,12 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * return; } else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) { - GGML_ASSERT((ggml_cpu_has_sve() && (svcntw() == 8)) && + GGML_ASSERT((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) && "__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal " "performance"); } else if (ggml_cpu_has_neon()) { - GGML_ASSERT(((ggml_cpu_has_sve() && (svcntw() == 8)) || ggml_cpu_has_matmul_int8()) && + GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) || ggml_cpu_has_matmul_int8()) && "__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 " "quantization format for optimal performance"); } diff --git a/llama/ggml-aarch64.h b/llama/ggml-aarch64.h index f00fde74..d8f7fd3a 100644 --- a/llama/ggml-aarch64.h +++ b/llama/ggml-aarch64.h @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-alloc.c b/llama/ggml-alloc.c index ca84d2e9..0bb174f4 100644 --- a/llama/ggml-alloc.c +++ b/llama/ggml-alloc.c @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-alloc.h b/llama/ggml-alloc.h index 676c9695..8c8cb4eb 100644 --- a/llama/ggml-alloc.h +++ b/llama/ggml-alloc.h @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-backend-impl.h b/llama/ggml-backend-impl.h index c44e5b0f..e3a3d335 100644 --- a/llama/ggml-backend-impl.h +++ b/llama/ggml-backend-impl.h @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-backend.c b/llama/ggml-backend.c index ca846cdb..bfc6e10a 100644 --- a/llama/ggml-backend.c +++ b/llama/ggml-backend.c @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-backend.h b/llama/ggml-backend.h index 7950571d..7cf3d939 100644 --- a/llama/ggml-backend.h +++ b/llama/ggml-backend.h @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-common.h b/llama/ggml-common.h index 8ff58bfa..80c80024 100644 --- a/llama/ggml-common.h +++ b/llama/ggml-common.h @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda.cu b/llama/ggml-cuda.cu index a3341229..8b310ae4 100644 --- 
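The reasoning behind swapping svcntw() for ggml_sve_cnt_b throughout the ggml-aarch64.c hunks above, as far as the ARM intrinsics go (a sketch; ggml_sve_cnt_b is presumably cached once at init from the runtime SVE vector length rather than re-queried):

    // svcntw() = number of 32-bit lanes; svcntb() = number of bytes.
    // On a 256-bit SVE implementation:
    //   svcntw() == 8   <=>   svcntb() == 32 == QK8_0
    // so comparing a cached byte count against QK8_0 is equivalent to the old
    // check, without querying the vector length on every GEMV/GEMM call.
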
a/llama/ggml-cuda.cu +++ b/llama/ggml-cuda.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * @@ -156,7 +156,22 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) } return res; #else + +#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) + cudaError_t err; + if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr) + { + err = cudaMallocManaged(ptr, size); + } + else + { + err = cudaMalloc(ptr, size); + } + return err; +#else return cudaMalloc(ptr, size); +#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) + #endif } @@ -1516,7 +1531,7 @@ static void ggml_cuda_op_mul_mat( } // If src0 is on a temporary compute buffers (partial offloading) there may be some padding that needs to be cleared: - if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) { + if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_is_quantized(src0->type) && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) { const int64_t nbytes_data = ggml_row_size(src0->type, (dev[id].row_high - dev[id].row_low)*ne00); const int64_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING); CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data , 0, nbytes_padding, stream)); @@ -1915,10 +1930,9 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer); - bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) + bool use_dequantize_mul_mat_vec = ggml_cuda_dmmv_type_supported(src0->type) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 - && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src0->ne[0] >= GGML_CUDA_DMMV_X*2 - && src1->ne[1] == 1; + && src0->ne[0] % (GGML_CUDA_DMMV_X*2) == 0 && src1->ne[1] == 1; bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE; diff --git a/llama/ggml-cuda.h b/llama/ggml-cuda.h index fce52bf9..d5616d8b 100644 --- a/llama/ggml-cuda.h +++ b/llama/ggml-cuda.h @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/acc.cu b/llama/ggml-cuda/acc.cu index 0f55c157..cb7eb35a 100644 --- a/llama/ggml-cuda/acc.cu +++ b/llama/ggml-cuda/acc.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/acc.cuh b/llama/ggml-cuda/acc.cuh index 519c95c8..126a4d6d 100644 --- a/llama/ggml-cuda/acc.cuh +++ b/llama/ggml-cuda/acc.cuh @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/alibi.cu 
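The GGML_CUDA_ENABLE_UNIFIED_MEMORY branch added to ggml_cuda_device_malloc above is opt-in at run time; a usage sketch (the environment variable name comes from the hunk, the binary name is hypothetical):

    // GGML_CUDA_ENABLE_UNIFIED_MEMORY=1 ./my-llama-binary ...
    //
    // cudaMallocManaged() returns memory the driver can migrate between host
    // and device on demand, which lets allocations oversubscribe VRAM at the
    // cost of slower access than a plain cudaMalloc() allocation.
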
b/llama/ggml-cuda/alibi.cu index 35d276b5..1ee61656 100644 --- a/llama/ggml-cuda/alibi.cu +++ b/llama/ggml-cuda/alibi.cu @@ -1,3 +1,29 @@ +/** + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file + * + * MIT License + * + * Copyright (c) 2023-2024 The ggml authors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + /** * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file * diff --git a/llama/ggml-cuda/alibi.cuh b/llama/ggml-cuda/alibi.cuh index 0d6a3440..98a724a8 100644 --- a/llama/ggml-cuda/alibi.cuh +++ b/llama/ggml-cuda/alibi.cuh @@ -1,3 +1,29 @@ +/** + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file + * + * MIT License + * + * Copyright (c) 2023-2024 The ggml authors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + /** * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file * diff --git a/llama/ggml-cuda/arange.cu b/llama/ggml-cuda/arange.cu index 514c146e..8f9b86e3 100644 --- a/llama/ggml-cuda/arange.cu +++ b/llama/ggml-cuda/arange.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/arange.cuh b/llama/ggml-cuda/arange.cuh index f1d8acc2..10b0a6fc 100644 --- a/llama/ggml-cuda/arange.cuh +++ b/llama/ggml-cuda/arange.cuh @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/argsort.cu b/llama/ggml-cuda/argsort.cu index 1987e87f..796633c6 100644 --- a/llama/ggml-cuda/argsort.cu +++ b/llama/ggml-cuda/argsort.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/argsort.cuh b/llama/ggml-cuda/argsort.cuh index 9189815c..75f434c2 100644 --- a/llama/ggml-cuda/argsort.cuh +++ b/llama/ggml-cuda/argsort.cuh @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/binbcast.cu b/llama/ggml-cuda/binbcast.cu index df396eb2..c8262f3d 100644 --- a/llama/ggml-cuda/binbcast.cu +++ b/llama/ggml-cuda/binbcast.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/binbcast.cuh b/llama/ggml-cuda/binbcast.cuh index e6a48196..74348c78 100644 --- a/llama/ggml-cuda/binbcast.cuh +++ b/llama/ggml-cuda/binbcast.cuh @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/clamp.cu b/llama/ggml-cuda/clamp.cu index 844cb913..cf0a60fd 100644 --- a/llama/ggml-cuda/clamp.cu +++ b/llama/ggml-cuda/clamp.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/clamp.cuh b/llama/ggml-cuda/clamp.cuh index 2d25cb00..c7b6a414 100644 --- a/llama/ggml-cuda/clamp.cuh +++ b/llama/ggml-cuda/clamp.cuh @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/common.cuh b/llama/ggml-cuda/common.cuh index dbd07204..37079ab6 100644 --- a/llama/ggml-cuda/common.cuh +++ b/llama/ggml-cuda/common.cuh @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * @@ -53,255 +53,11 @@ 
#include #if defined(GGML_USE_HIPBLAS) -#include -#include -#include -#ifdef __HIP_PLATFORM_AMD__ -// for rocblas_initialize() -#include "rocblas/rocblas.h" -#endif // __HIP_PLATFORM_AMD__ -#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F -#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F -#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F -#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT -#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT -#define CUBLAS_OP_N HIPBLAS_OP_N -#define CUBLAS_OP_T HIPBLAS_OP_T -#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS -#define CUBLAS_TF32_TENSOR_OP_MATH 0 -#define CUDA_R_16F HIPBLAS_R_16F -#define CUDA_R_32F HIPBLAS_R_32F -#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width) -#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6 -#define cublasCreate hipblasCreate -#define cublasDestroy hipblasDestroy -#define cublasGemmEx hipblasGemmEx -#define cublasGemmBatchedEx hipblasGemmBatchedEx -#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx -#define cublasHandle_t hipblasHandle_t -#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS -#define cublasSetStream hipblasSetStream -#define cublasSgemm hipblasSgemm -#define cublasStatus_t hipblasStatus_t -#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6 -#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer -#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess -#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess -#define cudaDeviceProp hipDeviceProp_t -#define cudaDeviceSynchronize hipDeviceSynchronize -#define cudaError_t hipError_t -#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled -#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled -#define cudaEventCreateWithFlags hipEventCreateWithFlags -#define cudaEventDisableTiming hipEventDisableTiming -#define cudaEventRecord hipEventRecord -#define cudaEventSynchronize hipEventSynchronize -#define cudaEvent_t hipEvent_t -#define cudaEventDestroy hipEventDestroy -#define cudaFree hipFree -#define cudaFreeHost hipHostFree -#define cudaGetDevice hipGetDevice -#define cudaGetDeviceCount hipGetDeviceCount -#define cudaGetDeviceProperties hipGetDeviceProperties -#define cudaGetErrorString hipGetErrorString -#define cudaGetLastError hipGetLastError -#define cudaHostRegister hipHostRegister -#define cudaHostRegisterPortable hipHostRegisterPortable -#define cudaHostRegisterReadOnly hipHostRegisterReadOnly -#define cudaHostUnregister hipHostUnregister -#define cudaLaunchHostFunc hipLaunchHostFunc -#define cudaMalloc hipMalloc -#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault) -#define cudaMemcpy hipMemcpy -#define cudaMemcpyAsync hipMemcpyAsync -#define cudaMemcpyPeerAsync hipMemcpyPeerAsync -#define cudaMemcpy2DAsync hipMemcpy2DAsync -#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice -#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost -#define cudaMemcpyHostToDevice hipMemcpyHostToDevice -#define cudaMemcpyKind hipMemcpyKind -#define cudaMemset hipMemset -#define cudaMemsetAsync hipMemsetAsync -#define cudaMemGetInfo hipMemGetInfo -#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize -#define cudaSetDevice hipSetDevice -#define cudaStreamCreateWithFlags hipStreamCreateWithFlags -#define cudaStreamDestroy hipStreamDestroy -#define cudaStreamFireAndForget hipStreamFireAndForget -#define cudaStreamNonBlocking hipStreamNonBlocking 
-#define cudaStreamPerThread hipStreamPerThread -#define cudaStreamSynchronize hipStreamSynchronize -#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags) -#define cudaStream_t hipStream_t -#define cudaSuccess hipSuccess -#define __trap() do { abort(); __builtin_unreachable(); } while(0) -#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS -#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED -#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED -#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE -#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH -#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR -#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED -#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR -#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED +#include "vendors/hip.h" #elif defined(GGML_USE_MUSA) -#include -#include -#include -#include -// XXX: Keep the following order the same as hipBLAS -// #define CUBLAS_COMPUTE_16F MUBLAS_COMPUTE_16F -// #define CUBLAS_COMPUTE_32F MUBLAS_COMPUTE_32F -#define CUBLAS_COMPUTE_32F_FAST_16F MUBLAS_COMPUTE_32F_FAST_16F -#define CUBLAS_GEMM_DEFAULT MUBLAS_GEMM_DEFAULT -#define CUBLAS_GEMM_DEFAULT_TENSOR_OP MUBLAS_GEMM_DEFAULT -#define CUBLAS_OP_N MUBLAS_OP_N -#define CUBLAS_OP_T MUBLAS_OP_T -#define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS -// #define CUBLAS_TF32_TENSOR_OP_MATH 0 -#define CUDA_R_16F MUSA_R_16F -#define CUDA_R_32F MUSA_R_32F -// #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width) -// #define cublasComputeType_t mublasComputeType_t -#define cublasCreate mublasCreate -#define cublasDestroy mublasDestroy -#define cublasGemmEx mublasGemmEx -#define cublasGemmBatchedEx mublasGemmBatchedEx -#define cublasGemmStridedBatchedEx mublasGemmStridedBatchedEx -#define cublasHandle_t mublasHandle_t -// #define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS -#define cublasSetMathMode mublasSetMathMode -#define cublasSetStream mublasSetStream -#define cublasSgemm mublasSgemm -#define cublasStatus_t mublasStatus_t -#define cudaDataType_t musaDataType_t //deprecated, new hipblasDatatype not in 5.6 -#define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer -#define cudaDeviceDisablePeerAccess musaDeviceDisablePeerAccess -#define cudaDeviceEnablePeerAccess musaDeviceEnablePeerAccess -#define cudaDeviceProp musaDeviceProp -#define cudaDeviceSynchronize musaDeviceSynchronize -#define cudaError_t musaError_t -#define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled -#define cudaErrorPeerAccessNotEnabled musaErrorPeerAccessNotEnabled -#define cudaEventCreateWithFlags musaEventCreateWithFlags -#define cudaEventDisableTiming musaEventDisableTiming -#define cudaEventRecord musaEventRecord -#define cudaEventSynchronize musaEventSynchronize -#define cudaEvent_t musaEvent_t -#define cudaEventDestroy musaEventDestroy -#define cudaFree musaFree -#define cudaFreeHost musaFreeHost -#define cudaGetDevice musaGetDevice -#define cudaGetDeviceCount musaGetDeviceCount -#define cudaGetDeviceProperties musaGetDeviceProperties -#define cudaGetErrorString musaGetErrorString -#define cudaGetLastError musaGetLastError -#define cudaHostRegister musaHostRegister -#define cudaHostRegisterPortable musaHostRegisterPortable -#define cudaHostRegisterReadOnly musaHostRegisterReadOnly -#define cudaHostUnregister musaHostUnregister -#define cudaLaunchHostFunc musaLaunchHostFunc 
-#define cudaMalloc musaMalloc -#define cudaMallocHost musaMallocHost -#define cudaMemcpy musaMemcpy -#define cudaMemcpyAsync musaMemcpyAsync -#define cudaMemcpyPeerAsync musaMemcpyPeerAsync -#define cudaMemcpy2DAsync musaMemcpy2DAsync -#define cudaMemcpyDeviceToDevice musaMemcpyDeviceToDevice -#define cudaMemcpyDeviceToHost musaMemcpyDeviceToHost -#define cudaMemcpyHostToDevice musaMemcpyHostToDevice -#define cudaMemcpyKind musaMemcpyKind -#define cudaMemset musaMemset -#define cudaMemsetAsync musaMemsetAsync -#define cudaMemGetInfo musaMemGetInfo -#define cudaOccupancyMaxPotentialBlockSize musaOccupancyMaxPotentialBlockSize -#define cudaSetDevice musaSetDevice -#define cudaStreamCreateWithFlags musaStreamCreateWithFlags -#define cudaStreamDestroy musaStreamDestroy -#define cudaStreamFireAndForget musaStreamFireAndForget -#define cudaStreamNonBlocking musaStreamNonBlocking -#define cudaStreamPerThread musaStreamPerThread -#define cudaStreamSynchronize musaStreamSynchronize -#define cudaStreamWaitEvent musaStreamWaitEvent -#define cudaStream_t musaStream_t -#define cudaSuccess musaSuccess - -// XXX: Other CUDA => MUSA mapping -#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE MU_MEM_ACCESS_FLAGS_PROT_READWRITE -#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED MU_MEM_ALLOC_GRANULARITY_RECOMMENDED -#define CU_MEM_ALLOCATION_TYPE_PINNED MU_MEM_ALLOCATION_TYPE_PINNED -#define CU_MEM_LOCATION_TYPE_DEVICE MU_MEM_LOCATION_TYPE_DEVICE -#define CUdevice MUdevice -#define CUdeviceptr MUdeviceptr -#define CUmemAccessDesc MUmemAccessDesc -#define CUmemAllocationProp MUmemAllocationProp -#define CUmemGenericAllocationHandle MUmemGenericAllocationHandle -#define cuDeviceGet muDeviceGet -#define cuDeviceGetAttribute muDeviceGetAttribute -#define cuMemAddressFree muMemAddressFree -#define cuMemAddressReserve muMemAddressReserve -#define cuMemCreate muMemCreate -#define cuMemGetAllocationGranularity muMemGetAllocationGranularity -#define cuMemMap muMemMap -#define cuMemRelease muMemRelease -#define cuMemSetAccess muMemSetAccess -#define cuMemUnmap muMemUnmap -#define cudaFuncAttributeMaxDynamicSharedMemorySize musaFuncAttributeMaxDynamicSharedMemorySize -#define cudaFuncSetAttribute musaFuncSetAttribute -#define cudaMemcpy3DPeerParms musaMemcpy3DPeerParms -#define make_cudaExtent make_musaExtent -#define make_cudaPitchedPtr make_musaPitchedPtr - -// XXX: USE_CUDA_GRAPH -#define CUDA_SUCCESS MUSA_SUCCESS -#define CUresult MUresult -#define cuGetErrorString muGetErrorString -#define cudaErrorGraphExecUpdateFailure musaErrorGraphExecUpdateFailure -#define cudaErrorInvalidDeviceFunction musaErrorInvalidDeviceFunction -#define cudaGraphDestroy musaGraphDestroy -#define cudaGraphExecDestroy musaGraphExecDestroy -#define cudaGraphExec_t musaGraphExec_t -#define cudaGraphExecUpdate musaGraphExecUpdate -#define cudaGraphExecUpdateResultInfo musaGraphExecUpdateResult -#define cudaGraphGetNodes musaGraphGetNodes -#define cudaGraphInstantiate musaGraphInstantiate -#define cudaGraphKernelNodeGetParams musaGraphKernelNodeGetParams -#define cudaGraphKernelNodeSetParams musaGraphKernelNodeSetParams -#define cudaGraphLaunch musaGraphLaunch -#define cudaGraphNodeGetType musaGraphNodeGetType -#define cudaGraphNode_t musaGraphNode_t -#define cudaGraphNodeType musaGraphNodeType -#define cudaGraphNodeTypeKernel musaGraphNodeTypeKernel -#define cudaGraph_t musaGraph_t -#define cudaKernelNodeParams musaKernelNodeParams -#define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed -#define cudaStreamEndCapture musaStreamEndCapture - -// 
XXX: cuBLAS => muBLAS mapping -#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED MU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED -#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_MATH_MODE_DEFAULT -#define CUBLAS_COMPUTE_16F CUDA_R_16F -#define CUBLAS_COMPUTE_32F CUDA_R_32F -#define cublasComputeType_t cudaDataType_t - -// XXX: Clang builtins mapping -#define __vsub4 __vsub4_musa -#define __vcmpeq4 __vcmpeq4_musa -#define __vcmpne4 __vcmpne4_musa +#include "vendors/musa.h" #else -#include -#include -#include -#include - -#if CUDART_VERSION < 11020 -#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED -#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH -#define CUBLAS_COMPUTE_16F CUDA_R_16F -#define CUBLAS_COMPUTE_32F CUDA_R_32F -#define cublasComputeType_t cudaDataType_t -#endif // CUDART_VERSION < 11020 - +#include "vendors/cuda.h" #endif // defined(GGML_USE_HIPBLAS) #define STRINGIZE_IMPL(...) #__VA_ARGS__ @@ -344,11 +100,7 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in #if CUDART_VERSION >= 12000 || defined(GGML_USE_MUSA) static const char * cublas_get_error_str(const cublasStatus_t err) { -#ifndef GGML_USE_MUSA return cublasGetStatusString(err); -#else - return mublasStatus_to_string(err); -#endif // GGML_USE_MUSA } #else static const char * cublas_get_error_str(const cublasStatus_t err) { @@ -390,129 +142,7 @@ typedef half2 dfloat2; #else typedef float dfloat; // dequantize float typedef float2 dfloat2; -#endif //GGML_CUDA_F16 - -#if defined(GGML_USE_MUSA) -#ifndef __has_builtin - #define __has_builtin(x) 0 -#endif - -typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4))); - -static __device__ __forceinline__ int __vsub4_musa(const int a, const int b) { - return __vsubss4(a, b); -} - -static __device__ __forceinline__ unsigned int __vcmpeq4_musa(unsigned int a, unsigned int b) { - const uint8x4_t& va = reinterpret_cast(a); - const uint8x4_t& vb = reinterpret_cast(b); - unsigned int c; - uint8x4_t& vc = reinterpret_cast(c); -#pragma unroll - for (int i = 0; i < 4; ++i) { - vc[i] = va[i] == vb[i] ? 0xff : 0x00; - } - return c; -} - -static __device__ __forceinline__ unsigned int __vcmpne4_musa(unsigned int a, unsigned int b) { - const uint8x4_t& va = reinterpret_cast(a); - const uint8x4_t& vb = reinterpret_cast(b); - unsigned int c; - uint8x4_t& vc = reinterpret_cast(c); -#pragma unroll - for (int i = 0; i < 4; ++i) { - vc[i] = va[i] == vb[i] ? 
0x00 : 0xff; - } - return c; -} -#endif // defined(GGML_USE_MUSA) - -#if defined(GGML_USE_HIPBLAS) -#define __CUDA_ARCH__ 1300 - -#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \ - defined(__gfx1150__) || defined(__gfx1151__) -#define RDNA3 -#endif - -#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \ - defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__) -#define RDNA2 -#endif - -#if defined(__gfx1010__) || defined(__gfx1012__) -#define RDNA1 -#endif - -#ifndef __has_builtin - #define __has_builtin(x) 0 -#endif - -typedef int8_t int8x4_t __attribute__((ext_vector_type(4))); -typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4))); -static __device__ __forceinline__ int __vsubss4(const int a, const int b) { - const int8x4_t va = reinterpret_cast(a); - const int8x4_t vb = reinterpret_cast(b); -#if __has_builtin(__builtin_elementwise_sub_sat) - const int8x4_t c = __builtin_elementwise_sub_sat(va, vb); - return reinterpret_cast(c); -#else - int8x4_t c; - int16_t tmp; -#pragma unroll - for (int i = 0; i < 4; i++) { - tmp = va[i] - vb[i]; - if(tmp > std::numeric_limits::max()) tmp = std::numeric_limits::max(); - if(tmp < std::numeric_limits::min()) tmp = std::numeric_limits::min(); - c[i] = tmp; - } - return reinterpret_cast(c); -#endif // __has_builtin(__builtin_elementwise_sub_sat) -} - -static __device__ __forceinline__ int __vsub4(const int a, const int b) { - return __vsubss4(a, b); -} - -static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigned int b) { - const uint8x4_t& va = reinterpret_cast(a); - const uint8x4_t& vb = reinterpret_cast(b); - unsigned int c; - uint8x4_t& vc = reinterpret_cast(c); -#pragma unroll - for (int i = 0; i < 4; ++i) { - vc[i] = va[i] == vb[i] ? 0xff : 0x00; - } - return c; -} - -static __device__ __forceinline__ unsigned int __vcmpne4(unsigned int a, unsigned int b) { - const uint8x4_t& va = reinterpret_cast(a); - const uint8x4_t& vb = reinterpret_cast(b); - unsigned int c; - uint8x4_t& vc = reinterpret_cast(c); -#pragma unroll - for (int i = 0; i < 4; ++i) { - vc[i] = va[i] == vb[i] ? 
0x00 : 0xff; - } - return c; -} - -#if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000 -// __shfl_xor() for half2 was added in ROCm 5.6 -static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int width) { - typedef union half2_b32 { - half2 val; - int b32; - } half2_b32_t; - half2_b32_t tmp; - tmp.val = var; - tmp.b32 = __shfl_xor(tmp.b32, laneMask, width); - return tmp.val; -} -#endif // defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000 -#endif // defined(GGML_USE_HIPBLAS) +#endif // GGML_CUDA_F16 #if (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL #define FP16_AVAILABLE diff --git a/llama/ggml-cuda/concat.cu b/llama/ggml-cuda/concat.cu index e77a1c44..c73b3311 100644 --- a/llama/ggml-cuda/concat.cu +++ b/llama/ggml-cuda/concat.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/concat.cuh b/llama/ggml-cuda/concat.cuh index f2010440..5869b398 100644 --- a/llama/ggml-cuda/concat.cuh +++ b/llama/ggml-cuda/concat.cuh @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/conv-transpose-1d.cu b/llama/ggml-cuda/conv-transpose-1d.cu index 0117a6b7..4124f559 100644 --- a/llama/ggml-cuda/conv-transpose-1d.cu +++ b/llama/ggml-cuda/conv-transpose-1d.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/conv-transpose-1d.cuh b/llama/ggml-cuda/conv-transpose-1d.cuh index 90ed15d0..41990dde 100644 --- a/llama/ggml-cuda/conv-transpose-1d.cuh +++ b/llama/ggml-cuda/conv-transpose-1d.cuh @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/convert.cu b/llama/ggml-cuda/convert.cu index 44a18e53..52d27ea3 100644 --- a/llama/ggml-cuda/convert.cu +++ b/llama/ggml-cuda/convert.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/convert.cuh b/llama/ggml-cuda/convert.cuh index a72f0206..40e15280 100644 --- a/llama/ggml-cuda/convert.cuh +++ b/llama/ggml-cuda/convert.cuh @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/cpy.cu b/llama/ggml-cuda/cpy.cu index d5024659..a4ee847b 100644 --- a/llama/ggml-cuda/cpy.cu +++ b/llama/ggml-cuda/cpy.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/cpy.cuh b/llama/ggml-cuda/cpy.cuh index 9907eb3e..96400051 100644 --- a/llama/ggml-cuda/cpy.cuh +++ b/llama/ggml-cuda/cpy.cuh @@ -1,5 
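Net effect of the common.cuh hunks above: roughly 250 lines of per-vendor includes and #define mappings move out into three new headers, leaving only the dispatch (quoted from the patch; the vendors/*.h contents are the removed blocks):

    #if defined(GGML_USE_HIPBLAS)
    #include "vendors/hip.h"   // former hipBLAS/ROCm mapping block
    #elif defined(GGML_USE_MUSA)
    #include "vendors/musa.h"  // former MUSA mapping block
    #else
    #include "vendors/cuda.h"  // former CUDA includes + CUDART < 11020 shims
    #endif // defined(GGML_USE_HIPBLAS)
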
+1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/dequantize.cuh b/llama/ggml-cuda/dequantize.cuh index 4baf3f59..60f0b66e 100644 --- a/llama/ggml-cuda/dequantize.cuh +++ b/llama/ggml-cuda/dequantize.cuh @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/diagmask.cu b/llama/ggml-cuda/diagmask.cu index 14dbb972..1a6bcbc9 100644 --- a/llama/ggml-cuda/diagmask.cu +++ b/llama/ggml-cuda/diagmask.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/diagmask.cuh b/llama/ggml-cuda/diagmask.cuh index 1ec8e9ba..34f76fe2 100644 --- a/llama/ggml-cuda/diagmask.cuh +++ b/llama/ggml-cuda/diagmask.cuh @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/dmmv.cu b/llama/ggml-cuda/dmmv.cu index feb9bf80..a606493d 100644 --- a/llama/ggml-cuda/dmmv.cu +++ b/llama/ggml-cuda/dmmv.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * @@ -526,7 +526,7 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons } static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead const dim3 block_nums(block_num_y, 1, 1); @@ -536,7 +536,7 @@ static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, } static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(block_num_y, 1, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); @@ -545,7 +545,7 @@ static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, } static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(block_num_y, 1, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); @@ -554,7 +554,7 @@ static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, } static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int 
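The tightened asserts in the dmmv.cu hunks fold the old two-part dispatch condition into a single divisibility check; with GGML_CUDA_DMMV_X at its default of 32 (unless overridden at build time), this works out as follows (a reasoning sketch):

    // old: ncols % GGML_CUDA_DMMV_X == 0        i.e. ncols % 32 == 0
    //      (plus ncols >= GGML_CUDA_DMMV_X*2 checked separately at dispatch)
    // new: ncols % (GGML_CUDA_DMMV_X*2) == 0    i.e. ncols % 64 == 0
    // so every dmmv launch now requires column counts that are multiples of 64.
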
ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(block_num_y, 1, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); @@ -563,7 +563,7 @@ static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, } static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(block_num_y, 1, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); @@ -614,7 +614,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f } static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(block_num_y, 1, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); @@ -698,3 +698,12 @@ void ggml_cuda_op_dequantize_mul_mat_vec( GGML_UNUSED(src1_ncols); GGML_UNUSED(src1_padded_row_size); } + +bool ggml_cuda_dmmv_type_supported(ggml_type src0_type) { + return src0_type == GGML_TYPE_Q4_0 || src0_type == GGML_TYPE_Q4_1 || + src0_type == GGML_TYPE_Q5_0 || src0_type == GGML_TYPE_Q5_1 || + src0_type == GGML_TYPE_Q8_0 || src0_type == GGML_TYPE_Q2_K || + src0_type == GGML_TYPE_Q3_K || src0_type == GGML_TYPE_Q4_K || + src0_type == GGML_TYPE_Q5_K || src0_type == GGML_TYPE_Q6_K || + src0_type == GGML_TYPE_F16; +} diff --git a/llama/ggml-cuda/dmmv.cuh b/llama/ggml-cuda/dmmv.cuh index be2b3fa6..c68aa1d1 100644 --- a/llama/ggml-cuda/dmmv.cuh +++ b/llama/ggml-cuda/dmmv.cuh @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * @@ -42,3 +42,5 @@ void ggml_cuda_op_dequantize_mul_mat_vec( const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, const int64_t src1_padded_row_size, cudaStream_t stream); + +bool ggml_cuda_dmmv_type_supported(ggml_type src0_type); diff --git a/llama/ggml-cuda/fattn-common.cuh b/llama/ggml-cuda/fattn-common.cuh index cba14ae2..ac1937a9 100644 --- a/llama/ggml-cuda/fattn-common.cuh +++ b/llama/ggml-cuda/fattn-common.cuh @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/fattn-tile-f16.cu b/llama/ggml-cuda/fattn-tile-f16.cu index a4fc2127..4718d05e 100644 --- a/llama/ggml-cuda/fattn-tile-f16.cu +++ b/llama/ggml-cuda/fattn-tile-f16.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/fattn-tile-f16.cuh 
b/llama/ggml-cuda/fattn-tile-f16.cuh index c48c863d..fd9d0e22 100644 --- a/llama/ggml-cuda/fattn-tile-f16.cuh +++ b/llama/ggml-cuda/fattn-tile-f16.cuh @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/fattn-tile-f32.cu b/llama/ggml-cuda/fattn-tile-f32.cu index 49c1ec56..7c8b975c 100644 --- a/llama/ggml-cuda/fattn-tile-f32.cu +++ b/llama/ggml-cuda/fattn-tile-f32.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/fattn-tile-f32.cuh b/llama/ggml-cuda/fattn-tile-f32.cuh index 87c48525..67ae64e7 100644 --- a/llama/ggml-cuda/fattn-tile-f32.cuh +++ b/llama/ggml-cuda/fattn-tile-f32.cuh @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/fattn-vec-f16.cuh b/llama/ggml-cuda/fattn-vec-f16.cuh index 496535c1..b89d7200 100644 --- a/llama/ggml-cuda/fattn-vec-f16.cuh +++ b/llama/ggml-cuda/fattn-vec-f16.cuh @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/fattn-vec-f32.cuh b/llama/ggml-cuda/fattn-vec-f32.cuh index 1517ac72..143d1a28 100644 --- a/llama/ggml-cuda/fattn-vec-f32.cuh +++ b/llama/ggml-cuda/fattn-vec-f32.cuh @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/fattn-wmma-f16.cuh b/llama/ggml-cuda/fattn-wmma-f16.cuh index ce74f71d..1b5701bd 100644 --- a/llama/ggml-cuda/fattn-wmma-f16.cuh +++ b/llama/ggml-cuda/fattn-wmma-f16.cuh @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/fattn.cu b/llama/ggml-cuda/fattn.cu index 511e19d4..dc0652d1 100644 --- a/llama/ggml-cuda/fattn.cu +++ b/llama/ggml-cuda/fattn.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/fattn.cuh b/llama/ggml-cuda/fattn.cuh index e04eefbc..fab654c7 100644 --- a/llama/ggml-cuda/fattn.cuh +++ b/llama/ggml-cuda/fattn.cuh @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/getrows.cu b/llama/ggml-cuda/getrows.cu index 87b09d8b..a6c96599 100644 --- a/llama/ggml-cuda/getrows.cu +++ b/llama/ggml-cuda/getrows.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff 
--git a/llama/ggml-cuda/getrows.cuh b/llama/ggml-cuda/getrows.cuh index 0700d3a6..3173d5ce 100644 --- a/llama/ggml-cuda/getrows.cuh +++ b/llama/ggml-cuda/getrows.cuh @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/im2col.cu b/llama/ggml-cuda/im2col.cu index 574e641b..3ee4b485 100644 --- a/llama/ggml-cuda/im2col.cu +++ b/llama/ggml-cuda/im2col.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/im2col.cuh b/llama/ggml-cuda/im2col.cuh index ca3d91f0..592125e1 100644 --- a/llama/ggml-cuda/im2col.cuh +++ b/llama/ggml-cuda/im2col.cuh @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/mma.cuh b/llama/ggml-cuda/mma.cuh index 2e7fff79..552e5082 100644 --- a/llama/ggml-cuda/mma.cuh +++ b/llama/ggml-cuda/mma.cuh @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/mmq.cu b/llama/ggml-cuda/mmq.cu index a5046bf1..2ad38cfd 100644 --- a/llama/ggml-cuda/mmq.cu +++ b/llama/ggml-cuda/mmq.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/mmq.cuh b/llama/ggml-cuda/mmq.cuh index ab18ee1f..fc6ff807 100644 --- a/llama/ggml-cuda/mmq.cuh +++ b/llama/ggml-cuda/mmq.cuh @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/mmvq.cu b/llama/ggml-cuda/mmvq.cu index f693109a..d6ffc0ec 100644 --- a/llama/ggml-cuda/mmvq.cu +++ b/llama/ggml-cuda/mmvq.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/mmvq.cuh b/llama/ggml-cuda/mmvq.cuh index c76123b1..5d311870 100644 --- a/llama/ggml-cuda/mmvq.cuh +++ b/llama/ggml-cuda/mmvq.cuh @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/norm.cu b/llama/ggml-cuda/norm.cu index f27c597f..dafbaf3f 100644 --- a/llama/ggml-cuda/norm.cu +++ b/llama/ggml-cuda/norm.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * @@ -168,8 +168,7 @@ static void norm_f32_cuda(const float * x, float * dst, const int ncols, const i } } -static void group_norm_f32_cuda(const float * x, float * dst, const int num_groups, const int 
group_size, const int ne_elements, cudaStream_t stream) { - static const float eps = 1e-6f; +static void group_norm_f32_cuda(const float * x, float * dst, const int num_groups, const float eps, const int group_size, const int ne_elements, cudaStream_t stream) { if (group_size < 1024) { const dim3 block_dims(WARP_SIZE, 1, 1); group_norm_f32<WARP_SIZE><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps); @@ -222,8 +221,12 @@ void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) GGML_ASSERT( dst->type == GGML_TYPE_F32); int num_groups = dst->op_params[0]; + + float eps; + memcpy(&eps, dst->op_params + 1, sizeof(float)); + int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups); - group_norm_f32_cuda(src0_d, dst_d, num_groups * src0->ne[3], group_size, ggml_nelements(src0), stream); + group_norm_f32_cuda(src0_d, dst_d, num_groups * src0->ne[3], eps, group_size, ggml_nelements(src0), stream); } void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
diff --git a/llama/ggml-cuda/norm.cuh b/llama/ggml-cuda/norm.cuh index cd20016a..9584aab0 100644 --- a/llama/ggml-cuda/norm.cuh +++ b/llama/ggml-cuda/norm.cuh @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License *
diff --git a/llama/ggml-cuda/pad.cu b/llama/ggml-cuda/pad.cu index 38abb23e..7ddbe97c 100644 --- a/llama/ggml-cuda/pad.cu +++ b/llama/ggml-cuda/pad.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License *
diff --git a/llama/ggml-cuda/pad.cuh b/llama/ggml-cuda/pad.cuh index 33b5f1b6..79a760e4 100644 --- a/llama/ggml-cuda/pad.cuh +++ b/llama/ggml-cuda/pad.cuh @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License *
diff --git a/llama/ggml-cuda/pool2d.cu b/llama/ggml-cuda/pool2d.cu index f14bdd35..6fe947b5 100644 --- a/llama/ggml-cuda/pool2d.cu +++ b/llama/ggml-cuda/pool2d.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License *
diff --git a/llama/ggml-cuda/pool2d.cuh b/llama/ggml-cuda/pool2d.cuh index 3a680462..b24f68bb 100644 --- a/llama/ggml-cuda/pool2d.cuh +++ b/llama/ggml-cuda/pool2d.cuh @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License *
diff --git a/llama/ggml-cuda/quantize.cu b/llama/ggml-cuda/quantize.cu index 6c5b6f9f..3e310e97 100644 --- a/llama/ggml-cuda/quantize.cu +++ b/llama/ggml-cuda/quantize.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License *
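
(Aside, not part of the patch: the ggml_cuda_op_group_norm hunk above leans on a ggml convention that is easy to miss. op_params is an array of int32 slots, so a float parameter such as the group-norm epsilon is bit-copied in and out with memcpy rather than converted through a cast, which would change its value. Below is a minimal standalone C++ sketch of that round trip; the slot layout, num_groups in op_params[0] and eps in op_params[1], matches the hunk, and everything else is illustrative.)

#include <cstdint>
#include <cstring>
#include <cstdio>

int main() {
    // Stand-in for ggml's op_params: a small array of int32 slots.
    int32_t op_params[4] = {0};

    // Pack: the int goes in directly, the float is bit-copied into a slot.
    const int32_t num_groups = 32;     // illustrative value
    const float   eps_in     = 1e-6f;  // illustrative value
    op_params[0] = num_groups;
    memcpy(op_params + 1, &eps_in, sizeof(float));

    // Unpack, mirroring what the hunk above does on the CUDA side.
    float eps;
    memcpy(&eps, op_params + 1, sizeof(float));

    printf("num_groups = %d, eps = %g\n", op_params[0], eps);
    return 0;
}

(The memcpy matters: a plain cast would convert the slot's integer value numerically instead of reinterpreting its bits, and on the write side the float would be truncated toward zero.)

diff --git a/llama/ggml-cuda/quantize.cuh b/llama/ggml-cuda/quantize.cuh index f533e30e..4c046dd8 100644 --- a/llama/ggml-cuda/quantize.cuh +++ b/llama/ggml-cuda/quantize.cuh @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this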
file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/rope.cu b/llama/ggml-cuda/rope.cu index 5046697c..11f0d459 100644 --- a/llama/ggml-cuda/rope.cu +++ b/llama/ggml-cuda/rope.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/rope.cuh b/llama/ggml-cuda/rope.cuh index aa34b1df..1f3b1c0b 100644 --- a/llama/ggml-cuda/rope.cuh +++ b/llama/ggml-cuda/rope.cuh @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/scale.cu b/llama/ggml-cuda/scale.cu index e2d849e0..41b72f6e 100644 --- a/llama/ggml-cuda/scale.cu +++ b/llama/ggml-cuda/scale.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/scale.cuh b/llama/ggml-cuda/scale.cuh index 4c0dc83f..508f8a11 100644 --- a/llama/ggml-cuda/scale.cuh +++ b/llama/ggml-cuda/scale.cuh @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/softmax.cu b/llama/ggml-cuda/softmax.cu index db94d7de..5793a3d0 100644 --- a/llama/ggml-cuda/softmax.cu +++ b/llama/ggml-cuda/softmax.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/softmax.cuh b/llama/ggml-cuda/softmax.cuh index ac4e2914..08f99632 100644 --- a/llama/ggml-cuda/softmax.cuh +++ b/llama/ggml-cuda/softmax.cuh @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/sumrows.cu b/llama/ggml-cuda/sumrows.cu index a6b8f720..4c55bdcb 100644 --- a/llama/ggml-cuda/sumrows.cu +++ b/llama/ggml-cuda/sumrows.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/sumrows.cuh b/llama/ggml-cuda/sumrows.cuh index 9b8c9cd6..b3b5cd3f 100644 --- a/llama/ggml-cuda/sumrows.cuh +++ b/llama/ggml-cuda/sumrows.cuh @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu index 05196989..7b9acc47 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 
6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu index fd02735b..6d2f5b2b 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu index 5fdcd8e4..036e25a4 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu index e032d0b3..cdc846c2 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu index 6c89d944..42b9553e 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu index b5326ec7..0d79f2ad 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu index c654b9d9..73e0683c 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 
1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu index 3eeed729..e21dc472 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu index 4c8b8e7e..4307b7ef 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu index ed93bda8..8782139f 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu index dd7a6ed9..c01dc3d5 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu index f13cbabb..2c39ee12 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu index c50660d2..ffdb9752 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT 
License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu index a32ba4e0..48566bc0 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu index 117c686d..adb54260 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu index 83b169e4..6ba20855 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu index 44883202..e7fa59e8 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu index ea964906..e3b85fa4 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu index 488ff9a6..308dbaaa 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git 
a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu index 1a0449a2..2a804981 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu index b1a2723e..ce1a7ab9 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu index 74f18b63..42549ea2 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu index d6350bec..a211f509 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu index 5ecc0c48..724cf117 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu index 641d6a04..92e6c590 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git 
a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu index 7615d691..b53d0b40 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu index c5755ff3..0e37890a 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu index 375f370e..725904ab 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu index 555eba19..283daffd 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu index 29982a4c..0a4b699b 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu index cd8b538a..5f00fac6 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git 
a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu index a102886a..03e828b4 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu index 700c84a8..c0ba74e4 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu index acf305d4..7c355e7e 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu index c29b8262..e589492c 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu index 5b96efad..dc25793c 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu index 2c4a76ad..7c1b8672 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu 
b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu index 6a4a424c..5692cf32 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu index 949cba5f..3a6d82a4 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu index 7e360e14..824a9077 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu index afb4d80e..0f8bb517 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu index aa39aefb..0677d0a0 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu index 78bc0019..ae606891 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu index 35f772f1..224235f8 100644 --- 
a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu index 6afb111e..32a2a026 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu index 03a69b8c..0aae0de5 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu index 59ad9cd8..4254ae4d 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu index cd84c81e..4c70e71f 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu index 6ef8b30f..a90d0f09 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu index 1cdf9601..75a6b6dc 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +++ 
b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu index 092b6757..387046e6 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu index 5fd20888..b90165a8 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu index 7fd85f46..c9ab4302 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu index 39d5f402..05786903 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu index 5dd34807..3ae05fe7 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu index 8fa2a892..98738e83 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +++ 
b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu index 74a935f6..2a4b87ea 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu index 9c336952..97867cd0 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu index c1691913..4885065f 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu index ddb6f5c4..6be68bd4 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu index 460e0501..eccf3e84 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu index 788346ed..3400e197 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +++ 
b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu index dfb2a12d..60061afb 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu index 4b9848d5..4b1a250c 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu index 141a3c0d..03c09c1e 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu index 5e9736b8..68d9cb38 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu index 6027c480..df587ef4 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu index d766d427..161acbb9 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +++ 
b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu index 3af17ada..5f262c61 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu index 28ce6f86..90327e26 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu index 5dc4609e..cb67631e 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu index bd97d45e..e79cac28 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu index 7d0b363e..12a2e0ab 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu index 92ee4c0f..3a83cddc 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +++ 
b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu index 51fde074..b902e3dc 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu index 235e3872..aaba5d45 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu index dc3715d7..785200ed 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu index a5b4241f..cbbf0726 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu index 9a2fe54a..2e902550 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu index 5d8153e2..3f24e03c 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +++ 
b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu index 73102eaf..38f42660 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu index 2f1a60bc..4850c544 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu index 5c2395be..10a82b72 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu index e038d84f..ad160596 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu index 832789fa..ba59ca88 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu index c5b27e37..41fc980a 100644 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 
6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu b/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu index 2f34c8fb..0fdb4990 100644 --- a/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +++ b/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu b/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu index f443658e..b48eb117 100644 --- a/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +++ b/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu b/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu index 3e1304de..263d0428 100644 --- a/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +++ b/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu b/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu index d7c6d597..e2a0252d 100644 --- a/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +++ b/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu b/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu index 6bc3dc3f..e477e099 100644 --- a/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +++ b/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/mmq-instance-iq1_s.cu b/llama/ggml-cuda/template-instances/mmq-instance-iq1_s.cu index 7b484e65..f6be840f 100644 --- a/llama/ggml-cuda/template-instances/mmq-instance-iq1_s.cu +++ b/llama/ggml-cuda/template-instances/mmq-instance-iq1_s.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git 
a/llama/ggml-cuda/template-instances/mmq-instance-iq2_s.cu b/llama/ggml-cuda/template-instances/mmq-instance-iq2_s.cu index 445791db..5e1a54bc 100644 --- a/llama/ggml-cuda/template-instances/mmq-instance-iq2_s.cu +++ b/llama/ggml-cuda/template-instances/mmq-instance-iq2_s.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu b/llama/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu index 4f7eb4ba..a2c15684 100644 --- a/llama/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu +++ b/llama/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu b/llama/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu index bb1a3adb..c255311c 100644 --- a/llama/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu +++ b/llama/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/mmq-instance-iq3_s.cu b/llama/ggml-cuda/template-instances/mmq-instance-iq3_s.cu index 01affe46..192de6e6 100644 --- a/llama/ggml-cuda/template-instances/mmq-instance-iq3_s.cu +++ b/llama/ggml-cuda/template-instances/mmq-instance-iq3_s.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu b/llama/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu index badd19cf..bdf93bd6 100644 --- a/llama/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu +++ b/llama/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/mmq-instance-iq4_nl.cu b/llama/ggml-cuda/template-instances/mmq-instance-iq4_nl.cu index e79360f9..d72ed1e6 100644 --- a/llama/ggml-cuda/template-instances/mmq-instance-iq4_nl.cu +++ b/llama/ggml-cuda/template-instances/mmq-instance-iq4_nl.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/mmq-instance-iq4_xs.cu b/llama/ggml-cuda/template-instances/mmq-instance-iq4_xs.cu index fa75948f..479303ce 100644 --- a/llama/ggml-cuda/template-instances/mmq-instance-iq4_xs.cu +++ b/llama/ggml-cuda/template-instances/mmq-instance-iq4_xs.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git 
a/llama/ggml-cuda/template-instances/mmq-instance-q2_k.cu b/llama/ggml-cuda/template-instances/mmq-instance-q2_k.cu index cb3d2b14..fff6937a 100644 --- a/llama/ggml-cuda/template-instances/mmq-instance-q2_k.cu +++ b/llama/ggml-cuda/template-instances/mmq-instance-q2_k.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/mmq-instance-q3_k.cu b/llama/ggml-cuda/template-instances/mmq-instance-q3_k.cu index 3afd2877..7fcee96a 100644 --- a/llama/ggml-cuda/template-instances/mmq-instance-q3_k.cu +++ b/llama/ggml-cuda/template-instances/mmq-instance-q3_k.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/mmq-instance-q4_0.cu b/llama/ggml-cuda/template-instances/mmq-instance-q4_0.cu index e6fcb3d5..326d23ce 100644 --- a/llama/ggml-cuda/template-instances/mmq-instance-q4_0.cu +++ b/llama/ggml-cuda/template-instances/mmq-instance-q4_0.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/mmq-instance-q4_1.cu b/llama/ggml-cuda/template-instances/mmq-instance-q4_1.cu index e8c23dae..da40b380 100644 --- a/llama/ggml-cuda/template-instances/mmq-instance-q4_1.cu +++ b/llama/ggml-cuda/template-instances/mmq-instance-q4_1.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/mmq-instance-q4_k.cu b/llama/ggml-cuda/template-instances/mmq-instance-q4_k.cu index 1b106850..41e76496 100644 --- a/llama/ggml-cuda/template-instances/mmq-instance-q4_k.cu +++ b/llama/ggml-cuda/template-instances/mmq-instance-q4_k.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/mmq-instance-q5_0.cu b/llama/ggml-cuda/template-instances/mmq-instance-q5_0.cu index d17d2636..dc835624 100644 --- a/llama/ggml-cuda/template-instances/mmq-instance-q5_0.cu +++ b/llama/ggml-cuda/template-instances/mmq-instance-q5_0.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/mmq-instance-q5_1.cu b/llama/ggml-cuda/template-instances/mmq-instance-q5_1.cu index e0f6b4ad..63622b42 100644 --- a/llama/ggml-cuda/template-instances/mmq-instance-q5_1.cu +++ b/llama/ggml-cuda/template-instances/mmq-instance-q5_1.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/mmq-instance-q5_k.cu 
b/llama/ggml-cuda/template-instances/mmq-instance-q5_k.cu index cc50ae8d..ec34a691 100644 --- a/llama/ggml-cuda/template-instances/mmq-instance-q5_k.cu +++ b/llama/ggml-cuda/template-instances/mmq-instance-q5_k.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/mmq-instance-q6_k.cu b/llama/ggml-cuda/template-instances/mmq-instance-q6_k.cu index 66cd6c91..baf377ae 100644 --- a/llama/ggml-cuda/template-instances/mmq-instance-q6_k.cu +++ b/llama/ggml-cuda/template-instances/mmq-instance-q6_k.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/template-instances/mmq-instance-q8_0.cu b/llama/ggml-cuda/template-instances/mmq-instance-q8_0.cu index ac2f5322..e5077fc1 100644 --- a/llama/ggml-cuda/template-instances/mmq-instance-q8_0.cu +++ b/llama/ggml-cuda/template-instances/mmq-instance-q8_0.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/tsembd.cu b/llama/ggml-cuda/tsembd.cu index 3feed02b..b295567e 100644 --- a/llama/ggml-cuda/tsembd.cu +++ b/llama/ggml-cuda/tsembd.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/tsembd.cuh b/llama/ggml-cuda/tsembd.cuh index cbfd942e..11334f64 100644 --- a/llama/ggml-cuda/tsembd.cuh +++ b/llama/ggml-cuda/tsembd.cuh @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/unary.cu b/llama/ggml-cuda/unary.cu index db9fa38d..c23e6774 100644 --- a/llama/ggml-cuda/unary.cu +++ b/llama/ggml-cuda/unary.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/unary.cuh b/llama/ggml-cuda/unary.cuh index 3d4a675b..ab78ec6a 100644 --- a/llama/ggml-cuda/unary.cuh +++ b/llama/ggml-cuda/unary.cuh @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/upscale.cu b/llama/ggml-cuda/upscale.cu index 4e5e614f..49ebe347 100644 --- a/llama/ggml-cuda/upscale.cu +++ b/llama/ggml-cuda/upscale.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/upscale.cuh b/llama/ggml-cuda/upscale.cuh index e3951934..73fa0d37 100644 --- a/llama/ggml-cuda/upscale.cuh +++ b/llama/ggml-cuda/upscale.cuh @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 
6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-cuda/vecdotq.cuh b/llama/ggml-cuda/vecdotq.cuh index 97360639..b0cf4b1a 100644 --- a/llama/ggml-cuda/vecdotq.cuh +++ b/llama/ggml-cuda/vecdotq.cuh @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-impl.h b/llama/ggml-impl.h index 80ca886d..467c4cc7 100644 --- a/llama/ggml-impl.h +++ b/llama/ggml-impl.h @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * @@ -106,8 +106,9 @@ static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) { /** * Converts float32 to brain16. * - * This function is binary identical to AMD Zen4 VCVTNEPS2BF16. - * Subnormals shall be flushed to zero, and NANs will be quiet. + * This is binary identical with Google Brain float conversion. + * Floats shall round to nearest even, and NANs shall be quiet. + * Subnormals aren't flushed to zero, except perhaps when used. * This code should vectorize nicely if using modern compilers. */ static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) { @@ -121,10 +122,6 @@ static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) { h.bits = (u.i >> 16) | 64; /* force to quiet */ return h; } - if (!(u.i & 0x7f800000)) { /* subnormal */ - h.bits = (u.i & 0x80000000) >> 16; /* flush to zero */ - return h; - } h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16; return h; } @@ -172,6 +169,7 @@ extern "C" { #if defined(__ARM_FEATURE_SVE) #include <arm_sve.h> +#include <sys/prctl.h> #endif // 16-bit float diff --git a/llama/ggml-metal-darwin_arm64.m b/llama/ggml-metal-darwin_arm64.m index 67b638ac..8d705b9c 100644 --- a/llama/ggml-metal-darwin_arm64.m +++ b/llama/ggml-metal-darwin_arm64.m @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * @@ -2251,10 +2251,8 @@ static enum ggml_status ggml_metal_graph_compute( GGML_ASSERT(ne00 % 4 == 0); GGML_ASSERT(ggml_is_contiguous(src0)); - //float eps; - //memcpy(&eps, dst->op_params, sizeof(float)); - - const float eps = 1e-6f; // TODO: temporarily hardcoded + float eps; + memcpy(&eps, dst->op_params + 1, sizeof(float)); const int32_t n_groups = ((int32_t *) dst->op_params)[0]; diff --git a/llama/ggml-metal.h b/llama/ggml-metal.h index be606ecd..906ca235 100644 --- a/llama/ggml-metal.h +++ b/llama/ggml-metal.h @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-metal.metal b/llama/ggml-metal.metal index 287ff1ce..5e363c19 100644 --- a/llama/ggml-metal.metal +++ b/llama/ggml-metal.metal @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/ggml-quants.c b/llama/ggml-quants.c index 81d64d19..93552afd 100644 --- a/llama/ggml-quants.c +++ b/llama/ggml-quants.c @@
-1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * @@ -3844,7 +3844,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r float sumf = 0; #if defined(__ARM_FEATURE_SVE) - if (svcntb() == QK8_0) { + if (ggml_sve_cnt_b == QK8_0) { const svbool_t ptrueh = svptrue_pat_b8(SV_VL16); const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh); @@ -5329,7 +5329,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r float sumf = 0; #if defined(__ARM_FEATURE_SVE) - if (svcntb() == QK8_0) { + if (ggml_sve_cnt_b == QK8_0) { svfloat32_t sumv0 = svdup_n_f32(0.0f); svfloat32_t sumv1 = svdup_n_f32(0.0f); @@ -6475,22 +6475,22 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r // compute mask for subtraction vuint8m1_t qh_m0 = __riscv_vand_vx_u8m1(vqh, m, vl); vbool8_t vmask_0 = __riscv_vmseq_vx_u8m1_b8(qh_m0, 0, vl); - vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_m(vmask_0, q3_0, 0x4, vl); + vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_mu(vmask_0, q3_0, q3_0, 0x4, vl); m <<= 1; vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl); vbool8_t vmask_1 = __riscv_vmseq_vx_u8m1_b8(qh_m1, 0, vl); - vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_m(vmask_1, q3_1, 0x4, vl); + vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_mu(vmask_1, q3_1, q3_1, 0x4, vl); m <<= 1; vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl); vbool8_t vmask_2 = __riscv_vmseq_vx_u8m1_b8(qh_m2, 0, vl); - vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_m(vmask_2, q3_2, 0x4, vl); + vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_mu(vmask_2, q3_2, q3_2, 0x4, vl); m <<= 1; vuint8m1_t qh_m3 = __riscv_vand_vx_u8m1(vqh, m, vl); vbool8_t vmask_3 = __riscv_vmseq_vx_u8m1_b8(qh_m3, 0, vl); - vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_m(vmask_3, q3_3, 0x4, vl); + vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_mu(vmask_3, q3_3, q3_3, 0x4, vl); m <<= 1; // load Q8 and take product with Q3 @@ -7746,13 +7746,13 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r vint8m1_t q5_a = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q5_x, 0x0F, vl)); vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl); vbool8_t vmask_1 = __riscv_vmsne_vx_u8m1_b8(qh_m1, 0, vl); - vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_m(vmask_1, q5_a, 16, vl); + vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_mu(vmask_1, q5_a, q5_a, 16, vl); m <<= 1; vint8m1_t q5_l = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q5_x, 0x04, vl)); vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl); vbool8_t vmask_2 = __riscv_vmsne_vx_u8m1_b8(qh_m2, 0, vl); - vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_m(vmask_2, q5_l, 16, vl); + vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_mu(vmask_2, q5_l, q5_l, 16, vl); m <<= 1; vint16m2_t v0 = __riscv_vwmul_vv_i16m2(q5_m1, q8_y1, vl); diff --git a/llama/ggml-quants.h b/llama/ggml-quants.h index 39ece43c..26d494e9 100644 --- a/llama/ggml-quants.h +++ b/llama/ggml-quants.h @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * @@ -153,6 +153,10 @@ void iq2xs_free_impl(enum ggml_type type); void iq3xs_init_impl(int grid_size); void iq3xs_free_impl(int grid_size); +#if defined(__ARM_FEATURE_SVE) +extern int ggml_sve_cnt_b; +#endif + #ifdef __cplusplus } #endif diff --git a/llama/ggml.c b/llama/ggml.c index 
e7822f91..d4f84e51 100644 --- a/llama/ggml.c +++ b/llama/ggml.c @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * @@ -63,6 +63,9 @@ #include #endif +#if defined(__ARM_FEATURE_SVE) +int ggml_sve_cnt_b = 0; +#endif #if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8) #undef GGML_USE_LLAMAFILE #endif @@ -167,7 +170,51 @@ typedef pthread_t ggml_thread_t; #include -#if defined(__linux__) +#if defined(__ANDROID__) +#include <unwind.h> +#include <dlfcn.h> +#include <stdio.h> + +struct backtrace_state { + void ** current; + void ** end; +}; + +static _Unwind_Reason_Code unwind_callback(struct _Unwind_Context* context, void* arg) { + struct backtrace_state * state = (struct backtrace_state *)arg; + uintptr_t pc = _Unwind_GetIP(context); + if (pc) { + if (state->current == state->end) { + return _URC_END_OF_STACK; + } else { + *state->current++ = (void*)pc; + } + } + return _URC_NO_REASON; +} + +static void ggml_print_backtrace_symbols(void) { + const int max = 100; + void* buffer[max]; + + struct backtrace_state state = {buffer, buffer + max}; + _Unwind_Backtrace(unwind_callback, &state); + + int count = state.current - buffer; + + for (int idx = 0; idx < count; ++idx) { + const void * addr = buffer[idx]; + const char * symbol = ""; + + Dl_info info; + if (dladdr(addr, &info) && info.dli_sname) { + symbol = info.dli_sname; + } + + fprintf(stderr, "%d: %p %s\n", idx, addr, symbol); + } +} +#elif defined(__linux__) && defined(__GLIBC__) #include <execinfo.h> static void ggml_print_backtrace_symbols(void) { void * trace[100]; @@ -462,9 +509,16 @@ void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) { } } +void ggml_fp32_to_bf16_row_ref(const float * x, ggml_bf16_t * y, int64_t n) { + for (int i = 0; i < n; i++) { + y[i] = ggml_compute_fp32_to_bf16(x[i]); + } +} + void ggml_fp32_to_bf16_row(const float * x, ggml_bf16_t * y, int64_t n) { int i = 0; #if defined(__AVX512BF16__) + // subnormals are flushed to zero on this platform for (; i + 32 <= n; i += 32) { _mm512_storeu_si512( (__m512i *)(y + i), @@ -944,7 +998,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .is_quantized = false, .to_float = (ggml_to_float_t) ggml_bf16_to_fp32_row, .from_float = (ggml_from_float_t) ggml_fp32_to_bf16_row, - .from_float_ref = (ggml_from_float_t) ggml_fp32_to_bf16_row, + .from_float_ref = (ggml_from_float_t) ggml_fp32_to_bf16_row_ref, .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_bf16, .vec_dot_type = GGML_TYPE_BF16, .nrows = 1, @@ -2284,7 +2338,7 @@ inline static void ggml_vec_abs_f32 (const int n, float * y, const float * x) { inline static void ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); } inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; } inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); } -inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; } +inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ?
x[i] : expm1f(x[i]); } inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; } inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); } inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); } @@ -3533,6 +3587,12 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { GGML_ASSERT_ALIGNED(ctx->mem_buffer); +#if defined(__ARM_FEATURE_SVE) + if (!ggml_sve_cnt_b) { + ggml_sve_cnt_b = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL); + } +#endif + GGML_PRINT_DEBUG("%s: context initialized\n", __func__); ggml_critical_section_end(); @@ -5340,6 +5400,7 @@ static struct ggml_tensor * ggml_group_norm_impl( struct ggml_context * ctx, struct ggml_tensor * a, int n_groups, + float eps, bool inplace) { bool is_node = false; @@ -5350,7 +5411,8 @@ static struct ggml_tensor * ggml_group_norm_impl( struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - result->op_params[0] = n_groups; + ggml_set_op_params_i32(result, 0, n_groups); + ggml_set_op_params_f32(result, 1, eps); result->op = GGML_OP_GROUP_NORM; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -5362,15 +5424,17 @@ static struct ggml_tensor * ggml_group_norm_impl( struct ggml_tensor * ggml_group_norm( struct ggml_context * ctx, struct ggml_tensor * a, - int n_groups) { - return ggml_group_norm_impl(ctx, a, n_groups, false); + int n_groups, + float eps) { + return ggml_group_norm_impl(ctx, a, n_groups, eps, false); } struct ggml_tensor * ggml_group_norm_inplace( struct ggml_context * ctx, struct ggml_tensor * a, - int n_groups) { - return ggml_group_norm_impl(ctx, a, n_groups, true); + int n_groups, + float eps) { + return ggml_group_norm_impl(ctx, a, n_groups, eps, true); } // ggml_mul_mat @@ -12061,10 +12125,11 @@ static void ggml_compute_forward_group_norm_f32( GGML_TENSOR_UNARY_OP_LOCALS - const float eps = 1e-6f; // TODO: make this a parameter - // TODO: optimize + float eps; + memcpy(&eps, dst->op_params + 1, sizeof(float)); + int n_channels = src0->ne[2]; int n_groups = dst->op_params[0]; int n_channels_per_group = (n_channels + n_groups - 1) / n_groups; @@ -20632,7 +20697,7 @@ size_t ggml_quantize_chunk( case GGML_TYPE_BF16: { size_t elemsize = sizeof(ggml_bf16_t); - ggml_fp32_to_bf16_row(src + start, (ggml_bf16_t *)dst + start, n); + ggml_fp32_to_bf16_row_ref(src + start, (ggml_bf16_t *)dst + start, n); result = n * elemsize; } break; case GGML_TYPE_F32: diff --git a/llama/ggml.h b/llama/ggml.h index f5821853..3e87205b 100644 --- a/llama/ggml.h +++ b/llama/ggml.h @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * @@ -375,6 +375,7 @@ extern "C" { GGML_API ggml_bf16_t ggml_fp32_to_bf16(float); GGML_API float ggml_bf16_to_fp32(ggml_bf16_t); // consider just doing << 16 GGML_API void ggml_bf16_to_fp32_row(const ggml_bf16_t *, float *, int64_t); + GGML_API void ggml_fp32_to_bf16_row_ref(const float *, ggml_bf16_t *, int64_t); GGML_API void ggml_fp32_to_bf16_row(const float *, ggml_bf16_t *, int64_t); struct ggml_object; @@ -1165,16 +1166,17 @@ extern "C" { // group normalize along ne0*ne1*n_groups // 
used in stable-diffusion - // TODO: eps is hardcoded to 1e-6 for now GGML_API struct ggml_tensor * ggml_group_norm( struct ggml_context * ctx, struct ggml_tensor * a, - int n_groups); + int n_groups, + float eps); GGML_API struct ggml_tensor * ggml_group_norm_inplace( struct ggml_context * ctx, struct ggml_tensor * a, - int n_groups); + int n_groups, + float eps); // a - x // b - dy @@ -1481,7 +1483,6 @@ extern "C" { // if mode & 2 == 1, GPT-NeoX style // // b is an int32 vector with size a->ne[2], it contains the positions - // c is freq factors (e.g. phi3-128k), (optional) GGML_API struct ggml_tensor * ggml_rope( struct ggml_context * ctx, struct ggml_tensor * a, @@ -1498,6 +1499,7 @@ extern "C" { int mode); // custom RoPE + // c is freq factors (e.g. phi3-128k), (optional) GGML_API struct ggml_tensor * ggml_rope_ext( struct ggml_context * ctx, struct ggml_tensor * a, diff --git a/llama/grammar-parser.cpp b/llama/grammar-parser.cpp index ebfb3198..b20310d0 100644 --- a/llama/grammar-parser.cpp +++ b/llama/grammar-parser.cpp @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/grammar-parser.h b/llama/grammar-parser.h index 9a24cad8..1c4e3aff 100644 --- a/llama/grammar-parser.h +++ b/llama/grammar-parser.h @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/json-schema-to-grammar.cpp b/llama/json-schema-to-grammar.cpp index e78c57ab..df453bac 100644 --- a/llama/json-schema-to-grammar.cpp +++ b/llama/json-schema-to-grammar.cpp @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/json-schema-to-grammar.h b/llama/json-schema-to-grammar.h index d3311b70..0b2fb0d7 100644 --- a/llama/json-schema-to-grammar.h +++ b/llama/json-schema-to-grammar.h @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/llama-grammar.cpp b/llama/llama-grammar.cpp index e5e67c7b..422578cc 100644 --- a/llama/llama-grammar.cpp +++ b/llama/llama-grammar.cpp @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/llama-grammar.h b/llama/llama-grammar.h index 17f6f88a..8d4d342a 100644 --- a/llama/llama-grammar.h +++ b/llama/llama-grammar.h @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/llama-impl.h b/llama/llama-impl.h index 322307c7..f34ff1d8 100644 --- a/llama/llama-impl.h +++ b/llama/llama-impl.h @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/llama-sampling.cpp 
b/llama/llama-sampling.cpp index 935547c2..fdb3f079 100644 --- a/llama/llama-sampling.cpp +++ b/llama/llama-sampling.cpp @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/llama-sampling.h b/llama/llama-sampling.h index 89b8d33a..9c44e9b8 100644 --- a/llama/llama-sampling.h +++ b/llama/llama-sampling.h @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/llama-vocab.cpp b/llama/llama-vocab.cpp index a40a9259..154ab4dc 100644 --- a/llama/llama-vocab.cpp +++ b/llama/llama-vocab.cpp @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * @@ -842,6 +842,9 @@ struct llm_tokenizer_ugm { * the best tokenization. */ void tokenize(const std::string & text, std::vector & output) { + // get current size of output (for reversal later) + size_t output_size = output.size(); + // normalize the input first std::string normalized; normalize(text, &normalized); @@ -921,7 +924,7 @@ struct llm_tokenizer_ugm { } // reverse the output since we added tokens starting from the end of the input - std::reverse(output.begin(), output.end()); + std::reverse(output.begin() + output_size, output.end()); } private: @@ -1470,7 +1473,8 @@ llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, lla bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token) { return token != -1 && ( token == llama_token_eos_impl(vocab) || - token == llama_token_eot_impl(vocab) + token == llama_token_eot_impl(vocab) || + token == llama_token_eom_impl(vocab) ); } @@ -1526,6 +1530,10 @@ llama_token llama_token_eot_impl(const struct llama_vocab & vocab) { return vocab.special_eot_id; } +llama_token llama_token_eom_impl(const struct llama_vocab & vocab) { + return vocab.special_eom_id; +} + int32_t llama_tokenize_impl( const struct llama_vocab & vocab, const char * text, diff --git a/llama/llama-vocab.h b/llama/llama-vocab.h index 84826366..2b7f5d0c 100644 --- a/llama/llama-vocab.h +++ b/llama/llama-vocab.h @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * @@ -71,6 +71,7 @@ struct llama_vocab { id special_suffix_id = -1; id special_middle_id = -1; id special_eot_id = -1; // TODO: move above after "eos_id", and here add "file separator" token + id special_eom_id = -1; // tokenizer flags bool tokenizer_add_space_prefix = false; @@ -127,6 +128,7 @@ llama_token llama_token_prefix_impl(const struct llama_vocab & vocab); llama_token llama_token_middle_impl(const struct llama_vocab & vocab); llama_token llama_token_suffix_impl(const struct llama_vocab & vocab); llama_token llama_token_eot_impl (const struct llama_vocab & vocab); +llama_token llama_token_eom_impl (const struct llama_vocab & vocab); int32_t llama_tokenize_impl( const struct llama_vocab & vocab, diff --git a/llama/llama.cpp b/llama/llama.cpp index b95ed228..0281c556 100644 --- a/llama/llama.cpp +++ b/llama/llama.cpp @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 
6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * @@ -148,17 +148,14 @@ static std::string trim(const std::string & str) { } static void replace_all(std::string & s, const std::string & search, const std::string & replace) { - std::string result; - for (size_t pos = 0; ; pos += search.length()) { - auto new_pos = s.find(search, pos); - if (new_pos == std::string::npos) { - result += s.substr(pos, s.size() - pos); - break; - } - result += s.substr(pos, new_pos - pos) + replace; - pos = new_pos; + if (search.empty()) { + return; // Avoid infinite loop if 'search' is an empty string + } + size_t pos = 0; + while ((pos = s.find(search, pos)) != std::string::npos) { + s.replace(pos, search.length(), replace); + pos += replace.length(); } - s = std::move(result); } static bool is_float_close(float a, float b, float abs_tol) { @@ -388,6 +385,7 @@ enum llm_kv { LLM_KV_TOKENIZER_SUFFIX_ID, LLM_KV_TOKENIZER_MIDDLE_ID, LLM_KV_TOKENIZER_EOT_ID, + LLM_KV_TOKENIZER_EOM_ID, LLM_KV_ADAPTER_TYPE, LLM_KV_ADAPTER_LORA_ALPHA, @@ -485,6 +483,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" }, { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" }, { LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" }, + { LLM_KV_TOKENIZER_EOM_ID, "tokenizer.ggml.eom_token_id" }, { LLM_KV_ADAPTER_TYPE, "adapter.type" }, { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" }, @@ -4995,6 +4994,7 @@ static void llm_load_hparams( hparams.attn_soft_cap = true; switch (hparams.n_layer) { + case 26: model.type = e_model::MODEL_2B; break; case 42: model.type = e_model::MODEL_9B; break; case 46: model.type = e_model::MODEL_27B; break; default: model.type = e_model::MODEL_UNKNOWN; @@ -5603,6 +5603,7 @@ static void llm_load_vocab( { LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_suffix_id }, { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_middle_id }, { LLM_KV_TOKENIZER_EOT_ID, vocab.special_eot_id }, + { LLM_KV_TOKENIZER_EOM_ID, vocab.special_eom_id }, }; for (const auto & it : special_token_types) { @@ -5655,6 +5656,17 @@ static void llm_load_vocab( } } } + + // find EOM token: "<|eom_id|>" + // + // TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOM_ID + // for now, we apply this workaround to find the EOM token based on its text + if (vocab.special_eom_id == -1) { + const auto & t = vocab.token_to_id.find("<|eom_id|>"); + if (t != vocab.token_to_id.end()) { + vocab.special_eom_id = t->second; + } + } } // build special tokens cache @@ -11754,6 +11766,7 @@ struct llm_build_context { // ref: https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e switch (model.type) { + case e_model::MODEL_2B: case e_model::MODEL_9B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break; case e_model::MODEL_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); break; default: GGML_ABORT("fatal error"); diff --git a/llama/llama.h b/llama/llama.h index 469bf75e..c624a688 100644 --- a/llama/llama.h +++ b/llama/llama.h @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/llama_darwin.c b/llama/llama_darwin.c index 7d2d98c6..8164a759 100644 --- a/llama/llama_darwin.c +++ 
b/llama/llama_darwin.c @@ -1,3 +1,29 @@ +/** + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file + * + * MIT License + * + * Copyright (c) 2023-2024 The ggml authors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + /** * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file * diff --git a/llama/llava.cpp b/llama/llava.cpp index d94196ec..d54175b3 100644 --- a/llama/llava.cpp +++ b/llama/llava.cpp @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/llava.h b/llama/llava.h index 61dde037..8d1ed60f 100644 --- a/llama/llava.h +++ b/llama/llava.h @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/log.h b/llama/log.h index 67e92545..e600939c 100644 --- a/llama/log.h +++ b/llama/log.h @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/patches/09-lora.diff b/llama/patches/09-lora.diff index 10c66d1d..21958476 100644 --- a/llama/patches/09-lora.diff +++ b/llama/patches/09-lora.diff @@ -1,40 +1,32 @@ diff --git a/common/common.cpp b/common/common.cpp -index dbb724fb..c26fe6ee 100644 +index 2e8374d5..70d0afde 100644 --- a/common/common.cpp +++ b/common/common.cpp -@@ -2087,14 +2087,27 @@ std::tuple llama_init_from_gpt_par - for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) { - const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]); - float lora_scale = std::get<1>(params.lora_adapter[i]); -+ -+ // try to load as gguf - auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str()); - if (adapter == nullptr) { -- fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); +@@ -2110,9 +2110,21 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) { + loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str()); + if (loaded_la.adapter == nullptr) { + fprintf(stderr, "%s: error: failed to apply lora adapter '%s'\n", __func__, la.path.c_str()); - llama_free(lctx); - llama_free_model(model); -- return 
std::make_tuple(nullptr, nullptr); -+ fprintf(stderr, "%s: error: failed to apply lora adapter, trying ggla\n", __func__); +- return iparams; + + // if that fails, try loading as ggla for compatibility + int err = llama_model_apply_lora_from_file(model, -+ lora_adapter.c_str(), -+ lora_scale, ++ la.path.c_str(), ++ la.scale, + nullptr, + params.n_threads); + if (err != 0) { + fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); + llama_free(lctx); + llama_free_model(model); -+ return std::make_tuple(nullptr, nullptr); ++ return iparams; ++ } else { ++ break; + } -+ } else { -+ llama_lora_adapter_set(lctx, adapter, lora_scale); } -- llama_lora_adapter_set(lctx, adapter, lora_scale); + iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters } - - if (params.ignore_eos) { diff --git a/include/llama.h b/include/llama.h index 93fd77ca..b0fb37a6 100644 --- a/include/llama.h @@ -355,4 +347,4 @@ index 80a0dd0f..9d7b0e17 100644 + return 1; + } +} -\ No newline at end of file +\ No newline at end of file \ No newline at end of file diff --git a/llama/sampling.cpp b/llama/sampling.cpp index 1985ac2f..4c947815 100644 --- a/llama/sampling.cpp +++ b/llama/sampling.cpp @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/sampling.h b/llama/sampling.h index 30b4134f..b1c82a5c 100644 --- a/llama/sampling.h +++ b/llama/sampling.h @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/stb_image.h b/llama/stb_image.h index ed9badad..4f5c44a5 100644 --- a/llama/stb_image.h +++ b/llama/stb_image.h @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/unicode-data.cpp b/llama/unicode-data.cpp index ae01e5c4..7e89db07 100644 --- a/llama/unicode-data.cpp +++ b/llama/unicode-data.cpp @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/unicode-data.h b/llama/unicode-data.h index 3abb9c74..9cd3d5f9 100644 --- a/llama/unicode-data.h +++ b/llama/unicode-data.h @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/unicode.cpp b/llama/unicode.cpp index 774a5210..b7bdf334 100644 --- a/llama/unicode.cpp +++ b/llama/unicode.cpp @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License * diff --git a/llama/unicode.h b/llama/unicode.h index 1850ceeb..2ea5e168 100644 --- a/llama/unicode.h +++ b/llama/unicode.h @@ -1,5 +1,5 @@ /** - * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file + * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file * * MIT License *
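-- 
A note on the ggml_compute_fp32_to_bf16 hunk in llama/ggml-impl.h: with the subnormal flush removed, the scalar path is plain round-to-nearest-even on the top 16 bits, plus quiet NaNs. Below is a minimal self-contained sketch of that rounding behavior; it is illustrative only and not part of the patch (fp32_to_bf16 is a hypothetical stand-in name).

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // fp32 -> bf16 with round-to-nearest-even, quiet NaNs, and no
    // subnormal flush, mirroring the patched scalar path.
    static uint16_t fp32_to_bf16(float s) {
        uint32_t i;
        memcpy(&i, &s, sizeof(i));
        if ((i & 0x7fffffff) > 0x7f800000) {    // NaN
            return (uint16_t)((i >> 16) | 64);  // force quiet bit
        }
        // bias by 0x7fff plus the lsb of the kept bits: ties round to even
        return (uint16_t)((i + (0x7fff + ((i >> 16) & 1))) >> 16);
    }

    int main() {
        // 1 + 2^-8 sits exactly halfway between bf16 0x3f80 and 0x3f81:
        // the tie goes to the even encoding 0x3f80.
        printf("%04x\n", fp32_to_bf16(1.00390625f)); // 3f80
        // 1 + 2^-7 + 2^-8 ties between 0x3f81 and 0x3f82: even wins again.
        printf("%04x\n", fp32_to_bf16(1.01171875f)); // 3f82
        return 0;
    }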
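The group-norm hunks move eps out of a hardcoded 1e-6f and into the tensor's op_params, packed after n_groups; both the CPU path in ggml_compute_forward_group_norm_f32 and the Metal path read it back with memcpy from op_params + 1. A small sketch of that packing scheme, assuming a plain int32 array as a stand-in for op_params:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main() {
        int32_t op_params[16] = {0};  // stand-in for tensor->op_params

        // producer side, as in ggml_group_norm_impl:
        // slot 0 = n_groups, slot 1 = eps bit-copied into the int32 array
        op_params[0] = 32;
        const float eps_in = 1e-6f;
        memcpy(&op_params[1], &eps_in, sizeof(float));

        // consumer side, as in the compute/Metal hunks:
        // memcpy(&eps, dst->op_params + 1, sizeof(float))
        float eps;
        memcpy(&eps, op_params + 1, sizeof(float));
        printf("n_groups=%d eps=%g\n", (int) op_params[0], (double) eps);
        return 0;
    }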
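The svcntb() call sites in llama/ggml-quants.c now compare against ggml_sve_cnt_b, which llama/ggml.c fills once during ggml_init from prctl(PR_SVE_GET_VL), so the check no longer needs SVE instructions at all. A hedged sketch of that query (Linux-only; on kernels without SVE support prctl reports failure):

    #include <cstdio>
    #if defined(__linux__)
    #include <sys/prctl.h>
    #endif

    int main() {
    #if defined(__linux__) && defined(PR_SVE_GET_VL)
        const int ret = prctl(PR_SVE_GET_VL);
        if (ret >= 0) {
            // the low bits carry the vector length in bytes;
            // 16 means 128-bit SVE, matching QK8_0 in the hunks above
            printf("SVE vector length: %d bytes\n", ret & PR_SVE_VL_LEN_MASK);
        } else {
            printf("kernel reports no SVE support\n");
        }
    #else
        printf("PR_SVE_GET_VL is not available on this platform\n");
    #endif
        return 0;
    }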
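The ggml_vec_elu_f32 change swaps expf(x[i]) - 1 for expm1f(x[i]). For x near zero, expf(x) rounds to within one ulp of 1.0f and the subtraction cancels nearly all significant bits, while expm1f computes e^x - 1 directly at full precision. A short demonstration:

    #include <cmath>
    #include <cstdio>

    int main() {
        const float x = 1e-7f;
        // expf(x) rounds to 1.0f or one ulp above it, so the subtraction
        // yields either 0 or a single coarse ulp step...
        printf("expf(x) - 1 = %.9g\n", (double)(expf(x) - 1.0f));
        // ...while expm1f returns a value close to 1.00000005e-07.
        printf("expm1f(x)   = %.9g\n", (double) expm1f(x));
        return 0;
    }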
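The rewritten replace_all in llama/llama.cpp works in place and guards against an empty search string, which would otherwise match at every position and never let the loop terminate. Advancing pos past the inserted text also keeps a replacement that contains the needle from being re-matched. A self-contained sketch with the same shape:

    #include <cassert>
    #include <string>

    static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
        if (search.empty()) {
            return; // an empty needle matches everywhere and would never terminate
        }
        size_t pos = 0;
        while ((pos = s.find(search, pos)) != std::string::npos) {
            s.replace(pos, search.length(), replace);
            pos += replace.length(); // resume after the replacement
        }
    }

    int main() {
        std::string s = "a.b.c";
        replace_all(s, ".", "::");
        assert(s == "a::b::c");

        std::string t = "aaa";
        replace_all(t, "a", "aa"); // self-embedding replacement still terminates
        assert(t == "aaaaaa");
        return 0;
    }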