Update sync with latest llama.cpp layout, and run against b3485
parent 6c0d892498
commit e9dd656ff5
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
+ * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
  *
  * MIT License
  *
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
+ * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
  *
  * MIT License
  *
@@ -42,6 +42,10 @@
 #include "ggml-metal.h"
 #endif

+#ifdef GGML_USE_CANN
+#include "ggml-cann.h"
+#endif
+
 #define STB_IMAGE_IMPLEMENTATION
 #include "stb_image.h"

@@ -891,7 +895,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         embeddings = peg_0;
     }
     else {
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
 }

@@ -1027,6 +1031,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     LOG_TEE("%s: CLIP using Metal backend\n", __func__);
 #endif

+#ifdef GGML_USE_CANN
+    new_clip->backend = ggml_backend_cann_init(0);
+    LOG_TEE("%s: CLIP using CANN backend\n", __func__);
+#endif
+

     if (!new_clip->backend) {
         new_clip->backend = ggml_backend_cpu_init();
@@ -1147,20 +1156,20 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            }
            if (n < 32)
                hparams.image_grid_pinpoints[n] = 0;
-        } catch (std::runtime_error & e) {
+        } catch (std::runtime_error & /*e*/) {
            hparams.image_grid_pinpoints[0]=0;
        }

        try {
            int idx = get_key_idx(ctx, KEY_MM_PATCH_MERGE_TYPE);
            strcpy(hparams.mm_patch_merge_type, gguf_get_val_str(ctx, idx));
-        } catch (std::runtime_error & e) {
+        } catch (std::runtime_error & /*e*/) {
            strcpy(hparams.mm_patch_merge_type, "flat");
        }

        try {
            hparams.image_crop_resolution = get_u32(ctx, KEY_IMAGE_CROP_RESOLUTION); // llava-1.6
-        } catch(const std::exception& e) {
+        } catch(const std::exception& /*e*/) {
            hparams.image_crop_resolution = hparams.image_size;
        }

@@ -1199,7 +1208,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        try {
            vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
            new_clip->has_class_embedding = true;
-        } catch (const std::exception& e) {
+        } catch (const std::exception& /*e*/) {
            new_clip->has_class_embedding = false;
        }

@@ -1207,7 +1216,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
            vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
            new_clip->has_pre_norm = true;
-        } catch (std::exception & e) {
+        } catch (std::exception & /*e*/) {
            new_clip->has_pre_norm = false;
        }

@@ -1215,21 +1224,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            vision_model.post_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "weight"));
            vision_model.post_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "bias"));
            new_clip->has_post_norm = true;
-        } catch (std::exception & e) {
+        } catch (std::exception & /*e*/) {
            new_clip->has_post_norm = false;
        }

        try {
            vision_model.patch_bias = get_tensor(new_clip->ctx_data, TN_PATCH_BIAS);
            new_clip->has_patch_bias = true;
-        } catch (std::exception & e) {
+        } catch (std::exception & /*e*/) {
            new_clip->has_patch_bias = false;
        }

        try {
            vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
            vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
-        } catch(const std::exception& e) {
+        } catch(const std::exception& /*e*/) {
            LOG_TEE("%s: failed to load vision model tensors\n", __func__);
        }

@@ -1241,26 +1250,26 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
                // Yi-type llava
                vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "weight"));
                vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "bias"));
-            } catch (std::runtime_error & e) { }
+            } catch (std::runtime_error & /*e*/) { }
            try {
                // missing in Yi-type llava
                vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
                vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
-            } catch (std::runtime_error & e) { }
+            } catch (std::runtime_error & /*e*/) { }
            try {
                // Yi-type llava
                vision_model.mm_3_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "weight"));
                vision_model.mm_3_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "bias"));
-            } catch (std::runtime_error & e) { }
+            } catch (std::runtime_error & /*e*/) { }
            try {
                // Yi-type llava
                vision_model.mm_4_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "weight"));
                vision_model.mm_4_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "bias"));
-            } catch (std::runtime_error & e) { }
+            } catch (std::runtime_error & /*e*/) { }
            try {
                vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE);
                // LOG_TEE("%s: image_newline tensor (llava-1.6) found\n", __func__);
-            } catch (std::runtime_error & e) { }
+            } catch (std::runtime_error & /*e*/) { }
        } else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
            // MobileVLM projection
            vision_model.mm_model_mlp_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "weight"));
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
+ * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
  *
  * MIT License
  *
llama/common.cpp: 1244 lines changed (diff suppressed because it is too large)
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
+ * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
  *
  * MIT License
  *
@@ -78,6 +78,12 @@ int32_t cpu_get_num_math();
 // CLI argument parsing
 //

+// dimensionality reduction methods, used by cvector-generator
+enum dimre_method {
+    DIMRE_METHOD_PCA,
+    DIMRE_METHOD_MEAN,
+};
+
 struct gpt_params {
     uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed

@@ -99,7 +105,6 @@ struct gpt_params {
     int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
     int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
     float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
-    int32_t n_beams = 0; // if non-zero then use beam search of given width.
     int32_t grp_attn_n = 1; // group-attention factor
     int32_t grp_attn_w = 512; // group-attention width
     int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
@@ -120,6 +125,7 @@ struct gpt_params {
     enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
     enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
+    enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings

     // // sampling parameters
     struct llama_sampling_params sparams;
@@ -128,6 +134,7 @@ struct gpt_params {
     std::string model_draft = ""; // draft model for speculative decoding
     std::string model_alias = "unknown"; // model alias
     std::string model_url = ""; // model url to download
+    std::string hf_token = ""; // HF token
     std::string hf_repo = ""; // HF repo
     std::string hf_file = ""; // HF file
     std::string prompt = "";
@@ -147,7 +154,6 @@ struct gpt_params {

     // TODO: avoid tuple, use struct
     std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
-    std::string lora_base = ""; // base model path for the lora adapter

     std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale

@@ -179,7 +185,6 @@ struct gpt_params {
     bool prompt_cache_all = false; // save user input and generations to prompt cache
     bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it

-    bool embedding = false; // get only sentence embedding
     bool escape = true; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
     bool multiline_input = false; // reverse the usage of `\`
     bool simple_io = false; // improves compatibility with subprocesses and limited consoles
@@ -206,6 +211,12 @@ struct gpt_params {
     std::string mmproj = ""; // path to multimodal projector
     std::vector<std::string> image; // path to image file(s)

+    // embedding
+    bool embedding = false; // get only sentence embedding
+    int32_t embd_normalize = 2; // normalisation for embendings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
+    std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
+    std::string embd_sep = "\n"; // separator of embendings
+
     // server params
     int32_t port = 8080; // server listens on this network port
     int32_t timeout_read = 600; // http read timeout in seconds
@@ -216,6 +227,7 @@ struct gpt_params {
     std::string public_path = "";
     std::string chat_template = "";
     std::string system_prompt = "";
+    bool enable_chat_template = true;

     std::vector<std::string> api_keys;

@@ -229,6 +241,8 @@ struct gpt_params {

     std::string slot_save_path;

+    float slot_prompt_similarity = 0.5f;
+
     // batched-bench params
     bool is_pp_shared = false;

@@ -256,8 +270,21 @@ struct gpt_params {

     bool process_output = false; // collect data for the output tensor
     bool compute_ppl = true; // whether to compute perplexity
+
+    // cvector-generator params
+    int n_pca_batch = 100;
+    int n_pca_iterations = 1000;
+    dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
+    std::string cvector_outfile = "control_vector.gguf";
+    std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
+    std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
+
+    bool spm_infill = false; // suffix/prefix/middle pattern for infill
+
+    std::string lora_outfile = "ggml-lora-merged-f16.gguf";
 };

+void gpt_params_handle_hf_token(gpt_params & params);
 void gpt_params_handle_model_default(gpt_params & params);

 bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params);
@@ -301,6 +328,7 @@ bool fs_validate_filename(const std::string & filename);
 bool fs_create_directory_with_parents(const std::string & path);

 std::string fs_get_cache_directory();
+std::string fs_get_cache_file(const std::string & filename);

 //
 // Model utils
@@ -312,8 +340,8 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
 struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
 struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);

-struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const struct llama_model_params & params);
-struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const struct llama_model_params & params);
+struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
+struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);

 // Batch utils

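For reference, a minimal sketch (not part of the diff) of how the new hf_token parameter to llama_load_model_from_hf might be used; the repo, file, path and token values below are placeholders, not anything this change defines:

    #include "common.h"
    #include "llama.h"

    // hypothetical helper: download a GGUF from Hugging Face using an access token
    static struct llama_model * load_from_hf_example() {
        struct llama_model_params mparams = llama_model_default_params();
        return llama_load_model_from_hf(
            "some-org/some-model-GGUF",  // repo (placeholder)
            "model-q4_k_m.gguf",         // file inside the repo (placeholder)
            "/tmp/model.gguf",           // local path for the download (placeholder)
            "hf_xxxxxxxxxxxx",           // hf_token: the parameter added by this sync (placeholder)
            mparams);
    }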
@@ -351,21 +379,13 @@ std::string llama_token_to_piece(
         llama_token token,
         bool special = true);

-// TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function
-// that takes into account the tokenizer type and decides how to handle the leading space
-//
 // detokenizes a vector of tokens into a string
 // should work similar to Python's `tokenizer.decode`
-// removes the leading space from the first non-BOS token
-std::string llama_detokenize_spm(
+// optionally renders special/control tokens
+std::string llama_detokenize(
         llama_context * ctx,
-        const std::vector<llama_token> & tokens);
-
-// detokenizes a vector of tokens into a string
-// should work similar to Python's `tokenizer.decode`
-std::string llama_detokenize_bpe(
-        llama_context * ctx,
-        const std::vector<llama_token> & tokens);
+        const std::vector<llama_token> & tokens,
+        bool special = true);

 // Uses the value from the model metadata if possible, otherwise
 // defaults to true when model type is SPM, otherwise false.
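A small sketch (not part of the diff) of the consolidated llama_detokenize; it assumes the llama_tokenize helper that lives in the same header:

    #include <string>
    #include <vector>
    #include "common.h"

    // hypothetical round trip through the single llama_detokenize entry point
    static std::string roundtrip(llama_context * ctx, const std::string & text) {
        std::vector<llama_token> tokens = llama_tokenize(ctx, text, /*add_special=*/true);
        // special = true also renders special/control tokens (e.g. BOS) back into the string
        return llama_detokenize(ctx, tokens, /*special=*/true);
    }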
@@ -375,9 +395,34 @@ bool llama_should_add_bos_token(const llama_model * model);
 // Chat template utils
 //

+// same with llama_chat_message, but uses std::string
+struct llama_chat_msg {
+    std::string role;
+    std::string content;
+};
+
 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
 bool llama_chat_verify_template(const std::string & tmpl);

+// CPP wrapper for llama_chat_apply_template
+// If the built-in template is not supported, we default to chatml
+// If the custom "tmpl" is not supported, we throw an error
+std::string llama_chat_apply_template(const struct llama_model * model,
+        const std::string & tmpl,
+        const std::vector<llama_chat_msg> & chat,
+        bool add_ass);
+
+// Format single message, while taking into account the position of that message in chat history
+std::string llama_chat_format_single(const struct llama_model * model,
+        const std::string & tmpl,
+        const std::vector<llama_chat_msg> & past_msg,
+        const llama_chat_msg & new_msg,
+        bool add_ass);
+
+// Returns an example of formatted chat
+std::string llama_chat_format_example(const struct llama_model * model,
+        const std::string & tmpl);
+
 //
 // KV cache utils
 //
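A usage sketch (not part of the diff) for the new chat-template helpers; it assumes, as in upstream llama.cpp, that an empty tmpl string selects the model's built-in template:

    #include <string>
    #include <vector>
    #include "common.h"

    // hypothetical example: render a whole conversation with the llama_chat_msg wrappers
    static std::string build_prompt(const struct llama_model * model) {
        std::vector<llama_chat_msg> chat = {
            { "system", "You are a helpful assistant." },
            { "user",   "Hello!" },
        };
        // format the full history and open the assistant turn at the end
        return llama_chat_apply_template(model, "", chat, /*add_ass=*/true);
    }

    // hypothetical example: format only the newest message against existing history
    static std::string format_next(const struct llama_model * model,
                                   const std::vector<llama_chat_msg> & history,
                                   const llama_chat_msg & new_msg) {
        return llama_chat_format_single(model, "", history, new_msg, /*add_ass=*/true);
    }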
@@ -392,7 +437,7 @@ void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_siz
 // Embedding utils
 //

-void llama_embd_normalize(const float * inp, float * out, int n);
+void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);

 float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);

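The extra embd_norm argument mirrors the embd_normalize field added to gpt_params above (-1 = none, 0 = max absolute int16, 1 = taxicab, 2 = euclidean, >2 = p-norm). A tiny sketch, not part of the diff:

    #include <vector>
    #include "common.h"

    static void normalize_example() {
        std::vector<float> embd = { 3.0f, 4.0f };
        std::vector<float> out(embd.size());
        // euclidean (L2) normalisation, the default: yields {0.6, 0.8}
        llama_embd_normalize(embd.data(), out.data(), (int) embd.size(), 2);
    }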
@@ -436,4 +481,3 @@ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const cha
 void yaml_dump_non_result_info(
     FILE * stream, const gpt_params & params, const llama_context * lctx,
     const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
-
llama/ggml-aarch64.c: 2219 lines (new file; diff suppressed because it is too large)
llama/ggml-aarch64.h: 65 lines (new file)
@@ -0,0 +1,65 @@
+/**
+ * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023-2024 The ggml authors
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+// SPDX-FileCopyrightText: Copyright 2024 Arm Ltd.
+#pragma once
+
+#define GGML_COMMON_DECL_C
+#include "ggml-common.h"
+
+#include "ggml.h"
+
+// GGML internal header
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Quantization
+void quantize_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+
+void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nrows, int64_t n_per_row, int64_t blck_size_interleave);
+
+// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
+size_t quantize_q4_0_4x4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_q4_0_4x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_q4_0_8x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+
+// GEMV
+void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+
+// GEMM
+void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+
+#ifdef __cplusplus
+}
+#endif
+
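A hedged sketch (not part of the diff) of calling one of the new interleaved quantization entry points declared above; the include path and the over-sized destination buffer are assumptions for illustration only:

    #include <cstdint>
    #include <vector>
    #include "ggml-aarch64.h"

    // hypothetical example: pack a 4 x 64 f32 matrix into the q4_0 4x4 interleaved layout
    static void quantize_example() {
        const int64_t nrows = 4, n_per_row = 64;          // sizes chosen as multiples of the 4-row / 32-wide blocks
        std::vector<float>   src(nrows * n_per_row, 0.5f);
        std::vector<uint8_t> dst(nrows * n_per_row);      // deliberately over-allocated scratch
        // the importance matrix is optional in this sketch and passed as nullptr
        size_t packed_bytes = quantize_q4_0_4x4(src.data(), dst.data(), nrows, n_per_row, nullptr);
        (void) packed_bytes;                              // the actual packed size is the return value
    }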
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
+ * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
  *
  * MIT License
  *
@@ -117,8 +117,7 @@ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tenso
     if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
         fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
                 __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
-        GGML_ASSERT(!"not enough space in the buffer");
-        return;
+        GGML_ABORT("not enough space in the buffer");
     }

     void * addr = (char *)ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset;
@@ -159,7 +158,7 @@ static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset,
             return;
         }
     }
-    GGML_ASSERT(!"out of allocated_tensors");
+    GGML_ABORT("out of allocated_tensors");
 }
 static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
     for (int i = 0; i < 1024; i++) {
@@ -168,8 +167,7 @@ static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offs
             return;
         }
     }
-    fprintf(stderr, "tried to free tensor %s not found\n", tensor->name);
-    GGML_ASSERT(!"tensor not found");
+    GGML_ABORT("tried to free tensor %s not found\n", tensor->name);
 }
 #endif

@@ -202,8 +200,7 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
             // this should never happen
             fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
                     __func__, size, max_avail);
-            GGML_ASSERT(!"not enough space in the buffer");
-            GGML_UNREACHABLE();
+            GGML_ABORT("not enough space in the buffer");
         }
     }

@@ -365,6 +362,7 @@ struct hash_node {
 };

 struct tensor_alloc {
+    int buffer_id;
     size_t offset;
     size_t size_max; // 0 = pre-allocated, unused, or view
 };
@@ -375,7 +373,6 @@ struct leaf_alloc {
 };

 struct node_alloc {
-    int buffer_id;
     struct tensor_alloc dst;
     struct tensor_alloc src[GGML_MAX_SRC];
 };
@@ -412,8 +409,19 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
     for (int i = 0; i < n_bufs; i++) {
         galloc->bufts[i] = bufts[i];
         galloc->buffers[i] = NULL;
-        size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
-        galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment);
+
+        // check if the same buffer type is used multiple times and reuse the same allocator
+        for (int j = 0; j < i; j++) {
+            if (bufts[i] == bufts[j]) {
+                galloc->buf_tallocs[i] = galloc->buf_tallocs[j];
+                break;
+            }
+        }
+
+        if (galloc->buf_tallocs[i] == NULL) {
+            size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
+            galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment);
+        }
     }
     galloc->n_buffers = n_bufs;

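The allocator-reuse logic above means that passing the same buffer type more than once to ggml_gallocr_new_n now shares a single dynamic allocator (and, below, a single buffer). A minimal sketch, not part of the diff:

    #include "ggml-alloc.h"
    #include "ggml-backend.h"

    static void gallocr_reuse_example() {
        // two slots that intentionally use the same (CPU) buffer type
        ggml_backend_buffer_type_t bufts[2] = {
            ggml_backend_cpu_buffer_type(),
            ggml_backend_cpu_buffer_type(),
        };
        ggml_gallocr_t galloc = ggml_gallocr_new_n(bufts, 2);
        // ... reserve / allocate graphs as usual; slot 1 reuses slot 0's allocator and buffer,
        // and ggml_gallocr_get_buffer_size(galloc, 1) reports 0 to avoid double counting
        ggml_gallocr_free(galloc);
    }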
@@ -431,14 +439,34 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {

     for (int i = 0; i < galloc->n_buffers; i++) {
         if (galloc->buffers != NULL) {
-            ggml_backend_buffer_free(galloc->buffers[i]);
+            // skip if already freed
+            bool freed = false;
+            for (int j = 0; j < i; j++) {
+                if (galloc->buffers[j] == galloc->buffers[i]) {
+                    freed = true;
+                    break;
+                }
+            }
+            if (!freed) {
+                ggml_backend_buffer_free(galloc->buffers[i]);
+            }
         }
         if (galloc->buf_tallocs != NULL) {
-            ggml_dyn_tallocr_free(galloc->buf_tallocs[i]);
+            // skip if already freed
+            bool freed = false;
+            for (int j = 0; j < i; j++) {
+                if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) {
+                    freed = true;
+                    break;
+                }
+            }
+            if (!freed) {
+                ggml_dyn_tallocr_free(galloc->buf_tallocs[i]);
+            }
         }
     }

-    free(galloc->hash_set.keys);
+    ggml_hash_set_free(&galloc->hash_set);
     free(galloc->hash_values);
     free(galloc->bufts);
     free(galloc->buffers);
@@ -451,7 +479,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
 typedef struct ggml_gallocr * ggml_gallocr_t;

 static struct hash_node * ggml_gallocr_hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) {
-    size_t i = ggml_hash_find_or_insert(galloc->hash_set, t);
+    size_t i = ggml_hash_find_or_insert(&galloc->hash_set, t);
     return &galloc->hash_values[i];
 }

@@ -537,17 +565,18 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
         }
     }

-static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
+static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
     // graph outputs are never freed
     if (node->flags & GGML_TENSOR_FLAG_OUTPUT) {
         AT_PRINTF("not freeing output %s\n", node->name);
         return;
     }

-    struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
-    ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
     struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
     size_t offset = hn->offset;
+    int buffer_id = hn->buffer_id;
+    struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
+    ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
     size_t size = ggml_backend_buft_get_alloc_size(buft, node);
     ggml_dyn_tallocr_free_tensor(alloc, offset, size, node);
     hn->allocated = false;
@@ -559,8 +588,8 @@ static int get_node_buffer_id(const int * node_buffer_ids, int i) {

 static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
     // clear hash tables
-    memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
-    memset(galloc->hash_values, 0, galloc->hash_set.size * sizeof(struct hash_node));
+    ggml_hash_set_reset(&galloc->hash_set);
+    memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size);

     // allocate leafs
     // these may be tensors that the application is not using in the graph, but may still want to allocate for other purposes
@@ -652,11 +681,11 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
                     AT_PRINTF("view_src %s: %d children, %d views\n",
                         view_src->name, view_src_hn->n_children, view_src_hn->n_views);
                     if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src_hn->allocated) {
-                        ggml_gallocr_free_node(galloc, view_src, buffer_id);
+                        ggml_gallocr_free_node(galloc, view_src);
                     }
                 }
                 else if (p_hn->allocated) {
-                    ggml_gallocr_free_node(galloc, parent, buffer_id);
+                    ggml_gallocr_free_node(galloc, parent);
                 }
             }
             AT_PRINTF("\n");
@@ -665,21 +694,19 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
 }

 bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
-    size_t hash_size = graph->visited_hash_table.size;
+    size_t min_hash_size = graph->n_nodes + graph->n_leafs;
+    // add 25% margin to avoid hash collisions
+    min_hash_size += min_hash_size / 4;

     // initialize hash table
-    if (galloc->hash_set.size < hash_size) {
-        free(galloc->hash_set.keys);
-        free(galloc->hash_values);
-        galloc->hash_set.size = hash_size;
-        galloc->hash_set.keys = calloc(hash_size, sizeof(struct ggml_tensor *));
-        galloc->hash_values = calloc(hash_size, sizeof(struct hash_node));
+    if (galloc->hash_set.size < min_hash_size) {
+        ggml_hash_set_free(&galloc->hash_set);
+        galloc->hash_set = ggml_hash_set_new(min_hash_size);
         GGML_ASSERT(galloc->hash_set.keys != NULL);
+
+        free(galloc->hash_values);
+        galloc->hash_values = malloc(sizeof(struct hash_node) * galloc->hash_set.size);
         GGML_ASSERT(galloc->hash_values != NULL);
-    } else {
-        // reset hash table
-        memset(galloc->hash_set.keys, 0, sizeof(struct ggml_tensor *) * galloc->hash_set.size);
-        memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size);
     }

     // reset allocators
@@ -700,22 +727,25 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
         struct node_alloc * node_alloc = &galloc->node_allocs[i];
-        node_alloc->buffer_id = get_node_buffer_id(node_buffer_ids, i);
         if (node->view_src || node->data) {
+            node_alloc->dst.buffer_id = -1;
             node_alloc->dst.offset = SIZE_MAX;
             node_alloc->dst.size_max = 0;
         } else {
             struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
-            node_alloc->dst.offset = hn->offset;
-            node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
+            node_alloc->dst.buffer_id = hn->buffer_id;
+            node_alloc->dst.offset = hn->offset;
+            node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
         }
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (!src || src->view_src || src->data) {
+                node_alloc->src[j].buffer_id = -1;
                 node_alloc->src[j].offset = SIZE_MAX;
                 node_alloc->src[j].size_max = 0;
             } else {
                 struct hash_node * hn = ggml_gallocr_hash_get(galloc, src);
+                node_alloc->src[j].buffer_id = hn->buffer_id;
                 node_alloc->src[j].offset = hn->offset;
                 node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src);
             }
@@ -732,9 +762,11 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
         struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
         galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
         if (leaf->view_src || leaf->data) {
+            galloc->leaf_allocs[i].leaf.buffer_id = -1;
             galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
             galloc->leaf_allocs[i].leaf.size_max = 0;
         } else {
+            galloc->leaf_allocs[i].leaf.buffer_id = hn->buffer_id;
             galloc->leaf_allocs[i].leaf.offset = hn->offset;
             galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
         }
@@ -742,6 +774,14 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c

     // reallocate buffers if needed
     for (int i = 0; i < galloc->n_buffers; i++) {
+        // if the buffer type is used multiple times, we reuse the same buffer
+        for (int j = 0; j < i; j++) {
+            if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) {
+                galloc->buffers[i] = galloc->buffers[j];
+                break;
+            }
+        }
+
         size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
         size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);

@@ -750,12 +790,14 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
 #ifndef NDEBUG
             fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
 #endif
+
             ggml_backend_buffer_free(galloc->buffers[i]);
             galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
             if (galloc->buffers[i] == NULL) {
                 fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
                 return false;
             }
+            ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
         }
     }

@@ -766,7 +808,8 @@ bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
     return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
 }

-static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, int buffer_id, struct tensor_alloc * tensor_alloc) {
+static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, struct tensor_alloc * tensor_alloc) {
+    int buffer_id = tensor_alloc->buffer_id;
     assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);

     if (tensor->view_src != NULL) {
@@ -794,9 +837,8 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
     }
 }

-static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * nalloc, struct tensor_alloc * talloc) {
-    ggml_backend_buffer_type_t buft = galloc->bufts[nalloc->buffer_id];
-    size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node);
+static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
+    size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
     return talloc->size_max >= node_size;
 }

@@ -819,7 +861,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
         struct ggml_tensor * node = graph->nodes[i];
         struct node_alloc * node_alloc = &galloc->node_allocs[i];

-        if (!ggml_gallocr_node_needs_realloc(galloc, node, node_alloc, &node_alloc->dst)) {
+        if (!ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) {
 #ifndef NDEBUG
             fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name);
 #endif
@@ -831,7 +873,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
             if (src == NULL) {
                 continue;
             }
-            if (!ggml_gallocr_node_needs_realloc(galloc, src, node_alloc, &node_alloc->src[j])) {
+            if (!ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) {
 #ifndef NDEBUG
                 fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
 #endif
@@ -872,7 +914,7 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
     for (int i = 0; i < graph->n_leafs; i++) {
         struct ggml_tensor * leaf = graph->leafs[i];
         struct leaf_alloc * leaf_alloc = &galloc->leaf_allocs[i];
-        ggml_gallocr_init_tensor(galloc, leaf, leaf_alloc->buffer_id, &leaf_alloc->leaf);
+        ggml_gallocr_init_tensor(galloc, leaf, &leaf_alloc->leaf);
     }
     // nodes
     for (int i = 0; i < graph->n_nodes; i++) {
@@ -883,9 +925,9 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
             if (src == NULL) {
                 continue;
             }
-            ggml_gallocr_init_tensor(galloc, src, node_alloc->buffer_id, &node_alloc->src[j]);
+            ggml_gallocr_init_tensor(galloc, src, &node_alloc->src[j]);
         }
-        ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst);
+        ggml_gallocr_init_tensor(galloc, node, &node_alloc->dst);
     }

     return true;
@@ -897,6 +939,15 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
     if (galloc->buffers[buffer_id] == NULL) {
         return 0;
     }
+
+    for (int i = 0; i < buffer_id; i++) {
+        if (galloc->buffers[i] == galloc->buffers[buffer_id]) {
+            // this buffer is the same as a previous one due to the same buffer type being used multiple times
+            // only return the buffer size the first time it appears to avoid double counting
+            return 0;
+        }
+    }
+
     return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
 }

@@ -912,7 +963,7 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
         fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
 #endif
         for (size_t i = 0; i < *n_buffers; i++) {
-            ggml_backend_buffer_free(*buffers[i]);
+            ggml_backend_buffer_free((*buffers)[i]);
         }
         free(*buffers);
         return false;
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
+ * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
  *
  * MIT License
  *
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
+ * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
  *
  * MIT License
  *
@@ -43,13 +43,15 @@ extern "C" {

     struct ggml_backend_buffer_type_i {
         const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft);
+        // allocate a buffer of this type
         ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
-        size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
-        size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft); // allocation max size
-        size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
-        bool (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
+        // tensor alignment
+        size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft);
+        // max buffer size that can be allocated
+        size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft);
+        // data size needed to allocate the tensor, including padding
+        size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
         // check if tensor data is in host memory
-        // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
         bool (*GGML_CALL is_host) (ggml_backend_buffer_type_t buft);
     };

@@ -118,27 +120,37 @@ extern "C" {
         void (*GGML_CALL synchronize)(ggml_backend_t backend);

         // compute graph with a plan (not used currently)
+        // create a new plan for a graph
         ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
         void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+        // update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
+        void (*GGML_CALL graph_plan_update) (ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph);
+        // compute the graph with the plan
+        enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);

-        // compute graph with a plan
-        enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
         // compute graph without a plan (async)
         enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);

-        // check if the backend supports an operation
+        // check if the backend can compute an operation
         bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);

+        // check if the backend can use tensors allocated in a buffer type
+        bool (*GGML_CALL supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
+
         // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
         // these should be expensive operations with large batch sizes that may benefit from running on this backend
        // even if the weight has to be copied from the CPU temporarily
         bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);

         // (optional) event synchronization
+        // create a new event that can record events on this backend instance
         ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend);
         void (*GGML_CALL event_free) (ggml_backend_event_t event);
+        // record an event on the backend instance that created it
         void (*GGML_CALL event_record) (ggml_backend_event_t event);
+        // wait for an event on on a different backend instance
         void (*GGML_CALL event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
+        // block until an event is recorded
         void (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
     };

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
+ * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
  *
  * MIT License
  *
@@ -70,10 +70,6 @@ GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buf
     return ggml_nbytes(tensor);
 }

-bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    return buft->iface.supports_backend(buft, backend);
-}
-
 bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
     if (buft->iface.is_host) {
         return buft->iface.is_host(buft);
@@ -169,6 +165,10 @@ void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backe
     }
 }

+enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage(ggml_backend_buffer_t buffer) {
+    return buffer->usage;
+}
+
 ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
     return buffer->buft;
 }
@@ -317,6 +317,10 @@ bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor *
     return backend->iface.supports_op(backend, op);
 }

+bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+    return backend->iface.supports_buft(backend, buft);
+}
+
 bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
     if (backend->iface.offload_op != NULL) {
         return backend->iface.offload_op(backend, op);
@@ -425,7 +429,7 @@ void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event)

 // backend registry

-#define GGML_REG_MAX_BACKENDS 16
+#define GGML_REG_MAX_BACKENDS 64

 struct ggml_backend_reg {
     char name[128];
@@ -476,6 +480,11 @@ GGML_CALL static void ggml_backend_registry_init(void) {
     extern GGML_CALL void ggml_backend_kompute_reg_devices(void);
     ggml_backend_kompute_reg_devices();
 #endif
+
+#ifdef GGML_USE_CANN
+    extern GGML_CALL int ggml_backend_cann_reg_devices(void);
+    ggml_backend_cann_reg_devices();
+#endif
 }

 GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
@@ -670,12 +679,6 @@ GGML_CALL static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_
     GGML_UNUSED(buft);
 }

-GGML_CALL static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    return ggml_backend_is_cpu(backend);
-
-    GGML_UNUSED(buft);
-}
-
 GGML_CALL static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
     return true;

@@ -690,7 +693,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
             /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
             /* .get_max_size = */ NULL, // defaults to SIZE_MAX
             /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
-            /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
             /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
         },
         /* .context = */ NULL,
@@ -746,7 +748,6 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
             /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
             /* .get_max_size = */ NULL, // defaults to SIZE_MAX
             /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
-            /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
             /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
         },
         /* .context = */ NULL,
@@ -867,6 +868,12 @@ GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const
     GGML_UNUSED(backend);
 }

+GGML_CALL static bool ggml_backend_cpu_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+    return ggml_backend_buft_is_host(buft);
+
+    GGML_UNUSED(backend);
+}
+
 static struct ggml_backend_i cpu_backend_i = {
     /* .get_name = */ ggml_backend_cpu_name,
     /* .free = */ ggml_backend_cpu_free,
@@ -877,9 +884,11 @@ static struct ggml_backend_i cpu_backend_i = {
     /* .synchronize = */ NULL,
     /* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
     /* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free,
+    /* .graph_plan_update = */ NULL,
     /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
     /* .graph_compute = */ ggml_backend_cpu_graph_compute,
     /* .supports_op = */ ggml_backend_cpu_supports_op,
+    /* .supports_buft = */ ggml_backend_cpu_supports_buft,
     /* .offload_op = */ NULL,
     /* .event_new = */ NULL,
     /* .event_free = */ NULL,
@@ -1077,17 +1086,19 @@ struct ggml_backend_sched {
     ggml_backend_buffer_type_t bufts[GGML_SCHED_MAX_BACKENDS];
     ggml_gallocr_t galloc;

-    // hash keys of the nodes in the graph
-    struct ggml_hash_set hash_set;
-    // hash values
-    int * tensor_backend_id;
-    struct ggml_tensor * (* tensor_copies)[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
+    // hash map of the nodes in the graph
+    struct ggml_hash_set hash_set;
+    int * hv_tensor_backend_ids; // [hash_set.size]
+    struct ggml_tensor ** hv_tensor_copies; // [hash_set.size][n_backends][n_copies]

     int * node_backend_ids; // [graph_size]
     int * leaf_backend_ids; // [graph_size]

     int * prev_node_backend_ids; // [graph_size]
     int * prev_leaf_backend_ids; // [graph_size]

     // copy of the graph with modified inputs
-    struct ggml_cgraph * graph;
+    struct ggml_cgraph graph;

     // graph splits
     struct ggml_backend_sched_split * splits;
@@ -1106,17 +1117,16 @@ struct ggml_backend_sched {
     ggml_backend_sched_eval_callback callback_eval;
     void * callback_eval_user_data;

-    // align context_buffer to GGML_MEM_ALIGN
-#ifdef _MSC_VER
-    __declspec(align(GGML_MEM_ALIGN))
-#else
-    __attribute__((aligned(GGML_MEM_ALIGN)))
-#endif
-    char context_buffer[GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
+    char * context_buffer;
+    size_t context_buffer_size;

     bool debug;
 };

-#define hash_id(tensor) ggml_hash_find_or_insert(sched->hash_set, tensor)
-#define tensor_backend_id(tensor) sched->tensor_backend_id[hash_id(tensor)]
+#define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
+#define tensor_backend_id(tensor) sched->hv_tensor_backend_ids[hash_id(tensor)]
+#define tensor_id_copy(id, backend_id, copy_id) sched->hv_tensor_copies[(id) * sched->n_backends * sched->n_copies + (backend_id) * sched->n_copies + (copy_id)]
+#define tensor_copy(tensor, backend_id, copy_id) tensor_id_copy(hash_id(tensor), backend_id, copy_id)

 // returns the priority of the backend, lower id is higher priority
 static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) {
@@ -1128,22 +1138,24 @@ static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backen
     return -1;
 }

-static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor) {
+static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor, const struct ggml_tensor * op) {
     ggml_backend_buffer_t buffer = tensor->buffer;
     if (buffer == NULL) {
         return -1;
     }

-    // find highest prio backend that supports the buffer type
+    // find highest prio backend that supports the buffer type and the op
     for (int i = 0; i < sched->n_backends; i++) {
-        if (ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) {
+        if (ggml_backend_supports_buft(sched->backends[i], buffer->buft) &&
+            ggml_backend_supports_op(sched->backends[i], op)) {
             return i;
         }
     }

-    fprintf(stderr, "%s: error: no backend supports buffer type %s used in tensor %s\n",
-        __func__, ggml_backend_buffer_name(buffer), tensor->name);
-    GGML_ASSERT(false);
+#ifndef NDEBUG
+    fprintf(stderr, "%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
+        __func__, ggml_op_desc(tensor), ggml_backend_buffer_name(buffer), tensor->name);
+#endif

     return -1;
 }
@@ -1162,7 +1174,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
     // TODO: use supports_op to check if the backend supports the op

     // assign pre-allocated nodes to their backend
-    int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor);
+    int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor, tensor);
     if (cur_backend_id != -1) {
         SET_CAUSE(tensor, "1.dst");
         return cur_backend_id;
@@ -1170,7 +1182,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st

     // view_src
     if (tensor->view_src != NULL) {
-        cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src);
+        cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src, tensor);
         if (cur_backend_id != -1) {
             SET_CAUSE(tensor, "1.vsrc");
             return cur_backend_id;
@@ -1184,7 +1196,6 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
         return cur_backend_id;
     }

-    // assign nodes that use weights to the backend of the weights
     // operations with weights are preferably run on the same backend as the weights
     for (int i = 0; i < GGML_MAX_SRC; i++) {
         const struct ggml_tensor * src = tensor->src[i];
@@ -1192,11 +1203,11 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
             continue;
         }
         if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
-            int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src);
+            int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
             // check if a backend with higher prio wants to offload the op
             if (src_backend_id == sched->n_backends - 1) {
                 for (int b = 0; b < src_backend_id; b++) {
-                    if (ggml_backend_offload_op(sched->backends[b], tensor)) {
+                    if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
                         SET_CAUSE(tensor, "1.off");
                         return b;
                     }
@@ -1254,10 +1265,33 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
     }
 }

-//#define DEBUG_PASS1
-//#define DEBUG_PASS2
-//#define DEBUG_PASS3
-//#define DEBUG_PASS4
+static bool ggml_backend_sched_buffer_supported(ggml_backend_sched_t sched, struct ggml_tensor * t, int backend_id) {
+    ggml_backend_buffer_t buf = t->view_src ? t->view_src->buffer : t->buffer;
+    ggml_backend_buffer_type_t buft = NULL;
+
+    if (buf) {
+        // the tensor is already allocated
+        buft = buf->buft;
+    } else {
+        // see if the tensor already has a backend assigned, and use the buffer type of that backend
+        int tensor_backend_id = tensor_backend_id(t);
+        if (tensor_backend_id == -1 && t->view_src) {
+            tensor_backend_id = tensor_backend_id(t->view_src);
+        }
+        if (tensor_backend_id != -1) {
+            buft = sched->bufts[tensor_backend_id];
+        }
+    }
+
+    return buft != NULL && ggml_backend_supports_buft(sched->backends[backend_id], buft);
+}
+
+static void ggml_backend_sched_set_if_supported(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) {
+    if (ggml_backend_supports_op(sched->backends[cur_backend_id], node)) {
+        *node_backend_id = cur_backend_id;
+        SET_CAUSE(node, "2.sup");
+    }
+}

 // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
 static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
@@ -1267,7 +1301,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
     sched->is_reset = false;

     struct ggml_init_params params = {
-        /* .mem_size = */ sizeof(sched->context_buffer),
+        /* .mem_size = */ sched->context_buffer_size,
        /* .mem_buffer = */ sched->context_buffer,
        /* .no_alloc = */ true
    };
@@ -1276,52 +1310,52 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg

     sched->ctx = ggml_init(params);
     if (sched->ctx == NULL) {
-        fprintf(stderr, "%s: failed to initialize context\n", __func__);
-        GGML_ASSERT(false);
+        GGML_ABORT("%s: failed to initialize context\n", __func__);
     }

     // pass 1: assign backends to ops with pre-allocated inputs
     for (int i = 0; i < graph->n_leafs; i++) {
         struct ggml_tensor * leaf = graph->leafs[i];
         int * leaf_backend_id = &tensor_backend_id(leaf);
-        if (*leaf_backend_id != -1) {
-            // do not overwrite user assignments
-            continue;
+        // do not overwrite user assignments
+        if (*leaf_backend_id == -1) {
+            *leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf);
         }
-        *leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf);
     }

     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
         int * node_backend_id = &tensor_backend_id(node);
-        if (*node_backend_id != -1) {
-            // do not overwrite user assignments
-            continue;
-        }
-        *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);
-        // src
-        for (int j = 0; j < GGML_MAX_SRC; j++) {
-            struct ggml_tensor * src = node->src[j];
-            if (src == NULL) {
+        // do not overwrite user assignments
+        if (*node_backend_id == -1) {
+            *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);
+
+#if 0
+            // src
+            if (node->op == GGML_OP_NONE) {
                 continue;
             }
-            int * src_backend_id = &tensor_backend_id(src);
-            if (*src_backend_id == -1) {
-                *src_backend_id = ggml_backend_sched_backend_id_from_cur(sched, src);
+
+            for (int j = 0; j < GGML_MAX_SRC; j++) {
+                struct ggml_tensor * src = node->src[j];
+                if (src == NULL) {
+                    continue;
+                }
+                int * src_backend_id = &tensor_backend_id(src);
+                if (*src_backend_id == -1) {
+                    *src_backend_id = ggml_backend_sched_backend_id_from_cur(sched, src);
+                }
             }
+#endif
         }
     }
-#ifdef DEBUG_PASS1
-    fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
-#endif

     // pass 2: expand current backend assignments
     // assign the same backend to adjacent nodes
     // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
     // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
-
-    // pass 2.2 expand gpu down
+    // ops unsupported by the backend being expanded will be left unassigned so that they can be assigned later when the locations of its inputs are known
+    // expand gpu down
     {
         int cur_backend_id = -1;
         for (int i = 0; i < graph->n_nodes; i++) {
@@ -1337,13 +1371,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 } else {
                     cur_backend_id = *node_backend_id;
                 }
-            } else {
-                *node_backend_id = cur_backend_id;
-                SET_CAUSE(node, "2.2");
+            } else if (cur_backend_id != -1) {
+                ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
             }
         }
     }
-    // pass 2.1 expand gpu up
+    // expand gpu up
     {
         int cur_backend_id = -1;
         for (int i = graph->n_nodes - 1; i >= 0; i--) {
@@ -1359,13 +1392,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 } else {
                     cur_backend_id = *node_backend_id;
                 }
-            } else {
-                *node_backend_id = cur_backend_id;
-                SET_CAUSE(node, "2.1");
+            } else if (cur_backend_id != -1) {
+                ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
             }
         }
     }
-    // pass 2.4 expand rest down
+    // expand rest down
     {
         int cur_backend_id = -1;
         for (int i = 0; i < graph->n_nodes; i++) {
@@ -1376,13 +1408,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             int * node_backend_id = &tensor_backend_id(node);
             if (*node_backend_id != -1) {
                 cur_backend_id = *node_backend_id;
-            } else {
-                *node_backend_id = cur_backend_id;
-                SET_CAUSE(node, "2.4");
+            } else if (cur_backend_id != -1) {
+                ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
             }
         }
     }
-    // pass 2.3 expand rest up
+    // expand rest up
     {
         int cur_backend_id = -1;
|
||||
for (int i = graph->n_nodes - 1; i >= 0; i--) {
|
||||
@ -1393,24 +1424,80 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
||||
int * node_backend_id = &tensor_backend_id(node);
|
||||
if (*node_backend_id != -1) {
|
||||
cur_backend_id = *node_backend_id;
|
||||
} else {
|
||||
*node_backend_id = cur_backend_id;
|
||||
SET_CAUSE(node, "2.3");
|
||||
} else if (cur_backend_id != -1) {
|
||||
ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef DEBUG_PASS2
|
||||
fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
|
||||
#endif
|
||||
// pass 3: upgrade nodes to higher prio backends with compatible buffer types
|
||||
// if the tensor is already in the same buffer type (*) as another higher priority backend, we should move it there
|
||||
// however, we also need to verify that the sources are in compatible buffer types
|
||||
// (*) the actual requirement is more relaxed, the buffer type of the backend should be supported by all the users of this tensor further down the graph
|
||||
// however, this is slow to verify, so we have a more strict requirement that the buffer type is the same
|
||||
// this is not uncommon since multiple backends can use host memory, with the same buffer type (eg. BLAS and CPU)
|
||||
// additionally, set remaining unassigned nodes to the backend with the most supported inputs
|
||||
// only nodes that could not be assigned during expansion due to the backend not supporting the op should be unassigned at this point
|
||||
for (int i = 0; i < graph->n_nodes; i++) {
|
||||
struct ggml_tensor * node = graph->nodes[i];
|
||||
if (ggml_is_view_op(node->op)) {
|
||||
continue;
|
||||
}
|
||||
int * node_backend_id = &tensor_backend_id(node);
|
||||
if (*node_backend_id == -1) {
|
||||
// unassigned node: find the backend with the most supported inputs
|
||||
int n_supported_best = -1;
|
||||
for (int b = 0; b < sched->n_backends; b++) {
|
||||
if (ggml_backend_supports_op(sched->backends[b], node)) {
|
||||
int n_supported = 0;
|
||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||
struct ggml_tensor * src = node->src[j];
|
||||
if (src == NULL) {
|
||||
continue;
|
||||
}
|
||||
if ((tensor_backend_id(src) != -1 || tensor_backend_id(src->view_src) != -1) && ggml_backend_sched_buffer_supported(sched, src, b)) {
|
||||
n_supported++;
|
||||
}
|
||||
}
|
||||
if (n_supported > n_supported_best) {
|
||||
n_supported_best = n_supported;
|
||||
*node_backend_id = b;
|
||||
SET_CAUSE(node, "3.best");
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// assigned node: upgrade to higher prio backend if possible
|
||||
for (int b = 0; b < *node_backend_id; b++) {
|
||||
if (sched->bufts[b] == sched->bufts[*node_backend_id] && ggml_backend_supports_op(sched->backends[b], node)) {
|
||||
bool supported = true;
|
||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||
struct ggml_tensor * src = node->src[j];
|
||||
if (src == NULL) {
|
||||
continue;
|
||||
}
|
||||
if (!ggml_backend_sched_buffer_supported(sched, src, b)) {
|
||||
supported = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (supported) {
|
||||
*node_backend_id = b;
|
||||
SET_CAUSE(node, "3.upg");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// pass 3: assign backends to remaining src from dst and view_src
|
||||
// pass 4: assign backends to remaining src from dst and view_src
|
||||
for (int i = 0; i < graph->n_nodes; i++) {
|
||||
struct ggml_tensor * node = graph->nodes[i];
|
||||
int * cur_backend_id = &tensor_backend_id(node);
|
||||
if (node->view_src != NULL && *cur_backend_id == -1) {
|
||||
*cur_backend_id = tensor_backend_id(node->view_src);
|
||||
SET_CAUSE(node, "3.vsrc");
|
||||
SET_CAUSE(node, "4.vsrc");
|
||||
}
|
||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||
struct ggml_tensor * src = node->src[j];
|
||||
@ -1422,24 +1509,22 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
||||
if (src->view_src != NULL) {
|
||||
// views are always on the same backend as the source
|
||||
*src_backend_id = tensor_backend_id(src->view_src);
|
||||
SET_CAUSE(src, "3.vsrc");
|
||||
SET_CAUSE(src, "4.vsrc");
|
||||
} else {
|
||||
*src_backend_id = *cur_backend_id;
|
||||
SET_CAUSE(src, "3.cur");
|
||||
SET_CAUSE(src, "4.cur");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#ifdef DEBUG_PASS3
|
||||
fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
|
||||
#endif
|
||||
|
||||
// pass 4: split graph, find tensors that need to be copied
|
||||
// pass 5: split graph, find tensors that need to be copied
|
||||
{
|
||||
int i_split = 0;
|
||||
struct ggml_backend_sched_split * split = &sched->splits[0];
|
||||
// find the backend of the first split, skipping view ops
|
||||
for (int i = 0; i < graph->n_nodes; i++) {
|
||||
int i = 0;
|
||||
for (; i < graph->n_nodes; i++) {
|
||||
struct ggml_tensor * node = graph->nodes[i];
|
||||
if (!ggml_is_view_op(node->op)) {
|
||||
split->backend_id = tensor_backend_id(node);
|
||||
@ -1448,9 +1533,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
||||
}
|
||||
split->i_start = 0;
|
||||
split->n_inputs = 0;
|
||||
memset(split->inputs, 0, sizeof(split->inputs)); //HACK
|
||||
int cur_backend_id = split->backend_id;
|
||||
for (int i = 0; i < graph->n_nodes; i++) {
|
||||
for (; i < graph->n_nodes; i++) {
|
||||
struct ggml_tensor * node = graph->nodes[i];
|
||||
|
||||
if (ggml_is_view_op(node->op)) {
|
||||
@ -1459,7 +1543,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
||||
|
||||
const int node_backend_id = tensor_backend_id(node);
|
||||
|
||||
GGML_ASSERT(node_backend_id != -1); // all nodes should be assigned by now
|
||||
assert(node_backend_id != -1); // all nodes should be assigned by now
|
||||
|
||||
// check if we should start a new split based on the sources of the current node
|
||||
bool need_new_split = false;
|
||||
@ -1473,16 +1557,18 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
||||
// by starting a new split, the memory of the previously offloaded weights can be reused
|
||||
if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
|
||||
int src_backend_id = tensor_backend_id(src);
|
||||
if (src_backend_id != -1 && src_backend_id != cur_backend_id) {
|
||||
if (src_backend_id != cur_backend_id) {
|
||||
need_new_split = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
// check if the split has too many inputs
|
||||
// FIXME: count the number of inputs instead of only checking when full
|
||||
if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
|
||||
const size_t id = hash_id(src);
|
||||
int src_backend_id = sched->tensor_backend_id[id];
|
||||
if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL) {
|
||||
int src_backend_id = sched->hv_tensor_backend_ids[id];
|
||||
bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
|
||||
if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) {
|
||||
//printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
|
||||
need_new_split = true;
|
||||
break;
|
||||
@ -1514,12 +1600,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
||||
continue;
|
||||
}
|
||||
|
||||
const int src_backend_id = tensor_backend_id(src);
|
||||
size_t src_id = hash_id(src);
|
||||
const int src_backend_id = sched->hv_tensor_backend_ids[src_id];
|
||||
assert(src_backend_id != -1); // all inputs should be assigned by now
|
||||
|
||||
if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
|
||||
size_t id = hash_id(src);
|
||||
if (sched->tensor_copies[id][src_backend_id][0] == NULL) {
|
||||
if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
|
||||
if (tensor_id_copy(src_id, src_backend_id, 0) == NULL) {
|
||||
ggml_backend_t backend = sched->backends[src_backend_id];
|
||||
for (int c = 0; c < sched->n_copies; c++) {
|
||||
struct ggml_tensor * tensor_copy;
|
||||
@ -1533,7 +1619,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
||||
ggml_set_input(tensor_copy);
|
||||
ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
|
||||
}
|
||||
sched->tensor_copies[id][src_backend_id][c] = tensor_copy;
|
||||
tensor_id_copy(src_id, src_backend_id, c) = tensor_copy;
|
||||
SET_CAUSE(tensor_copy, "4.cpy");
|
||||
}
|
||||
int n_graph_inputs = sched->n_graph_inputs++;
|
||||
@ -1542,10 +1628,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
||||
}
|
||||
}
|
||||
|
||||
if (src_backend_id != node_backend_id) {
|
||||
if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
|
||||
// create a copy of the input in the split's backend
|
||||
const size_t id = hash_id(src);
|
||||
if (sched->tensor_copies[id][cur_backend_id][0] == NULL) {
|
||||
if (tensor_id_copy(src_id, cur_backend_id, 0) == NULL) {
|
||||
ggml_backend_t backend = sched->backends[cur_backend_id];
|
||||
for (int c = 0; c < sched->n_copies; c++) {
|
||||
struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
|
||||
@ -1554,27 +1639,49 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
||||
ggml_set_input(tensor_copy);
|
||||
ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
|
||||
}
|
||||
sched->tensor_copies[id][cur_backend_id][c] = tensor_copy;
|
||||
tensor_id_copy(src_id, cur_backend_id, c) = tensor_copy;
|
||||
SET_CAUSE(tensor_copy, "4.cpy");
|
||||
}
|
||||
int n_inputs = split->n_inputs++;
|
||||
GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
|
||||
split->inputs[n_inputs] = src;
|
||||
}
|
||||
node->src[j] = sched->tensor_copies[id][cur_backend_id][sched->cur_copy];
|
||||
node->src[j] = tensor_id_copy(src_id, cur_backend_id, sched->cur_copy);
|
||||
}
|
||||
}
|
||||
}
|
||||
split->i_end = graph->n_nodes;
|
||||
sched->n_splits = i_split + 1;
|
||||
}
|
||||
#ifdef DEBUG_PASS4
|
||||
fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
|
||||
#endif
|
||||
|
||||
// create copies of the graph for each split
|
||||
// TODO: avoid this copy
|
||||
struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2, false);
|
||||
if (sched->debug) {
|
||||
ggml_backend_sched_print_assignments(sched, graph);
|
||||
}
|
||||
|
||||
// swap node_backend_ids and leaf _backend_ids with prevs
|
||||
{
|
||||
int * tmp = sched->node_backend_ids;
|
||||
sched->node_backend_ids = sched->prev_node_backend_ids;
|
||||
sched->prev_node_backend_ids = tmp;
|
||||
|
||||
tmp = sched->leaf_backend_ids;
|
||||
sched->leaf_backend_ids = sched->prev_leaf_backend_ids;
|
||||
sched->prev_leaf_backend_ids = tmp;
|
||||
}
|
||||
|
||||
int graph_size = graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2;
|
||||
if (sched->graph.size < graph_size) {
|
||||
sched->graph.size = graph_size;
|
||||
sched->graph.nodes = realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *));
|
||||
sched->graph.leafs = realloc(sched->graph.leafs, graph_size * sizeof(struct ggml_tensor *));
|
||||
GGML_ASSERT(sched->graph.nodes != NULL);
|
||||
GGML_ASSERT(sched->graph.leafs != NULL);
|
||||
}
|
||||
sched->graph.n_nodes = 0;
|
||||
sched->graph.n_leafs = 0;
|
||||
|
||||
struct ggml_cgraph * graph_copy = &sched->graph;
|
||||
|
||||
for (int i = 0; i < sched->n_splits; i++) {
|
||||
struct ggml_backend_sched_split * split = &sched->splits[i];
|
||||
split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
|
||||
@ -1585,12 +1692,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
||||
|
||||
struct ggml_tensor * input = split->inputs[j];
|
||||
const size_t input_id = hash_id(input);
|
||||
struct ggml_tensor * input_cpy = sched->tensor_copies[input_id][split->backend_id][sched->cur_copy];
|
||||
struct ggml_tensor * input_cpy = tensor_id_copy(input_id, split->backend_id, sched->cur_copy);
|
||||
|
||||
// add a dependency to the input source so that it is not freed before the copy is done
|
||||
struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
|
||||
input_dep->src[0] = input;
|
||||
sched->node_backend_ids[graph_copy->n_nodes] = sched->tensor_backend_id[input_id];
|
||||
sched->node_backend_ids[graph_copy->n_nodes] = sched->hv_tensor_backend_ids[input_id];
|
||||
graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
|
||||
|
||||
// add a dependency to the input copy so that it is allocated at the start of the split
|
||||
@ -1612,7 +1719,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
||||
size_t id = hash_id(input);
|
||||
int backend_id = tensor_backend_id(input);
|
||||
for (int c = 0; c < sched->n_copies; c++) {
|
||||
struct ggml_tensor * input_cpy = sched->tensor_copies[id][backend_id][c];
|
||||
struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
|
||||
sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
|
||||
graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
|
||||
}
|
||||
@ -1625,7 +1732,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
||||
struct ggml_tensor * input = split->inputs[j];
|
||||
size_t id = hash_id(input);
|
||||
for (int c = 0; c < sched->n_copies; c++) {
|
||||
struct ggml_tensor * input_cpy = sched->tensor_copies[id][backend_id][c];
|
||||
struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
|
||||
sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
|
||||
graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
|
||||
}
|
||||
@ -1639,20 +1746,36 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
||||
sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
|
||||
graph_copy->leafs[graph_copy->n_leafs++] = leaf;
|
||||
}
|
||||
|
||||
sched->graph = graph_copy;
|
||||
}
|
||||
|
||||
static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
|
||||
bool backend_ids_changed = false;
|
||||
for (int i = 0; i < sched->graph.n_nodes; i++) {
|
||||
if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i] &&
|
||||
sched->bufts[sched->node_backend_ids[i]] != sched->bufts[sched->prev_node_backend_ids[i]]) {
|
||||
backend_ids_changed = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!backend_ids_changed) {
|
||||
for (int i = 0; i < sched->graph.n_leafs; i++) {
|
||||
if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i] &&
|
||||
sched->bufts[sched->leaf_backend_ids[i]] != sched->bufts[sched->prev_leaf_backend_ids[i]]) {
|
||||
backend_ids_changed = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// allocate graph
|
||||
if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
|
||||
if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
|
||||
// the re-allocation may cause the split inputs to be moved to a different address
|
||||
ggml_backend_sched_synchronize(sched);
|
||||
#ifndef NDEBUG
|
||||
fprintf(stderr, "%s: failed to allocate graph, reserving\n", __func__);
|
||||
fprintf(stderr, "%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
|
||||
#endif
|
||||
ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
|
||||
if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
|
||||
ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
|
||||
if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
|
||||
fprintf(stderr, "%s: failed to allocate graph\n", __func__);
|
||||
return false;
|
||||
}
|
||||
@ -1673,7 +1796,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
||||
for (int j = 0; j < split->n_inputs; j++) {
|
||||
ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
|
||||
struct ggml_tensor * input = split->inputs[j];
|
||||
struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id][sched->cur_copy];
|
||||
struct ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy);
|
||||
|
||||
if (input->flags & GGML_TENSOR_FLAG_INPUT) {
|
||||
// inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
|
||||
@ -1758,18 +1881,24 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
||||
|
||||
struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));
|
||||
|
||||
sched->debug = getenv("GGML_SCHED_DEBUG") != NULL;
|
||||
sched->n_backends = n_backends;
|
||||
sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
|
||||
|
||||
// initialize hash table
|
||||
sched->hash_set = ggml_hash_set_new(graph_size);
|
||||
sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0]));
|
||||
sched->tensor_copies = calloc(sched->hash_set.size, sizeof(sched->tensor_copies[0]));
|
||||
// FIXME: needs to be size*2 to account for leafs (do it in graph_split instead)
|
||||
sched->hash_set = ggml_hash_set_new(graph_size);
|
||||
sched->hv_tensor_backend_ids = malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
|
||||
sched->hv_tensor_copies = malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *));
|
||||
|
||||
const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
|
||||
sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
|
||||
sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
|
||||
sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
|
||||
sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
|
||||
sched->prev_node_backend_ids = calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
|
||||
sched->prev_leaf_backend_ids = calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
|
||||
|
||||
sched->n_backends = n_backends;
|
||||
|
||||
sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
|
||||
sched->context_buffer_size = GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + ggml_graph_overhead_custom(graph_size, false);
|
||||
sched->context_buffer = malloc(sched->context_buffer_size);
|
||||
|
||||
const int initial_splits_capacity = 16;
|
||||
sched->splits = calloc(initial_splits_capacity, sizeof(sched->splits[0]));
|
||||
@ -1778,7 +1907,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
||||
for (int b = 0; b < n_backends; b++) {
|
||||
sched->backends[b] = backends[b];
|
||||
sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]);
|
||||
GGML_ASSERT(ggml_backend_buft_supports_backend(sched->bufts[b], backends[b]));
|
||||
GGML_ASSERT(ggml_backend_supports_buft(backends[b], sched->bufts[b]));
|
||||
if (sched->n_copies > 1) {
|
||||
for (int c = 0; c < sched->n_copies; c++) {
|
||||
sched->events[b][c] = ggml_backend_event_new(backends[b]);
|
||||
@ -1804,35 +1933,37 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
|
||||
}
|
||||
ggml_gallocr_free(sched->galloc);
|
||||
ggml_free(sched->ctx);
|
||||
ggml_hash_set_free(&sched->hash_set);
|
||||
free(sched->splits);
|
||||
free(sched->hash_set.keys);
|
||||
free(sched->tensor_backend_id);
|
||||
free(sched->tensor_copies);
|
||||
free(sched->hv_tensor_backend_ids);
|
||||
free(sched->hv_tensor_copies);
|
||||
free(sched->node_backend_ids);
|
||||
free(sched->leaf_backend_ids);
|
||||
free(sched->prev_node_backend_ids);
|
||||
free(sched->prev_leaf_backend_ids);
|
||||
free(sched->context_buffer);
|
||||
free(sched->graph.nodes);
|
||||
free(sched->graph.leafs);
|
||||
free(sched);
|
||||
}
|
||||
|
||||
void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
|
||||
// reset state for the next run
|
||||
if (!sched->is_reset) {
|
||||
size_t hash_size = sched->hash_set.size;
|
||||
memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
|
||||
memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
|
||||
memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
|
||||
|
||||
ggml_hash_set_reset(&sched->hash_set);
|
||||
memset(sched->hv_tensor_backend_ids, -1, sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
|
||||
memset(sched->hv_tensor_copies, 0, sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *));
|
||||
sched->is_reset = true;
|
||||
}
|
||||
sched->is_alloc = false;
|
||||
}
|
||||
|
||||
bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
|
||||
GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes);
|
||||
GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
|
||||
|
||||
ggml_backend_sched_split_graph(sched, measure_graph);
|
||||
|
||||
// TODO: extract this to a separate function
|
||||
if (!ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
|
||||
if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -1843,10 +1974,11 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
|
||||
}
|
||||
|
||||
bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
||||
GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes);
|
||||
GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs);
|
||||
|
||||
ggml_backend_sched_split_graph(sched, graph);
|
||||
|
||||
|
||||
if (!ggml_backend_sched_alloc_splits(sched)) {
|
||||
return false;
|
||||
}
|
||||
@ -1895,6 +2027,15 @@ int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched) {
|
||||
return sched->n_copies;
|
||||
}
|
||||
|
||||
int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched) {
|
||||
return sched->n_backends;
|
||||
}
|
||||
|
||||
ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i) {
|
||||
GGML_ASSERT(i >= 0 && i < sched->n_backends);
|
||||
return sched->backends[i];
|
||||
}
|
||||
|
||||
size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
|
||||
int backend_index = ggml_backend_sched_backend_id(sched, backend);
|
||||
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
||||
@ -1906,6 +2047,8 @@ void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct gg
|
||||
int backend_index = ggml_backend_sched_backend_id(sched, backend);
|
||||
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
||||
tensor_backend_id(node) = backend_index;
|
||||
SET_CAUSE(node, "usr");
|
||||
sched->is_reset = false;
|
||||
}
|
||||
|
||||
ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
|
||||
@ -1948,9 +2091,9 @@ static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set,
|
||||
GGML_ASSERT(src != NULL);
|
||||
GGML_ASSERT(src->data && "graph must be allocated");
|
||||
|
||||
size_t id = ggml_hash_insert(hash_set, src);
|
||||
if (id == GGML_HASHTABLE_ALREADY_EXISTS) {
|
||||
return node_copies[ggml_hash_find(hash_set, src)];
|
||||
size_t id = ggml_hash_insert(&hash_set, src);
|
||||
if (id == GGML_HASHSET_ALREADY_EXISTS) {
|
||||
return node_copies[ggml_hash_find(&hash_set, src)];
|
||||
}
|
||||
|
||||
struct ggml_tensor * dst = ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src);
|
||||
@ -1975,7 +2118,7 @@ static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set,
|
||||
return dst;
|
||||
}
|
||||
|
||||
static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) {
|
||||
static void graph_copy_init_tensor(struct ggml_hash_set * hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) {
|
||||
size_t id = ggml_hash_find(hash_set, src);
|
||||
if (node_init[id]) {
|
||||
return;
|
||||
@ -2002,10 +2145,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
|
||||
}
|
||||
|
||||
struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
|
||||
struct ggml_hash_set hash_set = {
|
||||
/* .size = */ graph->visited_hash_table.size,
|
||||
/* .keys = */ calloc(graph->visited_hash_table.size, sizeof(hash_set.keys[0])) // NOLINT
|
||||
};
|
||||
struct ggml_hash_set hash_set = ggml_hash_set_new(graph->visited_hash_set.size);
|
||||
struct ggml_tensor ** node_copies = calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
|
||||
bool * node_init = calloc(hash_set.size, sizeof(node_init[0]));
|
||||
|
||||
@ -2020,7 +2160,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
|
||||
|
||||
if (ctx_allocated == NULL || ctx_unallocated == NULL) {
|
||||
fprintf(stderr, "failed to allocate context for graph copy\n");
|
||||
free(hash_set.keys);
|
||||
ggml_hash_set_free(&hash_set);
|
||||
free(node_copies);
|
||||
free(node_init);
|
||||
ggml_free(ctx_allocated);
|
||||
@ -2043,7 +2183,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
|
||||
ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
|
||||
if (buffer == NULL) {
|
||||
fprintf(stderr, "failed to allocate buffer for graph copy\n");
|
||||
free(hash_set.keys);
|
||||
ggml_hash_set_free(&hash_set);
|
||||
free(node_copies);
|
||||
free(node_init);
|
||||
ggml_free(ctx_allocated);
|
||||
@ -2061,19 +2201,19 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
|
||||
// copy data and init views
|
||||
for (int i = 0; i < graph->n_nodes; i++) {
|
||||
struct ggml_tensor * node = graph->nodes[i];
|
||||
graph_copy_init_tensor(hash_set, node_copies, node_init, node);
|
||||
graph_copy_init_tensor(&hash_set, node_copies, node_init, node);
|
||||
}
|
||||
|
||||
// build graph copy
|
||||
struct ggml_cgraph * graph_copy = ggml_new_graph_custom(ctx_allocated, graph->size, false);
|
||||
for (int i = 0; i < graph->n_nodes; i++) {
|
||||
struct ggml_tensor * node = graph->nodes[i];
|
||||
struct ggml_tensor * node_copy = node_copies[ggml_hash_find(hash_set, node)];
|
||||
struct ggml_tensor * node_copy = node_copies[ggml_hash_find(&hash_set, node)];
|
||||
graph_copy->nodes[i] = node_copy;
|
||||
}
|
||||
graph_copy->n_nodes = graph->n_nodes;
|
||||
|
||||
free(hash_set.keys);
|
||||
ggml_hash_set_free(&hash_set);
|
||||
free(node_copies);
|
||||
free(node_init);
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
@ -49,28 +49,29 @@ extern "C" {
|
||||
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
|
||||
GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
|
||||
GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
|
||||
GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
|
||||
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
|
||||
|
||||
// buffer
|
||||
enum ggml_backend_buffer_usage {
|
||||
GGML_BACKEND_BUFFER_USAGE_ANY = 0,
|
||||
GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
|
||||
GGML_BACKEND_BUFFER_USAGE_COMPUTE = 2,
|
||||
};
|
||||
|
||||
GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer);
|
||||
GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
|
||||
GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
|
||||
GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
|
||||
GGML_API GGML_CALL void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
|
||||
GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
|
||||
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
|
||||
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
|
||||
GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer);
|
||||
GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer);
|
||||
GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer);
|
||||
GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
|
||||
GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
|
||||
GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
|
||||
GGML_API GGML_CALL void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
|
||||
GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
|
||||
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
|
||||
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
|
||||
GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
|
||||
GGML_API enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage (ggml_backend_buffer_t buffer);
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer);
|
||||
GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer);
|
||||
|
||||
//
|
||||
// Backend
|
||||
@ -100,6 +101,7 @@ extern "C" {
|
||||
GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
||||
GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
||||
GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
|
||||
GGML_API bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
|
||||
GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);
|
||||
|
||||
// tensor copy between different backends
|
||||
@ -116,7 +118,7 @@ extern "C" {
|
||||
GGML_API void ggml_backend_event_free (ggml_backend_event_t event);
|
||||
GGML_API void ggml_backend_event_record (ggml_backend_event_t event);
|
||||
GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event);
|
||||
GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event); // wait async on event
|
||||
GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event);
|
||||
|
||||
//
|
||||
// CPU backend
|
||||
@ -145,7 +147,7 @@ extern "C" {
|
||||
|
||||
GGML_API size_t ggml_backend_reg_get_count(void);
|
||||
GGML_API size_t ggml_backend_reg_find_by_name(const char * name);
|
||||
GGML_API ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is name[:params]
|
||||
GGML_API ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is backend_name:params (params is optional)
|
||||
GGML_API const char * ggml_backend_reg_get_name(size_t i);
|
||||
GGML_API ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i);
|
||||
@ -208,6 +210,9 @@ extern "C" {
|
||||
// Initialize backend buffers from a measure graph
|
||||
GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
|
||||
|
||||
GGML_API int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);
|
||||
GGML_API ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i);
|
||||
|
||||
// Get the number of splits of the last graph
|
||||
GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
|
||||
GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);
|
||||
|
@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
@ -45,7 +45,11 @@ typedef half2 ggml_half2;
|
||||
|
||||
#define GGML_COMMON_DECL
|
||||
#elif defined(GGML_COMMON_DECL_CUDA)
|
||||
#if defined(GGML_COMMON_DECL_MUSA)
|
||||
#include <musa_fp16.h>
|
||||
#else
|
||||
#include <cuda_fp16.h>
|
||||
#endif
|
||||
#include <cstdint>
|
||||
|
||||
typedef half ggml_half;
|
||||
@ -132,19 +136,19 @@ typedef sycl::half2 ggml_half2;
|
||||
#define QR6_K 2
|
||||
|
||||
#define QI2_XXS (QK_K / (4*QR2_XXS))
|
||||
#define QR2_XXS 8
|
||||
#define QR2_XXS 4
|
||||
|
||||
#define QI2_XS (QK_K / (4*QR2_XS))
|
||||
#define QR2_XS 8
|
||||
#define QR2_XS 4
|
||||
|
||||
#define QI2_S (QK_K / (4*QR2_S))
|
||||
#define QR2_S 8
|
||||
#define QR2_S 4
|
||||
|
||||
#define QI3_XXS (QK_K / (4*QR3_XXS))
|
||||
#define QR3_XXS 8
|
||||
#define QR3_XXS 4
|
||||
|
||||
#define QI3_XS (QK_K / (4*QR3_XS))
|
||||
#define QR3_XS 8
|
||||
#define QR3_XS 4
|
||||
|
||||
#define QI1_S (QK_K / (4*QR1_S))
|
||||
#define QR1_S 8
|
||||
@ -156,10 +160,10 @@ typedef sycl::half2 ggml_half2;
|
||||
#define QR4_NL 2
|
||||
|
||||
#define QI4_XS (QK_K / (4*QR4_XS))
|
||||
#define QR4_XS 8
|
||||
#define QR4_XS 2
|
||||
|
||||
#define QI3_S (QK_K / (4*QR3_S))
|
||||
#define QR3_S 8
|
||||
#define QR3_S 4
|
||||
|
||||
#endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP
|
||||
|
||||
@ -225,6 +229,30 @@ typedef struct {
|
||||
} block_q8_1;
|
||||
static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_half) + QK8_1, "wrong q8_1 block size/padding");
|
||||
|
||||
typedef struct {
|
||||
ggml_half d[4]; // deltas for 4 q4_0 blocks
|
||||
uint8_t qs[QK4_0 * 2]; // nibbles / quants for 4 q4_0 blocks
|
||||
} block_q4_0x4;
|
||||
static_assert(sizeof(block_q4_0x4) == 4 * sizeof(ggml_half) + QK4_0 * 2, "wrong q4_0x4 block size/padding");
|
||||
|
||||
typedef struct {
|
||||
ggml_half d[8]; // deltas for 8 q4_0 blocks
|
||||
uint8_t qs[QK4_0 * 4]; // nibbles / quants for 8 q4_0 blocks
|
||||
} block_q4_0x8;
|
||||
static_assert(sizeof(block_q4_0x8) == 8 * sizeof(ggml_half) + QK4_0 * 4, "wrong q4_0x8 block size/padding");
|
||||
|
||||
typedef struct {
|
||||
ggml_half d[4]; // deltas for 4 q8_0 blocks
|
||||
int8_t qs[QK8_0 * 4]; // quants for 4 q8_0 blocks
|
||||
} block_q8_0x4;
|
||||
static_assert(sizeof(block_q8_0x4) == 4 * sizeof(ggml_half) + QK8_0 * 4, "wrong q8_0x4 block size/padding");
|
||||
|
||||
typedef struct {
|
||||
ggml_half d[8]; // deltas for 8 q8_0 blocks
|
||||
int8_t qs[QK8_0 * 8]; // quants for 8 q8_0 blocks
|
||||
} block_q8_0x8;
|
||||
static_assert(sizeof(block_q8_0x8) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong q8_0x8 block size/padding");
|
||||
|
||||
//
|
||||
// Super-block quantization structures
|
||||
//
|
||||
@ -417,7 +445,7 @@ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_
|
||||
#define GGML_TABLE_END() };
|
||||
|
||||
#define GGML_COMMON_IMPL
|
||||
#elif defined(GGML_COMMON_IMPL_CUDA) || defined(GGML_COMMON_IMPL_HIP)
|
||||
#elif defined(GGML_COMMON_IMPL_CUDA) || defined(GGML_COMMON_IMPL_HIP) || defined(GGML_COMMON_IMPL_MUSA)
|
||||
#include <cstdint>
|
||||
|
||||
#define GGML_TABLE_BEGIN(type, name, size) static const __device__ type name[size] = {
|
||||
|
@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
@ -55,6 +55,7 @@
|
||||
#include "ggml-cuda/tsembd.cuh"
|
||||
#include "ggml-cuda/unary.cuh"
|
||||
#include "ggml-cuda/upscale.cuh"
|
||||
#include "ggml-cuda/conv-transpose-1d.cuh"
|
||||
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
@ -123,7 +124,7 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in
|
||||
GGML_CUDA_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line);
|
||||
GGML_CUDA_LOG_ERROR(" %s\n", stmt);
|
||||
// abort with GGML_ASSERT to get a stack trace
|
||||
GGML_ASSERT(!"CUDA error");
|
||||
GGML_ABORT("CUDA error");
|
||||
}
|
||||
|
||||
// this is faster on Windows
|
||||
@ -178,21 +179,21 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||
GGML_ASSERT(info.device_count <= GGML_CUDA_MAX_DEVICES);
|
||||
|
||||
int64_t total_vram = 0;
|
||||
#if defined(GGML_CUDA_FORCE_MMQ)
|
||||
GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
|
||||
#ifdef GGML_CUDA_FORCE_MMQ
|
||||
GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
|
||||
#else
|
||||
GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
|
||||
#endif
|
||||
#if defined(CUDA_USE_TENSOR_CORES)
|
||||
GGML_CUDA_LOG_INFO("%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
|
||||
GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
|
||||
#endif // GGML_CUDA_FORCE_MMQ
|
||||
#ifdef GGML_CUDA_FORCE_CUBLAS
|
||||
GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: yes\n", __func__);
|
||||
#else
|
||||
GGML_CUDA_LOG_INFO("%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
|
||||
#endif
|
||||
GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: no\n", __func__);
|
||||
#endif // GGML_CUDA_FORCE_CUBLAS
|
||||
GGML_CUDA_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
|
||||
for (int id = 0; id < info.device_count; ++id) {
|
||||
int device_vmm = 0;
|
||||
|
||||
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
|
||||
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA)
|
||||
CUdevice device;
|
||||
CU_CHECK(cuDeviceGet(&device, id));
|
||||
CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
|
||||
@ -204,7 +205,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||
alloc_prop.location.id = id;
|
||||
CU_CHECK(cuMemGetAllocationGranularity(&info.devices[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
|
||||
}
|
||||
#endif // !defined(GGML_USE_HIPBLAS)
|
||||
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA)
|
||||
info.devices[id].vmm = !!device_vmm;
|
||||
|
||||
cudaDeviceProp prop;
|
||||
@ -214,13 +215,15 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||
info.default_tensor_split[id] = total_vram;
|
||||
total_vram += prop.totalGlobalMem;
|
||||
|
||||
info.devices[id].nsm = prop.multiProcessorCount;
|
||||
info.devices[id].smpb = prop.sharedMemPerBlock;
|
||||
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||
info.devices[id].smpbo = prop.sharedMemPerBlock;
|
||||
info.devices[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
|
||||
#else
|
||||
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
|
||||
info.devices[id].cc = 100*prop.major + 10*prop.minor;
|
||||
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||
info.devices[id].smpb = prop.sharedMemPerBlock;
|
||||
info.devices[id].nsm = prop.multiProcessorCount;
|
||||
}
|
||||
|
||||
for (int id = 0; id < info.device_count; ++id) {
|
||||
@ -338,7 +341,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
|
||||
};
|
||||
|
||||
// pool with virtual memory
|
||||
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
|
||||
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA)
|
||||
struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
|
||||
static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
|
||||
|
||||
@ -432,14 +435,14 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
|
||||
GGML_ASSERT(ptr == (void *) (pool_addr + pool_used));
|
||||
}
|
||||
};
|
||||
#endif // !defined(GGML_USE_HIPBLAS)
|
||||
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA)
|
||||
|
||||
std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
|
||||
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
|
||||
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA)
|
||||
if (ggml_cuda_info().devices[device].vmm) {
|
||||
return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
|
||||
}
|
||||
#endif
|
||||
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA)
|
||||
return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device));
|
||||
}
|
||||
|
||||
@ -491,12 +494,12 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t
|
||||
return;
|
||||
}
|
||||
|
||||
if (ggml_is_quantized(tensor->type)) {
|
||||
if (ggml_is_quantized(tensor->type) && tensor->view_src == nullptr && ggml_backend_buffer_get_usage(buffer) != GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
|
||||
// initialize padding to 0 to avoid possible NaN values
|
||||
size_t original_size = ggml_nbytes(tensor);
|
||||
size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
|
||||
|
||||
if (padded_size > original_size && tensor->view_src == nullptr) {
|
||||
if (padded_size > original_size) {
|
||||
ggml_cuda_set_device(ctx->device);
|
||||
CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size));
|
||||
}
|
||||
@ -573,6 +576,10 @@ GGML_CALL static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_bu
|
||||
return ctx->name.c_str();
|
||||
}
|
||||
|
||||
static bool ggml_backend_buft_is_cuda(ggml_backend_buffer_type_t buft) {
|
||||
return buft->iface.get_name == ggml_backend_cuda_buffer_type_name;
|
||||
}
|
||||
|
||||
GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
|
||||
|
||||
@ -615,24 +622,12 @@ GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backen
|
||||
GGML_UNUSED(buft);
|
||||
}
|
||||
|
||||
GGML_CALL static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
||||
if (!ggml_backend_is_cuda(backend)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||
|
||||
return buft_ctx->device == cuda_ctx->device;
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
|
||||
/* .get_name = */ ggml_backend_cuda_buffer_type_name,
|
||||
/* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
|
||||
/* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
|
||||
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
||||
/* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
|
||||
/* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
|
||||
/* .is_host = */ NULL,
|
||||
};
|
||||
|
||||
@ -671,7 +666,7 @@ static int64_t get_row_rounding(const std::array<float, GGML_CUDA_MAX_DEVICES> &
|
||||
}
|
||||
|
||||
const int cc = ggml_cuda_info().devices[id].cc;
|
||||
row_rounding = std::max(row_rounding, (int64_t)get_mmq_y_host(cc, get_mmq_x_max_host(cc)));
|
||||
row_rounding = std::max(row_rounding, (int64_t)get_mmq_y_host(cc));
|
||||
}
|
||||
return row_rounding;
|
||||
}
|
||||
@ -893,6 +888,10 @@ GGML_CALL static const char * ggml_backend_cuda_split_buffer_type_name(ggml_back
|
||||
GGML_UNUSED(buft);
|
||||
}
|
||||
|
||||
static bool ggml_backend_buft_is_cuda_split(ggml_backend_buffer_type_t buft) {
|
||||
return buft->iface.get_name == ggml_backend_cuda_split_buffer_type_name;
|
||||
}
|
||||
|
||||
GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
// since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
|
||||
// instead, we allocate them for each tensor separately in init_tensor
|
||||
@ -936,12 +935,6 @@ GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_
|
||||
return total_size;
|
||||
}
|
||||
|
||||
GGML_CALL static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
||||
return ggml_backend_is_cuda(backend);
|
||||
|
||||
GGML_UNUSED(buft);
|
||||
}
|
||||
|
||||
GGML_CALL static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
||||
return false;
|
||||
|
||||
@ -954,7 +947,6 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface
|
||||
/* .get_alignment = */ ggml_backend_cuda_split_buffer_type_get_alignment,
|
||||
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
||||
/* .get_alloc_size = */ ggml_backend_cuda_split_buffer_type_get_alloc_size,
|
||||
/* .supports_backend = */ ggml_backend_cuda_split_buffer_type_supports_backend,
|
||||
/* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
|
||||
};
|
||||
|
||||
@ -1054,7 +1046,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
|
||||
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
|
||||
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
||||
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
|
||||
/* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
|
||||
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
|
||||
},
|
||||
/* .context = */ nullptr,
|
||||
@ -1377,10 +1368,30 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
|
||||
GGML_UNUSED(main_device);
|
||||
}
|
||||
|
||||
static cudaError_t ggml_cuda_Memcpy2DPeerAsync(
|
||||
void * dst, int dstDevice, size_t dpitch, void * src, int srcDevice, size_t spitch, size_t width, size_t height, cudaStream_t stream) {
|
||||
|
||||
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
|
||||
// cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
|
||||
cudaMemcpy3DPeerParms p = {};
|
||||
p.dstDevice = dstDevice;
|
||||
p.dstPtr = make_cudaPitchedPtr(dst, dpitch, dpitch, height);
|
||||
p.srcDevice = srcDevice;
|
||||
p.srcPtr = make_cudaPitchedPtr(src, spitch, spitch, height);
|
||||
p.extent = make_cudaExtent(width, height, 1);
|
||||
return cudaMemcpy3DPeerAsync(&p, stream);
|
||||
#else
|
||||
// HIP does not support cudaMemcpy3DPeerAsync or vmm pools
|
||||
GGML_UNUSED(dstDevice);
|
||||
GGML_UNUSED(srcDevice);
|
||||
return cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, cudaMemcpyDeviceToDevice, stream);
|
||||
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
|
||||
}
|
||||
|
||||
static void ggml_cuda_op_mul_mat(
|
||||
ggml_backend_cuda_context & ctx,
|
||||
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
|
||||
const bool convert_src1_to_q8_1) {
|
||||
quantize_cuda_t quantize_src1) {
|
||||
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
const int64_t ne01 = src0->ne[1];
|
||||
@ -1437,7 +1448,9 @@ static void ggml_cuda_op_mul_mat(
|
||||
}
|
||||
|
||||
struct dev_data {
|
||||
ggml_cuda_pool_alloc<char> src0_dd_alloc;
|
||||
int cc;
|
||||
|
||||
ggml_cuda_pool_alloc<char> src0_dd_alloc;
|
||||
ggml_cuda_pool_alloc<float> src1_ddf_alloc;
|
||||
ggml_cuda_pool_alloc<char> src1_ddq_alloc;
|
||||
ggml_cuda_pool_alloc<float> dst_dd_alloc;
|
||||
@ -1456,6 +1469,8 @@ static void ggml_cuda_op_mul_mat(
|
||||
int used_devices = 0;
|
||||
|
||||
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
|
||||
dev[id].cc = ggml_cuda_info().devices[id].cc;
|
||||
|
||||
// by default, use all rows
|
||||
dev[id].row_low = 0;
|
||||
dev[id].row_high = ne01;
|
||||
@ -1500,17 +1515,28 @@ static void ggml_cuda_op_mul_mat(
|
||||
dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ctx.pool(id), ggml_nbytes(src0));
|
||||
}
|
||||
|
||||
// If src0 is on a temporary compute buffers (partial offloading) there may be some padding that needs to be cleared:
|
||||
if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) {
|
||||
const int64_t nbytes_data = ggml_row_size(src0->type, (dev[id].row_high - dev[id].row_low)*ne00);
|
||||
const int64_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
|
||||
CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data , 0, nbytes_padding, stream));
|
||||
}
|
||||
|
||||
if (src1_on_device && src1_is_contiguous) {
|
||||
dev[id].src1_ddf = (float *) src1->data;
|
||||
} else {
|
||||
dev[id].src1_ddf = dev[id].src1_ddf_alloc.alloc(ctx.pool(id), ggml_nelements(src1));
|
||||
}
|
||||
|
||||
if (convert_src1_to_q8_1) {
|
||||
dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs);
|
||||
if (quantize_src1) {
|
||||
size_t src_1_ddq_size = nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs;
|
||||
if (quantize_src1 == quantize_mmq_q8_1_cuda) {
|
||||
src_1_ddq_size += get_mmq_x_max_host(dev[id].cc)*sizeof(block_q8_1_mmq);
|
||||
}
|
||||
dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), src_1_ddq_size);
|
||||
|
||||
if (src1_on_device && src1_is_contiguous) {
|
||||
quantize_row_q8_1_cuda(dev[id].src1_ddf, dev[id].src1_ddq, ne10, nrows1, src1_padded_col_size, stream);
|
||||
quantize_src1(dev[id].src1_ddf, dev[id].src1_ddq, ne10, ne11, ne12*ne13, src1_padded_col_size, src0->type, stream);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
}
|
||||
}
|
||||
@ -1556,7 +1582,12 @@ static void ggml_cuda_op_mul_mat(
|
||||
const int64_t i03 = i0 / ne12;
|
||||
const int64_t i02 = i0 % ne12;
|
||||
|
||||
const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;
|
||||
size_t src1_ddq_i_offset = i0*ne11 * src1_padded_col_size*q8_1_ts/q8_1_bs;
|
||||
if (quantize_src1 == quantize_mmq_q8_1_cuda) {
|
||||
src1_ddq_i_offset += src1_col_0 * sizeof(block_q8_1_mmq);
|
||||
} else {
|
||||
src1_ddq_i_offset += src1_col_0 * src1_padded_col_size*q8_1_ts/q8_1_bs;
|
||||
}
|
||||
|
||||
// for split tensors the data begins at i0 == i0_offset_low
|
||||
char * src0_dd_i = dev[id].src0_dd + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
|
||||
@@ -1573,10 +1604,17 @@ static void ggml_cuda_op_mul_mat(
// copy src0, src1 to device if necessary
if (src1_is_contiguous) {
if (id != ctx.device) {
if (convert_src1_to_q8_1) {
if (quantize_src1) {
char * src1_ddq_i_source = dev[ctx.device].src1_ddq + src1_ddq_i_offset;
CUDA_CHECK(cudaMemcpyPeerAsync(src1_ddq_i, id, src1_ddq_i_source, ctx.device,
src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs, stream));
if (quantize_src1 == quantize_mmq_q8_1_cuda) {
const size_t pitch = ne11*sizeof(block_q8_1_mmq);
const size_t width = src1_ncols*sizeof(block_q8_1_mmq);
const size_t height = src1_padded_col_size/(4*QK8_1);
CUDA_CHECK(ggml_cuda_Memcpy2DPeerAsync(src1_ddq_i, id, pitch, src1_ddq_i_source, ctx.device, pitch, width, height, stream));
} else {
CUDA_CHECK(cudaMemcpyPeerAsync(
src1_ddq_i, id, src1_ddq_i_source, ctx.device, src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs, stream));
}
} else {
float * src1_ddf_i_source = (float *) src1->data;
src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
@@ -1588,11 +1626,11 @@ static void ggml_cuda_op_mul_mat(
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}

if (convert_src1_to_q8_1 && !src1_is_contiguous) {
quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
if (quantize_src1 && !src1_is_contiguous) {
quantize_src1(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, 1, src1_padded_col_size, src0->type, stream);
CUDA_CHECK(cudaGetLastError());
}

@@ -1617,22 +1655,8 @@ static void ggml_cuda_op_mul_mat(
float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
dhf_dst_i += src1_col_0*ne0 + dev[id].row_low;
#if !defined(GGML_USE_HIPBLAS)
// cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
cudaMemcpy3DPeerParms p = {};
p.dstDevice = ctx.device;
p.dstPtr = make_cudaPitchedPtr(dhf_dst_i, ne0*sizeof(float), row_diff, src1_ncols);
p.srcDevice = id;
p.srcPtr = make_cudaPitchedPtr(dst_dd_i, row_diff*sizeof(float), row_diff, src1_ncols);
p.extent = make_cudaExtent(row_diff*sizeof(float), src1_ncols, 1);
CUDA_CHECK(cudaMemcpy3DPeerAsync(&p, stream));
#else
// HIP does not support cudaMemcpy3DPeerAsync or vmm pools
CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float),
dst_dd_i, row_diff*sizeof(float),
row_diff*sizeof(float), src1_ncols,
cudaMemcpyDeviceToDevice, stream));
#endif
CUDA_CHECK(ggml_cuda_Memcpy2DPeerAsync(
dhf_dst_i, ctx.device, ne0*sizeof(float), dst_dd_i, id, row_diff*sizeof(float), row_diff*sizeof(float), src1_ncols, stream));
} else {
float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
@@ -1834,6 +1858,9 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
}
}
#else
#ifdef GGML_USE_MUSA
GGML_ASSERT(false);
#else // !GGML_USE_MUSA
if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
// there is no broadcast and src0, src1 are contiguous across dims 2, 3
// use cublasGemmStridedBatchedEx
@@ -1876,6 +1903,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
cu_compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
}
#endif // GGML_USE_MUSA
#endif

if (dst->op_params[0] == GGML_PREC_DEFAULT) {
@@ -1887,9 +1915,23 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer);

int64_t min_compute_capability = INT_MAX;
bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
&& src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src0->ne[0] >= GGML_CUDA_DMMV_X*2
&& src1->ne[1] == 1;
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
&& src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
bool use_mul_mat_q = ggml_is_quantized(src0->type)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;

// if mmvq is available it's a better choice than dmmv:
#ifndef GGML_CUDA_FORCE_DMMV
use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
#endif // GGML_CUDA_FORCE_DMMV

bool any_gpus_with_slow_fp16 = false;

bool any_pascal_with_slow_fp16 = false;
if (split) {
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
auto & tensor_split = buft_ctx->tensor_split;
@@ -1899,60 +1941,16 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
continue;
}

if (min_compute_capability > ggml_cuda_info().devices[id].cc) {
min_compute_capability = ggml_cuda_info().devices[id].cc;
}
if (ggml_cuda_info().devices[id].cc == 610) {
any_pascal_with_slow_fp16 = true;
}
const int cc = ggml_cuda_info().devices[id].cc;
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
}
} else {
min_compute_capability = ggml_cuda_info().devices[ctx.device].cc;
any_pascal_with_slow_fp16 = ggml_cuda_info().devices[ctx.device].cc == 610;
const int cc = ggml_cuda_info().devices[ctx.device].cc;
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
}

// check data types and tensor shapes for custom matrix multiplication kernels:
bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
&& src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->ne[1] == 1;

bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
&& src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;

bool use_mul_mat_q = ggml_cuda_supports_mmq(src0->type)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;

#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)

const bool fp16_performance_good = min_compute_capability >= CC_RDNA1;

#ifdef CUDA_USE_TENSOR_CORES
use_mul_mat_q = use_mul_mat_q && min_compute_capability < CC_RDNA3;
#endif // CUDA_USE_TENSOR_CORES

#else

// fp16 performance is good on Volta or newer and on P100 (compute capability 6.0)
const bool fp16_performance_good = min_compute_capability >= CC_PASCAL && !any_pascal_with_slow_fp16;

// mmvq and mmq need the __dp4a instruction which on NVIDIA is only available for CC >= 6.1
use_mul_mat_vec_q = use_mul_mat_vec_q && min_compute_capability >= MIN_CC_DP4A;
use_mul_mat_q = use_mul_mat_q && min_compute_capability >= MIN_CC_DP4A;

#ifdef CUDA_USE_TENSOR_CORES
// when tensor cores are available, use them for large batch size
// ref: https://github.com/ggerganov/llama.cpp/pull/3776
use_mul_mat_q = use_mul_mat_q && (!fp16_performance_good || src1->ne[1] <= MMQ_MAX_BATCH_SIZE);
#endif // CUDA_USE_TENSOR_CORES

#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)

// if mmvq is available it's a better choice than dmmv:
#ifndef GGML_CUDA_FORCE_DMMV
use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
#endif // GGML_CUDA_FORCE_DMMV

// debug helpers
//printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
//printf(" %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
@@ -1961,23 +1959,24 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
//printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
//printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);

if (!split && !fp16_performance_good && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
// KQ single-batch
if (!split && any_gpus_with_slow_fp16 && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
// FP32 precision KQ single-batch for batch size 1 without FlashAttention
ggml_cuda_mul_mat_vec_p021(ctx, src0, src1, dst);
} else if (!split && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
// KQV single-batch
} else if (!split && any_gpus_with_slow_fp16 && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
// FP32 precision KQV single-batch for batch size 1 without FlashAttention
ggml_cuda_mul_mat_vec_nc(ctx, src0, src1, dst);
} else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || fp16_performance_good) && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
// KQ + KQV multi-batch
} else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16)
&& !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
// KQ + KQV multi-batch without FlashAttention
ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
} else if (use_dequantize_mul_mat_vec) {
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, nullptr);
} else if (use_mul_mat_vec_q) {
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, quantize_row_q8_1_cuda);
} else if (use_mul_mat_q) {
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q, quantize_mmq_q8_1_cuda);
} else {
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, nullptr);
}
}

@@ -2281,6 +2280,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_SQR:
ggml_cuda_op_sqr(ctx, dst);
break;
case GGML_OP_SQRT:
ggml_cuda_op_sqrt(ctx, dst);
break;
case GGML_OP_CLAMP:
ggml_cuda_op_clamp(ctx, dst);
break;
@@ -2302,6 +2304,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_IM2COL:
ggml_cuda_op_im2col(ctx, dst);
break;
case GGML_OP_CONV_TRANSPOSE_1D:
ggml_cuda_op_conv_transpose_1d(ctx,dst);
break;
case GGML_OP_POOL_2D:
ggml_cuda_op_pool2d(ctx, dst);
break;
@@ -2744,7 +2749,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
case GGML_UNARY_OP_HARDSWISH:
case GGML_UNARY_OP_GELU_QUICK:
case GGML_UNARY_OP_TANH:
return true;
return ggml_is_contiguous(op->src[0]);
default:
return false;
}
@@ -2752,27 +2757,40 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
case GGML_OP_MUL_MAT:
case GGML_OP_MUL_MAT_ID:
{
struct ggml_tensor * a;
struct ggml_tensor * b;
struct ggml_tensor * a = op->src[0];
if (op->op == GGML_OP_MUL_MAT) {
a = op->src[0];
b = op->src[1];
} else {
a = op->src[2];
b = op->src[1];
}
if (a->ne[3] != b->ne[3]) {
return false;
}
ggml_type a_type = a->type;
if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS ||
a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ3_S ||
a_type == GGML_TYPE_IQ1_M || a_type == GGML_TYPE_IQ2_S || a_type == GGML_TYPE_IQ4_XS) {
if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
struct ggml_tensor * b = op->src[1];
if (a->ne[3] != b->ne[3]) {
return false;
}
}
return true;
switch (a->type) {
case GGML_TYPE_F32:
case GGML_TYPE_F16:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K:
case GGML_TYPE_Q8_K:
case GGML_TYPE_IQ1_M:
case GGML_TYPE_IQ1_S:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ3_XXS:
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_IQ4_XS:
return true;
default:
return false;
}
} break;
case GGML_OP_GET_ROWS:
{
@@ -2832,6 +2850,15 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
ggml_type src0_type = op->src[0]->type;
return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
} break;
case GGML_OP_CONV_TRANSPOSE_1D:
{
ggml_type src0_type = op->src[0]->type;
ggml_type src1_type = op->src[1]->type;
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
return true;
}
return false;
} break;
case GGML_OP_NONE:
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
@@ -2844,6 +2871,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
case GGML_OP_RMS_NORM:
case GGML_OP_SCALE:
case GGML_OP_SQR:
case GGML_OP_SQRT:
case GGML_OP_CLAMP:
case GGML_OP_CONT:
case GGML_OP_DIAG_MASK_INF:
@@ -2883,6 +2911,20 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
GGML_UNUSED(backend);
}

GGML_CALL static bool ggml_backend_cuda_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
if (ggml_backend_buft_is_cuda_split(buft)) {
return true;
}

if (ggml_backend_buft_is_cuda(buft)) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
return buft_ctx->device == cuda_ctx->device;
}

return false;
}

GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
const int min_batch_size = 32;

@@ -2937,7 +2979,7 @@ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_ev

CUDA_CHECK(cudaLaunchHostFunc(cuda_ctx->stream(), wait_fn, event));
#endif
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
}

@@ -2955,9 +2997,11 @@ static ggml_backend_i ggml_backend_cuda_interface = {
/* .synchronize = */ ggml_backend_cuda_synchronize,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,
/* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_cuda_graph_compute,
/* .supports_op = */ ggml_backend_cuda_supports_op,
/* .supports_buft = */ ggml_backend_cuda_supports_buft,
/* .offload_op = */ ggml_backend_cuda_offload_op,
/* .event_new = */ ggml_backend_cuda_event_new,
/* .event_free = */ ggml_backend_cuda_event_free,
@@ -3017,7 +3061,7 @@ GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size
return false;
}

#if CUDART_VERSION >= 11100
#if CUDART_VERSION >= 11100 || defined(GGML_USE_MUSA)
cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly);
if (err != cudaSuccess) {
// clear the error

@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
@ -32,6 +32,9 @@
|
||||
#ifdef GGML_USE_HIPBLAS
|
||||
#define GGML_CUDA_NAME "ROCm"
|
||||
#define GGML_CUBLAS_NAME "hipBLAS"
|
||||
#elif defined(GGML_USE_MUSA)
|
||||
#define GGML_CUDA_NAME "MUSA"
|
||||
#define GGML_CUBLAS_NAME "muBLAS"
|
||||
#else
|
||||
#define GGML_CUDA_NAME "CUDA"
|
||||
#define GGML_CUBLAS_NAME "cuBLAS"
|
||||
|
@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
|
@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
|
@ -1,83 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
* Copyright (c) 2023-2024 The ggml authors
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in all
|
||||
* copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
* SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
* Copyright (c) 2023-2024 The ggml authors
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in all
|
||||
* copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
* SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
* Copyright (c) 2023-2024 The ggml authors
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in all
|
||||
* copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
* SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
|
@ -1,57 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
* Copyright (c) 2023-2024 The ggml authors
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in all
|
||||
* copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
* SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
* Copyright (c) 2023-2024 The ggml authors
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in all
|
||||
* copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
* SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
|
@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
|
@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
|
@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
@@ -99,6 +99,7 @@ static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, co
const dim3 block_nums(1, nrows, 1);
const size_t shared_mem = ncols_pad * sizeof(int);

// FIXME: this limit could be raised by ~2-4x on Ampere or newer
GGML_ASSERT(shared_mem <= ggml_cuda_info().devices[ggml_cuda_get_device()].smpb);

if (order == GGML_SORT_ORDER_ASC) {
@@ -106,7 +107,7 @@ static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, co
} else if (order == GGML_SORT_ORDER_DESC) {
k_argsort_f32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
}

@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
|
@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
@@ -285,7 +285,7 @@ static void ggml_cuda_op_bin_bcast(
} else {
fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
}

@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
|
@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
|
@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
|
@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
@ -29,6 +29,7 @@
|
||||
#include "ggml.h"
|
||||
#include "ggml-cuda.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
|
||||
#if defined(GGML_USE_HIPBLAS)
|
||||
@ -37,6 +38,10 @@
|
||||
#else
|
||||
#define GGML_COMMON_DECL_CUDA
|
||||
#define GGML_COMMON_IMPL_CUDA
|
||||
#if defined(GGML_USE_MUSA)
|
||||
#define GGML_COMMON_DECL_MUSA
|
||||
#define GGML_COMMON_IMPL_MUSA
|
||||
#endif
|
||||
#endif
|
||||
#include "ggml-common.h"
|
||||
|
||||
@ -129,7 +134,7 @@
|
||||
#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
|
||||
#define cudaStream_t hipStream_t
|
||||
#define cudaSuccess hipSuccess
|
||||
#define __trap abort
|
||||
#define __trap() do { abort(); __builtin_unreachable(); } while(0)
|
||||
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
|
||||
#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
|
||||
#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED
|
||||
@ -139,6 +144,150 @@
|
||||
#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
|
||||
#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
|
||||
#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
|
||||
#elif defined(GGML_USE_MUSA)
|
||||
#include <musa_runtime.h>
|
||||
#include <musa.h>
|
||||
#include <mublas.h>
|
||||
#include <musa_fp16.h>
|
||||
// XXX: Keep the following order the same as hipBLAS
|
||||
// #define CUBLAS_COMPUTE_16F MUBLAS_COMPUTE_16F
|
||||
// #define CUBLAS_COMPUTE_32F MUBLAS_COMPUTE_32F
|
||||
#define CUBLAS_COMPUTE_32F_FAST_16F MUBLAS_COMPUTE_32F_FAST_16F
|
||||
#define CUBLAS_GEMM_DEFAULT MUBLAS_GEMM_DEFAULT
|
||||
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP MUBLAS_GEMM_DEFAULT
|
||||
#define CUBLAS_OP_N MUBLAS_OP_N
|
||||
#define CUBLAS_OP_T MUBLAS_OP_T
|
||||
#define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS
|
||||
// #define CUBLAS_TF32_TENSOR_OP_MATH 0
|
||||
#define CUDA_R_16F MUSA_R_16F
|
||||
#define CUDA_R_32F MUSA_R_32F
|
||||
// #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
|
||||
// #define cublasComputeType_t mublasComputeType_t
|
||||
#define cublasCreate mublasCreate
|
||||
#define cublasDestroy mublasDestroy
|
||||
#define cublasGemmEx mublasGemmEx
|
||||
#define cublasGemmBatchedEx mublasGemmBatchedEx
|
||||
#define cublasGemmStridedBatchedEx mublasGemmStridedBatchedEx
|
||||
#define cublasHandle_t mublasHandle_t
|
||||
// #define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
|
||||
#define cublasSetMathMode mublasSetMathMode
|
||||
#define cublasSetStream mublasSetStream
|
||||
#define cublasSgemm mublasSgemm
|
||||
#define cublasStatus_t mublasStatus_t
|
||||
#define cudaDataType_t musaDataType_t //deprecated, new hipblasDatatype not in 5.6
|
||||
#define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer
|
||||
#define cudaDeviceDisablePeerAccess musaDeviceDisablePeerAccess
|
||||
#define cudaDeviceEnablePeerAccess musaDeviceEnablePeerAccess
|
||||
#define cudaDeviceProp musaDeviceProp
|
||||
#define cudaDeviceSynchronize musaDeviceSynchronize
|
||||
#define cudaError_t musaError_t
|
||||
#define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled
|
||||
#define cudaErrorPeerAccessNotEnabled musaErrorPeerAccessNotEnabled
|
||||
#define cudaEventCreateWithFlags musaEventCreateWithFlags
|
||||
#define cudaEventDisableTiming musaEventDisableTiming
|
||||
#define cudaEventRecord musaEventRecord
|
||||
#define cudaEventSynchronize musaEventSynchronize
|
||||
#define cudaEvent_t musaEvent_t
|
||||
#define cudaEventDestroy musaEventDestroy
|
||||
#define cudaFree musaFree
|
||||
#define cudaFreeHost musaFreeHost
|
||||
#define cudaGetDevice musaGetDevice
|
||||
#define cudaGetDeviceCount musaGetDeviceCount
|
||||
#define cudaGetDeviceProperties musaGetDeviceProperties
|
||||
#define cudaGetErrorString musaGetErrorString
|
||||
#define cudaGetLastError musaGetLastError
|
||||
#define cudaHostRegister musaHostRegister
|
||||
#define cudaHostRegisterPortable musaHostRegisterPortable
|
||||
#define cudaHostRegisterReadOnly musaHostRegisterReadOnly
|
||||
#define cudaHostUnregister musaHostUnregister
|
||||
#define cudaLaunchHostFunc musaLaunchHostFunc
|
||||
#define cudaMalloc musaMalloc
|
||||
#define cudaMallocHost musaMallocHost
|
||||
#define cudaMemcpy musaMemcpy
|
||||
#define cudaMemcpyAsync musaMemcpyAsync
|
||||
#define cudaMemcpyPeerAsync musaMemcpyPeerAsync
|
||||
#define cudaMemcpy2DAsync musaMemcpy2DAsync
|
||||
#define cudaMemcpyDeviceToDevice musaMemcpyDeviceToDevice
|
||||
#define cudaMemcpyDeviceToHost musaMemcpyDeviceToHost
|
||||
#define cudaMemcpyHostToDevice musaMemcpyHostToDevice
|
||||
#define cudaMemcpyKind musaMemcpyKind
|
||||
#define cudaMemset musaMemset
|
||||
#define cudaMemsetAsync musaMemsetAsync
|
||||
#define cudaMemGetInfo musaMemGetInfo
|
||||
#define cudaOccupancyMaxPotentialBlockSize musaOccupancyMaxPotentialBlockSize
|
||||
#define cudaSetDevice musaSetDevice
|
||||
#define cudaStreamCreateWithFlags musaStreamCreateWithFlags
|
||||
#define cudaStreamDestroy musaStreamDestroy
|
||||
#define cudaStreamFireAndForget musaStreamFireAndForget
|
||||
#define cudaStreamNonBlocking musaStreamNonBlocking
|
||||
#define cudaStreamPerThread musaStreamPerThread
|
||||
#define cudaStreamSynchronize musaStreamSynchronize
|
||||
#define cudaStreamWaitEvent musaStreamWaitEvent
|
||||
#define cudaStream_t musaStream_t
|
||||
#define cudaSuccess musaSuccess
|
||||
|
||||
// XXX: Other CUDA => MUSA mapping
|
||||
#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE MU_MEM_ACCESS_FLAGS_PROT_READWRITE
|
||||
#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED MU_MEM_ALLOC_GRANULARITY_RECOMMENDED
|
||||
#define CU_MEM_ALLOCATION_TYPE_PINNED MU_MEM_ALLOCATION_TYPE_PINNED
|
||||
#define CU_MEM_LOCATION_TYPE_DEVICE MU_MEM_LOCATION_TYPE_DEVICE
|
||||
#define CUdevice MUdevice
|
||||
#define CUdeviceptr MUdeviceptr
|
||||
#define CUmemAccessDesc MUmemAccessDesc
|
||||
#define CUmemAllocationProp MUmemAllocationProp
|
||||
#define CUmemGenericAllocationHandle MUmemGenericAllocationHandle
|
||||
#define cuDeviceGet muDeviceGet
|
||||
#define cuDeviceGetAttribute muDeviceGetAttribute
|
||||
#define cuMemAddressFree muMemAddressFree
|
||||
#define cuMemAddressReserve muMemAddressReserve
|
||||
#define cuMemCreate muMemCreate
|
||||
#define cuMemGetAllocationGranularity muMemGetAllocationGranularity
|
||||
#define cuMemMap muMemMap
|
||||
#define cuMemRelease muMemRelease
|
||||
#define cuMemSetAccess muMemSetAccess
|
||||
#define cuMemUnmap muMemUnmap
|
||||
#define cudaFuncAttributeMaxDynamicSharedMemorySize musaFuncAttributeMaxDynamicSharedMemorySize
|
||||
#define cudaFuncSetAttribute musaFuncSetAttribute
|
||||
#define cudaMemcpy3DPeerParms musaMemcpy3DPeerParms
|
||||
#define make_cudaExtent make_musaExtent
|
||||
#define make_cudaPitchedPtr make_musaPitchedPtr
|
||||
|
||||
// XXX: USE_CUDA_GRAPH
|
||||
#define CUDA_SUCCESS MUSA_SUCCESS
|
||||
#define CUresult MUresult
|
||||
#define cuGetErrorString muGetErrorString
|
||||
#define cudaErrorGraphExecUpdateFailure musaErrorGraphExecUpdateFailure
|
||||
#define cudaErrorInvalidDeviceFunction musaErrorInvalidDeviceFunction
|
||||
#define cudaGraphDestroy musaGraphDestroy
|
||||
#define cudaGraphExecDestroy musaGraphExecDestroy
|
||||
#define cudaGraphExec_t musaGraphExec_t
|
||||
#define cudaGraphExecUpdate musaGraphExecUpdate
|
||||
#define cudaGraphExecUpdateResultInfo musaGraphExecUpdateResult
|
||||
#define cudaGraphGetNodes musaGraphGetNodes
|
||||
#define cudaGraphInstantiate musaGraphInstantiate
|
||||
#define cudaGraphKernelNodeGetParams musaGraphKernelNodeGetParams
|
||||
#define cudaGraphKernelNodeSetParams musaGraphKernelNodeSetParams
|
||||
#define cudaGraphLaunch musaGraphLaunch
|
||||
#define cudaGraphNodeGetType musaGraphNodeGetType
|
||||
#define cudaGraphNode_t musaGraphNode_t
|
||||
#define cudaGraphNodeType musaGraphNodeType
|
||||
#define cudaGraphNodeTypeKernel musaGraphNodeTypeKernel
|
||||
#define cudaGraph_t musaGraph_t
|
||||
#define cudaKernelNodeParams musaKernelNodeParams
|
||||
#define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed
|
||||
#define cudaStreamEndCapture musaStreamEndCapture
|
||||
|
||||
// XXX: cuBLAS => muBLAS mapping
|
||||
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED MU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
|
||||
#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_MATH_MODE_DEFAULT
|
||||
#define CUBLAS_COMPUTE_16F CUDA_R_16F
|
||||
#define CUBLAS_COMPUTE_32F CUDA_R_32F
|
||||
#define cublasComputeType_t cudaDataType_t
|
||||
|
||||
// XXX: Clang builtins mapping
|
||||
#define __vsub4 __vsub4_musa
|
||||
#define __vcmpeq4 __vcmpeq4_musa
|
||||
#define __vcmpne4 __vcmpne4_musa
|
||||
#else
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda.h>
|
||||
@ -165,29 +314,13 @@
|
||||
#define CC_PASCAL 600
|
||||
#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
|
||||
#define CC_VOLTA 700
|
||||
#define CC_TURING 750
|
||||
#define CC_AMPERE 800
|
||||
#define CC_OFFSET_AMD 1000000
|
||||
#define CC_RDNA1 (CC_OFFSET_AMD + 1010)
|
||||
#define CC_RDNA2 (CC_OFFSET_AMD + 1030)
|
||||
#define CC_RDNA3 (CC_OFFSET_AMD + 1100)
|
||||
|
||||
// define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
|
||||
// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
|
||||
// for large computational tasks. the drawback is that this requires some extra amount of VRAM:
|
||||
// - 7B quantum model: +100-200 MB
|
||||
// - 13B quantum model: +200-400 MB
|
||||
//
|
||||
//#define GGML_CUDA_FORCE_MMQ
|
||||
|
||||
// TODO: improve this to be correct for more hardware
|
||||
// for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
|
||||
#if !defined(GGML_CUDA_FORCE_MMQ)
|
||||
#define CUDA_USE_TENSOR_CORES
|
||||
#endif
|
||||
|
||||
#define MMVQ_MAX_BATCH_SIZE 8 // max batch size to use MMVQ kernels
|
||||
#define MMQ_MAX_BATCH_SIZE 64 // max batch size to use MMQ kernels when tensor cores are available
|
||||
|
||||
#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
@ -209,9 +342,13 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in
|
||||
|
||||
#define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString)
|
||||
|
||||
#if CUDART_VERSION >= 12000
|
||||
#if CUDART_VERSION >= 12000 || defined(GGML_USE_MUSA)
|
||||
static const char * cublas_get_error_str(const cublasStatus_t err) {
|
||||
#ifndef GGML_USE_MUSA
|
||||
return cublasGetStatusString(err);
|
||||
#else
|
||||
return mublasStatus_to_string(err);
|
||||
#endif // GGML_USE_MUSA
|
||||
}
|
||||
#else
|
||||
static const char * cublas_get_error_str(const cublasStatus_t err) {
|
||||
@ -241,7 +378,7 @@ static const char * cu_get_error_str(CUresult err) {
|
||||
#define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
|
||||
#endif
|
||||
|
||||
#if CUDART_VERSION >= 11100
|
||||
#if CUDART_VERSION >= 11100 || defined(GGML_USE_MUSA)
|
||||
#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
|
||||
#else
|
||||
#define GGML_CUDA_ASSUME(x)
|
||||
@ -255,6 +392,42 @@ typedef float dfloat; // dequantize float
|
||||
typedef float2 dfloat2;
|
||||
#endif //GGML_CUDA_F16
|
||||
|
||||
#if defined(GGML_USE_MUSA)
|
||||
#ifndef __has_builtin
|
||||
#define __has_builtin(x) 0
|
||||
#endif
|
||||
|
||||
typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
|
||||
|
||||
static __device__ __forceinline__ int __vsub4_musa(const int a, const int b) {
|
||||
return __vsubss4(a, b);
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ unsigned int __vcmpeq4_musa(unsigned int a, unsigned int b) {
|
||||
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
|
||||
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
|
||||
unsigned int c;
|
||||
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
vc[i] = va[i] == vb[i] ? 0xff : 0x00;
|
||||
}
|
||||
return c;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ unsigned int __vcmpne4_musa(unsigned int a, unsigned int b) {
|
||||
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
|
||||
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
|
||||
unsigned int c;
|
||||
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
|
||||
}
|
||||
return c;
|
||||
}
|
||||
#endif // defined(GGML_USE_MUSA)
|
||||
|
||||
#if defined(GGML_USE_HIPBLAS)
|
||||
#define __CUDA_ARCH__ 1300
|
||||
|
||||
@ -268,6 +441,10 @@ typedef float2 dfloat2;
|
||||
#define RDNA2
|
||||
#endif
|
||||
|
||||
#if defined(__gfx1010__) || defined(__gfx1012__)
|
||||
#define RDNA1
|
||||
#endif
|
||||
|
||||
#ifndef __has_builtin
|
||||
#define __has_builtin(x) 0
|
||||
#endif
|
||||
@ -310,30 +487,15 @@ static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigne
|
||||
return c;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
|
||||
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
|
||||
c = __builtin_amdgcn_sdot4(a, b, c, false);
|
||||
#elif defined(RDNA3)
|
||||
c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
|
||||
#elif defined(__gfx1010__) || defined(__gfx900__)
|
||||
int tmp1;
|
||||
int tmp2;
|
||||
asm("\n \
|
||||
v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
|
||||
v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
|
||||
v_add3_u32 %0, %1, %2, %0 \n \
|
||||
v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
|
||||
v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
|
||||
v_add3_u32 %0, %1, %2, %0 \n \
|
||||
"
|
||||
: "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
|
||||
: "v"(a), "v"(b)
|
||||
);
|
||||
#else
|
||||
const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
|
||||
const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
|
||||
c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
|
||||
#endif
|
||||
static __device__ __forceinline__ unsigned int __vcmpne4(unsigned int a, unsigned int b) {
|
||||
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
|
||||
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
|
||||
unsigned int c;
|
||||
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
|
||||
}
|
||||
return c;
|
||||
}
|
||||
|
||||
@ -352,18 +514,34 @@ static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int
|
||||
#endif // defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
|
||||
#endif // defined(GGML_USE_HIPBLAS)
|
||||
|
||||
#define FP16_AVAILABLE (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
|
||||
#if (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
|
||||
#define FP16_AVAILABLE
|
||||
#endif // (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
|
||||
|
||||
#define FP16_MMA_AVAILABLE !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
|
||||
#if defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
|
||||
#define FAST_FP16_AVAILABLE
|
||||
#endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
|
||||
|
||||
static bool fast_fp16_available(const int cc) {
|
||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
|
||||
#define FP16_MMA_AVAILABLE
|
||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
|
||||
|
||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
|
||||
#define INT8_MMA_AVAILABLE
|
||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
|
||||
|
||||
static constexpr bool fast_fp16_available(const int cc) {
|
||||
return cc >= CC_PASCAL && cc != 610;
|
||||
}
|
||||
|
||||
static bool fp16_mma_available(const int cc) {
|
||||
static constexpr bool fp16_mma_available(const int cc) {
|
||||
return cc < CC_OFFSET_AMD && cc >= CC_VOLTA;
|
||||
}
|
||||
|
||||
static constexpr bool int8_mma_available(const int cc) {
|
||||
return cc < CC_OFFSET_AMD && cc >= CC_TURING;
|
||||
}
|
||||
|
||||
[[noreturn]]
|
||||
static __device__ void no_device_code(
|
||||
const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
|
||||
@ -384,7 +562,7 @@ static __device__ void no_device_code(
|
||||
#ifdef __CUDA_ARCH__
|
||||
#define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
|
||||
#else
|
||||
#define NO_DEVICE_CODE //GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.")
|
||||
#define NO_DEVICE_CODE //GGML_ABORT("NO_DEVICE_CODE not valid in host code.")
|
||||
#endif // __CUDA_ARCH__
|
||||
|
||||
static __device__ __forceinline__ float warp_reduce_sum(float x) {
|
||||
@ -405,7 +583,7 @@ static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
|
||||
#if FP16_AVAILABLE
|
||||
#ifdef FP16_AVAILABLE
|
||||
|
||||
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||
#pragma unroll
|
||||
@ -438,7 +616,7 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
|
||||
#if FP16_AVAILABLE
|
||||
#ifdef FP16_AVAILABLE
|
||||
|
||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
|
||||
return __float2half(fmaxf(__half2float(a), __half2float(b)));
|
||||
@ -491,10 +669,50 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half
|
||||
const uint32_t mask_high = 0xFFFF0000 * (float(__high2half(a)) > float(__high2half(b)));
|
||||
return mask_low | mask_high;
|
||||
}
|
||||
#endif // CUDART_VERSION < 12000
|
||||
#endif // CUDART_VERSION < CUDART_HMASK
|
||||
|
||||
static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
|
||||
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(RDNA2)
|
||||
c = __builtin_amdgcn_sdot4(a, b, c, false);
|
||||
#elif defined(RDNA3)
|
||||
c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
|
||||
#elif defined(__gfx1010__) || defined(__gfx900__)
|
||||
int tmp1;
|
||||
int tmp2;
|
||||
asm("\n \
|
||||
v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
|
||||
v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
|
||||
v_add3_u32 %0, %1, %2, %0 \n \
|
||||
v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
|
||||
v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
|
||||
v_add3_u32 %0, %1, %2, %0 \n \
|
||||
"
|
||||
: "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
|
||||
: "v"(a), "v"(b)
|
||||
);
|
||||
#else
|
||||
const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
|
||||
const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
|
||||
c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
|
||||
#endif
|
||||
return c;
|
||||
|
||||
#else // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||
|
||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
return __dp4a(a, b, c);
|
||||
#else // __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
const int8_t * a8 = (const int8_t *) &a;
|
||||
const int8_t * b8 = (const int8_t *) &b;
|
||||
return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
|
||||
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
|
||||
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||
}
|
||||
|
||||
// TODO: move to ggml-common.h
|
||||
static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
|
||||
static constexpr __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
|
||||
|
||||
typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);
|
||||
|
||||
@ -652,19 +870,6 @@ struct ggml_cuda_type_traits<GGML_TYPE_IQ3_S> {
|
||||
static constexpr int qi = QI3_S;
|
||||
};
|
||||
|
||||
static int get_mmq_x_max_host(const int cc) {
|
||||
#ifdef CUDA_USE_TENSOR_CORES
|
||||
return cc >= CC_VOLTA && cc < CC_OFFSET_AMD ? MMQ_MAX_BATCH_SIZE : 64;
|
||||
#else
|
||||
return cc >= CC_VOLTA && cc < CC_OFFSET_AMD ? 128 : 64;
|
||||
#endif // CUDA_USE_TENSOR_CORES
|
||||
}
|
||||
|
||||
// Round rows to this value for --split-mode row:
|
||||
static int get_mmq_y_host(const int cc, const int mmq_x) {
|
||||
return cc >= CC_VOLTA && mmq_x >= 32 ? 128 : 64;
|
||||
}
|
||||
|
||||
//////////////////////
|
||||
|
||||
struct ggml_cuda_device_info {
|
||||
@ -674,6 +879,7 @@ struct ggml_cuda_device_info {
|
||||
int cc; // compute capability
|
||||
int nsm; // number of streaming multiprocessors
|
||||
size_t smpb; // max. shared memory per block
|
||||
size_t smpbo; // max. shared memory per block (with opt-in)
|
||||
bool vmm; // virtual memory support
|
||||
size_t vmm_granularity; // granularity of virtual memory
|
||||
size_t total_vram;
|
||||
|
@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
|
@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
|
113
llama/ggml-cuda/conv-transpose-1d.cu
Normal file
@ -0,0 +1,113 @@
|
||||
/**
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
* Copyright (c) 2023-2024 The ggml authors
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in all
|
||||
* copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
* SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "conv-transpose-1d.cuh"
|
||||
|
||||
static __global__ void conv_transpose_1d_kernel(
|
||||
const int s0, const int p0, const int d0, const int output_size,
|
||||
const int src0_ne0, const int src0_ne1, const int src0_ne2, const int src0_ne3,
|
||||
const int src1_ne0, const int src1_ne1, const int src1_ne2, const int src1_ne3,
|
||||
const int dst_ne0, const int dst_ne1, const int dst_ne2, const int dst_ne3,
|
||||
const float * src0, const float * src1, float * dst) {
|
||||
int global_index = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (global_index >= output_size) {
|
||||
return;
|
||||
}
|
||||
|
||||
int out_index = global_index / dst_ne0;
|
||||
|
||||
float accumulator = 0;
|
||||
|
||||
for (int c = 0; c < src0_ne2; c++) {
|
||||
int idx = global_index % dst_ne0;
|
||||
|
||||
int kernel_offset = (src0_ne0 * src0_ne1 * c) + (out_index * src0_ne0);
|
||||
int input_offset = src1_ne0 * c;
|
||||
|
||||
for (int i = 0; i < src1_ne0; i++) {
|
||||
if (!(idx >= i*s0 && idx < i*s0 + src0_ne0)) {
|
||||
continue;
|
||||
}
|
||||
int weight_idx = idx - i*s0;
|
||||
|
||||
float kernel_weight = src0[kernel_offset + weight_idx];
|
||||
float input_value = src1[input_offset+i];
|
||||
|
||||
accumulator += kernel_weight * input_value;
|
||||
}
|
||||
}
|
||||
dst[global_index] = accumulator;
|
||||
}
|
||||
|
||||
static void conv_transpose_1d_f32_f32_cuda(
|
||||
const int s0, const int p0, const int d0, const int output_size,
|
||||
const int src0_ne0, const int src0_ne1, const int src0_ne2, const int src0_ne3,
|
||||
const int src1_ne0, const int src1_ne1, const int src1_ne2, const int src1_ne3,
|
||||
const int dst_ne0, const int dst_ne1, const int dst_ne2, const int dst_ne3,
|
||||
const float * src0, const float * src1, float * dst,
|
||||
cudaStream_t stream) {
|
||||
|
||||
const int num_blocks = (output_size + CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE - 1) / CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE;
|
||||
conv_transpose_1d_kernel<<<num_blocks,CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE, 0, stream>>>(
|
||||
s0,p0,d0,output_size,
|
||||
src0_ne0, src0_ne1, src0_ne2, src0_ne3,
|
||||
src1_ne0, src1_ne1, src1_ne2, src1_ne3,
|
||||
dst_ne0, dst_ne1, dst_ne2, dst_ne3,
|
||||
src0,src1, dst);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_conv_transpose_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const float * src0_d = (const float *)src0->data;
|
||||
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
const float * src1_d = (const float *)src1->data;
|
||||
|
||||
float * dst_d = (float *)dst->data;
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
GGML_ASSERT(ggml_is_contiguous(src1));
|
||||
|
||||
const int32_t * opts = (const int32_t *)dst->op_params;
|
||||
|
||||
const int s0 = opts[0];
|
||||
const int p0 = 0;//opts[3];
|
||||
const int d0 = 1;//opts[4];
|
||||
|
||||
const int64_t kernel_size = ggml_nelements(src0);
|
||||
const int64_t input_size = ggml_nelements(src1);
|
||||
const int64_t output_size = ggml_nelements(dst);
|
||||
|
||||
conv_transpose_1d_f32_f32_cuda(s0, p0, d0, output_size,
|
||||
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
|
||||
src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
|
||||
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
|
||||
src0_d, src1_d, dst_d, stream);
|
||||
}
|
31
llama/ggml-cuda/conv-transpose-1d.cuh
Normal file
@ -0,0 +1,31 @@
/**
 * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
 *
 * MIT License
 *
 * Copyright (c) 2023-2024 The ggml authors
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "common.cuh"

#define CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE 256

void ggml_cuda_op_conv_transpose_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
|
@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
|
@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
@ -477,7 +477,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
|
||||
} else {
|
||||
fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
|
||||
ggml_type_name(src0->type), ggml_type_name(src1->type));
|
||||
GGML_ASSERT(false);
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
|
||||
@ -510,7 +510,6 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
|
||||
} else {
|
||||
fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
|
||||
ggml_type_name(src0->type), ggml_type_name(src1->type));
|
||||
GGML_ASSERT(false);
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
|
@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
|
@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
|
@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
|
@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
@ -688,7 +688,7 @@ void ggml_cuda_op_dequantize_mul_mat_vec(
|
||||
convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
GGML_ABORT("fatal error");
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
|
@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
@ -80,12 +80,11 @@ typedef float (*vec_dot_KQ_f32_t)(
|
||||
template<typename T, int D>
|
||||
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0(
|
||||
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
|
||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
|
||||
const block_q4_0 * K_q4_0 = (const block_q4_0 *) K_c;
|
||||
GGML_UNUSED(Q_v);
|
||||
|
||||
half sum = 0.0f;
|
||||
T sum = 0.0f;
|
||||
|
||||
#pragma unroll
|
||||
for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += WARP_SIZE) {
|
||||
@ -95,12 +94,12 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0(
|
||||
const int iqs4 = k_KQ % QI4_0;
|
||||
const int shift = k_KQ & (QI8_1/2);
|
||||
|
||||
const int v = (get_int_from_uint8(K_q4_0[ib].qs, iqs4) >> shift) & 0x0F0F0F0F;
|
||||
const int v = (get_int_b2(K_q4_0[ib].qs, iqs4) >> shift) & 0x0F0F0F0F;
|
||||
const int u = Q_q8[k_KQ_0/WARP_SIZE];
|
||||
|
||||
const int sumi = __dp4a(v, u, 0);
|
||||
const int sumi = ggml_cuda_dp4a(v, u, 0);
|
||||
|
||||
#if FP16_AVAILABLE
|
||||
#ifdef FP16_AVAILABLE
|
||||
if (std::is_same<T, half>::value) {
|
||||
const half2 * Q_ds = (const half2 *) Q_ds_v;
|
||||
|
||||
@ -116,19 +115,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0(
|
||||
}
|
||||
|
||||
return sum;
|
||||
#else
|
||||
GGML_UNUSED(K_c);
|
||||
GGML_UNUSED(Q_v);
|
||||
GGML_UNUSED(Q_q8);
|
||||
GGML_UNUSED(Q_ds_v);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
}
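
The __dp4a calls being replaced here with ggml_cuda_dp4a compute a dot product of four packed signed 8-bit lanes plus an accumulator; the wrapper is assumed to keep the same semantics on hardware without a native DP4A instruction. A scalar reference sketch (plain C++, illustration only, not the ggml helper itself):

#include <cstdint>
#include <cstring>

// Reference for the 4x int8 dot product performed by __dp4a / ggml_cuda_dp4a:
// interpret a and b as four signed bytes each, multiply lane-wise, and add to c.
static int dp4a_ref(int a, int b, int c) {
    int8_t va[4], vb[4];
    std::memcpy(va, &a, sizeof(va));
    std::memcpy(vb, &b, sizeof(vb));
    for (int i = 0; i < 4; ++i) {
        c += (int) va[i] * (int) vb[i];
    }
    return c;
}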
|
||||
|
||||
template<typename T, int D>
|
||||
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1(
|
||||
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
|
||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
|
||||
const block_q4_1 * K_q4_1 = (const block_q4_1 *) K_c;
|
||||
GGML_UNUSED(Q_v);
|
||||
@ -143,12 +134,12 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1(
|
||||
const int iqs4 = k_KQ % QI4_1;
|
||||
const int shift = k_KQ & (QI8_1/2);
|
||||
|
||||
const int v = (get_int_from_uint8_aligned(K_q4_1[ib].qs, iqs4) >> shift) & 0x0F0F0F0F;
|
||||
const int v = (get_int_b4(K_q4_1[ib].qs, iqs4) >> shift) & 0x0F0F0F0F;
|
||||
const int u = Q_q8[k_KQ_0/WARP_SIZE];
|
||||
|
||||
const int sumi = __dp4a(v, u, 0);
|
||||
const int sumi = ggml_cuda_dp4a(v, u, 0);
|
||||
|
||||
#if FP16_AVAILABLE
|
||||
#ifdef FP16_AVAILABLE
|
||||
if (std::is_same<T, half>::value) {
|
||||
const half2 * Q_ds = (const half2 *) Q_ds_v;
|
||||
|
||||
@ -168,19 +159,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1(
|
||||
}
|
||||
|
||||
return sum;
|
||||
#else
|
||||
GGML_UNUSED(K_c);
|
||||
GGML_UNUSED(Q_v);
|
||||
GGML_UNUSED(Q_q8);
|
||||
GGML_UNUSED(Q_ds_v);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
}
|
||||
|
||||
template<typename T, int D>
|
||||
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0(
|
||||
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
|
||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
|
||||
const block_q5_0 * K_q5_0 = (const block_q5_0 *) K_c;
|
||||
GGML_UNUSED(Q_v);
|
||||
@ -196,8 +179,8 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0(
|
||||
const int iqs8 = k_KQ % QI8_1;
|
||||
const int shift = k_KQ & (QI8_1/2);
|
||||
|
||||
int v = (get_int_from_uint8(K_q5_0[ib].qs, iqs4) >> shift) & 0x0F0F0F0F;
|
||||
const int vh = get_int_from_uint8(K_q5_0[ib].qh, 0) >> (iqs8 * QI5_0);
|
||||
int v = (get_int_b2(K_q5_0[ib].qs, iqs4) >> shift) & 0x0F0F0F0F;
|
||||
const int vh = get_int_b2(K_q5_0[ib].qh, 0) >> (iqs8 * QI5_0);
|
||||
v |= (vh << 4) & 0x00000010; // 0 -> 4
|
||||
v |= (vh << 11) & 0x00001000; // 1 -> 12
|
||||
v |= (vh << 18) & 0x00100000; // 2 -> 20
|
||||
@ -205,9 +188,9 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0(
|
||||
|
||||
const int u = Q_q8[k_KQ_0/WARP_SIZE];
|
||||
|
||||
const int sumi = __dp4a(v, u, 0);
|
||||
const int sumi = ggml_cuda_dp4a(v, u, 0);
|
||||
|
||||
#if FP16_AVAILABLE
|
||||
#ifdef FP16_AVAILABLE
|
||||
if (std::is_same<T, half>::value) {
|
||||
const half2 * Q_ds = (const half2 *) Q_ds_v;
|
||||
|
||||
@ -223,19 +206,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0(
|
||||
}
|
||||
|
||||
return sum;
|
||||
#else
|
||||
GGML_UNUSED(K_c);
|
||||
GGML_UNUSED(Q_v);
|
||||
GGML_UNUSED(Q_q8);
|
||||
GGML_UNUSED(Q_ds_v);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
}
|
||||
|
||||
template<typename T, int D>
|
||||
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1(
|
||||
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
|
||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
|
||||
const block_q5_1 * K_q5_1 = (const block_q5_1 *) K_c;
|
||||
GGML_UNUSED(Q_v);
|
||||
@ -251,8 +226,8 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1(
|
||||
const int iqs8 = k_KQ % QI8_1;
|
||||
const int shift = k_KQ & (QI8_1/2);
|
||||
|
||||
int v = (get_int_from_uint8(K_q5_1[ib].qs, iqs4) >> shift) & 0x0F0F0F0F;
|
||||
const int vh = get_int_from_uint8(K_q5_1[ib].qh, 0) >> (iqs8 * QI5_1);
|
||||
int v = (get_int_b2(K_q5_1[ib].qs, iqs4) >> shift) & 0x0F0F0F0F;
|
||||
const int vh = get_int_b2(K_q5_1[ib].qh, 0) >> (iqs8 * QI5_1);
|
||||
v |= (vh << 4) & 0x00000010; // 0 -> 4
|
||||
v |= (vh << 11) & 0x00001000; // 1 -> 12
|
||||
v |= (vh << 18) & 0x00100000; // 2 -> 20
|
||||
@ -260,9 +235,9 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1(
|
||||
|
||||
const int u = Q_q8[k_KQ_0/WARP_SIZE];
|
||||
|
||||
const int sumi = __dp4a(v, u, 0);
|
||||
const int sumi = ggml_cuda_dp4a(v, u, 0);
|
||||
|
||||
#if FP16_AVAILABLE
|
||||
#ifdef FP16_AVAILABLE
|
||||
if (std::is_same<T, half>::value) {
|
||||
const half2 * Q_ds = (const half2 *) Q_ds_v;
|
||||
|
||||
@ -282,19 +257,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1(
|
||||
}
|
||||
|
||||
return sum;
|
||||
#else
|
||||
GGML_UNUSED(K_c);
|
||||
GGML_UNUSED(Q_v);
|
||||
GGML_UNUSED(Q_q8);
|
||||
GGML_UNUSED(Q_ds_v);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
}
|
||||
|
||||
template <typename T, int D>
|
||||
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q8_0(
|
||||
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
|
||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
|
||||
const block_q8_0 * K_q8_0 = (const block_q8_0 *) K_c;
|
||||
GGML_UNUSED(Q_v);
|
||||
@ -308,7 +275,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q8_0(
|
||||
const int ib = k_KQ / QI8_0;
|
||||
const int iqs = k_KQ % QI8_0;
|
||||
|
||||
const int v = get_int_from_int8(K_q8_0[ib].qs, iqs);
|
||||
const int v = get_int_b2(K_q8_0[ib].qs, iqs);
|
||||
|
||||
T Q_d;
|
||||
if (std::is_same<T, half>::value) {
|
||||
@ -323,13 +290,6 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q8_0(
|
||||
}
|
||||
|
||||
return sum;
|
||||
#else
|
||||
GGML_UNUSED(K_c);
|
||||
GGML_UNUSED(Q_v);
|
||||
GGML_UNUSED(Q_q8);
|
||||
GGML_UNUSED(Q_ds_v);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
}
|
||||
|
||||
template <typename T, int D>
|
||||
@ -340,7 +300,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_f16(
|
||||
GGML_UNUSED(Q_q8);
|
||||
GGML_UNUSED(Q_ds_v);
|
||||
|
||||
#if FP16_AVAILABLE
|
||||
#ifdef FP16_AVAILABLE
|
||||
if (std::is_same<T, half>::value) {
|
||||
const half2 * Q_h2 = (const half2 *) Q_v;
|
||||
|
||||
@ -433,7 +393,7 @@ static __device__ __forceinline__ T dequantize_1_q4_0(const void * __restrict__
|
||||
const int q0 = x[ib].qs[iqs];
|
||||
const int q = ((q0 >> (4*shift)) & 0x0F) - 8;
|
||||
|
||||
#if FP16_AVAILABLE
|
||||
#ifdef FP16_AVAILABLE
|
||||
if (std::is_same<T, half>::value) {
|
||||
return ((half) d)*((half) q);
|
||||
}
|
||||
@ -454,7 +414,7 @@ static __device__ __forceinline__ T dequantize_1_q4_1(const void * __restrict__
|
||||
const int q0 = x[ib].qs[iqs];
|
||||
const int q = ((q0 >> (4*shift)) & 0x0F);
|
||||
|
||||
#if FP16_AVAILABLE
|
||||
#ifdef FP16_AVAILABLE
|
||||
if (std::is_same<T, half>::value) {
|
||||
return __low2half(dm)*((half) q) + __high2half(dm);
|
||||
}
|
||||
@ -474,12 +434,12 @@ static __device__ __forceinline__ T dequantize_1_q5_0(const void * __restrict__
|
||||
|
||||
const T d = x[ib].d;
|
||||
const int ql0 = x[ib].qs[iqs];
|
||||
const int qh0 = get_int_from_uint8(x[ib].qh, 0);
|
||||
const int qh0 = get_int_b2(x[ib].qh, 0);
|
||||
const int ql = ((ql0 >> (4*shift)) & 0x0F);
|
||||
const int qh = ((qh0 >> idq) << 4) & 0x10;
|
||||
const int q = (ql | qh) - 16;
|
||||
|
||||
#if FP16_AVAILABLE
|
||||
#ifdef FP16_AVAILABLE
|
||||
if (std::is_same<T, half>::value) {
|
||||
return ((half) d)*((half) q);
|
||||
}
|
||||
@ -499,12 +459,12 @@ static __device__ __forceinline__ T dequantize_1_q5_1(const void * __restrict__
|
||||
|
||||
const half2 dm = x[ib].dm;
|
||||
const int ql0 = x[ib].qs[iqs];
|
||||
const int qh0 = get_int_from_uint8_aligned(x[ib].qh, 0);
|
||||
const int qh0 = get_int_b4(x[ib].qh, 0);
|
||||
const int ql = ((ql0 >> (4*shift)) & 0x0F);
|
||||
const int qh = ((qh0 >> idq) << 4) & 0x10;
|
||||
const int q = (ql | qh);
|
||||
|
||||
#if FP16_AVAILABLE
|
||||
#ifdef FP16_AVAILABLE
|
||||
if (std::is_same<T, half>::value) {
|
||||
return __low2half(dm)*((half) q) + __high2half(dm);
|
||||
}
|
||||
@ -523,7 +483,7 @@ static __device__ __forceinline__ T dequantize_1_q8_0(const void * __restrict__
|
||||
const T d = x[ib].d;
|
||||
const int q = x[ib].qs[iqs];
|
||||
|
||||
#if FP16_AVAILABLE
|
||||
#ifdef FP16_AVAILABLE
|
||||
if (std::is_same<T, half>::value) {
|
||||
return ((half) d)*((half) q);
|
||||
}
|
||||
@ -629,20 +589,20 @@ static void on_no_fattn_vec_case(const int D) {
|
||||
if (D == 64) {
|
||||
fprintf(stderr, "Unsupported KV type combination for head_size 64.\n");
|
||||
fprintf(stderr, "By default only f16 KV cache is supported.\n");
|
||||
fprintf(stderr, "Compile with LLAMA_CUDA_FA_ALL_QUANTS for V cache quantization support.\n");
|
||||
GGML_ASSERT(false);
|
||||
fprintf(stderr, "Compile with GGML_CUDA_FA_ALL_QUANTS for V cache quantization support.\n");
|
||||
GGML_ABORT("fatal error");
|
||||
} else if (D == 128) {
|
||||
fprintf(stderr, "Unsupported KV type combination for head_size 128.\n");
|
||||
fprintf(stderr, "Supported combinations:\n");
|
||||
fprintf(stderr, " - K == q4_0, V == q4_0, 4.50 BPV\n");
|
||||
fprintf(stderr, " - K == q8_0, V == q8_0, 8.50 BPV\n");
|
||||
fprintf(stderr, " - K == f16, V == f16, 16.00 BPV\n");
|
||||
fprintf(stderr, "Compile with LLAMA_CUDA_FA_ALL_QUANTS for all combinations of q4_0, q4_1, q5_0, q5_1, q8_0, and f16.\n");
|
||||
GGML_ASSERT(false);
|
||||
fprintf(stderr, "Compile with GGML_CUDA_FA_ALL_QUANTS for all combinations of q4_0, q4_1, q5_0, q5_1, q8_0, and f16.\n");
|
||||
GGML_ABORT("fatal error");
|
||||
} else {
|
||||
fprintf(stderr, "Unsupported KV type combination for head_size 256.\n");
|
||||
fprintf(stderr, "Only f16 is supported.\n");
|
||||
GGML_ASSERT(false);
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
@ -69,7 +69,7 @@ static __global__ void flash_attn_tile_ext_f16(
|
||||
const int ne1,
|
||||
const int ne2,
|
||||
const int ne3) {
|
||||
#if FP16_AVAILABLE
|
||||
#ifdef FP16_AVAILABLE
|
||||
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
|
||||
|
||||
const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
|
||||
@ -313,7 +313,7 @@ void launch_fattn_tile_f16_64_128(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
|
||||
} break;
|
||||
default: {
|
||||
GGML_ASSERT(false && "FlashAttention without tensor cores only supports head sizes 64 and 128.");
|
||||
GGML_ABORT("FlashAttention without tensor cores only supports head sizes 64 and 128.");
|
||||
} break;
|
||||
}
|
||||
}
|
||||
|
@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
|
@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
@ -310,7 +310,7 @@ void launch_fattn_tile_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
|
||||
} break;
|
||||
default: {
|
||||
GGML_ASSERT(false && "FlashAttention without tensor cores only supports head sizes 64 and 128.");
|
||||
GGML_ABORT("FlashAttention without tensor cores only supports head sizes 64 and 128.");
|
||||
} break;
|
||||
}
|
||||
}
|
||||
|
@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
|
@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
@ -66,7 +66,7 @@ static __global__ void flash_attn_vec_ext_f16(
|
||||
const int ne1,
|
||||
const int ne2,
|
||||
const int ne3) {
|
||||
#if FP16_AVAILABLE
|
||||
#ifdef FP16_AVAILABLE
|
||||
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
|
||||
|
||||
constexpr vec_dot_KQ_f16_t vec_dot_KQ = get_vec_dot_KQ_f16<D>(type_K);
|
||||
|
@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
@ -175,7 +175,7 @@ static __global__ void flash_attn_vec_ext_f32(
|
||||
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
Q_f2[j][i0/WARP_SIZE] = ncols <= 2 || ic0 + j ? Q_f2_j[i] : make_float2(0.0f, 0.0f);
|
||||
Q_f2[j][i0/WARP_SIZE] = ncols <= 2 || ic0 + j < ne01 ? Q_f2_j[i] : make_float2(0.0f, 0.0f);
|
||||
Q_f2[j][i0/WARP_SIZE].x *= scale;
|
||||
Q_f2[j][i0/WARP_SIZE].y *= scale;
|
||||
}
|
||||
|
@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
@ -27,9 +27,9 @@
|
||||
#include "common.cuh"
|
||||
#include "fattn-common.cuh"
|
||||
|
||||
#if FP16_MMA_AVAILABLE
|
||||
#ifdef FP16_MMA_AVAILABLE
|
||||
#include <mma.h>
|
||||
#endif
|
||||
#endif // FP16_MMA_AVAILABLE
|
||||
|
||||
// D == head size, VKQ_stride == num VKQ rows calculated in parallel:
|
||||
template<int D, int ncols, int nwarps, int VKQ_stride, int parallel_blocks, typename KQ_acc_t>
|
||||
@ -71,7 +71,7 @@ static __global__ void flash_attn_ext_f16(
|
||||
const int ne1,
|
||||
const int ne2,
|
||||
const int ne3) {
|
||||
#if FP16_MMA_AVAILABLE
|
||||
#ifdef FP16_MMA_AVAILABLE
|
||||
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
|
||||
|
||||
const int ic0 = ncols*(blockIdx.x / parallel_blocks); // Index of the first Q/QKV column to work on.
|
||||
|
@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
@ -64,7 +64,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
|
||||
ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, float>(ctx, dst);
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
GGML_ABORT("fatal error");
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
@ -89,7 +89,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
|
||||
// ggml_cuda_flash_attn_ext_wmma_f16_case<128, cols_per_block, float>(ctx, dst);
|
||||
// break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
GGML_ABORT("fatal error");
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -112,7 +112,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
|
||||
ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, half>(ctx, dst);
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
GGML_ABORT("fatal error");
|
||||
break;
|
||||
}
|
||||
return;
|
||||
@ -140,7 +140,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
|
||||
ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, half>(ctx, dst);
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
GGML_ABORT("fatal error");
|
||||
break;
|
||||
}
|
||||
return;
|
||||
@ -167,7 +167,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
|
||||
ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, half>(ctx, dst);
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
GGML_ABORT("fatal error");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
|
@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
@ -197,8 +197,7 @@ void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
break;
|
||||
default:
|
||||
// TODO: k-quants
|
||||
fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
|
||||
GGML_ASSERT(false);
|
||||
GGML_ABORT("%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
|
@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
|
@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
|
llama/ggml-cuda/mma.cuh (new file, 247 lines)
@ -0,0 +1,247 @@
|
||||
/**
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
* Copyright (c) 2023-2024 The ggml authors
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in all
|
||||
* copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
* SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "common.cuh"
|
||||
|
||||
struct mma_int_A_I16K4 {
|
||||
static constexpr int I = 16;
|
||||
static constexpr int K = 4;
|
||||
static constexpr int ne = 2;
|
||||
|
||||
int x[ne] = {0};
|
||||
|
||||
static __device__ __forceinline__ int get_i(const int l) {
|
||||
const int ret = (l%2) * (I/2) + threadIdx.x / K;
|
||||
GGML_CUDA_ASSUME(ret >= 0);
|
||||
GGML_CUDA_ASSUME(ret < I);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ int get_k(const int /* l */) {
|
||||
const int ret = threadIdx.x % K;
|
||||
GGML_CUDA_ASSUME(ret >= 0);
|
||||
GGML_CUDA_ASSUME(ret < K);
|
||||
return ret;
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void load(const int * __restrict__ xs0, const int & stride) {
|
||||
#if defined(INT8_MMA_AVAILABLE)
|
||||
const int * xs = xs0 + (threadIdx.x%I)*stride;
|
||||
asm("ldmatrix.sync.aligned.m8n8.x2.b16 {%0, %1}, [%2];"
|
||||
: "+r"(x[0]), "+r"(x[1])
|
||||
: "l"(xs));
|
||||
#else
|
||||
#pragma unroll
|
||||
for (int l = 0; l < ne; ++l) {
|
||||
x[l] = xs0[get_i(l)*stride + get_k(l)];
|
||||
}
|
||||
#endif // defined(INT8_MMA_AVAILABLE)
|
||||
}
|
||||
};
|
||||
|
||||
struct mma_int_A_I16K8 {
|
||||
static constexpr int I = 16;
|
||||
static constexpr int K = 8;
|
||||
static constexpr int ne = 4;
|
||||
|
||||
int x[ne] = {0};
|
||||
|
||||
static __device__ __forceinline__ int get_i(const int l) {
|
||||
const int ret = (l%2) * (I/2) + threadIdx.x / (K/2);
|
||||
GGML_CUDA_ASSUME(ret >= 0);
|
||||
GGML_CUDA_ASSUME(ret < I);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ int get_k(const int l) {
|
||||
const int ret = (l/2) * (K/2) + threadIdx.x % (K/2);
|
||||
GGML_CUDA_ASSUME(ret >= 0);
|
||||
GGML_CUDA_ASSUME(ret < K);
|
||||
return ret;
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void load(const int * __restrict__ xs0, const int & stride) {
|
||||
#if defined(INT8_MMA_AVAILABLE)
|
||||
const int * xs = xs0 + (threadIdx.x%I)*stride + (threadIdx.x/I)*(K/2);
|
||||
asm("ldmatrix.sync.aligned.m8n8.x4.b16 {%0, %1, %2, %3}, [%4];"
|
||||
: "+r"(x[0]), "+r"(x[1]), "+r"(x[2]), "+r"(x[3])
|
||||
: "l"(xs));
|
||||
#else
|
||||
#pragma unroll
|
||||
for (int l = 0; l < ne; ++l) {
|
||||
x[l] = xs0[get_i(l)*stride + get_k(l)];
|
||||
}
|
||||
#endif // defined(INT8_MMA_AVAILABLE)
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void load_low(const int * __restrict__ xs0, const int & stride) {
|
||||
((mma_int_A_I16K4 *) x)[0].load(xs0, stride);
|
||||
}
|
||||
};
|
||||
|
||||
struct mma_int_B_J8K4 {
|
||||
static constexpr int J = 8;
|
||||
static constexpr int K = 4;
|
||||
static constexpr int ne = 1;
|
||||
|
||||
int x[ne] = {0};
|
||||
|
||||
static __device__ __forceinline__ int get_j(const int /* l */) {
|
||||
const int ret = threadIdx.x / K;
|
||||
GGML_CUDA_ASSUME(ret >= 0);
|
||||
GGML_CUDA_ASSUME(ret < J);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ int get_k(const int /* l */) {
|
||||
const int ret = threadIdx.x % K;
|
||||
GGML_CUDA_ASSUME(ret >= 0);
|
||||
GGML_CUDA_ASSUME(ret < K);
|
||||
return ret;
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void load(const int * __restrict__ xs0, const int & stride) {
|
||||
#if defined(INT8_MMA_AVAILABLE) && false // Loading as 4 byte values is faster
|
||||
const int * xs = xs0 + (threadIdx.x%J)*stride;
|
||||
asm("ldmatrix.sync.aligned.m8n8.x1.b16 {%0}, [%1];"
|
||||
: "+r"(x[0])
|
||||
: "l"(xs));
|
||||
#else
|
||||
#pragma unroll
|
||||
for (int l = 0; l < ne; ++l) {
|
||||
x[l] = xs0[get_j(l)*stride + get_k(l)];
|
||||
}
|
||||
#endif // defined(INT8_MMA_AVAILABLE)
|
||||
}
|
||||
};
|
||||
|
||||
struct mma_int_B_J8K8 {
|
||||
static constexpr int J = 8;
|
||||
static constexpr int K = 8;
|
||||
static constexpr int ne = 2;
|
||||
|
||||
int x[ne] = {0};
|
||||
|
||||
static __device__ __forceinline__ int get_j(const int /* l */) {
|
||||
const int ret = threadIdx.x / (K/2);
|
||||
GGML_CUDA_ASSUME(ret >= 0);
|
||||
GGML_CUDA_ASSUME(ret < J);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ int get_k(const int l) {
|
||||
const int ret = l * (K/2) + threadIdx.x % (K/2);
|
||||
GGML_CUDA_ASSUME(ret >= 0);
|
||||
GGML_CUDA_ASSUME(ret < K);
|
||||
return ret;
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void load(const int * __restrict__ xs0, const int & stride) {
|
||||
#if defined(INT8_MMA_AVAILABLE) && false // Loading as 4 byte values is faster
|
||||
const int * xs = xs0 + (threadIdx.x%J)*stride + ((threadIdx.x/J)*(K/2)) % K;
|
||||
asm("ldmatrix.sync.aligned.m8n8.x2.b16 {%0, %1}, [%2];"
|
||||
: "+r"(x[0]), "+r"(x[1])
|
||||
: "l"(xs));
|
||||
#else
|
||||
#pragma unroll
|
||||
for (int l = 0; l < ne; ++l) {
|
||||
x[l] = xs0[get_j(l)*stride + get_k(l)];
|
||||
}
|
||||
#endif // defined(INT8_MMA_AVAILABLE)
|
||||
}
|
||||
};
|
||||
|
||||
struct mma_int_C_I16J8 {
|
||||
static constexpr int I = 16;
|
||||
static constexpr int J = 8;
|
||||
static constexpr int ne = 4;
|
||||
|
||||
int x[ne] = {0};
|
||||
|
||||
static __device__ __forceinline__ int get_i(const int l) {
|
||||
const int ret = (l/2) * (I/2) + threadIdx.x / (J/2);
|
||||
GGML_CUDA_ASSUME(ret >= 0);
|
||||
GGML_CUDA_ASSUME(ret < I);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ int get_j(const int l) {
|
||||
const int ret = 2 * (threadIdx.x % (J/2)) + l%2;
|
||||
GGML_CUDA_ASSUME(ret >= 0);
|
||||
GGML_CUDA_ASSUME(ret < J);
|
||||
return ret;
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void mma_K4(const mma_int_A_I16K4 & mma_A, const mma_int_B_J8K4 & mma_B) {
|
||||
#ifdef INT8_MMA_AVAILABLE
|
||||
#if __CUDA_ARCH__ >= CC_AMPERE
|
||||
asm("mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32 {%0, %1, %2, %3}, {%4, %5}, {%6}, {%0, %1, %2, %3};"
|
||||
: "+r"(x[0]), "+r"(x[1]), "+r"(x[2]), "+r"(x[3])
|
||||
: "r"(mma_A.x[0]), "r"(mma_A.x[1]), "r"(mma_B.x[0]));
|
||||
#else
|
||||
// On Turing m16n8k16 mma is not available, use 2x m8n8k16 mma instead:
|
||||
asm("mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 {%0, %1}, {%2}, {%3}, {%0, %1};"
|
||||
: "+r"(x[0]), "+r"(x[1])
|
||||
: "r"(mma_A.x[0]), "r"(mma_B.x[0]));
|
||||
asm("mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 {%0, %1}, {%2}, {%3}, {%0, %1};"
|
||||
: "+r"(x[2]), "+r"(x[3])
|
||||
: "r"(mma_A.x[1]), "r"(mma_B.x[0]));
|
||||
#endif // __CUDA_ARCH__ >= CC_AMPERE
|
||||
#else
|
||||
GGML_UNUSED(mma_A);
|
||||
GGML_UNUSED(mma_B);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // INT8_MMA_AVAILABLE
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void mma_K8(const mma_int_A_I16K8 & mma_A, const mma_int_B_J8K8 & mma_B) {
|
||||
#ifdef INT8_MMA_AVAILABLE
|
||||
#if __CUDA_ARCH__ >= CC_AMPERE
|
||||
asm("mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32 {%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%0, %1, %2, %3};"
|
||||
: "+r"(x[0]), "+r"(x[1]), "+r"(x[2]), "+r"(x[3])
|
||||
: "r"(mma_A.x[0]), "r"(mma_A.x[1]), "r"(mma_A.x[2]), "r"(mma_A.x[3]), "r"(mma_B.x[0]), "r"(mma_B.x[1]));
|
||||
#else
|
||||
// On Turing m16n8k32 mma is not available, use 4x m8n8k16 mma instead:
|
||||
asm("mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 {%0, %1}, {%2}, {%3}, {%0, %1};"
|
||||
: "+r"(x[0]), "+r"(x[1])
|
||||
: "r"(mma_A.x[0]), "r"(mma_B.x[0]));
|
||||
asm("mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 {%0, %1}, {%2}, {%3}, {%0, %1};"
|
||||
: "+r"(x[2]), "+r"(x[3])
|
||||
: "r"(mma_A.x[1]), "r"(mma_B.x[0]));
|
||||
asm("mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 {%0, %1}, {%2}, {%3}, {%0, %1};"
|
||||
: "+r"(x[0]), "+r"(x[1])
|
||||
: "r"(mma_A.x[2]), "r"(mma_B.x[1]));
|
||||
asm("mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 {%0, %1}, {%2}, {%3}, {%0, %1};"
|
||||
: "+r"(x[2]), "+r"(x[3])
|
||||
: "r"(mma_A.x[3]), "r"(mma_B.x[1]));
|
||||
#endif // __CUDA_ARCH__ >= CC_AMPERE
|
||||
#else
|
||||
GGML_UNUSED(mma_A);
|
||||
GGML_UNUSED(mma_B);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // INT8_MMA_AVAILABLE
|
||||
}
|
||||
};
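
The get_i/get_j helpers above define how the 32 lanes of a warp own the elements of the 16x8 accumulator tile. A small host-side sketch (not part of the file) that replays the mma_int_C_I16J8 index math and checks that the mapping covers the tile exactly once:

#include <cassert>

int main() {
    int owner[16][8] = {};
    for (int lane = 0; lane < 32; ++lane) {       // threadIdx.x within the warp
        for (int l = 0; l < 4; ++l) {             // ne = 4 accumulator values per lane
            const int i = (l / 2) * 8 + lane / 4; // mma_int_C_I16J8::get_i
            const int j = 2 * (lane % 4) + l % 2; // mma_int_C_I16J8::get_j
            owner[i][j] += 1;
        }
    }
    for (int i = 0; i < 16; ++i) {
        for (int j = 0; j < 8; ++j) {
            assert(owner[i][j] == 1);             // every element owned by exactly one (lane, l)
        }
    }
    return 0;
}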
|
@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
@ -37,6 +37,7 @@ void ggml_cuda_op_mul_mat_q(
|
||||
const int64_t nb01 = src0->nb[1];
|
||||
|
||||
const int64_t ne10 = src1->ne[0];
|
||||
const int64_t ne11 = src1->ne[1];
|
||||
GGML_ASSERT(ne10 % QK8_1 == 0);
|
||||
|
||||
const int64_t ne0 = dst->ne[0];
|
||||
@ -51,41 +52,65 @@ void ggml_cuda_op_mul_mat_q(
|
||||
// nrows_dst == nrows of the matrix that the kernel writes into
|
||||
const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;
|
||||
|
||||
const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, nrows_dst};
|
||||
const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst};
|
||||
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
mul_mat_q_case<GGML_TYPE_Q4_0>(args, stream);
|
||||
mul_mat_q_case<GGML_TYPE_Q4_0>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q4_1:
|
||||
mul_mat_q_case<GGML_TYPE_Q4_1>(args, stream);
|
||||
mul_mat_q_case<GGML_TYPE_Q4_1>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q5_0:
|
||||
mul_mat_q_case<GGML_TYPE_Q5_0>(args, stream);
|
||||
mul_mat_q_case<GGML_TYPE_Q5_0>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q5_1:
|
||||
mul_mat_q_case<GGML_TYPE_Q5_1>(args, stream);
|
||||
mul_mat_q_case<GGML_TYPE_Q5_1>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q8_0:
|
||||
mul_mat_q_case<GGML_TYPE_Q8_0>(args, stream);
|
||||
mul_mat_q_case<GGML_TYPE_Q8_0>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q2_K:
|
||||
mul_mat_q_case<GGML_TYPE_Q2_K>(args, stream);
|
||||
mul_mat_q_case<GGML_TYPE_Q2_K>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q3_K:
|
||||
mul_mat_q_case<GGML_TYPE_Q3_K>(args, stream);
|
||||
mul_mat_q_case<GGML_TYPE_Q3_K>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q4_K:
|
||||
mul_mat_q_case<GGML_TYPE_Q4_K>(args, stream);
|
||||
mul_mat_q_case<GGML_TYPE_Q4_K>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q5_K:
|
||||
mul_mat_q_case<GGML_TYPE_Q5_K>(args, stream);
|
||||
mul_mat_q_case<GGML_TYPE_Q5_K>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q6_K:
|
||||
mul_mat_q_case<GGML_TYPE_Q6_K>(args, stream);
|
||||
mul_mat_q_case<GGML_TYPE_Q6_K>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
mul_mat_q_case<GGML_TYPE_IQ2_XXS>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
mul_mat_q_case<GGML_TYPE_IQ2_XS>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ2_S:
|
||||
mul_mat_q_case<GGML_TYPE_IQ2_S>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
mul_mat_q_case<GGML_TYPE_IQ3_XXS>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ3_S:
|
||||
mul_mat_q_case<GGML_TYPE_IQ3_S>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ1_S:
|
||||
mul_mat_q_case<GGML_TYPE_IQ1_S>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ4_XS:
|
||||
mul_mat_q_case<GGML_TYPE_IQ4_XS>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
mul_mat_q_case<GGML_TYPE_IQ4_NL>(ctx, args, stream);
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
GGML_ABORT("fatal error");
|
||||
break;
|
||||
}
|
||||
|
||||
@ -94,7 +119,13 @@ void ggml_cuda_op_mul_mat_q(
|
||||
GGML_UNUSED(src1_ddf_i);
|
||||
}
|
||||
|
||||
bool ggml_cuda_supports_mmq(enum ggml_type type) {
|
||||
bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
|
||||
#ifdef GGML_CUDA_FORCE_CUBLAS
|
||||
return false;
|
||||
#endif // GGML_CUDA_FORCE_CUBLAS
|
||||
|
||||
bool mmq_supported;
|
||||
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1:
|
||||
@ -106,8 +137,40 @@ bool ggml_cuda_supports_mmq(enum ggml_type type) {
|
||||
case GGML_TYPE_Q4_K:
|
||||
case GGML_TYPE_Q5_K:
|
||||
case GGML_TYPE_Q6_K:
|
||||
return true;
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
case GGML_TYPE_IQ2_S:
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
case GGML_TYPE_IQ3_S:
|
||||
case GGML_TYPE_IQ1_S:
|
||||
case GGML_TYPE_IQ4_XS:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
mmq_supported = true;
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
mmq_supported = false;
|
||||
break;
|
||||
}

    if (!mmq_supported) {
        return false;
    }

    if (int8_mma_available(cc)) {
        return true;
    }

    if (cc < MIN_CC_DP4A) {
        return false;
    }

#ifdef GGML_CUDA_FORCE_MMQ
    return true;
#endif //GGML_CUDA_FORCE_MMQ

    if (cc < CC_OFFSET_AMD) {
        return cc < CC_VOLTA || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
    }

    return cc < CC_RDNA3 || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
}
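
In short, the rewritten helper keeps MMQ whenever int8 tensor-core MMA is available; otherwise it requires DP4A support and, unless GGML_CUDA_FORCE_MMQ is set, only stays on MMQ for Volta-or-newer NVIDIA and RDNA3-or-newer AMD GPUs while the batch dimension ne11 is below MMQ_DP4A_MAX_BATCH_SIZE. A hypothetical call site (sketch only; variable names are illustrative):

// Pick the quantized matrix-multiplication path only when the type/arch/batch combination qualifies.
const bool use_mmq = ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);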

File diff suppressed because it is too large
@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
@ -54,16 +54,22 @@ static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type)
|
||||
|
||||
static constexpr __device__ int get_vdr_mmvq(ggml_type type) {
|
||||
return type == GGML_TYPE_Q4_0 ? VDR_Q4_0_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_Q4_1 ? VDR_Q4_1_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_Q5_0 ? VDR_Q5_0_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_Q5_1 ? VDR_Q5_1_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_Q8_0 ? VDR_Q8_0_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_Q2_K ? VDR_Q2_K_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_Q3_K ? VDR_Q3_K_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_Q4_K ? VDR_Q4_K_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_Q5_K ? VDR_Q5_K_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_Q6_K ? VDR_Q6_K_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_IQ4_NL ? VDR_Q4_K_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_Q4_1 ? VDR_Q4_1_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_Q5_0 ? VDR_Q5_0_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_Q5_1 ? VDR_Q5_1_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_Q8_0 ? VDR_Q8_0_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_Q2_K ? VDR_Q2_K_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_Q3_K ? VDR_Q3_K_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_Q4_K ? VDR_Q4_K_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_Q5_K ? VDR_Q5_K_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_Q6_K ? VDR_Q6_K_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_IQ2_XXS ? VDR_IQ2_XXS_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_IQ2_XS ? VDR_IQ2_XS_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_IQ2_S ? VDR_IQ2_S_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_IQ3_XXS ? VDR_IQ3_XXS_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_IQ3_S ? VDR_IQ3_S_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_IQ4_NL ? VDR_IQ4_NL_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_IQ4_XS ? VDR_IQ4_XS_Q8_1_MMVQ :
|
||||
1;
|
||||
}
|
||||
|
||||
@ -143,7 +149,7 @@ static __global__ void mul_mat_vec_q(
|
||||
tmp[j][i] = warp_reduce_sum(tmp[j][i]);
|
||||
}
|
||||
|
||||
if (threadIdx.x < rows_per_cuda_block) {
|
||||
if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || row0 + threadIdx.x < nrows_dst)) {
|
||||
dst[j*nrows_dst + row0 + threadIdx.x] = tmp[j][threadIdx.x];
|
||||
}
|
||||
}
|
||||
@ -182,7 +188,7 @@ static void mul_mat_vec_q_cuda(
|
||||
rows_per_cuda_block = 2;
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
GGML_ABORT("fatal error");
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -216,7 +222,7 @@ static void mul_mat_vec_q_cuda(
|
||||
mul_mat_vec_q<type, 8><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
GGML_ABORT("fatal error");
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -433,7 +439,7 @@ void ggml_cuda_op_mul_mat_vec_q(
|
||||
mul_mat_vec_iq3_s_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
GGML_ABORT("fatal error");
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
@ -26,6 +26,8 @@
|
||||
|
||||
#include "common.cuh"
|
||||
|
||||
#define MMVQ_MAX_BATCH_SIZE 8 // Max. batch size for which to use MMVQ kernels.
|
||||
|
||||
void ggml_cuda_op_mul_mat_vec_q(
|
||||
ggml_backend_cuda_context & ctx,
|
||||
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
|
||||
|
@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
|
@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
|
@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
|
@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
|
@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
|
@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
|
@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
@ -25,24 +25,25 @@
|
||||
*/
|
||||
|
||||
#include "quantize.cuh"
|
||||
#include <cstdint>
|
||||
|
||||
static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int64_t kx, const int64_t kx_padded) {
|
||||
const int64_t ix = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
|
||||
static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int64_t kx, const int64_t kx0_padded) {
|
||||
const int64_t ix0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (ix >= kx_padded) {
|
||||
if (ix0 >= kx0_padded) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int64_t iy = (int64_t)blockDim.y*blockIdx.y + threadIdx.y;
|
||||
const int64_t ix1 = blockIdx.y;
|
||||
|
||||
const int64_t i_padded = (int64_t)iy*kx_padded + ix;
|
||||
const int64_t i_padded = ix1*kx0_padded + ix0;
|
||||
|
||||
block_q8_1 * y = (block_q8_1 *) vy;
|
||||
|
||||
const int64_t ib = i_padded / QK8_1; // block index
|
||||
const int64_t iqs = i_padded % QK8_1; // quant index
|
||||
|
||||
const float xi = ix < kx ? x[iy*kx + ix] : 0.0f;
|
||||
const float xi = ix0 < kx ? x[ix1*kx + ix0] : 0.0f;
|
||||
float amax = fabsf(xi);
|
||||
float sum = xi;
|
||||
|
||||
@ -62,10 +63,133 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
|
||||
reinterpret_cast<half&>(y[ib].ds.y) = sum;
|
||||
}
|
||||
|
||||
void quantize_row_q8_1_cuda(const float * x, void * vy, const int64_t kx, const int64_t ky, const int64_t kx_padded, cudaStream_t stream) {
|
||||
const int64_t block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
|
||||
const dim3 num_blocks(block_num_x, ky, 1);
|
||||
const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1);
|
||||
quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
|
||||
template <mmq_q8_1_ds_layout ds_layout>
|
||||
static __global__ void quantize_mmq_q8_1(
|
||||
const float * __restrict__ x, void * __restrict__ vy, const int64_t kx0, const int64_t kx1, const int64_t kx0_padded) {
|
||||
|
||||
constexpr int vals_per_scale = ds_layout == MMQ_Q8_1_DS_LAYOUT_D2S6 ? 64 : 32;
|
||||
constexpr int vals_per_sum = ds_layout == MMQ_Q8_1_DS_LAYOUT_D2S6 ? 16 : 32;
|
||||
|
||||
const int64_t ix0 = ((int64_t)blockDim.x*blockIdx.x + threadIdx.x)*4;
|
||||
|
||||
if (ix0 >= kx0_padded) {
|
||||
return;
|
||||
}
|
||||
|
||||
const float4 * x4 = (const float4 *) x;
|
||||
|
||||
const int64_t ix1 = kx1*blockIdx.z + blockIdx.y;
|
||||
|
||||
block_q8_1_mmq * y = (block_q8_1_mmq *) vy;
|
||||
|
||||
const int64_t ib0 = blockIdx.z*((int64_t)gridDim.y*gridDim.x*blockDim.x/QK8_1); // first block of channel
|
||||
const int64_t ib = ib0 + (ix0 / (4*QK8_1))*kx1 + blockIdx.y; // block index in channel
|
||||
const int64_t iqs = ix0 % (4*QK8_1); // quant index in block
|
||||
|
||||
// Load 4 floats per thread and calculate max. abs. value between them:
|
||||
const float4 xi = ix0 < kx0 ? x4[(ix1*kx0 + ix0)/4] : make_float4(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
float amax = fabsf(xi.x);
|
||||
amax = fmaxf(amax, fabsf(xi.y));
|
||||
amax = fmaxf(amax, fabsf(xi.z));
|
||||
amax = fmaxf(amax, fabsf(xi.w));
|
||||
|
||||
// Exchange max. abs. value between vals_per_scale/4 threads.
|
||||
#pragma unroll
|
||||
for (int mask = vals_per_scale/8; mask > 0; mask >>= 1) {
|
||||
amax = fmaxf(amax, __shfl_xor_sync(0xFFFFFFFF, amax, mask, WARP_SIZE));
|
||||
}
|
||||
|
||||
float sum;
|
||||
if (ds_layout != MMQ_Q8_1_DS_LAYOUT_D4) {
|
||||
sum = xi.x + xi.y + xi.z + xi.w;
|
||||
|
||||
// Exchange the calculated sum across vals_per_sum/4 threads.
|
||||
#pragma unroll
|
||||
for (int mask = vals_per_sum/8; mask > 0; mask >>= 1) {
|
||||
sum += __shfl_xor_sync(0xFFFFFFFF, sum, mask, WARP_SIZE);
|
||||
}
|
||||
}
|
||||
|
||||
const float d_inv = 127.0f / amax;
|
||||
char4 q;
|
||||
q.x = roundf(xi.x*d_inv);
|
||||
q.y = roundf(xi.y*d_inv);
|
||||
q.z = roundf(xi.z*d_inv);
|
||||
q.w = roundf(xi.w*d_inv);
|
||||
|
||||
// Write back 4 int8 values as a single 32 bit value for better memory bandwidth:
|
||||
char4 * yqs4 = (char4 *) y[ib].qs;
|
||||
yqs4[iqs/4] = q;
|
||||
|
||||
if (ds_layout == MMQ_Q8_1_DS_LAYOUT_D2S6) {
|
||||
if (iqs % 16 != 0 || iqs >= 96) {
|
||||
return;
|
||||
}
|
||||
|
||||
y[ib].d2s6[2 + iqs/16] = sum;
|
||||
|
||||
if (iqs % 64 != 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
const float d = 1.0f / d_inv;
|
||||
|
||||
y[ib].d2s6[iqs/64] = d;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
if (iqs % 32 != 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
const float d = 1.0f / d_inv;
|
||||
|
||||
if (ds_layout == MMQ_Q8_1_DS_LAYOUT_DS4) {
|
||||
y[ib].ds4[iqs/32] = make_half2(d, sum);
|
||||
} else {
|
||||
y[ib].d4[iqs/32] = d;
|
||||
}
|
||||
}
|
||||
|
||||
void quantize_row_q8_1_cuda(
|
||||
const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels,
|
||||
const int64_t kx0_padded, const ggml_type type_x, cudaStream_t stream) {
|
||||
|
||||
GGML_ASSERT(kx0_padded % QK8_1 == 0);
|
||||
|
||||
const int64_t block_num_x = (kx0_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
|
||||
const dim3 num_blocks(block_num_x, kx1*channels, 1);
|
||||
const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1);
|
||||
quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx0_padded);
|
||||
|
||||
GGML_UNUSED(type_x);
|
||||
}
|
||||
|
||||
void quantize_mmq_q8_1_cuda(
|
||||
const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels,
|
||||
const int64_t kx0_padded, const ggml_type type_x, cudaStream_t stream) {
|
||||
|
||||
GGML_ASSERT(kx0_padded % (4*QK8_1) == 0);
|
||||
|
||||
const int64_t block_num_x = (kx0_padded + 4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ - 1) / (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ);
|
||||
const dim3 num_blocks(block_num_x, kx1, channels);
|
||||
const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE_MMQ, 1, 1);
|
||||
switch (mmq_get_q8_1_ds_layout(type_x)) {
|
||||
case MMQ_Q8_1_DS_LAYOUT_D4:
|
||||
quantize_mmq_q8_1<MMQ_Q8_1_DS_LAYOUT_D4>
|
||||
<<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded);
|
||||
break;
|
||||
case MMQ_Q8_1_DS_LAYOUT_DS4:
|
||||
quantize_mmq_q8_1<MMQ_Q8_1_DS_LAYOUT_DS4>
|
||||
<<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded);
|
||||
break;
|
||||
case MMQ_Q8_1_DS_LAYOUT_D2S6:
|
||||
quantize_mmq_q8_1<MMQ_Q8_1_DS_LAYOUT_D2S6>
|
||||
<<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded);
|
||||
break;
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
break;
|
||||
}
|
||||
}
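
At their core, the kernels above quantize groups of values against a shared scale d = amax/127 and, for layouts that need it, also keep the sum of the raw inputs; the MMQ variant only changes how those scales and sums are laid out per block. A scalar reference sketch for one QK8_1 = 32 value group (illustrative, not the ggml API):

#include <cmath>
#include <cstdint>

// Reference (sketch) for one 32-value q8_1 group: per-group scale and sum, then rounded int8 quants.
static void quantize_block_q8_1_ref(const float * x, int8_t * q, float & d, float & sum) {
    float amax = 0.0f;
    sum = 0.0f;
    for (int i = 0; i < 32; ++i) {
        amax = fmaxf(amax, fabsf(x[i]));
        sum += x[i];
    }
    d = amax / 127.0f;
    const float d_inv = amax > 0.0f ? 127.0f / amax : 0.0f;
    for (int i = 0; i < 32; ++i) {
        q[i] = (int8_t) roundf(x[i] * d_inv);
    }
}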
|
||||
|
@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
@ -24,8 +24,27 @@
|
||||
* SOFTWARE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "common.cuh"
|
||||
#include "mmq.cuh"
|
||||
|
||||
#define CUDA_QUANTIZE_BLOCK_SIZE 256
|
||||
#include <cstdint>
|
||||
|
||||
void quantize_row_q8_1_cuda(const float * x, void * vy, const int64_t kx, const int64_t ky, const int64_t kx_padded, cudaStream_t stream);
|
||||
#define CUDA_QUANTIZE_BLOCK_SIZE 256
|
||||
#define CUDA_QUANTIZE_BLOCK_SIZE_MMQ 128
|
||||
|
||||
static_assert(MATRIX_ROW_PADDING % CUDA_QUANTIZE_BLOCK_SIZE == 0, "Risk of out-of-bounds access.");
|
||||
static_assert(MATRIX_ROW_PADDING % (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ) == 0, "Risk of out-of-bounds access.");
|
||||
|
||||
typedef void (*quantize_cuda_t)(
|
||||
const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded,
|
||||
const ggml_type type_x, cudaStream_t stream);
|
||||
|
||||
void quantize_row_q8_1_cuda(
|
||||
const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded,
|
||||
const ggml_type type_x, cudaStream_t stream);
|
||||
|
||||
void quantize_mmq_q8_1_cuda(
|
||||
const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded,
|
||||
const ggml_type type_x, cudaStream_t stream);
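
The quantize_cuda_t typedef above lets the matrix-multiplication code pick a quantizer at runtime, since both launchers share that signature. A hypothetical selection sketch (all variable names are illustrative only):

// Choose the MMQ-specific q8_1 layout only when the MMQ path will consume it.
quantize_cuda_t quantize_src1 = use_mmq ? quantize_mmq_q8_1_cuda : quantize_row_q8_1_cuda;
quantize_src1(src1_f32, src1_q8_1, ne10, ne11, ne12, ne10_padded, src0_type, stream);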
|
||||
|
@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
@ -277,7 +277,7 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
attn_factor, corr_dims, freq_factors, stream
|
||||
);
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
} else {
|
||||
if (src0->type == GGML_TYPE_F32) {
|
||||
@ -291,7 +291,7 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
attn_factor, corr_dims, freq_factors, stream
|
||||
);
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
|
@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
|
@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
|
@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ee459f40f65810a810151b24eba5b8bd174ceffe - do not edit this file
|
||||
* llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
@ -156,6 +156,7 @@ static void soft_max_f32_cuda(const float * x, const T * mask, float * dst, cons
|
||||
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
|
||||
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
|
||||
|
||||
// FIXME: this limit could be raised by ~2-4x on Ampere or newer
|
||||
if (shmem < ggml_cuda_info().devices[ggml_cuda_get_device()].smpb) {
|
||||
switch (ncols_x) {
|
||||
case 32:
|
||||
|
Some files were not shown because too many files have changed in this diff.