jmorganca 2024-06-10 17:23:09 -07:00
parent 4d0e6c55b0
commit 763d7b601c
77 changed files with 35429 additions and 35585 deletions


@@ -1,392 +1,392 @@
/*
This is free and unencumbered software released into the public domain.
Anyone is free to copy, modify, publish, use, compile, sell, or
distribute this software, either in source code form or as a compiled
binary, for any purpose, commercial or non-commercial, and by any
means.
In jurisdictions that recognize copyright laws, the author or authors
of this software dedicate any and all copyright interest in the
software to the public domain. We make this dedication for the benefit
of the public at large and to the detriment of our heirs and
successors. We intend this dedication to be an overt act of
relinquishment in perpetuity of all present and future rights to this
software under copyright law.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
For more information, please refer to <http://unlicense.org>
*/
#ifndef PUBLIC_DOMAIN_BASE64_HPP_
#define PUBLIC_DOMAIN_BASE64_HPP_

#include <cstdint>
#include <iterator>
#include <stdexcept>
#include <string>

class base64_error : public std::runtime_error
{
public:
    using std::runtime_error::runtime_error;
};

class base64
{
public:
    enum class alphabet
    {
        /** the alphabet is detected automatically */
        auto_,
        /** the standard base64 alphabet is used */
        standard,
        /** like `standard` except that the characters `+` and `/` are replaced by `-` and `_` respectively*/
        url_filename_safe
    };
    enum class decoding_behavior
    {
        /** if the input is not padded, the remaining bits are ignored */
        moderate,
        /** if a padding character is encounter decoding is finished */
        loose
    };

    /**
     Encodes all the elements from `in_begin` to `in_end` to `out`.
     @warning The source and destination cannot overlap. The destination must be able to hold at least
     `required_encode_size(std::distance(in_begin, in_end))`, otherwise the behavior depends on the output iterator.
     @tparam Input_iterator the source; the returned elements are cast to `std::uint8_t` and should not be greater than
     8 bits
     @tparam Output_iterator the destination; the elements written to it are from the type `char`
     @param in_begin the beginning of the source
     @param in_end the ending of the source
     @param out the destination iterator
     @param alphabet which alphabet should be used
     @returns the iterator to the next element past the last element copied
     @throws see `Input_iterator` and `Output_iterator`
    */
    template<typename Input_iterator, typename Output_iterator>
    static Output_iterator encode(Input_iterator in_begin, Input_iterator in_end, Output_iterator out,
                                  alphabet alphabet = alphabet::standard)
    {
        constexpr auto pad = '=';
        const char* alpha = alphabet == alphabet::url_filename_safe
                                ? "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"
                                : "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
        while (in_begin != in_end) {
            std::uint8_t i0 = 0, i1 = 0, i2 = 0;
            // first character
            i0 = static_cast<std::uint8_t>(*in_begin);
            ++in_begin;
            *out = alpha[i0 >> 2 & 0x3f];
            ++out;
            // part of first character and second
            if (in_begin != in_end) {
                i1 = static_cast<std::uint8_t>(*in_begin);
                ++in_begin;
                *out = alpha[((i0 & 0x3) << 4) | (i1 >> 4 & 0x0f)];
                ++out;
            } else {
                *out = alpha[(i0 & 0x3) << 4];
                ++out;
                // last padding
                *out = pad;
                ++out;
                // last padding
                *out = pad;
                ++out;
                break;
            }
            // part of second character and third
            if (in_begin != in_end) {
                i2 = static_cast<std::uint8_t>(*in_begin);
                ++in_begin;
                *out = alpha[((i1 & 0xf) << 2) | (i2 >> 6 & 0x03)];
                ++out;
            } else {
                *out = alpha[(i1 & 0xf) << 2];
                ++out;
                // last padding
                *out = pad;
                ++out;
                break;
            }
            // rest of third
            *out = alpha[i2 & 0x3f];
            ++out;
        }
        return out;
    }
    /**
     Encodes a string.
     @param str the string that should be encoded
     @param alphabet which alphabet should be used
     @returns the encoded base64 string
     @throws see base64::encode()
    */
    static std::string encode(const std::string& str, alphabet alphabet = alphabet::standard)
    {
        std::string result;
        result.reserve(required_encode_size(str.length()) + 1);
        encode(str.begin(), str.end(), std::back_inserter(result), alphabet);
        return result;
    }
    /**
     Encodes a char array.
     @param buffer the char array
     @param size the size of the array
     @param alphabet which alphabet should be used
     @returns the encoded string
    */
    static std::string encode(const char* buffer, std::size_t size, alphabet alphabet = alphabet::standard)
    {
        std::string result;
        result.reserve(required_encode_size(size) + 1);
        encode(buffer, buffer + size, std::back_inserter(result), alphabet);
        return result;
    }
    /**
     Decodes all the elements from `in_begin` to `in_end` to `out`. `in_begin` may point to the same location as `out`,
     in other words: inplace decoding is possible.
     @warning The destination must be able to hold at least `required_decode_size(std::distance(in_begin, in_end))`,
     otherwise the behavior depends on the output iterator.
     @tparam Input_iterator the source; the returned elements are cast to `char`
     @tparam Output_iterator the destination; the elements written to it are from the type `std::uint8_t`
     @param in_begin the beginning of the source
     @param in_end the ending of the source
     @param out the destination iterator
     @param alphabet which alphabet should be used
     @param behavior the behavior when an error was detected
     @returns the iterator to the next element past the last element copied
     @throws base64_error depending on the set behavior
     @throws see `Input_iterator` and `Output_iterator`
    */
    template<typename Input_iterator, typename Output_iterator>
    static Output_iterator decode(Input_iterator in_begin, Input_iterator in_end, Output_iterator out,
                                  alphabet alphabet = alphabet::auto_,
                                  decoding_behavior behavior = decoding_behavior::moderate)
    {
        //constexpr auto pad = '=';
        std::uint8_t last = 0;
        auto bits = 0;
        while (in_begin != in_end) {
            auto c = *in_begin;
            ++in_begin;
            if (c == '=') {
                break;
            }
            auto part = _base64_value(alphabet, c);
            // enough bits for one byte
            if (bits + 6 >= 8) {
                *out = (last << (8 - bits)) | (part >> (bits - 2));
                ++out;
                bits -= 2;
            } else {
                bits += 6;
            }
            last = part;
        }
        // check padding
        if (behavior != decoding_behavior::loose) {
            while (in_begin != in_end) {
                auto c = *in_begin;
                ++in_begin;
                if (c != '=') {
                    throw base64_error("invalid base64 character.");
                }
            }
        }
        return out;
    }
    /**
     Decodes a string.
     @param str the base64 encoded string
     @param alphabet which alphabet should be used
     @param behavior the behavior when an error was detected
     @returns the decoded string
     @throws see base64::decode()
    */
    static std::string decode(const std::string& str, alphabet alphabet = alphabet::auto_,
                              decoding_behavior behavior = decoding_behavior::moderate)
    {
        std::string result;
        result.reserve(max_decode_size(str.length()));
        decode(str.begin(), str.end(), std::back_inserter(result), alphabet, behavior);
        return result;
    }
    /**
     Decodes a string.
     @param buffer the base64 encoded buffer
     @param size the size of the buffer
     @param alphabet which alphabet should be used
     @param behavior the behavior when an error was detected
     @returns the decoded string
     @throws see base64::decode()
    */
    static std::string decode(const char* buffer, std::size_t size, alphabet alphabet = alphabet::auto_,
                              decoding_behavior behavior = decoding_behavior::moderate)
    {
        std::string result;
        result.reserve(max_decode_size(size));
        decode(buffer, buffer + size, std::back_inserter(result), alphabet, behavior);
        return result;
    }
    /**
     Decodes a string inplace.
     @param[in,out] str the base64 encoded string
     @param alphabet which alphabet should be used
     @param behavior the behavior when an error was detected
     @throws base64::decode_inplace()
    */
    static void decode_inplace(std::string& str, alphabet alphabet = alphabet::auto_,
                               decoding_behavior behavior = decoding_behavior::moderate)
    {
        str.resize(decode(str.begin(), str.end(), str.begin(), alphabet, behavior) - str.begin());
    }
    /**
     Decodes a char array inplace.
     @param[in,out] str the string array
     @param size the length of the array
     @param alphabet which alphabet should be used
     @param behavior the behavior when an error was detected
     @returns the pointer to the next element past the last element decoded
     @throws base64::decode_inplace()
    */
    static char* decode_inplace(char* str, std::size_t size, alphabet alphabet = alphabet::auto_,
                                decoding_behavior behavior = decoding_behavior::moderate)
    {
        return decode(str, str + size, str, alphabet, behavior);
    }
    /**
     Returns the required decoding size for a given size. The value is calculated with the following formula:
     $$
     \lceil \frac{size}{4} \rceil \cdot 3
     $$
     @param size the size of the encoded input
     @returns the size of the resulting decoded buffer; this the absolute maximum
    */
    static std::size_t max_decode_size(std::size_t size) noexcept
    {
        return (size / 4 + (size % 4 ? 1 : 0)) * 3;
    }
    /**
     Returns the required encoding size for a given size. The value is calculated with the following formula:
     $$
     \lceil \frac{size}{3} \rceil \cdot 4
     $$
     @param size the size of the decoded input
     @returns the size of the resulting encoded buffer
    */
    static std::size_t required_encode_size(std::size_t size) noexcept
    {
        return (size / 3 + (size % 3 ? 1 : 0)) * 4;
    }

private:
    static std::uint8_t _base64_value(alphabet& alphabet, char c)
    {
        if (c >= 'A' && c <= 'Z') {
            return c - 'A';
        } else if (c >= 'a' && c <= 'z') {
            return c - 'a' + 26;
        } else if (c >= '0' && c <= '9') {
            return c - '0' + 52;
        }
        // comes down to alphabet
        if (alphabet == alphabet::standard) {
            if (c == '+') {
                return 62;
            } else if (c == '/') {
                return 63;
            }
        } else if (alphabet == alphabet::url_filename_safe) {
            if (c == '-') {
                return 62;
            } else if (c == '_') {
                return 63;
            }
        } // auto detect
        else {
            if (c == '+') {
                alphabet = alphabet::standard;
                return 62;
            } else if (c == '/') {
                alphabet = alphabet::standard;
                return 63;
            } else if (c == '-') {
                alphabet = alphabet::url_filename_safe;
                return 62;
            } else if (c == '_') {
                alphabet = alphabet::url_filename_safe;
                return 63;
            }
        }
        throw base64_error("invalid base64 character.");
    }
};

#endif // !PUBLIC_DOMAIN_BASE64_HPP_
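A quick illustration of the vendored header above (not part of the commit): the public surface of `base64` is just the static `encode`/`decode` overloads plus the two size helpers, where for example `required_encode_size(11) = ceil(11/3) * 4 = 16`. A minimal usage sketch; the include path is an assumption and depends on where the header is vendored:

// Sketch only: exercises the base64.hpp header shown above.
#include "base64.hpp" // assumed include path
#include <cassert>
#include <iostream>
#include <string>

int main() {
    const std::string plain = "hello world";                   // 11 bytes
    assert(base64::required_encode_size(plain.size()) == 16);  // ceil(11/3)*4

    const std::string encoded = base64::encode(plain);         // standard alphabet by default
    const std::string decoded = base64::decode(encoded);       // alphabet auto-detected

    assert(decoded == plain);
    std::cout << encoded << "\n";                              // prints aGVsbG8gd29ybGQ=
    return 0;
}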


@@ -1,5 +1,5 @@
/**
- * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
+ * llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
 *
 * MIT License
 *

llama/clip.cpp (vendored, 2 changes)

@@ -1,5 +1,5 @@
/**
- * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
+ * llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
 *
 * MIT License
 *

llama/clip.h (vendored, 2 changes)

@@ -1,5 +1,5 @@
/**
- * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
+ * llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
 *
 * MIT License
 *

llama/common.cpp (vendored, 32 changes)

@@ -1,5 +1,5 @@
/**
- * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
+ * llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
 *
 * MIT License
 *
@@ -226,13 +226,19 @@ void gpt_params_handle_model_default(gpt_params & params) {
            }
            params.hf_file = params.model;
        } else if (params.model.empty()) {
-            params.model = fs_get_cache_file(string_split(params.hf_file, '/').back());
+            std::string cache_directory = fs_get_cache_directory();
+            const bool success = fs_create_directory_with_parents(cache_directory);
+            if (!success) {
+                throw std::runtime_error("failed to create cache directory: " + cache_directory);
+            }
+            params.model = cache_directory + string_split(params.hf_file, '/').back();
        }
    } else if (!params.model_url.empty()) {
        if (params.model.empty()) {
            auto f = string_split(params.model_url, '#').front();
            f = string_split(f, '?').front();
-            params.model = fs_get_cache_file(string_split(f, '/').back());
+            f = string_split(f, '/').back();
+            params.model = "models/" + f;
        }
    } else if (params.model.empty()) {
        params.model = DEFAULT_MODEL_PATH;
@@ -1511,14 +1517,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        params.chat_template = argv[i];
        return true;
    }
-    if (arg == "--slot-prompt-similarity" || arg == "-sps") {
-        if (++i >= argc) {
-            invalid_param = true;
-            return true;
-        }
-        params.slot_prompt_similarity = std::stof(argv[i]);
-        return true;
-    }
    if (arg == "-pps") {
        params.is_pp_shared = true;
        return true;
@@ -1941,8 +1939,6 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
        "set custom jinja chat template (default: template taken from model's metadata)\n"
        "only commonly used templates are accepted:\n"
        "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
-    options.push_back({ "server", "-sps, --slot-prompt-similarity SIMILARITY",
-        "how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity });

#ifndef LOG_DISABLE_LOGS
    options.push_back({ "logging" });
@@ -2299,16 +2295,6 @@ std::string fs_get_cache_directory() {
    return ensure_trailing_slash(cache_directory);
}

-std::string fs_get_cache_file(const std::string & filename) {
-    GGML_ASSERT(filename.find(DIRECTORY_SEPARATOR) == std::string::npos);
-    std::string cache_directory = fs_get_cache_directory();
-    const bool success = fs_create_directory_with_parents(cache_directory);
-    if (!success) {
-        throw std::runtime_error("failed to create cache directory: " + cache_directory);
-    }
-    return cache_directory + filename;
-}
-
//
// Model utils
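For orientation (not part of the diff): after these hunks the model path for a Hugging Face file is assembled inline, i.e. the cache directory is created if needed and the last '/'-separated component of the file name is appended to it, while URL downloads fall back to a local "models/" prefix. A standalone sketch of that path assembly; `basename_of` and the sample values are hypothetical stand-ins for llama.cpp's `string_split(...).back()` and `fs_get_cache_directory()`:

#include <filesystem>
#include <iostream>
#include <string>

// stand-in for string_split(s, '/').back()
static std::string basename_of(const std::string & s) {
    const auto pos = s.find_last_of('/');
    return pos == std::string::npos ? s : s.substr(pos + 1);
}

int main() {
    const std::string cache_directory = "/tmp/llama-cache/"; // stand-in for fs_get_cache_directory()
    const std::string hf_file = "subdir/model-Q4_K_M.gguf";  // hypothetical --hf-file value

    std::filesystem::create_directories(cache_directory);    // like fs_create_directory_with_parents(...)
    const std::string model = cache_directory + basename_of(hf_file);

    std::cout << model << "\n";                              // /tmp/llama-cache/model-Q4_K_M.gguf
    return 0;
}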

llama/common.h (vendored, 5 changes)

@@ -1,5 +1,5 @@
/**
- * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
+ * llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
 *
 * MIT License
 *
@@ -229,8 +229,6 @@ struct gpt_params {
    std::string slot_save_path;

-    float slot_prompt_similarity = 0.5f;
-
    // batched-bench params
    bool is_pp_shared = false;
@@ -303,7 +301,6 @@ bool fs_validate_filename(const std::string & filename);
bool fs_create_directory_with_parents(const std::string & path);

std::string fs_get_cache_directory();
-std::string fs_get_cache_file(const std::string & filename);

//
// Model utils

llama/ggml-alloc.c (vendored, 2 changes)

@@ -1,5 +1,5 @@
/**
- * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
+ * llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
 *
 * MIT License
 *

llama/ggml-alloc.h (vendored, 154 changes)

@@ -1,5 +1,5 @@
/**
- * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
+ * llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
 *
 * MIT License
 *
@@ -24,79 +24,79 @@
 * SOFTWARE.
 */

#pragma once

#include "ggml.h"

#ifdef __cplusplus
extern "C" {
#endif

typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
typedef struct ggml_backend * ggml_backend_t;

// Tensor allocator
struct ggml_tallocr {
    ggml_backend_buffer_t buffer;
    void * base;
    size_t alignment;
    size_t offset;
};

GGML_API struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer);
GGML_API void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor);

// Graph allocator
/*
  Example usage:
    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_bacckend_cpu_buffer_type());

    // optional: create a worst-case graph and reserve the buffers to avoid reallocations
    ggml_gallocr_reserve(galloc, build_graph(max_batch));

    // allocate the graph
    struct ggml_cgraph * graph = build_graph(batch);
    ggml_gallocr_alloc_graph(galloc, graph);
    printf("compute buffer size: %zu bytes\n", ggml_gallocr_get_buffer_size(galloc, 0));

    // evaluate the graph
    ggml_backend_graph_compute(backend, graph);
*/

// special tensor flags for use with the graph allocator:
//   ggml_set_input(): all input tensors are allocated at the beginning of the graph in non-overlapping addresses
//   ggml_set_output(): output tensors are never freed and never overwritten

typedef struct ggml_gallocr * ggml_gallocr_t;

GGML_API ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft);
GGML_API ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs);
GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc);

// pre-allocate buffers from a measure graph - does not allocate or modify the graph
// call with a worst-case graph to avoid buffer reallocations
// not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
// returns false if the buffer allocation failed
GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
GGML_API bool ggml_gallocr_reserve_n(
    ggml_gallocr_t galloc,
    struct ggml_cgraph * graph,
    const int * node_buffer_ids,
    const int * leaf_buffer_ids);

// automatic reallocation if the topology changes when using a single buffer
// returns false if using multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers)
GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph);

GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);

// Utils
// Create a buffer and allocate all the tensors in a ggml_context
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);

#ifdef __cplusplus
}
#endif


@@ -1,5 +1,5 @@
/**
- * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
+ * llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
 *
 * MIT License
 *
@@ -24,144 +24,144 @@
 * SOFTWARE.
 */

#pragma once

// ggml-backend internal header

#include "ggml-backend.h"

#ifdef __cplusplus
extern "C" {
#endif

//
// Backend buffer
//

// buffer type
typedef void * ggml_backend_buffer_type_context_t;

struct ggml_backend_buffer_type_i {
    const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft);
    ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
    size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
    size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft); // allocation max size
    size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
    bool (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
    // check if tensor data is in host memory
    // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
    bool (*GGML_CALL is_host) (ggml_backend_buffer_type_t buft);
};

struct ggml_backend_buffer_type {
    struct ggml_backend_buffer_type_i iface;
    ggml_backend_buffer_type_context_t context;
};

// buffer
typedef void * ggml_backend_buffer_context_t;

struct ggml_backend_buffer_i {
    const char * (*GGML_CALL get_name) (ggml_backend_buffer_t buffer);
    void (*GGML_CALL free_buffer)(ggml_backend_buffer_t buffer);
    void * (*GGML_CALL get_base) (ggml_backend_buffer_t buffer);
    void (*GGML_CALL init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
    void (*GGML_CALL set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
    void (*GGML_CALL get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
    bool (*GGML_CALL cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
    void (*GGML_CALL clear) (ggml_backend_buffer_t buffer, uint8_t value);
    void (*GGML_CALL reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
};

struct ggml_backend_buffer {
    struct ggml_backend_buffer_i iface;
    ggml_backend_buffer_type_t buft;
    ggml_backend_buffer_context_t context;
    size_t size;
    enum ggml_backend_buffer_usage usage;
};

GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
    ggml_backend_buffer_type_t buft,
    struct ggml_backend_buffer_i iface,
    ggml_backend_buffer_context_t context,
    size_t size);

// do not use directly, use ggml_backend_tensor_copy instead
bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);

// buffer that contains a collection of buffers
GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);

//
// Backend
//

typedef void * ggml_backend_context_t;

struct ggml_backend_i {
    const char * (*GGML_CALL get_name)(ggml_backend_t backend);

    void (*GGML_CALL free)(ggml_backend_t backend);

    // buffer allocation
    ggml_backend_buffer_type_t (*GGML_CALL get_default_buffer_type)(ggml_backend_t backend);

    // (optional) asynchronous tensor data access
    void (*GGML_CALL set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
    void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
    bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);

    // (optional) complete all pending operations
    void (*GGML_CALL synchronize)(ggml_backend_t backend);

    // compute graph with a plan (not used currently)
    ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
    void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);

    // compute graph with a plan
    enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);

    // compute graph without a plan (async)
    enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);

    // check if the backend supports an operation
    bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);

    // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
    // these should be expensive operations with large batch sizes that may benefit from running on this backend
    // even if the weight has to be copied from the CPU temporarily
    bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);

    // (optional) event synchronization
    ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend);
    void (*GGML_CALL event_free) (ggml_backend_event_t event);
    void (*GGML_CALL event_record) (ggml_backend_event_t event);
    void (*GGML_CALL event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
    void (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
};

struct ggml_backend {
    ggml_guid_t guid;

    struct ggml_backend_i iface;
    ggml_backend_context_t context;
};

struct ggml_backend_event {
    ggml_backend_t backend;
    void * context;
};

//
// Backend registry
//

typedef ggml_backend_t (*GGML_CALL ggml_backend_init_fn)(const char * params, void * user_data);

GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);

#ifdef __cplusplus
}
#endif


@@ -1,5 +1,5 @@
/**
- * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
+ * llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
 *
 * MIT License
 *


@@ -1,5 +1,5 @@
/**
- * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
+ * llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
 *
 * MIT License
 *

llama/ggml-common.h (vendored, 2 changes)

@@ -1,5 +1,5 @@
/**
- * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
+ * llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
 *
 * MIT License
 *

llama/ggml-cuda.cu (vendored, 90 changes)

@@ -1,5 +1,5 @@
/**
- * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
+ * llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
 *
 * MIT License
 *
@@ -1377,30 +1377,10 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
    GGML_UNUSED(main_device);
}

-static cudaError_t ggml_cuda_Memcpy2DPeerAsync(
-    void * dst, int dstDevice, size_t dpitch, void * src, int srcDevice, size_t spitch, size_t width, size_t height, cudaStream_t stream) {
-
-#if !defined(GGML_USE_HIPBLAS)
-    // cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
-    cudaMemcpy3DPeerParms p = {};
-    p.dstDevice = dstDevice;
-    p.dstPtr = make_cudaPitchedPtr(dst, dpitch, dpitch, height);
-    p.srcDevice = srcDevice;
-    p.srcPtr = make_cudaPitchedPtr(src, spitch, spitch, height);
-    p.extent = make_cudaExtent(width, height, 1);
-    return cudaMemcpy3DPeerAsync(&p, stream);
-#else
-    // HIP does not support cudaMemcpy3DPeerAsync or vmm pools
-    GGML_UNUSED(dstDevice);
-    GGML_UNUSED(srcDevice);
-    return cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, cudaMemcpyDeviceToDevice, stream);
-#endif // !defined(GGML_USE_HIPBLAS)
-}
-
static void ggml_cuda_op_mul_mat(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
-    quantize_cuda_t quantize_src1) {
+    const bool convert_src1_to_q8_1) {

    const int64_t ne00 = src0->ne[0];
    const int64_t ne01 = src0->ne[1];
@@ -1457,9 +1437,7 @@ static void ggml_cuda_op_mul_mat(
    }

    struct dev_data {
-        int cc;
-
        ggml_cuda_pool_alloc<char>  src0_dd_alloc;
        ggml_cuda_pool_alloc<float> src1_ddf_alloc;
        ggml_cuda_pool_alloc<char>  src1_ddq_alloc;
        ggml_cuda_pool_alloc<float> dst_dd_alloc;
@@ -1478,8 +1456,6 @@ static void ggml_cuda_op_mul_mat(
    int used_devices = 0;

    for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
-        dev[id].cc = ggml_cuda_info().devices[id].cc;
-
        // by default, use all rows
        dev[id].row_low  = 0;
        dev[id].row_high = ne01;
@@ -1530,15 +1506,11 @@ static void ggml_cuda_op_mul_mat(
            dev[id].src1_ddf = dev[id].src1_ddf_alloc.alloc(ctx.pool(id), ggml_nelements(src1));
        }

-        if (quantize_src1) {
-            size_t src_1_ddq_size = nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs;
-            if (quantize_src1 == quantize_mmq_q8_1_cuda) {
-                src_1_ddq_size += get_mmq_x_max_host(dev[id].cc)*sizeof(block_q8_1_mmq);
-            }
-            dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), src_1_ddq_size);
+        if (convert_src1_to_q8_1) {
+            dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs);

            if (src1_on_device && src1_is_contiguous) {
-                quantize_src1(dev[id].src1_ddf, dev[id].src1_ddq, ne10, ne11, ne12*ne13, src1_padded_col_size, src0->type, stream);
+                quantize_row_q8_1_cuda(dev[id].src1_ddf, dev[id].src1_ddq, ne10, nrows1, src1_padded_col_size, stream);
                CUDA_CHECK(cudaGetLastError());
            }
        }
@@ -1584,12 +1556,7 @@ static void ggml_cuda_op_mul_mat(
                const int64_t i03 = i0 / ne12;
                const int64_t i02 = i0 % ne12;

-                size_t src1_ddq_i_offset = i0*ne11 * src1_padded_col_size*q8_1_ts/q8_1_bs;
-                if (quantize_src1 == quantize_mmq_q8_1_cuda) {
-                    src1_ddq_i_offset += src1_col_0 * sizeof(block_q8_1_mmq);
-                } else {
-                    src1_ddq_i_offset += src1_col_0 * src1_padded_col_size*q8_1_ts/q8_1_bs;
-                }
+                const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;

                // for split tensors the data begins at i0 == i0_offset_low
                char * src0_dd_i = dev[id].src0_dd + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
@@ -1606,17 +1573,10 @@ static void ggml_cuda_op_mul_mat(
                // copy src0, src1 to device if necessary
                if (src1_is_contiguous) {
                    if (id != ctx.device) {
-                        if (quantize_src1) {
+                        if (convert_src1_to_q8_1) {
                            char * src1_ddq_i_source = dev[ctx.device].src1_ddq + src1_ddq_i_offset;
-                            if (quantize_src1 == quantize_mmq_q8_1_cuda) {
-                                const size_t pitch = ne11*sizeof(block_q8_1_mmq);
-                                const size_t width = src1_ncols*sizeof(block_q8_1_mmq);
-                                const size_t height = src1_padded_col_size/(4*QK8_1);
-                                CUDA_CHECK(ggml_cuda_Memcpy2DPeerAsync(src1_ddq_i, id, pitch, src1_ddq_i_source, ctx.device, pitch, width, height, stream));
-                            } else {
-                                CUDA_CHECK(cudaMemcpyPeerAsync(
-                                    src1_ddq_i, id, src1_ddq_i_source, ctx.device, src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs, stream));
-                            }
+                            CUDA_CHECK(cudaMemcpyPeerAsync(src1_ddq_i, id, src1_ddq_i_source, ctx.device,
+                                src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs, stream));
                        } else {
                            float * src1_ddf_i_source = (float *) src1->data;
                            src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
@@ -1631,8 +1591,8 @@ static void ggml_cuda_op_mul_mat(
                    GGML_ASSERT(false);
                }

-                if (quantize_src1 && !src1_is_contiguous) {
-                    quantize_src1(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, 1, src1_padded_col_size, src0->type, stream);
+                if (convert_src1_to_q8_1 && !src1_is_contiguous) {
+                    quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
                    CUDA_CHECK(cudaGetLastError());
                }
@@ -1657,8 +1617,22 @@ static void ggml_cuda_op_mul_mat(
                    float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
                    GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
                    dhf_dst_i += src1_col_0*ne0 + dev[id].row_low;
-                    CUDA_CHECK(ggml_cuda_Memcpy2DPeerAsync(
-                        dhf_dst_i, ctx.device, ne0*sizeof(float), dst_dd_i, id, row_diff*sizeof(float), row_diff*sizeof(float), src1_ncols, stream));
+#if !defined(GGML_USE_HIPBLAS)
+                    // cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
+                    cudaMemcpy3DPeerParms p = {};
+                    p.dstDevice = ctx.device;
+                    p.dstPtr = make_cudaPitchedPtr(dhf_dst_i, ne0*sizeof(float), row_diff, src1_ncols);
+                    p.srcDevice = id;
+                    p.srcPtr = make_cudaPitchedPtr(dst_dd_i, row_diff*sizeof(float), row_diff, src1_ncols);
+                    p.extent = make_cudaExtent(row_diff*sizeof(float), src1_ncols, 1);
+                    CUDA_CHECK(cudaMemcpy3DPeerAsync(&p, stream));
+#else
+                    // HIP does not support cudaMemcpy3DPeerAsync or vmm pools
+                    CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float),
+                                                 dst_dd_i, row_diff*sizeof(float),
+                                                 row_diff*sizeof(float), src1_ncols,
+                                                 cudaMemcpyDeviceToDevice, stream));
+#endif
                } else {
                    float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
                    GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
@@ -1997,13 +1971,13 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
        // KQ + KQV multi-batch
        ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
    } else if (use_dequantize_mul_mat_vec) {
-        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, nullptr);
+        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
    } else if (use_mul_mat_vec_q) {
-        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, quantize_row_q8_1_cuda);
+        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
    } else if (use_mul_mat_q) {
-        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q, quantize_mmq_q8_1_cuda);
+        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
    } else {
-        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, nullptr);
+        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
    }
}

llama/ggml-cuda.h (vendored, 2 changes)

@@ -1,5 +1,5 @@
/**
- * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
+ * llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
 *
 * MIT License
 *


@ -1,47 +1,47 @@
#include "acc.cuh" #include "acc.cuh"
static __global__ void acc_f32(const float * x, const float * y, float * dst, const int ne,
    const int ne10, const int ne11, const int ne12,
    const int nb1, const int nb2, int offset) {
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i >= ne) {
        return;
    }
    int src1_idx = i - offset;
    int oz = src1_idx / nb2;
    int oy = (src1_idx - (oz * nb2)) / nb1;
    int ox = src1_idx % nb1;
    if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
        dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11];
    } else {
        dst[i] = x[i];
    }
}

static void acc_f32_cuda(const float * x, const float * y, float * dst, const int n_elements,
    const int ne10, const int ne11, const int ne12,
    const int nb1, const int nb2, const int offset, cudaStream_t stream) {
    int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
    acc_f32<<<num_blocks, CUDA_ACC_BLOCK_SIZE, 0, stream>>>(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset);
}

void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const ggml_tensor * src1 = dst->src[1];
    const float * src0_d = (const float *)src0->data;
    const float * src1_d = (const float *)src1->data;
    float * dst_d = (float *)dst->data;
    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported

    int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
    int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
    // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
    int offset = dst->op_params[3] / 4; // offset in float elements (op_params[3] is stored in bytes)

    acc_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, stream);
}
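For reference, a serial CPU sketch (illustrative only, not part of the tree) of the per-thread index math that acc_f32 performs above, assuming nb1, nb2 and offset have already been converted to float elements as in ggml_cuda_op_acc:

static void acc_f32_ref(const float * x, const float * y, float * dst, int ne,
                        int ne10, int ne11, int ne12, int nb1, int nb2, int offset) {
    for (int i = 0; i < ne; ++i) {
        const int src1_idx = i - offset;          // position of dst element i inside the src1 view
        const int oz = src1_idx / nb2;            // src1 slice
        const int oy = (src1_idx - oz*nb2) / nb1; // src1 row
        const int ox = src1_idx % nb1;            // src1 column
        if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
            dst[i] = x[i] + y[ox + oy*ne10 + oz*ne10*ne11];
        } else {
            dst[i] = x[i]; // outside the accumulated region: plain copy of src0
        }
    }
}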


@@ -1,5 +1,5 @@
#include "common.cuh"

#define CUDA_ACC_BLOCK_SIZE 256

void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst);


@@ -1,34 +1,34 @@
#include "arange.cuh"

static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) {
    // blockIdx.x: idx of ne0 / BLOCK_SIZE
    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
    if (nidx >= ne0) {
        return;
    }
    dst[nidx] = start + step * nidx;
}

static void arange_f32_cuda(float * dst, const int ne0, const float start, const float step, cudaStream_t stream) {
    int num_blocks = (ne0 + CUDA_ARANGE_BLOCK_SIZE - 1) / CUDA_ARANGE_BLOCK_SIZE;
    arange_f32<<<num_blocks, CUDA_ARANGE_BLOCK_SIZE, 0, stream>>>(dst, ne0, start, step);
}

void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    float * dst_d = (float *)dst->data;
    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(dst->type == GGML_TYPE_F32);

    float start;
    float stop;
    float step;
    memcpy(&start, (float *)dst->op_params + 0, sizeof(float));
    memcpy(&stop,  (float *)dst->op_params + 1, sizeof(float));
    memcpy(&step,  (float *)dst->op_params + 2, sizeof(float));

    int64_t steps = (int64_t)ceil((stop - start) / step);
    GGML_ASSERT(ggml_nelements(dst) == steps);

    arange_f32_cuda(dst_d, dst->ne[0], start, step, stream);
}
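A host-side sketch of the same arange semantics, illustrative only: the element count is ceil((stop - start) / step) and element i holds start + i * step, e.g. start 0, stop 5, step 2 gives 0, 2, 4.

#include <cmath>
#include <cstdint>
#include <vector>

static std::vector<float> arange_ref(float start, float stop, float step) {
    const int64_t n = (int64_t) std::ceil((stop - start) / step); // matches the steps assert above
    std::vector<float> out((size_t) n);
    for (int64_t i = 0; i < n; ++i) {
        out[(size_t) i] = start + step * i;
    }
    return out;
}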


@@ -1,5 +1,5 @@
#include "common.cuh"

#define CUDA_ARANGE_BLOCK_SIZE 256

void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst);


@@ -1,103 +1,103 @@
#include "argsort.cuh"

template<typename T>
static inline __device__ void ggml_cuda_swap(T & a, T & b) {
    T tmp = a;
    a = b;
    b = tmp;
}

template<ggml_sort_order order>
static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols, int ncols_pad) {
    // bitonic sort
    int col = threadIdx.x;
    int row = blockIdx.y;

    if (col >= ncols_pad) {
        return;
    }

    const float * x_row = x + row * ncols;
    extern __shared__ int dst_row[];

    // initialize indices
    dst_row[col] = col;

    __syncthreads();

    for (int k = 2; k <= ncols_pad; k *= 2) {
        for (int j = k / 2; j > 0; j /= 2) {
            int ixj = col ^ j;
            if (ixj > col) {
                if ((col & k) == 0) {
                    if (dst_row[col] >= ncols ||
                        (dst_row[ixj] < ncols && (order == GGML_SORT_ORDER_ASC ?
                            x_row[dst_row[col]] > x_row[dst_row[ixj]] :
                            x_row[dst_row[col]] < x_row[dst_row[ixj]]))
                    ) {
                        ggml_cuda_swap(dst_row[col], dst_row[ixj]);
                    }
                } else {
                    if (dst_row[ixj] >= ncols ||
                        (dst_row[col] < ncols && (order == GGML_SORT_ORDER_ASC ?
                            x_row[dst_row[col]] < x_row[dst_row[ixj]] :
                            x_row[dst_row[col]] > x_row[dst_row[ixj]]))
                    ) {
                        ggml_cuda_swap(dst_row[col], dst_row[ixj]);
                    }
                }
            }
            __syncthreads();
        }
    }

    // copy the result to dst without the padding
    if (col < ncols) {
        dst[row * ncols + col] = dst_row[col];
    }
}

static int next_power_of_2(int x) {
    int n = 1;
    while (n < x) {
        n *= 2;
    }
    return n;
}

static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
    // bitonic sort requires ncols to be power of 2
    const int ncols_pad = next_power_of_2(ncols);

    const dim3 block_dims(ncols_pad, 1, 1);
    const dim3 block_nums(1, nrows, 1);
    const size_t shared_mem = ncols_pad * sizeof(int);

    GGML_ASSERT(shared_mem <= ggml_cuda_info().devices[ggml_cuda_get_device()].smpb);

    if (order == GGML_SORT_ORDER_ASC) {
        k_argsort_f32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
    } else if (order == GGML_SORT_ORDER_DESC) {
        k_argsort_f32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
    } else {
        GGML_ASSERT(false);
    }
}

void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const float * src0_d = (const float *)src0->data;
    float * dst_d = (float *)dst->data;
    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_I32);
    GGML_ASSERT(ggml_is_contiguous(src0));

    const int64_t ncols = src0->ne[0];
    const int64_t nrows = ggml_nrows(src0);

    enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];

    argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream);
}
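For orientation, a host-side sketch (illustrative, not from the tree) of what k_argsort_f32_i32 computes per row. The kernel reaches the same result with an in-shared-memory bitonic network, padding each row to next_power_of_2(ncols) because a bitonic network only sorts power-of-two sizes; padded indices (>= ncols) are treated as losing every comparison, so they collect at the tail and are dropped when the first ncols entries are copied back.

#include <algorithm>
#include <numeric>
#include <vector>

// dst[j] = index of the j-th smallest (ascending) or largest (descending) element of the row
static std::vector<int> argsort_ref(const std::vector<float> & x, bool ascending) {
    std::vector<int> idx(x.size());
    std::iota(idx.begin(), idx.end(), 0);
    std::sort(idx.begin(), idx.end(), [&](int a, int b) {
        return ascending ? x[a] < x[b] : x[a] > x[b];
    });
    return idx;
}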


@@ -1,3 +1,3 @@
#include "common.cuh"

void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst);


@@ -1,280 +1,280 @@
#include "binbcast.cuh"

static __device__ __forceinline__ float op_repeat(const float a, const float b) {
    return b;
    GGML_UNUSED(a);
}

static __device__ __forceinline__ float op_add(const float a, const float b) {
    return a + b;
}

static __device__ __forceinline__ float op_mul(const float a, const float b) {
    return a * b;
}

static __device__ __forceinline__ float op_div(const float a, const float b) {
    return a / b;
}

template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
        int ne0, int ne1, int ne2, int ne3,
        int ne10, int ne11, int ne12, int ne13,
        /*int s0, */ int s1, int s2, int s3,
        /*int s00,*/ int s01, int s02, int s03,
        /*int s10,*/ int s11, int s12, int s13) {
    const int i0s = blockDim.x*blockIdx.x + threadIdx.x;
    const int i1 = (blockDim.y*blockIdx.y + threadIdx.y);
    const int i2 = (blockDim.z*blockIdx.z + threadIdx.z) / ne3;
    const int i3 = (blockDim.z*blockIdx.z + threadIdx.z) % ne3;

    if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
        return;
    }

    const int i11 = i1 % ne11;
    const int i12 = i2 % ne12;
    const int i13 = i3 % ne13;

    const size_t i_src0 =  i3*s03 +  i2*s02 +  i1*s01;
    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
    const size_t i_dst  =  i3*s3  +  i2*s2  +  i1*s1;

    const src0_t * src0_row = src0 + i_src0;
    const src1_t * src1_row = src1 + i_src1;
    dst_t * dst_row = dst + i_dst;

    for (int i0 = i0s; i0 < ne0; i0 += blockDim.x*gridDim.x) {
        const int i10 = i0 % ne10;
        dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
    }
}

template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
        int ne0, int ne1, int ne2, int ne3,
        int ne10, int ne11, int ne12, int ne13,
        /*int s0, */ int s1, int s2, int s3,
        /*int s00,*/ int s01, int s02, int s03,
        /*int s10,*/ int s11, int s12, int s13) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;

    const int i3 = i/(ne2*ne1*ne0);
    const int i2 = (i/(ne1*ne0)) % ne2;
    const int i1 = (i/ne0) % ne1;
    const int i0 = i % ne0;

    if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
        return;
    }

    const int i11 = i1 % ne11;
    const int i12 = i2 % ne12;
    const int i13 = i3 % ne13;

    const size_t i_src0 =  i3*s03 +  i2*s02 +  i1*s01;
    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
    const size_t i_dst  =  i3*s3  +  i2*s2  +  i1*s1;

    const src0_t * src0_row = src0 + i_src0;
    const src1_t * src1_row = src1 + i_src1;
    dst_t * dst_row = dst + i_dst;

    const int i10 = i0 % ne10;
    dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
}

template<float (*bin_op)(const float, const float)>
struct bin_bcast_cuda {
    template<typename src0_t, typename src1_t, typename dst_t>
    void operator()(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst,
            const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd,
            cudaStream_t stream) {

        GGML_TENSOR_BINARY_OP_LOCALS

        int nr0 = ne10/ne0;
        int nr1 = ne11/ne1;
        int nr2 = ne12/ne2;
        int nr3 = ne13/ne3;

        int nr[4] = { nr0, nr1, nr2, nr3 };

        // collapse dimensions until first broadcast dimension
        int64_t cne[] = {ne0, ne1, ne2, ne3};
        int64_t cne0[] = {ne00, ne01, ne02, ne03};
        int64_t cne1[] = {ne10, ne11, ne12, ne13};

        size_t cnb[] = {nb0, nb1, nb2, nb3};
        size_t cnb0[] = {nb00, nb01, nb02, nb03};
        size_t cnb1[] = {nb10, nb11, nb12, nb13};

        auto collapse = [](int64_t cne[]) {
            cne[0] *= cne[1];
            cne[1] = cne[2];
            cne[2] = cne[3];
            cne[3] = 1;
        };

        auto collapse_nb = [](size_t cnb[], const int64_t cne[]) {
            cnb[1] *= cne[1];
            cnb[2] *= cne[2];
            cnb[3] *= cne[3];
        };

        if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) {
            for (int i = 0; i < 4; i++) {
                if (nr[i] != 1) {
                    break;
                }
                if (i > 0) {
                    collapse_nb(cnb, cne);
                    collapse_nb(cnb0, cne0);
                    collapse_nb(cnb1, cne1);
                    collapse(cne);
                    collapse(cne0);
                    collapse(cne1);
                }
            }
        }

        {
            int64_t ne0 = cne[0];
            int64_t ne1 = cne[1];
            int64_t ne2 = cne[2];
            int64_t ne3 = cne[3];

            //int64_t ne00 = cne0[0]; GGML_UNUSED(ne00);
            //int64_t ne01 = cne0[1]; GGML_UNUSED(ne01);
            //int64_t ne02 = cne0[2]; GGML_UNUSED(ne02);
            //int64_t ne03 = cne0[3]; GGML_UNUSED(ne03);

            int64_t ne10 = cne1[0];
            int64_t ne11 = cne1[1];
            int64_t ne12 = cne1[2];
            int64_t ne13 = cne1[3];

            size_t nb0 = cnb[0];
            size_t nb1 = cnb[1];
            size_t nb2 = cnb[2];
            size_t nb3 = cnb[3];

            size_t nb00 = cnb0[0];
            size_t nb01 = cnb0[1];
            size_t nb02 = cnb0[2];
            size_t nb03 = cnb0[3];

            size_t nb10 = cnb1[0];
            size_t nb11 = cnb1[1];
            size_t nb12 = cnb1[2];
            size_t nb13 = cnb1[3];

            size_t s0 = nb0 / sizeof(dst_t);
            size_t s1 = nb1 / sizeof(dst_t);
            size_t s2 = nb2 / sizeof(dst_t);
            size_t s3 = nb3 / sizeof(dst_t);

            size_t s10 = nb10 / sizeof(src1_t);
            size_t s11 = nb11 / sizeof(src1_t);
            size_t s12 = nb12 / sizeof(src1_t);
            size_t s13 = nb13 / sizeof(src1_t);

            size_t s00 = nb00 / sizeof(src0_t);
            size_t s01 = nb01 / sizeof(src0_t);
            size_t s02 = nb02 / sizeof(src0_t);
            size_t s03 = nb03 / sizeof(src0_t);

            GGML_ASSERT(nb0 % sizeof(dst_t) == 0);
            GGML_ASSERT(nb1 % sizeof(dst_t) == 0);
            GGML_ASSERT(nb2 % sizeof(dst_t) == 0);
            GGML_ASSERT(nb3 % sizeof(dst_t) == 0);

            GGML_ASSERT(nb00 % sizeof(src0_t) == 0);
            GGML_ASSERT(nb01 % sizeof(src0_t) == 0);
            GGML_ASSERT(nb02 % sizeof(src0_t) == 0);
            GGML_ASSERT(nb03 % sizeof(src0_t) == 0);

            GGML_ASSERT(nb10 % sizeof(src1_t) == 0);
            GGML_ASSERT(nb11 % sizeof(src1_t) == 0);
            GGML_ASSERT(nb12 % sizeof(src1_t) == 0);
            GGML_ASSERT(nb13 % sizeof(src1_t) == 0);

            GGML_ASSERT(s0 == 1);
            GGML_ASSERT(s00 == 1);
            GGML_ASSERT(s10 == 1);

            const int block_size = 128;

            int64_t hne0 = std::max(ne0/2LL, 1LL);

            dim3 block_dims;
            block_dims.x = std::min<unsigned int>(hne0, block_size);
            block_dims.y = std::min<unsigned int>(ne1, block_size / block_dims.x);
            block_dims.z = std::min(std::min<unsigned int>(ne2*ne3, block_size / block_dims.x / block_dims.y), 64U);

            dim3 block_nums(
                (hne0 + block_dims.x - 1) / block_dims.x,
                (ne1 + block_dims.y - 1) / block_dims.y,
                (ne2*ne3 + block_dims.z - 1) / block_dims.z
            );

            if (block_nums.z > 65535) {
                // this is the maximum number of blocks in z dimension, fallback to 1D grid kernel
                int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
                k_bin_bcast_unravel<bin_op><<<block_num, block_size, 0, stream>>>(
                    src0_dd, src1_dd, dst_dd,
                    ne0, ne1, ne2, ne3,
                    ne10, ne11, ne12, ne13,
                    /* s0, */ s1, s2, s3,
                    /* s00, */ s01, s02, s03,
                    /* s10, */ s11, s12, s13);
            } else {
                k_bin_bcast<bin_op><<<block_nums, block_dims, 0, stream>>>(
                    src0_dd, src1_dd, dst_dd,
                    ne0, ne1, ne2, ne3,
                    ne10, ne11, ne12, ne13,
                    /* s0, */ s1, s2, s3,
                    /* s00, */ s01, s02, s03,
                    /* s10, */ s11, s12, s13);
            }
        }
    }
};

template<class op>
static void ggml_cuda_op_bin_bcast(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const void * src0_dd, const void * src1_dd, void * dst_dd, cudaStream_t stream) {

    GGML_ASSERT(src1->type == GGML_TYPE_F32);

    if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
        op()(src0, src1, dst, (const float *)src0_dd, (const float *)src1_dd, (float *)dst_dd, stream);
    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
        op()(src0, src1, dst, (const half *) src0_dd, (const float *)src1_dd, (half *) dst_dd, stream);
    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
        op()(src0, src1, dst, (const half *) src0_dd, (const float *)src1_dd, (float *)dst_dd, stream);
    } else {
        fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
            ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
        GGML_ASSERT(false);
    }
}

void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_repeat>>(dst, dst->src[0], dst, nullptr, dst->src[0]->data, dst->data, ctx.stream());
}

void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
}

void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
}

void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_div>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
}
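An illustrative 2D sketch (not from the tree) of the broadcast rule the kernels above implement: each src1 coordinate is reduced modulo its extent, so any src1 dimension of size 1 is repeated across the corresponding dst dimension. The contiguous-case dimension collapsing only merges leading dimensions that are not broadcast (nr[i] == 1), which gives the kernel longer contiguous rows to work on.

#include <cstdint>

static void add_bcast2d_ref(const float * a, const float * b, float * dst,
                            int64_t ne0, int64_t ne1,     // dst / src0 extents
                            int64_t ne10, int64_t ne11) { // src1 extents, each either 1 or equal to the dst extent
    for (int64_t i1 = 0; i1 < ne1; ++i1) {
        for (int64_t i0 = 0; i0 < ne0; ++i0) {
            dst[i1*ne0 + i0] = a[i1*ne0 + i0] + b[(i1 % ne11)*ne10 + (i0 % ne10)];
        }
    }
}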


@@ -1,6 +1,6 @@
#include "common.cuh"

void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst);


@@ -1,5 +1,5 @@
#include "common.cuh"

#define CUDA_CLAMP_BLOCK_SIZE 256

void ggml_cuda_op_clamp(ggml_backend_cuda_context & ctx, ggml_tensor * dst);


@@ -1,5 +1,5 @@
#include "common.cuh"

#define CUDA_CONCAT_BLOCK_SIZE 256

void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst);


@@ -1,13 +1,13 @@
#include "common.cuh"

#define CUDA_DEQUANTIZE_BLOCK_SIZE 256

template<typename T>
using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int64_t k, cudaStream_t stream);

typedef to_t_cuda_t<float> to_fp32_cuda_t;
typedef to_t_cuda_t<half>  to_fp16_cuda_t;

to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type);
to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type);
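A hedged usage sketch of the lookup declared above (buffer allocation is omitted and the null-return behavior for unsupported types is an assumption): the returned function pointer launches an asynchronous dequantize of k elements on the given stream.

// illustrative only: d_quant and d_f32 are assumed pre-allocated device pointers
static void dequantize_to_f32_example(ggml_type type, const void * d_quant, float * d_f32,
                                      int64_t k, cudaStream_t stream) {
    const to_fp32_cuda_t to_fp32 = ggml_get_to_fp32_cuda(type);
    GGML_ASSERT(to_fp32 != nullptr); // assumed: not every type has a converter
    to_fp32(d_quant, d_f32, k, stream);
}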


@@ -1,103 +1,103 @@
#include "common.cuh"

static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
    const block_q4_0 * x = (const block_q4_0 *) vx;

    const dfloat d = x[ib].d;

    const int vui = x[ib].qs[iqs];

    v.x = vui & 0xF;
    v.y = vui >> 4;

#ifdef GGML_CUDA_F16
    v = __hsub2(v, {8.0f, 8.0f});
    v = __hmul2(v, {d, d});
#else
    v.x = (v.x - 8.0f) * d;
    v.y = (v.y - 8.0f) * d;
#endif // GGML_CUDA_F16
}

static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
    const block_q4_1 * x = (const block_q4_1 *) vx;

    const dfloat d = __low2half(x[ib].dm);
    const dfloat m = __high2half(x[ib].dm);

    const int vui = x[ib].qs[iqs];

    v.x = vui & 0xF;
    v.y = vui >> 4;

#ifdef GGML_CUDA_F16
    v = __hmul2(v, {d, d});
    v = __hadd2(v, {m, m});
#else
    v.x = (v.x * d) + m;
    v.y = (v.y * d) + m;
#endif // GGML_CUDA_F16
}

static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
    const block_q5_0 * x = (const block_q5_0 *) vx;

    const dfloat d = x[ib].d;

    uint32_t qh;
    memcpy(&qh, x[ib].qh, sizeof(qh));

    const int xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
    const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;

    v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
    v.y = ((x[ib].qs[iqs] >>  4) | xh_1);

#ifdef GGML_CUDA_F16
    v = __hsub2(v, {16.0f, 16.0f});
    v = __hmul2(v, {d, d});
#else
    v.x = (v.x - 16.0f) * d;
    v.y = (v.y - 16.0f) * d;
#endif // GGML_CUDA_F16
}

static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
    const block_q5_1 * x = (const block_q5_1 *) vx;

    const dfloat d = __low2half(x[ib].dm);
    const dfloat m = __high2half(x[ib].dm);

    uint32_t qh;
    memcpy(&qh, x[ib].qh, sizeof(qh));

    const int xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
    const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;

    v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
    v.y = ((x[ib].qs[iqs] >>  4) | xh_1);

#ifdef GGML_CUDA_F16
    v = __hmul2(v, {d, d});
    v = __hadd2(v, {m, m});
#else
    v.x = (v.x * d) + m;
    v.y = (v.y * d) + m;
#endif // GGML_CUDA_F16
}

static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
    const block_q8_0 * x = (const block_q8_0 *) vx;

    const dfloat d = x[ib].d;

    v.x = x[ib].qs[iqs + 0];
    v.y = x[ib].qs[iqs + 1];

#ifdef GGML_CUDA_F16
    v = __hmul2(v, {d, d});
#else
    v.x *= d;
    v.y *= d;
#endif // GGML_CUDA_F16
}
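For reference, the scalar formulas behind the block dequantizers above (an illustrative summary, following the #else branches): q4_0 packs two 4-bit quants per byte and reconstructs w = (q - 8) * d; q4_1 uses w = q * d + m; q5_0 and q5_1 pull a fifth bit from qh before applying the same step with an offset of 16; q8_0 is simply w = q * d, where d is the per-block scale and m the per-block minimum.

#include <cstdint>

// scalar view of dequantize_q4_0 for one packed byte (illustrative only)
static float dequant_q4_0_scalar(float d, uint8_t packed, bool high_nibble) {
    const int q = high_nibble ? (packed >> 4) : (packed & 0xF); // two quants per byte
    return (q - 8) * d;
}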


@@ -1,40 +1,40 @@
#include "diagmask.cuh"

static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
    const int col = blockDim.y*blockIdx.y + threadIdx.y;
    const int row = blockDim.x*blockIdx.x + threadIdx.x;

    if (col >= ncols) {
        return;
    }

    const int i = row*ncols + col;
    //dst[i] = col > (n_past + row % rows_per_channel) ? -INFINITY : x[i];
    //dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
    dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX;
}

static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
    const dim3 block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1);
    const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
    const dim3 block_nums(nrows_x, block_num_x, 1);
    diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
}

void ggml_cuda_op_diag_mask_inf(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const float * src0_d = (const float *)src0->data;
    float * dst_d = (float *)dst->data;
    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    const int64_t ne00 = src0->ne[0];
    const int64_t ne01 = src0->ne[1];
    const int nrows0 = ggml_nrows(src0);

    const int n_past = ((int32_t *) dst->op_params)[0];

    diag_mask_inf_f32_cuda(src0_d, dst_d, ne00, nrows0, ne01, n_past, stream);
}
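An illustrative scalar view of the masking rule above: rather than writing -INFINITY, the kernel subtracts FLT_MAX from every logit whose column lies beyond the position the row may attend to, which (per the comment in the kernel) is equivalent within rounding error once softmax is applied.

#include <cfloat>

static float diag_mask_ref(float x, int row, int col, int rows_per_channel, int n_past) {
    const bool masked = col > n_past + row % rows_per_channel;
    return masked ? x - FLT_MAX : x; // same effect as the branchless form in the kernel
}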


@@ -1,5 +1,5 @@
#include "common.cuh"

#define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32

void ggml_cuda_op_diag_mask_inf(ggml_backend_cuda_context & ctx, ggml_tensor * dst);


@@ -1,18 +1,18 @@
#include "common.cuh"

// dmmv = dequantize_mul_mat_vec

// TODO: remove this?
#ifndef GGML_CUDA_DMMV_X
#define GGML_CUDA_DMMV_X 32
#endif

#ifndef GGML_CUDA_MMV_Y
#define GGML_CUDA_MMV_Y 1
#endif

void ggml_cuda_op_dequantize_mul_mat_vec(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
    const int64_t src1_padded_row_size, cudaStream_t stream);


@@ -1,178 +1,178 @@
#include "getrows.cuh"
#include "dequantize.cuh"

template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
static __global__ void k_get_rows(
        const void * src0, const int32_t * src1, dst_t * dst,
        int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
        /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
        /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
        /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
        size_t s10, size_t s11, size_t s12/*, size_t s13*/) {

    const int i00 = (blockIdx.x*blockDim.x + threadIdx.x)*2;
    const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
    const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
    const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;

    if (i00 >= ne00) {
        return;
    }

    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];

    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
    const void * src0_row = (const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03;

    const int ib = i00/qk;        // block index
    const int iqs = (i00%qk)/qr;  // quant index
    const int iybs = i00 - i00%qk; // dst block start index
    const int y_offset = qr == 1 ? 1 : qk/2;

    // dequantize
    dfloat2 v;
    dequantize_kernel(src0_row, ib, iqs, v);

    dst_row[iybs + iqs + 0]        = v.x;
    dst_row[iybs + iqs + y_offset] = v.y;
}

template<typename src0_t, typename dst_t>
static __global__ void k_get_rows_float(
        const src0_t * src0, const int32_t * src1, dst_t * dst,
        int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
        /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
        /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
        /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
        size_t s10, size_t s11, size_t s12/*, size_t s13*/) {

    const int i00 = blockIdx.x*blockDim.x + threadIdx.x;
    const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
    const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
    const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;

    if (i00 >= ne00) {
        return;
    }

    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];

    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
    const src0_t * src0_row = (const src0_t *)((const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03);

    dst_row[i00] = src0_row[i00];
}

template<int qk, int qr, dequantize_kernel_t dq>
static void get_rows_cuda(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
        const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {

    GGML_TENSOR_BINARY_OP_LOCALS

    const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
    const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
    const dim3 block_nums(block_num_x, ne10, ne11*ne12);

    // strides in elements
    //const size_t s0 = nb0 / ggml_element_size(dst);
    const size_t s1 = nb1 / ggml_element_size(dst);
    const size_t s2 = nb2 / ggml_element_size(dst);
    const size_t s3 = nb3 / ggml_element_size(dst);

    const size_t s10 = nb10 / ggml_element_size(src1);
    const size_t s11 = nb11 / ggml_element_size(src1);
    const size_t s12 = nb12 / ggml_element_size(src1);
    //const size_t s13 = nb13 / ggml_element_size(src1);

    GGML_ASSERT(ne00 % 2 == 0);

    k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(
        src0_dd, src1_dd, dst_dd,
        ne00, /*ne01, ne02, ne03,*/
        /*ne10, ne11,*/ ne12, /*ne13,*/
        /* s0,*/ s1, s2, s3,
        /* nb00,*/ nb01, nb02, nb03,
        s10, s11, s12/*, s13*/);

    GGML_UNUSED(dst);
}

template<typename src0_t>
static void get_rows_cuda_float(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
        const src0_t * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {

    GGML_TENSOR_BINARY_OP_LOCALS

    const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
    const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
    const dim3 block_nums(block_num_x, ne10, ne11*ne12);

    // strides in elements
    //const size_t s0 = nb0 / ggml_element_size(dst);
    const size_t s1 = nb1 / ggml_element_size(dst);
    const size_t s2 = nb2 / ggml_element_size(dst);
    const size_t s3 = nb3 / ggml_element_size(dst);

    const size_t s10 = nb10 / ggml_element_size(src1);
    const size_t s11 = nb11 / ggml_element_size(src1);
    const size_t s12 = nb12 / ggml_element_size(src1);
    //const size_t s13 = nb13 / ggml_element_size(src1);

    k_get_rows_float<<<block_nums, block_dims, 0, stream>>>(
        src0_dd, src1_dd, dst_dd,
        ne00, /*ne01, ne02, ne03,*/
        /*ne10, ne11,*/ ne12, /*ne13,*/
        /* s0,*/ s1, s2, s3,
        /* nb00,*/ nb01, nb02, nb03,
        s10, s11, s12/*, s13*/);

    GGML_UNUSED(dst);
}

void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const ggml_tensor * src1 = dst->src[1];
    const float * src0_d = (const float *)src0->data;
    const float * src1_d = (const float *)src1->data;
    float * dst_d = (float *)dst->data;

    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(src1->type == GGML_TYPE_I32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);

    GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
    GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type));
    GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type));

    const int32_t * src1_i32 = (const int32_t *) src1_d;

    switch (src0->type) {
        case GGML_TYPE_F16:
            get_rows_cuda_float(src0, src1, dst, (const half *)src0_d, src1_i32, dst_d, stream);
            break;
        case GGML_TYPE_F32:
            get_rows_cuda_float(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
            break;
        case GGML_TYPE_Q4_0:
            get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
            break;
        case GGML_TYPE_Q4_1:
            get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
            break;
        case GGML_TYPE_Q5_0:
            get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
            break;
        case GGML_TYPE_Q5_1:
            get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
            break;
        case GGML_TYPE_Q8_0:
            get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
            break;
        default:
            // TODO: k-quants
            fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
            GGML_ASSERT(false);
            break;
    }
}
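A contiguous-2D CPU sketch (illustrative, not from the tree) of the operation implemented above for the float path: dst row j is a copy of src0 row src1[j]; the quantized path is the same except each block is dequantized on the way out.

#include <cstdint>

static void get_rows_ref(const float * src0, const int32_t * rows, float * dst,
                         int64_t ncols, int64_t nrows_out) {
    for (int64_t j = 0; j < nrows_out; ++j) {
        const int32_t r = rows[j]; // row index supplied by src1
        for (int64_t c = 0; c < ncols; ++c) {
            dst[j*ncols + c] = src0[r*ncols + c];
        }
    }
}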


@@ -1,5 +1,5 @@
#include "common.cuh"

#define CUDA_GET_ROWS_BLOCK_SIZE 256

void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);


@ -1,104 +1,104 @@
#include "im2col.cuh" #include "im2col.cuh"
template <typename T> template <typename T>
static __global__ void im2col_kernel( static __global__ void im2col_kernel(
const float * x, T * dst, int64_t batch_offset, const float * x, T * dst, int64_t batch_offset,
int64_t offset_delta, int64_t IC, int64_t IW, int64_t IH, int64_t OH, int64_t OW, int64_t KW, int64_t KH, int64_t pelements, int64_t CHW, int64_t offset_delta, int64_t IC, int64_t IW, int64_t IH, int64_t OH, int64_t OW, int64_t KW, int64_t KH, int64_t pelements, int64_t CHW,
int s0, int s1, int p0, int p1, int d0, int d1) { int s0, int s1, int p0, int p1, int d0, int d1) {
const int64_t i = threadIdx.x + blockIdx.x * blockDim.x; const int64_t i = threadIdx.x + blockIdx.x * blockDim.x;
if (i >= pelements) { if (i >= pelements) {
return; return;
} }
const int64_t ksize = OW * (KH > 1 ? KW : 1); const int64_t ksize = OW * (KH > 1 ? KW : 1);
const int64_t kx = i / ksize; const int64_t kx = i / ksize;
const int64_t kd = kx * ksize; const int64_t kd = kx * ksize;
const int64_t ky = (i - kd) / OW; const int64_t ky = (i - kd) / OW;
const int64_t ix = i % OW; const int64_t ix = i % OW;
const int64_t oh = blockIdx.y; const int64_t oh = blockIdx.y;
const int64_t batch = blockIdx.z / IC; const int64_t batch = blockIdx.z / IC;
const int64_t ic = blockIdx.z % IC; const int64_t ic = blockIdx.z % IC;
const int64_t iiw = ix * s0 + kx * d0 - p0; const int64_t iiw = ix * s0 + kx * d0 - p0;
const int64_t iih = oh * s1 + ky * d1 - p1; const int64_t iih = oh * s1 + ky * d1 - p1;
const int64_t offset_dst = const int64_t offset_dst =
((batch * OH + oh) * OW + ix) * CHW + ((batch * OH + oh) * OW + ix) * CHW +
(ic * (KW * KH) + ky * KW + kx); (ic * (KW * KH) + ky * KW + kx);
if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) { if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
dst[offset_dst] = 0.0f; dst[offset_dst] = 0.0f;
} else { } else {
const int64_t offset_src = ic * offset_delta + batch * batch_offset; const int64_t offset_src = ic * offset_delta + batch * batch_offset;
dst[offset_dst] = x[offset_src + iih * IW + iiw]; dst[offset_dst] = x[offset_src + iih * IW + iiw];
} }
} }
template <typename T> template <typename T>
static void im2col_cuda(const float * x, T* dst, static void im2col_cuda(const float * x, T* dst,
int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC, int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
int64_t batch, int64_t batch_offset, int64_t offset_delta, int64_t batch, int64_t batch_offset, int64_t offset_delta,
int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) { int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
const int parallel_elements = OW * KW * KH; const int parallel_elements = OW * KW * KH;
const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE; const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
dim3 block_nums(num_blocks, OH, batch * IC); dim3 block_nums(num_blocks, OH, batch * IC);
im2col_kernel<<<block_nums, CUDA_IM2COL_BLOCK_SIZE, 0, stream>>>(x, dst, batch_offset, offset_delta, IC, IW, IH, OH, OW, KW, KH, parallel_elements, (IC * KH * KW), s0, s1, p0, p1, d0, d1); im2col_kernel<<<block_nums, CUDA_IM2COL_BLOCK_SIZE, 0, stream>>>(x, dst, batch_offset, offset_delta, IC, IW, IH, OH, OW, KW, KH, parallel_elements, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
} }
static void im2col_cuda_f16(const float * x, half * dst, static void im2col_cuda_f16(const float * x, half * dst,
int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC, int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
int64_t batch, int64_t batch_offset, int64_t offset_delta, int64_t batch, int64_t batch_offset, int64_t offset_delta,
int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) { int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
im2col_cuda<half>(x, dst, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, offset_delta, s0, s1, p0, p1, d0, d1, stream); im2col_cuda<half>(x, dst, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, offset_delta, s0, s1, p0, p1, d0, d1, stream);
} }
static void im2col_cuda_f32(const float * x, float * dst, static void im2col_cuda_f32(const float * x, float * dst,
int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC, int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
int64_t batch, int64_t batch_offset, int64_t offset_delta, int64_t batch, int64_t batch_offset, int64_t offset_delta,
int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) { int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
im2col_cuda<float>(x, dst, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, offset_delta, s0, s1, p0, p1, d0, d1, stream); im2col_cuda<float>(x, dst, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, offset_delta, s0, s1, p0, p1, d0, d1, stream);
} }
void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1]; const ggml_tensor * src1 = dst->src[1];
const float * src1_d = (const float *)src1->data; const float * src1_d = (const float *)src1->data;
float * dst_d = (float *)dst->data; float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream(); cudaStream_t stream = ctx.stream();
GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
const int32_t s1 = ((const int32_t*)(dst->op_params))[1]; const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
const int32_t p0 = ((const int32_t*)(dst->op_params))[2]; const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
const int32_t p1 = ((const int32_t*)(dst->op_params))[3]; const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
const int32_t d0 = ((const int32_t*)(dst->op_params))[4]; const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
const int32_t d1 = ((const int32_t*)(dst->op_params))[5]; const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1; const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
const int64_t IC = src1->ne[is_2D ? 2 : 1]; const int64_t IC = src1->ne[is_2D ? 2 : 1];
const int64_t IH = is_2D ? src1->ne[1] : 1; const int64_t IH = is_2D ? src1->ne[1] : 1;
const int64_t IW = src1->ne[0]; const int64_t IW = src1->ne[0];
const int64_t KH = is_2D ? src0->ne[1] : 1; const int64_t KH = is_2D ? src0->ne[1] : 1;
const int64_t KW = src0->ne[0]; const int64_t KW = src0->ne[0];
const int64_t OH = is_2D ? dst->ne[2] : 1; const int64_t OH = is_2D ? dst->ne[2] : 1;
const int64_t OW = dst->ne[1]; const int64_t OW = dst->ne[1];
const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32 const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
const int64_t batch = src1->ne[3]; const int64_t batch = src1->ne[3];
const size_t batch_offset = src1->nb[3] / 4; // nb is byte offset, src is type float32 const size_t batch_offset = src1->nb[3] / 4; // nb is byte offset, src is type float32
if(dst->type == GGML_TYPE_F16) { if(dst->type == GGML_TYPE_F16) {
im2col_cuda_f16(src1_d, (half *) dst_d, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, stream); im2col_cuda_f16(src1_d, (half *) dst_d, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, stream);
} else { } else {
im2col_cuda_f32(src1_d, (float *) dst_d, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, stream); im2col_cuda_f32(src1_d, (float *) dst_d, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, stream);
} }
} }
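
For reference, the s0/s1, p0/p1, d0/d1 values unpacked from dst->op_params above are the usual stride/padding/dilation triple, and the output extents read from dst->ne follow the standard convolution size formula. The helper below is a hedged, host-side sketch of that arithmetic; it is not part of this diff and the name is illustrative only.

#include <cstdint>

// Standard conv/im2col output extent, assuming the usual (stride s, padding p,
// dilation d) convention that the op_params above appear to follow.
static inline int64_t conv_out_size(int64_t in, int64_t kernel, int s, int p, int d) {
    return (in + 2*p - d*(kernel - 1) - 1) / s + 1;
}
// With that convention, OW == conv_out_size(IW, KW, s0, p0, d0) and,
// in the 2D case, OH == conv_out_size(IH, KH, s1, p1, d1).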

View File

@@ -1,5 +1,5 @@
#include "common.cuh" #include "common.cuh"
#define CUDA_IM2COL_BLOCK_SIZE 256 #define CUDA_IM2COL_BLOCK_SIZE 256
void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

View File

@@ -11,7 +11,6 @@ void ggml_cuda_op_mul_mat_q(
const int64_t nb01 = src0->nb[1]; const int64_t nb01 = src0->nb[1];
const int64_t ne10 = src1->ne[0]; const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];
GGML_ASSERT(ne10 % QK8_1 == 0); GGML_ASSERT(ne10 % QK8_1 == 0);
const int64_t ne0 = dst->ne[0]; const int64_t ne0 = dst->ne[0];
@@ -26,7 +25,7 @@ void ggml_cuda_op_mul_mat_q(
// nrows_dst == nrows of the matrix that the kernel writes into // nrows_dst == nrows of the matrix that the kernel writes into
const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff; const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;
const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst}; const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, nrows_dst};
switch (src0->type) { switch (src0->type) {
case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_0:

View File

@@ -1,26 +1,15 @@
#pragma once
#include "common.cuh" #include "common.cuh"
#include "vecdotq.cuh" #include "vecdotq.cuh"
#include <climits> #include <climits>
#include <cstdint> #include <cstdint>
#define MMQ_TILE_Y_K (WARP_SIZE + WARP_SIZE/QI8_1)
typedef void (*load_tiles_mmq_t)( typedef void (*load_tiles_mmq_t)(
const char * __restrict__ x, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, const char * __restrict__ x, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
int * __restrict__ x_sc, const int & kbx0, const int & i_max, const int & stride); int * __restrict__ x_sc, const int & kbx0, const int & i_max, const int & stride);
typedef void (*vec_dot_mmq_t)( typedef void (*vec_dot_mmq_t)(
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y, float * __restrict__ sum, const int & k0); const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, float * __restrict__ sum, const int & k0);
struct block_q8_1_mmq {
half2 ds[4];
int8_t qs[4*QK8_1];
};
static_assert(sizeof(block_q8_1_mmq) == 4*QK8_1 + 4*sizeof(half2), "Unexpected block_q8_1_mmq size");
static_assert(sizeof(block_q8_1_mmq) == 4*sizeof(block_q8_1), "Unexpected block_q8_1_mmq size");
struct tile_x_sizes { struct tile_x_sizes {
int ql; int ql;
@@ -143,14 +132,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
template <int mmq_x, int mmq_y, int nwarps> template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q4_0_q8_1_mul_mat( static __device__ __forceinline__ void vec_dot_q4_0_q8_1_mul_mat(
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y, float * __restrict__ sum, const int & k0) { const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
const float * x_dmf = (const float *) x_dm;
const int * y_qs = (const int *) y + 4;
const half2 * y_ds = (const half2 *) y;
#pragma unroll #pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
const int j = j0 + threadIdx.y; const int j = j0 + threadIdx.y;
@@ -160,18 +145,19 @@ static __device__ __forceinline__ void vec_dot_q4_0_q8_1_mul_mat(
const int i = i0 + threadIdx.x; const int i = i0 + threadIdx.x;
const int kyqs = k0 % (QI8_1/2) + QI8_1 * (k0 / (QI8_1/2)); const int kyqs = k0 % (QI8_1/2) + QI8_1 * (k0 / (QI8_1/2));
const float * x_dmf = (const float *) x_dm;
int u[2*VDR_Q4_0_Q8_1_MMQ]; int u[2*VDR_Q4_0_Q8_1_MMQ];
#pragma unroll #pragma unroll
for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) { for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) {
u[2*l+0] = y_qs[j*MMQ_TILE_Y_K + (kyqs + l) % WARP_SIZE]; u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
u[2*l+1] = y_qs[j*MMQ_TILE_Y_K + (kyqs + l + QI4_0) % WARP_SIZE]; u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE];
} }
sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ> sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>
(&x_ql[i*(WARP_SIZE + 1) + k0], u, x_dmf[i*(WARP_SIZE/QI4_0) + i/QI4_0 + k0/QI4_0], (&x_ql[i * (WARP_SIZE + 1) + k0], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k0/QI4_0],
y_ds[j*MMQ_TILE_Y_K + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]); y_ds[j * (WARP_SIZE/QI8_1) + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]);
} }
} }
} }
@@ -217,13 +203,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
template <int mmq_x, int mmq_y, int nwarps> template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q4_1_q8_1_mul_mat( static __device__ __forceinline__ void vec_dot_q4_1_q8_1_mul_mat(
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y, float * __restrict__ sum, const int & k0) { const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
const int * y_qs = (const int *) y + 4;
const half2 * y_ds = (const half2 *) y;
#pragma unroll #pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
const int j = j0 + threadIdx.y; const int j = j0 + threadIdx.y;
@@ -238,13 +221,13 @@ static __device__ __forceinline__ void vec_dot_q4_1_q8_1_mul_mat(
#pragma unroll #pragma unroll
for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) { for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) {
u[2*l+0] = y_qs[j*MMQ_TILE_Y_K + (kyqs + l) % WARP_SIZE]; u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
u[2*l+1] = y_qs[j*MMQ_TILE_Y_K + (kyqs + l + QI4_1) % WARP_SIZE]; u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE];
} }
sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ> sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ>
(&x_ql[i*(WARP_SIZE + 1) + k0], u, x_dm[i*(WARP_SIZE/QI4_1) + i/QI4_1 + k0/QI4_1], (&x_ql[i * (WARP_SIZE + 1) + k0], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k0/QI4_1],
y_ds[j*MMQ_TILE_Y_K + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]); y_ds[j * (WARP_SIZE/QI8_1) + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]);
} }
} }
} }
@@ -310,14 +293,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
template <int mmq_x, int mmq_y, int nwarps> template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q5_0_q8_1_mul_mat( static __device__ __forceinline__ void vec_dot_q5_0_q8_1_mul_mat(
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y, float * __restrict__ sum, const int & k0) { const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
const float * x_dmf = (const float *) x_dm;
const int * y_qs = (const int *) y + 4;
const float * y_df = (const float *) y;
#pragma unroll #pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
const int j = j0 + threadIdx.y; const int j = j0 + threadIdx.y;
@@ -327,18 +306,20 @@ static __device__ __forceinline__ void vec_dot_q5_0_q8_1_mul_mat(
const int i = i0 + threadIdx.x; const int i = i0 + threadIdx.x;
const int kyqs = k0 % (QI8_1/2) + QI8_1 * (k0 / (QI8_1/2)); const int kyqs = k0 % (QI8_1/2) + QI8_1 * (k0 / (QI8_1/2));
const int index_bx = i*(WARP_SIZE/QI5_0) + i/QI5_0 + k0/QI5_0; const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k0/QI5_0;
const float * x_dmf = (const float *) x_dm;
const float * y_df = (const float *) y_ds;
int u[2*VDR_Q5_0_Q8_1_MMQ]; int u[2*VDR_Q5_0_Q8_1_MMQ];
#pragma unroll #pragma unroll
for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) { for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) {
u[2*l+0] = y_qs[j*MMQ_TILE_Y_K + (kyqs + l) % WARP_SIZE]; u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
u[2*l+1] = y_qs[j*MMQ_TILE_Y_K + (kyqs + l + QI5_0) % WARP_SIZE]; u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE];
} }
sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q8_0_q8_1_impl<float, QR5_0*VDR_Q5_0_Q8_1_MMQ> sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q8_0_q8_1_impl<float, QR5_0*VDR_Q5_0_Q8_1_MMQ>
(&x_ql[i*(2*WARP_SIZE + 1) + 2*k0], u, x_dmf[index_bx], y_df[j*MMQ_TILE_Y_K + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]); (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k0], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]);
} }
} }
} }
@@ -402,13 +383,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
template <int mmq_x, int mmq_y, int nwarps> template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q5_1_q8_1_mul_mat( static __device__ __forceinline__ void vec_dot_q5_1_q8_1_mul_mat(
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y, float * __restrict__ sum, const int & k0) { const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
const int * y_qs = (const int *) y + 4;
const half2 * y_ds = (const half2 *) y;
#pragma unroll #pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
const int j = j0 + threadIdx.y; const int j = j0 + threadIdx.y;
@@ -418,18 +396,18 @@ static __device__ __forceinline__ void vec_dot_q5_1_q8_1_mul_mat(
const int i = i0 + threadIdx.x; const int i = i0 + threadIdx.x;
const int kyqs = k0 % (QI8_1/2) + QI8_1 * (k0 / (QI8_1/2)); const int kyqs = k0 % (QI8_1/2) + QI8_1 * (k0 / (QI8_1/2));
const int index_bx = i*(WARP_SIZE/QI5_1) + i/QI5_1 + k0/QI5_1; const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k0/QI5_1;
int u[2*VDR_Q5_1_Q8_1_MMQ]; int u[2*VDR_Q5_1_Q8_1_MMQ];
#pragma unroll #pragma unroll
for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) { for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) {
u[2*l+0] = y_qs[j*MMQ_TILE_Y_K + (kyqs + l) % WARP_SIZE]; u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
u[2*l+1] = y_qs[j*MMQ_TILE_Y_K + (kyqs + l + QI5_1) % WARP_SIZE]; u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE];
} }
sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q8_1_q8_1_impl<QR5_1*VDR_Q5_1_Q8_1_MMQ> sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q8_1_q8_1_impl<QR5_1*VDR_Q5_1_Q8_1_MMQ>
(&x_ql[i*(2*WARP_SIZE + 1) + 2*k0], u, x_dm[index_bx], y_ds[j*MMQ_TILE_Y_K + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]); (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k0], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]);
} }
} }
} }
@@ -477,14 +455,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
template <int mmq_x, int mmq_y, int nwarps> template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mul_mat( static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mul_mat(
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y, float * __restrict__ sum, const int & k0) { const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
const float * x_dmf = (const float *) x_dm;
const int * y_qs = (const int *) y + 4;
const float * y_df = (const float *) y;
#pragma unroll #pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
const int j = j0 + threadIdx.y; const int j = j0 + threadIdx.y;
@@ -493,9 +467,12 @@ static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mul_mat(
for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) { for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
const int i = i0 + threadIdx.x; const int i = i0 + threadIdx.x;
const float * x_dmf = (const float *) x_dm;
const float * y_df = (const float *) y_ds;
sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q8_0_q8_1_impl<float, VDR_Q8_0_Q8_1_MMQ> sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q8_0_q8_1_impl<float, VDR_Q8_0_Q8_1_MMQ>
(&x_ql[i*(WARP_SIZE + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + k0], x_dmf[i*(WARP_SIZE/QI8_0) + i/QI8_0 + k0/QI8_0], (&x_ql[i * (WARP_SIZE + 1) + k0], &y_qs[j * WARP_SIZE + k0], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k0/QI8_0],
y_df[j*MMQ_TILE_Y_K + k0/QI8_1]); y_df[j * (WARP_SIZE/QI8_1) + k0/QI8_1]);
} }
} }
} }
@@ -554,13 +531,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
template <int mmq_x, int mmq_y, int nwarps> template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mul_mat( static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mul_mat(
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y, float * __restrict__ sum, const int & k0) { const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
GGML_UNUSED(x_qh); GGML_UNUSED(x_qh);
const int * y_qs = (const int *) y + 4;
const float * y_df = (const float *) y;
#pragma unroll #pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
const int j = j0 + threadIdx.y; const int j = j0 + threadIdx.y;
@@ -571,10 +545,11 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mul_mat(
const int kbx = k0 / QI2_K; const int kbx = k0 / QI2_K;
const int ky = (k0 % QI2_K) * QR2_K; const int ky = (k0 % QI2_K) * QR2_K;
const float * y_df = (const float *) y_ds;
int v[QR2_K*VDR_Q2_K_Q8_1_MMQ]; int v[QR2_K*VDR_Q2_K_Q8_1_MMQ];
const int kqsx = i*(WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2); const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2);
const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2)); const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2));
#pragma unroll #pragma unroll
@@ -582,11 +557,11 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mul_mat(
v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303; v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303;
} }
const uint8_t * scales = ((const uint8_t *) &x_sc[i*(WARP_SIZE/4) + i/4 + kbx*4]) + ky/4; const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4;
const int index_y = j * WARP_SIZE + (QR2_K*k0) % WARP_SIZE;
sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q2_K_q8_1_impl_mmq( sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q2_K_q8_1_impl_mmq(
v, &y_qs[j*MMQ_TILE_Y_K + (QR2_K*k0) % WARP_SIZE], scales, v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]);
x_dm[i*(WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[j*MMQ_TILE_Y_K + ((QR2_K*k0) % WARP_SIZE)/QI8_1]);
} }
} }
} }
@@ -671,11 +646,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
template <int mmq_x, int mmq_y, int nwarps> template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q3_K_q8_1_mul_mat( static __device__ __forceinline__ void vec_dot_q3_K_q8_1_mul_mat(
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y, float * __restrict__ sum, const int & k0) { const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
const float * x_dmf = (const float *) x_dm;
const int * y_qs = (const int *) y + 4;
const float * y_df = (const float *) y;
#pragma unroll #pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
@@ -687,6 +658,8 @@ static __device__ __forceinline__ void vec_dot_q3_K_q8_1_mul_mat(
const int kbx = k0 / QI3_K; const int kbx = k0 / QI3_K;
const int ky = (k0 % QI3_K) * QR3_K; const int ky = (k0 % QI3_K) * QR3_K;
const float * x_dmf = (const float *) x_dm;
const float * y_df = (const float *) y_ds;
const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4; const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
@@ -694,19 +667,19 @@ static __device__ __forceinline__ void vec_dot_q3_K_q8_1_mul_mat(
#pragma unroll #pragma unroll
for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) { for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) {
const int kqsx = i*(WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2); const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2);
const int shift = 2 * ((ky % 32) / 8); const int shift = 2 * ((ky % 32) / 8);
const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303; const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303;
const int vh = x_qh[i*(WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8); const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8);
const int vlh = (vh << 2) & 0x04040404; const int vlh = (vh << 2) & 0x04040404;
v[l] = __vsubss4(vll, vlh); v[l] = __vsubss4(vll, vlh);
} }
const int index_y = j * WARP_SIZE + (k0*QR3_K) % WARP_SIZE;
sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q3_K_q8_1_impl_mmq( sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q3_K_q8_1_impl_mmq(
v, &y_qs[j*MMQ_TILE_Y_K + (k0*QR3_K) % WARP_SIZE], scales, v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]);
x_dmf[i*(WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[j*MMQ_TILE_Y_K + ((k0*QR3_K) % WARP_SIZE)/QI8_1]);
} }
} }
} }
@@ -773,13 +746,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
template <int mmq_x, int mmq_y, int nwarps> template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q4_K_q8_1_mul_mat( static __device__ __forceinline__ void vec_dot_q4_K_q8_1_mul_mat(
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y, float * __restrict__ sum, const int & k0) { const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
GGML_UNUSED(x_qh); GGML_UNUSED(x_qh);
const int * y_qs = (const int *) y + 4;
const half2 * y_ds = (const half2 *) y;
#pragma unroll #pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
const int j = j0 + threadIdx.y; const int j = j0 + threadIdx.y;
@@ -790,9 +760,9 @@ static __device__ __forceinline__ void vec_dot_q4_K_q8_1_mul_mat(
const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k0/16]) + 2*((k0 % 16) / 8); const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k0/16]) + 2*((k0 % 16) / 8);
const int index_y = j * WARP_SIZE + (QR4_K*k0) % WARP_SIZE;
sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q4_K_q8_1_impl_mmq( sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q4_K_q8_1_impl_mmq(
&x_ql[i*(WARP_SIZE + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + (QR4_K*k0) % WARP_SIZE], sc, sc+8, &x_ql[i * (WARP_SIZE + 1) + k0], &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
x_dm[i*(WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[j*MMQ_TILE_Y_K + ((QR4_K*k0) % WARP_SIZE)/QI8_1]);
} }
} }
} }
@@ -872,13 +842,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
template <int mmq_x, int mmq_y, int nwarps> template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q5_K_q8_1_mul_mat( static __device__ __forceinline__ void vec_dot_q5_K_q8_1_mul_mat(
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y, float * __restrict__ sum, const int & k0) { const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
GGML_UNUSED(x_qh); GGML_UNUSED(x_qh);
const int * y_qs = (const int *) y + 4;
const half2 * y_ds = (const half2 *) y;
#pragma unroll #pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
const int j = j0 + threadIdx.y; const int j = j0 + threadIdx.y;
@@ -889,9 +856,10 @@ static __device__ __forceinline__ void vec_dot_q5_K_q8_1_mul_mat(
const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k0/16]) + 2 * ((k0 % 16) / 8); const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k0/16]) + 2 * ((k0 % 16) / 8);
const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k0;
const int index_y = j * WARP_SIZE + (QR5_K*k0) % WARP_SIZE;
sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q5_K_q8_1_impl_mmq( sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q5_K_q8_1_impl_mmq(
&x_ql[i*(QR5_K*WARP_SIZE + 1) + QR5_K*k0], &y_qs[j*MMQ_TILE_Y_K + (QR5_K*k0) % WARP_SIZE], sc, sc+8, &x_ql[index_x], &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
x_dm[i*(WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[j*MMQ_TILE_Y_K + ((QR5_K*k0) % WARP_SIZE)/QI8_1]);
} }
} }
} }
@@ -964,14 +932,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
template <int mmq_x, int mmq_y, int nwarps> template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mul_mat( static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mul_mat(
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y, float * __restrict__ sum, const int & k0) { const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
GGML_UNUSED(x_qh); GGML_UNUSED(x_qh);
const float * x_dmf = (const float *) x_dm;
const int * y_qs = (const int *) y + 4;
const float * y_df = (const float *) y;
#pragma unroll #pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
const int j = j0 + threadIdx.y; const int j = j0 + threadIdx.y;
@@ -980,11 +944,15 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mul_mat(
for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) { for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
const int i = i0 + threadIdx.x; const int i = i0 + threadIdx.x;
const float * x_dmf = (const float *) x_dm;
const float * y_df = (const float *) y_ds;
const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k0/8]); const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k0/8]);
const int index_x = i * (QR6_K*WARP_SIZE + 1) + QR6_K*k0;
const int index_y = j * WARP_SIZE + (QR6_K*k0) % WARP_SIZE;
sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q6_K_q8_1_impl_mmq( sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q6_K_q8_1_impl_mmq(
&x_ql[i*(QR6_K*WARP_SIZE + 1) + QR6_K*k0], &y_qs[j*MMQ_TILE_Y_K + (QR6_K*k0) % WARP_SIZE], sc, &x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]);
x_dmf[i*(WARP_SIZE/QI6_K) + i/QI6_K], &y_df[j*MMQ_TILE_Y_K + ((QR6_K*k0) % WARP_SIZE)/QI8_1]);
} }
} }
} }
@@ -996,6 +964,7 @@ struct mmq_type_traits;
template <int mmq_x, int mmq_y, int nwarps, bool need_check> template <int mmq_x, int mmq_y, int nwarps, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q4_0> { struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q4_0> {
static constexpr bool need_sum = true;
static constexpr int vdr = VDR_Q4_0_Q8_1_MMQ; static constexpr int vdr = VDR_Q4_0_Q8_1_MMQ;
static constexpr load_tiles_mmq_t load_tiles = load_tiles_q4_0<mmq_y, nwarps, need_check>; static constexpr load_tiles_mmq_t load_tiles = load_tiles_q4_0<mmq_y, nwarps, need_check>;
static constexpr vec_dot_mmq_t vec_dot = vec_dot_q4_0_q8_1_mul_mat<mmq_x, mmq_y, nwarps>; static constexpr vec_dot_mmq_t vec_dot = vec_dot_q4_0_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
@@ -1003,6 +972,7 @@ struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q4_0> {
template <int mmq_x, int mmq_y, int nwarps, bool need_check> template <int mmq_x, int mmq_y, int nwarps, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q4_1> { struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q4_1> {
static constexpr bool need_sum = true;
static constexpr int vdr = VDR_Q4_1_Q8_1_MMQ; static constexpr int vdr = VDR_Q4_1_Q8_1_MMQ;
static constexpr load_tiles_mmq_t load_tiles = load_tiles_q4_1<mmq_y, nwarps, need_check>; static constexpr load_tiles_mmq_t load_tiles = load_tiles_q4_1<mmq_y, nwarps, need_check>;
static constexpr vec_dot_mmq_t vec_dot = vec_dot_q4_1_q8_1_mul_mat<mmq_x, mmq_y, nwarps>; static constexpr vec_dot_mmq_t vec_dot = vec_dot_q4_1_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
@@ -1010,6 +980,7 @@ struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q4_1> {
template <int mmq_x, int mmq_y, int nwarps, bool need_check> template <int mmq_x, int mmq_y, int nwarps, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q5_0> { struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q5_0> {
static constexpr bool need_sum = false;
static constexpr int vdr = VDR_Q5_0_Q8_1_MMQ; static constexpr int vdr = VDR_Q5_0_Q8_1_MMQ;
static constexpr load_tiles_mmq_t load_tiles = load_tiles_q5_0<mmq_y, nwarps, need_check>; static constexpr load_tiles_mmq_t load_tiles = load_tiles_q5_0<mmq_y, nwarps, need_check>;
static constexpr vec_dot_mmq_t vec_dot = vec_dot_q5_0_q8_1_mul_mat<mmq_x, mmq_y, nwarps>; static constexpr vec_dot_mmq_t vec_dot = vec_dot_q5_0_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
@@ -1017,6 +988,7 @@ struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q5_0> {
template <int mmq_x, int mmq_y, int nwarps, bool need_check> template <int mmq_x, int mmq_y, int nwarps, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q5_1> { struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q5_1> {
static constexpr bool need_sum = true;
static constexpr int vdr = VDR_Q5_1_Q8_1_MMQ; static constexpr int vdr = VDR_Q5_1_Q8_1_MMQ;
static constexpr load_tiles_mmq_t load_tiles = load_tiles_q5_1<mmq_y, nwarps, need_check>; static constexpr load_tiles_mmq_t load_tiles = load_tiles_q5_1<mmq_y, nwarps, need_check>;
static constexpr vec_dot_mmq_t vec_dot = vec_dot_q5_1_q8_1_mul_mat<mmq_x, mmq_y, nwarps>; static constexpr vec_dot_mmq_t vec_dot = vec_dot_q5_1_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
@@ -1024,6 +996,7 @@ struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q5_1> {
template <int mmq_x, int mmq_y, int nwarps, bool need_check> template <int mmq_x, int mmq_y, int nwarps, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q8_0> { struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q8_0> {
static constexpr bool need_sum = false;
static constexpr int vdr = VDR_Q8_0_Q8_1_MMQ; static constexpr int vdr = VDR_Q8_0_Q8_1_MMQ;
static constexpr load_tiles_mmq_t load_tiles = load_tiles_q8_0<mmq_y, nwarps, need_check>; static constexpr load_tiles_mmq_t load_tiles = load_tiles_q8_0<mmq_y, nwarps, need_check>;
static constexpr vec_dot_mmq_t vec_dot = vec_dot_q8_0_q8_1_mul_mat<mmq_x, mmq_y, nwarps>; static constexpr vec_dot_mmq_t vec_dot = vec_dot_q8_0_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
@@ -1031,6 +1004,7 @@ struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q8_0> {
template <int mmq_x, int mmq_y, int nwarps, bool need_check> template <int mmq_x, int mmq_y, int nwarps, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q2_K> { struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q2_K> {
static constexpr bool need_sum = false;
static constexpr int vdr = VDR_Q2_K_Q8_1_MMQ; static constexpr int vdr = VDR_Q2_K_Q8_1_MMQ;
static constexpr load_tiles_mmq_t load_tiles = load_tiles_q2_K<mmq_y, nwarps, need_check>; static constexpr load_tiles_mmq_t load_tiles = load_tiles_q2_K<mmq_y, nwarps, need_check>;
static constexpr vec_dot_mmq_t vec_dot = vec_dot_q2_K_q8_1_mul_mat<mmq_x, mmq_y, nwarps>; static constexpr vec_dot_mmq_t vec_dot = vec_dot_q2_K_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
@@ -1038,6 +1012,7 @@ struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q2_K> {
template <int mmq_x, int mmq_y, int nwarps, bool need_check> template <int mmq_x, int mmq_y, int nwarps, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q3_K> { struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q3_K> {
static constexpr bool need_sum = false;
static constexpr int vdr = VDR_Q3_K_Q8_1_MMQ; static constexpr int vdr = VDR_Q3_K_Q8_1_MMQ;
static constexpr load_tiles_mmq_t load_tiles = load_tiles_q3_K<mmq_y, nwarps, need_check>; static constexpr load_tiles_mmq_t load_tiles = load_tiles_q3_K<mmq_y, nwarps, need_check>;
static constexpr vec_dot_mmq_t vec_dot = vec_dot_q3_K_q8_1_mul_mat<mmq_x, mmq_y, nwarps>; static constexpr vec_dot_mmq_t vec_dot = vec_dot_q3_K_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
@@ -1045,6 +1020,7 @@ struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q3_K> {
template <int mmq_x, int mmq_y, int nwarps, bool need_check> template <int mmq_x, int mmq_y, int nwarps, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q4_K> { struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q4_K> {
static constexpr bool need_sum = true;
static constexpr int vdr = VDR_Q4_K_Q8_1_MMQ; static constexpr int vdr = VDR_Q4_K_Q8_1_MMQ;
static constexpr load_tiles_mmq_t load_tiles = load_tiles_q4_K<mmq_y, nwarps, need_check>; static constexpr load_tiles_mmq_t load_tiles = load_tiles_q4_K<mmq_y, nwarps, need_check>;
static constexpr vec_dot_mmq_t vec_dot = vec_dot_q4_K_q8_1_mul_mat<mmq_x, mmq_y, nwarps>; static constexpr vec_dot_mmq_t vec_dot = vec_dot_q4_K_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
@@ -1052,6 +1028,7 @@ struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q4_K> {
template <int mmq_x, int mmq_y, int nwarps, bool need_check> template <int mmq_x, int mmq_y, int nwarps, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q5_K> { struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q5_K> {
static constexpr bool need_sum = true;
static constexpr int vdr = VDR_Q5_K_Q8_1_MMQ; static constexpr int vdr = VDR_Q5_K_Q8_1_MMQ;
static constexpr load_tiles_mmq_t load_tiles = load_tiles_q5_K<mmq_y, nwarps, need_check>; static constexpr load_tiles_mmq_t load_tiles = load_tiles_q5_K<mmq_y, nwarps, need_check>;
static constexpr vec_dot_mmq_t vec_dot = vec_dot_q5_K_q8_1_mul_mat<mmq_x, mmq_y, nwarps>; static constexpr vec_dot_mmq_t vec_dot = vec_dot_q5_K_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
@@ -1059,36 +1036,12 @@ struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q5_K> {
template <int mmq_x, int mmq_y, int nwarps, bool need_check> template <int mmq_x, int mmq_y, int nwarps, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q6_K> { struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q6_K> {
static constexpr bool need_sum = false;
static constexpr int vdr = VDR_Q6_K_Q8_1_MMQ; static constexpr int vdr = VDR_Q6_K_Q8_1_MMQ;
static constexpr load_tiles_mmq_t load_tiles = load_tiles_q6_K<mmq_y, nwarps, need_check>; static constexpr load_tiles_mmq_t load_tiles = load_tiles_q6_K<mmq_y, nwarps, need_check>;
static constexpr vec_dot_mmq_t vec_dot = vec_dot_q6_K_q8_1_mul_mat<mmq_x, mmq_y, nwarps>; static constexpr vec_dot_mmq_t vec_dot = vec_dot_q6_K_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
}; };
static int mmq_need_sum(const ggml_type type_x) {
switch (type_x) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
return true;
case GGML_TYPE_Q5_0:
return false;
case GGML_TYPE_Q5_1:
return true;
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:
return false;
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K:
return true;
case GGML_TYPE_Q6_K:
return false;
default:
GGML_ASSERT(false);
break;
}
return false;
}
template <ggml_type type, int mmq_x, int nwarps, bool need_check> template <ggml_type type, int mmq_x, int nwarps, bool need_check>
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2) #if defined(RDNA3) || defined(RDNA2)
@@ -1103,7 +1056,7 @@ template <ggml_type type, int mmq_x, int nwarps, bool need_check>
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
static __global__ void mul_mat_q( static __global__ void mul_mat_q(
const char * __restrict__ x, const char * __restrict__ yc, float * __restrict__ dst, const char * __restrict__ x, const char * __restrict__ yc, float * __restrict__ dst,
const int ne00, const int ne01, const int stride01, const int ne10, const int ne11, const int stride11, const int ne0) { const int ne00, const int ne01, const int stride00, const int ne10, const int ne11, const int ne0) {
// Skip unused template specializations for faster compilation: // Skip unused template specializations for faster compilation:
if (mmq_x > get_mmq_x_max_device()) { if (mmq_x > get_mmq_x_max_device()) {
@@ -1115,6 +1068,7 @@ static __global__ void mul_mat_q(
constexpr int qr = ggml_cuda_type_traits<type>::qr; constexpr int qr = ggml_cuda_type_traits<type>::qr;
constexpr int qi = ggml_cuda_type_traits<type>::qi; constexpr int qi = ggml_cuda_type_traits<type>::qi;
constexpr int mmq_y = get_mmq_y_device(mmq_x); constexpr int mmq_y = get_mmq_y_device(mmq_x);
constexpr bool need_sum = mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, type>::need_sum;
constexpr int vdr = mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, type>::vdr; constexpr int vdr = mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, type>::vdr;
constexpr load_tiles_mmq_t load_tiles = mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, type>::load_tiles; constexpr load_tiles_mmq_t load_tiles = mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, type>::load_tiles;
constexpr vec_dot_mmq_t vec_dot = mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, type>::vec_dot; constexpr vec_dot_mmq_t vec_dot = mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, type>::vec_dot;
@@ -1126,38 +1080,62 @@ static __global__ void mul_mat_q(
half2 * tile_x_dm = (half2 *) (tile_x_ql + txs.ql); half2 * tile_x_dm = (half2 *) (tile_x_ql + txs.ql);
int * tile_x_qh = (int *) (tile_x_dm + txs.dm); int * tile_x_qh = (int *) (tile_x_dm + txs.dm);
int * tile_x_sc = (int *) (tile_x_qh + txs.qh); int * tile_x_sc = (int *) (tile_x_qh + txs.qh);
int * tile_y = (int *) (tile_x_sc + txs.sc); // [mmq_x * (WARP_SIZE + WARP_SIZE/QI8_1)] int * tile_y_qs = (int *) (tile_x_sc + txs.sc); // [mmq_x * WARP_SIZE]
half2 * tile_y_ds = (half2 *) (tile_y_qs + mmq_x*WARP_SIZE); // [mmq_x * WARP_SIZE/QI8_1];
const block_q8_1 * y = (const block_q8_1 *) yc;
const int blocks_per_row_x = ne00 / qk; const int blocks_per_row_x = ne00 / qk;
const int blocks_per_col_y = ne10 / QK8_1;
const int blocks_per_warp = WARP_SIZE / qi; const int blocks_per_warp = WARP_SIZE / qi;
const int & ne1 = ne11; const int & ne1 = ne11;
const int tile_x_max_i = ne01 - blockIdx.x*mmq_y - 1; const int tile_x_max_i = ne01 - blockIdx.x*mmq_y - 1;
const int * y = (const int *) yc + blockIdx.y*(mmq_x*sizeof(block_q8_1_mmq)/sizeof(int));
float sum[(mmq_x/nwarps) * (mmq_y/WARP_SIZE)] = {0.0f}; float sum[(mmq_x/nwarps) * (mmq_y/WARP_SIZE)] = {0.0f};
for (int kb0 = 0; kb0 < blocks_per_row_x; kb0 += blocks_per_warp) { for (int kb0 = 0; kb0 < blocks_per_row_x; kb0 += blocks_per_warp) {
load_tiles(x, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, stride01*blockIdx.x*mmq_y + kb0, tile_x_max_i, stride01); load_tiles(x, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, stride00*blockIdx.x*mmq_y + kb0, tile_x_max_i, stride00);
#pragma unroll #pragma unroll
for (int kr = 0; kr < qr; ++kr) { for (int kr = 0; kr < qr; ++kr) {
const int * by0 = y + stride11*(kb0*(qk*sizeof(block_q8_1_mmq) / (4*QK8_1*sizeof(int))) + kr*sizeof(block_q8_1_mmq)/sizeof(int)); const int kqs = kr*WARP_SIZE + threadIdx.x;
#pragma unroll const int kbxd = kqs / QI8_1;
for (int l0 = 0; l0 < mmq_x*MMQ_TILE_Y_K; l0 += nwarps*WARP_SIZE) {
int l = l0 + threadIdx.y*WARP_SIZE + threadIdx.x;
tile_y[l] = by0[l]; #pragma unroll
for (int i0 = 0; i0 < mmq_x; i0 += nwarps) {
const int i = min(blockIdx.y*mmq_x + threadIdx.y + i0, ne11-1); // to prevent out-of-bounds memory accesses
const block_q8_1 * by0 = &y[i*blocks_per_col_y + kb0 * (qk/QK8_1) + kbxd];
const int index_y = (i0 + threadIdx.y) * WARP_SIZE + kqs % WARP_SIZE;
tile_y_qs[index_y] = get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1);
}
#pragma unroll
for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) {
const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE/QI8_1)) % mmq_x;
const int kby = threadIdx.x % (WARP_SIZE/QI8_1);
const int i_y_eff = min(blockIdx.y*mmq_x + ids, ne11-1);
// if the sum is not needed it's faster to transform the scale to f32 ahead of time
const half2 * dsi_src = &y[i_y_eff*blocks_per_col_y + kb0 * (qk/QK8_1) + kr*(WARP_SIZE/QI8_1) + kby].ds;
half2 * dsi_dst = &tile_y_ds[ids * (WARP_SIZE/QI8_1) + kby];
if (need_sum) {
*dsi_dst = *dsi_src;
} else {
float * dfi_dst = (float *) dsi_dst;
*dfi_dst = __low2float(*dsi_src);
}
} }
__syncthreads(); __syncthreads();
// #pragma unroll // unrolling this loop causes too much register pressure // #pragma unroll // unrolling this loop causes too much register pressure
for (int k0 = kr*WARP_SIZE/qr; k0 < (kr+1)*WARP_SIZE/qr; k0 += vdr) { for (int k0 = kr*WARP_SIZE/qr; k0 < (kr+1)*WARP_SIZE/qr; k0 += vdr) {
vec_dot(tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y, sum, k0); vec_dot(tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds, sum, k0);
} }
__syncthreads(); __syncthreads();
@@ -1187,8 +1165,8 @@ static __global__ void mul_mat_q(
struct mmq_args { struct mmq_args {
const char * x; const char * y; float * dst; const char * x; const char * y; float * dst;
int64_t ne00; int64_t ne01; int64_t stride01; int64_t ne00; int64_t ne01; int64_t stride00;
int64_t ne10; int64_t ne11; int64_t stride11; int64_t ne10; int64_t ne11;
int64_t ne0; int64_t ne0;
}; };
@@ -1206,7 +1184,7 @@ static void launch_mul_mat_q(const mmq_args & args, cudaStream_t stream) {
const tile_x_sizes txs = get_tile_x_sizes_host(type, mmq_y); const tile_x_sizes txs = get_tile_x_sizes_host(type, mmq_y);
const int shmem_x = txs.ql*sizeof(int) + txs.dm*sizeof(half2) + txs.qh*sizeof(int) + txs.sc*sizeof(int); const int shmem_x = txs.ql*sizeof(int) + txs.dm*sizeof(half2) + txs.qh*sizeof(int) + txs.sc*sizeof(int);
const int shmem_y = mmq_x*WARP_SIZE*sizeof(int) + mmq_x*(WARP_SIZE/QI8_1)*sizeof(half2); const int shmem_y = mmq_x*WARP_SIZE*sizeof(int) + mmq_x*(WARP_SIZE/QI8_1)*sizeof(half2);
const int shmem = shmem_x + GGML_PAD(shmem_y, nwarps*WARP_SIZE*sizeof(int)); const int shmem = shmem_x + shmem_y;
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
static bool shmem_limit_raised[GGML_CUDA_MAX_DEVICES] = {false}; static bool shmem_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
@@ -1220,11 +1198,11 @@ static void launch_mul_mat_q(const mmq_args & args, cudaStream_t stream) {
if (args.ne01 % mmq_y == 0) { if (args.ne01 % mmq_y == 0) {
const bool need_check = false; const bool need_check = false;
mul_mat_q<type, mmq_x, nwarps, need_check><<<block_nums, block_dims, shmem, stream>>> mul_mat_q<type, mmq_x, nwarps, need_check><<<block_nums, block_dims, shmem, stream>>>
(args.x, args.y, args.dst, args.ne00, args.ne01, args.stride01, args.ne10, args.ne11, args.stride11, args.ne0); (args.x, args.y, args.dst, args.ne00, args.ne01, args.stride00, args.ne10, args.ne11, args.ne0);
} else { } else {
const bool need_check = true; const bool need_check = true;
mul_mat_q<type, mmq_x, nwarps, need_check><<<block_nums, block_dims, shmem, stream>>> mul_mat_q<type, mmq_x, nwarps, need_check><<<block_nums, block_dims, shmem, stream>>>
(args.x, args.y, args.dst, args.ne00, args.ne01, args.stride01, args.ne10, args.ne11, args.stride11, args.ne0); (args.x, args.y, args.dst, args.ne00, args.ne01, args.stride00, args.ne10, args.ne11, args.ne0);
} }
} }
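
The new side of this header attaches a compile-time need_sum flag to each mmq_type_traits specialization and reads it inside mul_mat_q (constexpr bool need_sum = mmq_type_traits<...>::need_sum), where the old side queried mmq_need_sum(type_x) at runtime. Below is a minimal, self-contained sketch of that trait-dispatch pattern; all names are illustrative, not the ggml ones.

#include <cstdio>

enum class qtype { q4_0, q8_0 };

template <qtype T> struct type_traits;           // primary template, specialized per type

template <> struct type_traits<qtype::q4_0> {
    static constexpr bool need_sum = true;       // q4_0 also needs the per-block sum
};
template <> struct type_traits<qtype::q8_0> {
    static constexpr bool need_sum = false;      // q8_0 only needs the scale
};

template <qtype T> void load_scales_like() {
    constexpr bool need_sum = type_traits<T>::need_sum;  // resolved at compile time
    if constexpr (need_sum) {
        std::printf("copy half2 (d, sum)\n");            // mirrors *dsi_dst = *dsi_src above
    } else {
        std::printf("convert scale to float ahead of time\n");
    }
}

int main() {
    load_scales_like<qtype::q4_0>();
    load_scales_like<qtype::q8_0>();
    return 0;
}

As in mul_mat_q above, the branch is decided per template instantiation, so no per-type switch is needed when the tile scales are loaded.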

View File

@@ -1,7 +1,7 @@
#include "common.cuh" #include "common.cuh"
void ggml_cuda_op_mul_mat_vec_q( void ggml_cuda_op_mul_mat_vec_q(
ggml_backend_cuda_context & ctx, ggml_backend_cuda_context & ctx,
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
const int64_t src1_padded_row_size, cudaStream_t stream); const int64_t src1_padded_row_size, cudaStream_t stream);

View File

@@ -1,7 +1,7 @@
#include "common.cuh" #include "common.cuh"
void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

View File

@@ -1,49 +1,49 @@
#include "pad.cuh" #include "pad.cuh"
static __global__ void pad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) { static __global__ void pad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) {
// blockIdx.z: idx of ne2*ne3, aka ne02*ne03 // blockIdx.z: idx of ne2*ne3, aka ne02*ne03
// blockIdx.y: idx of ne1 // blockIdx.y: idx of ne1
// blockIDx.x: idx of ne0 / BLOCK_SIZE // blockIDx.x: idx of ne0 / BLOCK_SIZE
int nidx = threadIdx.x + blockIdx.x * blockDim.x; int nidx = threadIdx.x + blockIdx.x * blockDim.x;
if (nidx >= ne0) { if (nidx >= ne0) {
return; return;
} }
// operation // operation
int offset_dst = int offset_dst =
nidx + nidx +
blockIdx.y * ne0 + blockIdx.y * ne0 +
blockIdx.z * ne0 * gridDim.y; blockIdx.z * ne0 * gridDim.y;
if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) { if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) {
int offset_src = int offset_src =
nidx + nidx +
blockIdx.y * ne00 + blockIdx.y * ne00 +
blockIdx.z * ne00 * ne01; blockIdx.z * ne00 * ne01;
dst[offset_dst] = x[offset_src]; dst[offset_dst] = x[offset_src];
} else { } else {
dst[offset_dst] = 0.0f; dst[offset_dst] = 0.0f;
} }
} }
static void pad_f32_cuda(const float * x, float * dst, static void pad_f32_cuda(const float * x, float * dst,
const int ne00, const int ne01, const int ne02, const int ne03, const int ne00, const int ne01, const int ne02, const int ne03,
const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) { const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) {
int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE; int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
dim3 gridDim(num_blocks, ne1, ne2*ne3); dim3 gridDim(num_blocks, ne1, ne2*ne3);
pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02, ne03); pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02, ne03);
} }
void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src0 = dst->src[0];
const float * src0_d = (const float *)src0->data; const float * src0_d = (const float *)src0->data;
float * dst_d = (float *)dst->data; float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream(); cudaStream_t stream = ctx.stream();
GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32);
GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
pad_f32_cuda(src0_d, dst_d, pad_f32_cuda(src0_d, dst_d,
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream); dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream);
} }
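
pad_f32 above copies the source block into the low corner of the destination and zero-fills everything outside the source extents. The function below is a hedged CPU equivalent of the same index mapping for the 3D case handled here (ne[3] == 1); it is not part of this diff.

#include <cstdint>

// dst[i2][i1][i0] = src[i2][i1][i0] while (i0, i1, i2) lies inside the source
// extents (ne00, ne01, ne02); every other destination element is set to 0.
static void pad_f32_ref(const float * x, float * dst,
                        int ne00, int ne01, int ne02,   // source extents
                        int ne0,  int ne1,  int ne2) {  // destination extents
    for (int i2 = 0; i2 < ne2; ++i2) {
        for (int i1 = 0; i1 < ne1; ++i1) {
            for (int i0 = 0; i0 < ne0; ++i0) {
                const bool inside = i0 < ne00 && i1 < ne01 && i2 < ne02;
                const int64_t idst = ((int64_t)i2*ne1  + i1)*ne0  + i0;
                const int64_t isrc = ((int64_t)i2*ne01 + i1)*ne00 + i0;
                dst[idst] = inside ? x[isrc] : 0.0f;
            }
        }
    }
}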

View File

@@ -1,5 +1,5 @@
#include "common.cuh" #include "common.cuh"
#define CUDA_PAD_BLOCK_SIZE 256 #define CUDA_PAD_BLOCK_SIZE 256
void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

View File

@@ -1,94 +1,94 @@
#include "pool2d.cuh" #include "pool2d.cuh"
template <typename Ti, typename To> template <typename Ti, typename To>
static __global__ void pool2d_nchw_kernel( static __global__ void pool2d_nchw_kernel(
const int ih, const int iw, const int oh, const int ow, const int ih, const int iw, const int oh, const int ow,
const int kh, const int kw, const int sh, const int sw, const int kh, const int kw, const int sh, const int sw,
const int ph, const int pw, const int parallel_elements, const int ph, const int pw, const int parallel_elements,
const Ti* src, To* dst, const enum ggml_op_pool op) { const Ti* src, To* dst, const enum ggml_op_pool op) {
int idx = threadIdx.x + blockIdx.x * blockDim.x; int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx >= parallel_elements) { if (idx >= parallel_elements) {
return; return;
} }
const int I_HW = ih * iw; const int I_HW = ih * iw;
const int O_HW = oh * ow; const int O_HW = oh * ow;
const int nc = idx / O_HW; const int nc = idx / O_HW;
const int cur_oh = idx % O_HW / ow; const int cur_oh = idx % O_HW / ow;
const int cur_ow = idx % O_HW % ow; const int cur_ow = idx % O_HW % ow;
const Ti* i_ptr = src + nc * I_HW; const Ti* i_ptr = src + nc * I_HW;
To* o_ptr = dst + nc * O_HW; To* o_ptr = dst + nc * O_HW;
const int start_h = cur_oh * sh - ph; const int start_h = cur_oh * sh - ph;
const int bh = max(0, start_h); const int bh = max(0, start_h);
const int eh = min(ih, start_h + kh); const int eh = min(ih, start_h + kh);
const int start_w = cur_ow * sw - pw; const int start_w = cur_ow * sw - pw;
const int bw = max(0, start_w); const int bw = max(0, start_w);
const int ew = min(iw, start_w + kw); const int ew = min(iw, start_w + kw);
const To scale = 1. / (kh * kw); const To scale = 1. / (kh * kw);
To res = 0; To res = 0;
switch (op) { switch (op) {
case GGML_OP_POOL_AVG: res = 0; break; case GGML_OP_POOL_AVG: res = 0; break;
case GGML_OP_POOL_MAX: res = -FLT_MAX; break; case GGML_OP_POOL_MAX: res = -FLT_MAX; break;
default: assert(false); default: assert(false);
} }
for (int i = bh; i < eh; i += 1) { for (int i = bh; i < eh; i += 1) {
for (int j = bw; j < ew; j += 1) { for (int j = bw; j < ew; j += 1) {
#if __CUDA_ARCH__ >= 350 #if __CUDA_ARCH__ >= 350
Ti cur = __ldg(i_ptr + i * iw + j); Ti cur = __ldg(i_ptr + i * iw + j);
#else #else
Ti cur = i_ptr[i * iw + j]; Ti cur = i_ptr[i * iw + j];
#endif #endif
switch (op) { switch (op) {
case GGML_OP_POOL_AVG: res += cur * scale; break; case GGML_OP_POOL_AVG: res += cur * scale; break;
case GGML_OP_POOL_MAX: res = max(res, (To)cur); break; case GGML_OP_POOL_MAX: res = max(res, (To)cur); break;
default: assert(false); default: assert(false);
} }
} }
} }
o_ptr[cur_oh * ow + cur_ow] = res; o_ptr[cur_oh * ow + cur_ow] = res;
} }
static void pool2d_nchw_kernel_f32_f32_cuda( static void pool2d_nchw_kernel_f32_f32_cuda(
const int ih, const int iw, const int oh, const int ow, const int ih, const int iw, const int oh, const int ow,
const int kh, const int kw, const int sh, const int sw, const int kh, const int kw, const int sh, const int sw,
const int ph, const int pw, const int parallel_elements, const int ph, const int pw, const int parallel_elements,
const float * src, float * dst, const enum ggml_op_pool op, const float * src, float * dst, const enum ggml_op_pool op,
cudaStream_t stream) { cudaStream_t stream) {
const int num_blocks = (parallel_elements + CUDA_POOL2D_BLOCK_SIZE - 1) / CUDA_POOL2D_BLOCK_SIZE; const int num_blocks = (parallel_elements + CUDA_POOL2D_BLOCK_SIZE - 1) / CUDA_POOL2D_BLOCK_SIZE;
dim3 block_nums(num_blocks); dim3 block_nums(num_blocks);
pool2d_nchw_kernel<<<block_nums, CUDA_POOL2D_BLOCK_SIZE, 0, stream>>>(ih, iw, oh, ow, kh, kw, sh, sw, ph, pw, parallel_elements, src, dst, op); pool2d_nchw_kernel<<<block_nums, CUDA_POOL2D_BLOCK_SIZE, 0, stream>>>(ih, iw, oh, ow, kh, kw, sh, sw, ph, pw, parallel_elements, src, dst, op);
} }
void ggml_cuda_op_pool2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { void ggml_cuda_op_pool2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src0 = dst->src[0];
const float * src0_d = (const float *)src0->data; const float * src0_d = (const float *)src0->data;
float * dst_d = (float *)dst->data; float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream(); cudaStream_t stream = ctx.stream();
GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32);
const int32_t * opts = (const int32_t *)dst->op_params; const int32_t * opts = (const int32_t *)dst->op_params;
enum ggml_op_pool op = static_cast<ggml_op_pool>(opts[0]); enum ggml_op_pool op = static_cast<ggml_op_pool>(opts[0]);
const int k0 = opts[1]; const int k0 = opts[1];
const int k1 = opts[2]; const int k1 = opts[2];
const int s0 = opts[3]; const int s0 = opts[3];
const int s1 = opts[4]; const int s1 = opts[4];
const int p0 = opts[5]; const int p0 = opts[5];
const int p1 = opts[6]; const int p1 = opts[6];
const int64_t IH = src0->ne[1]; const int64_t IH = src0->ne[1];
const int64_t IW = src0->ne[0]; const int64_t IW = src0->ne[0];
const int64_t N = dst->ne[3]; const int64_t N = dst->ne[3];
const int64_t OC = dst->ne[2]; const int64_t OC = dst->ne[2];
const int64_t OH = dst->ne[1]; const int64_t OH = dst->ne[1];
const int64_t OW = dst->ne[0]; const int64_t OW = dst->ne[0];
const int parallel_elements = N * OC * OH * OW; const int parallel_elements = N * OC * OH * OW;
pool2d_nchw_kernel_f32_f32_cuda(IH, IW, OH, OW, k1, k0, s1, s0, p1, p0, parallel_elements, src0_d, dst_d, op, stream); pool2d_nchw_kernel_f32_f32_cuda(IH, IW, OH, OW, k1, k0, s1, s0, p1, p0, parallel_elements, src0_d, dst_d, op, stream);
} }
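
pool2d_nchw_kernel above clamps each window to the input (start = out*stride - pad, end = start + kernel, both clipped to the input extents), then either averages with a fixed 1/(kh*kw) scale, so out-of-bounds taps count as zero, or takes the maximum. A hedged single-channel CPU reference of that logic follows; it is not part of this diff.

#include <algorithm>
#include <cfloat>

static void pool2d_ref(const float * src, float * dst,
                       int ih, int iw, int oh, int ow,
                       int kh, int kw, int sh, int sw, int ph, int pw,
                       bool op_avg) {                    // true: AVG, false: MAX
    for (int oy = 0; oy < oh; ++oy) {
        for (int ox = 0; ox < ow; ++ox) {
            const int y0 = std::max(0, oy*sh - ph), y1 = std::min(ih, oy*sh - ph + kh);
            const int x0 = std::max(0, ox*sw - pw), x1 = std::min(iw, ox*sw - pw + kw);
            float res = op_avg ? 0.0f : -FLT_MAX;
            for (int y = y0; y < y1; ++y) {
                for (int x = x0; x < x1; ++x) {
                    const float v = src[y*iw + x];
                    res = op_avg ? res + v / (kh*kw) : std::max(res, v);
                }
            }
            dst[oy*ow + ox] = res;
        }
    }
}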

View File

@@ -1,5 +1,5 @@
#include "common.cuh" #include "common.cuh"
#define CUDA_POOL2D_BLOCK_SIZE 256 #define CUDA_POOL2D_BLOCK_SIZE 256
void ggml_cuda_op_pool2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_pool2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

View File

@@ -1,23 +1,22 @@
#include "quantize.cuh" #include "quantize.cuh"
#include <cstdint>
static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int64_t kx, const int64_t kx0_padded) { static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int64_t kx, const int64_t kx_padded) {
const int64_t ix0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x; const int64_t ix = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
if (ix0 >= kx0_padded) { if (ix >= kx_padded) {
return; return;
} }
const int64_t ix1 = blockIdx.y; const int64_t iy = (int64_t)blockDim.y*blockIdx.y + threadIdx.y;
const int64_t i_padded = ix1*kx0_padded + ix0; const int64_t i_padded = (int64_t)iy*kx_padded + ix;
block_q8_1 * y = (block_q8_1 *) vy; block_q8_1 * y = (block_q8_1 *) vy;
const int64_t ib = i_padded / QK8_1; // block index const int64_t ib = i_padded / QK8_1; // block index
const int64_t iqs = i_padded % QK8_1; // quant index const int64_t iqs = i_padded % QK8_1; // quant index
const float xi = ix0 < kx ? x[ix1*kx + ix0] : 0.0f; const float xi = ix < kx ? x[iy*kx + ix] : 0.0f;
float amax = fabsf(xi); float amax = fabsf(xi);
float sum = xi; float sum = xi;
@@ -37,76 +36,10 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
reinterpret_cast<half&>(y[ib].ds.y) = sum; reinterpret_cast<half&>(y[ib].ds.y) = sum;
} }
template <bool need_sum> void quantize_row_q8_1_cuda(const float * x, void * vy, const int64_t kx, const int64_t ky, const int64_t kx_padded, cudaStream_t stream) {
static __global__ void quantize_mmq_q8_1( const int64_t block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
const float * __restrict__ x, void * __restrict__ vy, const int64_t kx0, const int64_t kx1, const int64_t kx0_padded) { const dim3 num_blocks(block_num_x, ky, 1);
const int64_t ix0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
if (ix0 >= kx0_padded) {
return;
}
const int64_t ix1 = kx1*blockIdx.z + blockIdx.y;
block_q8_1_mmq * y = (block_q8_1_mmq *) vy;
const int64_t ib0 = blockIdx.z*(gridDim.y*gridDim.x*blockDim.x/(4*QK8_1)); // first block of channel
const int64_t ib = ib0 + (ix0 / (4*QK8_1))*kx1 + blockIdx.y; // block index in channel
const int64_t iqs = ix0 % (4*QK8_1); // quant index in block
const float xi = ix0 < kx0 ? x[ix1*kx0 + ix0] : 0.0f;
float amax = fabsf(xi);
amax = warp_reduce_max(amax);
float sum;
if (need_sum) {
sum = warp_reduce_sum(xi);
}
const float d = amax / 127;
const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);
y[ib].qs[iqs] = q;
if (iqs % QK8_1 != 0) {
return;
}
if (need_sum) {
y[ib].ds[iqs/QK8_1] = make_half2(d, sum);
} else {
((float *) y[ib].ds)[iqs/QK8_1] = d;
}
}
void quantize_row_q8_1_cuda(
const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels,
const int64_t kx0_padded, const ggml_type type_x, cudaStream_t stream) {
GGML_ASSERT(kx0_padded % QK8_1 == 0);
const int64_t block_num_x = (kx0_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
const dim3 num_blocks(block_num_x, kx1*channels, 1);
const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1); const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1);
quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx0_padded); quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
GGML_UNUSED(type_x);
} }
void quantize_mmq_q8_1_cuda(
const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels,
const int64_t kx0_padded, const ggml_type type_x, cudaStream_t stream) {
GGML_ASSERT(kx0_padded % (4*QK8_1) == 0);
const int64_t block_num_x = (kx0_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
const dim3 num_blocks(block_num_x, kx1, channels);
const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1);
if (mmq_need_sum(type_x)) {
quantize_mmq_q8_1<true><<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded);
} else {
quantize_mmq_q8_1<false><<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded);
}
}
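For reference, the Q8_1 quantization arithmetic visible above is plain blockwise symmetric quantization: each block of QK8_1 values gets a scale d = amax / 127 and int8 quants q = round(x / d), with the block sum kept next to the scale (the real block_q8_1 packs d and sum into a half2, as the ds accesses above show). A CPU-side sketch of the same arithmetic, assuming QK8_1 == 32 as in ggml-common.h; this is an illustrative reimplementation, not code from the diff:

#include <cmath>
#include <cstdint>
#include <cstdio>

// Illustrative stand-in for the block_q8_1 payload: scale d, block sum s,
// and 32 int8 quants (the real struct stores d and s together as a half2).
struct q8_1_block_sketch {
    float  d;
    float  s;
    int8_t qs[32];
};

// Quantize one block of 32 floats the same way the kernel does per block.
static q8_1_block_sketch quantize_block_q8_1_sketch(const float * x) {
    float amax = 0.0f;
    float sum  = 0.0f;
    for (int i = 0; i < 32; ++i) {   // kernel uses warp_reduce_max / warp_reduce_sum here
        amax = std::fmax(amax, std::fabs(x[i]));
        sum += x[i];
    }
    q8_1_block_sketch b;
    b.d = amax / 127.0f;
    b.s = sum;
    for (int i = 0; i < 32; ++i) {
        b.qs[i] = amax == 0.0f ? 0 : (int8_t) std::roundf(x[i] / b.d);
    }
    return b;
}

int main() {
    float x[32];
    for (int i = 0; i < 32; ++i) x[i] = 0.01f * (float)(i - 16);
    const q8_1_block_sketch b = quantize_block_q8_1_sketch(x);
    std::printf("d=%f sum=%f q[0]=%d\n", b.d, b.s, (int) b.qs[0]);
    return 0;
}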

@@ -1,20 +1,5 @@
- #pragma once
#include "common.cuh"
- #include "mmq.cuh"
- #include <cstdint>
#define CUDA_QUANTIZE_BLOCK_SIZE 256
- typedef void (*quantize_cuda_t)(
- const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded,
- const ggml_type type_x, cudaStream_t stream);
- void quantize_row_q8_1_cuda(
- const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded,
- const ggml_type type_x, cudaStream_t stream);
+ void quantize_row_q8_1_cuda(const float * x, void * vy, const int64_t kx, const int64_t ky, const int64_t kx_padded, cudaStream_t stream);
- void quantize_mmq_q8_1_cuda(
- const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded,
- const ggml_type type_x, cudaStream_t stream);

@@ -1,5 +1,5 @@
#include "common.cuh"
#define CUDA_ROPE_BLOCK_SIZE 256
void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

@@ -1,5 +1,5 @@
#include "common.cuh"
#define CUDA_SCALE_BLOCK_SIZE 256
void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

@@ -1,5 +1,5 @@
#include "common.cuh"
#define CUDA_SOFT_MAX_BLOCK_SIZE 1024
void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

@@ -1,40 +1,40 @@
#include "sumrows.cuh"
static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) {
const int row = blockIdx.x;
const int col = threadIdx.x;
float sum = 0.0f;
for (int i = col; i < ncols; i += blockDim.x) {
sum += x[row * ncols + i];
}
sum = warp_reduce_sum(sum);
if (col == 0) {
dst[row] = sum;
}
}
static void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
const dim3 block_dims(WARP_SIZE, 1, 1);
const dim3 block_nums(nrows, 1, 1);
k_sum_rows_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
}
void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const float * src0_d = (const float *)src0->data;
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
GGML_ASSERT(ggml_is_contiguous(src0));
const int64_t ncols = src0->ne[0];
const int64_t nrows = ggml_nrows(src0);
sum_rows_f32_cuda(src0_d, dst_d, ncols, nrows, stream);
}
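k_sum_rows_f32 above launches one block of WARP_SIZE threads per row: each thread accumulates a strided slice of the row, warp_reduce_sum combines the partial sums, and lane 0 writes the total. The result is simply a per-row sum; an illustrative CPU reference (not from the diff) is:

#include <cstdint>

// Reference semantics of sum_rows: dst[row] = sum over i of x[row * ncols + i].
static void sum_rows_f32_ref(const float * x, float * dst, int64_t ncols, int64_t nrows) {
    for (int64_t row = 0; row < nrows; ++row) {
        float sum = 0.0f;
        for (int64_t i = 0; i < ncols; ++i) {
            sum += x[row * ncols + i];
        }
        dst[row] = sum;
    }
}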

@@ -1,3 +1,3 @@
#include "common.cuh"
void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

@@ -1,47 +1,47 @@
#include "tsembd.cuh"
static __global__ void timestep_embedding_f32(const float * timesteps, float * dst, const int nb1, const int dim, const int max_period) {
// blockIDx.y: idx of timesteps->ne[0]
// blockIDx.x: idx of ((dim + 1) / 2) / BLOCK_SIZE
int i = blockIdx.y;
int j = threadIdx.x + blockIdx.x * blockDim.x;
float * embed_data = (float *)((char *)dst + i*nb1);
if (dim % 2 != 0 && j == ((dim + 1) / 2)) {
embed_data[dim] = 0.f;
}
int half = dim / 2;
if (j >= half) {
return;
}
float timestep = timesteps[i];
float freq = (float)expf(-logf(max_period) * j / half);
float arg = timestep * freq;
embed_data[j] = cosf(arg);
embed_data[j + half] = sinf(arg);
}
static void timestep_embedding_f32_cuda(const float * x, float * dst, const int ne00, const int nb1,
const int dim, const int max_period, cudaStream_t stream) {
int half_ceil = (dim + 1) / 2;
int num_blocks = (half_ceil + CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE - 1) / CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE;
dim3 gridDim(num_blocks, ne00, 1);
timestep_embedding_f32<<<gridDim, CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE, 0, stream>>>(x, dst, nb1, dim, max_period);
}
void ggml_cuda_op_timestep_embedding(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const float * src0_d = (const float *)src0->data;
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
const int dim = dst->op_params[0];
const int max_period = dst->op_params[1];
timestep_embedding_f32_cuda(src0_d, dst_d, src0->ne[0], dst->nb[1], dim, max_period, stream);
}
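timestep_embedding_f32 above fills each row with the standard sinusoidal timestep embedding: for j < dim/2, freq = exp(-log(max_period) * j / half), embed[j] = cos(t * freq) and embed[j + half] = sin(t * freq), with a zero appended when dim is odd. A CPU sketch of one row, mirroring that math (illustrative only; assumes an even dim for simplicity):

#include <cmath>
#include <vector>

// One timestep-embedding row of length dim for timestep t (even dim assumed).
static std::vector<float> timestep_embedding_ref(float t, int dim, int max_period) {
    const int half = dim / 2;
    std::vector<float> embed(dim, 0.0f);
    for (int j = 0; j < half; ++j) {
        const float freq = std::exp(-std::log((float) max_period) * (float) j / (float) half);
        const float arg  = t * freq;
        embed[j]        = std::cos(arg);
        embed[j + half] = std::sin(arg);
    }
    return embed;
}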

@@ -1,5 +1,5 @@
#include "common.cuh"
#define CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE 256
void ggml_cuda_op_timestep_embedding(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

@@ -1,5 +1,5 @@
#include "common.cuh"
#define CUDA_UPSCALE_BLOCK_SIZE 256
void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

llama/ggml-impl.h vendored (2 lines changed)
@@ -1,5 +1,5 @@
/**
- * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
+ * llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*

@@ -1,5 +1,5 @@
/**
- * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
+ * llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*

llama/ggml-metal.h vendored (2 lines changed)
@@ -1,5 +1,5 @@
/**
- * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
+ * llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*

@@ -1,5 +1,5 @@
/**
- * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
+ * llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*

llama/ggml-quants.c vendored (2 lines changed)
@@ -1,5 +1,5 @@
/**
- * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
+ * llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*

llama/ggml-quants.h vendored (268 lines changed)
@@ -1,5 +1,5 @@
/**
- * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
+ * llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*
@@ -24,136 +24,136 @@
* SOFTWARE.
*/
#pragma once
#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "ggml.h"
// GGML internal header
#ifdef __cplusplus
extern "C" {
#endif
// Quantization
void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k);
void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k);
void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k);
void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k);
void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k);
void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k);
void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k);
void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k);
void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k);
void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k);
void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k);
void quantize_row_iq4_xs_reference (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k);
void quantize_row_iq3_s_reference (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k);
void quantize_row_iq2_s_reference (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k);
void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq3_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq2_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
// Dequantization
void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
//void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq1_m (const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq4_xs (const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
// Dot product
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq2_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq2_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq1_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq1_m (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
void iq2xs_init_impl(enum ggml_type type);
void iq2xs_free_impl(enum ggml_type type);
void iq3xs_init_impl(int grid_size);
void iq3xs_free_impl(int grid_size);
#ifdef __cplusplus
}
#endif
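The declarations above come in quantize/dequantize pairs; a minimal round-trip sketch for Q8_0 (illustrative, not from the diff; assumes the usual ggml block size QK8_0 == 32 from ggml-common.h and that the program is linked against the vendored ggml sources):

#include <cstdint>
#include <cstdio>
#include <vector>

#include "ggml-quants.h"   // the header shown above; pulls in block_q8_0 / QK8_0

int main() {
    const int64_t k = 64;                       // must be a multiple of QK8_0
    std::vector<float> src(k), out(k);
    for (int64_t i = 0; i < k; ++i) src[i] = 0.1f * (float) i;

    std::vector<block_q8_0> blocks(k / QK8_0);  // one block per QK8_0 values
    quantize_row_q8_0_reference(src.data(), blocks.data(), k);
    dequantize_row_q8_0(blocks.data(), out.data(), k);

    std::printf("src[5]=%f roundtrip[5]=%f\n", src[5], out[5]);
    return 0;
}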

llama/ggml.c vendored (2 lines changed)
@@ -1,5 +1,5 @@
/**
- * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
+ * llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*

llama/ggml.h vendored (2 lines changed)
@@ -1,5 +1,5 @@
/**
- * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
+ * llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*

@@ -1,5 +1,5 @@
/**
- * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
+ * llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*

@@ -1,5 +1,5 @@
/**
- * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
+ * llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*
@@ -24,32 +24,32 @@
* SOFTWARE.
*/
// Implements a parser for an extended Backus-Naur form (BNF), producing the
// binary context-free grammar format specified by llama.h. Supports character
// ranges, grouping, and repetition operators. As an example, a grammar for
// arithmetic might look like:
//
// root ::= expr
// expr ::= term ([-+*/] term)*
// term ::= num | "(" space expr ")" space
// num ::= [0-9]+ space
// space ::= [ \t\n]*
#pragma once
#include "llama.h"
#include <vector>
#include <map>
#include <cstdint>
#include <string>
namespace grammar_parser {
struct parse_state {
std::map<std::string, uint32_t> symbol_ids;
std::vector<std::vector<llama_grammar_element>> rules;
std::vector<const llama_grammar_element *> c_rules();
};
parse_state parse(const char * src);
void print_grammar(FILE * file, const parse_state & state);
}
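Given just the declarations above, a minimal usage sketch (illustrative; assumes it is compiled together with the vendored grammar-parser implementation and llama.h, and that the include path matches) parses the example grammar from the comment block and prints the compiled rules:

#include <cstdio>

#include "grammar-parser.h"   // header shown above; vendored path may differ

int main() {
    const char * src =
        "root  ::= expr\n"
        "expr  ::= term ([-+*/] term)*\n"
        "term  ::= num | \"(\" space expr \")\" space\n"
        "num   ::= [0-9]+ space\n"
        "space ::= [ \\t\\n]*\n";

    grammar_parser::parse_state state = grammar_parser::parse(src);
    if (state.rules.empty()) {   // treat an empty rule set as a parse failure (illustrative check)
        std::fprintf(stderr, "failed to parse grammar\n");
        return 1;
    }
    grammar_parser::print_grammar(stdout, state);
    return 0;
}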

@@ -1,5 +1,5 @@
/**
- * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
+ * llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*

@@ -1,5 +1,5 @@
/**
- * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
+ * llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*

File diff suppressed because it is too large

llama/llama.cpp vendored (10 lines changed)
@@ -1,5 +1,5 @@
/**
- * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
+ * llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*
@@ -15255,14 +15255,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
if (imatrix_data) {
LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
qs.has_imatrix = true;
- // check imatrix for nans or infs
- for (const auto & kv : *imatrix_data) {
- for (float f : kv.second) {
- if (!std::isfinite(f)) {
- throw std::runtime_error(format("imatrix contains non-finite value %f\n", f));
- }
- }
- }
}
}

@@ -2,12 +2,12 @@ package llama
// #cgo CFLAGS: -std=c11 -DNDEBUG -DLOG_DISABLE_LOGS
// #cgo CXXFLAGS: -std=c++11 -DNDEBUG -DLOG_DISABLE_LOGS
- // #cgo darwin,arm64 CFLAGS: -DGGML_USE_METAL -DGGML_USE_ACCELERATE -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
+ // #cgo darwin,arm64 CFLAGS: -DGGML_USE_METAL -DGGML_USE_ACCELERATE -DGGML_METAL_EMBED_LIBRARY -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
- // #cgo darwin,arm64 CXXFLAGS: -DGGML_USE_METAL -DGGML_USE_ACCELERATE -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
+ // #cgo darwin,arm64 CXXFLAGS: -DGGML_USE_METAL -DGGML_USE_ACCELERATE -DGGML_METAL_EMBED_LIBRARY -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
- // #cgo darwin,arm64 LDFLAGS: -framework Foundation -framework Metal -framework MetalKit -framework Accelerate
+ // #cgo darwin,arm64 LDFLAGS: ${SRCDIR}/ggml-metal.o -framework Foundation -framework Metal -framework MetalKit -framework Accelerate
// #cgo darwin,amd64 CFLAGS: -Wno-incompatible-pointer-types-discards-qualifiers
// #cgo darwin,amd64 CXXFLAGS: -Wno-incompatible-pointer-types-discards-qualifiers
- // #cgo darwin,amd64 LDFLAGS: -framework Foundation
+ // #cgo darwin,amd64 LDFLAGS: ${SRCDIR}/ggml-metal.o -framework Foundation
// #cgo darwin,amd64,avx2 CFLAGS: -DGGML_USE_ACCELERATE -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
// #cgo darwin,amd64,avx2 CXXFLAGS: -DGGML_USE_ACCELERATE -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
// #cgo darwin,amd64,avx2 LDFLAGS: -framework Accelerate

llama/llama.h vendored (2 lines changed)
@@ -1,5 +1,5 @@
/**
- * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
+ * llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*

llama/llava.cpp vendored (2 lines changed)
@@ -1,5 +1,5 @@
/**
- * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
+ * llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*

llama/llava.h vendored (102 lines changed)
@@ -1,5 +1,5 @@
/**
- * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
+ * llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*
@@ -24,53 +24,53 @@
* SOFTWARE.
*/
#ifndef LLAVA_H
#define LLAVA_H
#include "ggml.h"
#ifdef LLAMA_SHARED
# if defined(_WIN32) && !defined(__MINGW32__)
# ifdef LLAMA_BUILD
# define LLAVA_API __declspec(dllexport)
# else
# define LLAVA_API __declspec(dllimport)
# endif
# else
# define LLAVA_API __attribute__ ((visibility ("default")))
# endif
#else
# define LLAVA_API
#endif
struct clip_ctx;
#ifdef __cplusplus
extern "C" {
#endif
struct llava_image_embed {
float * embed;
int n_image_pos;
};
/** sanity check for clip <-> llava embed size match */
LLAVA_API bool llava_validate_embed_size(const struct llama_context * ctx_llama, const struct clip_ctx * ctx_clip);
LLAVA_API bool llava_image_embed_make_with_clip_img(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out);
/** build an image embed from image file bytes */
LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
/** build an image embed from a path to an image filename */
LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);
/** free an embedding made with llava_image_embed_make_* */
/** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */
LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past);
#ifdef __cplusplus
}
#endif
#endif
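A minimal consumer of the llava API above (illustrative only; assumes ctx_clip and ctx_llama were created elsewhere and that the program links against the vendored llava/llama sources) builds an embed from an image file, validates it, writes it into the llama context, and frees it:

#include <cstdio>

#include "llava.h"   // the header shown above
#include "llama.h"

// Illustrative flow only: creating ctx_clip / ctx_llama is out of scope here.
static bool eval_image_file(struct clip_ctx * ctx_clip, struct llama_context * ctx_llama,
                            const char * image_path, int n_threads, int n_batch) {
    if (!llava_validate_embed_size(ctx_llama, ctx_clip)) {
        std::fprintf(stderr, "clip and llama embedding sizes do not match\n");
        return false;
    }
    struct llava_image_embed * embed =
        llava_image_embed_make_with_filename(ctx_clip, n_threads, image_path);
    if (embed == nullptr) {
        return false;
    }
    int n_past = 0;
    const bool ok = llava_eval_image_embed(ctx_llama, embed, n_batch, &n_past);
    llava_image_embed_free(embed);
    return ok;   // on success, n_past now points past the image embed
}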

llama/log.h vendored (2 lines changed)
@@ -1,5 +1,5 @@
/**
- * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
+ * llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*

llama/sampling.cpp vendored (2 lines changed)
@@ -1,5 +1,5 @@
/**
- * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
+ * llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*

llama/sampling.h vendored (2 lines changed)
@@ -1,5 +1,5 @@
/**
- * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
+ * llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*

llama/stb_image.h vendored (16794 lines changed)
File diff suppressed because it is too large

@@ -1,5 +1,5 @@
/**
- * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
+ * llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*

@@ -1,5 +1,5 @@
/**
- * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
+ * llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*

llama/unicode.cpp vendored (2 lines changed)
@@ -1,5 +1,5 @@
/**
- * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
+ * llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*

llama/unicode.h vendored (2 lines changed)
@@ -1,5 +1,5 @@
/**
- * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
+ * llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*