diff --git a/llama/.gitignore b/llama/.gitignore index 9710205b..4204cdea 100644 --- a/llama/.gitignore +++ b/llama/.gitignore @@ -3,3 +3,4 @@ *.lib *.exp *.dll +*.o \ No newline at end of file diff --git a/llama/base64.hpp b/llama/base64.hpp index 563247a6..04df58e8 100644 --- a/llama/base64.hpp +++ b/llama/base64.hpp @@ -1,392 +1,392 @@ -/* -This is free and unencumbered software released into the public domain. - -Anyone is free to copy, modify, publish, use, compile, sell, or -distribute this software, either in source code form or as a compiled -binary, for any purpose, commercial or non-commercial, and by any -means. - -In jurisdictions that recognize copyright laws, the author or authors -of this software dedicate any and all copyright interest in the -software to the public domain. We make this dedication for the benefit -of the public at large and to the detriment of our heirs and -successors. We intend this dedication to be an overt act of -relinquishment in perpetuity of all present and future rights to this -software under copyright law. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR -OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -OTHER DEALINGS IN THE SOFTWARE. - -For more information, please refer to -*/ - -#ifndef PUBLIC_DOMAIN_BASE64_HPP_ -#define PUBLIC_DOMAIN_BASE64_HPP_ - -#include -#include -#include -#include - -class base64_error : public std::runtime_error -{ -public: - using std::runtime_error::runtime_error; -}; - -class base64 -{ -public: - enum class alphabet - { - /** the alphabet is detected automatically */ - auto_, - /** the standard base64 alphabet is used */ - standard, - /** like `standard` except that the characters `+` and `/` are replaced by `-` and `_` respectively*/ - url_filename_safe - }; - - enum class decoding_behavior - { - /** if the input is not padded, the remaining bits are ignored */ - moderate, - /** if a padding character is encounter decoding is finished */ - loose - }; - - /** - Encodes all the elements from `in_begin` to `in_end` to `out`. - - @warning The source and destination cannot overlap. The destination must be able to hold at least - `required_encode_size(std::distance(in_begin, in_end))`, otherwise the behavior depends on the output iterator. - - @tparam Input_iterator the source; the returned elements are cast to `std::uint8_t` and should not be greater than - 8 bits - @tparam Output_iterator the destination; the elements written to it are from the type `char` - @param in_begin the beginning of the source - @param in_end the ending of the source - @param out the destination iterator - @param alphabet which alphabet should be used - @returns the iterator to the next element past the last element copied - @throws see `Input_iterator` and `Output_iterator` - */ - template - static Output_iterator encode(Input_iterator in_begin, Input_iterator in_end, Output_iterator out, - alphabet alphabet = alphabet::standard) - { - constexpr auto pad = '='; - const char* alpha = alphabet == alphabet::url_filename_safe - ? "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_" - : "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; - - while (in_begin != in_end) { - std::uint8_t i0 = 0, i1 = 0, i2 = 0; - - // first character - i0 = static_cast(*in_begin); - ++in_begin; - - *out = alpha[i0 >> 2 & 0x3f]; - ++out; - - // part of first character and second - if (in_begin != in_end) { - i1 = static_cast(*in_begin); - ++in_begin; - - *out = alpha[((i0 & 0x3) << 4) | (i1 >> 4 & 0x0f)]; - ++out; - } else { - *out = alpha[(i0 & 0x3) << 4]; - ++out; - - // last padding - *out = pad; - ++out; - - // last padding - *out = pad; - ++out; - - break; - } - - // part of second character and third - if (in_begin != in_end) { - i2 = static_cast(*in_begin); - ++in_begin; - - *out = alpha[((i1 & 0xf) << 2) | (i2 >> 6 & 0x03)]; - ++out; - } else { - *out = alpha[(i1 & 0xf) << 2]; - ++out; - - // last padding - *out = pad; - ++out; - - break; - } - - // rest of third - *out = alpha[i2 & 0x3f]; - ++out; - } - - return out; - } - /** - Encodes a string. - - @param str the string that should be encoded - @param alphabet which alphabet should be used - @returns the encoded base64 string - @throws see base64::encode() - */ - static std::string encode(const std::string& str, alphabet alphabet = alphabet::standard) - { - std::string result; - - result.reserve(required_encode_size(str.length()) + 1); - - encode(str.begin(), str.end(), std::back_inserter(result), alphabet); - - return result; - } - /** - Encodes a char array. - - @param buffer the char array - @param size the size of the array - @param alphabet which alphabet should be used - @returns the encoded string - */ - static std::string encode(const char* buffer, std::size_t size, alphabet alphabet = alphabet::standard) - { - std::string result; - - result.reserve(required_encode_size(size) + 1); - - encode(buffer, buffer + size, std::back_inserter(result), alphabet); - - return result; - } - /** - Decodes all the elements from `in_begin` to `in_end` to `out`. `in_begin` may point to the same location as `out`, - in other words: inplace decoding is possible. - - @warning The destination must be able to hold at least `required_decode_size(std::distance(in_begin, in_end))`, - otherwise the behavior depends on the output iterator. - - @tparam Input_iterator the source; the returned elements are cast to `char` - @tparam Output_iterator the destination; the elements written to it are from the type `std::uint8_t` - @param in_begin the beginning of the source - @param in_end the ending of the source - @param out the destination iterator - @param alphabet which alphabet should be used - @param behavior the behavior when an error was detected - @returns the iterator to the next element past the last element copied - @throws base64_error depending on the set behavior - @throws see `Input_iterator` and `Output_iterator` - */ - template - static Output_iterator decode(Input_iterator in_begin, Input_iterator in_end, Output_iterator out, - alphabet alphabet = alphabet::auto_, - decoding_behavior behavior = decoding_behavior::moderate) - { - //constexpr auto pad = '='; - std::uint8_t last = 0; - auto bits = 0; - - while (in_begin != in_end) { - auto c = *in_begin; - ++in_begin; - - if (c == '=') { - break; - } - - auto part = _base64_value(alphabet, c); - - // enough bits for one byte - if (bits + 6 >= 8) { - *out = (last << (8 - bits)) | (part >> (bits - 2)); - ++out; - - bits -= 2; - } else { - bits += 6; - } - - last = part; - } - - // check padding - if (behavior != decoding_behavior::loose) { - while (in_begin != in_end) { - auto c = *in_begin; - ++in_begin; - - if (c != '=') { - throw base64_error("invalid base64 character."); - } - } - } - - return out; - } - /** - Decodes a string. - - @param str the base64 encoded string - @param alphabet which alphabet should be used - @param behavior the behavior when an error was detected - @returns the decoded string - @throws see base64::decode() - */ - static std::string decode(const std::string& str, alphabet alphabet = alphabet::auto_, - decoding_behavior behavior = decoding_behavior::moderate) - { - std::string result; - - result.reserve(max_decode_size(str.length())); - - decode(str.begin(), str.end(), std::back_inserter(result), alphabet, behavior); - - return result; - } - /** - Decodes a string. - - @param buffer the base64 encoded buffer - @param size the size of the buffer - @param alphabet which alphabet should be used - @param behavior the behavior when an error was detected - @returns the decoded string - @throws see base64::decode() - */ - static std::string decode(const char* buffer, std::size_t size, alphabet alphabet = alphabet::auto_, - decoding_behavior behavior = decoding_behavior::moderate) - { - std::string result; - - result.reserve(max_decode_size(size)); - - decode(buffer, buffer + size, std::back_inserter(result), alphabet, behavior); - - return result; - } - /** - Decodes a string inplace. - - @param[in,out] str the base64 encoded string - @param alphabet which alphabet should be used - @param behavior the behavior when an error was detected - @throws base64::decode_inplace() - */ - static void decode_inplace(std::string& str, alphabet alphabet = alphabet::auto_, - decoding_behavior behavior = decoding_behavior::moderate) - { - str.resize(decode(str.begin(), str.end(), str.begin(), alphabet, behavior) - str.begin()); - } - /** - Decodes a char array inplace. - - @param[in,out] str the string array - @param size the length of the array - @param alphabet which alphabet should be used - @param behavior the behavior when an error was detected - @returns the pointer to the next element past the last element decoded - @throws base64::decode_inplace() - */ - static char* decode_inplace(char* str, std::size_t size, alphabet alphabet = alphabet::auto_, - decoding_behavior behavior = decoding_behavior::moderate) - { - return decode(str, str + size, str, alphabet, behavior); - } - /** - Returns the required decoding size for a given size. The value is calculated with the following formula: - - $$ - \lceil \frac{size}{4} \rceil \cdot 3 - $$ - - @param size the size of the encoded input - @returns the size of the resulting decoded buffer; this the absolute maximum - */ - static std::size_t max_decode_size(std::size_t size) noexcept - { - return (size / 4 + (size % 4 ? 1 : 0)) * 3; - } - /** - Returns the required encoding size for a given size. The value is calculated with the following formula: - - $$ - \lceil \frac{size}{3} \rceil \cdot 4 - $$ - - @param size the size of the decoded input - @returns the size of the resulting encoded buffer - */ - static std::size_t required_encode_size(std::size_t size) noexcept - { - return (size / 3 + (size % 3 ? 1 : 0)) * 4; - } - -private: - static std::uint8_t _base64_value(alphabet& alphabet, char c) - { - if (c >= 'A' && c <= 'Z') { - return c - 'A'; - } else if (c >= 'a' && c <= 'z') { - return c - 'a' + 26; - } else if (c >= '0' && c <= '9') { - return c - '0' + 52; - } - - // comes down to alphabet - if (alphabet == alphabet::standard) { - if (c == '+') { - return 62; - } else if (c == '/') { - return 63; - } - } else if (alphabet == alphabet::url_filename_safe) { - if (c == '-') { - return 62; - } else if (c == '_') { - return 63; - } - } // auto detect - else { - if (c == '+') { - alphabet = alphabet::standard; - - return 62; - } else if (c == '/') { - alphabet = alphabet::standard; - - return 63; - } else if (c == '-') { - alphabet = alphabet::url_filename_safe; - - return 62; - } else if (c == '_') { - alphabet = alphabet::url_filename_safe; - - return 63; - } - } - - throw base64_error("invalid base64 character."); - } -}; - -#endif // !PUBLIC_DOMAIN_BASE64_HPP_ +/* +This is free and unencumbered software released into the public domain. + +Anyone is free to copy, modify, publish, use, compile, sell, or +distribute this software, either in source code form or as a compiled +binary, for any purpose, commercial or non-commercial, and by any +means. + +In jurisdictions that recognize copyright laws, the author or authors +of this software dedicate any and all copyright interest in the +software to the public domain. We make this dedication for the benefit +of the public at large and to the detriment of our heirs and +successors. We intend this dedication to be an overt act of +relinquishment in perpetuity of all present and future rights to this +software under copyright law. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. + +For more information, please refer to +*/ + +#ifndef PUBLIC_DOMAIN_BASE64_HPP_ +#define PUBLIC_DOMAIN_BASE64_HPP_ + +#include +#include +#include +#include + +class base64_error : public std::runtime_error +{ +public: + using std::runtime_error::runtime_error; +}; + +class base64 +{ +public: + enum class alphabet + { + /** the alphabet is detected automatically */ + auto_, + /** the standard base64 alphabet is used */ + standard, + /** like `standard` except that the characters `+` and `/` are replaced by `-` and `_` respectively*/ + url_filename_safe + }; + + enum class decoding_behavior + { + /** if the input is not padded, the remaining bits are ignored */ + moderate, + /** if a padding character is encounter decoding is finished */ + loose + }; + + /** + Encodes all the elements from `in_begin` to `in_end` to `out`. + + @warning The source and destination cannot overlap. The destination must be able to hold at least + `required_encode_size(std::distance(in_begin, in_end))`, otherwise the behavior depends on the output iterator. + + @tparam Input_iterator the source; the returned elements are cast to `std::uint8_t` and should not be greater than + 8 bits + @tparam Output_iterator the destination; the elements written to it are from the type `char` + @param in_begin the beginning of the source + @param in_end the ending of the source + @param out the destination iterator + @param alphabet which alphabet should be used + @returns the iterator to the next element past the last element copied + @throws see `Input_iterator` and `Output_iterator` + */ + template + static Output_iterator encode(Input_iterator in_begin, Input_iterator in_end, Output_iterator out, + alphabet alphabet = alphabet::standard) + { + constexpr auto pad = '='; + const char* alpha = alphabet == alphabet::url_filename_safe + ? "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_" + : "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; + + while (in_begin != in_end) { + std::uint8_t i0 = 0, i1 = 0, i2 = 0; + + // first character + i0 = static_cast(*in_begin); + ++in_begin; + + *out = alpha[i0 >> 2 & 0x3f]; + ++out; + + // part of first character and second + if (in_begin != in_end) { + i1 = static_cast(*in_begin); + ++in_begin; + + *out = alpha[((i0 & 0x3) << 4) | (i1 >> 4 & 0x0f)]; + ++out; + } else { + *out = alpha[(i0 & 0x3) << 4]; + ++out; + + // last padding + *out = pad; + ++out; + + // last padding + *out = pad; + ++out; + + break; + } + + // part of second character and third + if (in_begin != in_end) { + i2 = static_cast(*in_begin); + ++in_begin; + + *out = alpha[((i1 & 0xf) << 2) | (i2 >> 6 & 0x03)]; + ++out; + } else { + *out = alpha[(i1 & 0xf) << 2]; + ++out; + + // last padding + *out = pad; + ++out; + + break; + } + + // rest of third + *out = alpha[i2 & 0x3f]; + ++out; + } + + return out; + } + /** + Encodes a string. + + @param str the string that should be encoded + @param alphabet which alphabet should be used + @returns the encoded base64 string + @throws see base64::encode() + */ + static std::string encode(const std::string& str, alphabet alphabet = alphabet::standard) + { + std::string result; + + result.reserve(required_encode_size(str.length()) + 1); + + encode(str.begin(), str.end(), std::back_inserter(result), alphabet); + + return result; + } + /** + Encodes a char array. + + @param buffer the char array + @param size the size of the array + @param alphabet which alphabet should be used + @returns the encoded string + */ + static std::string encode(const char* buffer, std::size_t size, alphabet alphabet = alphabet::standard) + { + std::string result; + + result.reserve(required_encode_size(size) + 1); + + encode(buffer, buffer + size, std::back_inserter(result), alphabet); + + return result; + } + /** + Decodes all the elements from `in_begin` to `in_end` to `out`. `in_begin` may point to the same location as `out`, + in other words: inplace decoding is possible. + + @warning The destination must be able to hold at least `required_decode_size(std::distance(in_begin, in_end))`, + otherwise the behavior depends on the output iterator. + + @tparam Input_iterator the source; the returned elements are cast to `char` + @tparam Output_iterator the destination; the elements written to it are from the type `std::uint8_t` + @param in_begin the beginning of the source + @param in_end the ending of the source + @param out the destination iterator + @param alphabet which alphabet should be used + @param behavior the behavior when an error was detected + @returns the iterator to the next element past the last element copied + @throws base64_error depending on the set behavior + @throws see `Input_iterator` and `Output_iterator` + */ + template + static Output_iterator decode(Input_iterator in_begin, Input_iterator in_end, Output_iterator out, + alphabet alphabet = alphabet::auto_, + decoding_behavior behavior = decoding_behavior::moderate) + { + //constexpr auto pad = '='; + std::uint8_t last = 0; + auto bits = 0; + + while (in_begin != in_end) { + auto c = *in_begin; + ++in_begin; + + if (c == '=') { + break; + } + + auto part = _base64_value(alphabet, c); + + // enough bits for one byte + if (bits + 6 >= 8) { + *out = (last << (8 - bits)) | (part >> (bits - 2)); + ++out; + + bits -= 2; + } else { + bits += 6; + } + + last = part; + } + + // check padding + if (behavior != decoding_behavior::loose) { + while (in_begin != in_end) { + auto c = *in_begin; + ++in_begin; + + if (c != '=') { + throw base64_error("invalid base64 character."); + } + } + } + + return out; + } + /** + Decodes a string. + + @param str the base64 encoded string + @param alphabet which alphabet should be used + @param behavior the behavior when an error was detected + @returns the decoded string + @throws see base64::decode() + */ + static std::string decode(const std::string& str, alphabet alphabet = alphabet::auto_, + decoding_behavior behavior = decoding_behavior::moderate) + { + std::string result; + + result.reserve(max_decode_size(str.length())); + + decode(str.begin(), str.end(), std::back_inserter(result), alphabet, behavior); + + return result; + } + /** + Decodes a string. + + @param buffer the base64 encoded buffer + @param size the size of the buffer + @param alphabet which alphabet should be used + @param behavior the behavior when an error was detected + @returns the decoded string + @throws see base64::decode() + */ + static std::string decode(const char* buffer, std::size_t size, alphabet alphabet = alphabet::auto_, + decoding_behavior behavior = decoding_behavior::moderate) + { + std::string result; + + result.reserve(max_decode_size(size)); + + decode(buffer, buffer + size, std::back_inserter(result), alphabet, behavior); + + return result; + } + /** + Decodes a string inplace. + + @param[in,out] str the base64 encoded string + @param alphabet which alphabet should be used + @param behavior the behavior when an error was detected + @throws base64::decode_inplace() + */ + static void decode_inplace(std::string& str, alphabet alphabet = alphabet::auto_, + decoding_behavior behavior = decoding_behavior::moderate) + { + str.resize(decode(str.begin(), str.end(), str.begin(), alphabet, behavior) - str.begin()); + } + /** + Decodes a char array inplace. + + @param[in,out] str the string array + @param size the length of the array + @param alphabet which alphabet should be used + @param behavior the behavior when an error was detected + @returns the pointer to the next element past the last element decoded + @throws base64::decode_inplace() + */ + static char* decode_inplace(char* str, std::size_t size, alphabet alphabet = alphabet::auto_, + decoding_behavior behavior = decoding_behavior::moderate) + { + return decode(str, str + size, str, alphabet, behavior); + } + /** + Returns the required decoding size for a given size. The value is calculated with the following formula: + + $$ + \lceil \frac{size}{4} \rceil \cdot 3 + $$ + + @param size the size of the encoded input + @returns the size of the resulting decoded buffer; this the absolute maximum + */ + static std::size_t max_decode_size(std::size_t size) noexcept + { + return (size / 4 + (size % 4 ? 1 : 0)) * 3; + } + /** + Returns the required encoding size for a given size. The value is calculated with the following formula: + + $$ + \lceil \frac{size}{3} \rceil \cdot 4 + $$ + + @param size the size of the decoded input + @returns the size of the resulting encoded buffer + */ + static std::size_t required_encode_size(std::size_t size) noexcept + { + return (size / 3 + (size % 3 ? 1 : 0)) * 4; + } + +private: + static std::uint8_t _base64_value(alphabet& alphabet, char c) + { + if (c >= 'A' && c <= 'Z') { + return c - 'A'; + } else if (c >= 'a' && c <= 'z') { + return c - 'a' + 26; + } else if (c >= '0' && c <= '9') { + return c - '0' + 52; + } + + // comes down to alphabet + if (alphabet == alphabet::standard) { + if (c == '+') { + return 62; + } else if (c == '/') { + return 63; + } + } else if (alphabet == alphabet::url_filename_safe) { + if (c == '-') { + return 62; + } else if (c == '_') { + return 63; + } + } // auto detect + else { + if (c == '+') { + alphabet = alphabet::standard; + + return 62; + } else if (c == '/') { + alphabet = alphabet::standard; + + return 63; + } else if (c == '-') { + alphabet = alphabet::url_filename_safe; + + return 62; + } else if (c == '_') { + alphabet = alphabet::url_filename_safe; + + return 63; + } + } + + throw base64_error("invalid base64 character."); + } +}; + +#endif // !PUBLIC_DOMAIN_BASE64_HPP_ diff --git a/llama/build-info.cpp b/llama/build-info.cpp index 348fda3f..e6eab288 100644 --- a/llama/build-info.cpp +++ b/llama/build-info.cpp @@ -1,5 +1,5 @@ /** - * llama.cpp - git d5c938cd7716b9a2ace49a43a469dfbffcff4d28 + * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c * * MIT License * diff --git a/llama/build_cuda.sh b/llama/build_cuda.sh index c2f0ec03..bd9621bb 100755 --- a/llama/build_cuda.sh +++ b/llama/build_cuda.sh @@ -9,7 +9,7 @@ else fi nvcc \ - -t 12 \ + -t $(nproc) \ --generate-code=arch=compute_50,code=[compute_50,sm_50] \ --generate-code=arch=compute_52,code=[compute_52,sm_52] \ --generate-code=arch=compute_61,code=[compute_61,sm_61] \ @@ -30,9 +30,18 @@ nvcc \ -use_fast_math \ -link \ -shared \ - -fPIC \ -I. \ -lcuda -lcublas -lcudart -lcublasLt \ -O3 \ -o $output \ - ggml-cuda.cu ggml-cuda/*.cu ggml.c ggml-backend.c ggml-alloc.c ggml-quants.c sgemm.cpp + ggml-cuda.cu \ + ggml-cuda/*.cu \ + ggml-cuda/template-instances/fattn-wmma*.cu \ + ggml-cuda/template-instances/mmq*.cu \ + ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu \ + ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu \ + ggml-cuda/template-instances/fattn-vec*f16-f16.cu \ + ggml.c ggml-backend.c ggml-alloc.c ggml-quants.c sgemm.cpp + +# -DGGML_CUDA_USE_GRAPHS=1 +# -DGGML_CUDA_FA_ALL_QUANTS=1 \ No newline at end of file diff --git a/llama/build_hipblas.sh b/llama/build_hipblas.sh index 15ed0cbc..51eb2419 100755 --- a/llama/build_hipblas.sh +++ b/llama/build_hipblas.sh @@ -26,7 +26,7 @@ additional_flags="" if [[ "$os" == "Windows_NT" || "$os" == "MINGW64_NT"* ]]; then output="ggml-hipblas.dll" - additional_flags=" -Xclang --dependent-lib=msvcrt -Wl,/subsystem:console" + additional_flags=" -Xclang --dependent-lib=msvcrt" else output="libggml-hipblas.so" archs+=("${linux_archs[@]}") @@ -36,37 +36,61 @@ for arch in "${archs[@]}"; do additional_flags+=" --offload-arch=$arch" done -hipcc \ - -v \ - -parallel-jobs=12 \ - -O3 \ - -DGGML_USE_CUDA \ - -DGGML_BUILD=1 \ - -DGGML_SHARED=1 \ - -DGGML_CUDA_DMMV_X=32 \ - -DGGML_CUDA_MMV_Y=1 \ - -DGGML_SCHED_MAX_COPIES=4 \ - -DGGML_USE_HIPBLAS \ - -DGGML_USE_LLAMAFILE \ - -DHIP_FAST_MATH \ - -DNDEBUG \ - -DK_QUANTS_PER_ITERATION=2 \ - -D_CRT_SECURE_NO_WARNINGS \ - -DCMAKE_POSITION_INDEPENDENT_CODE=on \ - -D_GNU_SOURCE \ - -Wno-expansion-to-defined \ - -Wno-invalid-noreturn \ - -Wno-ignored-attributes \ - -Wno-pass-failed \ - -Wno-deprecated-declarations \ - -Wno-unused-result \ - -I. \ - -lhipblas -lamdhip64 -lrocblas \ - -shared \ - $additional_flags \ - -o $output \ - ggml-cuda.cu ggml-cuda/*.cu ggml.c ggml-backend.c ggml-alloc.c ggml-quants.c sgemm.cpp +# Create an array of all source files, expanding globs +sources=( + $(echo ggml-cuda/template-instances/fattn-wmma*.cu) + $(echo ggml-cuda/template-instances/mmq*.cu) + $(echo ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu) + $(echo ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu) + $(echo ggml-cuda/template-instances/fattn-vec*f16-f16.cu) + ggml-cuda.cu + $(echo ggml-cuda/*.cu) + ggml.c + ggml-backend.c + ggml-alloc.c + ggml-quants.c + sgemm.cpp +) - # -D_DLL \ - # -D_MT \ - # -D_XOPEN_SOURCE=600 \ +# Function to compile a single source file +compile_source() { + src="$1" + hipcc -c -O3 -DGGML_USE_CUDA -DGGML_BUILD=1 -DGGML_SHARED=1 -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 \ + -DGGML_SCHED_MAX_COPIES=4 -DGGML_USE_HIPBLAS -DGGML_USE_LLAMAFILE -DHIP_FAST_MATH -DNDEBUG \ + -DK_QUANTS_PER_ITERATION=2 -D_CRT_SECURE_NO_WARNINGS -DCMAKE_POSITION_INDEPENDENT_CODE=on \ + -D_GNU_SOURCE -Wno-expansion-to-defined -Wno-invalid-noreturn -Wno-ignored-attributes -Wno-pass-failed \ + -Wno-deprecated-declarations -Wno-unused-result -I. \ + $additional_flags -o "${src%.cu}.o" "$src" +} + +# Function to handle Ctrl+C +cleanup() { + echo "Terminating all background processes..." + kill 0 +} + +# Set trap to handle SIGINT (Ctrl+C) +trap cleanup SIGINT + +# Limit the number of concurrent jobs +max_jobs=$(nproc) +job_count=0 + +for src in "${sources[@]}"; do + echo "$src" + compile_source "$src" & + job_count=$((job_count + 1)) + if [[ $job_count -ge $max_jobs ]]; then + wait -n + job_count=$((job_count - 1)) + fi +done + +wait + +# Link all object files into a shared library +echo "Linking object files..." +hipcc -v -shared -o $output *.o ggml-cuda/*.o ggml-cuda/template-instances/*.o -lhipblas -lamdhip64 -lrocblas + +# Clean up object files after linking +rm -f *.o ggml-cuda/*.o ggml-cuda/template-instances/*.o \ No newline at end of file diff --git a/llama/clip.cpp b/llama/clip.cpp index e6229af0..76fa44f2 100644 --- a/llama/clip.cpp +++ b/llama/clip.cpp @@ -1,5 +1,5 @@ /** - * llama.cpp - git d5c938cd7716b9a2ace49a43a469dfbffcff4d28 + * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c * * MIT License * diff --git a/llama/clip.h b/llama/clip.h index d6c28c5a..b105adf2 100644 --- a/llama/clip.h +++ b/llama/clip.h @@ -1,5 +1,5 @@ /** - * llama.cpp - git d5c938cd7716b9a2ace49a43a469dfbffcff4d28 + * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c * * MIT License * diff --git a/llama/common.cpp b/llama/common.cpp index dd8bcee0..2fe2ca98 100644 --- a/llama/common.cpp +++ b/llama/common.cpp @@ -1,5 +1,5 @@ /** - * llama.cpp - git d5c938cd7716b9a2ace49a43a469dfbffcff4d28 + * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c * * MIT License * @@ -226,19 +226,13 @@ void gpt_params_handle_model_default(gpt_params & params) { } params.hf_file = params.model; } else if (params.model.empty()) { - std::string cache_directory = fs_get_cache_directory(); - const bool success = fs_create_directory_with_parents(cache_directory); - if (!success) { - throw std::runtime_error("failed to create cache directory: " + cache_directory); - } - params.model = cache_directory + string_split(params.hf_file, '/').back(); + params.model = fs_get_cache_file(string_split(params.hf_file, '/').back()); } } else if (!params.model_url.empty()) { if (params.model.empty()) { auto f = string_split(params.model_url, '#').front(); f = string_split(f, '?').front(); - f = string_split(f, '/').back(); - params.model = "models/" + f; + params.model = fs_get_cache_file(string_split(f, '/').back()); } } else if (params.model.empty()) { params.model = DEFAULT_MODEL_PATH; @@ -1517,6 +1511,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.chat_template = argv[i]; return true; } + if (arg == "--slot-prompt-similarity" || arg == "-sps") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.slot_prompt_similarity = std::stof(argv[i]); + return true; + } if (arg == "-pps") { params.is_pp_shared = true; return true; @@ -1939,6 +1941,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param "set custom jinja chat template (default: template taken from model's metadata)\n" "only commonly used templates are accepted:\n" "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" }); + options.push_back({ "server", "-sps, --slot-prompt-similarity SIMILARITY", + "how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity }); #ifndef LOG_DISABLE_LOGS options.push_back({ "logging" }); @@ -2295,6 +2299,16 @@ std::string fs_get_cache_directory() { return ensure_trailing_slash(cache_directory); } +std::string fs_get_cache_file(const std::string & filename) { + GGML_ASSERT(filename.find(DIRECTORY_SEPARATOR) == std::string::npos); + std::string cache_directory = fs_get_cache_directory(); + const bool success = fs_create_directory_with_parents(cache_directory); + if (!success) { + throw std::runtime_error("failed to create cache directory: " + cache_directory); + } + return cache_directory + filename; +} + // // Model utils diff --git a/llama/common.h b/llama/common.h index 913d8c2b..e64513ae 100644 --- a/llama/common.h +++ b/llama/common.h @@ -1,5 +1,5 @@ /** - * llama.cpp - git d5c938cd7716b9a2ace49a43a469dfbffcff4d28 + * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c * * MIT License * @@ -229,6 +229,8 @@ struct gpt_params { std::string slot_save_path; + float slot_prompt_similarity = 0.5f; + // batched-bench params bool is_pp_shared = false; @@ -301,6 +303,7 @@ bool fs_validate_filename(const std::string & filename); bool fs_create_directory_with_parents(const std::string & path); std::string fs_get_cache_directory(); +std::string fs_get_cache_file(const std::string & filename); // // Model utils diff --git a/llama/ggml-alloc.c b/llama/ggml-alloc.c index 8d296910..3a538098 100644 --- a/llama/ggml-alloc.c +++ b/llama/ggml-alloc.c @@ -1,5 +1,5 @@ /** - * llama.cpp - git d5c938cd7716b9a2ace49a43a469dfbffcff4d28 + * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c * * MIT License * diff --git a/llama/ggml-alloc.h b/llama/ggml-alloc.h index b3647884..ecceb044 100644 --- a/llama/ggml-alloc.h +++ b/llama/ggml-alloc.h @@ -1,5 +1,5 @@ /** - * llama.cpp - git d5c938cd7716b9a2ace49a43a469dfbffcff4d28 + * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c * * MIT License * @@ -24,79 +24,79 @@ * SOFTWARE. */ -#pragma once - -#include "ggml.h" - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t; -typedef struct ggml_backend_buffer * ggml_backend_buffer_t; -typedef struct ggml_backend * ggml_backend_t; - -// Tensor allocator -struct ggml_tallocr { - ggml_backend_buffer_t buffer; - void * base; - size_t alignment; - size_t offset; -}; - -GGML_API struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer); -GGML_API void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor); - -// Graph allocator -/* - Example usage: - ggml_gallocr_t galloc = ggml_gallocr_new(ggml_bacckend_cpu_buffer_type()); - - // optional: create a worst-case graph and reserve the buffers to avoid reallocations - ggml_gallocr_reserve(galloc, build_graph(max_batch)); - - // allocate the graph - struct ggml_cgraph * graph = build_graph(batch); - ggml_gallocr_alloc_graph(galloc, graph); - - printf("compute buffer size: %zu bytes\n", ggml_gallocr_get_buffer_size(galloc, 0)); - - // evaluate the graph - ggml_backend_graph_compute(backend, graph); -*/ - -// special tensor flags for use with the graph allocator: -// ggml_set_input(): all input tensors are allocated at the beginning of the graph in non-overlapping addresses -// ggml_set_output(): output tensors are never freed and never overwritten - -typedef struct ggml_gallocr * ggml_gallocr_t; - -GGML_API ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft); -GGML_API ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs); -GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc); - -// pre-allocate buffers from a measure graph - does not allocate or modify the graph -// call with a worst-case graph to avoid buffer reallocations -// not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed -// returns false if the buffer allocation failed -GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph); -GGML_API bool ggml_gallocr_reserve_n( - ggml_gallocr_t galloc, - struct ggml_cgraph * graph, - const int * node_buffer_ids, - const int * leaf_buffer_ids); - -// automatic reallocation if the topology changes when using a single buffer -// returns false if using multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers) -GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph); - -GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id); - -// Utils -// Create a buffer and allocate all the tensors in a ggml_context -GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft); -GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend); - -#ifdef __cplusplus -} -#endif +#pragma once + +#include "ggml.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t; +typedef struct ggml_backend_buffer * ggml_backend_buffer_t; +typedef struct ggml_backend * ggml_backend_t; + +// Tensor allocator +struct ggml_tallocr { + ggml_backend_buffer_t buffer; + void * base; + size_t alignment; + size_t offset; +}; + +GGML_API struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer); +GGML_API void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor); + +// Graph allocator +/* + Example usage: + ggml_gallocr_t galloc = ggml_gallocr_new(ggml_bacckend_cpu_buffer_type()); + + // optional: create a worst-case graph and reserve the buffers to avoid reallocations + ggml_gallocr_reserve(galloc, build_graph(max_batch)); + + // allocate the graph + struct ggml_cgraph * graph = build_graph(batch); + ggml_gallocr_alloc_graph(galloc, graph); + + printf("compute buffer size: %zu bytes\n", ggml_gallocr_get_buffer_size(galloc, 0)); + + // evaluate the graph + ggml_backend_graph_compute(backend, graph); +*/ + +// special tensor flags for use with the graph allocator: +// ggml_set_input(): all input tensors are allocated at the beginning of the graph in non-overlapping addresses +// ggml_set_output(): output tensors are never freed and never overwritten + +typedef struct ggml_gallocr * ggml_gallocr_t; + +GGML_API ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft); +GGML_API ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs); +GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc); + +// pre-allocate buffers from a measure graph - does not allocate or modify the graph +// call with a worst-case graph to avoid buffer reallocations +// not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed +// returns false if the buffer allocation failed +GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph); +GGML_API bool ggml_gallocr_reserve_n( + ggml_gallocr_t galloc, + struct ggml_cgraph * graph, + const int * node_buffer_ids, + const int * leaf_buffer_ids); + +// automatic reallocation if the topology changes when using a single buffer +// returns false if using multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers) +GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph); + +GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id); + +// Utils +// Create a buffer and allocate all the tensors in a ggml_context +GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft); +GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend); + +#ifdef __cplusplus +} +#endif diff --git a/llama/ggml-backend-impl.h b/llama/ggml-backend-impl.h index 91470140..d30845f9 100644 --- a/llama/ggml-backend-impl.h +++ b/llama/ggml-backend-impl.h @@ -1,5 +1,5 @@ /** - * llama.cpp - git d5c938cd7716b9a2ace49a43a469dfbffcff4d28 + * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c * * MIT License * @@ -24,144 +24,144 @@ * SOFTWARE. */ -#pragma once - -// ggml-backend internal header - -#include "ggml-backend.h" - -#ifdef __cplusplus -extern "C" { -#endif - - // - // Backend buffer - // - - // buffer type - typedef void * ggml_backend_buffer_type_context_t; - - struct ggml_backend_buffer_type_i { - const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft); - ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size); - size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment - size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft); // allocation max size - size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding - bool (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend - // check if tensor data is in host memory - // should be equivalent to supports_backend(buft, ggml_backend_cpu_init()) - bool (*GGML_CALL is_host) (ggml_backend_buffer_type_t buft); - }; - - struct ggml_backend_buffer_type { - struct ggml_backend_buffer_type_i iface; - ggml_backend_buffer_type_context_t context; - }; - - // buffer - typedef void * ggml_backend_buffer_context_t; - - struct ggml_backend_buffer_i { - const char * (*GGML_CALL get_name) (ggml_backend_buffer_t buffer); - void (*GGML_CALL free_buffer)(ggml_backend_buffer_t buffer); - void * (*GGML_CALL get_base) (ggml_backend_buffer_t buffer); - void (*GGML_CALL init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); - void (*GGML_CALL set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); - void (*GGML_CALL get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); - bool (*GGML_CALL cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer - void (*GGML_CALL clear) (ggml_backend_buffer_t buffer, uint8_t value); - void (*GGML_CALL reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras - }; - - struct ggml_backend_buffer { - struct ggml_backend_buffer_i iface; - ggml_backend_buffer_type_t buft; - ggml_backend_buffer_context_t context; - size_t size; - enum ggml_backend_buffer_usage usage; - }; - - GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init( - ggml_backend_buffer_type_t buft, - struct ggml_backend_buffer_i iface, - ggml_backend_buffer_context_t context, - size_t size); - - // do not use directly, use ggml_backend_tensor_copy instead - bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst); - - // buffer that contains a collection of buffers - GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers); - GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer); - GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage); - - // - // Backend - // - - typedef void * ggml_backend_context_t; - - struct ggml_backend_i { - const char * (*GGML_CALL get_name)(ggml_backend_t backend); - - void (*GGML_CALL free)(ggml_backend_t backend); - - // buffer allocation - ggml_backend_buffer_type_t (*GGML_CALL get_default_buffer_type)(ggml_backend_t backend); - - // (optional) asynchronous tensor data access - void (*GGML_CALL set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); - void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); - bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst); - - // (optional) complete all pending operations - void (*GGML_CALL synchronize)(ggml_backend_t backend); - - // compute graph with a plan (not used currently) - ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph); - void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan); - - // compute graph with a plan - enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan); - // compute graph without a plan (async) - enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph); - - // check if the backend supports an operation - bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op); - - // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer - // these should be expensive operations with large batch sizes that may benefit from running on this backend - // even if the weight has to be copied from the CPU temporarily - bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op); - - // (optional) event synchronization - ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend); - void (*GGML_CALL event_free) (ggml_backend_event_t event); - void (*GGML_CALL event_record) (ggml_backend_event_t event); - void (*GGML_CALL event_wait) (ggml_backend_t backend, ggml_backend_event_t event); - void (*GGML_CALL event_synchronize) (ggml_backend_event_t event); - }; - - struct ggml_backend { - ggml_guid_t guid; - - struct ggml_backend_i iface; - ggml_backend_context_t context; - }; - - struct ggml_backend_event { - ggml_backend_t backend; - void * context; - }; - - // - // Backend registry - // - - typedef ggml_backend_t (*GGML_CALL ggml_backend_init_fn)(const char * params, void * user_data); - - GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data); - -#ifdef __cplusplus -} -#endif +#pragma once + +// ggml-backend internal header + +#include "ggml-backend.h" + +#ifdef __cplusplus +extern "C" { +#endif + + // + // Backend buffer + // + + // buffer type + typedef void * ggml_backend_buffer_type_context_t; + + struct ggml_backend_buffer_type_i { + const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft); + ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size); + size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment + size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft); // allocation max size + size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding + bool (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend + // check if tensor data is in host memory + // should be equivalent to supports_backend(buft, ggml_backend_cpu_init()) + bool (*GGML_CALL is_host) (ggml_backend_buffer_type_t buft); + }; + + struct ggml_backend_buffer_type { + struct ggml_backend_buffer_type_i iface; + ggml_backend_buffer_type_context_t context; + }; + + // buffer + typedef void * ggml_backend_buffer_context_t; + + struct ggml_backend_buffer_i { + const char * (*GGML_CALL get_name) (ggml_backend_buffer_t buffer); + void (*GGML_CALL free_buffer)(ggml_backend_buffer_t buffer); + void * (*GGML_CALL get_base) (ggml_backend_buffer_t buffer); + void (*GGML_CALL init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + void (*GGML_CALL set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); + void (*GGML_CALL get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); + bool (*GGML_CALL cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer + void (*GGML_CALL clear) (ggml_backend_buffer_t buffer, uint8_t value); + void (*GGML_CALL reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras + }; + + struct ggml_backend_buffer { + struct ggml_backend_buffer_i iface; + ggml_backend_buffer_type_t buft; + ggml_backend_buffer_context_t context; + size_t size; + enum ggml_backend_buffer_usage usage; + }; + + GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init( + ggml_backend_buffer_type_t buft, + struct ggml_backend_buffer_i iface, + ggml_backend_buffer_context_t context, + size_t size); + + // do not use directly, use ggml_backend_tensor_copy instead + bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst); + + // buffer that contains a collection of buffers + GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers); + GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer); + GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage); + + // + // Backend + // + + typedef void * ggml_backend_context_t; + + struct ggml_backend_i { + const char * (*GGML_CALL get_name)(ggml_backend_t backend); + + void (*GGML_CALL free)(ggml_backend_t backend); + + // buffer allocation + ggml_backend_buffer_type_t (*GGML_CALL get_default_buffer_type)(ggml_backend_t backend); + + // (optional) asynchronous tensor data access + void (*GGML_CALL set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); + void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); + bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst); + + // (optional) complete all pending operations + void (*GGML_CALL synchronize)(ggml_backend_t backend); + + // compute graph with a plan (not used currently) + ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph); + void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan); + + // compute graph with a plan + enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan); + // compute graph without a plan (async) + enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph); + + // check if the backend supports an operation + bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op); + + // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer + // these should be expensive operations with large batch sizes that may benefit from running on this backend + // even if the weight has to be copied from the CPU temporarily + bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op); + + // (optional) event synchronization + ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend); + void (*GGML_CALL event_free) (ggml_backend_event_t event); + void (*GGML_CALL event_record) (ggml_backend_event_t event); + void (*GGML_CALL event_wait) (ggml_backend_t backend, ggml_backend_event_t event); + void (*GGML_CALL event_synchronize) (ggml_backend_event_t event); + }; + + struct ggml_backend { + ggml_guid_t guid; + + struct ggml_backend_i iface; + ggml_backend_context_t context; + }; + + struct ggml_backend_event { + ggml_backend_t backend; + void * context; + }; + + // + // Backend registry + // + + typedef ggml_backend_t (*GGML_CALL ggml_backend_init_fn)(const char * params, void * user_data); + + GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data); + +#ifdef __cplusplus +} +#endif diff --git a/llama/ggml-backend.c b/llama/ggml-backend.c index 9225f7ab..0e6aa8a0 100644 --- a/llama/ggml-backend.c +++ b/llama/ggml-backend.c @@ -1,5 +1,5 @@ /** - * llama.cpp - git d5c938cd7716b9a2ace49a43a469dfbffcff4d28 + * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c * * MIT License * diff --git a/llama/ggml-backend.h b/llama/ggml-backend.h index 1041d35e..84b5a5d1 100644 --- a/llama/ggml-backend.h +++ b/llama/ggml-backend.h @@ -1,5 +1,5 @@ /** - * llama.cpp - git d5c938cd7716b9a2ace49a43a469dfbffcff4d28 + * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c * * MIT License * diff --git a/llama/ggml-common.h b/llama/ggml-common.h index dbbaa5d1..b13765b8 100644 --- a/llama/ggml-common.h +++ b/llama/ggml-common.h @@ -1,5 +1,5 @@ /** - * llama.cpp - git d5c938cd7716b9a2ace49a43a469dfbffcff4d28 + * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c * * MIT License * diff --git a/llama/ggml-cuda.cu b/llama/ggml-cuda.cu index a7b17d44..4a598874 100644 --- a/llama/ggml-cuda.cu +++ b/llama/ggml-cuda.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - git d5c938cd7716b9a2ace49a43a469dfbffcff4d28 + * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c * * MIT License * @@ -1377,10 +1377,30 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) { GGML_UNUSED(main_device); } +static cudaError_t ggml_cuda_Memcpy2DPeerAsync( + void * dst, int dstDevice, size_t dpitch, void * src, int srcDevice, size_t spitch, size_t width, size_t height, cudaStream_t stream) { + +#if !defined(GGML_USE_HIPBLAS) + // cudaMemcpy2DAsync may fail with copies between vmm pools of different devices + cudaMemcpy3DPeerParms p = {}; + p.dstDevice = dstDevice; + p.dstPtr = make_cudaPitchedPtr(dst, dpitch, dpitch, height); + p.srcDevice = srcDevice; + p.srcPtr = make_cudaPitchedPtr(src, spitch, spitch, height); + p.extent = make_cudaExtent(width, height, 1); + return cudaMemcpy3DPeerAsync(&p, stream); +#else + // HIP does not support cudaMemcpy3DPeerAsync or vmm pools + GGML_UNUSED(dstDevice); + GGML_UNUSED(srcDevice); + return cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, cudaMemcpyDeviceToDevice, stream); +#endif // !defined(GGML_USE_HIPBLAS) +} + static void ggml_cuda_op_mul_mat( ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op, - const bool convert_src1_to_q8_1) { + quantize_cuda_t quantize_src1) { const int64_t ne00 = src0->ne[0]; const int64_t ne01 = src0->ne[1]; @@ -1437,7 +1457,9 @@ static void ggml_cuda_op_mul_mat( } struct dev_data { - ggml_cuda_pool_alloc src0_dd_alloc; + int cc; + + ggml_cuda_pool_alloc src0_dd_alloc; ggml_cuda_pool_alloc src1_ddf_alloc; ggml_cuda_pool_alloc src1_ddq_alloc; ggml_cuda_pool_alloc dst_dd_alloc; @@ -1456,6 +1478,8 @@ static void ggml_cuda_op_mul_mat( int used_devices = 0; for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) { + dev[id].cc = ggml_cuda_info().devices[id].cc; + // by default, use all rows dev[id].row_low = 0; dev[id].row_high = ne01; @@ -1506,11 +1530,15 @@ static void ggml_cuda_op_mul_mat( dev[id].src1_ddf = dev[id].src1_ddf_alloc.alloc(ctx.pool(id), ggml_nelements(src1)); } - if (convert_src1_to_q8_1) { - dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs); + if (quantize_src1) { + size_t src_1_ddq_size = nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs; + if (quantize_src1 == quantize_mmq_q8_1_cuda) { + src_1_ddq_size += get_mmq_x_max_host(dev[id].cc)*sizeof(block_q8_1_mmq); + } + dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), src_1_ddq_size); if (src1_on_device && src1_is_contiguous) { - quantize_row_q8_1_cuda(dev[id].src1_ddf, dev[id].src1_ddq, ne10, nrows1, src1_padded_col_size, stream); + quantize_src1(dev[id].src1_ddf, dev[id].src1_ddq, ne10, ne11, ne12*ne13, src1_padded_col_size, src0->type, stream); CUDA_CHECK(cudaGetLastError()); } } @@ -1556,7 +1584,12 @@ static void ggml_cuda_op_mul_mat( const int64_t i03 = i0 / ne12; const int64_t i02 = i0 % ne12; - const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs; + size_t src1_ddq_i_offset = i0*ne11 * src1_padded_col_size*q8_1_ts/q8_1_bs; + if (quantize_src1 == quantize_mmq_q8_1_cuda) { + src1_ddq_i_offset += src1_col_0 * sizeof(block_q8_1_mmq); + } else { + src1_ddq_i_offset += src1_col_0 * src1_padded_col_size*q8_1_ts/q8_1_bs; + } // for split tensors the data begins at i0 == i0_offset_low char * src0_dd_i = dev[id].src0_dd + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs; @@ -1573,10 +1606,17 @@ static void ggml_cuda_op_mul_mat( // copy src0, src1 to device if necessary if (src1_is_contiguous) { if (id != ctx.device) { - if (convert_src1_to_q8_1) { + if (quantize_src1) { char * src1_ddq_i_source = dev[ctx.device].src1_ddq + src1_ddq_i_offset; - CUDA_CHECK(cudaMemcpyPeerAsync(src1_ddq_i, id, src1_ddq_i_source, ctx.device, - src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs, stream)); + if (quantize_src1 == quantize_mmq_q8_1_cuda) { + const size_t pitch = ne11*sizeof(block_q8_1_mmq); + const size_t width = src1_ncols*sizeof(block_q8_1_mmq); + const size_t height = src1_padded_col_size/(4*QK8_1); + CUDA_CHECK(ggml_cuda_Memcpy2DPeerAsync(src1_ddq_i, id, pitch, src1_ddq_i_source, ctx.device, pitch, width, height, stream)); + } else { + CUDA_CHECK(cudaMemcpyPeerAsync( + src1_ddq_i, id, src1_ddq_i_source, ctx.device, src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs, stream)); + } } else { float * src1_ddf_i_source = (float *) src1->data; src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10; @@ -1591,8 +1631,8 @@ static void ggml_cuda_op_mul_mat( GGML_ASSERT(false); } - if (convert_src1_to_q8_1 && !src1_is_contiguous) { - quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream); + if (quantize_src1 && !src1_is_contiguous) { + quantize_src1(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, 1, src1_padded_col_size, src0->type, stream); CUDA_CHECK(cudaGetLastError()); } @@ -1617,22 +1657,8 @@ static void ggml_cuda_op_mul_mat( float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3); GGML_ASSERT(dst->nb[1] == ne0*sizeof(float)); dhf_dst_i += src1_col_0*ne0 + dev[id].row_low; -#if !defined(GGML_USE_HIPBLAS) - // cudaMemcpy2DAsync may fail with copies between vmm pools of different devices - cudaMemcpy3DPeerParms p = {}; - p.dstDevice = ctx.device; - p.dstPtr = make_cudaPitchedPtr(dhf_dst_i, ne0*sizeof(float), row_diff, src1_ncols); - p.srcDevice = id; - p.srcPtr = make_cudaPitchedPtr(dst_dd_i, row_diff*sizeof(float), row_diff, src1_ncols); - p.extent = make_cudaExtent(row_diff*sizeof(float), src1_ncols, 1); - CUDA_CHECK(cudaMemcpy3DPeerAsync(&p, stream)); -#else - // HIP does not support cudaMemcpy3DPeerAsync or vmm pools - CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), - dst_dd_i, row_diff*sizeof(float), - row_diff*sizeof(float), src1_ncols, - cudaMemcpyDeviceToDevice, stream)); -#endif + CUDA_CHECK(ggml_cuda_Memcpy2DPeerAsync( + dhf_dst_i, ctx.device, ne0*sizeof(float), dst_dd_i, id, row_diff*sizeof(float), row_diff*sizeof(float), src1_ncols, stream)); } else { float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3); GGML_ASSERT(dst->nb[1] == ne0*sizeof(float)); @@ -1971,13 +1997,13 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor // KQ + KQV multi-batch ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst); } else if (use_dequantize_mul_mat_vec) { - ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false); + ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, nullptr); } else if (use_mul_mat_vec_q) { - ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true); + ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, quantize_row_q8_1_cuda); } else if (use_mul_mat_q) { - ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q, true); + ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q, quantize_mmq_q8_1_cuda); } else { - ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false); + ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, nullptr); } } diff --git a/llama/ggml-cuda.h b/llama/ggml-cuda.h index 821853e8..684e009c 100644 --- a/llama/ggml-cuda.h +++ b/llama/ggml-cuda.h @@ -1,5 +1,5 @@ /** - * llama.cpp - git d5c938cd7716b9a2ace49a43a469dfbffcff4d28 + * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c * * MIT License * diff --git a/llama/ggml-cuda/acc.cu b/llama/ggml-cuda/acc.cu index 96bfe1c9..8d705ac9 100644 --- a/llama/ggml-cuda/acc.cu +++ b/llama/ggml-cuda/acc.cu @@ -1,47 +1,47 @@ -#include "acc.cuh" - -static __global__ void acc_f32(const float * x, const float * y, float * dst, const int ne, - const int ne10, const int ne11, const int ne12, - const int nb1, const int nb2, int offset) { - const int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i >= ne) { - return; - } - int src1_idx = i - offset; - int oz = src1_idx / nb2; - int oy = (src1_idx - (oz * nb2)) / nb1; - int ox = src1_idx % nb1; - if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) { - dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11]; - } else { - dst[i] = x[i]; - } -} - -static void acc_f32_cuda(const float * x, const float * y, float * dst, const int n_elements, - const int ne10, const int ne11, const int ne12, - const int nb1, const int nb2, const int offset, cudaStream_t stream) { - int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE; - acc_f32<<>>(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset); -} - -void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - const ggml_tensor * src0 = dst->src[0]; - const ggml_tensor * src1 = dst->src[1]; - const float * src0_d = (const float *)src0->data; - const float * src1_d = (const float *)src1->data; - float * dst_d = (float *)dst->data; - cudaStream_t stream = ctx.stream(); - - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported - - int nb1 = dst->op_params[0] / 4; // 4 bytes of float32 - int nb2 = dst->op_params[1] / 4; // 4 bytes of float32 - // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused - int offset = dst->op_params[3] / 4; // offset in bytes - - acc_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, stream); -} +#include "acc.cuh" + +static __global__ void acc_f32(const float * x, const float * y, float * dst, const int ne, + const int ne10, const int ne11, const int ne12, + const int nb1, const int nb2, int offset) { + const int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i >= ne) { + return; + } + int src1_idx = i - offset; + int oz = src1_idx / nb2; + int oy = (src1_idx - (oz * nb2)) / nb1; + int ox = src1_idx % nb1; + if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) { + dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11]; + } else { + dst[i] = x[i]; + } +} + +static void acc_f32_cuda(const float * x, const float * y, float * dst, const int n_elements, + const int ne10, const int ne11, const int ne12, + const int nb1, const int nb2, const int offset, cudaStream_t stream) { + int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE; + acc_f32<<>>(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset); +} + +void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + const float * src0_d = (const float *)src0->data; + const float * src1_d = (const float *)src1->data; + float * dst_d = (float *)dst->data; + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported + + int nb1 = dst->op_params[0] / 4; // 4 bytes of float32 + int nb2 = dst->op_params[1] / 4; // 4 bytes of float32 + // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused + int offset = dst->op_params[3] / 4; // offset in bytes + + acc_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, stream); +} diff --git a/llama/ggml-cuda/acc.cuh b/llama/ggml-cuda/acc.cuh index 1168ea1b..a476eb9a 100644 --- a/llama/ggml-cuda/acc.cuh +++ b/llama/ggml-cuda/acc.cuh @@ -1,5 +1,5 @@ -#include "common.cuh" - -#define CUDA_ACC_BLOCK_SIZE 256 - -void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst); +#include "common.cuh" + +#define CUDA_ACC_BLOCK_SIZE 256 + +void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/arange.cu b/llama/ggml-cuda/arange.cu index b5e495a2..e35045dd 100644 --- a/llama/ggml-cuda/arange.cu +++ b/llama/ggml-cuda/arange.cu @@ -1,34 +1,34 @@ -#include "arange.cuh" - -static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) { - // blockIDx.x: idx of ne0 / BLOCK_SIZE - int nidx = threadIdx.x + blockIdx.x * blockDim.x; - if (nidx >= ne0) { - return; - } - dst[nidx] = start + step * nidx; -} - -static void arange_f32_cuda(float * dst, const int ne0, const float start, const float step, cudaStream_t stream) { - int num_blocks = (ne0 + CUDA_ARANGE_BLOCK_SIZE - 1) / CUDA_ARANGE_BLOCK_SIZE; - arange_f32<<>>(dst, ne0, start, step); -} - -void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - float * dst_d = (float *)dst->data; - cudaStream_t stream = ctx.stream(); - - GGML_ASSERT(dst->type == GGML_TYPE_F32); - - float start; - float stop; - float step; - memcpy(&start, (float *)dst->op_params + 0, sizeof(float)); - memcpy(&stop, (float *)dst->op_params + 1, sizeof(float)); - memcpy(&step, (float *)dst->op_params + 2, sizeof(float)); - - int64_t steps = (int64_t)ceil((stop - start) / step); - GGML_ASSERT(ggml_nelements(dst) == steps); - - arange_f32_cuda(dst_d, dst->ne[0], start, step, stream); -} +#include "arange.cuh" + +static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) { + // blockIDx.x: idx of ne0 / BLOCK_SIZE + int nidx = threadIdx.x + blockIdx.x * blockDim.x; + if (nidx >= ne0) { + return; + } + dst[nidx] = start + step * nidx; +} + +static void arange_f32_cuda(float * dst, const int ne0, const float start, const float step, cudaStream_t stream) { + int num_blocks = (ne0 + CUDA_ARANGE_BLOCK_SIZE - 1) / CUDA_ARANGE_BLOCK_SIZE; + arange_f32<<>>(dst, ne0, start, step); +} + +void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + float * dst_d = (float *)dst->data; + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + float start; + float stop; + float step; + memcpy(&start, (float *)dst->op_params + 0, sizeof(float)); + memcpy(&stop, (float *)dst->op_params + 1, sizeof(float)); + memcpy(&step, (float *)dst->op_params + 2, sizeof(float)); + + int64_t steps = (int64_t)ceil((stop - start) / step); + GGML_ASSERT(ggml_nelements(dst) == steps); + + arange_f32_cuda(dst_d, dst->ne[0], start, step, stream); +} diff --git a/llama/ggml-cuda/arange.cuh b/llama/ggml-cuda/arange.cuh index 41e74fdf..13236306 100644 --- a/llama/ggml-cuda/arange.cuh +++ b/llama/ggml-cuda/arange.cuh @@ -1,5 +1,5 @@ -#include "common.cuh" - -#define CUDA_ARANGE_BLOCK_SIZE 256 - -void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst); +#include "common.cuh" + +#define CUDA_ARANGE_BLOCK_SIZE 256 + +void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/argsort.cu b/llama/ggml-cuda/argsort.cu index 16414406..d5b42c66 100644 --- a/llama/ggml-cuda/argsort.cu +++ b/llama/ggml-cuda/argsort.cu @@ -1,103 +1,103 @@ -#include "argsort.cuh" - -template -static inline __device__ void ggml_cuda_swap(T & a, T & b) { - T tmp = a; - a = b; - b = tmp; -} - -template -static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols, int ncols_pad) { - // bitonic sort - int col = threadIdx.x; - int row = blockIdx.y; - - if (col >= ncols_pad) { - return; - } - - const float * x_row = x + row * ncols; - extern __shared__ int dst_row[]; - - // initialize indices - dst_row[col] = col; - - __syncthreads(); - - for (int k = 2; k <= ncols_pad; k *= 2) { - for (int j = k / 2; j > 0; j /= 2) { - int ixj = col ^ j; - if (ixj > col) { - if ((col & k) == 0) { - if (dst_row[col] >= ncols || - (dst_row[ixj] < ncols && (order == GGML_SORT_ORDER_ASC ? - x_row[dst_row[col]] > x_row[dst_row[ixj]] : - x_row[dst_row[col]] < x_row[dst_row[ixj]])) - ) { - ggml_cuda_swap(dst_row[col], dst_row[ixj]); - } - } else { - if (dst_row[ixj] >= ncols || - (dst_row[col] < ncols && (order == GGML_SORT_ORDER_ASC ? - x_row[dst_row[col]] < x_row[dst_row[ixj]] : - x_row[dst_row[col]] > x_row[dst_row[ixj]])) - ) { - ggml_cuda_swap(dst_row[col], dst_row[ixj]); - } - } - } - __syncthreads(); - } - } - - // copy the result to dst without the padding - if (col < ncols) { - dst[row * ncols + col] = dst_row[col]; - } -} - -static int next_power_of_2(int x) { - int n = 1; - while (n < x) { - n *= 2; - } - return n; -} - -static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) { - // bitonic sort requires ncols to be power of 2 - const int ncols_pad = next_power_of_2(ncols); - - const dim3 block_dims(ncols_pad, 1, 1); - const dim3 block_nums(1, nrows, 1); - const size_t shared_mem = ncols_pad * sizeof(int); - - GGML_ASSERT(shared_mem <= ggml_cuda_info().devices[ggml_cuda_get_device()].smpb); - - if (order == GGML_SORT_ORDER_ASC) { - k_argsort_f32_i32<<>>(x, dst, ncols, ncols_pad); - } else if (order == GGML_SORT_ORDER_DESC) { - k_argsort_f32_i32<<>>(x, dst, ncols, ncols_pad); - } else { - GGML_ASSERT(false); - } -} - -void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - const ggml_tensor * src0 = dst->src[0]; - const float * src0_d = (const float *)src0->data; - float * dst_d = (float *)dst->data; - cudaStream_t stream = ctx.stream(); - - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_I32); - GGML_ASSERT(ggml_is_contiguous(src0)); - - const int64_t ncols = src0->ne[0]; - const int64_t nrows = ggml_nrows(src0); - - enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0]; - - argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream); -} +#include "argsort.cuh" + +template +static inline __device__ void ggml_cuda_swap(T & a, T & b) { + T tmp = a; + a = b; + b = tmp; +} + +template +static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols, int ncols_pad) { + // bitonic sort + int col = threadIdx.x; + int row = blockIdx.y; + + if (col >= ncols_pad) { + return; + } + + const float * x_row = x + row * ncols; + extern __shared__ int dst_row[]; + + // initialize indices + dst_row[col] = col; + + __syncthreads(); + + for (int k = 2; k <= ncols_pad; k *= 2) { + for (int j = k / 2; j > 0; j /= 2) { + int ixj = col ^ j; + if (ixj > col) { + if ((col & k) == 0) { + if (dst_row[col] >= ncols || + (dst_row[ixj] < ncols && (order == GGML_SORT_ORDER_ASC ? + x_row[dst_row[col]] > x_row[dst_row[ixj]] : + x_row[dst_row[col]] < x_row[dst_row[ixj]])) + ) { + ggml_cuda_swap(dst_row[col], dst_row[ixj]); + } + } else { + if (dst_row[ixj] >= ncols || + (dst_row[col] < ncols && (order == GGML_SORT_ORDER_ASC ? + x_row[dst_row[col]] < x_row[dst_row[ixj]] : + x_row[dst_row[col]] > x_row[dst_row[ixj]])) + ) { + ggml_cuda_swap(dst_row[col], dst_row[ixj]); + } + } + } + __syncthreads(); + } + } + + // copy the result to dst without the padding + if (col < ncols) { + dst[row * ncols + col] = dst_row[col]; + } +} + +static int next_power_of_2(int x) { + int n = 1; + while (n < x) { + n *= 2; + } + return n; +} + +static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) { + // bitonic sort requires ncols to be power of 2 + const int ncols_pad = next_power_of_2(ncols); + + const dim3 block_dims(ncols_pad, 1, 1); + const dim3 block_nums(1, nrows, 1); + const size_t shared_mem = ncols_pad * sizeof(int); + + GGML_ASSERT(shared_mem <= ggml_cuda_info().devices[ggml_cuda_get_device()].smpb); + + if (order == GGML_SORT_ORDER_ASC) { + k_argsort_f32_i32<<>>(x, dst, ncols, ncols_pad); + } else if (order == GGML_SORT_ORDER_DESC) { + k_argsort_f32_i32<<>>(x, dst, ncols, ncols_pad); + } else { + GGML_ASSERT(false); + } +} + +void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const float * src0_d = (const float *)src0->data; + float * dst_d = (float *)dst->data; + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_I32); + GGML_ASSERT(ggml_is_contiguous(src0)); + + const int64_t ncols = src0->ne[0]; + const int64_t nrows = ggml_nrows(src0); + + enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0]; + + argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream); +} diff --git a/llama/ggml-cuda/argsort.cuh b/llama/ggml-cuda/argsort.cuh index 68a00154..b455d21d 100644 --- a/llama/ggml-cuda/argsort.cuh +++ b/llama/ggml-cuda/argsort.cuh @@ -1,3 +1,3 @@ -#include "common.cuh" - -void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst); +#include "common.cuh" + +void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/binbcast.cu b/llama/ggml-cuda/binbcast.cu index 19b08b74..9ce4a7ca 100644 --- a/llama/ggml-cuda/binbcast.cu +++ b/llama/ggml-cuda/binbcast.cu @@ -1,280 +1,280 @@ -#include "binbcast.cuh" - -static __device__ __forceinline__ float op_repeat(const float a, const float b) { - return b; - GGML_UNUSED(a); -} - -static __device__ __forceinline__ float op_add(const float a, const float b) { - return a + b; -} - -static __device__ __forceinline__ float op_mul(const float a, const float b) { - return a * b; -} - -static __device__ __forceinline__ float op_div(const float a, const float b) { - return a / b; -} - -template -static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst, - int ne0, int ne1, int ne2, int ne3, - int ne10, int ne11, int ne12, int ne13, - /*int s0, */ int s1, int s2, int s3, - /*int s00,*/ int s01, int s02, int s03, - /*int s10,*/ int s11, int s12, int s13) { - const int i0s = blockDim.x*blockIdx.x + threadIdx.x; - const int i1 = (blockDim.y*blockIdx.y + threadIdx.y); - const int i2 = (blockDim.z*blockIdx.z + threadIdx.z) / ne3; - const int i3 = (blockDim.z*blockIdx.z + threadIdx.z) % ne3; - - if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) { - return; - } - - const int i11 = i1 % ne11; - const int i12 = i2 % ne12; - const int i13 = i3 % ne13; - - const size_t i_src0 = i3*s03 + i2*s02 + i1*s01; - const size_t i_src1 = i13*s13 + i12*s12 + i11*s11; - const size_t i_dst = i3*s3 + i2*s2 + i1*s1; - - const src0_t * src0_row = src0 + i_src0; - const src1_t * src1_row = src1 + i_src1; - dst_t * dst_row = dst + i_dst; - - for (int i0 = i0s; i0 < ne0; i0 += blockDim.x*gridDim.x) { - const int i10 = i0 % ne10; - dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]); - } -} - -template -static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst, - int ne0, int ne1, int ne2, int ne3, - int ne10, int ne11, int ne12, int ne13, - /*int s0, */ int s1, int s2, int s3, - /*int s00,*/ int s01, int s02, int s03, - /*int s10,*/ int s11, int s12, int s13) { - - const int i = blockDim.x*blockIdx.x + threadIdx.x; - - const int i3 = i/(ne2*ne1*ne0); - const int i2 = (i/(ne1*ne0)) % ne2; - const int i1 = (i/ne0) % ne1; - const int i0 = i % ne0; - - if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) { - return; - } - - const int i11 = i1 % ne11; - const int i12 = i2 % ne12; - const int i13 = i3 % ne13; - - const size_t i_src0 = i3*s03 + i2*s02 + i1*s01; - const size_t i_src1 = i13*s13 + i12*s12 + i11*s11; - const size_t i_dst = i3*s3 + i2*s2 + i1*s1; - - const src0_t * src0_row = src0 + i_src0; - const src1_t * src1_row = src1 + i_src1; - dst_t * dst_row = dst + i_dst; - - const int i10 = i0 % ne10; - dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]); -} - -template -struct bin_bcast_cuda { - template - void operator()(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, - const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd, - cudaStream_t stream) { - - GGML_TENSOR_BINARY_OP_LOCALS - - int nr0 = ne10/ne0; - int nr1 = ne11/ne1; - int nr2 = ne12/ne2; - int nr3 = ne13/ne3; - - int nr[4] = { nr0, nr1, nr2, nr3 }; - - // collapse dimensions until first broadcast dimension - int64_t cne[] = {ne0, ne1, ne2, ne3}; - int64_t cne0[] = {ne00, ne01, ne02, ne03}; - int64_t cne1[] = {ne10, ne11, ne12, ne13}; - - size_t cnb[] = {nb0, nb1, nb2, nb3}; - size_t cnb0[] = {nb00, nb01, nb02, nb03}; - size_t cnb1[] = {nb10, nb11, nb12, nb13}; - - auto collapse = [](int64_t cne[]) { - cne[0] *= cne[1]; - cne[1] = cne[2]; - cne[2] = cne[3]; - cne[3] = 1; - }; - - auto collapse_nb = [](size_t cnb[], const int64_t cne[]) { - cnb[1] *= cne[1]; - cnb[2] *= cne[2]; - cnb[3] *= cne[3]; - }; - - if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) { - for (int i = 0; i < 4; i++) { - if (nr[i] != 1) { - break; - } - if (i > 0) { - collapse_nb(cnb, cne); - collapse_nb(cnb0, cne0); - collapse_nb(cnb1, cne1); - collapse(cne); - collapse(cne0); - collapse(cne1); - } - } - } - - { - int64_t ne0 = cne[0]; - int64_t ne1 = cne[1]; - int64_t ne2 = cne[2]; - int64_t ne3 = cne[3]; - - //int64_t ne00 = cne0[0]; GGML_UNUSED(ne00); - //int64_t ne01 = cne0[1]; GGML_UNUSED(ne01); - //int64_t ne02 = cne0[2]; GGML_UNUSED(ne02); - //int64_t ne03 = cne0[3]; GGML_UNUSED(ne03); - - int64_t ne10 = cne1[0]; - int64_t ne11 = cne1[1]; - int64_t ne12 = cne1[2]; - int64_t ne13 = cne1[3]; - - size_t nb0 = cnb[0]; - size_t nb1 = cnb[1]; - size_t nb2 = cnb[2]; - size_t nb3 = cnb[3]; - - size_t nb00 = cnb0[0]; - size_t nb01 = cnb0[1]; - size_t nb02 = cnb0[2]; - size_t nb03 = cnb0[3]; - - size_t nb10 = cnb1[0]; - size_t nb11 = cnb1[1]; - size_t nb12 = cnb1[2]; - size_t nb13 = cnb1[3]; - - size_t s0 = nb0 / sizeof(dst_t); - size_t s1 = nb1 / sizeof(dst_t); - size_t s2 = nb2 / sizeof(dst_t); - size_t s3 = nb3 / sizeof(dst_t); - - size_t s10 = nb10 / sizeof(src1_t); - size_t s11 = nb11 / sizeof(src1_t); - size_t s12 = nb12 / sizeof(src1_t); - size_t s13 = nb13 / sizeof(src1_t); - - size_t s00 = nb00 / sizeof(src0_t); - size_t s01 = nb01 / sizeof(src0_t); - size_t s02 = nb02 / sizeof(src0_t); - size_t s03 = nb03 / sizeof(src0_t); - - GGML_ASSERT(nb0 % sizeof(dst_t) == 0); - GGML_ASSERT(nb1 % sizeof(dst_t) == 0); - GGML_ASSERT(nb2 % sizeof(dst_t) == 0); - GGML_ASSERT(nb3 % sizeof(dst_t) == 0); - - GGML_ASSERT(nb00 % sizeof(src0_t) == 0); - GGML_ASSERT(nb01 % sizeof(src0_t) == 0); - GGML_ASSERT(nb02 % sizeof(src0_t) == 0); - GGML_ASSERT(nb03 % sizeof(src0_t) == 0); - - GGML_ASSERT(nb10 % sizeof(src1_t) == 0); - GGML_ASSERT(nb11 % sizeof(src1_t) == 0); - GGML_ASSERT(nb12 % sizeof(src1_t) == 0); - GGML_ASSERT(nb13 % sizeof(src1_t) == 0); - - GGML_ASSERT(s0 == 1); - GGML_ASSERT(s00 == 1); - GGML_ASSERT(s10 == 1); - - const int block_size = 128; - - int64_t hne0 = std::max(ne0/2LL, 1LL); - - dim3 block_dims; - block_dims.x = std::min(hne0, block_size); - block_dims.y = std::min(ne1, block_size / block_dims.x); - block_dims.z = std::min(std::min(ne2*ne3, block_size / block_dims.x / block_dims.y), 64U); - - dim3 block_nums( - (hne0 + block_dims.x - 1) / block_dims.x, - (ne1 + block_dims.y - 1) / block_dims.y, - (ne2*ne3 + block_dims.z - 1) / block_dims.z - ); - - if (block_nums.z > 65535) { - // this is the maximum number of blocks in z dimension, fallback to 1D grid kernel - int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size; - k_bin_bcast_unravel<<>>( - src0_dd, src1_dd, dst_dd, - ne0, ne1, ne2, ne3, - ne10, ne11, ne12, ne13, - /* s0, */ s1, s2, s3, - /* s00, */ s01, s02, s03, - /* s10, */ s11, s12, s13); - } else { - k_bin_bcast<<>>( - src0_dd, src1_dd, dst_dd, - ne0, ne1, ne2, ne3, - ne10, ne11, ne12, ne13, - /* s0, */ s1, s2, s3, - /* s00, */ s01, s02, s03, - /* s10, */ s11, s12, s13); - } - } - } -}; - -template -static void ggml_cuda_op_bin_bcast( - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, - const void * src0_dd, const void * src1_dd, void * dst_dd, cudaStream_t stream) { - - GGML_ASSERT(src1->type == GGML_TYPE_F32); - - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - op()(src0, src1, dst, (const float *)src0_dd, (const float *)src1_dd, (float *)dst_dd, stream); - } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { - op()(src0, src1, dst, (const half *) src0_dd, (const float *)src1_dd, (half *) dst_dd, stream); - } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) { - op()(src0, src1, dst, (const half *) src0_dd, (const float *)src1_dd, (float *)dst_dd, stream); - } else { - fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__, - ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type)); - GGML_ASSERT(false); - } -} - -void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - ggml_cuda_op_bin_bcast>(dst, dst->src[0], dst, nullptr, dst->src[0]->data, dst->data, ctx.stream()); -} - -void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - ggml_cuda_op_bin_bcast>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream()); -} - -void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - ggml_cuda_op_bin_bcast>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream()); -} - -void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - ggml_cuda_op_bin_bcast>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream()); -} +#include "binbcast.cuh" + +static __device__ __forceinline__ float op_repeat(const float a, const float b) { + return b; + GGML_UNUSED(a); +} + +static __device__ __forceinline__ float op_add(const float a, const float b) { + return a + b; +} + +static __device__ __forceinline__ float op_mul(const float a, const float b) { + return a * b; +} + +static __device__ __forceinline__ float op_div(const float a, const float b) { + return a / b; +} + +template +static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst, + int ne0, int ne1, int ne2, int ne3, + int ne10, int ne11, int ne12, int ne13, + /*int s0, */ int s1, int s2, int s3, + /*int s00,*/ int s01, int s02, int s03, + /*int s10,*/ int s11, int s12, int s13) { + const int i0s = blockDim.x*blockIdx.x + threadIdx.x; + const int i1 = (blockDim.y*blockIdx.y + threadIdx.y); + const int i2 = (blockDim.z*blockIdx.z + threadIdx.z) / ne3; + const int i3 = (blockDim.z*blockIdx.z + threadIdx.z) % ne3; + + if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) { + return; + } + + const int i11 = i1 % ne11; + const int i12 = i2 % ne12; + const int i13 = i3 % ne13; + + const size_t i_src0 = i3*s03 + i2*s02 + i1*s01; + const size_t i_src1 = i13*s13 + i12*s12 + i11*s11; + const size_t i_dst = i3*s3 + i2*s2 + i1*s1; + + const src0_t * src0_row = src0 + i_src0; + const src1_t * src1_row = src1 + i_src1; + dst_t * dst_row = dst + i_dst; + + for (int i0 = i0s; i0 < ne0; i0 += blockDim.x*gridDim.x) { + const int i10 = i0 % ne10; + dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]); + } +} + +template +static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst, + int ne0, int ne1, int ne2, int ne3, + int ne10, int ne11, int ne12, int ne13, + /*int s0, */ int s1, int s2, int s3, + /*int s00,*/ int s01, int s02, int s03, + /*int s10,*/ int s11, int s12, int s13) { + + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + const int i3 = i/(ne2*ne1*ne0); + const int i2 = (i/(ne1*ne0)) % ne2; + const int i1 = (i/ne0) % ne1; + const int i0 = i % ne0; + + if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) { + return; + } + + const int i11 = i1 % ne11; + const int i12 = i2 % ne12; + const int i13 = i3 % ne13; + + const size_t i_src0 = i3*s03 + i2*s02 + i1*s01; + const size_t i_src1 = i13*s13 + i12*s12 + i11*s11; + const size_t i_dst = i3*s3 + i2*s2 + i1*s1; + + const src0_t * src0_row = src0 + i_src0; + const src1_t * src1_row = src1 + i_src1; + dst_t * dst_row = dst + i_dst; + + const int i10 = i0 % ne10; + dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]); +} + +template +struct bin_bcast_cuda { + template + void operator()(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, + const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd, + cudaStream_t stream) { + + GGML_TENSOR_BINARY_OP_LOCALS + + int nr0 = ne10/ne0; + int nr1 = ne11/ne1; + int nr2 = ne12/ne2; + int nr3 = ne13/ne3; + + int nr[4] = { nr0, nr1, nr2, nr3 }; + + // collapse dimensions until first broadcast dimension + int64_t cne[] = {ne0, ne1, ne2, ne3}; + int64_t cne0[] = {ne00, ne01, ne02, ne03}; + int64_t cne1[] = {ne10, ne11, ne12, ne13}; + + size_t cnb[] = {nb0, nb1, nb2, nb3}; + size_t cnb0[] = {nb00, nb01, nb02, nb03}; + size_t cnb1[] = {nb10, nb11, nb12, nb13}; + + auto collapse = [](int64_t cne[]) { + cne[0] *= cne[1]; + cne[1] = cne[2]; + cne[2] = cne[3]; + cne[3] = 1; + }; + + auto collapse_nb = [](size_t cnb[], const int64_t cne[]) { + cnb[1] *= cne[1]; + cnb[2] *= cne[2]; + cnb[3] *= cne[3]; + }; + + if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) { + for (int i = 0; i < 4; i++) { + if (nr[i] != 1) { + break; + } + if (i > 0) { + collapse_nb(cnb, cne); + collapse_nb(cnb0, cne0); + collapse_nb(cnb1, cne1); + collapse(cne); + collapse(cne0); + collapse(cne1); + } + } + } + + { + int64_t ne0 = cne[0]; + int64_t ne1 = cne[1]; + int64_t ne2 = cne[2]; + int64_t ne3 = cne[3]; + + //int64_t ne00 = cne0[0]; GGML_UNUSED(ne00); + //int64_t ne01 = cne0[1]; GGML_UNUSED(ne01); + //int64_t ne02 = cne0[2]; GGML_UNUSED(ne02); + //int64_t ne03 = cne0[3]; GGML_UNUSED(ne03); + + int64_t ne10 = cne1[0]; + int64_t ne11 = cne1[1]; + int64_t ne12 = cne1[2]; + int64_t ne13 = cne1[3]; + + size_t nb0 = cnb[0]; + size_t nb1 = cnb[1]; + size_t nb2 = cnb[2]; + size_t nb3 = cnb[3]; + + size_t nb00 = cnb0[0]; + size_t nb01 = cnb0[1]; + size_t nb02 = cnb0[2]; + size_t nb03 = cnb0[3]; + + size_t nb10 = cnb1[0]; + size_t nb11 = cnb1[1]; + size_t nb12 = cnb1[2]; + size_t nb13 = cnb1[3]; + + size_t s0 = nb0 / sizeof(dst_t); + size_t s1 = nb1 / sizeof(dst_t); + size_t s2 = nb2 / sizeof(dst_t); + size_t s3 = nb3 / sizeof(dst_t); + + size_t s10 = nb10 / sizeof(src1_t); + size_t s11 = nb11 / sizeof(src1_t); + size_t s12 = nb12 / sizeof(src1_t); + size_t s13 = nb13 / sizeof(src1_t); + + size_t s00 = nb00 / sizeof(src0_t); + size_t s01 = nb01 / sizeof(src0_t); + size_t s02 = nb02 / sizeof(src0_t); + size_t s03 = nb03 / sizeof(src0_t); + + GGML_ASSERT(nb0 % sizeof(dst_t) == 0); + GGML_ASSERT(nb1 % sizeof(dst_t) == 0); + GGML_ASSERT(nb2 % sizeof(dst_t) == 0); + GGML_ASSERT(nb3 % sizeof(dst_t) == 0); + + GGML_ASSERT(nb00 % sizeof(src0_t) == 0); + GGML_ASSERT(nb01 % sizeof(src0_t) == 0); + GGML_ASSERT(nb02 % sizeof(src0_t) == 0); + GGML_ASSERT(nb03 % sizeof(src0_t) == 0); + + GGML_ASSERT(nb10 % sizeof(src1_t) == 0); + GGML_ASSERT(nb11 % sizeof(src1_t) == 0); + GGML_ASSERT(nb12 % sizeof(src1_t) == 0); + GGML_ASSERT(nb13 % sizeof(src1_t) == 0); + + GGML_ASSERT(s0 == 1); + GGML_ASSERT(s00 == 1); + GGML_ASSERT(s10 == 1); + + const int block_size = 128; + + int64_t hne0 = std::max(ne0/2LL, 1LL); + + dim3 block_dims; + block_dims.x = std::min(hne0, block_size); + block_dims.y = std::min(ne1, block_size / block_dims.x); + block_dims.z = std::min(std::min(ne2*ne3, block_size / block_dims.x / block_dims.y), 64U); + + dim3 block_nums( + (hne0 + block_dims.x - 1) / block_dims.x, + (ne1 + block_dims.y - 1) / block_dims.y, + (ne2*ne3 + block_dims.z - 1) / block_dims.z + ); + + if (block_nums.z > 65535) { + // this is the maximum number of blocks in z dimension, fallback to 1D grid kernel + int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size; + k_bin_bcast_unravel<<>>( + src0_dd, src1_dd, dst_dd, + ne0, ne1, ne2, ne3, + ne10, ne11, ne12, ne13, + /* s0, */ s1, s2, s3, + /* s00, */ s01, s02, s03, + /* s10, */ s11, s12, s13); + } else { + k_bin_bcast<<>>( + src0_dd, src1_dd, dst_dd, + ne0, ne1, ne2, ne3, + ne10, ne11, ne12, ne13, + /* s0, */ s1, s2, s3, + /* s00, */ s01, s02, s03, + /* s10, */ s11, s12, s13); + } + } + } +}; + +template +static void ggml_cuda_op_bin_bcast( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const void * src0_dd, const void * src1_dd, void * dst_dd, cudaStream_t stream) { + + GGML_ASSERT(src1->type == GGML_TYPE_F32); + + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + op()(src0, src1, dst, (const float *)src0_dd, (const float *)src1_dd, (float *)dst_dd, stream); + } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { + op()(src0, src1, dst, (const half *) src0_dd, (const float *)src1_dd, (half *) dst_dd, stream); + } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) { + op()(src0, src1, dst, (const half *) src0_dd, (const float *)src1_dd, (float *)dst_dd, stream); + } else { + fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__, + ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type)); + GGML_ASSERT(false); + } +} + +void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + ggml_cuda_op_bin_bcast>(dst, dst->src[0], dst, nullptr, dst->src[0]->data, dst->data, ctx.stream()); +} + +void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + ggml_cuda_op_bin_bcast>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream()); +} + +void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + ggml_cuda_op_bin_bcast>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream()); +} + +void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + ggml_cuda_op_bin_bcast>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream()); +} diff --git a/llama/ggml-cuda/binbcast.cuh b/llama/ggml-cuda/binbcast.cuh index 4f63d637..0e1d3c7c 100644 --- a/llama/ggml-cuda/binbcast.cuh +++ b/llama/ggml-cuda/binbcast.cuh @@ -1,6 +1,6 @@ -#include "common.cuh" - -void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst); -void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst); -void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst); -void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst); +#include "common.cuh" + +void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst); +void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst); +void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst); +void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/clamp.cuh b/llama/ggml-cuda/clamp.cuh index 7f9559dd..8cbba12c 100644 --- a/llama/ggml-cuda/clamp.cuh +++ b/llama/ggml-cuda/clamp.cuh @@ -1,5 +1,5 @@ -#include "common.cuh" - -#define CUDA_CLAMP_BLOCK_SIZE 256 - -void ggml_cuda_op_clamp(ggml_backend_cuda_context & ctx, ggml_tensor * dst); +#include "common.cuh" + +#define CUDA_CLAMP_BLOCK_SIZE 256 + +void ggml_cuda_op_clamp(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/concat.cuh b/llama/ggml-cuda/concat.cuh index aa506a05..b88a8d07 100644 --- a/llama/ggml-cuda/concat.cuh +++ b/llama/ggml-cuda/concat.cuh @@ -1,5 +1,5 @@ -#include "common.cuh" - -#define CUDA_CONCAT_BLOCK_SIZE 256 - -void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst); +#include "common.cuh" + +#define CUDA_CONCAT_BLOCK_SIZE 256 + +void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/convert.cuh b/llama/ggml-cuda/convert.cuh index 5394be9f..f34d783f 100644 --- a/llama/ggml-cuda/convert.cuh +++ b/llama/ggml-cuda/convert.cuh @@ -1,13 +1,13 @@ -#include "common.cuh" - -#define CUDA_DEQUANTIZE_BLOCK_SIZE 256 - -template -using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int64_t k, cudaStream_t stream); - -typedef to_t_cuda_t to_fp32_cuda_t; -typedef to_t_cuda_t to_fp16_cuda_t; - -to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type); - -to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type); +#include "common.cuh" + +#define CUDA_DEQUANTIZE_BLOCK_SIZE 256 + +template +using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int64_t k, cudaStream_t stream); + +typedef to_t_cuda_t to_fp32_cuda_t; +typedef to_t_cuda_t to_fp16_cuda_t; + +to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type); + +to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type); diff --git a/llama/ggml-cuda/dequantize.cuh b/llama/ggml-cuda/dequantize.cuh index bd3c2d9d..44eb23f8 100644 --- a/llama/ggml-cuda/dequantize.cuh +++ b/llama/ggml-cuda/dequantize.cuh @@ -1,103 +1,103 @@ -#include "common.cuh" - -static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){ - const block_q4_0 * x = (const block_q4_0 *) vx; - - const dfloat d = x[ib].d; - - const int vui = x[ib].qs[iqs]; - - v.x = vui & 0xF; - v.y = vui >> 4; - -#ifdef GGML_CUDA_F16 - v = __hsub2(v, {8.0f, 8.0f}); - v = __hmul2(v, {d, d}); -#else - v.x = (v.x - 8.0f) * d; - v.y = (v.y - 8.0f) * d; -#endif // GGML_CUDA_F16 -} - -static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){ - const block_q4_1 * x = (const block_q4_1 *) vx; - - const dfloat d = __low2half(x[ib].dm); - const dfloat m = __high2half(x[ib].dm); - - const int vui = x[ib].qs[iqs]; - - v.x = vui & 0xF; - v.y = vui >> 4; - -#ifdef GGML_CUDA_F16 - v = __hmul2(v, {d, d}); - v = __hadd2(v, {m, m}); -#else - v.x = (v.x * d) + m; - v.y = (v.y * d) + m; -#endif // GGML_CUDA_F16 -} - -static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){ - const block_q5_0 * x = (const block_q5_0 *) vx; - - const dfloat d = x[ib].d; - - uint32_t qh; - memcpy(&qh, x[ib].qh, sizeof(qh)); - - const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; - const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10; - - v.x = ((x[ib].qs[iqs] & 0xf) | xh_0); - v.y = ((x[ib].qs[iqs] >> 4) | xh_1); - -#ifdef GGML_CUDA_F16 - v = __hsub2(v, {16.0f, 16.0f}); - v = __hmul2(v, {d, d}); -#else - v.x = (v.x - 16.0f) * d; - v.y = (v.y - 16.0f) * d; -#endif // GGML_CUDA_F16 -} - -static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){ - const block_q5_1 * x = (const block_q5_1 *) vx; - - const dfloat d = __low2half(x[ib].dm); - const dfloat m = __high2half(x[ib].dm); - - uint32_t qh; - memcpy(&qh, x[ib].qh, sizeof(qh)); - - const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; - const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10; - - v.x = ((x[ib].qs[iqs] & 0xf) | xh_0); - v.y = ((x[ib].qs[iqs] >> 4) | xh_1); - -#ifdef GGML_CUDA_F16 - v = __hmul2(v, {d, d}); - v = __hadd2(v, {m, m}); -#else - v.x = (v.x * d) + m; - v.y = (v.y * d) + m; -#endif // GGML_CUDA_F16 -} - -static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){ - const block_q8_0 * x = (const block_q8_0 *) vx; - - const dfloat d = x[ib].d; - - v.x = x[ib].qs[iqs + 0]; - v.y = x[ib].qs[iqs + 1]; - -#ifdef GGML_CUDA_F16 - v = __hmul2(v, {d, d}); -#else - v.x *= d; - v.y *= d; -#endif // GGML_CUDA_F16 -} +#include "common.cuh" + +static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){ + const block_q4_0 * x = (const block_q4_0 *) vx; + + const dfloat d = x[ib].d; + + const int vui = x[ib].qs[iqs]; + + v.x = vui & 0xF; + v.y = vui >> 4; + +#ifdef GGML_CUDA_F16 + v = __hsub2(v, {8.0f, 8.0f}); + v = __hmul2(v, {d, d}); +#else + v.x = (v.x - 8.0f) * d; + v.y = (v.y - 8.0f) * d; +#endif // GGML_CUDA_F16 +} + +static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){ + const block_q4_1 * x = (const block_q4_1 *) vx; + + const dfloat d = __low2half(x[ib].dm); + const dfloat m = __high2half(x[ib].dm); + + const int vui = x[ib].qs[iqs]; + + v.x = vui & 0xF; + v.y = vui >> 4; + +#ifdef GGML_CUDA_F16 + v = __hmul2(v, {d, d}); + v = __hadd2(v, {m, m}); +#else + v.x = (v.x * d) + m; + v.y = (v.y * d) + m; +#endif // GGML_CUDA_F16 +} + +static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){ + const block_q5_0 * x = (const block_q5_0 *) vx; + + const dfloat d = x[ib].d; + + uint32_t qh; + memcpy(&qh, x[ib].qh, sizeof(qh)); + + const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; + const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10; + + v.x = ((x[ib].qs[iqs] & 0xf) | xh_0); + v.y = ((x[ib].qs[iqs] >> 4) | xh_1); + +#ifdef GGML_CUDA_F16 + v = __hsub2(v, {16.0f, 16.0f}); + v = __hmul2(v, {d, d}); +#else + v.x = (v.x - 16.0f) * d; + v.y = (v.y - 16.0f) * d; +#endif // GGML_CUDA_F16 +} + +static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){ + const block_q5_1 * x = (const block_q5_1 *) vx; + + const dfloat d = __low2half(x[ib].dm); + const dfloat m = __high2half(x[ib].dm); + + uint32_t qh; + memcpy(&qh, x[ib].qh, sizeof(qh)); + + const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; + const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10; + + v.x = ((x[ib].qs[iqs] & 0xf) | xh_0); + v.y = ((x[ib].qs[iqs] >> 4) | xh_1); + +#ifdef GGML_CUDA_F16 + v = __hmul2(v, {d, d}); + v = __hadd2(v, {m, m}); +#else + v.x = (v.x * d) + m; + v.y = (v.y * d) + m; +#endif // GGML_CUDA_F16 +} + +static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){ + const block_q8_0 * x = (const block_q8_0 *) vx; + + const dfloat d = x[ib].d; + + v.x = x[ib].qs[iqs + 0]; + v.y = x[ib].qs[iqs + 1]; + +#ifdef GGML_CUDA_F16 + v = __hmul2(v, {d, d}); +#else + v.x *= d; + v.y *= d; +#endif // GGML_CUDA_F16 +} diff --git a/llama/ggml-cuda/diagmask.cu b/llama/ggml-cuda/diagmask.cu index 4b713ba2..a2ac4a85 100644 --- a/llama/ggml-cuda/diagmask.cu +++ b/llama/ggml-cuda/diagmask.cu @@ -1,40 +1,40 @@ -#include "diagmask.cuh" - -static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) { - const int col = blockDim.y*blockIdx.y + threadIdx.y; - const int row = blockDim.x*blockIdx.x + threadIdx.x; - - if (col >= ncols) { - return; - } - - const int i = row*ncols + col; - //dst[i] = col > (n_past + row % rows_per_channel) ? -INFINITY : x[i]; - //dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU - dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX; -} - -static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) { - const dim3 block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1); - const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE; - const dim3 block_nums(nrows_x, block_num_x, 1); - diag_mask_inf_f32<<>>(x, dst, ncols_x, rows_per_channel, n_past); -} - -void ggml_cuda_op_diag_mask_inf(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - const ggml_tensor * src0 = dst->src[0]; - const float * src0_d = (const float *)src0->data; - float * dst_d = (float *)dst->data; - cudaStream_t stream = ctx.stream(); - - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int nrows0 = ggml_nrows(src0); - - const int n_past = ((int32_t *) dst->op_params)[0]; - - diag_mask_inf_f32_cuda(src0_d, dst_d, ne00, nrows0, ne01, n_past, stream); -} +#include "diagmask.cuh" + +static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) { + const int col = blockDim.y*blockIdx.y + threadIdx.y; + const int row = blockDim.x*blockIdx.x + threadIdx.x; + + if (col >= ncols) { + return; + } + + const int i = row*ncols + col; + //dst[i] = col > (n_past + row % rows_per_channel) ? -INFINITY : x[i]; + //dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU + dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX; +} + +static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) { + const dim3 block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1); + const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE; + const dim3 block_nums(nrows_x, block_num_x, 1); + diag_mask_inf_f32<<>>(x, dst, ncols_x, rows_per_channel, n_past); +} + +void ggml_cuda_op_diag_mask_inf(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const float * src0_d = (const float *)src0->data; + float * dst_d = (float *)dst->data; + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int nrows0 = ggml_nrows(src0); + + const int n_past = ((int32_t *) dst->op_params)[0]; + + diag_mask_inf_f32_cuda(src0_d, dst_d, ne00, nrows0, ne01, n_past, stream); +} diff --git a/llama/ggml-cuda/diagmask.cuh b/llama/ggml-cuda/diagmask.cuh index 6cdbef17..b7da8f89 100644 --- a/llama/ggml-cuda/diagmask.cuh +++ b/llama/ggml-cuda/diagmask.cuh @@ -1,5 +1,5 @@ -#include "common.cuh" - -#define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32 - -void ggml_cuda_op_diag_mask_inf(ggml_backend_cuda_context & ctx, ggml_tensor * dst); +#include "common.cuh" + +#define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32 + +void ggml_cuda_op_diag_mask_inf(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/dmmv.cuh b/llama/ggml-cuda/dmmv.cuh index 4c5ebd47..85329e7c 100644 --- a/llama/ggml-cuda/dmmv.cuh +++ b/llama/ggml-cuda/dmmv.cuh @@ -1,18 +1,18 @@ -#include "common.cuh" - -// dmmv = dequantize_mul_mat_vec - -// TODO: remove this? -#ifndef GGML_CUDA_DMMV_X -#define GGML_CUDA_DMMV_X 32 -#endif - -#ifndef GGML_CUDA_MMV_Y -#define GGML_CUDA_MMV_Y 1 -#endif - -void ggml_cuda_op_dequantize_mul_mat_vec( - ggml_backend_cuda_context & ctx, - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, - const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, - const int64_t src1_padded_row_size, cudaStream_t stream); +#include "common.cuh" + +// dmmv = dequantize_mul_mat_vec + +// TODO: remove this? +#ifndef GGML_CUDA_DMMV_X +#define GGML_CUDA_DMMV_X 32 +#endif + +#ifndef GGML_CUDA_MMV_Y +#define GGML_CUDA_MMV_Y 1 +#endif + +void ggml_cuda_op_dequantize_mul_mat_vec( + ggml_backend_cuda_context & ctx, + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, + const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, + const int64_t src1_padded_row_size, cudaStream_t stream); diff --git a/llama/ggml-cuda/fattn-vec-f16.cu b/llama/ggml-cuda/fattn-vec-f16.cu deleted file mode 100644 index 54e1ac5d..00000000 --- a/llama/ggml-cuda/fattn-vec-f16.cu +++ /dev/null @@ -1,326 +0,0 @@ -#include "common.cuh" -#include "fattn-common.cuh" -#include "fattn-vec-f16.cuh" - -template // D == head size -#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) -__launch_bounds__(D, 1) -#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) -static __global__ void flash_attn_vec_ext_f16( - const char * __restrict__ Q, - const char * __restrict__ K, - const char * __restrict__ V, - const char * __restrict__ mask, - float * __restrict__ dst, - float2 * __restrict__ dst_meta, - const float scale, - const float max_bias, - const float m0, - const float m1, - const uint32_t n_head_log2, - const int ne00, - const int ne01, - const int ne02, - const int ne03, - const int ne10, - const int ne11, - const int ne12, - const int ne13, - const int ne31, - const int nb31, - const int nb01, - const int nb02, - const int nb03, - const int nb11, - const int nb12, - const int nb13, - const int ne0, - const int ne1, - const int ne2, - const int ne3) { -#if FP16_AVAILABLE - //In this kernel Q, K, V are matrices while i, j, k are matrix indices. - - const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on. - const int ip = blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel. - - const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix. - const float2 * Q_f2 = (const float2 *) (Q + nb02* blockIdx.y + nb01*ic0); - const half2 * K_h2 = (const half2 *) (K + nb12*(blockIdx.y / gqa_ratio)); - const half * V_h = (const half *) (V + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape - const half * maskh = (const half *) mask + ne11*ic0; - - const int stride_KV = nb11 / sizeof(half); - const int stride_KV2 = nb11 / sizeof(half2); - - const float slopef = get_alibi_slope(max_bias, blockIdx.y, n_head_log2, m0, m1); - const half slopeh = __float2half(slopef); - - static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64."); - constexpr int nwarps = D / WARP_SIZE; - const int tid = WARP_SIZE*threadIdx.y + threadIdx.x; - __builtin_assume(tid < D); - - __shared__ half KQ[ncols*D]; -#pragma unroll - for (int j = 0; j < ncols; ++j) { - KQ[j*D + tid] = -HALF_MAX_HALF; - } - half2 * KQ2 = (half2 *) KQ; - - half kqmax[ncols]; -#pragma unroll - for (int j = 0; j < ncols; ++j) { - kqmax[j] = -HALF_MAX_HALF; - } - half kqsum[ncols] = {0.0f}; - - __shared__ half kqmax_shared[ncols][WARP_SIZE]; - __shared__ half kqsum_shared[ncols][WARP_SIZE]; -#pragma unroll - for (int j = 0; j < ncols; ++j) { - if (threadIdx.y == 0) { - kqmax_shared[j][threadIdx.x] = -HALF_MAX_HALF; - kqsum_shared[j][threadIdx.x] = 0.0f; - } - } - __syncthreads(); - - // Convert Q to half2 and store in registers: - half2 Q_h2[ncols][D/(2*WARP_SIZE)]; -#pragma unroll - for (int j = 0; j < ncols; ++j) { -#pragma unroll - for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) { - const int i = i0 + threadIdx.x; - - const float2 tmp = Q_f2[j*(nb01/sizeof(float2)) + i]; - Q_h2[j][i0/WARP_SIZE] = make_half2(scale, scale) * make_half2(tmp.x, tmp.y); - } - } - - half2 VKQ[ncols] = {{0.0f, 0.0f}}; - - const int k_start = parallel_blocks == 1 ? 0 : ip*D; - for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*D) { - // Calculate KQ tile and keep track of new maximum KQ values: - - // For unknown reasons using a half array of size 1 for kqmax_new causes a performance regression, - // see https://github.com/ggerganov/llama.cpp/pull/7061 . - // Therefore this variable is defined twice but only used once (so that the compiler can optimize out the unused variable). - half kqmax_new = kqmax[0]; - half kqmax_new_arr[ncols]; -#pragma unroll - for (int j = 0; j < ncols; ++j) { - kqmax_new_arr[j] = kqmax[j]; - } - -#pragma unroll - for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += nwarps) { - const int i_KQ = i_KQ_0 + threadIdx.y; - - if ((i_KQ_0 + nwarps > D && i_KQ >= D) || (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + i_KQ >= ne11)) { - break; - } - - half2 sum2[ncols] = {{0.0f, 0.0f}}; -#pragma unroll - for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += WARP_SIZE) { - const int k_KQ = k_KQ_0 + threadIdx.x; - - const half2 K_ik = K_h2[(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ]; -#pragma unroll - for (int j = 0; j < ncols; ++j) { - sum2[j] += K_ik * Q_h2[j][k_KQ_0/WARP_SIZE]; - } - } - -#pragma unroll - for (int j = 0; j < ncols; ++j) { - sum2[j] = warp_reduce_sum(sum2[j]); - half sum = __low2half(sum2[j]) + __high2half(sum2[j]); - sum += mask ? slopeh*maskh[j*ne11 + k_VKQ_0 + i_KQ] : __float2half(0.0f); - - if (ncols == 1) { - kqmax_new = ggml_cuda_hmax(kqmax_new, sum); - } else { - kqmax_new_arr[j] = ggml_cuda_hmax(kqmax_new_arr[j], sum); - } - - if (threadIdx.x == 0) { - KQ[j*D + i_KQ] = sum; - } - } - } - -#pragma unroll - for (int j = 0; j < ncols; ++j) { - half kqmax_new_j = ncols == 1 ? kqmax_new : kqmax_new_arr[j]; - - kqmax_new_j = warp_reduce_max(kqmax_new_j); - if (threadIdx.x == 0) { - kqmax_shared[j][threadIdx.y] = kqmax_new_j; - } - } - - __syncthreads(); - -#pragma unroll - for (int j = 0; j < ncols; ++j) { - half kqmax_new_j = kqmax_shared[j][threadIdx.x]; - kqmax_new_j = warp_reduce_max(kqmax_new_j); - - const half KQ_max_scale = hexp(kqmax[j] - kqmax_new_j); - kqmax[j] = kqmax_new_j; - - const half val = hexp(KQ[j*D + tid] - kqmax[j]); - kqsum[j] = kqsum[j]*KQ_max_scale + val; - KQ[j*D + tid] = val; - - VKQ[j] *= __half2half2(KQ_max_scale); - } - - __syncthreads(); - -#pragma unroll - for (int k0 = 0; k0 < D; k0 += 2) { - if (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + k0 >= ne11) { - break; - } - - half2 V_k; - reinterpret_cast(V_k.x) = V_h[(k_VKQ_0 + k0 + 0)*stride_KV + tid]; - reinterpret_cast(V_k.y) = V_h[(k_VKQ_0 + k0 + 1)*stride_KV + tid]; -#pragma unroll - for (int j = 0; j < ncols; ++j) { - VKQ[j] += V_k*KQ2[j*(D/2) + k0/2]; - } - } - - __syncthreads(); - } - -#pragma unroll - for (int j = 0; j < ncols; ++j) { - kqsum[j] = warp_reduce_sum(kqsum[j]); - if (threadIdx.x == 0) { - kqsum_shared[j][threadIdx.y] = kqsum[j]; - } - } - - __syncthreads(); - -#pragma unroll - for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) { - kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x]; - kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]); - - half dst_val = (__low2half(VKQ[j_VKQ]) + __high2half(VKQ[j_VKQ])); - if (parallel_blocks == 1) { - dst_val /= kqsum[j_VKQ]; - } - const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip; - dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val; - } - - if (parallel_blocks != 1 && tid < ncols) { - dst_meta[(ic0 + tid)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[tid], kqsum[tid]); - } -#else - NO_DEVICE_CODE; -#endif // FP16_AVAILABLE -} - -void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - ggml_tensor * KQV = dst; - ggml_tensor * Q = dst->src[0]; - - const int32_t precision = KQV->op_params[2]; - GGML_ASSERT(precision == GGML_PREC_DEFAULT); - - constexpr int cols_per_block = 1; - constexpr int parallel_blocks = 4; - switch (Q->ne[0]) { - case 64: { - constexpr int D = 64; - constexpr int nwarps = D/WARP_SIZE; - fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f16; - launch_fattn(ctx, dst, fattn_kernel, nwarps, cols_per_block); - } break; - case 128: { - constexpr int D = 128; - constexpr int nwarps = D/WARP_SIZE; - fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f16; - launch_fattn(ctx, dst, fattn_kernel, nwarps, cols_per_block); - } break; - case 256: { - constexpr int D = 256; - constexpr int nwarps = D/WARP_SIZE; - fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f16; - launch_fattn(ctx, dst, fattn_kernel, nwarps, cols_per_block); - } break; - default: - GGML_ASSERT(false); - break; - } -} - -template -void launch_fattn_vec_f16_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - const ggml_tensor * Q = dst->src[0]; - switch (Q->ne[0]) { - case 64: { - constexpr int D = 64; - constexpr int nwarps = D/WARP_SIZE; - fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f16; - launch_fattn(ctx, dst, fattn_kernel, nwarps, cols_per_block); - } break; - case 128: { - constexpr int D = 128; - constexpr int nwarps = D/WARP_SIZE; - fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f16; - launch_fattn(ctx, dst, fattn_kernel, nwarps, cols_per_block); - } break; - default: { - GGML_ASSERT(false && "FlashAttention without tensor cores only supports head sizes 64 and 128."); - } break; - } -} - -void ggml_cuda_flash_attn_ext_vec_f16_no_mma(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - const ggml_tensor * KQV = dst; - const ggml_tensor * Q = dst->src[0]; - - const int32_t precision = KQV->op_params[2]; - GGML_ASSERT(precision == GGML_PREC_DEFAULT); - - if (Q->ne[1] == 1) { - ggml_cuda_flash_attn_ext_vec_f16(ctx, dst); - return; - } - - if (Q->ne[1] == 2) { - constexpr int cols_per_block = 2; - constexpr int parallel_blocks = 4; - launch_fattn_vec_f16_64_128(ctx, dst); - return; - } - - if (Q->ne[1] <= 4) { - constexpr int cols_per_block = 4; - constexpr int parallel_blocks = 4; - launch_fattn_vec_f16_64_128(ctx, dst); - return; - } - - if (Q->ne[1] <= 8) { - constexpr int cols_per_block = 8; - constexpr int parallel_blocks = 4; - launch_fattn_vec_f16_64_128(ctx, dst); - return; - } - - constexpr int cols_per_block = 8; - constexpr int parallel_blocks = 1; - launch_fattn_vec_f16_64_128(ctx, dst); -} diff --git a/llama/ggml-cuda/fattn-vec-f32.cu b/llama/ggml-cuda/fattn-vec-f32.cu deleted file mode 100644 index 5bcabd09..00000000 --- a/llama/ggml-cuda/fattn-vec-f32.cu +++ /dev/null @@ -1,275 +0,0 @@ -#include "common.cuh" -#include "fattn-common.cuh" -#include "fattn-vec-f32.cuh" - -template // D == head size -#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) -__launch_bounds__(D, 1) -#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) -static __global__ void flash_attn_vec_ext_f32( - const char * __restrict__ Q, - const char * __restrict__ K, - const char * __restrict__ V, - const char * __restrict__ mask, - float * __restrict__ dst, - float2 * __restrict__ dst_meta, - const float scale, - const float max_bias, - const float m0, - const float m1, - const uint32_t n_head_log2, - const int ne00, - const int ne01, - const int ne02, - const int ne03, - const int ne10, - const int ne11, - const int ne12, - const int ne13, - const int ne31, - const int nb31, - const int nb01, - const int nb02, - const int nb03, - const int nb11, - const int nb12, - const int nb13, - const int ne0, - const int ne1, - const int ne2, - const int ne3) { - //In this kernel Q, K, V are matrices while i, j, k are matrix indices. - - const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on. - const int ip = blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel. - - const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix. - const float2 * Q_f2 = (const float2 *) (Q + nb02* blockIdx.y + nb01*ic0); - const half2 * K_h2 = (const half2 *) (K + nb12*(blockIdx.y / gqa_ratio)); - const half * V_h = (const half *) (V + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape - const half * maskh = (const half *) mask + ne11*ic0; - - const int stride_KV = nb11 / sizeof(half); - const int stride_KV2 = nb11 / sizeof(half2); - - const float slope = get_alibi_slope(max_bias, blockIdx.y, n_head_log2, m0, m1); - - static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64."); - constexpr int nwarps = D / WARP_SIZE; - const int tid = WARP_SIZE*threadIdx.y + threadIdx.x; - __builtin_assume(tid < D); - - __shared__ float KQ[ncols*D]; -#pragma unroll - for (int j = 0; j < ncols; ++j) { - KQ[j*D + tid] = -FLT_MAX/2.0f; - } - - float kqmax[ncols]; -#pragma unroll - for (int j = 0; j < ncols; ++j) { - kqmax[j] = -FLT_MAX/2.0f; - } - float kqsum[ncols] = {0.0f}; - - __shared__ float kqmax_shared[ncols][WARP_SIZE]; - __shared__ float kqsum_shared[ncols][WARP_SIZE]; -#pragma unroll - for (int j = 0; j < ncols; ++j) { - if (threadIdx.y == 0) { - kqmax_shared[j][threadIdx.x] = -FLT_MAX/2.0f; - kqsum_shared[j][threadIdx.x] = 0.0f; - } - } - __syncthreads(); - - // Convert Q to half2 and store in registers: - float2 Q_h2[ncols][D/(2*WARP_SIZE)]; -#pragma unroll - for (int j = 0; j < ncols; ++j) { -#pragma unroll - for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) { - const int i = i0 + threadIdx.x; - - Q_h2[j][i0/WARP_SIZE] = Q_f2[j*(nb01/sizeof(float2)) + i]; - Q_h2[j][i0/WARP_SIZE].x *= scale; - Q_h2[j][i0/WARP_SIZE].y *= scale; - } - } - - float VKQ[ncols] = {0.0f}; - - const int k_start = parallel_blocks == 1 ? 0 : ip*D; - for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*D) { - // Calculate KQ tile and keep track of new maximum KQ values: - - float kqmax_new_arr[ncols]; -#pragma unroll - for (int j = 0; j < ncols; ++j) { - kqmax_new_arr[j] = kqmax[j]; - } - -#pragma unroll - for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += nwarps) { - const int i_KQ = i_KQ_0 + threadIdx.y; - - if ((i_KQ_0 + nwarps > D && i_KQ >= D) || (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + i_KQ >= ne11)) { - break; - } - - float sum[ncols] = {0.0f}; -#pragma unroll - for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += WARP_SIZE) { - const int k_KQ = k_KQ_0 + threadIdx.x; - - const half2 K_ik = K_h2[(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ]; -#pragma unroll - for (int j = 0; j < ncols; ++j) { - sum[j] += __low2float(K_ik) * Q_h2[j][k_KQ_0/WARP_SIZE].x; - sum[j] += __high2float(K_ik) * Q_h2[j][k_KQ_0/WARP_SIZE].y; - } - } - -#pragma unroll - for (int j = 0; j < ncols; ++j) { - sum[j] = warp_reduce_sum(sum[j]); - sum[j] += mask ? slope*__half2float(maskh[j*ne11 + k_VKQ_0 + i_KQ]) : 0.0f; - - kqmax_new_arr[j] = fmaxf(kqmax_new_arr[j], sum[j]); - - if (threadIdx.x == 0) { - KQ[j*D + i_KQ] = sum[j]; - } - } - } - -#pragma unroll - for (int j = 0; j < ncols; ++j) { - float kqmax_new_j = kqmax_new_arr[j]; - - kqmax_new_j = warp_reduce_max(kqmax_new_j); - if (threadIdx.x == 0) { - kqmax_shared[j][threadIdx.y] = kqmax_new_j; - } - } - - __syncthreads(); - -#pragma unroll - for (int j = 0; j < ncols; ++j) { - float kqmax_new_j = kqmax_shared[j][threadIdx.x]; - kqmax_new_j = warp_reduce_max(kqmax_new_j); - - const float KQ_max_scale = expf(kqmax[j] - kqmax_new_j); - kqmax[j] = kqmax_new_j; - - const float val = expf(KQ[j*D + tid] - kqmax[j]); - kqsum[j] = kqsum[j]*KQ_max_scale + val; - KQ[j*D + tid] = val; - - VKQ[j] *= KQ_max_scale; - } - - __syncthreads(); - -#pragma unroll - for (int k = 0; k < D; ++k) { - if (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + k >= ne11) { - break; - } - - const float V_ki = __half2float(V_h[(k_VKQ_0 + k)*stride_KV + tid]); -#pragma unroll - for (int j = 0; j < ncols; ++j) { - VKQ[j] += V_ki*KQ[j*D + k]; - } - } - - __syncthreads(); - } - -#pragma unroll - for (int j = 0; j < ncols; ++j) { - kqsum[j] = warp_reduce_sum(kqsum[j]); - if (threadIdx.x == 0) { - kqsum_shared[j][threadIdx.y] = kqsum[j]; - } - } - - __syncthreads(); - -#pragma unroll - for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) { - kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x]; - kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]); - - float dst_val = VKQ[j_VKQ]; - if (parallel_blocks == 1) { - dst_val /= kqsum[j_VKQ]; - } - const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip; - dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val; - } - - if (parallel_blocks != 1 && tid < ncols) { - dst_meta[(ic0 + tid)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[tid], kqsum[tid]); - } -} - -template -void launch_fattn_vec_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - const ggml_tensor * Q = dst->src[0]; - switch (Q->ne[0]) { - case 64: { - constexpr int D = 64; - constexpr int nwarps = D/WARP_SIZE; - fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f32; - launch_fattn(ctx, dst, fattn_kernel, nwarps, cols_per_block); - } break; - case 128: { - constexpr int D = 128; - constexpr int nwarps = D/WARP_SIZE; - fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f32; - launch_fattn(ctx, dst, fattn_kernel, nwarps, cols_per_block); - } break; - default: { - GGML_ASSERT(false && "FlashAttention without tensor cores only supports head sizes 64 and 128."); - } break; - } -} - -void ggml_cuda_flash_attn_ext_vec_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - const ggml_tensor * Q = dst->src[0]; - - if (Q->ne[1] == 1) { - constexpr int cols_per_block = 1; - constexpr int parallel_blocks = 4; - launch_fattn_vec_f32_64_128(ctx, dst); - return; - } - - if (Q->ne[1] == 2) { - constexpr int cols_per_block = 2; - constexpr int parallel_blocks = 4; - launch_fattn_vec_f32_64_128(ctx, dst); - return; - } - - if (Q->ne[1] <= 4) { - constexpr int cols_per_block = 4; - constexpr int parallel_blocks = 4; - launch_fattn_vec_f32_64_128(ctx, dst); - return; - } - - if (Q->ne[1] <= 8) { - constexpr int cols_per_block = 8; - constexpr int parallel_blocks = 4; - launch_fattn_vec_f32_64_128(ctx, dst); - return; - } - - constexpr int cols_per_block = 8; - constexpr int parallel_blocks = 1; - launch_fattn_vec_f32_64_128(ctx, dst); -} diff --git a/llama/ggml-cuda/getrows.cu b/llama/ggml-cuda/getrows.cu index 55af195f..7f8091bc 100644 --- a/llama/ggml-cuda/getrows.cu +++ b/llama/ggml-cuda/getrows.cu @@ -1,178 +1,178 @@ -#include "getrows.cuh" -#include "dequantize.cuh" - -template -static __global__ void k_get_rows( - const void * src0, const int32_t * src1, dst_t * dst, - int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/ - /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/ - /*size_t s0,*/ size_t s1, size_t s2, size_t s3, - /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03, - size_t s10, size_t s11, size_t s12/*, size_t s13*/) { - - const int i00 = (blockIdx.x*blockDim.x + threadIdx.x)*2; - const int i10 = blockDim.y*blockIdx.y + threadIdx.y; - const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12; - const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12; - - if (i00 >= ne00) { - return; - } - - const int i01 = src1[i10*s10 + i11*s11 + i12*s12]; - - dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3; - const void * src0_row = (const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03; - - const int ib = i00/qk; // block index - const int iqs = (i00%qk)/qr; // quant index - const int iybs = i00 - i00%qk; // dst block start index - const int y_offset = qr == 1 ? 1 : qk/2; - - // dequantize - dfloat2 v; - dequantize_kernel(src0_row, ib, iqs, v); - - dst_row[iybs + iqs + 0] = v.x; - dst_row[iybs + iqs + y_offset] = v.y; -} - -template -static __global__ void k_get_rows_float( - const src0_t * src0, const int32_t * src1, dst_t * dst, - int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/ - /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/ - /*size_t s0,*/ size_t s1, size_t s2, size_t s3, - /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03, - size_t s10, size_t s11, size_t s12/*, size_t s13*/) { - - const int i00 = blockIdx.x*blockDim.x + threadIdx.x; - const int i10 = blockDim.y*blockIdx.y + threadIdx.y; - const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12; - const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12; - - if (i00 >= ne00) { - return; - } - - const int i01 = src1[i10*s10 + i11*s11 + i12*s12]; - - dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3; - const src0_t * src0_row = (const src0_t *)((const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03); - - dst_row[i00] = src0_row[i00]; -} - -template -static void get_rows_cuda(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, - const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) { - - GGML_TENSOR_BINARY_OP_LOCALS - - const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1); - const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE); - const dim3 block_nums(block_num_x, ne10, ne11*ne12); - - // strides in elements - //const size_t s0 = nb0 / ggml_element_size(dst); - const size_t s1 = nb1 / ggml_element_size(dst); - const size_t s2 = nb2 / ggml_element_size(dst); - const size_t s3 = nb3 / ggml_element_size(dst); - - const size_t s10 = nb10 / ggml_element_size(src1); - const size_t s11 = nb11 / ggml_element_size(src1); - const size_t s12 = nb12 / ggml_element_size(src1); - //const size_t s13 = nb13 / ggml_element_size(src1); - - GGML_ASSERT(ne00 % 2 == 0); - - k_get_rows<<>>( - src0_dd, src1_dd, dst_dd, - ne00, /*ne01, ne02, ne03,*/ - /*ne10, ne11,*/ ne12, /*ne13,*/ - /* s0,*/ s1, s2, s3, - /* nb00,*/ nb01, nb02, nb03, - s10, s11, s12/*, s13*/); - - GGML_UNUSED(dst); -} - -template -static void get_rows_cuda_float(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, - const src0_t * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) { - - GGML_TENSOR_BINARY_OP_LOCALS - - const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1); - const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE; - const dim3 block_nums(block_num_x, ne10, ne11*ne12); - - // strides in elements - //const size_t s0 = nb0 / ggml_element_size(dst); - const size_t s1 = nb1 / ggml_element_size(dst); - const size_t s2 = nb2 / ggml_element_size(dst); - const size_t s3 = nb3 / ggml_element_size(dst); - - const size_t s10 = nb10 / ggml_element_size(src1); - const size_t s11 = nb11 / ggml_element_size(src1); - const size_t s12 = nb12 / ggml_element_size(src1); - //const size_t s13 = nb13 / ggml_element_size(src1); - - k_get_rows_float<<>>( - src0_dd, src1_dd, dst_dd, - ne00, /*ne01, ne02, ne03,*/ - /*ne10, ne11,*/ ne12, /*ne13,*/ - /* s0,*/ s1, s2, s3, - /* nb00,*/ nb01, nb02, nb03, - s10, s11, s12/*, s13*/); - - GGML_UNUSED(dst); -} - -void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - const ggml_tensor * src0 = dst->src[0]; - const ggml_tensor * src1 = dst->src[1]; - const float * src0_d = (const float *)src0->data; - const float * src1_d = (const float *)src1->data; - float * dst_d = (float *)dst->data; - cudaStream_t stream = ctx.stream(); - - - GGML_ASSERT(src1->type == GGML_TYPE_I32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); - - GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type)); - GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type)); - GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type)); - - const int32_t * src1_i32 = (const int32_t *) src1_d; - - switch (src0->type) { - case GGML_TYPE_F16: - get_rows_cuda_float(src0, src1, dst, (const half *)src0_d, src1_i32, dst_d, stream); - break; - case GGML_TYPE_F32: - get_rows_cuda_float(src0, src1, dst, src0_d, src1_i32, dst_d, stream); - break; - case GGML_TYPE_Q4_0: - get_rows_cuda(src0, src1, dst, src0_d, src1_i32, dst_d, stream); - break; - case GGML_TYPE_Q4_1: - get_rows_cuda(src0, src1, dst, src0_d, src1_i32, dst_d, stream); - break; - case GGML_TYPE_Q5_0: - get_rows_cuda(src0, src1, dst, src0_d, src1_i32, dst_d, stream); - break; - case GGML_TYPE_Q5_1: - get_rows_cuda(src0, src1, dst, src0_d, src1_i32, dst_d, stream); - break; - case GGML_TYPE_Q8_0: - get_rows_cuda(src0, src1, dst, src0_d, src1_i32, dst_d, stream); - break; - default: - // TODO: k-quants - fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type)); - GGML_ASSERT(false); - break; - } -} +#include "getrows.cuh" +#include "dequantize.cuh" + +template +static __global__ void k_get_rows( + const void * src0, const int32_t * src1, dst_t * dst, + int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/ + /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/ + /*size_t s0,*/ size_t s1, size_t s2, size_t s3, + /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03, + size_t s10, size_t s11, size_t s12/*, size_t s13*/) { + + const int i00 = (blockIdx.x*blockDim.x + threadIdx.x)*2; + const int i10 = blockDim.y*blockIdx.y + threadIdx.y; + const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12; + const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12; + + if (i00 >= ne00) { + return; + } + + const int i01 = src1[i10*s10 + i11*s11 + i12*s12]; + + dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3; + const void * src0_row = (const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03; + + const int ib = i00/qk; // block index + const int iqs = (i00%qk)/qr; // quant index + const int iybs = i00 - i00%qk; // dst block start index + const int y_offset = qr == 1 ? 1 : qk/2; + + // dequantize + dfloat2 v; + dequantize_kernel(src0_row, ib, iqs, v); + + dst_row[iybs + iqs + 0] = v.x; + dst_row[iybs + iqs + y_offset] = v.y; +} + +template +static __global__ void k_get_rows_float( + const src0_t * src0, const int32_t * src1, dst_t * dst, + int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/ + /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/ + /*size_t s0,*/ size_t s1, size_t s2, size_t s3, + /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03, + size_t s10, size_t s11, size_t s12/*, size_t s13*/) { + + const int i00 = blockIdx.x*blockDim.x + threadIdx.x; + const int i10 = blockDim.y*blockIdx.y + threadIdx.y; + const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12; + const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12; + + if (i00 >= ne00) { + return; + } + + const int i01 = src1[i10*s10 + i11*s11 + i12*s12]; + + dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3; + const src0_t * src0_row = (const src0_t *)((const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03); + + dst_row[i00] = src0_row[i00]; +} + +template +static void get_rows_cuda(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) { + + GGML_TENSOR_BINARY_OP_LOCALS + + const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1); + const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE); + const dim3 block_nums(block_num_x, ne10, ne11*ne12); + + // strides in elements + //const size_t s0 = nb0 / ggml_element_size(dst); + const size_t s1 = nb1 / ggml_element_size(dst); + const size_t s2 = nb2 / ggml_element_size(dst); + const size_t s3 = nb3 / ggml_element_size(dst); + + const size_t s10 = nb10 / ggml_element_size(src1); + const size_t s11 = nb11 / ggml_element_size(src1); + const size_t s12 = nb12 / ggml_element_size(src1); + //const size_t s13 = nb13 / ggml_element_size(src1); + + GGML_ASSERT(ne00 % 2 == 0); + + k_get_rows<<>>( + src0_dd, src1_dd, dst_dd, + ne00, /*ne01, ne02, ne03,*/ + /*ne10, ne11,*/ ne12, /*ne13,*/ + /* s0,*/ s1, s2, s3, + /* nb00,*/ nb01, nb02, nb03, + s10, s11, s12/*, s13*/); + + GGML_UNUSED(dst); +} + +template +static void get_rows_cuda_float(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const src0_t * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) { + + GGML_TENSOR_BINARY_OP_LOCALS + + const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1); + const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE; + const dim3 block_nums(block_num_x, ne10, ne11*ne12); + + // strides in elements + //const size_t s0 = nb0 / ggml_element_size(dst); + const size_t s1 = nb1 / ggml_element_size(dst); + const size_t s2 = nb2 / ggml_element_size(dst); + const size_t s3 = nb3 / ggml_element_size(dst); + + const size_t s10 = nb10 / ggml_element_size(src1); + const size_t s11 = nb11 / ggml_element_size(src1); + const size_t s12 = nb12 / ggml_element_size(src1); + //const size_t s13 = nb13 / ggml_element_size(src1); + + k_get_rows_float<<>>( + src0_dd, src1_dd, dst_dd, + ne00, /*ne01, ne02, ne03,*/ + /*ne10, ne11,*/ ne12, /*ne13,*/ + /* s0,*/ s1, s2, s3, + /* nb00,*/ nb01, nb02, nb03, + s10, s11, s12/*, s13*/); + + GGML_UNUSED(dst); +} + +void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + const float * src0_d = (const float *)src0->data; + const float * src1_d = (const float *)src1->data; + float * dst_d = (float *)dst->data; + cudaStream_t stream = ctx.stream(); + + + GGML_ASSERT(src1->type == GGML_TYPE_I32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type)); + GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type)); + GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type)); + + const int32_t * src1_i32 = (const int32_t *) src1_d; + + switch (src0->type) { + case GGML_TYPE_F16: + get_rows_cuda_float(src0, src1, dst, (const half *)src0_d, src1_i32, dst_d, stream); + break; + case GGML_TYPE_F32: + get_rows_cuda_float(src0, src1, dst, src0_d, src1_i32, dst_d, stream); + break; + case GGML_TYPE_Q4_0: + get_rows_cuda(src0, src1, dst, src0_d, src1_i32, dst_d, stream); + break; + case GGML_TYPE_Q4_1: + get_rows_cuda(src0, src1, dst, src0_d, src1_i32, dst_d, stream); + break; + case GGML_TYPE_Q5_0: + get_rows_cuda(src0, src1, dst, src0_d, src1_i32, dst_d, stream); + break; + case GGML_TYPE_Q5_1: + get_rows_cuda(src0, src1, dst, src0_d, src1_i32, dst_d, stream); + break; + case GGML_TYPE_Q8_0: + get_rows_cuda(src0, src1, dst, src0_d, src1_i32, dst_d, stream); + break; + default: + // TODO: k-quants + fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type)); + GGML_ASSERT(false); + break; + } +} diff --git a/llama/ggml-cuda/getrows.cuh b/llama/ggml-cuda/getrows.cuh index bbf13023..04e1c187 100644 --- a/llama/ggml-cuda/getrows.cuh +++ b/llama/ggml-cuda/getrows.cuh @@ -1,5 +1,5 @@ -#include "common.cuh" - -#define CUDA_GET_ROWS_BLOCK_SIZE 256 - -void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst); +#include "common.cuh" + +#define CUDA_GET_ROWS_BLOCK_SIZE 256 + +void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/im2col.cu b/llama/ggml-cuda/im2col.cu index 3d0d8d4e..5510b43c 100644 --- a/llama/ggml-cuda/im2col.cu +++ b/llama/ggml-cuda/im2col.cu @@ -1,104 +1,104 @@ -#include "im2col.cuh" - -template -static __global__ void im2col_kernel( - const float * x, T * dst, int64_t batch_offset, - int64_t offset_delta, int64_t IC, int64_t IW, int64_t IH, int64_t OH, int64_t OW, int64_t KW, int64_t KH, int64_t pelements, int64_t CHW, - int s0, int s1, int p0, int p1, int d0, int d1) { - const int64_t i = threadIdx.x + blockIdx.x * blockDim.x; - if (i >= pelements) { - return; - } - - const int64_t ksize = OW * (KH > 1 ? KW : 1); - const int64_t kx = i / ksize; - const int64_t kd = kx * ksize; - const int64_t ky = (i - kd) / OW; - const int64_t ix = i % OW; - - const int64_t oh = blockIdx.y; - const int64_t batch = blockIdx.z / IC; - const int64_t ic = blockIdx.z % IC; - - const int64_t iiw = ix * s0 + kx * d0 - p0; - const int64_t iih = oh * s1 + ky * d1 - p1; - - const int64_t offset_dst = - ((batch * OH + oh) * OW + ix) * CHW + - (ic * (KW * KH) + ky * KW + kx); - - if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) { - dst[offset_dst] = 0.0f; - } else { - const int64_t offset_src = ic * offset_delta + batch * batch_offset; - dst[offset_dst] = x[offset_src + iih * IW + iiw]; - } -} - -template -static void im2col_cuda(const float * x, T* dst, - int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC, - int64_t batch, int64_t batch_offset, int64_t offset_delta, - int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) { - const int parallel_elements = OW * KW * KH; - const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE; - dim3 block_nums(num_blocks, OH, batch * IC); - im2col_kernel<<>>(x, dst, batch_offset, offset_delta, IC, IW, IH, OH, OW, KW, KH, parallel_elements, (IC * KH * KW), s0, s1, p0, p1, d0, d1); -} - -static void im2col_cuda_f16(const float * x, half * dst, - int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC, - int64_t batch, int64_t batch_offset, int64_t offset_delta, - int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) { - - im2col_cuda(x, dst, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, offset_delta, s0, s1, p0, p1, d0, d1, stream); -} - -static void im2col_cuda_f32(const float * x, float * dst, - int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC, - int64_t batch, int64_t batch_offset, int64_t offset_delta, - int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) { - - im2col_cuda(x, dst, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, offset_delta, s0, s1, p0, p1, d0, d1, stream); -} - -void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - const ggml_tensor * src0 = dst->src[0]; - const ggml_tensor * src1 = dst->src[1]; - const float * src1_d = (const float *)src1->data; - float * dst_d = (float *)dst->data; - cudaStream_t stream = ctx.stream(); - - GGML_ASSERT(src0->type == GGML_TYPE_F16); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32); - - const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; - const int32_t s1 = ((const int32_t*)(dst->op_params))[1]; - const int32_t p0 = ((const int32_t*)(dst->op_params))[2]; - const int32_t p1 = ((const int32_t*)(dst->op_params))[3]; - const int32_t d0 = ((const int32_t*)(dst->op_params))[4]; - const int32_t d1 = ((const int32_t*)(dst->op_params))[5]; - - const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1; - - const int64_t IC = src1->ne[is_2D ? 2 : 1]; - const int64_t IH = is_2D ? src1->ne[1] : 1; - const int64_t IW = src1->ne[0]; - - const int64_t KH = is_2D ? src0->ne[1] : 1; - const int64_t KW = src0->ne[0]; - - const int64_t OH = is_2D ? dst->ne[2] : 1; - const int64_t OW = dst->ne[1]; - - const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32 - const int64_t batch = src1->ne[3]; - const size_t batch_offset = src1->nb[3] / 4; // nb is byte offset, src is type float32 - - if(dst->type == GGML_TYPE_F16) { - im2col_cuda_f16(src1_d, (half *) dst_d, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, stream); - } else { - im2col_cuda_f32(src1_d, (float *) dst_d, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, stream); - } -} +#include "im2col.cuh" + +template +static __global__ void im2col_kernel( + const float * x, T * dst, int64_t batch_offset, + int64_t offset_delta, int64_t IC, int64_t IW, int64_t IH, int64_t OH, int64_t OW, int64_t KW, int64_t KH, int64_t pelements, int64_t CHW, + int s0, int s1, int p0, int p1, int d0, int d1) { + const int64_t i = threadIdx.x + blockIdx.x * blockDim.x; + if (i >= pelements) { + return; + } + + const int64_t ksize = OW * (KH > 1 ? KW : 1); + const int64_t kx = i / ksize; + const int64_t kd = kx * ksize; + const int64_t ky = (i - kd) / OW; + const int64_t ix = i % OW; + + const int64_t oh = blockIdx.y; + const int64_t batch = blockIdx.z / IC; + const int64_t ic = blockIdx.z % IC; + + const int64_t iiw = ix * s0 + kx * d0 - p0; + const int64_t iih = oh * s1 + ky * d1 - p1; + + const int64_t offset_dst = + ((batch * OH + oh) * OW + ix) * CHW + + (ic * (KW * KH) + ky * KW + kx); + + if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) { + dst[offset_dst] = 0.0f; + } else { + const int64_t offset_src = ic * offset_delta + batch * batch_offset; + dst[offset_dst] = x[offset_src + iih * IW + iiw]; + } +} + +template +static void im2col_cuda(const float * x, T* dst, + int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC, + int64_t batch, int64_t batch_offset, int64_t offset_delta, + int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) { + const int parallel_elements = OW * KW * KH; + const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE; + dim3 block_nums(num_blocks, OH, batch * IC); + im2col_kernel<<>>(x, dst, batch_offset, offset_delta, IC, IW, IH, OH, OW, KW, KH, parallel_elements, (IC * KH * KW), s0, s1, p0, p1, d0, d1); +} + +static void im2col_cuda_f16(const float * x, half * dst, + int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC, + int64_t batch, int64_t batch_offset, int64_t offset_delta, + int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) { + + im2col_cuda(x, dst, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, offset_delta, s0, s1, p0, p1, d0, d1, stream); +} + +static void im2col_cuda_f32(const float * x, float * dst, + int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC, + int64_t batch, int64_t batch_offset, int64_t offset_delta, + int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) { + + im2col_cuda(x, dst, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, offset_delta, s0, s1, p0, p1, d0, d1, stream); +} + +void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + const float * src1_d = (const float *)src1->data; + float * dst_d = (float *)dst->data; + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32); + + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + const int32_t s1 = ((const int32_t*)(dst->op_params))[1]; + const int32_t p0 = ((const int32_t*)(dst->op_params))[2]; + const int32_t p1 = ((const int32_t*)(dst->op_params))[3]; + const int32_t d0 = ((const int32_t*)(dst->op_params))[4]; + const int32_t d1 = ((const int32_t*)(dst->op_params))[5]; + + const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1; + + const int64_t IC = src1->ne[is_2D ? 2 : 1]; + const int64_t IH = is_2D ? src1->ne[1] : 1; + const int64_t IW = src1->ne[0]; + + const int64_t KH = is_2D ? src0->ne[1] : 1; + const int64_t KW = src0->ne[0]; + + const int64_t OH = is_2D ? dst->ne[2] : 1; + const int64_t OW = dst->ne[1]; + + const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32 + const int64_t batch = src1->ne[3]; + const size_t batch_offset = src1->nb[3] / 4; // nb is byte offset, src is type float32 + + if(dst->type == GGML_TYPE_F16) { + im2col_cuda_f16(src1_d, (half *) dst_d, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, stream); + } else { + im2col_cuda_f32(src1_d, (float *) dst_d, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, stream); + } +} diff --git a/llama/ggml-cuda/im2col.cuh b/llama/ggml-cuda/im2col.cuh index 1ce8fae4..55c898bf 100644 --- a/llama/ggml-cuda/im2col.cuh +++ b/llama/ggml-cuda/im2col.cuh @@ -1,5 +1,5 @@ -#include "common.cuh" - -#define CUDA_IM2COL_BLOCK_SIZE 256 - -void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst); +#include "common.cuh" + +#define CUDA_IM2COL_BLOCK_SIZE 256 + +void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/mmq.cu b/llama/ggml-cuda/mmq.cu index 58799e4c..1d6b9e69 100644 --- a/llama/ggml-cuda/mmq.cu +++ b/llama/ggml-cuda/mmq.cu @@ -11,6 +11,7 @@ void ggml_cuda_op_mul_mat_q( const int64_t nb01 = src0->nb[1]; const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; GGML_ASSERT(ne10 % QK8_1 == 0); const int64_t ne0 = dst->ne[0]; @@ -25,7 +26,7 @@ void ggml_cuda_op_mul_mat_q( // nrows_dst == nrows of the matrix that the kernel writes into const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff; - const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, nrows_dst}; + const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst}; switch (src0->type) { case GGML_TYPE_Q4_0: diff --git a/llama/ggml-cuda/mmq.cuh b/llama/ggml-cuda/mmq.cuh index 6744cce6..3ccae8a0 100644 --- a/llama/ggml-cuda/mmq.cuh +++ b/llama/ggml-cuda/mmq.cuh @@ -1,15 +1,26 @@ +#pragma once + #include "common.cuh" #include "vecdotq.cuh" #include #include +#define MMQ_TILE_Y_K (WARP_SIZE + WARP_SIZE/QI8_1) + typedef void (*load_tiles_mmq_t)( const char * __restrict__ x, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & kbx0, const int & i_max, const int & stride); typedef void (*vec_dot_mmq_t)( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, - const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, float * __restrict__ sum, const int & k0); + const int * __restrict__ y, float * __restrict__ sum, const int & k0); + +struct block_q8_1_mmq { + half2 ds[4]; + int8_t qs[4*QK8_1]; +}; +static_assert(sizeof(block_q8_1_mmq) == 4*QK8_1 + 4*sizeof(half2), "Unexpected block_q8_1_mmq size"); +static_assert(sizeof(block_q8_1_mmq) == 4*sizeof(block_q8_1), "Unexpected block_q8_1_mmq size"); struct tile_x_sizes { int ql; @@ -132,10 +143,14 @@ template static __device__ __forceinlin template static __device__ __forceinline__ void vec_dot_q4_0_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, - const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) { + const int * __restrict__ y, float * __restrict__ sum, const int & k0) { GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); + const float * x_dmf = (const float *) x_dm; + const int * y_qs = (const int *) y + 4; + const half2 * y_ds = (const half2 *) y; + #pragma unroll for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { const int j = j0 + threadIdx.y; @@ -145,19 +160,18 @@ static __device__ __forceinline__ void vec_dot_q4_0_q8_1_mul_mat( const int i = i0 + threadIdx.x; const int kyqs = k0 % (QI8_1/2) + QI8_1 * (k0 / (QI8_1/2)); - const float * x_dmf = (const float *) x_dm; int u[2*VDR_Q4_0_Q8_1_MMQ]; #pragma unroll for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) { - u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; - u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE]; + u[2*l+0] = y_qs[j*MMQ_TILE_Y_K + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j*MMQ_TILE_Y_K + (kyqs + l + QI4_0) % WARP_SIZE]; } sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q4_0_q8_1_impl - (&x_ql[i * (WARP_SIZE + 1) + k0], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k0/QI4_0], - y_ds[j * (WARP_SIZE/QI8_1) + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]); + (&x_ql[i*(WARP_SIZE + 1) + k0], u, x_dmf[i*(WARP_SIZE/QI4_0) + i/QI4_0 + k0/QI4_0], + y_ds[j*MMQ_TILE_Y_K + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]); } } } @@ -203,10 +217,13 @@ template static __device__ __forceinlin template static __device__ __forceinline__ void vec_dot_q4_1_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, - const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) { + const int * __restrict__ y, float * __restrict__ sum, const int & k0) { GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); + const int * y_qs = (const int *) y + 4; + const half2 * y_ds = (const half2 *) y; + #pragma unroll for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { const int j = j0 + threadIdx.y; @@ -221,13 +238,13 @@ static __device__ __forceinline__ void vec_dot_q4_1_q8_1_mul_mat( #pragma unroll for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) { - u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; - u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE]; + u[2*l+0] = y_qs[j*MMQ_TILE_Y_K + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j*MMQ_TILE_Y_K + (kyqs + l + QI4_1) % WARP_SIZE]; } sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q4_1_q8_1_impl - (&x_ql[i * (WARP_SIZE + 1) + k0], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k0/QI4_1], - y_ds[j * (WARP_SIZE/QI8_1) + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]); + (&x_ql[i*(WARP_SIZE + 1) + k0], u, x_dm[i*(WARP_SIZE/QI4_1) + i/QI4_1 + k0/QI4_1], + y_ds[j*MMQ_TILE_Y_K + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]); } } } @@ -293,10 +310,14 @@ template static __device__ __forceinlin template static __device__ __forceinline__ void vec_dot_q5_0_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, - const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) { + const int * __restrict__ y, float * __restrict__ sum, const int & k0) { GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); + const float * x_dmf = (const float *) x_dm; + const int * y_qs = (const int *) y + 4; + const float * y_df = (const float *) y; + #pragma unroll for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { const int j = j0 + threadIdx.y; @@ -306,20 +327,18 @@ static __device__ __forceinline__ void vec_dot_q5_0_q8_1_mul_mat( const int i = i0 + threadIdx.x; const int kyqs = k0 % (QI8_1/2) + QI8_1 * (k0 / (QI8_1/2)); - const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k0/QI5_0; - const float * x_dmf = (const float *) x_dm; - const float * y_df = (const float *) y_ds; + const int index_bx = i*(WARP_SIZE/QI5_0) + i/QI5_0 + k0/QI5_0; int u[2*VDR_Q5_0_Q8_1_MMQ]; #pragma unroll for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) { - u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; - u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE]; + u[2*l+0] = y_qs[j*MMQ_TILE_Y_K + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j*MMQ_TILE_Y_K + (kyqs + l + QI5_0) % WARP_SIZE]; } sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q8_0_q8_1_impl - (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k0], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]); + (&x_ql[i*(2*WARP_SIZE + 1) + 2*k0], u, x_dmf[index_bx], y_df[j*MMQ_TILE_Y_K + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]); } } } @@ -383,10 +402,13 @@ template static __device__ __forceinlin template static __device__ __forceinline__ void vec_dot_q5_1_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, - const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) { + const int * __restrict__ y, float * __restrict__ sum, const int & k0) { GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); + const int * y_qs = (const int *) y + 4; + const half2 * y_ds = (const half2 *) y; + #pragma unroll for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { const int j = j0 + threadIdx.y; @@ -396,18 +418,18 @@ static __device__ __forceinline__ void vec_dot_q5_1_q8_1_mul_mat( const int i = i0 + threadIdx.x; const int kyqs = k0 % (QI8_1/2) + QI8_1 * (k0 / (QI8_1/2)); - const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k0/QI5_1; + const int index_bx = i*(WARP_SIZE/QI5_1) + i/QI5_1 + k0/QI5_1; int u[2*VDR_Q5_1_Q8_1_MMQ]; #pragma unroll for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) { - u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; - u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE]; + u[2*l+0] = y_qs[j*MMQ_TILE_Y_K + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j*MMQ_TILE_Y_K + (kyqs + l + QI5_1) % WARP_SIZE]; } sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q8_1_q8_1_impl - (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k0], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]); + (&x_ql[i*(2*WARP_SIZE + 1) + 2*k0], u, x_dm[index_bx], y_ds[j*MMQ_TILE_Y_K + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]); } } } @@ -455,10 +477,14 @@ template static __device__ __forceinlin template static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, - const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) { + const int * __restrict__ y, float * __restrict__ sum, const int & k0) { GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); + const float * x_dmf = (const float *) x_dm; + const int * y_qs = (const int *) y + 4; + const float * y_df = (const float *) y; + #pragma unroll for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { const int j = j0 + threadIdx.y; @@ -467,12 +493,9 @@ static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mul_mat( for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) { const int i = i0 + threadIdx.x; - const float * x_dmf = (const float *) x_dm; - const float * y_df = (const float *) y_ds; - sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q8_0_q8_1_impl - (&x_ql[i * (WARP_SIZE + 1) + k0], &y_qs[j * WARP_SIZE + k0], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k0/QI8_0], - y_df[j * (WARP_SIZE/QI8_1) + k0/QI8_1]); + (&x_ql[i*(WARP_SIZE + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + k0], x_dmf[i*(WARP_SIZE/QI8_0) + i/QI8_0 + k0/QI8_0], + y_df[j*MMQ_TILE_Y_K + k0/QI8_1]); } } } @@ -531,10 +554,13 @@ template static __device__ __forceinlin template static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, - const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) { + const int * __restrict__ y, float * __restrict__ sum, const int & k0) { GGML_UNUSED(x_qh); + const int * y_qs = (const int *) y + 4; + const float * y_df = (const float *) y; + #pragma unroll for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { const int j = j0 + threadIdx.y; @@ -545,11 +571,10 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mul_mat( const int kbx = k0 / QI2_K; const int ky = (k0 % QI2_K) * QR2_K; - const float * y_df = (const float *) y_ds; int v[QR2_K*VDR_Q2_K_Q8_1_MMQ]; - const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2); + const int kqsx = i*(WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2); const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2)); #pragma unroll @@ -557,11 +582,11 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mul_mat( v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303; } - const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4; + const uint8_t * scales = ((const uint8_t *) &x_sc[i*(WARP_SIZE/4) + i/4 + kbx*4]) + ky/4; - const int index_y = j * WARP_SIZE + (QR2_K*k0) % WARP_SIZE; sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q2_K_q8_1_impl_mmq( - v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]); + v, &y_qs[j*MMQ_TILE_Y_K + (QR2_K*k0) % WARP_SIZE], scales, + x_dm[i*(WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[j*MMQ_TILE_Y_K + ((QR2_K*k0) % WARP_SIZE)/QI8_1]); } } } @@ -646,7 +671,11 @@ template static __device__ __forceinlin template static __device__ __forceinline__ void vec_dot_q3_K_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, - const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) { + const int * __restrict__ y, float * __restrict__ sum, const int & k0) { + + const float * x_dmf = (const float *) x_dm; + const int * y_qs = (const int *) y + 4; + const float * y_df = (const float *) y; #pragma unroll for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { @@ -658,8 +687,6 @@ static __device__ __forceinline__ void vec_dot_q3_K_q8_1_mul_mat( const int kbx = k0 / QI3_K; const int ky = (k0 % QI3_K) * QR3_K; - const float * x_dmf = (const float *) x_dm; - const float * y_df = (const float *) y_ds; const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4; @@ -667,19 +694,19 @@ static __device__ __forceinline__ void vec_dot_q3_K_q8_1_mul_mat( #pragma unroll for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) { - const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2); + const int kqsx = i*(WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2); const int shift = 2 * ((ky % 32) / 8); const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303; - const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8); + const int vh = x_qh[i*(WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8); const int vlh = (vh << 2) & 0x04040404; v[l] = __vsubss4(vll, vlh); } - const int index_y = j * WARP_SIZE + (k0*QR3_K) % WARP_SIZE; sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q3_K_q8_1_impl_mmq( - v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]); + v, &y_qs[j*MMQ_TILE_Y_K + (k0*QR3_K) % WARP_SIZE], scales, + x_dmf[i*(WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[j*MMQ_TILE_Y_K + ((k0*QR3_K) % WARP_SIZE)/QI8_1]); } } } @@ -746,10 +773,13 @@ template static __device__ __forceinlin template static __device__ __forceinline__ void vec_dot_q4_K_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, - const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) { + const int * __restrict__ y, float * __restrict__ sum, const int & k0) { GGML_UNUSED(x_qh); + const int * y_qs = (const int *) y + 4; + const half2 * y_ds = (const half2 *) y; + #pragma unroll for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { const int j = j0 + threadIdx.y; @@ -760,9 +790,9 @@ static __device__ __forceinline__ void vec_dot_q4_K_q8_1_mul_mat( const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k0/16]) + 2*((k0 % 16) / 8); - const int index_y = j * WARP_SIZE + (QR4_K*k0) % WARP_SIZE; sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q4_K_q8_1_impl_mmq( - &x_ql[i * (WARP_SIZE + 1) + k0], &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]); + &x_ql[i*(WARP_SIZE + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + (QR4_K*k0) % WARP_SIZE], sc, sc+8, + x_dm[i*(WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[j*MMQ_TILE_Y_K + ((QR4_K*k0) % WARP_SIZE)/QI8_1]); } } } @@ -842,10 +872,13 @@ template static __device__ __forceinlin template static __device__ __forceinline__ void vec_dot_q5_K_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, - const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) { + const int * __restrict__ y, float * __restrict__ sum, const int & k0) { GGML_UNUSED(x_qh); + const int * y_qs = (const int *) y + 4; + const half2 * y_ds = (const half2 *) y; + #pragma unroll for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { const int j = j0 + threadIdx.y; @@ -856,10 +889,9 @@ static __device__ __forceinline__ void vec_dot_q5_K_q8_1_mul_mat( const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k0/16]) + 2 * ((k0 % 16) / 8); - const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k0; - const int index_y = j * WARP_SIZE + (QR5_K*k0) % WARP_SIZE; sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q5_K_q8_1_impl_mmq( - &x_ql[index_x], &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]); + &x_ql[i*(QR5_K*WARP_SIZE + 1) + QR5_K*k0], &y_qs[j*MMQ_TILE_Y_K + (QR5_K*k0) % WARP_SIZE], sc, sc+8, + x_dm[i*(WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[j*MMQ_TILE_Y_K + ((QR5_K*k0) % WARP_SIZE)/QI8_1]); } } } @@ -932,10 +964,14 @@ template static __device__ __forceinlin template static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, - const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) { + const int * __restrict__ y, float * __restrict__ sum, const int & k0) { GGML_UNUSED(x_qh); + const float * x_dmf = (const float *) x_dm; + const int * y_qs = (const int *) y + 4; + const float * y_df = (const float *) y; + #pragma unroll for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { const int j = j0 + threadIdx.y; @@ -944,15 +980,11 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mul_mat( for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) { const int i = i0 + threadIdx.x; - const float * x_dmf = (const float *) x_dm; - const float * y_df = (const float *) y_ds; - const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k0/8]); - const int index_x = i * (QR6_K*WARP_SIZE + 1) + QR6_K*k0; - const int index_y = j * WARP_SIZE + (QR6_K*k0) % WARP_SIZE; sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q6_K_q8_1_impl_mmq( - &x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]); + &x_ql[i*(QR6_K*WARP_SIZE + 1) + QR6_K*k0], &y_qs[j*MMQ_TILE_Y_K + (QR6_K*k0) % WARP_SIZE], sc, + x_dmf[i*(WARP_SIZE/QI6_K) + i/QI6_K], &y_df[j*MMQ_TILE_Y_K + ((QR6_K*k0) % WARP_SIZE)/QI8_1]); } } } @@ -964,7 +996,6 @@ struct mmq_type_traits; template struct mmq_type_traits { - static constexpr bool need_sum = true; static constexpr int vdr = VDR_Q4_0_Q8_1_MMQ; static constexpr load_tiles_mmq_t load_tiles = load_tiles_q4_0; static constexpr vec_dot_mmq_t vec_dot = vec_dot_q4_0_q8_1_mul_mat; @@ -972,7 +1003,6 @@ struct mmq_type_traits { template struct mmq_type_traits { - static constexpr bool need_sum = true; static constexpr int vdr = VDR_Q4_1_Q8_1_MMQ; static constexpr load_tiles_mmq_t load_tiles = load_tiles_q4_1; static constexpr vec_dot_mmq_t vec_dot = vec_dot_q4_1_q8_1_mul_mat; @@ -980,7 +1010,6 @@ struct mmq_type_traits { template struct mmq_type_traits { - static constexpr bool need_sum = false; static constexpr int vdr = VDR_Q5_0_Q8_1_MMQ; static constexpr load_tiles_mmq_t load_tiles = load_tiles_q5_0; static constexpr vec_dot_mmq_t vec_dot = vec_dot_q5_0_q8_1_mul_mat; @@ -988,7 +1017,6 @@ struct mmq_type_traits { template struct mmq_type_traits { - static constexpr bool need_sum = true; static constexpr int vdr = VDR_Q5_1_Q8_1_MMQ; static constexpr load_tiles_mmq_t load_tiles = load_tiles_q5_1; static constexpr vec_dot_mmq_t vec_dot = vec_dot_q5_1_q8_1_mul_mat; @@ -996,7 +1024,6 @@ struct mmq_type_traits { template struct mmq_type_traits { - static constexpr bool need_sum = false; static constexpr int vdr = VDR_Q8_0_Q8_1_MMQ; static constexpr load_tiles_mmq_t load_tiles = load_tiles_q8_0; static constexpr vec_dot_mmq_t vec_dot = vec_dot_q8_0_q8_1_mul_mat; @@ -1004,7 +1031,6 @@ struct mmq_type_traits { template struct mmq_type_traits { - static constexpr bool need_sum = false; static constexpr int vdr = VDR_Q2_K_Q8_1_MMQ; static constexpr load_tiles_mmq_t load_tiles = load_tiles_q2_K; static constexpr vec_dot_mmq_t vec_dot = vec_dot_q2_K_q8_1_mul_mat; @@ -1012,7 +1038,6 @@ struct mmq_type_traits { template struct mmq_type_traits { - static constexpr bool need_sum = false; static constexpr int vdr = VDR_Q3_K_Q8_1_MMQ; static constexpr load_tiles_mmq_t load_tiles = load_tiles_q3_K; static constexpr vec_dot_mmq_t vec_dot = vec_dot_q3_K_q8_1_mul_mat; @@ -1020,7 +1045,6 @@ struct mmq_type_traits { template struct mmq_type_traits { - static constexpr bool need_sum = true; static constexpr int vdr = VDR_Q4_K_Q8_1_MMQ; static constexpr load_tiles_mmq_t load_tiles = load_tiles_q4_K; static constexpr vec_dot_mmq_t vec_dot = vec_dot_q4_K_q8_1_mul_mat; @@ -1028,7 +1052,6 @@ struct mmq_type_traits { template struct mmq_type_traits { - static constexpr bool need_sum = true; static constexpr int vdr = VDR_Q5_K_Q8_1_MMQ; static constexpr load_tiles_mmq_t load_tiles = load_tiles_q5_K; static constexpr vec_dot_mmq_t vec_dot = vec_dot_q5_K_q8_1_mul_mat; @@ -1036,12 +1059,36 @@ struct mmq_type_traits { template struct mmq_type_traits { - static constexpr bool need_sum = false; static constexpr int vdr = VDR_Q6_K_Q8_1_MMQ; static constexpr load_tiles_mmq_t load_tiles = load_tiles_q6_K; static constexpr vec_dot_mmq_t vec_dot = vec_dot_q6_K_q8_1_mul_mat; }; +static int mmq_need_sum(const ggml_type type_x) { + switch (type_x) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + return true; + case GGML_TYPE_Q5_0: + return false; + case GGML_TYPE_Q5_1: + return true; + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + return false; + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + return true; + case GGML_TYPE_Q6_K: + return false; + default: + GGML_ASSERT(false); + break; + } + return false; +} + template #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #if defined(RDNA3) || defined(RDNA2) @@ -1056,7 +1103,7 @@ template #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) static __global__ void mul_mat_q( const char * __restrict__ x, const char * __restrict__ yc, float * __restrict__ dst, - const int ne00, const int ne01, const int stride00, const int ne10, const int ne11, const int ne0) { + const int ne00, const int ne01, const int stride01, const int ne10, const int ne11, const int stride11, const int ne0) { // Skip unused template specializations for faster compilation: if (mmq_x > get_mmq_x_max_device()) { @@ -1068,7 +1115,6 @@ static __global__ void mul_mat_q( constexpr int qr = ggml_cuda_type_traits::qr; constexpr int qi = ggml_cuda_type_traits::qi; constexpr int mmq_y = get_mmq_y_device(mmq_x); - constexpr bool need_sum = mmq_type_traits::need_sum; constexpr int vdr = mmq_type_traits::vdr; constexpr load_tiles_mmq_t load_tiles = mmq_type_traits::load_tiles; constexpr vec_dot_mmq_t vec_dot = mmq_type_traits::vec_dot; @@ -1080,62 +1126,38 @@ static __global__ void mul_mat_q( half2 * tile_x_dm = (half2 *) (tile_x_ql + txs.ql); int * tile_x_qh = (int *) (tile_x_dm + txs.dm); int * tile_x_sc = (int *) (tile_x_qh + txs.qh); - int * tile_y_qs = (int *) (tile_x_sc + txs.sc); // [mmq_x * WARP_SIZE] - half2 * tile_y_ds = (half2 *) (tile_y_qs + mmq_x*WARP_SIZE); // [mmq_x * WARP_SIZE/QI8_1]; - - const block_q8_1 * y = (const block_q8_1 *) yc; + int * tile_y = (int *) (tile_x_sc + txs.sc); // [mmq_x * (WARP_SIZE + WARP_SIZE/QI8_1)] const int blocks_per_row_x = ne00 / qk; - const int blocks_per_col_y = ne10 / QK8_1; const int blocks_per_warp = WARP_SIZE / qi; const int & ne1 = ne11; const int tile_x_max_i = ne01 - blockIdx.x*mmq_y - 1; + const int * y = (const int *) yc + blockIdx.y*(mmq_x*sizeof(block_q8_1_mmq)/sizeof(int)); + float sum[(mmq_x/nwarps) * (mmq_y/WARP_SIZE)] = {0.0f}; for (int kb0 = 0; kb0 < blocks_per_row_x; kb0 += blocks_per_warp) { - load_tiles(x, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, stride00*blockIdx.x*mmq_y + kb0, tile_x_max_i, stride00); + load_tiles(x, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, stride01*blockIdx.x*mmq_y + kb0, tile_x_max_i, stride01); #pragma unroll for (int kr = 0; kr < qr; ++kr) { - const int kqs = kr*WARP_SIZE + threadIdx.x; - const int kbxd = kqs / QI8_1; - + const int * by0 = y + stride11*(kb0*(qk*sizeof(block_q8_1_mmq) / (4*QK8_1*sizeof(int))) + kr*sizeof(block_q8_1_mmq)/sizeof(int)); #pragma unroll - for (int i0 = 0; i0 < mmq_x; i0 += nwarps) { - const int i = min(blockIdx.y*mmq_x + threadIdx.y + i0, ne11-1); // to prevent out-of-bounds memory accesses + for (int l0 = 0; l0 < mmq_x*MMQ_TILE_Y_K; l0 += nwarps*WARP_SIZE) { + int l = l0 + threadIdx.y*WARP_SIZE + threadIdx.x; - const block_q8_1 * by0 = &y[i*blocks_per_col_y + kb0 * (qk/QK8_1) + kbxd]; - - const int index_y = (i0 + threadIdx.y) * WARP_SIZE + kqs % WARP_SIZE; - tile_y_qs[index_y] = get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1); - } - -#pragma unroll - for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) { - const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE/QI8_1)) % mmq_x; - const int kby = threadIdx.x % (WARP_SIZE/QI8_1); - const int i_y_eff = min(blockIdx.y*mmq_x + ids, ne11-1); - - // if the sum is not needed it's faster to transform the scale to f32 ahead of time - const half2 * dsi_src = &y[i_y_eff*blocks_per_col_y + kb0 * (qk/QK8_1) + kr*(WARP_SIZE/QI8_1) + kby].ds; - half2 * dsi_dst = &tile_y_ds[ids * (WARP_SIZE/QI8_1) + kby]; - if (need_sum) { - *dsi_dst = *dsi_src; - } else { - float * dfi_dst = (float *) dsi_dst; - *dfi_dst = __low2float(*dsi_src); - } + tile_y[l] = by0[l]; } __syncthreads(); // #pragma unroll // unrolling this loop causes too much register pressure for (int k0 = kr*WARP_SIZE/qr; k0 < (kr+1)*WARP_SIZE/qr; k0 += vdr) { - vec_dot(tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds, sum, k0); + vec_dot(tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y, sum, k0); } __syncthreads(); @@ -1165,8 +1187,8 @@ static __global__ void mul_mat_q( struct mmq_args { const char * x; const char * y; float * dst; - int64_t ne00; int64_t ne01; int64_t stride00; - int64_t ne10; int64_t ne11; + int64_t ne00; int64_t ne01; int64_t stride01; + int64_t ne10; int64_t ne11; int64_t stride11; int64_t ne0; }; @@ -1184,7 +1206,7 @@ static void launch_mul_mat_q(const mmq_args & args, cudaStream_t stream) { const tile_x_sizes txs = get_tile_x_sizes_host(type, mmq_y); const int shmem_x = txs.ql*sizeof(int) + txs.dm*sizeof(half2) + txs.qh*sizeof(int) + txs.sc*sizeof(int); const int shmem_y = mmq_x*WARP_SIZE*sizeof(int) + mmq_x*(WARP_SIZE/QI8_1)*sizeof(half2); - const int shmem = shmem_x + shmem_y; + const int shmem = shmem_x + GGML_PAD(shmem_y, nwarps*WARP_SIZE*sizeof(int)); #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) static bool shmem_limit_raised[GGML_CUDA_MAX_DEVICES] = {false}; @@ -1198,11 +1220,11 @@ static void launch_mul_mat_q(const mmq_args & args, cudaStream_t stream) { if (args.ne01 % mmq_y == 0) { const bool need_check = false; mul_mat_q<<>> - (args.x, args.y, args.dst, args.ne00, args.ne01, args.stride00, args.ne10, args.ne11, args.ne0); + (args.x, args.y, args.dst, args.ne00, args.ne01, args.stride01, args.ne10, args.ne11, args.stride11, args.ne0); } else { const bool need_check = true; mul_mat_q<<>> - (args.x, args.y, args.dst, args.ne00, args.ne01, args.stride00, args.ne10, args.ne11, args.ne0); + (args.x, args.y, args.dst, args.ne00, args.ne01, args.stride01, args.ne10, args.ne11, args.stride11, args.ne0); } } diff --git a/llama/ggml-cuda/mmvq.cuh b/llama/ggml-cuda/mmvq.cuh index 88c42c4b..46785d5a 100644 --- a/llama/ggml-cuda/mmvq.cuh +++ b/llama/ggml-cuda/mmvq.cuh @@ -1,7 +1,7 @@ -#include "common.cuh" - -void ggml_cuda_op_mul_mat_vec_q( - ggml_backend_cuda_context & ctx, - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, - const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, - const int64_t src1_padded_row_size, cudaStream_t stream); +#include "common.cuh" + +void ggml_cuda_op_mul_mat_vec_q( + ggml_backend_cuda_context & ctx, + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, + const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, + const int64_t src1_padded_row_size, cudaStream_t stream); diff --git a/llama/ggml-cuda/norm.cuh b/llama/ggml-cuda/norm.cuh index 431a8f74..61f880b0 100644 --- a/llama/ggml-cuda/norm.cuh +++ b/llama/ggml-cuda/norm.cuh @@ -1,7 +1,7 @@ -#include "common.cuh" - -void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst); - -void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst); - -void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst); +#include "common.cuh" + +void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst); + +void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst); + +void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/pad.cu b/llama/ggml-cuda/pad.cu index aba539e8..a033a8d0 100644 --- a/llama/ggml-cuda/pad.cu +++ b/llama/ggml-cuda/pad.cu @@ -1,49 +1,49 @@ -#include "pad.cuh" - -static __global__ void pad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) { - // blockIdx.z: idx of ne2*ne3, aka ne02*ne03 - // blockIdx.y: idx of ne1 - // blockIDx.x: idx of ne0 / BLOCK_SIZE - int nidx = threadIdx.x + blockIdx.x * blockDim.x; - if (nidx >= ne0) { - return; - } - - // operation - int offset_dst = - nidx + - blockIdx.y * ne0 + - blockIdx.z * ne0 * gridDim.y; - if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) { - int offset_src = - nidx + - blockIdx.y * ne00 + - blockIdx.z * ne00 * ne01; - dst[offset_dst] = x[offset_src]; - } else { - dst[offset_dst] = 0.0f; - } -} - -static void pad_f32_cuda(const float * x, float * dst, - const int ne00, const int ne01, const int ne02, const int ne03, - const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) { - int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE; - dim3 gridDim(num_blocks, ne1, ne2*ne3); - pad_f32<<>>(x, dst, ne0, ne00, ne01, ne02, ne03); -} - -void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - const ggml_tensor * src0 = dst->src[0]; - const float * src0_d = (const float *)src0->data; - float * dst_d = (float *)dst->data; - cudaStream_t stream = ctx.stream(); - - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); - GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors - - pad_f32_cuda(src0_d, dst_d, - src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], - dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream); -} +#include "pad.cuh" + +static __global__ void pad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) { + // blockIdx.z: idx of ne2*ne3, aka ne02*ne03 + // blockIdx.y: idx of ne1 + // blockIDx.x: idx of ne0 / BLOCK_SIZE + int nidx = threadIdx.x + blockIdx.x * blockDim.x; + if (nidx >= ne0) { + return; + } + + // operation + int offset_dst = + nidx + + blockIdx.y * ne0 + + blockIdx.z * ne0 * gridDim.y; + if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) { + int offset_src = + nidx + + blockIdx.y * ne00 + + blockIdx.z * ne00 * ne01; + dst[offset_dst] = x[offset_src]; + } else { + dst[offset_dst] = 0.0f; + } +} + +static void pad_f32_cuda(const float * x, float * dst, + const int ne00, const int ne01, const int ne02, const int ne03, + const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) { + int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE; + dim3 gridDim(num_blocks, ne1, ne2*ne3); + pad_f32<<>>(x, dst, ne0, ne00, ne01, ne02, ne03); +} + +void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const float * src0_d = (const float *)src0->data; + float * dst_d = (float *)dst->data; + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors + + pad_f32_cuda(src0_d, dst_d, + src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], + dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream); +} diff --git a/llama/ggml-cuda/pad.cuh b/llama/ggml-cuda/pad.cuh index 8fd386b0..b8ebc809 100644 --- a/llama/ggml-cuda/pad.cuh +++ b/llama/ggml-cuda/pad.cuh @@ -1,5 +1,5 @@ -#include "common.cuh" - -#define CUDA_PAD_BLOCK_SIZE 256 - -void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst); +#include "common.cuh" + +#define CUDA_PAD_BLOCK_SIZE 256 + +void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/pool2d.cu b/llama/ggml-cuda/pool2d.cu index c6d51e4d..2b098207 100644 --- a/llama/ggml-cuda/pool2d.cu +++ b/llama/ggml-cuda/pool2d.cu @@ -1,94 +1,94 @@ -#include "pool2d.cuh" - -template -static __global__ void pool2d_nchw_kernel( - const int ih, const int iw, const int oh, const int ow, - const int kh, const int kw, const int sh, const int sw, - const int ph, const int pw, const int parallel_elements, - const Ti* src, To* dst, const enum ggml_op_pool op) { - int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx >= parallel_elements) { - return; - } - - const int I_HW = ih * iw; - const int O_HW = oh * ow; - const int nc = idx / O_HW; - const int cur_oh = idx % O_HW / ow; - const int cur_ow = idx % O_HW % ow; - const Ti* i_ptr = src + nc * I_HW; - To* o_ptr = dst + nc * O_HW; - const int start_h = cur_oh * sh - ph; - const int bh = max(0, start_h); - const int eh = min(ih, start_h + kh); - const int start_w = cur_ow * sw - pw; - const int bw = max(0, start_w); - const int ew = min(iw, start_w + kw); - const To scale = 1. / (kh * kw); - To res = 0; - - switch (op) { - case GGML_OP_POOL_AVG: res = 0; break; - case GGML_OP_POOL_MAX: res = -FLT_MAX; break; - default: assert(false); - } - - for (int i = bh; i < eh; i += 1) { - for (int j = bw; j < ew; j += 1) { -#if __CUDA_ARCH__ >= 350 - Ti cur = __ldg(i_ptr + i * iw + j); -#else - Ti cur = i_ptr[i * iw + j]; -#endif - switch (op) { - case GGML_OP_POOL_AVG: res += cur * scale; break; - case GGML_OP_POOL_MAX: res = max(res, (To)cur); break; - default: assert(false); - } - } - } - o_ptr[cur_oh * ow + cur_ow] = res; -} - -static void pool2d_nchw_kernel_f32_f32_cuda( - const int ih, const int iw, const int oh, const int ow, - const int kh, const int kw, const int sh, const int sw, - const int ph, const int pw, const int parallel_elements, - const float * src, float * dst, const enum ggml_op_pool op, - cudaStream_t stream) { - - const int num_blocks = (parallel_elements + CUDA_POOL2D_BLOCK_SIZE - 1) / CUDA_POOL2D_BLOCK_SIZE; - dim3 block_nums(num_blocks); - pool2d_nchw_kernel<<>>(ih, iw, oh, ow, kh, kw, sh, sw, ph, pw, parallel_elements, src, dst, op); -} - -void ggml_cuda_op_pool2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - const ggml_tensor * src0 = dst->src[0]; - const float * src0_d = (const float *)src0->data; - float * dst_d = (float *)dst->data; - cudaStream_t stream = ctx.stream(); - - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - const int32_t * opts = (const int32_t *)dst->op_params; - enum ggml_op_pool op = static_cast(opts[0]); - const int k0 = opts[1]; - const int k1 = opts[2]; - const int s0 = opts[3]; - const int s1 = opts[4]; - const int p0 = opts[5]; - const int p1 = opts[6]; - - const int64_t IH = src0->ne[1]; - const int64_t IW = src0->ne[0]; - - const int64_t N = dst->ne[3]; - const int64_t OC = dst->ne[2]; - const int64_t OH = dst->ne[1]; - const int64_t OW = dst->ne[0]; - - const int parallel_elements = N * OC * OH * OW; - - pool2d_nchw_kernel_f32_f32_cuda(IH, IW, OH, OW, k1, k0, s1, s0, p1, p0, parallel_elements, src0_d, dst_d, op, stream); -} +#include "pool2d.cuh" + +template +static __global__ void pool2d_nchw_kernel( + const int ih, const int iw, const int oh, const int ow, + const int kh, const int kw, const int sh, const int sw, + const int ph, const int pw, const int parallel_elements, + const Ti* src, To* dst, const enum ggml_op_pool op) { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx >= parallel_elements) { + return; + } + + const int I_HW = ih * iw; + const int O_HW = oh * ow; + const int nc = idx / O_HW; + const int cur_oh = idx % O_HW / ow; + const int cur_ow = idx % O_HW % ow; + const Ti* i_ptr = src + nc * I_HW; + To* o_ptr = dst + nc * O_HW; + const int start_h = cur_oh * sh - ph; + const int bh = max(0, start_h); + const int eh = min(ih, start_h + kh); + const int start_w = cur_ow * sw - pw; + const int bw = max(0, start_w); + const int ew = min(iw, start_w + kw); + const To scale = 1. / (kh * kw); + To res = 0; + + switch (op) { + case GGML_OP_POOL_AVG: res = 0; break; + case GGML_OP_POOL_MAX: res = -FLT_MAX; break; + default: assert(false); + } + + for (int i = bh; i < eh; i += 1) { + for (int j = bw; j < ew; j += 1) { +#if __CUDA_ARCH__ >= 350 + Ti cur = __ldg(i_ptr + i * iw + j); +#else + Ti cur = i_ptr[i * iw + j]; +#endif + switch (op) { + case GGML_OP_POOL_AVG: res += cur * scale; break; + case GGML_OP_POOL_MAX: res = max(res, (To)cur); break; + default: assert(false); + } + } + } + o_ptr[cur_oh * ow + cur_ow] = res; +} + +static void pool2d_nchw_kernel_f32_f32_cuda( + const int ih, const int iw, const int oh, const int ow, + const int kh, const int kw, const int sh, const int sw, + const int ph, const int pw, const int parallel_elements, + const float * src, float * dst, const enum ggml_op_pool op, + cudaStream_t stream) { + + const int num_blocks = (parallel_elements + CUDA_POOL2D_BLOCK_SIZE - 1) / CUDA_POOL2D_BLOCK_SIZE; + dim3 block_nums(num_blocks); + pool2d_nchw_kernel<<>>(ih, iw, oh, ow, kh, kw, sh, sw, ph, pw, parallel_elements, src, dst, op); +} + +void ggml_cuda_op_pool2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const float * src0_d = (const float *)src0->data; + float * dst_d = (float *)dst->data; + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + const int32_t * opts = (const int32_t *)dst->op_params; + enum ggml_op_pool op = static_cast(opts[0]); + const int k0 = opts[1]; + const int k1 = opts[2]; + const int s0 = opts[3]; + const int s1 = opts[4]; + const int p0 = opts[5]; + const int p1 = opts[6]; + + const int64_t IH = src0->ne[1]; + const int64_t IW = src0->ne[0]; + + const int64_t N = dst->ne[3]; + const int64_t OC = dst->ne[2]; + const int64_t OH = dst->ne[1]; + const int64_t OW = dst->ne[0]; + + const int parallel_elements = N * OC * OH * OW; + + pool2d_nchw_kernel_f32_f32_cuda(IH, IW, OH, OW, k1, k0, s1, s0, p1, p0, parallel_elements, src0_d, dst_d, op, stream); +} diff --git a/llama/ggml-cuda/pool2d.cuh b/llama/ggml-cuda/pool2d.cuh index 7841292b..de65151b 100644 --- a/llama/ggml-cuda/pool2d.cuh +++ b/llama/ggml-cuda/pool2d.cuh @@ -1,5 +1,5 @@ -#include "common.cuh" - -#define CUDA_POOL2D_BLOCK_SIZE 256 - -void ggml_cuda_op_pool2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst); +#include "common.cuh" + +#define CUDA_POOL2D_BLOCK_SIZE 256 + +void ggml_cuda_op_pool2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/quantize.cu b/llama/ggml-cuda/quantize.cu index 7578c4b6..b4678682 100644 --- a/llama/ggml-cuda/quantize.cu +++ b/llama/ggml-cuda/quantize.cu @@ -1,22 +1,23 @@ #include "quantize.cuh" +#include -static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int64_t kx, const int64_t kx_padded) { - const int64_t ix = (int64_t)blockDim.x*blockIdx.x + threadIdx.x; +static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int64_t kx, const int64_t kx0_padded) { + const int64_t ix0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x; - if (ix >= kx_padded) { + if (ix0 >= kx0_padded) { return; } - const int64_t iy = (int64_t)blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ix1 = blockIdx.y; - const int64_t i_padded = (int64_t)iy*kx_padded + ix; + const int64_t i_padded = ix1*kx0_padded + ix0; block_q8_1 * y = (block_q8_1 *) vy; const int64_t ib = i_padded / QK8_1; // block index const int64_t iqs = i_padded % QK8_1; // quant index - const float xi = ix < kx ? x[iy*kx + ix] : 0.0f; + const float xi = ix0 < kx ? x[ix1*kx + ix0] : 0.0f; float amax = fabsf(xi); float sum = xi; @@ -36,10 +37,76 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest reinterpret_cast(y[ib].ds.y) = sum; } -void quantize_row_q8_1_cuda(const float * x, void * vy, const int64_t kx, const int64_t ky, const int64_t kx_padded, cudaStream_t stream) { - const int64_t block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE; - const dim3 num_blocks(block_num_x, ky, 1); - const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1); - quantize_q8_1<<>>(x, vy, kx, kx_padded); +template +static __global__ void quantize_mmq_q8_1( + const float * __restrict__ x, void * __restrict__ vy, const int64_t kx0, const int64_t kx1, const int64_t kx0_padded) { + + const int64_t ix0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x; + + if (ix0 >= kx0_padded) { + return; + } + + const int64_t ix1 = kx1*blockIdx.z + blockIdx.y; + + block_q8_1_mmq * y = (block_q8_1_mmq *) vy; + + const int64_t ib0 = blockIdx.z*(gridDim.y*gridDim.x*blockDim.x/(4*QK8_1)); // first block of channel + const int64_t ib = ib0 + (ix0 / (4*QK8_1))*kx1 + blockIdx.y; // block index in channel + const int64_t iqs = ix0 % (4*QK8_1); // quant index in block + + const float xi = ix0 < kx0 ? x[ix1*kx0 + ix0] : 0.0f; + float amax = fabsf(xi); + + amax = warp_reduce_max(amax); + + float sum; + if (need_sum) { + sum = warp_reduce_sum(xi); + } + + const float d = amax / 127; + const int8_t q = amax == 0.0f ? 0 : roundf(xi / d); + + y[ib].qs[iqs] = q; + + if (iqs % QK8_1 != 0) { + return; + } + + if (need_sum) { + y[ib].ds[iqs/QK8_1] = make_half2(d, sum); + } else { + ((float *) y[ib].ds)[iqs/QK8_1] = d; + } } +void quantize_row_q8_1_cuda( + const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, + const int64_t kx0_padded, const ggml_type type_x, cudaStream_t stream) { + + GGML_ASSERT(kx0_padded % QK8_1 == 0); + + const int64_t block_num_x = (kx0_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE; + const dim3 num_blocks(block_num_x, kx1*channels, 1); + const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1); + quantize_q8_1<<>>(x, vy, kx0, kx0_padded); + + GGML_UNUSED(type_x); +} + +void quantize_mmq_q8_1_cuda( + const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, + const int64_t kx0_padded, const ggml_type type_x, cudaStream_t stream) { + + GGML_ASSERT(kx0_padded % (4*QK8_1) == 0); + + const int64_t block_num_x = (kx0_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE; + const dim3 num_blocks(block_num_x, kx1, channels); + const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1); + if (mmq_need_sum(type_x)) { + quantize_mmq_q8_1<<>>(x, vy, kx0, kx1, kx0_padded); + } else { + quantize_mmq_q8_1<<>>(x, vy, kx0, kx1, kx0_padded); + } +} diff --git a/llama/ggml-cuda/quantize.cuh b/llama/ggml-cuda/quantize.cuh index b37a4752..486c9360 100644 --- a/llama/ggml-cuda/quantize.cuh +++ b/llama/ggml-cuda/quantize.cuh @@ -1,5 +1,20 @@ +#pragma once + #include "common.cuh" +#include "mmq.cuh" + +#include #define CUDA_QUANTIZE_BLOCK_SIZE 256 -void quantize_row_q8_1_cuda(const float * x, void * vy, const int64_t kx, const int64_t ky, const int64_t kx_padded, cudaStream_t stream); +typedef void (*quantize_cuda_t)( + const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded, + const ggml_type type_x, cudaStream_t stream); + +void quantize_row_q8_1_cuda( + const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded, + const ggml_type type_x, cudaStream_t stream); + +void quantize_mmq_q8_1_cuda( + const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded, + const ggml_type type_x, cudaStream_t stream); diff --git a/llama/ggml-cuda/rope.cuh b/llama/ggml-cuda/rope.cuh index 0f787a0b..c2b29a7d 100644 --- a/llama/ggml-cuda/rope.cuh +++ b/llama/ggml-cuda/rope.cuh @@ -1,5 +1,5 @@ -#include "common.cuh" - -#define CUDA_ROPE_BLOCK_SIZE 256 - -void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst); +#include "common.cuh" + +#define CUDA_ROPE_BLOCK_SIZE 256 + +void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/scale.cuh b/llama/ggml-cuda/scale.cuh index 8ff75c82..f35503f7 100644 --- a/llama/ggml-cuda/scale.cuh +++ b/llama/ggml-cuda/scale.cuh @@ -1,5 +1,5 @@ -#include "common.cuh" - -#define CUDA_SCALE_BLOCK_SIZE 256 - -void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst); +#include "common.cuh" + +#define CUDA_SCALE_BLOCK_SIZE 256 + +void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/softmax.cuh b/llama/ggml-cuda/softmax.cuh index 4ef4ff86..5f6564af 100644 --- a/llama/ggml-cuda/softmax.cuh +++ b/llama/ggml-cuda/softmax.cuh @@ -1,5 +1,5 @@ -#include "common.cuh" - -#define CUDA_SOFT_MAX_BLOCK_SIZE 1024 - -void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst); +#include "common.cuh" + +#define CUDA_SOFT_MAX_BLOCK_SIZE 1024 + +void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/sumrows.cu b/llama/ggml-cuda/sumrows.cu index 82e8e875..0422821d 100644 --- a/llama/ggml-cuda/sumrows.cu +++ b/llama/ggml-cuda/sumrows.cu @@ -1,40 +1,40 @@ -#include "sumrows.cuh" - -static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) { - const int row = blockIdx.x; - const int col = threadIdx.x; - - float sum = 0.0f; - for (int i = col; i < ncols; i += blockDim.x) { - sum += x[row * ncols + i]; - } - - sum = warp_reduce_sum(sum); - - if (col == 0) { - dst[row] = sum; - } -} - -static void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - const dim3 block_dims(WARP_SIZE, 1, 1); - const dim3 block_nums(nrows, 1, 1); - k_sum_rows_f32<<>>(x, dst, ncols); -} - -void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - const ggml_tensor * src0 = dst->src[0]; - const float * src0_d = (const float *)src0->data; - float * dst_d = (float *)dst->data; - cudaStream_t stream = ctx.stream(); - - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - GGML_ASSERT(ggml_is_contiguous(src0)); - - - const int64_t ncols = src0->ne[0]; - const int64_t nrows = ggml_nrows(src0); - - sum_rows_f32_cuda(src0_d, dst_d, ncols, nrows, stream); -} +#include "sumrows.cuh" + +static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) { + const int row = blockIdx.x; + const int col = threadIdx.x; + + float sum = 0.0f; + for (int i = col; i < ncols; i += blockDim.x) { + sum += x[row * ncols + i]; + } + + sum = warp_reduce_sum(sum); + + if (col == 0) { + dst[row] = sum; + } +} + +static void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + const dim3 block_dims(WARP_SIZE, 1, 1); + const dim3 block_nums(nrows, 1, 1); + k_sum_rows_f32<<>>(x, dst, ncols); +} + +void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const float * src0_d = (const float *)src0->data; + float * dst_d = (float *)dst->data; + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + GGML_ASSERT(ggml_is_contiguous(src0)); + + + const int64_t ncols = src0->ne[0]; + const int64_t nrows = ggml_nrows(src0); + + sum_rows_f32_cuda(src0_d, dst_d, ncols, nrows, stream); +} diff --git a/llama/ggml-cuda/sumrows.cuh b/llama/ggml-cuda/sumrows.cuh index e7545f83..7558b493 100644 --- a/llama/ggml-cuda/sumrows.cuh +++ b/llama/ggml-cuda/sumrows.cuh @@ -1,3 +1,3 @@ -#include "common.cuh" - -void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst); +#include "common.cuh" + +void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu new file mode 100644 index 00000000..6696a238 --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu new file mode 100644 index 00000000..dd070db2 --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu new file mode 100644 index 00000000..54dcde6f --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_1); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu new file mode 100644 index 00000000..4ec22f79 --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu new file mode 100644 index 00000000..3c15bf7f --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_1); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu new file mode 100644 index 00000000..7e61b5fd --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q8_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu new file mode 100644 index 00000000..fdb15b58 --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_F16); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu new file mode 100644 index 00000000..0f7c417d --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu new file mode 100644 index 00000000..851f33c4 --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu new file mode 100644 index 00000000..763809cb --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu new file mode 100644 index 00000000..f2a276e5 --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu new file mode 100644 index 00000000..cb227f6f --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu new file mode 100644 index 00000000..97ac0520 --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_F16); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu new file mode 100644 index 00000000..c772b426 --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu new file mode 100644 index 00000000..5cb74308 --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu new file mode 100644 index 00000000..98a709d1 --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu new file mode 100644 index 00000000..4f2f947a --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu new file mode 100644 index 00000000..11f96b6f --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu new file mode 100644 index 00000000..b39bdc06 --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu new file mode 100644 index 00000000..bbd6a2c7 --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu new file mode 100644 index 00000000..9d84ff2b --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu new file mode 100644 index 00000000..bc8a5bff --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu new file mode 100644 index 00000000..a679100c --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu new file mode 100644 index 00000000..8f21bccf --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q8_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu new file mode 100644 index 00000000..858b00fd --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_F16); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu new file mode 100644 index 00000000..0fc8011f --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu new file mode 100644 index 00000000..261fdf62 --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_1); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu new file mode 100644 index 00000000..0fb82473 --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu new file mode 100644 index 00000000..a9d9d089 --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_1); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu new file mode 100644 index 00000000..7d7b2792 --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu new file mode 100644 index 00000000..a092ee2d --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu new file mode 100644 index 00000000..db55927a --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu new file mode 100644 index 00000000..c3c21cef --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_1); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu new file mode 100644 index 00000000..35dd9f52 --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu new file mode 100644 index 00000000..050c22ac --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu new file mode 100644 index 00000000..de4866c5 --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu new file mode 100644 index 00000000..57a10bc4 --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu new file mode 100644 index 00000000..e0f08b46 --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_F16); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu new file mode 100644 index 00000000..1c8e8a46 --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q4_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu new file mode 100644 index 00000000..cefed83f --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q4_1); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu new file mode 100644 index 00000000..aede6e35 --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q5_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu new file mode 100644 index 00000000..1a1a92c7 --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q5_1); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu new file mode 100644 index 00000000..ad667473 --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q8_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu new file mode 100644 index 00000000..c499f455 --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu new file mode 100644 index 00000000..8286ebf3 --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu new file mode 100644 index 00000000..45878688 --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_1); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu new file mode 100644 index 00000000..d89103ce --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu new file mode 100644 index 00000000..bb75fd42 --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_1); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu new file mode 100644 index 00000000..b1629817 --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q8_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu new file mode 100644 index 00000000..d8657604 --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_F16); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu new file mode 100644 index 00000000..2e5bd2f1 --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu new file mode 100644 index 00000000..be5f302d --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu new file mode 100644 index 00000000..8dd91cd7 --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu new file mode 100644 index 00000000..4cb79150 --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu new file mode 100644 index 00000000..09dea426 --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu new file mode 100644 index 00000000..0fbb6076 --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_F16); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu new file mode 100644 index 00000000..2aeab83b --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu new file mode 100644 index 00000000..599415b4 --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu new file mode 100644 index 00000000..e4f8e308 --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu new file mode 100644 index 00000000..34d16652 --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu new file mode 100644 index 00000000..4bebef45 --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu new file mode 100644 index 00000000..326468da --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu new file mode 100644 index 00000000..511b58f4 --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu new file mode 100644 index 00000000..d9906d14 --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu new file mode 100644 index 00000000..f61c183a --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu new file mode 100644 index 00000000..c10450fd --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu new file mode 100644 index 00000000..2d5cb195 --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q8_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu new file mode 100644 index 00000000..b384f34d --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_F16); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu new file mode 100644 index 00000000..446e293b --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu new file mode 100644 index 00000000..6f430298 --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_1); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu new file mode 100644 index 00000000..1cd8ba88 --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu new file mode 100644 index 00000000..1ee2eab6 --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_1); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu new file mode 100644 index 00000000..2bc77816 --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu new file mode 100644 index 00000000..d55ced08 --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu new file mode 100644 index 00000000..8361e99c --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu new file mode 100644 index 00000000..7507a67c --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_1); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu new file mode 100644 index 00000000..61f050b2 --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu new file mode 100644 index 00000000..d4a49d9c --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu new file mode 100644 index 00000000..d1462789 --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu new file mode 100644 index 00000000..e73f917a --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu new file mode 100644 index 00000000..d40825df --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(64, GGML_TYPE_F16, GGML_TYPE_F16); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu new file mode 100644 index 00000000..b5c6869f --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q4_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu new file mode 100644 index 00000000..4e21b0cc --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q4_1); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu new file mode 100644 index 00000000..2eac321b --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q5_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu new file mode 100644 index 00000000..f7d2c3b4 --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q5_1); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu new file mode 100644 index 00000000..a013f400 --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q8_0); diff --git a/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu b/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu new file mode 100644 index 00000000..2d94e65c --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu @@ -0,0 +1,10 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-wmma-f16.cuh" + +DECL_FATTN_WMMA_F16_CASE(64, 16, float); +DECL_FATTN_WMMA_F16_CASE(80, 16, float); +DECL_FATTN_WMMA_F16_CASE(96, 16, float); +DECL_FATTN_WMMA_F16_CASE(112, 16, float); +DECL_FATTN_WMMA_F16_CASE(128, 16, float); +DECL_FATTN_WMMA_F16_CASE(256, 16, float); diff --git a/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu b/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu new file mode 100644 index 00000000..c3d9df3c --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu @@ -0,0 +1,9 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-wmma-f16.cuh" + +DECL_FATTN_WMMA_F16_CASE(64, 32, float); +DECL_FATTN_WMMA_F16_CASE(80, 32, float); +DECL_FATTN_WMMA_F16_CASE(96, 32, float); +DECL_FATTN_WMMA_F16_CASE(112, 32, float); +DECL_FATTN_WMMA_F16_CASE(128, 32, float); diff --git a/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu b/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu new file mode 100644 index 00000000..bb680e40 --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu @@ -0,0 +1,10 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-wmma-f16.cuh" + +DECL_FATTN_WMMA_F16_CASE(64, 16, half); +DECL_FATTN_WMMA_F16_CASE(80, 16, half); +DECL_FATTN_WMMA_F16_CASE(96, 16, half); +DECL_FATTN_WMMA_F16_CASE(112, 16, half); +DECL_FATTN_WMMA_F16_CASE(128, 16, half); +DECL_FATTN_WMMA_F16_CASE(256, 16, half); diff --git a/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu b/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu new file mode 100644 index 00000000..073f71b1 --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu @@ -0,0 +1,10 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-wmma-f16.cuh" + +DECL_FATTN_WMMA_F16_CASE(64, 32, half); +DECL_FATTN_WMMA_F16_CASE(80, 32, half); +DECL_FATTN_WMMA_F16_CASE(96, 32, half); +DECL_FATTN_WMMA_F16_CASE(112, 32, half); +DECL_FATTN_WMMA_F16_CASE(128, 32, half); +DECL_FATTN_WMMA_F16_CASE(256, 32, half); diff --git a/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu b/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu new file mode 100644 index 00000000..d30710c5 --- /dev/null +++ b/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu @@ -0,0 +1,8 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-wmma-f16.cuh" + +DECL_FATTN_WMMA_F16_CASE(64, 8, half); +DECL_FATTN_WMMA_F16_CASE(96, 8, half); +DECL_FATTN_WMMA_F16_CASE(128, 8, half); +DECL_FATTN_WMMA_F16_CASE(256, 8, half); diff --git a/llama/ggml-cuda/template-instances/mmq-instance-q2_k.cu b/llama/ggml-cuda/template-instances/mmq-instance-q2_k.cu new file mode 100644 index 00000000..6415369d --- /dev/null +++ b/llama/ggml-cuda/template-instances/mmq-instance-q2_k.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../mmq.cuh" + +DECL_MMQ_CASE(GGML_TYPE_Q2_K); diff --git a/llama/ggml-cuda/template-instances/mmq-instance-q3_k.cu b/llama/ggml-cuda/template-instances/mmq-instance-q3_k.cu new file mode 100644 index 00000000..ffb6213a --- /dev/null +++ b/llama/ggml-cuda/template-instances/mmq-instance-q3_k.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../mmq.cuh" + +DECL_MMQ_CASE(GGML_TYPE_Q3_K); diff --git a/llama/ggml-cuda/template-instances/mmq-instance-q4_0.cu b/llama/ggml-cuda/template-instances/mmq-instance-q4_0.cu new file mode 100644 index 00000000..0c0b0c8a --- /dev/null +++ b/llama/ggml-cuda/template-instances/mmq-instance-q4_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../mmq.cuh" + +DECL_MMQ_CASE(GGML_TYPE_Q4_0); diff --git a/llama/ggml-cuda/template-instances/mmq-instance-q4_1.cu b/llama/ggml-cuda/template-instances/mmq-instance-q4_1.cu new file mode 100644 index 00000000..ee67f694 --- /dev/null +++ b/llama/ggml-cuda/template-instances/mmq-instance-q4_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../mmq.cuh" + +DECL_MMQ_CASE(GGML_TYPE_Q4_1); diff --git a/llama/ggml-cuda/template-instances/mmq-instance-q4_k.cu b/llama/ggml-cuda/template-instances/mmq-instance-q4_k.cu new file mode 100644 index 00000000..9eeb3cd7 --- /dev/null +++ b/llama/ggml-cuda/template-instances/mmq-instance-q4_k.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../mmq.cuh" + +DECL_MMQ_CASE(GGML_TYPE_Q4_K); diff --git a/llama/ggml-cuda/template-instances/mmq-instance-q5_0.cu b/llama/ggml-cuda/template-instances/mmq-instance-q5_0.cu new file mode 100644 index 00000000..cc57fb97 --- /dev/null +++ b/llama/ggml-cuda/template-instances/mmq-instance-q5_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../mmq.cuh" + +DECL_MMQ_CASE(GGML_TYPE_Q5_0); diff --git a/llama/ggml-cuda/template-instances/mmq-instance-q5_1.cu b/llama/ggml-cuda/template-instances/mmq-instance-q5_1.cu new file mode 100644 index 00000000..721ac790 --- /dev/null +++ b/llama/ggml-cuda/template-instances/mmq-instance-q5_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../mmq.cuh" + +DECL_MMQ_CASE(GGML_TYPE_Q5_1); diff --git a/llama/ggml-cuda/template-instances/mmq-instance-q5_k.cu b/llama/ggml-cuda/template-instances/mmq-instance-q5_k.cu new file mode 100644 index 00000000..a2e90ffd --- /dev/null +++ b/llama/ggml-cuda/template-instances/mmq-instance-q5_k.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../mmq.cuh" + +DECL_MMQ_CASE(GGML_TYPE_Q5_K); diff --git a/llama/ggml-cuda/template-instances/mmq-instance-q6_k.cu b/llama/ggml-cuda/template-instances/mmq-instance-q6_k.cu new file mode 100644 index 00000000..470938fe --- /dev/null +++ b/llama/ggml-cuda/template-instances/mmq-instance-q6_k.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../mmq.cuh" + +DECL_MMQ_CASE(GGML_TYPE_Q6_K); diff --git a/llama/ggml-cuda/template-instances/mmq-instance-q8_0.cu b/llama/ggml-cuda/template-instances/mmq-instance-q8_0.cu new file mode 100644 index 00000000..974477bb --- /dev/null +++ b/llama/ggml-cuda/template-instances/mmq-instance-q8_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../mmq.cuh" + +DECL_MMQ_CASE(GGML_TYPE_Q8_0); diff --git a/llama/ggml-cuda/tsembd.cu b/llama/ggml-cuda/tsembd.cu index 153ddbcd..3fca3327 100644 --- a/llama/ggml-cuda/tsembd.cu +++ b/llama/ggml-cuda/tsembd.cu @@ -1,47 +1,47 @@ -#include "tsembd.cuh" - -static __global__ void timestep_embedding_f32(const float * timesteps, float * dst, const int nb1, const int dim, const int max_period) { - // blockIDx.y: idx of timesteps->ne[0] - // blockIDx.x: idx of ((dim + 1) / 2) / BLOCK_SIZE - int i = blockIdx.y; - int j = threadIdx.x + blockIdx.x * blockDim.x; - float * embed_data = (float *)((char *)dst + i*nb1); - - if (dim % 2 != 0 && j == ((dim + 1) / 2)) { - embed_data[dim] = 0.f; - } - - int half = dim / 2; - if (j >= half) { - return; - } - - float timestep = timesteps[i]; - float freq = (float)expf(-logf(max_period) * j / half); - float arg = timestep * freq; - embed_data[j] = cosf(arg); - embed_data[j + half] = sinf(arg); -} - -static void timestep_embedding_f32_cuda(const float * x, float * dst, const int ne00, const int nb1, - const int dim, const int max_period, cudaStream_t stream) { - int half_ceil = (dim + 1) / 2; - int num_blocks = (half_ceil + CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE - 1) / CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE; - dim3 gridDim(num_blocks, ne00, 1); - timestep_embedding_f32<<>>(x, dst, nb1, dim, max_period); -} - -void ggml_cuda_op_timestep_embedding(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - const ggml_tensor * src0 = dst->src[0]; - const float * src0_d = (const float *)src0->data; - float * dst_d = (float *)dst->data; - cudaStream_t stream = ctx.stream(); - - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); - - const int dim = dst->op_params[0]; - const int max_period = dst->op_params[1]; - - timestep_embedding_f32_cuda(src0_d, dst_d, src0->ne[0], dst->nb[1], dim, max_period, stream); -} +#include "tsembd.cuh" + +static __global__ void timestep_embedding_f32(const float * timesteps, float * dst, const int nb1, const int dim, const int max_period) { + // blockIDx.y: idx of timesteps->ne[0] + // blockIDx.x: idx of ((dim + 1) / 2) / BLOCK_SIZE + int i = blockIdx.y; + int j = threadIdx.x + blockIdx.x * blockDim.x; + float * embed_data = (float *)((char *)dst + i*nb1); + + if (dim % 2 != 0 && j == ((dim + 1) / 2)) { + embed_data[dim] = 0.f; + } + + int half = dim / 2; + if (j >= half) { + return; + } + + float timestep = timesteps[i]; + float freq = (float)expf(-logf(max_period) * j / half); + float arg = timestep * freq; + embed_data[j] = cosf(arg); + embed_data[j + half] = sinf(arg); +} + +static void timestep_embedding_f32_cuda(const float * x, float * dst, const int ne00, const int nb1, + const int dim, const int max_period, cudaStream_t stream) { + int half_ceil = (dim + 1) / 2; + int num_blocks = (half_ceil + CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE - 1) / CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE; + dim3 gridDim(num_blocks, ne00, 1); + timestep_embedding_f32<<>>(x, dst, nb1, dim, max_period); +} + +void ggml_cuda_op_timestep_embedding(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const float * src0_d = (const float *)src0->data; + float * dst_d = (float *)dst->data; + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + const int dim = dst->op_params[0]; + const int max_period = dst->op_params[1]; + + timestep_embedding_f32_cuda(src0_d, dst_d, src0->ne[0], dst->nb[1], dim, max_period, stream); +} diff --git a/llama/ggml-cuda/tsembd.cuh b/llama/ggml-cuda/tsembd.cuh index 84340e3d..2313b21b 100644 --- a/llama/ggml-cuda/tsembd.cuh +++ b/llama/ggml-cuda/tsembd.cuh @@ -1,5 +1,5 @@ -#include "common.cuh" - -#define CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE 256 - -void ggml_cuda_op_timestep_embedding(ggml_backend_cuda_context & ctx, ggml_tensor * dst); +#include "common.cuh" + +#define CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE 256 + +void ggml_cuda_op_timestep_embedding(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/upscale.cuh b/llama/ggml-cuda/upscale.cuh index d4d76523..651031dc 100644 --- a/llama/ggml-cuda/upscale.cuh +++ b/llama/ggml-cuda/upscale.cuh @@ -1,5 +1,5 @@ -#include "common.cuh" - -#define CUDA_UPSCALE_BLOCK_SIZE 256 - -void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst); +#include "common.cuh" + +#define CUDA_UPSCALE_BLOCK_SIZE 256 + +void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-impl.h b/llama/ggml-impl.h index 557c828d..94ac3821 100644 --- a/llama/ggml-impl.h +++ b/llama/ggml-impl.h @@ -1,5 +1,5 @@ /** - * llama.cpp - git d5c938cd7716b9a2ace49a43a469dfbffcff4d28 + * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c * * MIT License * diff --git a/llama/ggml-metal-darwin_arm64.m b/llama/ggml-metal-darwin_arm64.m index cd151a17..5285bff3 100644 --- a/llama/ggml-metal-darwin_arm64.m +++ b/llama/ggml-metal-darwin_arm64.m @@ -1,5 +1,5 @@ /** - * llama.cpp - git d5c938cd7716b9a2ace49a43a469dfbffcff4d28 + * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c * * MIT License * diff --git a/llama/ggml-metal.h b/llama/ggml-metal.h index c117f051..51545e42 100644 --- a/llama/ggml-metal.h +++ b/llama/ggml-metal.h @@ -1,5 +1,5 @@ /** - * llama.cpp - git d5c938cd7716b9a2ace49a43a469dfbffcff4d28 + * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c * * MIT License * diff --git a/llama/ggml-metal.metal b/llama/ggml-metal.metal index 88b8aece..f7e24876 100644 --- a/llama/ggml-metal.metal +++ b/llama/ggml-metal.metal @@ -1,5 +1,5 @@ /** - * llama.cpp - git d5c938cd7716b9a2ace49a43a469dfbffcff4d28 + * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c * * MIT License * diff --git a/llama/ggml-metal.o b/llama/ggml-metal.o deleted file mode 100644 index 70b219df..00000000 Binary files a/llama/ggml-metal.o and /dev/null differ diff --git a/llama/ggml-quants.c b/llama/ggml-quants.c index 0eda1ffc..dde7f1d1 100644 --- a/llama/ggml-quants.c +++ b/llama/ggml-quants.c @@ -1,5 +1,5 @@ /** - * llama.cpp - git d5c938cd7716b9a2ace49a43a469dfbffcff4d28 + * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c * * MIT License * diff --git a/llama/ggml-quants.h b/llama/ggml-quants.h index f6d07143..16ad861b 100644 --- a/llama/ggml-quants.h +++ b/llama/ggml-quants.h @@ -1,5 +1,5 @@ /** - * llama.cpp - git d5c938cd7716b9a2ace49a43a469dfbffcff4d28 + * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c * * MIT License * @@ -24,136 +24,136 @@ * SOFTWARE. */ -#pragma once - -#define GGML_COMMON_DECL_C -#include "ggml-common.h" - -#include "ggml.h" - -// GGML internal header - -#ifdef __cplusplus -extern "C" { -#endif - -// Quantization -void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k); -void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k); -void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k); -void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k); -void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k); -void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k); - -void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k); -void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k); -void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k); -void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k); -void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k); -void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k); - -void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k); -void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k); -void quantize_row_iq4_xs_reference (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k); -void quantize_row_iq3_s_reference (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k); -void quantize_row_iq2_s_reference (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k); - -void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); - -void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); - -void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -void quantize_row_iq3_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -void quantize_row_iq2_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); - -// Dequantization -void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -//void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); - -void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); - -void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -void dequantize_row_iq1_m (const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -void dequantize_row_iq4_xs (const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); - -// Dot product -void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); - -void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); - -void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_iq2_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); - -// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization") -size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_iq2_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_iq1_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_iq1_m (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); - -size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); - -void iq2xs_init_impl(enum ggml_type type); -void iq2xs_free_impl(enum ggml_type type); -void iq3xs_init_impl(int grid_size); -void iq3xs_free_impl(int grid_size); - -#ifdef __cplusplus -} -#endif - +#pragma once + +#define GGML_COMMON_DECL_C +#include "ggml-common.h" + +#include "ggml.h" + +// GGML internal header + +#ifdef __cplusplus +extern "C" { +#endif + +// Quantization +void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k); +void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k); +void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k); +void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k); +void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k); +void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k); + +void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k); +void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k); +void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k); +void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k); +void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k); +void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k); + +void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k); +void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k); +void quantize_row_iq4_xs_reference (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k); +void quantize_row_iq3_s_reference (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k); +void quantize_row_iq2_s_reference (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k); + +void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); + +void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); + +void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +void quantize_row_iq3_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +void quantize_row_iq2_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); + +// Dequantization +void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +//void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); + +void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); + +void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +void dequantize_row_iq1_m (const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +void dequantize_row_iq4_xs (const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); + +// Dot product +void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); + +void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); + +void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_iq2_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); + +// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization") +size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_iq2_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_iq1_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_iq1_m (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); + +size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); + +void iq2xs_init_impl(enum ggml_type type); +void iq2xs_free_impl(enum ggml_type type); +void iq3xs_init_impl(int grid_size); +void iq3xs_free_impl(int grid_size); + +#ifdef __cplusplus +} +#endif + diff --git a/llama/ggml.c b/llama/ggml.c index 398bf731..790915e8 100644 --- a/llama/ggml.c +++ b/llama/ggml.c @@ -1,5 +1,5 @@ /** - * llama.cpp - git d5c938cd7716b9a2ace49a43a469dfbffcff4d28 + * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c * * MIT License * diff --git a/llama/ggml.h b/llama/ggml.h index 00ef4582..9dd8684e 100644 --- a/llama/ggml.h +++ b/llama/ggml.h @@ -1,5 +1,5 @@ /** - * llama.cpp - git d5c938cd7716b9a2ace49a43a469dfbffcff4d28 + * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c * * MIT License * diff --git a/llama/grammar-parser.cpp b/llama/grammar-parser.cpp index 6e69d073..70da76db 100644 --- a/llama/grammar-parser.cpp +++ b/llama/grammar-parser.cpp @@ -1,5 +1,5 @@ /** - * llama.cpp - git d5c938cd7716b9a2ace49a43a469dfbffcff4d28 + * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c * * MIT License * diff --git a/llama/grammar-parser.h b/llama/grammar-parser.h index bb25744d..c3f07348 100644 --- a/llama/grammar-parser.h +++ b/llama/grammar-parser.h @@ -1,5 +1,5 @@ /** - * llama.cpp - git d5c938cd7716b9a2ace49a43a469dfbffcff4d28 + * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c * * MIT License * @@ -24,32 +24,32 @@ * SOFTWARE. */ -// Implements a parser for an extended Backus-Naur form (BNF), producing the -// binary context-free grammar format specified by llama.h. Supports character -// ranges, grouping, and repetition operators. As an example, a grammar for -// arithmetic might look like: -// -// root ::= expr -// expr ::= term ([-+*/] term)* -// term ::= num | "(" space expr ")" space -// num ::= [0-9]+ space -// space ::= [ \t\n]* - -#pragma once -#include "llama.h" -#include -#include -#include -#include - -namespace grammar_parser { - struct parse_state { - std::map symbol_ids; - std::vector> rules; - - std::vector c_rules(); - }; - - parse_state parse(const char * src); - void print_grammar(FILE * file, const parse_state & state); -} +// Implements a parser for an extended Backus-Naur form (BNF), producing the +// binary context-free grammar format specified by llama.h. Supports character +// ranges, grouping, and repetition operators. As an example, a grammar for +// arithmetic might look like: +// +// root ::= expr +// expr ::= term ([-+*/] term)* +// term ::= num | "(" space expr ")" space +// num ::= [0-9]+ space +// space ::= [ \t\n]* + +#pragma once +#include "llama.h" +#include +#include +#include +#include + +namespace grammar_parser { + struct parse_state { + std::map symbol_ids; + std::vector> rules; + + std::vector c_rules(); + }; + + parse_state parse(const char * src); + void print_grammar(FILE * file, const parse_state & state); +} diff --git a/llama/json-schema-to-grammar.cpp b/llama/json-schema-to-grammar.cpp index 40212c0d..eab7dd1d 100644 --- a/llama/json-schema-to-grammar.cpp +++ b/llama/json-schema-to-grammar.cpp @@ -1,5 +1,5 @@ /** - * llama.cpp - git d5c938cd7716b9a2ace49a43a469dfbffcff4d28 + * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c * * MIT License * diff --git a/llama/json-schema-to-grammar.h b/llama/json-schema-to-grammar.h index 9761e6ce..45f4a14a 100644 --- a/llama/json-schema-to-grammar.h +++ b/llama/json-schema-to-grammar.h @@ -1,5 +1,5 @@ /** - * llama.cpp - git d5c938cd7716b9a2ace49a43a469dfbffcff4d28 + * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c * * MIT License * diff --git a/llama/json.hpp b/llama/json.hpp index a858728c..e020407a 100644 --- a/llama/json.hpp +++ b/llama/json.hpp @@ -1,24766 +1,24766 @@ -// __ _____ _____ _____ -// __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.3 -// |_____|_____|_____|_|___| https://github.com/nlohmann/json -// -// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann -// SPDX-License-Identifier: MIT - -/****************************************************************************\ - * Note on documentation: The source files contain links to the online * - * documentation of the public API at https://json.nlohmann.me. This URL * - * contains the most recent documentation and should also be applicable to * - * previous versions; documentation for deprecated functions is not * - * removed, but marked deprecated. See "Generate documentation" section in * - * file docs/README.md. * -\****************************************************************************/ - -#ifndef INCLUDE_NLOHMANN_JSON_HPP_ -#define INCLUDE_NLOHMANN_JSON_HPP_ - -#include // all_of, find, for_each -#include // nullptr_t, ptrdiff_t, size_t -#include // hash, less -#include // initializer_list -#ifndef JSON_NO_IO - #include // istream, ostream -#endif // JSON_NO_IO -#include // random_access_iterator_tag -#include // unique_ptr -#include // string, stoi, to_string -#include // declval, forward, move, pair, swap -#include // vector - -// #include -// __ _____ _____ _____ -// __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.3 -// |_____|_____|_____|_|___| https://github.com/nlohmann/json -// -// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann -// SPDX-License-Identifier: MIT - - - -#include - -// #include -// __ _____ _____ _____ -// __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.3 -// |_____|_____|_____|_|___| https://github.com/nlohmann/json -// -// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann -// SPDX-License-Identifier: MIT - - - -// This file contains all macro definitions affecting or depending on the ABI - -#ifndef JSON_SKIP_LIBRARY_VERSION_CHECK - #if defined(NLOHMANN_JSON_VERSION_MAJOR) && defined(NLOHMANN_JSON_VERSION_MINOR) && defined(NLOHMANN_JSON_VERSION_PATCH) - #if NLOHMANN_JSON_VERSION_MAJOR != 3 || NLOHMANN_JSON_VERSION_MINOR != 11 || NLOHMANN_JSON_VERSION_PATCH != 3 - #warning "Already included a different version of the library!" - #endif - #endif -#endif - -#define NLOHMANN_JSON_VERSION_MAJOR 3 // NOLINT(modernize-macro-to-enum) -#define NLOHMANN_JSON_VERSION_MINOR 11 // NOLINT(modernize-macro-to-enum) -#define NLOHMANN_JSON_VERSION_PATCH 3 // NOLINT(modernize-macro-to-enum) - -#ifndef JSON_DIAGNOSTICS - #define JSON_DIAGNOSTICS 0 -#endif - -#ifndef JSON_USE_LEGACY_DISCARDED_VALUE_COMPARISON - #define JSON_USE_LEGACY_DISCARDED_VALUE_COMPARISON 0 -#endif - -#if JSON_DIAGNOSTICS - #define NLOHMANN_JSON_ABI_TAG_DIAGNOSTICS _diag -#else - #define NLOHMANN_JSON_ABI_TAG_DIAGNOSTICS -#endif - -#if JSON_USE_LEGACY_DISCARDED_VALUE_COMPARISON - #define NLOHMANN_JSON_ABI_TAG_LEGACY_DISCARDED_VALUE_COMPARISON _ldvcmp -#else - #define NLOHMANN_JSON_ABI_TAG_LEGACY_DISCARDED_VALUE_COMPARISON -#endif - -#ifndef NLOHMANN_JSON_NAMESPACE_NO_VERSION - #define NLOHMANN_JSON_NAMESPACE_NO_VERSION 0 -#endif - -// Construct the namespace ABI tags component -#define NLOHMANN_JSON_ABI_TAGS_CONCAT_EX(a, b) json_abi ## a ## b -#define NLOHMANN_JSON_ABI_TAGS_CONCAT(a, b) \ - NLOHMANN_JSON_ABI_TAGS_CONCAT_EX(a, b) - -#define NLOHMANN_JSON_ABI_TAGS \ - NLOHMANN_JSON_ABI_TAGS_CONCAT( \ - NLOHMANN_JSON_ABI_TAG_DIAGNOSTICS, \ - NLOHMANN_JSON_ABI_TAG_LEGACY_DISCARDED_VALUE_COMPARISON) - -// Construct the namespace version component -#define NLOHMANN_JSON_NAMESPACE_VERSION_CONCAT_EX(major, minor, patch) \ - _v ## major ## _ ## minor ## _ ## patch -#define NLOHMANN_JSON_NAMESPACE_VERSION_CONCAT(major, minor, patch) \ - NLOHMANN_JSON_NAMESPACE_VERSION_CONCAT_EX(major, minor, patch) - -#if NLOHMANN_JSON_NAMESPACE_NO_VERSION -#define NLOHMANN_JSON_NAMESPACE_VERSION -#else -#define NLOHMANN_JSON_NAMESPACE_VERSION \ - NLOHMANN_JSON_NAMESPACE_VERSION_CONCAT(NLOHMANN_JSON_VERSION_MAJOR, \ - NLOHMANN_JSON_VERSION_MINOR, \ - NLOHMANN_JSON_VERSION_PATCH) -#endif - -// Combine namespace components -#define NLOHMANN_JSON_NAMESPACE_CONCAT_EX(a, b) a ## b -#define NLOHMANN_JSON_NAMESPACE_CONCAT(a, b) \ - NLOHMANN_JSON_NAMESPACE_CONCAT_EX(a, b) - -#ifndef NLOHMANN_JSON_NAMESPACE -#define NLOHMANN_JSON_NAMESPACE \ - nlohmann::NLOHMANN_JSON_NAMESPACE_CONCAT( \ - NLOHMANN_JSON_ABI_TAGS, \ - NLOHMANN_JSON_NAMESPACE_VERSION) -#endif - -#ifndef NLOHMANN_JSON_NAMESPACE_BEGIN -#define NLOHMANN_JSON_NAMESPACE_BEGIN \ - namespace nlohmann \ - { \ - inline namespace NLOHMANN_JSON_NAMESPACE_CONCAT( \ - NLOHMANN_JSON_ABI_TAGS, \ - NLOHMANN_JSON_NAMESPACE_VERSION) \ - { -#endif - -#ifndef NLOHMANN_JSON_NAMESPACE_END -#define NLOHMANN_JSON_NAMESPACE_END \ - } /* namespace (inline namespace) NOLINT(readability/namespace) */ \ - } // namespace nlohmann -#endif - -// #include -// __ _____ _____ _____ -// __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.3 -// |_____|_____|_____|_|___| https://github.com/nlohmann/json -// -// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann -// SPDX-License-Identifier: MIT - - - -#include // transform -#include // array -#include // forward_list -#include // inserter, front_inserter, end -#include // map -#include // string -#include // tuple, make_tuple -#include // is_arithmetic, is_same, is_enum, underlying_type, is_convertible -#include // unordered_map -#include // pair, declval -#include // valarray - -// #include -// __ _____ _____ _____ -// __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.3 -// |_____|_____|_____|_|___| https://github.com/nlohmann/json -// -// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann -// SPDX-License-Identifier: MIT - - - -#include // nullptr_t -#include // exception -#if JSON_DIAGNOSTICS - #include // accumulate -#endif -#include // runtime_error -#include // to_string -#include // vector - -// #include -// __ _____ _____ _____ -// __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.3 -// |_____|_____|_____|_|___| https://github.com/nlohmann/json -// -// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann -// SPDX-License-Identifier: MIT - - - -#include // array -#include // size_t -#include // uint8_t -#include // string - -// #include -// __ _____ _____ _____ -// __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.3 -// |_____|_____|_____|_|___| https://github.com/nlohmann/json -// -// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann -// SPDX-License-Identifier: MIT - - - -#include // declval, pair -// #include -// __ _____ _____ _____ -// __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.3 -// |_____|_____|_____|_|___| https://github.com/nlohmann/json -// -// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann -// SPDX-License-Identifier: MIT - - - -#include - -// #include -// __ _____ _____ _____ -// __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.3 -// |_____|_____|_____|_|___| https://github.com/nlohmann/json -// -// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann -// SPDX-License-Identifier: MIT - - - -// #include - - -NLOHMANN_JSON_NAMESPACE_BEGIN -namespace detail -{ - -template struct make_void -{ - using type = void; -}; -template using void_t = typename make_void::type; - -} // namespace detail -NLOHMANN_JSON_NAMESPACE_END - - -NLOHMANN_JSON_NAMESPACE_BEGIN -namespace detail -{ - -// https://en.cppreference.com/w/cpp/experimental/is_detected -struct nonesuch -{ - nonesuch() = delete; - ~nonesuch() = delete; - nonesuch(nonesuch const&) = delete; - nonesuch(nonesuch const&&) = delete; - void operator=(nonesuch const&) = delete; - void operator=(nonesuch&&) = delete; -}; - -template class Op, - class... Args> -struct detector -{ - using value_t = std::false_type; - using type = Default; -}; - -template class Op, class... Args> -struct detector>, Op, Args...> -{ - using value_t = std::true_type; - using type = Op; -}; - -template class Op, class... Args> -using is_detected = typename detector::value_t; - -template class Op, class... Args> -struct is_detected_lazy : is_detected { }; - -template class Op, class... Args> -using detected_t = typename detector::type; - -template class Op, class... Args> -using detected_or = detector; - -template class Op, class... Args> -using detected_or_t = typename detected_or::type; - -template class Op, class... Args> -using is_detected_exact = std::is_same>; - -template class Op, class... Args> -using is_detected_convertible = - std::is_convertible, To>; - -} // namespace detail -NLOHMANN_JSON_NAMESPACE_END - -// #include - - -// __ _____ _____ _____ -// __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.3 -// |_____|_____|_____|_|___| https://github.com/nlohmann/json -// -// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann -// SPDX-FileCopyrightText: 2016-2021 Evan Nemerson -// SPDX-License-Identifier: MIT - -/* Hedley - https://nemequ.github.io/hedley - * Created by Evan Nemerson - */ - -#if !defined(JSON_HEDLEY_VERSION) || (JSON_HEDLEY_VERSION < 15) -#if defined(JSON_HEDLEY_VERSION) - #undef JSON_HEDLEY_VERSION -#endif -#define JSON_HEDLEY_VERSION 15 - -#if defined(JSON_HEDLEY_STRINGIFY_EX) - #undef JSON_HEDLEY_STRINGIFY_EX -#endif -#define JSON_HEDLEY_STRINGIFY_EX(x) #x - -#if defined(JSON_HEDLEY_STRINGIFY) - #undef JSON_HEDLEY_STRINGIFY -#endif -#define JSON_HEDLEY_STRINGIFY(x) JSON_HEDLEY_STRINGIFY_EX(x) - -#if defined(JSON_HEDLEY_CONCAT_EX) - #undef JSON_HEDLEY_CONCAT_EX -#endif -#define JSON_HEDLEY_CONCAT_EX(a,b) a##b - -#if defined(JSON_HEDLEY_CONCAT) - #undef JSON_HEDLEY_CONCAT -#endif -#define JSON_HEDLEY_CONCAT(a,b) JSON_HEDLEY_CONCAT_EX(a,b) - -#if defined(JSON_HEDLEY_CONCAT3_EX) - #undef JSON_HEDLEY_CONCAT3_EX -#endif -#define JSON_HEDLEY_CONCAT3_EX(a,b,c) a##b##c - -#if defined(JSON_HEDLEY_CONCAT3) - #undef JSON_HEDLEY_CONCAT3 -#endif -#define JSON_HEDLEY_CONCAT3(a,b,c) JSON_HEDLEY_CONCAT3_EX(a,b,c) - -#if defined(JSON_HEDLEY_VERSION_ENCODE) - #undef JSON_HEDLEY_VERSION_ENCODE -#endif -#define JSON_HEDLEY_VERSION_ENCODE(major,minor,revision) (((major) * 1000000) + ((minor) * 1000) + (revision)) - -#if defined(JSON_HEDLEY_VERSION_DECODE_MAJOR) - #undef JSON_HEDLEY_VERSION_DECODE_MAJOR -#endif -#define JSON_HEDLEY_VERSION_DECODE_MAJOR(version) ((version) / 1000000) - -#if defined(JSON_HEDLEY_VERSION_DECODE_MINOR) - #undef JSON_HEDLEY_VERSION_DECODE_MINOR -#endif -#define JSON_HEDLEY_VERSION_DECODE_MINOR(version) (((version) % 1000000) / 1000) - -#if defined(JSON_HEDLEY_VERSION_DECODE_REVISION) - #undef JSON_HEDLEY_VERSION_DECODE_REVISION -#endif -#define JSON_HEDLEY_VERSION_DECODE_REVISION(version) ((version) % 1000) - -#if defined(JSON_HEDLEY_GNUC_VERSION) - #undef JSON_HEDLEY_GNUC_VERSION -#endif -#if defined(__GNUC__) && defined(__GNUC_PATCHLEVEL__) - #define JSON_HEDLEY_GNUC_VERSION JSON_HEDLEY_VERSION_ENCODE(__GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__) -#elif defined(__GNUC__) - #define JSON_HEDLEY_GNUC_VERSION JSON_HEDLEY_VERSION_ENCODE(__GNUC__, __GNUC_MINOR__, 0) -#endif - -#if defined(JSON_HEDLEY_GNUC_VERSION_CHECK) - #undef JSON_HEDLEY_GNUC_VERSION_CHECK -#endif -#if defined(JSON_HEDLEY_GNUC_VERSION) - #define JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_GNUC_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else - #define JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(JSON_HEDLEY_MSVC_VERSION) - #undef JSON_HEDLEY_MSVC_VERSION -#endif -#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 140000000) && !defined(__ICL) - #define JSON_HEDLEY_MSVC_VERSION JSON_HEDLEY_VERSION_ENCODE(_MSC_FULL_VER / 10000000, (_MSC_FULL_VER % 10000000) / 100000, (_MSC_FULL_VER % 100000) / 100) -#elif defined(_MSC_FULL_VER) && !defined(__ICL) - #define JSON_HEDLEY_MSVC_VERSION JSON_HEDLEY_VERSION_ENCODE(_MSC_FULL_VER / 1000000, (_MSC_FULL_VER % 1000000) / 10000, (_MSC_FULL_VER % 10000) / 10) -#elif defined(_MSC_VER) && !defined(__ICL) - #define JSON_HEDLEY_MSVC_VERSION JSON_HEDLEY_VERSION_ENCODE(_MSC_VER / 100, _MSC_VER % 100, 0) -#endif - -#if defined(JSON_HEDLEY_MSVC_VERSION_CHECK) - #undef JSON_HEDLEY_MSVC_VERSION_CHECK -#endif -#if !defined(JSON_HEDLEY_MSVC_VERSION) - #define JSON_HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (0) -#elif defined(_MSC_VER) && (_MSC_VER >= 1400) - #define JSON_HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (_MSC_FULL_VER >= ((major * 10000000) + (minor * 100000) + (patch))) -#elif defined(_MSC_VER) && (_MSC_VER >= 1200) - #define JSON_HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (_MSC_FULL_VER >= ((major * 1000000) + (minor * 10000) + (patch))) -#else - #define JSON_HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (_MSC_VER >= ((major * 100) + (minor))) -#endif - -#if defined(JSON_HEDLEY_INTEL_VERSION) - #undef JSON_HEDLEY_INTEL_VERSION -#endif -#if defined(__INTEL_COMPILER) && defined(__INTEL_COMPILER_UPDATE) && !defined(__ICL) - #define JSON_HEDLEY_INTEL_VERSION JSON_HEDLEY_VERSION_ENCODE(__INTEL_COMPILER / 100, __INTEL_COMPILER % 100, __INTEL_COMPILER_UPDATE) -#elif defined(__INTEL_COMPILER) && !defined(__ICL) - #define JSON_HEDLEY_INTEL_VERSION JSON_HEDLEY_VERSION_ENCODE(__INTEL_COMPILER / 100, __INTEL_COMPILER % 100, 0) -#endif - -#if defined(JSON_HEDLEY_INTEL_VERSION_CHECK) - #undef JSON_HEDLEY_INTEL_VERSION_CHECK -#endif -#if defined(JSON_HEDLEY_INTEL_VERSION) - #define JSON_HEDLEY_INTEL_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_INTEL_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else - #define JSON_HEDLEY_INTEL_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(JSON_HEDLEY_INTEL_CL_VERSION) - #undef JSON_HEDLEY_INTEL_CL_VERSION -#endif -#if defined(__INTEL_COMPILER) && defined(__INTEL_COMPILER_UPDATE) && defined(__ICL) - #define JSON_HEDLEY_INTEL_CL_VERSION JSON_HEDLEY_VERSION_ENCODE(__INTEL_COMPILER, __INTEL_COMPILER_UPDATE, 0) -#endif - -#if defined(JSON_HEDLEY_INTEL_CL_VERSION_CHECK) - #undef JSON_HEDLEY_INTEL_CL_VERSION_CHECK -#endif -#if defined(JSON_HEDLEY_INTEL_CL_VERSION) - #define JSON_HEDLEY_INTEL_CL_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_INTEL_CL_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else - #define JSON_HEDLEY_INTEL_CL_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(JSON_HEDLEY_PGI_VERSION) - #undef JSON_HEDLEY_PGI_VERSION -#endif -#if defined(__PGI) && defined(__PGIC__) && defined(__PGIC_MINOR__) && defined(__PGIC_PATCHLEVEL__) - #define JSON_HEDLEY_PGI_VERSION JSON_HEDLEY_VERSION_ENCODE(__PGIC__, __PGIC_MINOR__, __PGIC_PATCHLEVEL__) -#endif - -#if defined(JSON_HEDLEY_PGI_VERSION_CHECK) - #undef JSON_HEDLEY_PGI_VERSION_CHECK -#endif -#if defined(JSON_HEDLEY_PGI_VERSION) - #define JSON_HEDLEY_PGI_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_PGI_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else - #define JSON_HEDLEY_PGI_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(JSON_HEDLEY_SUNPRO_VERSION) - #undef JSON_HEDLEY_SUNPRO_VERSION -#endif -#if defined(__SUNPRO_C) && (__SUNPRO_C > 0x1000) - #define JSON_HEDLEY_SUNPRO_VERSION JSON_HEDLEY_VERSION_ENCODE((((__SUNPRO_C >> 16) & 0xf) * 10) + ((__SUNPRO_C >> 12) & 0xf), (((__SUNPRO_C >> 8) & 0xf) * 10) + ((__SUNPRO_C >> 4) & 0xf), (__SUNPRO_C & 0xf) * 10) -#elif defined(__SUNPRO_C) - #define JSON_HEDLEY_SUNPRO_VERSION JSON_HEDLEY_VERSION_ENCODE((__SUNPRO_C >> 8) & 0xf, (__SUNPRO_C >> 4) & 0xf, (__SUNPRO_C) & 0xf) -#elif defined(__SUNPRO_CC) && (__SUNPRO_CC > 0x1000) - #define JSON_HEDLEY_SUNPRO_VERSION JSON_HEDLEY_VERSION_ENCODE((((__SUNPRO_CC >> 16) & 0xf) * 10) + ((__SUNPRO_CC >> 12) & 0xf), (((__SUNPRO_CC >> 8) & 0xf) * 10) + ((__SUNPRO_CC >> 4) & 0xf), (__SUNPRO_CC & 0xf) * 10) -#elif defined(__SUNPRO_CC) - #define JSON_HEDLEY_SUNPRO_VERSION JSON_HEDLEY_VERSION_ENCODE((__SUNPRO_CC >> 8) & 0xf, (__SUNPRO_CC >> 4) & 0xf, (__SUNPRO_CC) & 0xf) -#endif - -#if defined(JSON_HEDLEY_SUNPRO_VERSION_CHECK) - #undef JSON_HEDLEY_SUNPRO_VERSION_CHECK -#endif -#if defined(JSON_HEDLEY_SUNPRO_VERSION) - #define JSON_HEDLEY_SUNPRO_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_SUNPRO_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else - #define JSON_HEDLEY_SUNPRO_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(JSON_HEDLEY_EMSCRIPTEN_VERSION) - #undef JSON_HEDLEY_EMSCRIPTEN_VERSION -#endif -#if defined(__EMSCRIPTEN__) - #define JSON_HEDLEY_EMSCRIPTEN_VERSION JSON_HEDLEY_VERSION_ENCODE(__EMSCRIPTEN_major__, __EMSCRIPTEN_minor__, __EMSCRIPTEN_tiny__) -#endif - -#if defined(JSON_HEDLEY_EMSCRIPTEN_VERSION_CHECK) - #undef JSON_HEDLEY_EMSCRIPTEN_VERSION_CHECK -#endif -#if defined(JSON_HEDLEY_EMSCRIPTEN_VERSION) - #define JSON_HEDLEY_EMSCRIPTEN_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_EMSCRIPTEN_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else - #define JSON_HEDLEY_EMSCRIPTEN_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(JSON_HEDLEY_ARM_VERSION) - #undef JSON_HEDLEY_ARM_VERSION -#endif -#if defined(__CC_ARM) && defined(__ARMCOMPILER_VERSION) - #define JSON_HEDLEY_ARM_VERSION JSON_HEDLEY_VERSION_ENCODE(__ARMCOMPILER_VERSION / 1000000, (__ARMCOMPILER_VERSION % 1000000) / 10000, (__ARMCOMPILER_VERSION % 10000) / 100) -#elif defined(__CC_ARM) && defined(__ARMCC_VERSION) - #define JSON_HEDLEY_ARM_VERSION JSON_HEDLEY_VERSION_ENCODE(__ARMCC_VERSION / 1000000, (__ARMCC_VERSION % 1000000) / 10000, (__ARMCC_VERSION % 10000) / 100) -#endif - -#if defined(JSON_HEDLEY_ARM_VERSION_CHECK) - #undef JSON_HEDLEY_ARM_VERSION_CHECK -#endif -#if defined(JSON_HEDLEY_ARM_VERSION) - #define JSON_HEDLEY_ARM_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_ARM_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else - #define JSON_HEDLEY_ARM_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(JSON_HEDLEY_IBM_VERSION) - #undef JSON_HEDLEY_IBM_VERSION -#endif -#if defined(__ibmxl__) - #define JSON_HEDLEY_IBM_VERSION JSON_HEDLEY_VERSION_ENCODE(__ibmxl_version__, __ibmxl_release__, __ibmxl_modification__) -#elif defined(__xlC__) && defined(__xlC_ver__) - #define JSON_HEDLEY_IBM_VERSION JSON_HEDLEY_VERSION_ENCODE(__xlC__ >> 8, __xlC__ & 0xff, (__xlC_ver__ >> 8) & 0xff) -#elif defined(__xlC__) - #define JSON_HEDLEY_IBM_VERSION JSON_HEDLEY_VERSION_ENCODE(__xlC__ >> 8, __xlC__ & 0xff, 0) -#endif - -#if defined(JSON_HEDLEY_IBM_VERSION_CHECK) - #undef JSON_HEDLEY_IBM_VERSION_CHECK -#endif -#if defined(JSON_HEDLEY_IBM_VERSION) - #define JSON_HEDLEY_IBM_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_IBM_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else - #define JSON_HEDLEY_IBM_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(JSON_HEDLEY_TI_VERSION) - #undef JSON_HEDLEY_TI_VERSION -#endif -#if \ - defined(__TI_COMPILER_VERSION__) && \ - ( \ - defined(__TMS470__) || defined(__TI_ARM__) || \ - defined(__MSP430__) || \ - defined(__TMS320C2000__) \ - ) -#if (__TI_COMPILER_VERSION__ >= 16000000) - #define JSON_HEDLEY_TI_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) -#endif -#endif - -#if defined(JSON_HEDLEY_TI_VERSION_CHECK) - #undef JSON_HEDLEY_TI_VERSION_CHECK -#endif -#if defined(JSON_HEDLEY_TI_VERSION) - #define JSON_HEDLEY_TI_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else - #define JSON_HEDLEY_TI_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(JSON_HEDLEY_TI_CL2000_VERSION) - #undef JSON_HEDLEY_TI_CL2000_VERSION -#endif -#if defined(__TI_COMPILER_VERSION__) && defined(__TMS320C2000__) - #define JSON_HEDLEY_TI_CL2000_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) -#endif - -#if defined(JSON_HEDLEY_TI_CL2000_VERSION_CHECK) - #undef JSON_HEDLEY_TI_CL2000_VERSION_CHECK -#endif -#if defined(JSON_HEDLEY_TI_CL2000_VERSION) - #define JSON_HEDLEY_TI_CL2000_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_CL2000_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else - #define JSON_HEDLEY_TI_CL2000_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(JSON_HEDLEY_TI_CL430_VERSION) - #undef JSON_HEDLEY_TI_CL430_VERSION -#endif -#if defined(__TI_COMPILER_VERSION__) && defined(__MSP430__) - #define JSON_HEDLEY_TI_CL430_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) -#endif - -#if defined(JSON_HEDLEY_TI_CL430_VERSION_CHECK) - #undef JSON_HEDLEY_TI_CL430_VERSION_CHECK -#endif -#if defined(JSON_HEDLEY_TI_CL430_VERSION) - #define JSON_HEDLEY_TI_CL430_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_CL430_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else - #define JSON_HEDLEY_TI_CL430_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(JSON_HEDLEY_TI_ARMCL_VERSION) - #undef JSON_HEDLEY_TI_ARMCL_VERSION -#endif -#if defined(__TI_COMPILER_VERSION__) && (defined(__TMS470__) || defined(__TI_ARM__)) - #define JSON_HEDLEY_TI_ARMCL_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) -#endif - -#if defined(JSON_HEDLEY_TI_ARMCL_VERSION_CHECK) - #undef JSON_HEDLEY_TI_ARMCL_VERSION_CHECK -#endif -#if defined(JSON_HEDLEY_TI_ARMCL_VERSION) - #define JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_ARMCL_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else - #define JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(JSON_HEDLEY_TI_CL6X_VERSION) - #undef JSON_HEDLEY_TI_CL6X_VERSION -#endif -#if defined(__TI_COMPILER_VERSION__) && defined(__TMS320C6X__) - #define JSON_HEDLEY_TI_CL6X_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) -#endif - -#if defined(JSON_HEDLEY_TI_CL6X_VERSION_CHECK) - #undef JSON_HEDLEY_TI_CL6X_VERSION_CHECK -#endif -#if defined(JSON_HEDLEY_TI_CL6X_VERSION) - #define JSON_HEDLEY_TI_CL6X_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_CL6X_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else - #define JSON_HEDLEY_TI_CL6X_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(JSON_HEDLEY_TI_CL7X_VERSION) - #undef JSON_HEDLEY_TI_CL7X_VERSION -#endif -#if defined(__TI_COMPILER_VERSION__) && defined(__C7000__) - #define JSON_HEDLEY_TI_CL7X_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) -#endif - -#if defined(JSON_HEDLEY_TI_CL7X_VERSION_CHECK) - #undef JSON_HEDLEY_TI_CL7X_VERSION_CHECK -#endif -#if defined(JSON_HEDLEY_TI_CL7X_VERSION) - #define JSON_HEDLEY_TI_CL7X_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_CL7X_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else - #define JSON_HEDLEY_TI_CL7X_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(JSON_HEDLEY_TI_CLPRU_VERSION) - #undef JSON_HEDLEY_TI_CLPRU_VERSION -#endif -#if defined(__TI_COMPILER_VERSION__) && defined(__PRU__) - #define JSON_HEDLEY_TI_CLPRU_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) -#endif - -#if defined(JSON_HEDLEY_TI_CLPRU_VERSION_CHECK) - #undef JSON_HEDLEY_TI_CLPRU_VERSION_CHECK -#endif -#if defined(JSON_HEDLEY_TI_CLPRU_VERSION) - #define JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_CLPRU_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else - #define JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(JSON_HEDLEY_CRAY_VERSION) - #undef JSON_HEDLEY_CRAY_VERSION -#endif -#if defined(_CRAYC) - #if defined(_RELEASE_PATCHLEVEL) - #define JSON_HEDLEY_CRAY_VERSION JSON_HEDLEY_VERSION_ENCODE(_RELEASE_MAJOR, _RELEASE_MINOR, _RELEASE_PATCHLEVEL) - #else - #define JSON_HEDLEY_CRAY_VERSION JSON_HEDLEY_VERSION_ENCODE(_RELEASE_MAJOR, _RELEASE_MINOR, 0) - #endif -#endif - -#if defined(JSON_HEDLEY_CRAY_VERSION_CHECK) - #undef JSON_HEDLEY_CRAY_VERSION_CHECK -#endif -#if defined(JSON_HEDLEY_CRAY_VERSION) - #define JSON_HEDLEY_CRAY_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_CRAY_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else - #define JSON_HEDLEY_CRAY_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(JSON_HEDLEY_IAR_VERSION) - #undef JSON_HEDLEY_IAR_VERSION -#endif -#if defined(__IAR_SYSTEMS_ICC__) - #if __VER__ > 1000 - #define JSON_HEDLEY_IAR_VERSION JSON_HEDLEY_VERSION_ENCODE((__VER__ / 1000000), ((__VER__ / 1000) % 1000), (__VER__ % 1000)) - #else - #define JSON_HEDLEY_IAR_VERSION JSON_HEDLEY_VERSION_ENCODE(__VER__ / 100, __VER__ % 100, 0) - #endif -#endif - -#if defined(JSON_HEDLEY_IAR_VERSION_CHECK) - #undef JSON_HEDLEY_IAR_VERSION_CHECK -#endif -#if defined(JSON_HEDLEY_IAR_VERSION) - #define JSON_HEDLEY_IAR_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_IAR_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else - #define JSON_HEDLEY_IAR_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(JSON_HEDLEY_TINYC_VERSION) - #undef JSON_HEDLEY_TINYC_VERSION -#endif -#if defined(__TINYC__) - #define JSON_HEDLEY_TINYC_VERSION JSON_HEDLEY_VERSION_ENCODE(__TINYC__ / 1000, (__TINYC__ / 100) % 10, __TINYC__ % 100) -#endif - -#if defined(JSON_HEDLEY_TINYC_VERSION_CHECK) - #undef JSON_HEDLEY_TINYC_VERSION_CHECK -#endif -#if defined(JSON_HEDLEY_TINYC_VERSION) - #define JSON_HEDLEY_TINYC_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TINYC_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else - #define JSON_HEDLEY_TINYC_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(JSON_HEDLEY_DMC_VERSION) - #undef JSON_HEDLEY_DMC_VERSION -#endif -#if defined(__DMC__) - #define JSON_HEDLEY_DMC_VERSION JSON_HEDLEY_VERSION_ENCODE(__DMC__ >> 8, (__DMC__ >> 4) & 0xf, __DMC__ & 0xf) -#endif - -#if defined(JSON_HEDLEY_DMC_VERSION_CHECK) - #undef JSON_HEDLEY_DMC_VERSION_CHECK -#endif -#if defined(JSON_HEDLEY_DMC_VERSION) - #define JSON_HEDLEY_DMC_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_DMC_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else - #define JSON_HEDLEY_DMC_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(JSON_HEDLEY_COMPCERT_VERSION) - #undef JSON_HEDLEY_COMPCERT_VERSION -#endif -#if defined(__COMPCERT_VERSION__) - #define JSON_HEDLEY_COMPCERT_VERSION JSON_HEDLEY_VERSION_ENCODE(__COMPCERT_VERSION__ / 10000, (__COMPCERT_VERSION__ / 100) % 100, __COMPCERT_VERSION__ % 100) -#endif - -#if defined(JSON_HEDLEY_COMPCERT_VERSION_CHECK) - #undef JSON_HEDLEY_COMPCERT_VERSION_CHECK -#endif -#if defined(JSON_HEDLEY_COMPCERT_VERSION) - #define JSON_HEDLEY_COMPCERT_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_COMPCERT_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else - #define JSON_HEDLEY_COMPCERT_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(JSON_HEDLEY_PELLES_VERSION) - #undef JSON_HEDLEY_PELLES_VERSION -#endif -#if defined(__POCC__) - #define JSON_HEDLEY_PELLES_VERSION JSON_HEDLEY_VERSION_ENCODE(__POCC__ / 100, __POCC__ % 100, 0) -#endif - -#if defined(JSON_HEDLEY_PELLES_VERSION_CHECK) - #undef JSON_HEDLEY_PELLES_VERSION_CHECK -#endif -#if defined(JSON_HEDLEY_PELLES_VERSION) - #define JSON_HEDLEY_PELLES_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_PELLES_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else - #define JSON_HEDLEY_PELLES_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(JSON_HEDLEY_MCST_LCC_VERSION) - #undef JSON_HEDLEY_MCST_LCC_VERSION -#endif -#if defined(__LCC__) && defined(__LCC_MINOR__) - #define JSON_HEDLEY_MCST_LCC_VERSION JSON_HEDLEY_VERSION_ENCODE(__LCC__ / 100, __LCC__ % 100, __LCC_MINOR__) -#endif - -#if defined(JSON_HEDLEY_MCST_LCC_VERSION_CHECK) - #undef JSON_HEDLEY_MCST_LCC_VERSION_CHECK -#endif -#if defined(JSON_HEDLEY_MCST_LCC_VERSION) - #define JSON_HEDLEY_MCST_LCC_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_MCST_LCC_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else - #define JSON_HEDLEY_MCST_LCC_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(JSON_HEDLEY_GCC_VERSION) - #undef JSON_HEDLEY_GCC_VERSION -#endif -#if \ - defined(JSON_HEDLEY_GNUC_VERSION) && \ - !defined(__clang__) && \ - !defined(JSON_HEDLEY_INTEL_VERSION) && \ - !defined(JSON_HEDLEY_PGI_VERSION) && \ - !defined(JSON_HEDLEY_ARM_VERSION) && \ - !defined(JSON_HEDLEY_CRAY_VERSION) && \ - !defined(JSON_HEDLEY_TI_VERSION) && \ - !defined(JSON_HEDLEY_TI_ARMCL_VERSION) && \ - !defined(JSON_HEDLEY_TI_CL430_VERSION) && \ - !defined(JSON_HEDLEY_TI_CL2000_VERSION) && \ - !defined(JSON_HEDLEY_TI_CL6X_VERSION) && \ - !defined(JSON_HEDLEY_TI_CL7X_VERSION) && \ - !defined(JSON_HEDLEY_TI_CLPRU_VERSION) && \ - !defined(__COMPCERT__) && \ - !defined(JSON_HEDLEY_MCST_LCC_VERSION) - #define JSON_HEDLEY_GCC_VERSION JSON_HEDLEY_GNUC_VERSION -#endif - -#if defined(JSON_HEDLEY_GCC_VERSION_CHECK) - #undef JSON_HEDLEY_GCC_VERSION_CHECK -#endif -#if defined(JSON_HEDLEY_GCC_VERSION) - #define JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_GCC_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else - #define JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(JSON_HEDLEY_HAS_ATTRIBUTE) - #undef JSON_HEDLEY_HAS_ATTRIBUTE -#endif -#if \ - defined(__has_attribute) && \ - ( \ - (!defined(JSON_HEDLEY_IAR_VERSION) || JSON_HEDLEY_IAR_VERSION_CHECK(8,5,9)) \ - ) -# define JSON_HEDLEY_HAS_ATTRIBUTE(attribute) __has_attribute(attribute) -#else -# define JSON_HEDLEY_HAS_ATTRIBUTE(attribute) (0) -#endif - -#if defined(JSON_HEDLEY_GNUC_HAS_ATTRIBUTE) - #undef JSON_HEDLEY_GNUC_HAS_ATTRIBUTE -#endif -#if defined(__has_attribute) - #define JSON_HEDLEY_GNUC_HAS_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_HAS_ATTRIBUTE(attribute) -#else - #define JSON_HEDLEY_GNUC_HAS_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) -#endif - -#if defined(JSON_HEDLEY_GCC_HAS_ATTRIBUTE) - #undef JSON_HEDLEY_GCC_HAS_ATTRIBUTE -#endif -#if defined(__has_attribute) - #define JSON_HEDLEY_GCC_HAS_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_HAS_ATTRIBUTE(attribute) -#else - #define JSON_HEDLEY_GCC_HAS_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) -#endif - -#if defined(JSON_HEDLEY_HAS_CPP_ATTRIBUTE) - #undef JSON_HEDLEY_HAS_CPP_ATTRIBUTE -#endif -#if \ - defined(__has_cpp_attribute) && \ - defined(__cplusplus) && \ - (!defined(JSON_HEDLEY_SUNPRO_VERSION) || JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,15,0)) - #define JSON_HEDLEY_HAS_CPP_ATTRIBUTE(attribute) __has_cpp_attribute(attribute) -#else - #define JSON_HEDLEY_HAS_CPP_ATTRIBUTE(attribute) (0) -#endif - -#if defined(JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS) - #undef JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS -#endif -#if !defined(__cplusplus) || !defined(__has_cpp_attribute) - #define JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS(ns,attribute) (0) -#elif \ - !defined(JSON_HEDLEY_PGI_VERSION) && \ - !defined(JSON_HEDLEY_IAR_VERSION) && \ - (!defined(JSON_HEDLEY_SUNPRO_VERSION) || JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,15,0)) && \ - (!defined(JSON_HEDLEY_MSVC_VERSION) || JSON_HEDLEY_MSVC_VERSION_CHECK(19,20,0)) - #define JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS(ns,attribute) JSON_HEDLEY_HAS_CPP_ATTRIBUTE(ns::attribute) -#else - #define JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS(ns,attribute) (0) -#endif - -#if defined(JSON_HEDLEY_GNUC_HAS_CPP_ATTRIBUTE) - #undef JSON_HEDLEY_GNUC_HAS_CPP_ATTRIBUTE -#endif -#if defined(__has_cpp_attribute) && defined(__cplusplus) - #define JSON_HEDLEY_GNUC_HAS_CPP_ATTRIBUTE(attribute,major,minor,patch) __has_cpp_attribute(attribute) -#else - #define JSON_HEDLEY_GNUC_HAS_CPP_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) -#endif - -#if defined(JSON_HEDLEY_GCC_HAS_CPP_ATTRIBUTE) - #undef JSON_HEDLEY_GCC_HAS_CPP_ATTRIBUTE -#endif -#if defined(__has_cpp_attribute) && defined(__cplusplus) - #define JSON_HEDLEY_GCC_HAS_CPP_ATTRIBUTE(attribute,major,minor,patch) __has_cpp_attribute(attribute) -#else - #define JSON_HEDLEY_GCC_HAS_CPP_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) -#endif - -#if defined(JSON_HEDLEY_HAS_BUILTIN) - #undef JSON_HEDLEY_HAS_BUILTIN -#endif -#if defined(__has_builtin) - #define JSON_HEDLEY_HAS_BUILTIN(builtin) __has_builtin(builtin) -#else - #define JSON_HEDLEY_HAS_BUILTIN(builtin) (0) -#endif - -#if defined(JSON_HEDLEY_GNUC_HAS_BUILTIN) - #undef JSON_HEDLEY_GNUC_HAS_BUILTIN -#endif -#if defined(__has_builtin) - #define JSON_HEDLEY_GNUC_HAS_BUILTIN(builtin,major,minor,patch) __has_builtin(builtin) -#else - #define JSON_HEDLEY_GNUC_HAS_BUILTIN(builtin,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) -#endif - -#if defined(JSON_HEDLEY_GCC_HAS_BUILTIN) - #undef JSON_HEDLEY_GCC_HAS_BUILTIN -#endif -#if defined(__has_builtin) - #define JSON_HEDLEY_GCC_HAS_BUILTIN(builtin,major,minor,patch) __has_builtin(builtin) -#else - #define JSON_HEDLEY_GCC_HAS_BUILTIN(builtin,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) -#endif - -#if defined(JSON_HEDLEY_HAS_FEATURE) - #undef JSON_HEDLEY_HAS_FEATURE -#endif -#if defined(__has_feature) - #define JSON_HEDLEY_HAS_FEATURE(feature) __has_feature(feature) -#else - #define JSON_HEDLEY_HAS_FEATURE(feature) (0) -#endif - -#if defined(JSON_HEDLEY_GNUC_HAS_FEATURE) - #undef JSON_HEDLEY_GNUC_HAS_FEATURE -#endif -#if defined(__has_feature) - #define JSON_HEDLEY_GNUC_HAS_FEATURE(feature,major,minor,patch) __has_feature(feature) -#else - #define JSON_HEDLEY_GNUC_HAS_FEATURE(feature,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) -#endif - -#if defined(JSON_HEDLEY_GCC_HAS_FEATURE) - #undef JSON_HEDLEY_GCC_HAS_FEATURE -#endif -#if defined(__has_feature) - #define JSON_HEDLEY_GCC_HAS_FEATURE(feature,major,minor,patch) __has_feature(feature) -#else - #define JSON_HEDLEY_GCC_HAS_FEATURE(feature,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) -#endif - -#if defined(JSON_HEDLEY_HAS_EXTENSION) - #undef JSON_HEDLEY_HAS_EXTENSION -#endif -#if defined(__has_extension) - #define JSON_HEDLEY_HAS_EXTENSION(extension) __has_extension(extension) -#else - #define JSON_HEDLEY_HAS_EXTENSION(extension) (0) -#endif - -#if defined(JSON_HEDLEY_GNUC_HAS_EXTENSION) - #undef JSON_HEDLEY_GNUC_HAS_EXTENSION -#endif -#if defined(__has_extension) - #define JSON_HEDLEY_GNUC_HAS_EXTENSION(extension,major,minor,patch) __has_extension(extension) -#else - #define JSON_HEDLEY_GNUC_HAS_EXTENSION(extension,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) -#endif - -#if defined(JSON_HEDLEY_GCC_HAS_EXTENSION) - #undef JSON_HEDLEY_GCC_HAS_EXTENSION -#endif -#if defined(__has_extension) - #define JSON_HEDLEY_GCC_HAS_EXTENSION(extension,major,minor,patch) __has_extension(extension) -#else - #define JSON_HEDLEY_GCC_HAS_EXTENSION(extension,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) -#endif - -#if defined(JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE) - #undef JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE -#endif -#if defined(__has_declspec_attribute) - #define JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE(attribute) __has_declspec_attribute(attribute) -#else - #define JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE(attribute) (0) -#endif - -#if defined(JSON_HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE) - #undef JSON_HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE -#endif -#if defined(__has_declspec_attribute) - #define JSON_HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE(attribute,major,minor,patch) __has_declspec_attribute(attribute) -#else - #define JSON_HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) -#endif - -#if defined(JSON_HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE) - #undef JSON_HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE -#endif -#if defined(__has_declspec_attribute) - #define JSON_HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE(attribute,major,minor,patch) __has_declspec_attribute(attribute) -#else - #define JSON_HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) -#endif - -#if defined(JSON_HEDLEY_HAS_WARNING) - #undef JSON_HEDLEY_HAS_WARNING -#endif -#if defined(__has_warning) - #define JSON_HEDLEY_HAS_WARNING(warning) __has_warning(warning) -#else - #define JSON_HEDLEY_HAS_WARNING(warning) (0) -#endif - -#if defined(JSON_HEDLEY_GNUC_HAS_WARNING) - #undef JSON_HEDLEY_GNUC_HAS_WARNING -#endif -#if defined(__has_warning) - #define JSON_HEDLEY_GNUC_HAS_WARNING(warning,major,minor,patch) __has_warning(warning) -#else - #define JSON_HEDLEY_GNUC_HAS_WARNING(warning,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) -#endif - -#if defined(JSON_HEDLEY_GCC_HAS_WARNING) - #undef JSON_HEDLEY_GCC_HAS_WARNING -#endif -#if defined(__has_warning) - #define JSON_HEDLEY_GCC_HAS_WARNING(warning,major,minor,patch) __has_warning(warning) -#else - #define JSON_HEDLEY_GCC_HAS_WARNING(warning,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) -#endif - -#if \ - (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) || \ - defined(__clang__) || \ - JSON_HEDLEY_GCC_VERSION_CHECK(3,0,0) || \ - JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) || \ - JSON_HEDLEY_PGI_VERSION_CHECK(18,4,0) || \ - JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ - JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,7,0) || \ - JSON_HEDLEY_TI_CL430_VERSION_CHECK(2,0,1) || \ - JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,1,0) || \ - JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,0,0) || \ - JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ - JSON_HEDLEY_CRAY_VERSION_CHECK(5,0,0) || \ - JSON_HEDLEY_TINYC_VERSION_CHECK(0,9,17) || \ - JSON_HEDLEY_SUNPRO_VERSION_CHECK(8,0,0) || \ - (JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) && defined(__C99_PRAGMA_OPERATOR)) - #define JSON_HEDLEY_PRAGMA(value) _Pragma(#value) -#elif JSON_HEDLEY_MSVC_VERSION_CHECK(15,0,0) - #define JSON_HEDLEY_PRAGMA(value) __pragma(value) -#else - #define JSON_HEDLEY_PRAGMA(value) -#endif - -#if defined(JSON_HEDLEY_DIAGNOSTIC_PUSH) - #undef JSON_HEDLEY_DIAGNOSTIC_PUSH -#endif -#if defined(JSON_HEDLEY_DIAGNOSTIC_POP) - #undef JSON_HEDLEY_DIAGNOSTIC_POP -#endif -#if defined(__clang__) - #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("clang diagnostic push") - #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("clang diagnostic pop") -#elif JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) - #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("warning(push)") - #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("warning(pop)") -#elif JSON_HEDLEY_GCC_VERSION_CHECK(4,6,0) - #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("GCC diagnostic push") - #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("GCC diagnostic pop") -#elif \ - JSON_HEDLEY_MSVC_VERSION_CHECK(15,0,0) || \ - JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) - #define JSON_HEDLEY_DIAGNOSTIC_PUSH __pragma(warning(push)) - #define JSON_HEDLEY_DIAGNOSTIC_POP __pragma(warning(pop)) -#elif JSON_HEDLEY_ARM_VERSION_CHECK(5,6,0) - #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("push") - #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("pop") -#elif \ - JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ - JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ - JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,4,0) || \ - JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,1,0) || \ - JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) - #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("diag_push") - #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("diag_pop") -#elif JSON_HEDLEY_PELLES_VERSION_CHECK(2,90,0) - #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("warning(push)") - #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("warning(pop)") -#else - #define JSON_HEDLEY_DIAGNOSTIC_PUSH - #define JSON_HEDLEY_DIAGNOSTIC_POP -#endif - -/* JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_ is for - HEDLEY INTERNAL USE ONLY. API subject to change without notice. */ -#if defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_) - #undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_ -#endif -#if defined(__cplusplus) -# if JSON_HEDLEY_HAS_WARNING("-Wc++98-compat") -# if JSON_HEDLEY_HAS_WARNING("-Wc++17-extensions") -# if JSON_HEDLEY_HAS_WARNING("-Wc++1z-extensions") -# define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(xpr) \ - JSON_HEDLEY_DIAGNOSTIC_PUSH \ - _Pragma("clang diagnostic ignored \"-Wc++98-compat\"") \ - _Pragma("clang diagnostic ignored \"-Wc++17-extensions\"") \ - _Pragma("clang diagnostic ignored \"-Wc++1z-extensions\"") \ - xpr \ - JSON_HEDLEY_DIAGNOSTIC_POP -# else -# define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(xpr) \ - JSON_HEDLEY_DIAGNOSTIC_PUSH \ - _Pragma("clang diagnostic ignored \"-Wc++98-compat\"") \ - _Pragma("clang diagnostic ignored \"-Wc++17-extensions\"") \ - xpr \ - JSON_HEDLEY_DIAGNOSTIC_POP -# endif -# else -# define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(xpr) \ - JSON_HEDLEY_DIAGNOSTIC_PUSH \ - _Pragma("clang diagnostic ignored \"-Wc++98-compat\"") \ - xpr \ - JSON_HEDLEY_DIAGNOSTIC_POP -# endif -# endif -#endif -#if !defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_) - #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(x) x -#endif - -#if defined(JSON_HEDLEY_CONST_CAST) - #undef JSON_HEDLEY_CONST_CAST -#endif -#if defined(__cplusplus) -# define JSON_HEDLEY_CONST_CAST(T, expr) (const_cast(expr)) -#elif \ - JSON_HEDLEY_HAS_WARNING("-Wcast-qual") || \ - JSON_HEDLEY_GCC_VERSION_CHECK(4,6,0) || \ - JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) -# define JSON_HEDLEY_CONST_CAST(T, expr) (__extension__ ({ \ - JSON_HEDLEY_DIAGNOSTIC_PUSH \ - JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL \ - ((T) (expr)); \ - JSON_HEDLEY_DIAGNOSTIC_POP \ - })) -#else -# define JSON_HEDLEY_CONST_CAST(T, expr) ((T) (expr)) -#endif - -#if defined(JSON_HEDLEY_REINTERPRET_CAST) - #undef JSON_HEDLEY_REINTERPRET_CAST -#endif -#if defined(__cplusplus) - #define JSON_HEDLEY_REINTERPRET_CAST(T, expr) (reinterpret_cast(expr)) -#else - #define JSON_HEDLEY_REINTERPRET_CAST(T, expr) ((T) (expr)) -#endif - -#if defined(JSON_HEDLEY_STATIC_CAST) - #undef JSON_HEDLEY_STATIC_CAST -#endif -#if defined(__cplusplus) - #define JSON_HEDLEY_STATIC_CAST(T, expr) (static_cast(expr)) -#else - #define JSON_HEDLEY_STATIC_CAST(T, expr) ((T) (expr)) -#endif - -#if defined(JSON_HEDLEY_CPP_CAST) - #undef JSON_HEDLEY_CPP_CAST -#endif -#if defined(__cplusplus) -# if JSON_HEDLEY_HAS_WARNING("-Wold-style-cast") -# define JSON_HEDLEY_CPP_CAST(T, expr) \ - JSON_HEDLEY_DIAGNOSTIC_PUSH \ - _Pragma("clang diagnostic ignored \"-Wold-style-cast\"") \ - ((T) (expr)) \ - JSON_HEDLEY_DIAGNOSTIC_POP -# elif JSON_HEDLEY_IAR_VERSION_CHECK(8,3,0) -# define JSON_HEDLEY_CPP_CAST(T, expr) \ - JSON_HEDLEY_DIAGNOSTIC_PUSH \ - _Pragma("diag_suppress=Pe137") \ - JSON_HEDLEY_DIAGNOSTIC_POP -# else -# define JSON_HEDLEY_CPP_CAST(T, expr) ((T) (expr)) -# endif -#else -# define JSON_HEDLEY_CPP_CAST(T, expr) (expr) -#endif - -#if defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED) - #undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED -#endif -#if JSON_HEDLEY_HAS_WARNING("-Wdeprecated-declarations") - #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") -#elif JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) - #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("warning(disable:1478 1786)") -#elif JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) - #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED __pragma(warning(disable:1478 1786)) -#elif JSON_HEDLEY_PGI_VERSION_CHECK(20,7,0) - #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1215,1216,1444,1445") -#elif JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) - #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1215,1444") -#elif JSON_HEDLEY_GCC_VERSION_CHECK(4,3,0) - #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") -#elif JSON_HEDLEY_MSVC_VERSION_CHECK(15,0,0) - #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED __pragma(warning(disable:4996)) -#elif JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) - #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1215,1444") -#elif \ - JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ - (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ - (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ - (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ - (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ - JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) - #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1291,1718") -#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,13,0) && !defined(__cplusplus) - #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("error_messages(off,E_DEPRECATED_ATT,E_DEPRECATED_ATT_MESS)") -#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,13,0) && defined(__cplusplus) - #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("error_messages(off,symdeprecated,symdeprecated2)") -#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) - #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress=Pe1444,Pe1215") -#elif JSON_HEDLEY_PELLES_VERSION_CHECK(2,90,0) - #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("warn(disable:2241)") -#else - #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED -#endif - -#if defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS) - #undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS -#endif -#if JSON_HEDLEY_HAS_WARNING("-Wunknown-pragmas") - #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("clang diagnostic ignored \"-Wunknown-pragmas\"") -#elif JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) - #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("warning(disable:161)") -#elif JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) - #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS __pragma(warning(disable:161)) -#elif JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) - #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 1675") -#elif JSON_HEDLEY_GCC_VERSION_CHECK(4,3,0) - #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("GCC diagnostic ignored \"-Wunknown-pragmas\"") -#elif JSON_HEDLEY_MSVC_VERSION_CHECK(15,0,0) - #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS __pragma(warning(disable:4068)) -#elif \ - JSON_HEDLEY_TI_VERSION_CHECK(16,9,0) || \ - JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,0,0) || \ - JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,3,0) - #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 163") -#elif JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,0,0) - #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 163") -#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) - #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress=Pe161") -#elif JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) - #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 161") -#else - #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS -#endif - -#if defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES) - #undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES -#endif -#if JSON_HEDLEY_HAS_WARNING("-Wunknown-attributes") - #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("clang diagnostic ignored \"-Wunknown-attributes\"") -#elif JSON_HEDLEY_GCC_VERSION_CHECK(4,6,0) - #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") -#elif JSON_HEDLEY_INTEL_VERSION_CHECK(17,0,0) - #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("warning(disable:1292)") -#elif JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) - #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES __pragma(warning(disable:1292)) -#elif JSON_HEDLEY_MSVC_VERSION_CHECK(19,0,0) - #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES __pragma(warning(disable:5030)) -#elif JSON_HEDLEY_PGI_VERSION_CHECK(20,7,0) - #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress 1097,1098") -#elif JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) - #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress 1097") -#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,14,0) && defined(__cplusplus) - #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("error_messages(off,attrskipunsup)") -#elif \ - JSON_HEDLEY_TI_VERSION_CHECK(18,1,0) || \ - JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,3,0) || \ - JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) - #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress 1173") -#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) - #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress=Pe1097") -#elif JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) - #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress 1097") -#else - #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES -#endif - -#if defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL) - #undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL -#endif -#if JSON_HEDLEY_HAS_WARNING("-Wcast-qual") - #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL _Pragma("clang diagnostic ignored \"-Wcast-qual\"") -#elif JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) - #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL _Pragma("warning(disable:2203 2331)") -#elif JSON_HEDLEY_GCC_VERSION_CHECK(3,0,0) - #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL _Pragma("GCC diagnostic ignored \"-Wcast-qual\"") -#else - #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL -#endif - -#if defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION) - #undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION -#endif -#if JSON_HEDLEY_HAS_WARNING("-Wunused-function") - #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION _Pragma("clang diagnostic ignored \"-Wunused-function\"") -#elif JSON_HEDLEY_GCC_VERSION_CHECK(3,4,0) - #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION _Pragma("GCC diagnostic ignored \"-Wunused-function\"") -#elif JSON_HEDLEY_MSVC_VERSION_CHECK(1,0,0) - #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION __pragma(warning(disable:4505)) -#elif JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) - #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION _Pragma("diag_suppress 3142") -#else - #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION -#endif - -#if defined(JSON_HEDLEY_DEPRECATED) - #undef JSON_HEDLEY_DEPRECATED -#endif -#if defined(JSON_HEDLEY_DEPRECATED_FOR) - #undef JSON_HEDLEY_DEPRECATED_FOR -#endif -#if \ - JSON_HEDLEY_MSVC_VERSION_CHECK(14,0,0) || \ - JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) - #define JSON_HEDLEY_DEPRECATED(since) __declspec(deprecated("Since " # since)) - #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) __declspec(deprecated("Since " #since "; use " #replacement)) -#elif \ - (JSON_HEDLEY_HAS_EXTENSION(attribute_deprecated_with_message) && !defined(JSON_HEDLEY_IAR_VERSION)) || \ - JSON_HEDLEY_GCC_VERSION_CHECK(4,5,0) || \ - JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - JSON_HEDLEY_ARM_VERSION_CHECK(5,6,0) || \ - JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,13,0) || \ - JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) || \ - JSON_HEDLEY_TI_VERSION_CHECK(18,1,0) || \ - JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(18,1,0) || \ - JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,3,0) || \ - JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,3,0) || \ - JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) - #define JSON_HEDLEY_DEPRECATED(since) __attribute__((__deprecated__("Since " #since))) - #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) __attribute__((__deprecated__("Since " #since "; use " #replacement))) -#elif defined(__cplusplus) && (__cplusplus >= 201402L) - #define JSON_HEDLEY_DEPRECATED(since) JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[deprecated("Since " #since)]]) - #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[deprecated("Since " #since "; use " #replacement)]]) -#elif \ - JSON_HEDLEY_HAS_ATTRIBUTE(deprecated) || \ - JSON_HEDLEY_GCC_VERSION_CHECK(3,1,0) || \ - JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ - (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ - (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ - (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ - (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ - JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ - JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) || \ - JSON_HEDLEY_IAR_VERSION_CHECK(8,10,0) - #define JSON_HEDLEY_DEPRECATED(since) __attribute__((__deprecated__)) - #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) __attribute__((__deprecated__)) -#elif \ - JSON_HEDLEY_MSVC_VERSION_CHECK(13,10,0) || \ - JSON_HEDLEY_PELLES_VERSION_CHECK(6,50,0) || \ - JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) - #define JSON_HEDLEY_DEPRECATED(since) __declspec(deprecated) - #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) __declspec(deprecated) -#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) - #define JSON_HEDLEY_DEPRECATED(since) _Pragma("deprecated") - #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) _Pragma("deprecated") -#else - #define JSON_HEDLEY_DEPRECATED(since) - #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) -#endif - -#if defined(JSON_HEDLEY_UNAVAILABLE) - #undef JSON_HEDLEY_UNAVAILABLE -#endif -#if \ - JSON_HEDLEY_HAS_ATTRIBUTE(warning) || \ - JSON_HEDLEY_GCC_VERSION_CHECK(4,3,0) || \ - JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) - #define JSON_HEDLEY_UNAVAILABLE(available_since) __attribute__((__warning__("Not available until " #available_since))) -#else - #define JSON_HEDLEY_UNAVAILABLE(available_since) -#endif - -#if defined(JSON_HEDLEY_WARN_UNUSED_RESULT) - #undef JSON_HEDLEY_WARN_UNUSED_RESULT -#endif -#if defined(JSON_HEDLEY_WARN_UNUSED_RESULT_MSG) - #undef JSON_HEDLEY_WARN_UNUSED_RESULT_MSG -#endif -#if \ - JSON_HEDLEY_HAS_ATTRIBUTE(warn_unused_result) || \ - JSON_HEDLEY_GCC_VERSION_CHECK(3,4,0) || \ - JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ - (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ - (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ - (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ - (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ - JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ - (JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,15,0) && defined(__cplusplus)) || \ - JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) || \ - JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) - #define JSON_HEDLEY_WARN_UNUSED_RESULT __attribute__((__warn_unused_result__)) - #define JSON_HEDLEY_WARN_UNUSED_RESULT_MSG(msg) __attribute__((__warn_unused_result__)) -#elif (JSON_HEDLEY_HAS_CPP_ATTRIBUTE(nodiscard) >= 201907L) - #define JSON_HEDLEY_WARN_UNUSED_RESULT JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]]) - #define JSON_HEDLEY_WARN_UNUSED_RESULT_MSG(msg) JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard(msg)]]) -#elif JSON_HEDLEY_HAS_CPP_ATTRIBUTE(nodiscard) - #define JSON_HEDLEY_WARN_UNUSED_RESULT JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]]) - #define JSON_HEDLEY_WARN_UNUSED_RESULT_MSG(msg) JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]]) -#elif defined(_Check_return_) /* SAL */ - #define JSON_HEDLEY_WARN_UNUSED_RESULT _Check_return_ - #define JSON_HEDLEY_WARN_UNUSED_RESULT_MSG(msg) _Check_return_ -#else - #define JSON_HEDLEY_WARN_UNUSED_RESULT - #define JSON_HEDLEY_WARN_UNUSED_RESULT_MSG(msg) -#endif - -#if defined(JSON_HEDLEY_SENTINEL) - #undef JSON_HEDLEY_SENTINEL -#endif -#if \ - JSON_HEDLEY_HAS_ATTRIBUTE(sentinel) || \ - JSON_HEDLEY_GCC_VERSION_CHECK(4,0,0) || \ - JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - JSON_HEDLEY_ARM_VERSION_CHECK(5,4,0) || \ - JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) - #define JSON_HEDLEY_SENTINEL(position) __attribute__((__sentinel__(position))) -#else - #define JSON_HEDLEY_SENTINEL(position) -#endif - -#if defined(JSON_HEDLEY_NO_RETURN) - #undef JSON_HEDLEY_NO_RETURN -#endif -#if JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) - #define JSON_HEDLEY_NO_RETURN __noreturn -#elif \ - JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) - #define JSON_HEDLEY_NO_RETURN __attribute__((__noreturn__)) -#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L - #define JSON_HEDLEY_NO_RETURN _Noreturn -#elif defined(__cplusplus) && (__cplusplus >= 201103L) - #define JSON_HEDLEY_NO_RETURN JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[noreturn]]) -#elif \ - JSON_HEDLEY_HAS_ATTRIBUTE(noreturn) || \ - JSON_HEDLEY_GCC_VERSION_CHECK(3,2,0) || \ - JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ - JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ - JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ - (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ - (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ - (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ - (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ - JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ - JSON_HEDLEY_IAR_VERSION_CHECK(8,10,0) - #define JSON_HEDLEY_NO_RETURN __attribute__((__noreturn__)) -#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) - #define JSON_HEDLEY_NO_RETURN _Pragma("does_not_return") -#elif \ - JSON_HEDLEY_MSVC_VERSION_CHECK(13,10,0) || \ - JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) - #define JSON_HEDLEY_NO_RETURN __declspec(noreturn) -#elif JSON_HEDLEY_TI_CL6X_VERSION_CHECK(6,0,0) && defined(__cplusplus) - #define JSON_HEDLEY_NO_RETURN _Pragma("FUNC_NEVER_RETURNS;") -#elif JSON_HEDLEY_COMPCERT_VERSION_CHECK(3,2,0) - #define JSON_HEDLEY_NO_RETURN __attribute((noreturn)) -#elif JSON_HEDLEY_PELLES_VERSION_CHECK(9,0,0) - #define JSON_HEDLEY_NO_RETURN __declspec(noreturn) -#else - #define JSON_HEDLEY_NO_RETURN -#endif - -#if defined(JSON_HEDLEY_NO_ESCAPE) - #undef JSON_HEDLEY_NO_ESCAPE -#endif -#if JSON_HEDLEY_HAS_ATTRIBUTE(noescape) - #define JSON_HEDLEY_NO_ESCAPE __attribute__((__noescape__)) -#else - #define JSON_HEDLEY_NO_ESCAPE -#endif - -#if defined(JSON_HEDLEY_UNREACHABLE) - #undef JSON_HEDLEY_UNREACHABLE -#endif -#if defined(JSON_HEDLEY_UNREACHABLE_RETURN) - #undef JSON_HEDLEY_UNREACHABLE_RETURN -#endif -#if defined(JSON_HEDLEY_ASSUME) - #undef JSON_HEDLEY_ASSUME -#endif -#if \ - JSON_HEDLEY_MSVC_VERSION_CHECK(13,10,0) || \ - JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) - #define JSON_HEDLEY_ASSUME(expr) __assume(expr) -#elif JSON_HEDLEY_HAS_BUILTIN(__builtin_assume) - #define JSON_HEDLEY_ASSUME(expr) __builtin_assume(expr) -#elif \ - JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,2,0) || \ - JSON_HEDLEY_TI_CL6X_VERSION_CHECK(4,0,0) - #if defined(__cplusplus) - #define JSON_HEDLEY_ASSUME(expr) std::_nassert(expr) - #else - #define JSON_HEDLEY_ASSUME(expr) _nassert(expr) - #endif -#endif -#if \ - (JSON_HEDLEY_HAS_BUILTIN(__builtin_unreachable) && (!defined(JSON_HEDLEY_ARM_VERSION))) || \ - JSON_HEDLEY_GCC_VERSION_CHECK(4,5,0) || \ - JSON_HEDLEY_PGI_VERSION_CHECK(18,10,0) || \ - JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - JSON_HEDLEY_IBM_VERSION_CHECK(13,1,5) || \ - JSON_HEDLEY_CRAY_VERSION_CHECK(10,0,0) || \ - JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) - #define JSON_HEDLEY_UNREACHABLE() __builtin_unreachable() -#elif defined(JSON_HEDLEY_ASSUME) - #define JSON_HEDLEY_UNREACHABLE() JSON_HEDLEY_ASSUME(0) -#endif -#if !defined(JSON_HEDLEY_ASSUME) - #if defined(JSON_HEDLEY_UNREACHABLE) - #define JSON_HEDLEY_ASSUME(expr) JSON_HEDLEY_STATIC_CAST(void, ((expr) ? 1 : (JSON_HEDLEY_UNREACHABLE(), 1))) - #else - #define JSON_HEDLEY_ASSUME(expr) JSON_HEDLEY_STATIC_CAST(void, expr) - #endif -#endif -#if defined(JSON_HEDLEY_UNREACHABLE) - #if \ - JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,2,0) || \ - JSON_HEDLEY_TI_CL6X_VERSION_CHECK(4,0,0) - #define JSON_HEDLEY_UNREACHABLE_RETURN(value) return (JSON_HEDLEY_STATIC_CAST(void, JSON_HEDLEY_ASSUME(0)), (value)) - #else - #define JSON_HEDLEY_UNREACHABLE_RETURN(value) JSON_HEDLEY_UNREACHABLE() - #endif -#else - #define JSON_HEDLEY_UNREACHABLE_RETURN(value) return (value) -#endif -#if !defined(JSON_HEDLEY_UNREACHABLE) - #define JSON_HEDLEY_UNREACHABLE() JSON_HEDLEY_ASSUME(0) -#endif - -JSON_HEDLEY_DIAGNOSTIC_PUSH -#if JSON_HEDLEY_HAS_WARNING("-Wpedantic") - #pragma clang diagnostic ignored "-Wpedantic" -#endif -#if JSON_HEDLEY_HAS_WARNING("-Wc++98-compat-pedantic") && defined(__cplusplus) - #pragma clang diagnostic ignored "-Wc++98-compat-pedantic" -#endif -#if JSON_HEDLEY_GCC_HAS_WARNING("-Wvariadic-macros",4,0,0) - #if defined(__clang__) - #pragma clang diagnostic ignored "-Wvariadic-macros" - #elif defined(JSON_HEDLEY_GCC_VERSION) - #pragma GCC diagnostic ignored "-Wvariadic-macros" - #endif -#endif -#if defined(JSON_HEDLEY_NON_NULL) - #undef JSON_HEDLEY_NON_NULL -#endif -#if \ - JSON_HEDLEY_HAS_ATTRIBUTE(nonnull) || \ - JSON_HEDLEY_GCC_VERSION_CHECK(3,3,0) || \ - JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) - #define JSON_HEDLEY_NON_NULL(...) __attribute__((__nonnull__(__VA_ARGS__))) -#else - #define JSON_HEDLEY_NON_NULL(...) -#endif -JSON_HEDLEY_DIAGNOSTIC_POP - -#if defined(JSON_HEDLEY_PRINTF_FORMAT) - #undef JSON_HEDLEY_PRINTF_FORMAT -#endif -#if defined(__MINGW32__) && JSON_HEDLEY_GCC_HAS_ATTRIBUTE(format,4,4,0) && !defined(__USE_MINGW_ANSI_STDIO) - #define JSON_HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __attribute__((__format__(ms_printf, string_idx, first_to_check))) -#elif defined(__MINGW32__) && JSON_HEDLEY_GCC_HAS_ATTRIBUTE(format,4,4,0) && defined(__USE_MINGW_ANSI_STDIO) - #define JSON_HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __attribute__((__format__(gnu_printf, string_idx, first_to_check))) -#elif \ - JSON_HEDLEY_HAS_ATTRIBUTE(format) || \ - JSON_HEDLEY_GCC_VERSION_CHECK(3,1,0) || \ - JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - JSON_HEDLEY_ARM_VERSION_CHECK(5,6,0) || \ - JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ - JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ - (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ - (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ - (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ - (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ - JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ - JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) - #define JSON_HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __attribute__((__format__(__printf__, string_idx, first_to_check))) -#elif JSON_HEDLEY_PELLES_VERSION_CHECK(6,0,0) - #define JSON_HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __declspec(vaformat(printf,string_idx,first_to_check)) -#else - #define JSON_HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) -#endif - -#if defined(JSON_HEDLEY_CONSTEXPR) - #undef JSON_HEDLEY_CONSTEXPR -#endif -#if defined(__cplusplus) - #if __cplusplus >= 201103L - #define JSON_HEDLEY_CONSTEXPR JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(constexpr) - #endif -#endif -#if !defined(JSON_HEDLEY_CONSTEXPR) - #define JSON_HEDLEY_CONSTEXPR -#endif - -#if defined(JSON_HEDLEY_PREDICT) - #undef JSON_HEDLEY_PREDICT -#endif -#if defined(JSON_HEDLEY_LIKELY) - #undef JSON_HEDLEY_LIKELY -#endif -#if defined(JSON_HEDLEY_UNLIKELY) - #undef JSON_HEDLEY_UNLIKELY -#endif -#if defined(JSON_HEDLEY_UNPREDICTABLE) - #undef JSON_HEDLEY_UNPREDICTABLE -#endif -#if JSON_HEDLEY_HAS_BUILTIN(__builtin_unpredictable) - #define JSON_HEDLEY_UNPREDICTABLE(expr) __builtin_unpredictable((expr)) -#endif -#if \ - (JSON_HEDLEY_HAS_BUILTIN(__builtin_expect_with_probability) && !defined(JSON_HEDLEY_PGI_VERSION)) || \ - JSON_HEDLEY_GCC_VERSION_CHECK(9,0,0) || \ - JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) -# define JSON_HEDLEY_PREDICT(expr, value, probability) __builtin_expect_with_probability( (expr), (value), (probability)) -# define JSON_HEDLEY_PREDICT_TRUE(expr, probability) __builtin_expect_with_probability(!!(expr), 1 , (probability)) -# define JSON_HEDLEY_PREDICT_FALSE(expr, probability) __builtin_expect_with_probability(!!(expr), 0 , (probability)) -# define JSON_HEDLEY_LIKELY(expr) __builtin_expect (!!(expr), 1 ) -# define JSON_HEDLEY_UNLIKELY(expr) __builtin_expect (!!(expr), 0 ) -#elif \ - (JSON_HEDLEY_HAS_BUILTIN(__builtin_expect) && !defined(JSON_HEDLEY_INTEL_CL_VERSION)) || \ - JSON_HEDLEY_GCC_VERSION_CHECK(3,0,0) || \ - JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - (JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,15,0) && defined(__cplusplus)) || \ - JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ - JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ - JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,7,0) || \ - JSON_HEDLEY_TI_CL430_VERSION_CHECK(3,1,0) || \ - JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,1,0) || \ - JSON_HEDLEY_TI_CL6X_VERSION_CHECK(6,1,0) || \ - JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ - JSON_HEDLEY_TINYC_VERSION_CHECK(0,9,27) || \ - JSON_HEDLEY_CRAY_VERSION_CHECK(8,1,0) || \ - JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) -# define JSON_HEDLEY_PREDICT(expr, expected, probability) \ - (((probability) >= 0.9) ? __builtin_expect((expr), (expected)) : (JSON_HEDLEY_STATIC_CAST(void, expected), (expr))) -# define JSON_HEDLEY_PREDICT_TRUE(expr, probability) \ - (__extension__ ({ \ - double hedley_probability_ = (probability); \ - ((hedley_probability_ >= 0.9) ? __builtin_expect(!!(expr), 1) : ((hedley_probability_ <= 0.1) ? __builtin_expect(!!(expr), 0) : !!(expr))); \ - })) -# define JSON_HEDLEY_PREDICT_FALSE(expr, probability) \ - (__extension__ ({ \ - double hedley_probability_ = (probability); \ - ((hedley_probability_ >= 0.9) ? __builtin_expect(!!(expr), 0) : ((hedley_probability_ <= 0.1) ? __builtin_expect(!!(expr), 1) : !!(expr))); \ - })) -# define JSON_HEDLEY_LIKELY(expr) __builtin_expect(!!(expr), 1) -# define JSON_HEDLEY_UNLIKELY(expr) __builtin_expect(!!(expr), 0) -#else -# define JSON_HEDLEY_PREDICT(expr, expected, probability) (JSON_HEDLEY_STATIC_CAST(void, expected), (expr)) -# define JSON_HEDLEY_PREDICT_TRUE(expr, probability) (!!(expr)) -# define JSON_HEDLEY_PREDICT_FALSE(expr, probability) (!!(expr)) -# define JSON_HEDLEY_LIKELY(expr) (!!(expr)) -# define JSON_HEDLEY_UNLIKELY(expr) (!!(expr)) -#endif -#if !defined(JSON_HEDLEY_UNPREDICTABLE) - #define JSON_HEDLEY_UNPREDICTABLE(expr) JSON_HEDLEY_PREDICT(expr, 1, 0.5) -#endif - -#if defined(JSON_HEDLEY_MALLOC) - #undef JSON_HEDLEY_MALLOC -#endif -#if \ - JSON_HEDLEY_HAS_ATTRIBUTE(malloc) || \ - JSON_HEDLEY_GCC_VERSION_CHECK(3,1,0) || \ - JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ - JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - JSON_HEDLEY_IBM_VERSION_CHECK(12,1,0) || \ - JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ - (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ - (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ - (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ - (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ - JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ - JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) - #define JSON_HEDLEY_MALLOC __attribute__((__malloc__)) -#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) - #define JSON_HEDLEY_MALLOC _Pragma("returns_new_memory") -#elif \ - JSON_HEDLEY_MSVC_VERSION_CHECK(14,0,0) || \ - JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) - #define JSON_HEDLEY_MALLOC __declspec(restrict) -#else - #define JSON_HEDLEY_MALLOC -#endif - -#if defined(JSON_HEDLEY_PURE) - #undef JSON_HEDLEY_PURE -#endif -#if \ - JSON_HEDLEY_HAS_ATTRIBUTE(pure) || \ - JSON_HEDLEY_GCC_VERSION_CHECK(2,96,0) || \ - JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ - JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ - JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ - (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ - (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ - (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ - (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ - JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ - JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) || \ - JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) -# define JSON_HEDLEY_PURE __attribute__((__pure__)) -#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) -# define JSON_HEDLEY_PURE _Pragma("does_not_write_global_data") -#elif defined(__cplusplus) && \ - ( \ - JSON_HEDLEY_TI_CL430_VERSION_CHECK(2,0,1) || \ - JSON_HEDLEY_TI_CL6X_VERSION_CHECK(4,0,0) || \ - JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) \ - ) -# define JSON_HEDLEY_PURE _Pragma("FUNC_IS_PURE;") -#else -# define JSON_HEDLEY_PURE -#endif - -#if defined(JSON_HEDLEY_CONST) - #undef JSON_HEDLEY_CONST -#endif -#if \ - JSON_HEDLEY_HAS_ATTRIBUTE(const) || \ - JSON_HEDLEY_GCC_VERSION_CHECK(2,5,0) || \ - JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ - JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ - JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ - (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ - (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ - (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ - (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ - JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ - JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) || \ - JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) - #define JSON_HEDLEY_CONST __attribute__((__const__)) -#elif \ - JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) - #define JSON_HEDLEY_CONST _Pragma("no_side_effect") -#else - #define JSON_HEDLEY_CONST JSON_HEDLEY_PURE -#endif - -#if defined(JSON_HEDLEY_RESTRICT) - #undef JSON_HEDLEY_RESTRICT -#endif -#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) && !defined(__cplusplus) - #define JSON_HEDLEY_RESTRICT restrict -#elif \ - JSON_HEDLEY_GCC_VERSION_CHECK(3,1,0) || \ - JSON_HEDLEY_MSVC_VERSION_CHECK(14,0,0) || \ - JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) || \ - JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ - JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) || \ - JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ - JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,2,4) || \ - JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,1,0) || \ - JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - (JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,14,0) && defined(__cplusplus)) || \ - JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) || \ - defined(__clang__) || \ - JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) - #define JSON_HEDLEY_RESTRICT __restrict -#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,3,0) && !defined(__cplusplus) - #define JSON_HEDLEY_RESTRICT _Restrict -#else - #define JSON_HEDLEY_RESTRICT -#endif - -#if defined(JSON_HEDLEY_INLINE) - #undef JSON_HEDLEY_INLINE -#endif -#if \ - (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) || \ - (defined(__cplusplus) && (__cplusplus >= 199711L)) - #define JSON_HEDLEY_INLINE inline -#elif \ - defined(JSON_HEDLEY_GCC_VERSION) || \ - JSON_HEDLEY_ARM_VERSION_CHECK(6,2,0) - #define JSON_HEDLEY_INLINE __inline__ -#elif \ - JSON_HEDLEY_MSVC_VERSION_CHECK(12,0,0) || \ - JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) || \ - JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,1,0) || \ - JSON_HEDLEY_TI_CL430_VERSION_CHECK(3,1,0) || \ - JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,2,0) || \ - JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,0,0) || \ - JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ - JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) - #define JSON_HEDLEY_INLINE __inline -#else - #define JSON_HEDLEY_INLINE -#endif - -#if defined(JSON_HEDLEY_ALWAYS_INLINE) - #undef JSON_HEDLEY_ALWAYS_INLINE -#endif -#if \ - JSON_HEDLEY_HAS_ATTRIBUTE(always_inline) || \ - JSON_HEDLEY_GCC_VERSION_CHECK(4,0,0) || \ - JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ - JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ - JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ - (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ - (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ - (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ - (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ - JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ - JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) || \ - JSON_HEDLEY_IAR_VERSION_CHECK(8,10,0) -# define JSON_HEDLEY_ALWAYS_INLINE __attribute__((__always_inline__)) JSON_HEDLEY_INLINE -#elif \ - JSON_HEDLEY_MSVC_VERSION_CHECK(12,0,0) || \ - JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) -# define JSON_HEDLEY_ALWAYS_INLINE __forceinline -#elif defined(__cplusplus) && \ - ( \ - JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ - JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ - JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ - JSON_HEDLEY_TI_CL6X_VERSION_CHECK(6,1,0) || \ - JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) \ - ) -# define JSON_HEDLEY_ALWAYS_INLINE _Pragma("FUNC_ALWAYS_INLINE;") -#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) -# define JSON_HEDLEY_ALWAYS_INLINE _Pragma("inline=forced") -#else -# define JSON_HEDLEY_ALWAYS_INLINE JSON_HEDLEY_INLINE -#endif - -#if defined(JSON_HEDLEY_NEVER_INLINE) - #undef JSON_HEDLEY_NEVER_INLINE -#endif -#if \ - JSON_HEDLEY_HAS_ATTRIBUTE(noinline) || \ - JSON_HEDLEY_GCC_VERSION_CHECK(4,0,0) || \ - JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ - JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ - JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ - (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ - (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ - (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ - (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ - JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ - JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) || \ - JSON_HEDLEY_IAR_VERSION_CHECK(8,10,0) - #define JSON_HEDLEY_NEVER_INLINE __attribute__((__noinline__)) -#elif \ - JSON_HEDLEY_MSVC_VERSION_CHECK(13,10,0) || \ - JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) - #define JSON_HEDLEY_NEVER_INLINE __declspec(noinline) -#elif JSON_HEDLEY_PGI_VERSION_CHECK(10,2,0) - #define JSON_HEDLEY_NEVER_INLINE _Pragma("noinline") -#elif JSON_HEDLEY_TI_CL6X_VERSION_CHECK(6,0,0) && defined(__cplusplus) - #define JSON_HEDLEY_NEVER_INLINE _Pragma("FUNC_CANNOT_INLINE;") -#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) - #define JSON_HEDLEY_NEVER_INLINE _Pragma("inline=never") -#elif JSON_HEDLEY_COMPCERT_VERSION_CHECK(3,2,0) - #define JSON_HEDLEY_NEVER_INLINE __attribute((noinline)) -#elif JSON_HEDLEY_PELLES_VERSION_CHECK(9,0,0) - #define JSON_HEDLEY_NEVER_INLINE __declspec(noinline) -#else - #define JSON_HEDLEY_NEVER_INLINE -#endif - -#if defined(JSON_HEDLEY_PRIVATE) - #undef JSON_HEDLEY_PRIVATE -#endif -#if defined(JSON_HEDLEY_PUBLIC) - #undef JSON_HEDLEY_PUBLIC -#endif -#if defined(JSON_HEDLEY_IMPORT) - #undef JSON_HEDLEY_IMPORT -#endif -#if defined(_WIN32) || defined(__CYGWIN__) -# define JSON_HEDLEY_PRIVATE -# define JSON_HEDLEY_PUBLIC __declspec(dllexport) -# define JSON_HEDLEY_IMPORT __declspec(dllimport) -#else -# if \ - JSON_HEDLEY_HAS_ATTRIBUTE(visibility) || \ - JSON_HEDLEY_GCC_VERSION_CHECK(3,3,0) || \ - JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ - JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - JSON_HEDLEY_IBM_VERSION_CHECK(13,1,0) || \ - ( \ - defined(__TI_EABI__) && \ - ( \ - (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) \ - ) \ - ) || \ - JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) -# define JSON_HEDLEY_PRIVATE __attribute__((__visibility__("hidden"))) -# define JSON_HEDLEY_PUBLIC __attribute__((__visibility__("default"))) -# else -# define JSON_HEDLEY_PRIVATE -# define JSON_HEDLEY_PUBLIC -# endif -# define JSON_HEDLEY_IMPORT extern -#endif - -#if defined(JSON_HEDLEY_NO_THROW) - #undef JSON_HEDLEY_NO_THROW -#endif -#if \ - JSON_HEDLEY_HAS_ATTRIBUTE(nothrow) || \ - JSON_HEDLEY_GCC_VERSION_CHECK(3,3,0) || \ - JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) - #define JSON_HEDLEY_NO_THROW __attribute__((__nothrow__)) -#elif \ - JSON_HEDLEY_MSVC_VERSION_CHECK(13,1,0) || \ - JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) || \ - JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) - #define JSON_HEDLEY_NO_THROW __declspec(nothrow) -#else - #define JSON_HEDLEY_NO_THROW -#endif - -#if defined(JSON_HEDLEY_FALL_THROUGH) - #undef JSON_HEDLEY_FALL_THROUGH -#endif -#if \ - JSON_HEDLEY_HAS_ATTRIBUTE(fallthrough) || \ - JSON_HEDLEY_GCC_VERSION_CHECK(7,0,0) || \ - JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) - #define JSON_HEDLEY_FALL_THROUGH __attribute__((__fallthrough__)) -#elif JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS(clang,fallthrough) - #define JSON_HEDLEY_FALL_THROUGH JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[clang::fallthrough]]) -#elif JSON_HEDLEY_HAS_CPP_ATTRIBUTE(fallthrough) - #define JSON_HEDLEY_FALL_THROUGH JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[fallthrough]]) -#elif defined(__fallthrough) /* SAL */ - #define JSON_HEDLEY_FALL_THROUGH __fallthrough -#else - #define JSON_HEDLEY_FALL_THROUGH -#endif - -#if defined(JSON_HEDLEY_RETURNS_NON_NULL) - #undef JSON_HEDLEY_RETURNS_NON_NULL -#endif -#if \ - JSON_HEDLEY_HAS_ATTRIBUTE(returns_nonnull) || \ - JSON_HEDLEY_GCC_VERSION_CHECK(4,9,0) || \ - JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) - #define JSON_HEDLEY_RETURNS_NON_NULL __attribute__((__returns_nonnull__)) -#elif defined(_Ret_notnull_) /* SAL */ - #define JSON_HEDLEY_RETURNS_NON_NULL _Ret_notnull_ -#else - #define JSON_HEDLEY_RETURNS_NON_NULL -#endif - -#if defined(JSON_HEDLEY_ARRAY_PARAM) - #undef JSON_HEDLEY_ARRAY_PARAM -#endif -#if \ - defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) && \ - !defined(__STDC_NO_VLA__) && \ - !defined(__cplusplus) && \ - !defined(JSON_HEDLEY_PGI_VERSION) && \ - !defined(JSON_HEDLEY_TINYC_VERSION) - #define JSON_HEDLEY_ARRAY_PARAM(name) (name) -#else - #define JSON_HEDLEY_ARRAY_PARAM(name) -#endif - -#if defined(JSON_HEDLEY_IS_CONSTANT) - #undef JSON_HEDLEY_IS_CONSTANT -#endif -#if defined(JSON_HEDLEY_REQUIRE_CONSTEXPR) - #undef JSON_HEDLEY_REQUIRE_CONSTEXPR -#endif -/* JSON_HEDLEY_IS_CONSTEXPR_ is for - HEDLEY INTERNAL USE ONLY. API subject to change without notice. */ -#if defined(JSON_HEDLEY_IS_CONSTEXPR_) - #undef JSON_HEDLEY_IS_CONSTEXPR_ -#endif -#if \ - JSON_HEDLEY_HAS_BUILTIN(__builtin_constant_p) || \ - JSON_HEDLEY_GCC_VERSION_CHECK(3,4,0) || \ - JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - JSON_HEDLEY_TINYC_VERSION_CHECK(0,9,19) || \ - JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - JSON_HEDLEY_IBM_VERSION_CHECK(13,1,0) || \ - JSON_HEDLEY_TI_CL6X_VERSION_CHECK(6,1,0) || \ - (JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) && !defined(__cplusplus)) || \ - JSON_HEDLEY_CRAY_VERSION_CHECK(8,1,0) || \ - JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) - #define JSON_HEDLEY_IS_CONSTANT(expr) __builtin_constant_p(expr) -#endif -#if !defined(__cplusplus) -# if \ - JSON_HEDLEY_HAS_BUILTIN(__builtin_types_compatible_p) || \ - JSON_HEDLEY_GCC_VERSION_CHECK(3,4,0) || \ - JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - JSON_HEDLEY_IBM_VERSION_CHECK(13,1,0) || \ - JSON_HEDLEY_CRAY_VERSION_CHECK(8,1,0) || \ - JSON_HEDLEY_ARM_VERSION_CHECK(5,4,0) || \ - JSON_HEDLEY_TINYC_VERSION_CHECK(0,9,24) -#if defined(__INTPTR_TYPE__) - #define JSON_HEDLEY_IS_CONSTEXPR_(expr) __builtin_types_compatible_p(__typeof__((1 ? (void*) ((__INTPTR_TYPE__) ((expr) * 0)) : (int*) 0)), int*) -#else - #include - #define JSON_HEDLEY_IS_CONSTEXPR_(expr) __builtin_types_compatible_p(__typeof__((1 ? (void*) ((intptr_t) ((expr) * 0)) : (int*) 0)), int*) -#endif -# elif \ - ( \ - defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && \ - !defined(JSON_HEDLEY_SUNPRO_VERSION) && \ - !defined(JSON_HEDLEY_PGI_VERSION) && \ - !defined(JSON_HEDLEY_IAR_VERSION)) || \ - (JSON_HEDLEY_HAS_EXTENSION(c_generic_selections) && !defined(JSON_HEDLEY_IAR_VERSION)) || \ - JSON_HEDLEY_GCC_VERSION_CHECK(4,9,0) || \ - JSON_HEDLEY_INTEL_VERSION_CHECK(17,0,0) || \ - JSON_HEDLEY_IBM_VERSION_CHECK(12,1,0) || \ - JSON_HEDLEY_ARM_VERSION_CHECK(5,3,0) -#if defined(__INTPTR_TYPE__) - #define JSON_HEDLEY_IS_CONSTEXPR_(expr) _Generic((1 ? (void*) ((__INTPTR_TYPE__) ((expr) * 0)) : (int*) 0), int*: 1, void*: 0) -#else - #include - #define JSON_HEDLEY_IS_CONSTEXPR_(expr) _Generic((1 ? (void*) ((intptr_t) * 0) : (int*) 0), int*: 1, void*: 0) -#endif -# elif \ - defined(JSON_HEDLEY_GCC_VERSION) || \ - defined(JSON_HEDLEY_INTEL_VERSION) || \ - defined(JSON_HEDLEY_TINYC_VERSION) || \ - defined(JSON_HEDLEY_TI_ARMCL_VERSION) || \ - JSON_HEDLEY_TI_CL430_VERSION_CHECK(18,12,0) || \ - defined(JSON_HEDLEY_TI_CL2000_VERSION) || \ - defined(JSON_HEDLEY_TI_CL6X_VERSION) || \ - defined(JSON_HEDLEY_TI_CL7X_VERSION) || \ - defined(JSON_HEDLEY_TI_CLPRU_VERSION) || \ - defined(__clang__) -# define JSON_HEDLEY_IS_CONSTEXPR_(expr) ( \ - sizeof(void) != \ - sizeof(*( \ - 1 ? \ - ((void*) ((expr) * 0L) ) : \ -((struct { char v[sizeof(void) * 2]; } *) 1) \ - ) \ - ) \ - ) -# endif -#endif -#if defined(JSON_HEDLEY_IS_CONSTEXPR_) - #if !defined(JSON_HEDLEY_IS_CONSTANT) - #define JSON_HEDLEY_IS_CONSTANT(expr) JSON_HEDLEY_IS_CONSTEXPR_(expr) - #endif - #define JSON_HEDLEY_REQUIRE_CONSTEXPR(expr) (JSON_HEDLEY_IS_CONSTEXPR_(expr) ? (expr) : (-1)) -#else - #if !defined(JSON_HEDLEY_IS_CONSTANT) - #define JSON_HEDLEY_IS_CONSTANT(expr) (0) - #endif - #define JSON_HEDLEY_REQUIRE_CONSTEXPR(expr) (expr) -#endif - -#if defined(JSON_HEDLEY_BEGIN_C_DECLS) - #undef JSON_HEDLEY_BEGIN_C_DECLS -#endif -#if defined(JSON_HEDLEY_END_C_DECLS) - #undef JSON_HEDLEY_END_C_DECLS -#endif -#if defined(JSON_HEDLEY_C_DECL) - #undef JSON_HEDLEY_C_DECL -#endif -#if defined(__cplusplus) - #define JSON_HEDLEY_BEGIN_C_DECLS extern "C" { - #define JSON_HEDLEY_END_C_DECLS } - #define JSON_HEDLEY_C_DECL extern "C" -#else - #define JSON_HEDLEY_BEGIN_C_DECLS - #define JSON_HEDLEY_END_C_DECLS - #define JSON_HEDLEY_C_DECL -#endif - -#if defined(JSON_HEDLEY_STATIC_ASSERT) - #undef JSON_HEDLEY_STATIC_ASSERT -#endif -#if \ - !defined(__cplusplus) && ( \ - (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) || \ - (JSON_HEDLEY_HAS_FEATURE(c_static_assert) && !defined(JSON_HEDLEY_INTEL_CL_VERSION)) || \ - JSON_HEDLEY_GCC_VERSION_CHECK(6,0,0) || \ - JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - defined(_Static_assert) \ - ) -# define JSON_HEDLEY_STATIC_ASSERT(expr, message) _Static_assert(expr, message) -#elif \ - (defined(__cplusplus) && (__cplusplus >= 201103L)) || \ - JSON_HEDLEY_MSVC_VERSION_CHECK(16,0,0) || \ - JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) -# define JSON_HEDLEY_STATIC_ASSERT(expr, message) JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(static_assert(expr, message)) -#else -# define JSON_HEDLEY_STATIC_ASSERT(expr, message) -#endif - -#if defined(JSON_HEDLEY_NULL) - #undef JSON_HEDLEY_NULL -#endif -#if defined(__cplusplus) - #if __cplusplus >= 201103L - #define JSON_HEDLEY_NULL JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(nullptr) - #elif defined(NULL) - #define JSON_HEDLEY_NULL NULL - #else - #define JSON_HEDLEY_NULL JSON_HEDLEY_STATIC_CAST(void*, 0) - #endif -#elif defined(NULL) - #define JSON_HEDLEY_NULL NULL -#else - #define JSON_HEDLEY_NULL ((void*) 0) -#endif - -#if defined(JSON_HEDLEY_MESSAGE) - #undef JSON_HEDLEY_MESSAGE -#endif -#if JSON_HEDLEY_HAS_WARNING("-Wunknown-pragmas") -# define JSON_HEDLEY_MESSAGE(msg) \ - JSON_HEDLEY_DIAGNOSTIC_PUSH \ - JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS \ - JSON_HEDLEY_PRAGMA(message msg) \ - JSON_HEDLEY_DIAGNOSTIC_POP -#elif \ - JSON_HEDLEY_GCC_VERSION_CHECK(4,4,0) || \ - JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) -# define JSON_HEDLEY_MESSAGE(msg) JSON_HEDLEY_PRAGMA(message msg) -#elif JSON_HEDLEY_CRAY_VERSION_CHECK(5,0,0) -# define JSON_HEDLEY_MESSAGE(msg) JSON_HEDLEY_PRAGMA(_CRI message msg) -#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) -# define JSON_HEDLEY_MESSAGE(msg) JSON_HEDLEY_PRAGMA(message(msg)) -#elif JSON_HEDLEY_PELLES_VERSION_CHECK(2,0,0) -# define JSON_HEDLEY_MESSAGE(msg) JSON_HEDLEY_PRAGMA(message(msg)) -#else -# define JSON_HEDLEY_MESSAGE(msg) -#endif - -#if defined(JSON_HEDLEY_WARNING) - #undef JSON_HEDLEY_WARNING -#endif -#if JSON_HEDLEY_HAS_WARNING("-Wunknown-pragmas") -# define JSON_HEDLEY_WARNING(msg) \ - JSON_HEDLEY_DIAGNOSTIC_PUSH \ - JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS \ - JSON_HEDLEY_PRAGMA(clang warning msg) \ - JSON_HEDLEY_DIAGNOSTIC_POP -#elif \ - JSON_HEDLEY_GCC_VERSION_CHECK(4,8,0) || \ - JSON_HEDLEY_PGI_VERSION_CHECK(18,4,0) || \ - JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) -# define JSON_HEDLEY_WARNING(msg) JSON_HEDLEY_PRAGMA(GCC warning msg) -#elif \ - JSON_HEDLEY_MSVC_VERSION_CHECK(15,0,0) || \ - JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) -# define JSON_HEDLEY_WARNING(msg) JSON_HEDLEY_PRAGMA(message(msg)) -#else -# define JSON_HEDLEY_WARNING(msg) JSON_HEDLEY_MESSAGE(msg) -#endif - -#if defined(JSON_HEDLEY_REQUIRE) - #undef JSON_HEDLEY_REQUIRE -#endif -#if defined(JSON_HEDLEY_REQUIRE_MSG) - #undef JSON_HEDLEY_REQUIRE_MSG -#endif -#if JSON_HEDLEY_HAS_ATTRIBUTE(diagnose_if) -# if JSON_HEDLEY_HAS_WARNING("-Wgcc-compat") -# define JSON_HEDLEY_REQUIRE(expr) \ - JSON_HEDLEY_DIAGNOSTIC_PUSH \ - _Pragma("clang diagnostic ignored \"-Wgcc-compat\"") \ - __attribute__((diagnose_if(!(expr), #expr, "error"))) \ - JSON_HEDLEY_DIAGNOSTIC_POP -# define JSON_HEDLEY_REQUIRE_MSG(expr,msg) \ - JSON_HEDLEY_DIAGNOSTIC_PUSH \ - _Pragma("clang diagnostic ignored \"-Wgcc-compat\"") \ - __attribute__((diagnose_if(!(expr), msg, "error"))) \ - JSON_HEDLEY_DIAGNOSTIC_POP -# else -# define JSON_HEDLEY_REQUIRE(expr) __attribute__((diagnose_if(!(expr), #expr, "error"))) -# define JSON_HEDLEY_REQUIRE_MSG(expr,msg) __attribute__((diagnose_if(!(expr), msg, "error"))) -# endif -#else -# define JSON_HEDLEY_REQUIRE(expr) -# define JSON_HEDLEY_REQUIRE_MSG(expr,msg) -#endif - -#if defined(JSON_HEDLEY_FLAGS) - #undef JSON_HEDLEY_FLAGS -#endif -#if JSON_HEDLEY_HAS_ATTRIBUTE(flag_enum) && (!defined(__cplusplus) || JSON_HEDLEY_HAS_WARNING("-Wbitfield-enum-conversion")) - #define JSON_HEDLEY_FLAGS __attribute__((__flag_enum__)) -#else - #define JSON_HEDLEY_FLAGS -#endif - -#if defined(JSON_HEDLEY_FLAGS_CAST) - #undef JSON_HEDLEY_FLAGS_CAST -#endif -#if JSON_HEDLEY_INTEL_VERSION_CHECK(19,0,0) -# define JSON_HEDLEY_FLAGS_CAST(T, expr) (__extension__ ({ \ - JSON_HEDLEY_DIAGNOSTIC_PUSH \ - _Pragma("warning(disable:188)") \ - ((T) (expr)); \ - JSON_HEDLEY_DIAGNOSTIC_POP \ - })) -#else -# define JSON_HEDLEY_FLAGS_CAST(T, expr) JSON_HEDLEY_STATIC_CAST(T, expr) -#endif - -#if defined(JSON_HEDLEY_EMPTY_BASES) - #undef JSON_HEDLEY_EMPTY_BASES -#endif -#if \ - (JSON_HEDLEY_MSVC_VERSION_CHECK(19,0,23918) && !JSON_HEDLEY_MSVC_VERSION_CHECK(20,0,0)) || \ - JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) - #define JSON_HEDLEY_EMPTY_BASES __declspec(empty_bases) -#else - #define JSON_HEDLEY_EMPTY_BASES -#endif - -/* Remaining macros are deprecated. */ - -#if defined(JSON_HEDLEY_GCC_NOT_CLANG_VERSION_CHECK) - #undef JSON_HEDLEY_GCC_NOT_CLANG_VERSION_CHECK -#endif -#if defined(__clang__) - #define JSON_HEDLEY_GCC_NOT_CLANG_VERSION_CHECK(major,minor,patch) (0) -#else - #define JSON_HEDLEY_GCC_NOT_CLANG_VERSION_CHECK(major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) -#endif - -#if defined(JSON_HEDLEY_CLANG_HAS_ATTRIBUTE) - #undef JSON_HEDLEY_CLANG_HAS_ATTRIBUTE -#endif -#define JSON_HEDLEY_CLANG_HAS_ATTRIBUTE(attribute) JSON_HEDLEY_HAS_ATTRIBUTE(attribute) - -#if defined(JSON_HEDLEY_CLANG_HAS_CPP_ATTRIBUTE) - #undef JSON_HEDLEY_CLANG_HAS_CPP_ATTRIBUTE -#endif -#define JSON_HEDLEY_CLANG_HAS_CPP_ATTRIBUTE(attribute) JSON_HEDLEY_HAS_CPP_ATTRIBUTE(attribute) - -#if defined(JSON_HEDLEY_CLANG_HAS_BUILTIN) - #undef JSON_HEDLEY_CLANG_HAS_BUILTIN -#endif -#define JSON_HEDLEY_CLANG_HAS_BUILTIN(builtin) JSON_HEDLEY_HAS_BUILTIN(builtin) - -#if defined(JSON_HEDLEY_CLANG_HAS_FEATURE) - #undef JSON_HEDLEY_CLANG_HAS_FEATURE -#endif -#define JSON_HEDLEY_CLANG_HAS_FEATURE(feature) JSON_HEDLEY_HAS_FEATURE(feature) - -#if defined(JSON_HEDLEY_CLANG_HAS_EXTENSION) - #undef JSON_HEDLEY_CLANG_HAS_EXTENSION -#endif -#define JSON_HEDLEY_CLANG_HAS_EXTENSION(extension) JSON_HEDLEY_HAS_EXTENSION(extension) - -#if defined(JSON_HEDLEY_CLANG_HAS_DECLSPEC_DECLSPEC_ATTRIBUTE) - #undef JSON_HEDLEY_CLANG_HAS_DECLSPEC_DECLSPEC_ATTRIBUTE -#endif -#define JSON_HEDLEY_CLANG_HAS_DECLSPEC_ATTRIBUTE(attribute) JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE(attribute) - -#if defined(JSON_HEDLEY_CLANG_HAS_WARNING) - #undef JSON_HEDLEY_CLANG_HAS_WARNING -#endif -#define JSON_HEDLEY_CLANG_HAS_WARNING(warning) JSON_HEDLEY_HAS_WARNING(warning) - -#endif /* !defined(JSON_HEDLEY_VERSION) || (JSON_HEDLEY_VERSION < X) */ - - -// This file contains all internal macro definitions (except those affecting ABI) -// You MUST include macro_unscope.hpp at the end of json.hpp to undef all of them - -// #include - - -// exclude unsupported compilers -#if !defined(JSON_SKIP_UNSUPPORTED_COMPILER_CHECK) - #if defined(__clang__) - #if (__clang_major__ * 10000 + __clang_minor__ * 100 + __clang_patchlevel__) < 30400 - #error "unsupported Clang version - see https://github.com/nlohmann/json#supported-compilers" - #endif - #elif defined(__GNUC__) && !(defined(__ICC) || defined(__INTEL_COMPILER)) - #if (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) < 40800 - #error "unsupported GCC version - see https://github.com/nlohmann/json#supported-compilers" - #endif - #endif -#endif - -// C++ language standard detection -// if the user manually specified the used c++ version this is skipped -#if !defined(JSON_HAS_CPP_20) && !defined(JSON_HAS_CPP_17) && !defined(JSON_HAS_CPP_14) && !defined(JSON_HAS_CPP_11) - #if (defined(__cplusplus) && __cplusplus >= 202002L) || (defined(_MSVC_LANG) && _MSVC_LANG >= 202002L) - #define JSON_HAS_CPP_20 - #define JSON_HAS_CPP_17 - #define JSON_HAS_CPP_14 - #elif (defined(__cplusplus) && __cplusplus >= 201703L) || (defined(_HAS_CXX17) && _HAS_CXX17 == 1) // fix for issue #464 - #define JSON_HAS_CPP_17 - #define JSON_HAS_CPP_14 - #elif (defined(__cplusplus) && __cplusplus >= 201402L) || (defined(_HAS_CXX14) && _HAS_CXX14 == 1) - #define JSON_HAS_CPP_14 - #endif - // the cpp 11 flag is always specified because it is the minimal required version - #define JSON_HAS_CPP_11 -#endif - -#ifdef __has_include - #if __has_include() - #include - #endif -#endif - -#if !defined(JSON_HAS_FILESYSTEM) && !defined(JSON_HAS_EXPERIMENTAL_FILESYSTEM) - #ifdef JSON_HAS_CPP_17 - #if defined(__cpp_lib_filesystem) - #define JSON_HAS_FILESYSTEM 1 - #elif defined(__cpp_lib_experimental_filesystem) - #define JSON_HAS_EXPERIMENTAL_FILESYSTEM 1 - #elif !defined(__has_include) - #define JSON_HAS_EXPERIMENTAL_FILESYSTEM 1 - #elif __has_include() - #define JSON_HAS_FILESYSTEM 1 - #elif __has_include() - #define JSON_HAS_EXPERIMENTAL_FILESYSTEM 1 - #endif - - // std::filesystem does not work on MinGW GCC 8: https://sourceforge.net/p/mingw-w64/bugs/737/ - #if defined(__MINGW32__) && defined(__GNUC__) && __GNUC__ == 8 - #undef JSON_HAS_FILESYSTEM - #undef JSON_HAS_EXPERIMENTAL_FILESYSTEM - #endif - - // no filesystem support before GCC 8: https://en.cppreference.com/w/cpp/compiler_support - #if defined(__GNUC__) && !defined(__clang__) && __GNUC__ < 8 - #undef JSON_HAS_FILESYSTEM - #undef JSON_HAS_EXPERIMENTAL_FILESYSTEM - #endif - - // no filesystem support before Clang 7: https://en.cppreference.com/w/cpp/compiler_support - #if defined(__clang_major__) && __clang_major__ < 7 - #undef JSON_HAS_FILESYSTEM - #undef JSON_HAS_EXPERIMENTAL_FILESYSTEM - #endif - - // no filesystem support before MSVC 19.14: https://en.cppreference.com/w/cpp/compiler_support - #if defined(_MSC_VER) && _MSC_VER < 1914 - #undef JSON_HAS_FILESYSTEM - #undef JSON_HAS_EXPERIMENTAL_FILESYSTEM - #endif - - // no filesystem support before iOS 13 - #if defined(__IPHONE_OS_VERSION_MIN_REQUIRED) && __IPHONE_OS_VERSION_MIN_REQUIRED < 130000 - #undef JSON_HAS_FILESYSTEM - #undef JSON_HAS_EXPERIMENTAL_FILESYSTEM - #endif - - // no filesystem support before macOS Catalina - #if defined(__MAC_OS_X_VERSION_MIN_REQUIRED) && __MAC_OS_X_VERSION_MIN_REQUIRED < 101500 - #undef JSON_HAS_FILESYSTEM - #undef JSON_HAS_EXPERIMENTAL_FILESYSTEM - #endif - #endif -#endif - -#ifndef JSON_HAS_EXPERIMENTAL_FILESYSTEM - #define JSON_HAS_EXPERIMENTAL_FILESYSTEM 0 -#endif - -#ifndef JSON_HAS_FILESYSTEM - #define JSON_HAS_FILESYSTEM 0 -#endif - -#ifndef JSON_HAS_THREE_WAY_COMPARISON - #if defined(__cpp_impl_three_way_comparison) && __cpp_impl_three_way_comparison >= 201907L \ - && defined(__cpp_lib_three_way_comparison) && __cpp_lib_three_way_comparison >= 201907L - #define JSON_HAS_THREE_WAY_COMPARISON 1 - #else - #define JSON_HAS_THREE_WAY_COMPARISON 0 - #endif -#endif - -#ifndef JSON_HAS_RANGES - // ranges header shipping in GCC 11.1.0 (released 2021-04-27) has syntax error - #if defined(__GLIBCXX__) && __GLIBCXX__ == 20210427 - #define JSON_HAS_RANGES 0 - #elif defined(__cpp_lib_ranges) - #define JSON_HAS_RANGES 1 - #else - #define JSON_HAS_RANGES 0 - #endif -#endif - -#ifndef JSON_HAS_STATIC_RTTI - #if !defined(_HAS_STATIC_RTTI) || _HAS_STATIC_RTTI != 0 - #define JSON_HAS_STATIC_RTTI 1 - #else - #define JSON_HAS_STATIC_RTTI 0 - #endif -#endif - -#ifdef JSON_HAS_CPP_17 - #define JSON_INLINE_VARIABLE inline -#else - #define JSON_INLINE_VARIABLE -#endif - -#if JSON_HEDLEY_HAS_ATTRIBUTE(no_unique_address) - #define JSON_NO_UNIQUE_ADDRESS [[no_unique_address]] -#else - #define JSON_NO_UNIQUE_ADDRESS -#endif - -// disable documentation warnings on clang -#if defined(__clang__) - #pragma clang diagnostic push - #pragma clang diagnostic ignored "-Wdocumentation" - #pragma clang diagnostic ignored "-Wdocumentation-unknown-command" -#endif - -// allow disabling exceptions -#if (defined(__cpp_exceptions) || defined(__EXCEPTIONS) || defined(_CPPUNWIND)) && !defined(JSON_NOEXCEPTION) - #define JSON_THROW(exception) throw exception - #define JSON_TRY try - #define JSON_CATCH(exception) catch(exception) - #define JSON_INTERNAL_CATCH(exception) catch(exception) -#else - #include - #define JSON_THROW(exception) std::abort() - #define JSON_TRY if(true) - #define JSON_CATCH(exception) if(false) - #define JSON_INTERNAL_CATCH(exception) if(false) -#endif - -// override exception macros -#if defined(JSON_THROW_USER) - #undef JSON_THROW - #define JSON_THROW JSON_THROW_USER -#endif -#if defined(JSON_TRY_USER) - #undef JSON_TRY - #define JSON_TRY JSON_TRY_USER -#endif -#if defined(JSON_CATCH_USER) - #undef JSON_CATCH - #define JSON_CATCH JSON_CATCH_USER - #undef JSON_INTERNAL_CATCH - #define JSON_INTERNAL_CATCH JSON_CATCH_USER -#endif -#if defined(JSON_INTERNAL_CATCH_USER) - #undef JSON_INTERNAL_CATCH - #define JSON_INTERNAL_CATCH JSON_INTERNAL_CATCH_USER -#endif - -// allow overriding assert -#if !defined(JSON_ASSERT) - #include // assert - #define JSON_ASSERT(x) assert(x) -#endif - -// allow to access some private functions (needed by the test suite) -#if defined(JSON_TESTS_PRIVATE) - #define JSON_PRIVATE_UNLESS_TESTED public -#else - #define JSON_PRIVATE_UNLESS_TESTED private -#endif - -/*! -@brief macro to briefly define a mapping between an enum and JSON -@def NLOHMANN_JSON_SERIALIZE_ENUM -@since version 3.4.0 -*/ -#define NLOHMANN_JSON_SERIALIZE_ENUM(ENUM_TYPE, ...) \ - template \ - inline void to_json(BasicJsonType& j, const ENUM_TYPE& e) \ - { \ - static_assert(std::is_enum::value, #ENUM_TYPE " must be an enum!"); \ - static const std::pair m[] = __VA_ARGS__; \ - auto it = std::find_if(std::begin(m), std::end(m), \ - [e](const std::pair& ej_pair) -> bool \ - { \ - return ej_pair.first == e; \ - }); \ - j = ((it != std::end(m)) ? it : std::begin(m))->second; \ - } \ - template \ - inline void from_json(const BasicJsonType& j, ENUM_TYPE& e) \ - { \ - static_assert(std::is_enum::value, #ENUM_TYPE " must be an enum!"); \ - static const std::pair m[] = __VA_ARGS__; \ - auto it = std::find_if(std::begin(m), std::end(m), \ - [&j](const std::pair& ej_pair) -> bool \ - { \ - return ej_pair.second == j; \ - }); \ - e = ((it != std::end(m)) ? it : std::begin(m))->first; \ - } - -// Ugly macros to avoid uglier copy-paste when specializing basic_json. They -// may be removed in the future once the class is split. - -#define NLOHMANN_BASIC_JSON_TPL_DECLARATION \ - template class ObjectType, \ - template class ArrayType, \ - class StringType, class BooleanType, class NumberIntegerType, \ - class NumberUnsignedType, class NumberFloatType, \ - template class AllocatorType, \ - template class JSONSerializer, \ - class BinaryType, \ - class CustomBaseClass> - -#define NLOHMANN_BASIC_JSON_TPL \ - basic_json - -// Macros to simplify conversion from/to types - -#define NLOHMANN_JSON_EXPAND( x ) x -#define NLOHMANN_JSON_GET_MACRO(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42, _43, _44, _45, _46, _47, _48, _49, _50, _51, _52, _53, _54, _55, _56, _57, _58, _59, _60, _61, _62, _63, _64, NAME,...) NAME -#define NLOHMANN_JSON_PASTE(...) NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_GET_MACRO(__VA_ARGS__, \ - NLOHMANN_JSON_PASTE64, \ - NLOHMANN_JSON_PASTE63, \ - NLOHMANN_JSON_PASTE62, \ - NLOHMANN_JSON_PASTE61, \ - NLOHMANN_JSON_PASTE60, \ - NLOHMANN_JSON_PASTE59, \ - NLOHMANN_JSON_PASTE58, \ - NLOHMANN_JSON_PASTE57, \ - NLOHMANN_JSON_PASTE56, \ - NLOHMANN_JSON_PASTE55, \ - NLOHMANN_JSON_PASTE54, \ - NLOHMANN_JSON_PASTE53, \ - NLOHMANN_JSON_PASTE52, \ - NLOHMANN_JSON_PASTE51, \ - NLOHMANN_JSON_PASTE50, \ - NLOHMANN_JSON_PASTE49, \ - NLOHMANN_JSON_PASTE48, \ - NLOHMANN_JSON_PASTE47, \ - NLOHMANN_JSON_PASTE46, \ - NLOHMANN_JSON_PASTE45, \ - NLOHMANN_JSON_PASTE44, \ - NLOHMANN_JSON_PASTE43, \ - NLOHMANN_JSON_PASTE42, \ - NLOHMANN_JSON_PASTE41, \ - NLOHMANN_JSON_PASTE40, \ - NLOHMANN_JSON_PASTE39, \ - NLOHMANN_JSON_PASTE38, \ - NLOHMANN_JSON_PASTE37, \ - NLOHMANN_JSON_PASTE36, \ - NLOHMANN_JSON_PASTE35, \ - NLOHMANN_JSON_PASTE34, \ - NLOHMANN_JSON_PASTE33, \ - NLOHMANN_JSON_PASTE32, \ - NLOHMANN_JSON_PASTE31, \ - NLOHMANN_JSON_PASTE30, \ - NLOHMANN_JSON_PASTE29, \ - NLOHMANN_JSON_PASTE28, \ - NLOHMANN_JSON_PASTE27, \ - NLOHMANN_JSON_PASTE26, \ - NLOHMANN_JSON_PASTE25, \ - NLOHMANN_JSON_PASTE24, \ - NLOHMANN_JSON_PASTE23, \ - NLOHMANN_JSON_PASTE22, \ - NLOHMANN_JSON_PASTE21, \ - NLOHMANN_JSON_PASTE20, \ - NLOHMANN_JSON_PASTE19, \ - NLOHMANN_JSON_PASTE18, \ - NLOHMANN_JSON_PASTE17, \ - NLOHMANN_JSON_PASTE16, \ - NLOHMANN_JSON_PASTE15, \ - NLOHMANN_JSON_PASTE14, \ - NLOHMANN_JSON_PASTE13, \ - NLOHMANN_JSON_PASTE12, \ - NLOHMANN_JSON_PASTE11, \ - NLOHMANN_JSON_PASTE10, \ - NLOHMANN_JSON_PASTE9, \ - NLOHMANN_JSON_PASTE8, \ - NLOHMANN_JSON_PASTE7, \ - NLOHMANN_JSON_PASTE6, \ - NLOHMANN_JSON_PASTE5, \ - NLOHMANN_JSON_PASTE4, \ - NLOHMANN_JSON_PASTE3, \ - NLOHMANN_JSON_PASTE2, \ - NLOHMANN_JSON_PASTE1)(__VA_ARGS__)) -#define NLOHMANN_JSON_PASTE2(func, v1) func(v1) -#define NLOHMANN_JSON_PASTE3(func, v1, v2) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE2(func, v2) -#define NLOHMANN_JSON_PASTE4(func, v1, v2, v3) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE3(func, v2, v3) -#define NLOHMANN_JSON_PASTE5(func, v1, v2, v3, v4) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE4(func, v2, v3, v4) -#define NLOHMANN_JSON_PASTE6(func, v1, v2, v3, v4, v5) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE5(func, v2, v3, v4, v5) -#define NLOHMANN_JSON_PASTE7(func, v1, v2, v3, v4, v5, v6) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE6(func, v2, v3, v4, v5, v6) -#define NLOHMANN_JSON_PASTE8(func, v1, v2, v3, v4, v5, v6, v7) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE7(func, v2, v3, v4, v5, v6, v7) -#define NLOHMANN_JSON_PASTE9(func, v1, v2, v3, v4, v5, v6, v7, v8) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE8(func, v2, v3, v4, v5, v6, v7, v8) -#define NLOHMANN_JSON_PASTE10(func, v1, v2, v3, v4, v5, v6, v7, v8, v9) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE9(func, v2, v3, v4, v5, v6, v7, v8, v9) -#define NLOHMANN_JSON_PASTE11(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE10(func, v2, v3, v4, v5, v6, v7, v8, v9, v10) -#define NLOHMANN_JSON_PASTE12(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE11(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11) -#define NLOHMANN_JSON_PASTE13(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE12(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12) -#define NLOHMANN_JSON_PASTE14(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE13(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13) -#define NLOHMANN_JSON_PASTE15(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE14(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14) -#define NLOHMANN_JSON_PASTE16(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE15(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15) -#define NLOHMANN_JSON_PASTE17(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE16(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16) -#define NLOHMANN_JSON_PASTE18(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE17(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17) -#define NLOHMANN_JSON_PASTE19(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE18(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18) -#define NLOHMANN_JSON_PASTE20(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE19(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19) -#define NLOHMANN_JSON_PASTE21(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE20(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20) -#define NLOHMANN_JSON_PASTE22(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE21(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21) -#define NLOHMANN_JSON_PASTE23(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE22(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22) -#define NLOHMANN_JSON_PASTE24(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE23(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23) -#define NLOHMANN_JSON_PASTE25(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE24(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24) -#define NLOHMANN_JSON_PASTE26(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE25(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25) -#define NLOHMANN_JSON_PASTE27(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE26(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26) -#define NLOHMANN_JSON_PASTE28(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE27(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27) -#define NLOHMANN_JSON_PASTE29(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE28(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28) -#define NLOHMANN_JSON_PASTE30(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE29(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29) -#define NLOHMANN_JSON_PASTE31(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE30(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30) -#define NLOHMANN_JSON_PASTE32(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE31(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31) -#define NLOHMANN_JSON_PASTE33(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE32(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32) -#define NLOHMANN_JSON_PASTE34(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE33(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33) -#define NLOHMANN_JSON_PASTE35(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE34(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34) -#define NLOHMANN_JSON_PASTE36(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE35(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35) -#define NLOHMANN_JSON_PASTE37(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE36(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36) -#define NLOHMANN_JSON_PASTE38(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE37(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37) -#define NLOHMANN_JSON_PASTE39(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE38(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38) -#define NLOHMANN_JSON_PASTE40(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE39(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39) -#define NLOHMANN_JSON_PASTE41(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE40(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40) -#define NLOHMANN_JSON_PASTE42(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE41(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41) -#define NLOHMANN_JSON_PASTE43(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE42(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42) -#define NLOHMANN_JSON_PASTE44(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE43(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43) -#define NLOHMANN_JSON_PASTE45(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE44(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44) -#define NLOHMANN_JSON_PASTE46(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE45(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45) -#define NLOHMANN_JSON_PASTE47(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE46(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46) -#define NLOHMANN_JSON_PASTE48(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE47(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47) -#define NLOHMANN_JSON_PASTE49(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE48(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48) -#define NLOHMANN_JSON_PASTE50(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE49(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49) -#define NLOHMANN_JSON_PASTE51(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE50(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50) -#define NLOHMANN_JSON_PASTE52(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE51(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51) -#define NLOHMANN_JSON_PASTE53(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE52(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52) -#define NLOHMANN_JSON_PASTE54(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE53(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53) -#define NLOHMANN_JSON_PASTE55(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE54(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54) -#define NLOHMANN_JSON_PASTE56(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE55(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55) -#define NLOHMANN_JSON_PASTE57(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE56(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56) -#define NLOHMANN_JSON_PASTE58(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE57(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57) -#define NLOHMANN_JSON_PASTE59(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE58(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58) -#define NLOHMANN_JSON_PASTE60(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE59(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59) -#define NLOHMANN_JSON_PASTE61(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE60(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60) -#define NLOHMANN_JSON_PASTE62(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE61(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61) -#define NLOHMANN_JSON_PASTE63(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE62(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62) -#define NLOHMANN_JSON_PASTE64(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE63(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63) - -#define NLOHMANN_JSON_TO(v1) nlohmann_json_j[#v1] = nlohmann_json_t.v1; -#define NLOHMANN_JSON_FROM(v1) nlohmann_json_j.at(#v1).get_to(nlohmann_json_t.v1); -#define NLOHMANN_JSON_FROM_WITH_DEFAULT(v1) nlohmann_json_t.v1 = nlohmann_json_j.value(#v1, nlohmann_json_default_obj.v1); - -/*! -@brief macro -@def NLOHMANN_DEFINE_TYPE_INTRUSIVE -@since version 3.9.0 -*/ -#define NLOHMANN_DEFINE_TYPE_INTRUSIVE(Type, ...) \ - friend void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } \ - friend void from_json(const nlohmann::json& nlohmann_json_j, Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_FROM, __VA_ARGS__)) } - -#define NLOHMANN_DEFINE_TYPE_INTRUSIVE_WITH_DEFAULT(Type, ...) \ - friend void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } \ - friend void from_json(const nlohmann::json& nlohmann_json_j, Type& nlohmann_json_t) { const Type nlohmann_json_default_obj{}; NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_FROM_WITH_DEFAULT, __VA_ARGS__)) } - -#define NLOHMANN_DEFINE_TYPE_INTRUSIVE_ONLY_SERIALIZE(Type, ...) \ - friend void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } - -/*! -@brief macro -@def NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE -@since version 3.9.0 -*/ -#define NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Type, ...) \ - inline void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } \ - inline void from_json(const nlohmann::json& nlohmann_json_j, Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_FROM, __VA_ARGS__)) } - -#define NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE_ONLY_SERIALIZE(Type, ...) \ - inline void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } - -#define NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE_WITH_DEFAULT(Type, ...) \ - inline void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } \ - inline void from_json(const nlohmann::json& nlohmann_json_j, Type& nlohmann_json_t) { const Type nlohmann_json_default_obj{}; NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_FROM_WITH_DEFAULT, __VA_ARGS__)) } - -// inspired from https://stackoverflow.com/a/26745591 -// allows to call any std function as if (e.g. with begin): -// using std::begin; begin(x); -// -// it allows using the detected idiom to retrieve the return type -// of such an expression -#define NLOHMANN_CAN_CALL_STD_FUNC_IMPL(std_name) \ - namespace detail { \ - using std::std_name; \ - \ - template \ - using result_of_##std_name = decltype(std_name(std::declval()...)); \ - } \ - \ - namespace detail2 { \ - struct std_name##_tag \ - { \ - }; \ - \ - template \ - std_name##_tag std_name(T&&...); \ - \ - template \ - using result_of_##std_name = decltype(std_name(std::declval()...)); \ - \ - template \ - struct would_call_std_##std_name \ - { \ - static constexpr auto const value = ::nlohmann::detail:: \ - is_detected_exact::value; \ - }; \ - } /* namespace detail2 */ \ - \ - template \ - struct would_call_std_##std_name : detail2::would_call_std_##std_name \ - { \ - } - -#ifndef JSON_USE_IMPLICIT_CONVERSIONS - #define JSON_USE_IMPLICIT_CONVERSIONS 1 -#endif - -#if JSON_USE_IMPLICIT_CONVERSIONS - #define JSON_EXPLICIT -#else - #define JSON_EXPLICIT explicit -#endif - -#ifndef JSON_DISABLE_ENUM_SERIALIZATION - #define JSON_DISABLE_ENUM_SERIALIZATION 0 -#endif - -#ifndef JSON_USE_GLOBAL_UDLS - #define JSON_USE_GLOBAL_UDLS 1 -#endif - -#if JSON_HAS_THREE_WAY_COMPARISON - #include // partial_ordering -#endif - -NLOHMANN_JSON_NAMESPACE_BEGIN -namespace detail -{ - -/////////////////////////// -// JSON type enumeration // -/////////////////////////// - -/*! -@brief the JSON type enumeration - -This enumeration collects the different JSON types. It is internally used to -distinguish the stored values, and the functions @ref basic_json::is_null(), -@ref basic_json::is_object(), @ref basic_json::is_array(), -@ref basic_json::is_string(), @ref basic_json::is_boolean(), -@ref basic_json::is_number() (with @ref basic_json::is_number_integer(), -@ref basic_json::is_number_unsigned(), and @ref basic_json::is_number_float()), -@ref basic_json::is_discarded(), @ref basic_json::is_primitive(), and -@ref basic_json::is_structured() rely on it. - -@note There are three enumeration entries (number_integer, number_unsigned, and -number_float), because the library distinguishes these three types for numbers: -@ref basic_json::number_unsigned_t is used for unsigned integers, -@ref basic_json::number_integer_t is used for signed integers, and -@ref basic_json::number_float_t is used for floating-point numbers or to -approximate integers which do not fit in the limits of their respective type. - -@sa see @ref basic_json::basic_json(const value_t value_type) -- create a JSON -value with the default value for a given type - -@since version 1.0.0 -*/ -enum class value_t : std::uint8_t -{ - null, ///< null value - object, ///< object (unordered set of name/value pairs) - array, ///< array (ordered collection of values) - string, ///< string value - boolean, ///< boolean value - number_integer, ///< number value (signed integer) - number_unsigned, ///< number value (unsigned integer) - number_float, ///< number value (floating-point) - binary, ///< binary array (ordered collection of bytes) - discarded ///< discarded by the parser callback function -}; - -/*! -@brief comparison operator for JSON types - -Returns an ordering that is similar to Python: -- order: null < boolean < number < object < array < string < binary -- furthermore, each type is not smaller than itself -- discarded values are not comparable -- binary is represented as a b"" string in python and directly comparable to a - string; however, making a binary array directly comparable with a string would - be surprising behavior in a JSON file. - -@since version 1.0.0 -*/ -#if JSON_HAS_THREE_WAY_COMPARISON - inline std::partial_ordering operator<=>(const value_t lhs, const value_t rhs) noexcept // *NOPAD* -#else - inline bool operator<(const value_t lhs, const value_t rhs) noexcept -#endif -{ - static constexpr std::array order = {{ - 0 /* null */, 3 /* object */, 4 /* array */, 5 /* string */, - 1 /* boolean */, 2 /* integer */, 2 /* unsigned */, 2 /* float */, - 6 /* binary */ - } - }; - - const auto l_index = static_cast(lhs); - const auto r_index = static_cast(rhs); -#if JSON_HAS_THREE_WAY_COMPARISON - if (l_index < order.size() && r_index < order.size()) - { - return order[l_index] <=> order[r_index]; // *NOPAD* - } - return std::partial_ordering::unordered; -#else - return l_index < order.size() && r_index < order.size() && order[l_index] < order[r_index]; -#endif -} - -// GCC selects the built-in operator< over an operator rewritten from -// a user-defined spaceship operator -// Clang, MSVC, and ICC select the rewritten candidate -// (see GCC bug https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105200) -#if JSON_HAS_THREE_WAY_COMPARISON && defined(__GNUC__) -inline bool operator<(const value_t lhs, const value_t rhs) noexcept -{ - return std::is_lt(lhs <=> rhs); // *NOPAD* -} -#endif - -} // namespace detail -NLOHMANN_JSON_NAMESPACE_END - -// #include -// __ _____ _____ _____ -// __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.3 -// |_____|_____|_____|_|___| https://github.com/nlohmann/json -// -// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann -// SPDX-License-Identifier: MIT - - - -// #include - - -NLOHMANN_JSON_NAMESPACE_BEGIN -namespace detail -{ - -/*! -@brief replace all occurrences of a substring by another string - -@param[in,out] s the string to manipulate; changed so that all - occurrences of @a f are replaced with @a t -@param[in] f the substring to replace with @a t -@param[in] t the string to replace @a f - -@pre The search string @a f must not be empty. **This precondition is -enforced with an assertion.** - -@since version 2.0.0 -*/ -template -inline void replace_substring(StringType& s, const StringType& f, - const StringType& t) -{ - JSON_ASSERT(!f.empty()); - for (auto pos = s.find(f); // find first occurrence of f - pos != StringType::npos; // make sure f was found - s.replace(pos, f.size(), t), // replace with t, and - pos = s.find(f, pos + t.size())) // find next occurrence of f - {} -} - -/*! - * @brief string escaping as described in RFC 6901 (Sect. 4) - * @param[in] s string to escape - * @return escaped string - * - * Note the order of escaping "~" to "~0" and "/" to "~1" is important. - */ -template -inline StringType escape(StringType s) -{ - replace_substring(s, StringType{"~"}, StringType{"~0"}); - replace_substring(s, StringType{"/"}, StringType{"~1"}); - return s; -} - -/*! - * @brief string unescaping as described in RFC 6901 (Sect. 4) - * @param[in] s string to unescape - * @return unescaped string - * - * Note the order of escaping "~1" to "/" and "~0" to "~" is important. - */ -template -static void unescape(StringType& s) -{ - replace_substring(s, StringType{"~1"}, StringType{"/"}); - replace_substring(s, StringType{"~0"}, StringType{"~"}); -} - -} // namespace detail -NLOHMANN_JSON_NAMESPACE_END - -// #include -// __ _____ _____ _____ -// __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.3 -// |_____|_____|_____|_|___| https://github.com/nlohmann/json -// -// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann -// SPDX-License-Identifier: MIT - - - -#include // size_t - -// #include - - -NLOHMANN_JSON_NAMESPACE_BEGIN -namespace detail -{ - -/// struct to capture the start position of the current token -struct position_t -{ - /// the total number of characters read - std::size_t chars_read_total = 0; - /// the number of characters read in the current line - std::size_t chars_read_current_line = 0; - /// the number of lines read - std::size_t lines_read = 0; - - /// conversion to size_t to preserve SAX interface - constexpr operator size_t() const - { - return chars_read_total; - } -}; - -} // namespace detail -NLOHMANN_JSON_NAMESPACE_END - -// #include - -// #include -// __ _____ _____ _____ -// __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.3 -// |_____|_____|_____|_|___| https://github.com/nlohmann/json -// -// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann -// SPDX-FileCopyrightText: 2018 The Abseil Authors -// SPDX-License-Identifier: MIT - - - -#include // array -#include // size_t -#include // conditional, enable_if, false_type, integral_constant, is_constructible, is_integral, is_same, remove_cv, remove_reference, true_type -#include // index_sequence, make_index_sequence, index_sequence_for - -// #include - - -NLOHMANN_JSON_NAMESPACE_BEGIN -namespace detail -{ - -template -using uncvref_t = typename std::remove_cv::type>::type; - -#ifdef JSON_HAS_CPP_14 - -// the following utilities are natively available in C++14 -using std::enable_if_t; -using std::index_sequence; -using std::make_index_sequence; -using std::index_sequence_for; - -#else - -// alias templates to reduce boilerplate -template -using enable_if_t = typename std::enable_if::type; - -// The following code is taken from https://github.com/abseil/abseil-cpp/blob/10cb35e459f5ecca5b2ff107635da0bfa41011b4/absl/utility/utility.h -// which is part of Google Abseil (https://github.com/abseil/abseil-cpp), licensed under the Apache License 2.0. - -//// START OF CODE FROM GOOGLE ABSEIL - -// integer_sequence -// -// Class template representing a compile-time integer sequence. An instantiation -// of `integer_sequence` has a sequence of integers encoded in its -// type through its template arguments (which is a common need when -// working with C++11 variadic templates). `absl::integer_sequence` is designed -// to be a drop-in replacement for C++14's `std::integer_sequence`. -// -// Example: -// -// template< class T, T... Ints > -// void user_function(integer_sequence); -// -// int main() -// { -// // user_function's `T` will be deduced to `int` and `Ints...` -// // will be deduced to `0, 1, 2, 3, 4`. -// user_function(make_integer_sequence()); -// } -template -struct integer_sequence -{ - using value_type = T; - static constexpr std::size_t size() noexcept - { - return sizeof...(Ints); - } -}; - -// index_sequence -// -// A helper template for an `integer_sequence` of `size_t`, -// `absl::index_sequence` is designed to be a drop-in replacement for C++14's -// `std::index_sequence`. -template -using index_sequence = integer_sequence; - -namespace utility_internal -{ - -template -struct Extend; - -// Note that SeqSize == sizeof...(Ints). It's passed explicitly for efficiency. -template -struct Extend, SeqSize, 0> -{ - using type = integer_sequence < T, Ints..., (Ints + SeqSize)... >; -}; - -template -struct Extend, SeqSize, 1> -{ - using type = integer_sequence < T, Ints..., (Ints + SeqSize)..., 2 * SeqSize >; -}; - -// Recursion helper for 'make_integer_sequence'. -// 'Gen::type' is an alias for 'integer_sequence'. -template -struct Gen -{ - using type = - typename Extend < typename Gen < T, N / 2 >::type, N / 2, N % 2 >::type; -}; - -template -struct Gen -{ - using type = integer_sequence; -}; - -} // namespace utility_internal - -// Compile-time sequences of integers - -// make_integer_sequence -// -// This template alias is equivalent to -// `integer_sequence`, and is designed to be a drop-in -// replacement for C++14's `std::make_integer_sequence`. -template -using make_integer_sequence = typename utility_internal::Gen::type; - -// make_index_sequence -// -// This template alias is equivalent to `index_sequence<0, 1, ..., N-1>`, -// and is designed to be a drop-in replacement for C++14's -// `std::make_index_sequence`. -template -using make_index_sequence = make_integer_sequence; - -// index_sequence_for -// -// Converts a typename pack into an index sequence of the same length, and -// is designed to be a drop-in replacement for C++14's -// `std::index_sequence_for()` -template -using index_sequence_for = make_index_sequence; - -//// END OF CODE FROM GOOGLE ABSEIL - -#endif - -// dispatch utility (taken from ranges-v3) -template struct priority_tag : priority_tag < N - 1 > {}; -template<> struct priority_tag<0> {}; - -// taken from ranges-v3 -template -struct static_const -{ - static JSON_INLINE_VARIABLE constexpr T value{}; -}; - -#ifndef JSON_HAS_CPP_17 - template - constexpr T static_const::value; -#endif - -template -inline constexpr std::array make_array(Args&& ... args) -{ - return std::array {{static_cast(std::forward(args))...}}; -} - -} // namespace detail -NLOHMANN_JSON_NAMESPACE_END - -// #include -// __ _____ _____ _____ -// __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.3 -// |_____|_____|_____|_|___| https://github.com/nlohmann/json -// -// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann -// SPDX-License-Identifier: MIT - - - -#include // numeric_limits -#include // false_type, is_constructible, is_integral, is_same, true_type -#include // declval -#include // tuple -#include // char_traits - -// #include -// __ _____ _____ _____ -// __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.3 -// |_____|_____|_____|_|___| https://github.com/nlohmann/json -// -// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann -// SPDX-License-Identifier: MIT - - - -#include // random_access_iterator_tag - -// #include - -// #include - -// #include - - -NLOHMANN_JSON_NAMESPACE_BEGIN -namespace detail -{ - -template -struct iterator_types {}; - -template -struct iterator_types < - It, - void_t> -{ - using difference_type = typename It::difference_type; - using value_type = typename It::value_type; - using pointer = typename It::pointer; - using reference = typename It::reference; - using iterator_category = typename It::iterator_category; -}; - -// This is required as some compilers implement std::iterator_traits in a way that -// doesn't work with SFINAE. See https://github.com/nlohmann/json/issues/1341. -template -struct iterator_traits -{ -}; - -template -struct iterator_traits < T, enable_if_t < !std::is_pointer::value >> - : iterator_types -{ -}; - -template -struct iterator_traits::value>> -{ - using iterator_category = std::random_access_iterator_tag; - using value_type = T; - using difference_type = ptrdiff_t; - using pointer = T*; - using reference = T&; -}; - -} // namespace detail -NLOHMANN_JSON_NAMESPACE_END - -// #include - -// #include -// __ _____ _____ _____ -// __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.3 -// |_____|_____|_____|_|___| https://github.com/nlohmann/json -// -// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann -// SPDX-License-Identifier: MIT - - - -// #include - - -NLOHMANN_JSON_NAMESPACE_BEGIN - -NLOHMANN_CAN_CALL_STD_FUNC_IMPL(begin); - -NLOHMANN_JSON_NAMESPACE_END - -// #include -// __ _____ _____ _____ -// __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.3 -// |_____|_____|_____|_|___| https://github.com/nlohmann/json -// -// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann -// SPDX-License-Identifier: MIT - - - -// #include - - -NLOHMANN_JSON_NAMESPACE_BEGIN - -NLOHMANN_CAN_CALL_STD_FUNC_IMPL(end); - -NLOHMANN_JSON_NAMESPACE_END - -// #include - -// #include - -// #include -// __ _____ _____ _____ -// __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.3 -// |_____|_____|_____|_|___| https://github.com/nlohmann/json -// -// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann -// SPDX-License-Identifier: MIT - -#ifndef INCLUDE_NLOHMANN_JSON_FWD_HPP_ - #define INCLUDE_NLOHMANN_JSON_FWD_HPP_ - - #include // int64_t, uint64_t - #include // map - #include // allocator - #include // string - #include // vector - - // #include - - - /*! - @brief namespace for Niels Lohmann - @see https://github.com/nlohmann - @since version 1.0.0 - */ - NLOHMANN_JSON_NAMESPACE_BEGIN - - /*! - @brief default JSONSerializer template argument - - This serializer ignores the template arguments and uses ADL - ([argument-dependent lookup](https://en.cppreference.com/w/cpp/language/adl)) - for serialization. - */ - template - struct adl_serializer; - - /// a class to store JSON values - /// @sa https://json.nlohmann.me/api/basic_json/ - template class ObjectType = - std::map, - template class ArrayType = std::vector, - class StringType = std::string, class BooleanType = bool, - class NumberIntegerType = std::int64_t, - class NumberUnsignedType = std::uint64_t, - class NumberFloatType = double, - template class AllocatorType = std::allocator, - template class JSONSerializer = - adl_serializer, - class BinaryType = std::vector, // cppcheck-suppress syntaxError - class CustomBaseClass = void> - class basic_json; - - /// @brief JSON Pointer defines a string syntax for identifying a specific value within a JSON document - /// @sa https://json.nlohmann.me/api/json_pointer/ - template - class json_pointer; - - /*! - @brief default specialization - @sa https://json.nlohmann.me/api/json/ - */ - using json = basic_json<>; - - /// @brief a minimal map-like container that preserves insertion order - /// @sa https://json.nlohmann.me/api/ordered_map/ - template - struct ordered_map; - - /// @brief specialization that maintains the insertion order of object keys - /// @sa https://json.nlohmann.me/api/ordered_json/ - using ordered_json = basic_json; - - NLOHMANN_JSON_NAMESPACE_END - -#endif // INCLUDE_NLOHMANN_JSON_FWD_HPP_ - - -NLOHMANN_JSON_NAMESPACE_BEGIN -/*! -@brief detail namespace with internal helper functions - -This namespace collects functions that should not be exposed, -implementations of some @ref basic_json methods, and meta-programming helpers. - -@since version 2.1.0 -*/ -namespace detail -{ - -///////////// -// helpers // -///////////// - -// Note to maintainers: -// -// Every trait in this file expects a non CV-qualified type. -// The only exceptions are in the 'aliases for detected' section -// (i.e. those of the form: decltype(T::member_function(std::declval()))) -// -// In this case, T has to be properly CV-qualified to constraint the function arguments -// (e.g. to_json(BasicJsonType&, const T&)) - -template struct is_basic_json : std::false_type {}; - -NLOHMANN_BASIC_JSON_TPL_DECLARATION -struct is_basic_json : std::true_type {}; - -// used by exceptions create() member functions -// true_type for pointer to possibly cv-qualified basic_json or std::nullptr_t -// false_type otherwise -template -struct is_basic_json_context : - std::integral_constant < bool, - is_basic_json::type>::type>::value - || std::is_same::value > -{}; - -////////////////////// -// json_ref helpers // -////////////////////// - -template -class json_ref; - -template -struct is_json_ref : std::false_type {}; - -template -struct is_json_ref> : std::true_type {}; - -////////////////////////// -// aliases for detected // -////////////////////////// - -template -using mapped_type_t = typename T::mapped_type; - -template -using key_type_t = typename T::key_type; - -template -using value_type_t = typename T::value_type; - -template -using difference_type_t = typename T::difference_type; - -template -using pointer_t = typename T::pointer; - -template -using reference_t = typename T::reference; - -template -using iterator_category_t = typename T::iterator_category; - -template -using to_json_function = decltype(T::to_json(std::declval()...)); - -template -using from_json_function = decltype(T::from_json(std::declval()...)); - -template -using get_template_function = decltype(std::declval().template get()); - -// trait checking if JSONSerializer::from_json(json const&, udt&) exists -template -struct has_from_json : std::false_type {}; - -// trait checking if j.get is valid -// use this trait instead of std::is_constructible or std::is_convertible, -// both rely on, or make use of implicit conversions, and thus fail when T -// has several constructors/operator= (see https://github.com/nlohmann/json/issues/958) -template -struct is_getable -{ - static constexpr bool value = is_detected::value; -}; - -template -struct has_from_json < BasicJsonType, T, enable_if_t < !is_basic_json::value >> -{ - using serializer = typename BasicJsonType::template json_serializer; - - static constexpr bool value = - is_detected_exact::value; -}; - -// This trait checks if JSONSerializer::from_json(json const&) exists -// this overload is used for non-default-constructible user-defined-types -template -struct has_non_default_from_json : std::false_type {}; - -template -struct has_non_default_from_json < BasicJsonType, T, enable_if_t < !is_basic_json::value >> -{ - using serializer = typename BasicJsonType::template json_serializer; - - static constexpr bool value = - is_detected_exact::value; -}; - -// This trait checks if BasicJsonType::json_serializer::to_json exists -// Do not evaluate the trait when T is a basic_json type, to avoid template instantiation infinite recursion. -template -struct has_to_json : std::false_type {}; - -template -struct has_to_json < BasicJsonType, T, enable_if_t < !is_basic_json::value >> -{ - using serializer = typename BasicJsonType::template json_serializer; - - static constexpr bool value = - is_detected_exact::value; -}; - -template -using detect_key_compare = typename T::key_compare; - -template -struct has_key_compare : std::integral_constant::value> {}; - -// obtains the actual object key comparator -template -struct actual_object_comparator -{ - using object_t = typename BasicJsonType::object_t; - using object_comparator_t = typename BasicJsonType::default_object_comparator_t; - using type = typename std::conditional < has_key_compare::value, - typename object_t::key_compare, object_comparator_t>::type; -}; - -template -using actual_object_comparator_t = typename actual_object_comparator::type; - -///////////////// -// char_traits // -///////////////// - -// Primary template of char_traits calls std char_traits -template -struct char_traits : std::char_traits -{}; - -// Explicitly define char traits for unsigned char since it is not standard -template<> -struct char_traits : std::char_traits -{ - using char_type = unsigned char; - using int_type = uint64_t; - - // Redefine to_int_type function - static int_type to_int_type(char_type c) noexcept - { - return static_cast(c); - } - - static char_type to_char_type(int_type i) noexcept - { - return static_cast(i); - } - - static constexpr int_type eof() noexcept - { - return static_cast(EOF); - } -}; - -// Explicitly define char traits for signed char since it is not standard -template<> -struct char_traits : std::char_traits -{ - using char_type = signed char; - using int_type = uint64_t; - - // Redefine to_int_type function - static int_type to_int_type(char_type c) noexcept - { - return static_cast(c); - } - - static char_type to_char_type(int_type i) noexcept - { - return static_cast(i); - } - - static constexpr int_type eof() noexcept - { - return static_cast(EOF); - } -}; - -/////////////////// -// is_ functions // -/////////////////// - -// https://en.cppreference.com/w/cpp/types/conjunction -template struct conjunction : std::true_type { }; -template struct conjunction : B { }; -template -struct conjunction -: std::conditional(B::value), conjunction, B>::type {}; - -// https://en.cppreference.com/w/cpp/types/negation -template struct negation : std::integral_constant < bool, !B::value > { }; - -// Reimplementation of is_constructible and is_default_constructible, due to them being broken for -// std::pair and std::tuple until LWG 2367 fix (see https://cplusplus.github.io/LWG/lwg-defects.html#2367). -// This causes compile errors in e.g. clang 3.5 or gcc 4.9. -template -struct is_default_constructible : std::is_default_constructible {}; - -template -struct is_default_constructible> - : conjunction, is_default_constructible> {}; - -template -struct is_default_constructible> - : conjunction, is_default_constructible> {}; - -template -struct is_default_constructible> - : conjunction...> {}; - -template -struct is_default_constructible> - : conjunction...> {}; - -template -struct is_constructible : std::is_constructible {}; - -template -struct is_constructible> : is_default_constructible> {}; - -template -struct is_constructible> : is_default_constructible> {}; - -template -struct is_constructible> : is_default_constructible> {}; - -template -struct is_constructible> : is_default_constructible> {}; - -template -struct is_iterator_traits : std::false_type {}; - -template -struct is_iterator_traits> -{ - private: - using traits = iterator_traits; - - public: - static constexpr auto value = - is_detected::value && - is_detected::value && - is_detected::value && - is_detected::value && - is_detected::value; -}; - -template -struct is_range -{ - private: - using t_ref = typename std::add_lvalue_reference::type; - - using iterator = detected_t; - using sentinel = detected_t; - - // to be 100% correct, it should use https://en.cppreference.com/w/cpp/iterator/input_or_output_iterator - // and https://en.cppreference.com/w/cpp/iterator/sentinel_for - // but reimplementing these would be too much work, as a lot of other concepts are used underneath - static constexpr auto is_iterator_begin = - is_iterator_traits>::value; - - public: - static constexpr bool value = !std::is_same::value && !std::is_same::value && is_iterator_begin; -}; - -template -using iterator_t = enable_if_t::value, result_of_begin())>>; - -template -using range_value_t = value_type_t>>; - -// The following implementation of is_complete_type is taken from -// https://blogs.msdn.microsoft.com/vcblog/2015/12/02/partial-support-for-expression-sfinae-in-vs-2015-update-1/ -// and is written by Xiang Fan who agreed to using it in this library. - -template -struct is_complete_type : std::false_type {}; - -template -struct is_complete_type : std::true_type {}; - -template -struct is_compatible_object_type_impl : std::false_type {}; - -template -struct is_compatible_object_type_impl < - BasicJsonType, CompatibleObjectType, - enable_if_t < is_detected::value&& - is_detected::value >> -{ - using object_t = typename BasicJsonType::object_t; - - // macOS's is_constructible does not play well with nonesuch... - static constexpr bool value = - is_constructible::value && - is_constructible::value; -}; - -template -struct is_compatible_object_type - : is_compatible_object_type_impl {}; - -template -struct is_constructible_object_type_impl : std::false_type {}; - -template -struct is_constructible_object_type_impl < - BasicJsonType, ConstructibleObjectType, - enable_if_t < is_detected::value&& - is_detected::value >> -{ - using object_t = typename BasicJsonType::object_t; - - static constexpr bool value = - (is_default_constructible::value && - (std::is_move_assignable::value || - std::is_copy_assignable::value) && - (is_constructible::value && - std::is_same < - typename object_t::mapped_type, - typename ConstructibleObjectType::mapped_type >::value)) || - (has_from_json::value || - has_non_default_from_json < - BasicJsonType, - typename ConstructibleObjectType::mapped_type >::value); -}; - -template -struct is_constructible_object_type - : is_constructible_object_type_impl {}; - -template -struct is_compatible_string_type -{ - static constexpr auto value = - is_constructible::value; -}; - -template -struct is_constructible_string_type -{ - // launder type through decltype() to fix compilation failure on ICPC -#ifdef __INTEL_COMPILER - using laundered_type = decltype(std::declval()); -#else - using laundered_type = ConstructibleStringType; -#endif - - static constexpr auto value = - conjunction < - is_constructible, - is_detected_exact>::value; -}; - -template -struct is_compatible_array_type_impl : std::false_type {}; - -template -struct is_compatible_array_type_impl < - BasicJsonType, CompatibleArrayType, - enable_if_t < - is_detected::value&& - is_iterator_traits>>::value&& -// special case for types like std::filesystem::path whose iterator's value_type are themselves -// c.f. https://github.com/nlohmann/json/pull/3073 - !std::is_same>::value >> -{ - static constexpr bool value = - is_constructible>::value; -}; - -template -struct is_compatible_array_type - : is_compatible_array_type_impl {}; - -template -struct is_constructible_array_type_impl : std::false_type {}; - -template -struct is_constructible_array_type_impl < - BasicJsonType, ConstructibleArrayType, - enable_if_t::value >> - : std::true_type {}; - -template -struct is_constructible_array_type_impl < - BasicJsonType, ConstructibleArrayType, - enable_if_t < !std::is_same::value&& - !is_compatible_string_type::value&& - is_default_constructible::value&& -(std::is_move_assignable::value || - std::is_copy_assignable::value)&& -is_detected::value&& -is_iterator_traits>>::value&& -is_detected::value&& -// special case for types like std::filesystem::path whose iterator's value_type are themselves -// c.f. https://github.com/nlohmann/json/pull/3073 -!std::is_same>::value&& - is_complete_type < - detected_t>::value >> -{ - using value_type = range_value_t; - - static constexpr bool value = - std::is_same::value || - has_from_json::value || - has_non_default_from_json < - BasicJsonType, - value_type >::value; -}; - -template -struct is_constructible_array_type - : is_constructible_array_type_impl {}; - -template -struct is_compatible_integer_type_impl : std::false_type {}; - -template -struct is_compatible_integer_type_impl < - RealIntegerType, CompatibleNumberIntegerType, - enable_if_t < std::is_integral::value&& - std::is_integral::value&& - !std::is_same::value >> -{ - // is there an assert somewhere on overflows? - using RealLimits = std::numeric_limits; - using CompatibleLimits = std::numeric_limits; - - static constexpr auto value = - is_constructible::value && - CompatibleLimits::is_integer && - RealLimits::is_signed == CompatibleLimits::is_signed; -}; - -template -struct is_compatible_integer_type - : is_compatible_integer_type_impl {}; - -template -struct is_compatible_type_impl: std::false_type {}; - -template -struct is_compatible_type_impl < - BasicJsonType, CompatibleType, - enable_if_t::value >> -{ - static constexpr bool value = - has_to_json::value; -}; - -template -struct is_compatible_type - : is_compatible_type_impl {}; - -template -struct is_constructible_tuple : std::false_type {}; - -template -struct is_constructible_tuple> : conjunction...> {}; - -template -struct is_json_iterator_of : std::false_type {}; - -template -struct is_json_iterator_of : std::true_type {}; - -template -struct is_json_iterator_of : std::true_type -{}; - -// checks if a given type T is a template specialization of Primary -template