jmorganca 2024-06-10 17:23:09 -07:00
parent 4d0e6c55b0
commit 763d7b601c
77 changed files with 35429 additions and 35585 deletions

@@ -1,5 +1,5 @@
/**
* llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
* llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*

llama/clip.cpp vendored (2 changes)

@@ -1,5 +1,5 @@
/**
* llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
* llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*

llama/clip.h vendored (2 changes)

@@ -1,5 +1,5 @@
/**
* llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
* llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*

llama/common.cpp vendored (32 changes)

@@ -1,5 +1,5 @@
/**
* llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
* llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*
@@ -226,13 +226,19 @@ void gpt_params_handle_model_default(gpt_params & params) {
}
params.hf_file = params.model;
} else if (params.model.empty()) {
params.model = fs_get_cache_file(string_split(params.hf_file, '/').back());
std::string cache_directory = fs_get_cache_directory();
const bool success = fs_create_directory_with_parents(cache_directory);
if (!success) {
throw std::runtime_error("failed to create cache directory: " + cache_directory);
}
params.model = cache_directory + string_split(params.hf_file, '/').back();
}
} else if (!params.model_url.empty()) {
if (params.model.empty()) {
auto f = string_split(params.model_url, '#').front();
f = string_split(f, '?').front();
params.model = fs_get_cache_file(string_split(f, '/').back());
f = string_split(f, '/').back();
params.model = "models/" + f;
}
} else if (params.model.empty()) {
params.model = DEFAULT_MODEL_PATH;
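
The reverted branch above resolves params.model through a fixed fallback chain: an HF file lands in the cache directory, a URL keeps only its last path segment after stripping any '#' fragment and '?' query, and everything else falls back to DEFAULT_MODEL_PATH. A minimal standalone sketch of that chain (basename_of and the default path literal are illustrative stand-ins, not the llama.cpp helpers):

    #include <string>

    // Illustrative stand-in for string_split(s, '/').back().
    static std::string basename_of(const std::string & s) {
        const size_t pos = s.find_last_of('/');
        return pos == std::string::npos ? s : s.substr(pos + 1);
    }

    static std::string resolve_model_path(std::string model, const std::string & hf_file,
                                          const std::string & model_url, const std::string & cache_dir) {
        if (!hf_file.empty() && model.empty()) {
            model = cache_dir + basename_of(hf_file);                  // cache_dir must already exist
        } else if (!model_url.empty() && model.empty()) {
            std::string f = model_url.substr(0, model_url.find('#')); // drop URL fragment
            f = f.substr(0, f.find('?'));                              // drop query string
            model = "models/" + basename_of(f);
        } else if (model.empty()) {
            model = "models/7B/ggml-model-f16.gguf";                   // assumed value of DEFAULT_MODEL_PATH
        }
        return model;
    }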
@@ -1511,14 +1517,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.chat_template = argv[i];
return true;
}
if (arg == "--slot-prompt-similarity" || arg == "-sps") {
if (++i >= argc) {
invalid_param = true;
return true;
}
params.slot_prompt_similarity = std::stof(argv[i]);
return true;
}
if (arg == "-pps") {
params.is_pp_shared = true;
return true;
@@ -1941,8 +1939,6 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
"set custom jinja chat template (default: template taken from model's metadata)\n"
"only commonly used templates are accepted:\n"
"https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
options.push_back({ "server", "-sps, --slot-prompt-similarity SIMILARITY",
"how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity });
#ifndef LOG_DISABLE_LOGS
options.push_back({ "logging" });
@@ -2299,16 +2295,6 @@ std::string fs_get_cache_directory() {
return ensure_trailing_slash(cache_directory);
}
std::string fs_get_cache_file(const std::string & filename) {
GGML_ASSERT(filename.find(DIRECTORY_SEPARATOR) == std::string::npos);
std::string cache_directory = fs_get_cache_directory();
const bool success = fs_create_directory_with_parents(cache_directory);
if (!success) {
throw std::runtime_error("failed to create cache directory: " + cache_directory);
}
return cache_directory + filename;
}
//
// Model utils

llama/common.h vendored (5 changes)

@@ -1,5 +1,5 @@
/**
* llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
* llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*
@@ -229,8 +227,6 @@ struct gpt_params {
std::string slot_save_path;
float slot_prompt_similarity = 0.5f;
// batched-bench params
bool is_pp_shared = false;
@@ -303,7 +301,6 @@ bool fs_validate_filename(const std::string & filename);
bool fs_create_directory_with_parents(const std::string & path);
std::string fs_get_cache_directory();
std::string fs_get_cache_file(const std::string & filename);
//
// Model utils

llama/ggml-alloc.c vendored (2 changes)

@@ -1,5 +1,5 @@
/**
* llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
* llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*

llama/ggml-alloc.h vendored (2 changes)

@@ -1,5 +1,5 @@
/**
* llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
* llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*

@@ -1,5 +1,5 @@
/**
* llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
* llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*

@@ -1,5 +1,5 @@
/**
* llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
* llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*

@@ -1,5 +1,5 @@
/**
* llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
* llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*

llama/ggml-common.h vendored (2 changes)

@@ -1,5 +1,5 @@
/**
* llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
* llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*

llama/ggml-cuda.cu vendored (90 changes)

@@ -1,5 +1,5 @@
/**
* llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
* llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*
@@ -1377,30 +1377,10 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
GGML_UNUSED(main_device);
}
static cudaError_t ggml_cuda_Memcpy2DPeerAsync(
void * dst, int dstDevice, size_t dpitch, void * src, int srcDevice, size_t spitch, size_t width, size_t height, cudaStream_t stream) {
#if !defined(GGML_USE_HIPBLAS)
// cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
cudaMemcpy3DPeerParms p = {};
p.dstDevice = dstDevice;
p.dstPtr = make_cudaPitchedPtr(dst, dpitch, dpitch, height);
p.srcDevice = srcDevice;
p.srcPtr = make_cudaPitchedPtr(src, spitch, spitch, height);
p.extent = make_cudaExtent(width, height, 1);
return cudaMemcpy3DPeerAsync(&p, stream);
#else
// HIP does not support cudaMemcpy3DPeerAsync or vmm pools
GGML_UNUSED(dstDevice);
GGML_UNUSED(srcDevice);
return cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, cudaMemcpyDeviceToDevice, stream);
#endif // !defined(GGML_USE_HIPBLAS)
}
static void ggml_cuda_op_mul_mat(
ggml_backend_cuda_context & ctx,
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
quantize_cuda_t quantize_src1) {
const bool convert_src1_to_q8_1) {
const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
@@ -1457,9 +1437,7 @@ static void ggml_cuda_op_mul_mat(
}
struct dev_data {
int cc;
ggml_cuda_pool_alloc<char> src0_dd_alloc;
ggml_cuda_pool_alloc<char> src0_dd_alloc;
ggml_cuda_pool_alloc<float> src1_ddf_alloc;
ggml_cuda_pool_alloc<char> src1_ddq_alloc;
ggml_cuda_pool_alloc<float> dst_dd_alloc;
@@ -1478,8 +1456,6 @@ static void ggml_cuda_op_mul_mat(
int used_devices = 0;
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
dev[id].cc = ggml_cuda_info().devices[id].cc;
// by default, use all rows
dev[id].row_low = 0;
dev[id].row_high = ne01;
@@ -1530,15 +1506,11 @@ static void ggml_cuda_op_mul_mat(
dev[id].src1_ddf = dev[id].src1_ddf_alloc.alloc(ctx.pool(id), ggml_nelements(src1));
}
if (quantize_src1) {
size_t src_1_ddq_size = nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs;
if (quantize_src1 == quantize_mmq_q8_1_cuda) {
src_1_ddq_size += get_mmq_x_max_host(dev[id].cc)*sizeof(block_q8_1_mmq);
}
dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), src_1_ddq_size);
if (convert_src1_to_q8_1) {
dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs);
if (src1_on_device && src1_is_contiguous) {
quantize_src1(dev[id].src1_ddf, dev[id].src1_ddq, ne10, ne11, ne12*ne13, src1_padded_col_size, src0->type, stream);
quantize_row_q8_1_cuda(dev[id].src1_ddf, dev[id].src1_ddq, ne10, nrows1, src1_padded_col_size, stream);
CUDA_CHECK(cudaGetLastError());
}
}
@@ -1584,12 +1556,7 @@ static void ggml_cuda_op_mul_mat(
const int64_t i03 = i0 / ne12;
const int64_t i02 = i0 % ne12;
size_t src1_ddq_i_offset = i0*ne11 * src1_padded_col_size*q8_1_ts/q8_1_bs;
if (quantize_src1 == quantize_mmq_q8_1_cuda) {
src1_ddq_i_offset += src1_col_0 * sizeof(block_q8_1_mmq);
} else {
src1_ddq_i_offset += src1_col_0 * src1_padded_col_size*q8_1_ts/q8_1_bs;
}
const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;
// for split tensors the data begins at i0 == i0_offset_low
char * src0_dd_i = dev[id].src0_dd + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
@@ -1606,17 +1573,10 @@ static void ggml_cuda_op_mul_mat(
// copy src0, src1 to device if necessary
if (src1_is_contiguous) {
if (id != ctx.device) {
if (quantize_src1) {
if (convert_src1_to_q8_1) {
char * src1_ddq_i_source = dev[ctx.device].src1_ddq + src1_ddq_i_offset;
if (quantize_src1 == quantize_mmq_q8_1_cuda) {
const size_t pitch = ne11*sizeof(block_q8_1_mmq);
const size_t width = src1_ncols*sizeof(block_q8_1_mmq);
const size_t height = src1_padded_col_size/(4*QK8_1);
CUDA_CHECK(ggml_cuda_Memcpy2DPeerAsync(src1_ddq_i, id, pitch, src1_ddq_i_source, ctx.device, pitch, width, height, stream));
} else {
CUDA_CHECK(cudaMemcpyPeerAsync(
src1_ddq_i, id, src1_ddq_i_source, ctx.device, src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs, stream));
}
CUDA_CHECK(cudaMemcpyPeerAsync(src1_ddq_i, id, src1_ddq_i_source, ctx.device,
src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs, stream));
} else {
float * src1_ddf_i_source = (float *) src1->data;
src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
@@ -1631,8 +1591,8 @@ static void ggml_cuda_op_mul_mat(
GGML_ASSERT(false);
}
if (quantize_src1 && !src1_is_contiguous) {
quantize_src1(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, 1, src1_padded_col_size, src0->type, stream);
if (convert_src1_to_q8_1 && !src1_is_contiguous) {
quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
CUDA_CHECK(cudaGetLastError());
}
@@ -1657,8 +1617,22 @@ static void ggml_cuda_op_mul_mat(
float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
dhf_dst_i += src1_col_0*ne0 + dev[id].row_low;
CUDA_CHECK(ggml_cuda_Memcpy2DPeerAsync(
dhf_dst_i, ctx.device, ne0*sizeof(float), dst_dd_i, id, row_diff*sizeof(float), row_diff*sizeof(float), src1_ncols, stream));
#if !defined(GGML_USE_HIPBLAS)
// cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
cudaMemcpy3DPeerParms p = {};
p.dstDevice = ctx.device;
p.dstPtr = make_cudaPitchedPtr(dhf_dst_i, ne0*sizeof(float), row_diff, src1_ncols);
p.srcDevice = id;
p.srcPtr = make_cudaPitchedPtr(dst_dd_i, row_diff*sizeof(float), row_diff, src1_ncols);
p.extent = make_cudaExtent(row_diff*sizeof(float), src1_ncols, 1);
CUDA_CHECK(cudaMemcpy3DPeerAsync(&p, stream));
#else
// HIP does not support cudaMemcpy3DPeerAsync or vmm pools
CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float),
dst_dd_i, row_diff*sizeof(float),
row_diff*sizeof(float), src1_ncols,
cudaMemcpyDeviceToDevice, stream));
#endif
} else {
float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
@@ -1997,13 +1971,13 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
// KQ + KQV multi-batch
ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
} else if (use_dequantize_mul_mat_vec) {
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, nullptr);
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
} else if (use_mul_mat_vec_q) {
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, quantize_row_q8_1_cuda);
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
} else if (use_mul_mat_q) {
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q, quantize_mmq_q8_1_cuda);
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
} else {
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, nullptr);
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
}
}
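
Note the shape of this dispatch change: the e95beeb side passes a quantize_cuda_t function pointer (nullptr, quantize_row_q8_1_cuda, or quantize_mmq_q8_1_cuda) so the caller picks the src1 quantization layout, while the ee459f40 side passes a plain bool and hard-codes the one q8_1 kernel inside ggml_cuda_op_mul_mat. A reduced sketch of the two calling conventions (signatures simplified, not the real ones):

    // Function-pointer style (e95beeb): the caller chooses the kernel and layout.
    typedef void (*quantize_cuda_t)(const float * src, void * dst /* , ... */);
    static void op_mul_mat_ptr(quantize_cuda_t quantize_src1) {
        if (quantize_src1 != nullptr) {
            // quantize_src1(src1_ddf, src1_ddq, ...);  // plain q8_1 or MMQ-specific layout
        }
    }

    // Boolean style (ee459f40): one fixed kernel, enabled or not.
    static void op_mul_mat_flag(bool convert_src1_to_q8_1) {
        if (convert_src1_to_q8_1) {
            // quantize_row_q8_1_cuda(src1_ddf, src1_ddq, ...);  // always the plain q8_1 layout
        }
    }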

llama/ggml-cuda.h vendored (2 changes)

@@ -1,5 +1,5 @@
/**
* llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
* llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*

@@ -11,7 +11,6 @@ void ggml_cuda_op_mul_mat_q(
const int64_t nb01 = src0->nb[1];
const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];
GGML_ASSERT(ne10 % QK8_1 == 0);
const int64_t ne0 = dst->ne[0];
@@ -26,7 +25,7 @@ void ggml_cuda_op_mul_mat_q(
// nrows_dst == nrows of the matrix that the kernel writes into
const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;
const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst};
const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, nrows_dst};
switch (src0->type) {
case GGML_TYPE_Q4_0:

@@ -1,26 +1,15 @@
#pragma once
#include "common.cuh"
#include "vecdotq.cuh"
#include <climits>
#include <cstdint>
#define MMQ_TILE_Y_K (WARP_SIZE + WARP_SIZE/QI8_1)
typedef void (*load_tiles_mmq_t)(
const char * __restrict__ x, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
int * __restrict__ x_sc, const int & kbx0, const int & i_max, const int & stride);
typedef void (*vec_dot_mmq_t)(
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y, float * __restrict__ sum, const int & k0);
struct block_q8_1_mmq {
half2 ds[4];
int8_t qs[4*QK8_1];
};
static_assert(sizeof(block_q8_1_mmq) == 4*QK8_1 + 4*sizeof(half2), "Unexpected block_q8_1_mmq size");
static_assert(sizeof(block_q8_1_mmq) == 4*sizeof(block_q8_1), "Unexpected block_q8_1_mmq size");
const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, float * __restrict__ sum, const int & k0);
struct tile_x_sizes {
int ql;
@@ -143,14 +132,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q4_0_q8_1_mul_mat(
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y, float * __restrict__ sum, const int & k0) {
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
const float * x_dmf = (const float *) x_dm;
const int * y_qs = (const int *) y + 4;
const half2 * y_ds = (const half2 *) y;
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
const int j = j0 + threadIdx.y;
@@ -160,18 +145,19 @@ static __device__ __forceinline__ void vec_dot_q4_0_q8_1_mul_mat(
const int i = i0 + threadIdx.x;
const int kyqs = k0 % (QI8_1/2) + QI8_1 * (k0 / (QI8_1/2));
const float * x_dmf = (const float *) x_dm;
int u[2*VDR_Q4_0_Q8_1_MMQ];
#pragma unroll
for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) {
u[2*l+0] = y_qs[j*MMQ_TILE_Y_K + (kyqs + l) % WARP_SIZE];
u[2*l+1] = y_qs[j*MMQ_TILE_Y_K + (kyqs + l + QI4_0) % WARP_SIZE];
u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE];
}
sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>
(&x_ql[i*(WARP_SIZE + 1) + k0], u, x_dmf[i*(WARP_SIZE/QI4_0) + i/QI4_0 + k0/QI4_0],
y_ds[j*MMQ_TILE_Y_K + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]);
(&x_ql[i * (WARP_SIZE + 1) + k0], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k0/QI4_0],
y_ds[j * (WARP_SIZE/QI8_1) + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]);
}
}
}
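
The q4_0 dot product above shows both shared-memory layouts for the y tile. Assuming WARP_SIZE = 32 and QI8_1 = 8 (defined in the CUDA common header, not in this hunk), the e95beeb layout packs 4 ints of scales plus 32 ints of quants into one 36-int row per j (MMQ_TILE_Y_K), while the ee459f40 layout splits quants and scales into two separate tiles. A small sketch of the two offset computations:

    constexpr int WARP_SIZE    = 32;                          // assumed
    constexpr int QI8_1        = 8;                           // assumed
    constexpr int MMQ_TILE_Y_K = WARP_SIZE + WARP_SIZE/QI8_1; // 36 ints per row

    // e95beeb: scales occupy the first 4 ints of each row, quants the next 32.
    int y_offset_combined(int j, int k) { return j*MMQ_TILE_Y_K + k; }

    // ee459f40: separate qs (32 ints per row) and ds (4 half2 per row) tiles.
    int y_qs_offset(int j, int k)  { return j*WARP_SIZE + k; }
    int y_ds_offset(int j, int k0) { return j*(WARP_SIZE/QI8_1) + (2*k0/QI8_1) % (WARP_SIZE/QI8_1); }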
@@ -217,13 +203,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q4_1_q8_1_mul_mat(
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y, float * __restrict__ sum, const int & k0) {
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
const int * y_qs = (const int *) y + 4;
const half2 * y_ds = (const half2 *) y;
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
const int j = j0 + threadIdx.y;
@@ -238,13 +221,13 @@ static __device__ __forceinline__ void vec_dot_q4_1_q8_1_mul_mat(
#pragma unroll
for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) {
u[2*l+0] = y_qs[j*MMQ_TILE_Y_K + (kyqs + l) % WARP_SIZE];
u[2*l+1] = y_qs[j*MMQ_TILE_Y_K + (kyqs + l + QI4_1) % WARP_SIZE];
u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE];
}
sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ>
(&x_ql[i*(WARP_SIZE + 1) + k0], u, x_dm[i*(WARP_SIZE/QI4_1) + i/QI4_1 + k0/QI4_1],
y_ds[j*MMQ_TILE_Y_K + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]);
(&x_ql[i * (WARP_SIZE + 1) + k0], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k0/QI4_1],
y_ds[j * (WARP_SIZE/QI8_1) + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]);
}
}
}
@@ -310,14 +293,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q5_0_q8_1_mul_mat(
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y, float * __restrict__ sum, const int & k0) {
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
const float * x_dmf = (const float *) x_dm;
const int * y_qs = (const int *) y + 4;
const float * y_df = (const float *) y;
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
const int j = j0 + threadIdx.y;
@@ -327,18 +306,20 @@ static __device__ __forceinline__ void vec_dot_q5_0_q8_1_mul_mat(
const int i = i0 + threadIdx.x;
const int kyqs = k0 % (QI8_1/2) + QI8_1 * (k0 / (QI8_1/2));
const int index_bx = i*(WARP_SIZE/QI5_0) + i/QI5_0 + k0/QI5_0;
const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k0/QI5_0;
const float * x_dmf = (const float *) x_dm;
const float * y_df = (const float *) y_ds;
int u[2*VDR_Q5_0_Q8_1_MMQ];
#pragma unroll
for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) {
u[2*l+0] = y_qs[j*MMQ_TILE_Y_K + (kyqs + l) % WARP_SIZE];
u[2*l+1] = y_qs[j*MMQ_TILE_Y_K + (kyqs + l + QI5_0) % WARP_SIZE];
u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE];
}
sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q8_0_q8_1_impl<float, QR5_0*VDR_Q5_0_Q8_1_MMQ>
(&x_ql[i*(2*WARP_SIZE + 1) + 2*k0], u, x_dmf[index_bx], y_df[j*MMQ_TILE_Y_K + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]);
(&x_ql[i * (2*WARP_SIZE + 1) + 2 * k0], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]);
}
}
}
@@ -402,13 +383,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q5_1_q8_1_mul_mat(
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y, float * __restrict__ sum, const int & k0) {
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
const int * y_qs = (const int *) y + 4;
const half2 * y_ds = (const half2 *) y;
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
const int j = j0 + threadIdx.y;
@@ -418,18 +396,18 @@ static __device__ __forceinline__ void vec_dot_q5_1_q8_1_mul_mat(
const int i = i0 + threadIdx.x;
const int kyqs = k0 % (QI8_1/2) + QI8_1 * (k0 / (QI8_1/2));
const int index_bx = i*(WARP_SIZE/QI5_1) + i/QI5_1 + k0/QI5_1;
const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k0/QI5_1;
int u[2*VDR_Q5_1_Q8_1_MMQ];
#pragma unroll
for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) {
u[2*l+0] = y_qs[j*MMQ_TILE_Y_K + (kyqs + l) % WARP_SIZE];
u[2*l+1] = y_qs[j*MMQ_TILE_Y_K + (kyqs + l + QI5_1) % WARP_SIZE];
u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE];
}
sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q8_1_q8_1_impl<QR5_1*VDR_Q5_1_Q8_1_MMQ>
(&x_ql[i*(2*WARP_SIZE + 1) + 2*k0], u, x_dm[index_bx], y_ds[j*MMQ_TILE_Y_K + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]);
(&x_ql[i * (2*WARP_SIZE + 1) + 2 * k0], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]);
}
}
}
@@ -477,14 +455,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mul_mat(
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y, float * __restrict__ sum, const int & k0) {
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
const float * x_dmf = (const float *) x_dm;
const int * y_qs = (const int *) y + 4;
const float * y_df = (const float *) y;
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
const int j = j0 + threadIdx.y;
@@ -493,9 +467,12 @@ static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mul_mat(
for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
const int i = i0 + threadIdx.x;
const float * x_dmf = (const float *) x_dm;
const float * y_df = (const float *) y_ds;
sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q8_0_q8_1_impl<float, VDR_Q8_0_Q8_1_MMQ>
(&x_ql[i*(WARP_SIZE + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + k0], x_dmf[i*(WARP_SIZE/QI8_0) + i/QI8_0 + k0/QI8_0],
y_df[j*MMQ_TILE_Y_K + k0/QI8_1]);
(&x_ql[i * (WARP_SIZE + 1) + k0], &y_qs[j * WARP_SIZE + k0], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k0/QI8_0],
y_df[j * (WARP_SIZE/QI8_1) + k0/QI8_1]);
}
}
}
@@ -554,13 +531,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mul_mat(
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y, float * __restrict__ sum, const int & k0) {
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
GGML_UNUSED(x_qh);
const int * y_qs = (const int *) y + 4;
const float * y_df = (const float *) y;
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
const int j = j0 + threadIdx.y;
@@ -571,10 +545,11 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mul_mat(
const int kbx = k0 / QI2_K;
const int ky = (k0 % QI2_K) * QR2_K;
const float * y_df = (const float *) y_ds;
int v[QR2_K*VDR_Q2_K_Q8_1_MMQ];
const int kqsx = i*(WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2);
const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2);
const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2));
#pragma unroll
@@ -582,11 +557,11 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mul_mat(
v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303;
}
const uint8_t * scales = ((const uint8_t *) &x_sc[i*(WARP_SIZE/4) + i/4 + kbx*4]) + ky/4;
const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4;
const int index_y = j * WARP_SIZE + (QR2_K*k0) % WARP_SIZE;
sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q2_K_q8_1_impl_mmq(
v, &y_qs[j*MMQ_TILE_Y_K + (QR2_K*k0) % WARP_SIZE], scales,
x_dm[i*(WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[j*MMQ_TILE_Y_K + ((QR2_K*k0) % WARP_SIZE)/QI8_1]);
v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]);
}
}
}
@@ -671,11 +646,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q3_K_q8_1_mul_mat(
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y, float * __restrict__ sum, const int & k0) {
const float * x_dmf = (const float *) x_dm;
const int * y_qs = (const int *) y + 4;
const float * y_df = (const float *) y;
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
@@ -687,6 +658,8 @@ static __device__ __forceinline__ void vec_dot_q3_K_q8_1_mul_mat(
const int kbx = k0 / QI3_K;
const int ky = (k0 % QI3_K) * QR3_K;
const float * x_dmf = (const float *) x_dm;
const float * y_df = (const float *) y_ds;
const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
@@ -694,19 +667,19 @@ static __device__ __forceinline__ void vec_dot_q3_K_q8_1_mul_mat(
#pragma unroll
for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) {
const int kqsx = i*(WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2);
const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2);
const int shift = 2 * ((ky % 32) / 8);
const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303;
const int vh = x_qh[i*(WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8);
const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8);
const int vlh = (vh << 2) & 0x04040404;
v[l] = __vsubss4(vll, vlh);
}
const int index_y = j * WARP_SIZE + (k0*QR3_K) % WARP_SIZE;
sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q3_K_q8_1_impl_mmq(
v, &y_qs[j*MMQ_TILE_Y_K + (k0*QR3_K) % WARP_SIZE], scales,
x_dmf[i*(WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[j*MMQ_TILE_Y_K + ((k0*QR3_K) % WARP_SIZE)/QI8_1]);
v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]);
}
}
}
@@ -773,13 +746,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q4_K_q8_1_mul_mat(
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y, float * __restrict__ sum, const int & k0) {
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
GGML_UNUSED(x_qh);
const int * y_qs = (const int *) y + 4;
const half2 * y_ds = (const half2 *) y;
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
const int j = j0 + threadIdx.y;
@@ -790,9 +760,9 @@ static __device__ __forceinline__ void vec_dot_q4_K_q8_1_mul_mat(
const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k0/16]) + 2*((k0 % 16) / 8);
const int index_y = j * WARP_SIZE + (QR4_K*k0) % WARP_SIZE;
sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q4_K_q8_1_impl_mmq(
&x_ql[i*(WARP_SIZE + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + (QR4_K*k0) % WARP_SIZE], sc, sc+8,
x_dm[i*(WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[j*MMQ_TILE_Y_K + ((QR4_K*k0) % WARP_SIZE)/QI8_1]);
&x_ql[i * (WARP_SIZE + 1) + k0], &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
}
}
}
@@ -872,13 +842,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q5_K_q8_1_mul_mat(
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y, float * __restrict__ sum, const int & k0) {
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
GGML_UNUSED(x_qh);
const int * y_qs = (const int *) y + 4;
const half2 * y_ds = (const half2 *) y;
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
const int j = j0 + threadIdx.y;
@@ -889,9 +856,10 @@ static __device__ __forceinline__ void vec_dot_q5_K_q8_1_mul_mat(
const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k0/16]) + 2 * ((k0 % 16) / 8);
const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k0;
const int index_y = j * WARP_SIZE + (QR5_K*k0) % WARP_SIZE;
sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q5_K_q8_1_impl_mmq(
&x_ql[i*(QR5_K*WARP_SIZE + 1) + QR5_K*k0], &y_qs[j*MMQ_TILE_Y_K + (QR5_K*k0) % WARP_SIZE], sc, sc+8,
x_dm[i*(WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[j*MMQ_TILE_Y_K + ((QR5_K*k0) % WARP_SIZE)/QI8_1]);
&x_ql[index_x], &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
}
}
}
@@ -964,14 +932,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mul_mat(
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y, float * __restrict__ sum, const int & k0) {
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
GGML_UNUSED(x_qh);
const float * x_dmf = (const float *) x_dm;
const int * y_qs = (const int *) y + 4;
const float * y_df = (const float *) y;
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
const int j = j0 + threadIdx.y;
@@ -980,11 +944,15 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mul_mat(
for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
const int i = i0 + threadIdx.x;
const float * x_dmf = (const float *) x_dm;
const float * y_df = (const float *) y_ds;
const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k0/8]);
const int index_x = i * (QR6_K*WARP_SIZE + 1) + QR6_K*k0;
const int index_y = j * WARP_SIZE + (QR6_K*k0) % WARP_SIZE;
sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q6_K_q8_1_impl_mmq(
&x_ql[i*(QR6_K*WARP_SIZE + 1) + QR6_K*k0], &y_qs[j*MMQ_TILE_Y_K + (QR6_K*k0) % WARP_SIZE], sc,
x_dmf[i*(WARP_SIZE/QI6_K) + i/QI6_K], &y_df[j*MMQ_TILE_Y_K + ((QR6_K*k0) % WARP_SIZE)/QI8_1]);
&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]);
}
}
}
@@ -996,6 +964,7 @@ struct mmq_type_traits;
template <int mmq_x, int mmq_y, int nwarps, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q4_0> {
static constexpr bool need_sum = true;
static constexpr int vdr = VDR_Q4_0_Q8_1_MMQ;
static constexpr load_tiles_mmq_t load_tiles = load_tiles_q4_0<mmq_y, nwarps, need_check>;
static constexpr vec_dot_mmq_t vec_dot = vec_dot_q4_0_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
@@ -1003,6 +972,7 @@ struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q4_0> {
template <int mmq_x, int mmq_y, int nwarps, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q4_1> {
static constexpr bool need_sum = true;
static constexpr int vdr = VDR_Q4_1_Q8_1_MMQ;
static constexpr load_tiles_mmq_t load_tiles = load_tiles_q4_1<mmq_y, nwarps, need_check>;
static constexpr vec_dot_mmq_t vec_dot = vec_dot_q4_1_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
@@ -1010,6 +980,7 @@ struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q4_1> {
template <int mmq_x, int mmq_y, int nwarps, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q5_0> {
static constexpr bool need_sum = false;
static constexpr int vdr = VDR_Q5_0_Q8_1_MMQ;
static constexpr load_tiles_mmq_t load_tiles = load_tiles_q5_0<mmq_y, nwarps, need_check>;
static constexpr vec_dot_mmq_t vec_dot = vec_dot_q5_0_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
@@ -1017,6 +988,7 @@ struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q5_0> {
template <int mmq_x, int mmq_y, int nwarps, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q5_1> {
static constexpr bool need_sum = true;
static constexpr int vdr = VDR_Q5_1_Q8_1_MMQ;
static constexpr load_tiles_mmq_t load_tiles = load_tiles_q5_1<mmq_y, nwarps, need_check>;
static constexpr vec_dot_mmq_t vec_dot = vec_dot_q5_1_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
@@ -1024,6 +996,7 @@ struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q5_1> {
template <int mmq_x, int mmq_y, int nwarps, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q8_0> {
static constexpr bool need_sum = false;
static constexpr int vdr = VDR_Q8_0_Q8_1_MMQ;
static constexpr load_tiles_mmq_t load_tiles = load_tiles_q8_0<mmq_y, nwarps, need_check>;
static constexpr vec_dot_mmq_t vec_dot = vec_dot_q8_0_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
@@ -1031,6 +1004,7 @@ struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q8_0> {
template <int mmq_x, int mmq_y, int nwarps, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q2_K> {
static constexpr bool need_sum = false;
static constexpr int vdr = VDR_Q2_K_Q8_1_MMQ;
static constexpr load_tiles_mmq_t load_tiles = load_tiles_q2_K<mmq_y, nwarps, need_check>;
static constexpr vec_dot_mmq_t vec_dot = vec_dot_q2_K_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
@@ -1038,6 +1012,7 @@ struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q2_K> {
template <int mmq_x, int mmq_y, int nwarps, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q3_K> {
static constexpr bool need_sum = false;
static constexpr int vdr = VDR_Q3_K_Q8_1_MMQ;
static constexpr load_tiles_mmq_t load_tiles = load_tiles_q3_K<mmq_y, nwarps, need_check>;
static constexpr vec_dot_mmq_t vec_dot = vec_dot_q3_K_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
@@ -1045,6 +1020,7 @@ struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q3_K> {
template <int mmq_x, int mmq_y, int nwarps, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q4_K> {
static constexpr bool need_sum = true;
static constexpr int vdr = VDR_Q4_K_Q8_1_MMQ;
static constexpr load_tiles_mmq_t load_tiles = load_tiles_q4_K<mmq_y, nwarps, need_check>;
static constexpr vec_dot_mmq_t vec_dot = vec_dot_q4_K_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
@@ -1052,6 +1028,7 @@ struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q4_K> {
template <int mmq_x, int mmq_y, int nwarps, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q5_K> {
static constexpr bool need_sum = true;
static constexpr int vdr = VDR_Q5_K_Q8_1_MMQ;
static constexpr load_tiles_mmq_t load_tiles = load_tiles_q5_K<mmq_y, nwarps, need_check>;
static constexpr vec_dot_mmq_t vec_dot = vec_dot_q5_K_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
@@ -1059,36 +1036,12 @@ struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q5_K> {
template <int mmq_x, int mmq_y, int nwarps, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q6_K> {
static constexpr bool need_sum = false;
static constexpr int vdr = VDR_Q6_K_Q8_1_MMQ;
static constexpr load_tiles_mmq_t load_tiles = load_tiles_q6_K<mmq_y, nwarps, need_check>;
static constexpr vec_dot_mmq_t vec_dot = vec_dot_q6_K_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
};
static int mmq_need_sum(const ggml_type type_x) {
switch (type_x) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
return true;
case GGML_TYPE_Q5_0:
return false;
case GGML_TYPE_Q5_1:
return true;
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:
return false;
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K:
return true;
case GGML_TYPE_Q6_K:
return false;
default:
GGML_ASSERT(false);
break;
}
return false;
}
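
mmq_need_sum above reintroduces, as a runtime switch, the same per-type flag that the deleted need_sum trait carried at compile time; the quantize launcher uses it to pick a template instantiation. A reduced sketch of that dispatch pattern (kernel body elided, names illustrative):

    template <bool need_sum>
    static __global__ void quantize_mmq_q8_1_sketch(/* ... */) {
        // need_sum == true  -> store make_half2(d, sum)
        // need_sum == false -> store only the f32 scale d
    }

    static void launch_quantize(bool need_sum /* = mmq_need_sum(type_x) */) {
        if (need_sum) {
            // quantize_mmq_q8_1_sketch<true><<<grid, block, 0, stream>>>(...);
        } else {
            // quantize_mmq_q8_1_sketch<false><<<grid, block, 0, stream>>>(...);
        }
    }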
template <ggml_type type, int mmq_x, int nwarps, bool need_check>
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
@@ -1103,7 +1056,7 @@ template <ggml_type type, int mmq_x, int nwarps, bool need_check>
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
static __global__ void mul_mat_q(
const char * __restrict__ x, const char * __restrict__ yc, float * __restrict__ dst,
const int ne00, const int ne01, const int stride01, const int ne10, const int ne11, const int stride11, const int ne0) {
const int ne00, const int ne01, const int stride00, const int ne10, const int ne11, const int ne0) {
// Skip unused template specializations for faster compilation:
if (mmq_x > get_mmq_x_max_device()) {
@@ -1115,6 +1068,7 @@ static __global__ void mul_mat_q(
constexpr int qr = ggml_cuda_type_traits<type>::qr;
constexpr int qi = ggml_cuda_type_traits<type>::qi;
constexpr int mmq_y = get_mmq_y_device(mmq_x);
constexpr bool need_sum = mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, type>::need_sum;
constexpr int vdr = mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, type>::vdr;
constexpr load_tiles_mmq_t load_tiles = mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, type>::load_tiles;
constexpr vec_dot_mmq_t vec_dot = mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, type>::vec_dot;
@@ -1126,38 +1080,62 @@ static __global__ void mul_mat_q(
half2 * tile_x_dm = (half2 *) (tile_x_ql + txs.ql);
int * tile_x_qh = (int *) (tile_x_dm + txs.dm);
int * tile_x_sc = (int *) (tile_x_qh + txs.qh);
int * tile_y = (int *) (tile_x_sc + txs.sc); // [mmq_x * (WARP_SIZE + WARP_SIZE/QI8_1)]
int * tile_y_qs = (int *) (tile_x_sc + txs.sc); // [mmq_x * WARP_SIZE]
half2 * tile_y_ds = (half2 *) (tile_y_qs + mmq_x*WARP_SIZE); // [mmq_x * WARP_SIZE/QI8_1];
const block_q8_1 * y = (const block_q8_1 *) yc;
const int blocks_per_row_x = ne00 / qk;
const int blocks_per_col_y = ne10 / QK8_1;
const int blocks_per_warp = WARP_SIZE / qi;
const int & ne1 = ne11;
const int tile_x_max_i = ne01 - blockIdx.x*mmq_y - 1;
const int * y = (const int *) yc + blockIdx.y*(mmq_x*sizeof(block_q8_1_mmq)/sizeof(int));
float sum[(mmq_x/nwarps) * (mmq_y/WARP_SIZE)] = {0.0f};
for (int kb0 = 0; kb0 < blocks_per_row_x; kb0 += blocks_per_warp) {
load_tiles(x, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, stride01*blockIdx.x*mmq_y + kb0, tile_x_max_i, stride01);
load_tiles(x, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, stride00*blockIdx.x*mmq_y + kb0, tile_x_max_i, stride00);
#pragma unroll
for (int kr = 0; kr < qr; ++kr) {
const int * by0 = y + stride11*(kb0*(qk*sizeof(block_q8_1_mmq) / (4*QK8_1*sizeof(int))) + kr*sizeof(block_q8_1_mmq)/sizeof(int));
#pragma unroll
for (int l0 = 0; l0 < mmq_x*MMQ_TILE_Y_K; l0 += nwarps*WARP_SIZE) {
int l = l0 + threadIdx.y*WARP_SIZE + threadIdx.x;
const int kqs = kr*WARP_SIZE + threadIdx.x;
const int kbxd = kqs / QI8_1;
tile_y[l] = by0[l];
#pragma unroll
for (int i0 = 0; i0 < mmq_x; i0 += nwarps) {
const int i = min(blockIdx.y*mmq_x + threadIdx.y + i0, ne11-1); // to prevent out-of-bounds memory accesses
const block_q8_1 * by0 = &y[i*blocks_per_col_y + kb0 * (qk/QK8_1) + kbxd];
const int index_y = (i0 + threadIdx.y) * WARP_SIZE + kqs % WARP_SIZE;
tile_y_qs[index_y] = get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1);
}
#pragma unroll
for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) {
const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE/QI8_1)) % mmq_x;
const int kby = threadIdx.x % (WARP_SIZE/QI8_1);
const int i_y_eff = min(blockIdx.y*mmq_x + ids, ne11-1);
// if the sum is not needed it's faster to transform the scale to f32 ahead of time
const half2 * dsi_src = &y[i_y_eff*blocks_per_col_y + kb0 * (qk/QK8_1) + kr*(WARP_SIZE/QI8_1) + kby].ds;
half2 * dsi_dst = &tile_y_ds[ids * (WARP_SIZE/QI8_1) + kby];
if (need_sum) {
*dsi_dst = *dsi_src;
} else {
float * dfi_dst = (float *) dsi_dst;
*dfi_dst = __low2float(*dsi_src);
}
}
__syncthreads();
// #pragma unroll // unrolling this loop causes too much register pressure
for (int k0 = kr*WARP_SIZE/qr; k0 < (kr+1)*WARP_SIZE/qr; k0 += vdr) {
vec_dot(tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y, sum, k0);
vec_dot(tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds, sum, k0);
}
__syncthreads();
@@ -1187,8 +1165,8 @@ static __global__ void mul_mat_q(
struct mmq_args {
const char * x; const char * y; float * dst;
int64_t ne00; int64_t ne01; int64_t stride01;
int64_t ne10; int64_t ne11; int64_t stride11;
int64_t ne00; int64_t ne01; int64_t stride00;
int64_t ne10; int64_t ne11;
int64_t ne0;
};
@@ -1206,7 +1184,7 @@ static void launch_mul_mat_q(const mmq_args & args, cudaStream_t stream) {
const tile_x_sizes txs = get_tile_x_sizes_host(type, mmq_y);
const int shmem_x = txs.ql*sizeof(int) + txs.dm*sizeof(half2) + txs.qh*sizeof(int) + txs.sc*sizeof(int);
const int shmem_y = mmq_x*WARP_SIZE*sizeof(int) + mmq_x*(WARP_SIZE/QI8_1)*sizeof(half2);
const int shmem = shmem_x + GGML_PAD(shmem_y, nwarps*WARP_SIZE*sizeof(int));
const int shmem = shmem_x + shmem_y;
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
static bool shmem_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
@@ -1220,11 +1198,11 @@ static void launch_mul_mat_q(const mmq_args & args, cudaStream_t stream) {
if (args.ne01 % mmq_y == 0) {
const bool need_check = false;
mul_mat_q<type, mmq_x, nwarps, need_check><<<block_nums, block_dims, shmem, stream>>>
(args.x, args.y, args.dst, args.ne00, args.ne01, args.stride01, args.ne10, args.ne11, args.stride11, args.ne0);
(args.x, args.y, args.dst, args.ne00, args.ne01, args.stride00, args.ne10, args.ne11, args.ne0);
} else {
const bool need_check = true;
mul_mat_q<type, mmq_x, nwarps, need_check><<<block_nums, block_dims, shmem, stream>>>
(args.x, args.y, args.dst, args.ne00, args.ne01, args.stride01, args.ne10, args.ne11, args.stride11, args.ne0);
(args.x, args.y, args.dst, args.ne00, args.ne01, args.stride00, args.ne10, args.ne11, args.ne0);
}
}
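
The shmem computation above budgets the x tile plus the y tile; the e95beeb side additionally pads shmem_y to a multiple of nwarps*WARP_SIZE*sizeof(int) for its interleaved copy loop, while the ee459f40 side takes the plain sum. As a worked example of shmem_y, assuming mmq_x = 64, WARP_SIZE = 32, and QI8_1 = 8 (illustrative values only):

    const int mmq_x = 64, WARP_SIZE = 32, QI8_1 = 8;    // assumed example values
    const int qs_bytes = mmq_x * WARP_SIZE * 4;         // 64*32 ints   = 8192 B of quants
    const int ds_bytes = mmq_x * (WARP_SIZE/QI8_1) * 4; // 64*4 half2s  = 1024 B of scales
    const int shmem_y  = qs_bytes + ds_bytes;           // 9216 B for the y tile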

@@ -1,23 +1,22 @@
#include "quantize.cuh"
#include <cstdint>
static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int64_t kx, const int64_t kx0_padded) {
const int64_t ix0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int64_t kx, const int64_t kx_padded) {
const int64_t ix = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
if (ix0 >= kx0_padded) {
if (ix >= kx_padded) {
return;
}
const int64_t ix1 = blockIdx.y;
const int64_t iy = (int64_t)blockDim.y*blockIdx.y + threadIdx.y;
const int64_t i_padded = ix1*kx0_padded + ix0;
const int64_t i_padded = (int64_t)iy*kx_padded + ix;
block_q8_1 * y = (block_q8_1 *) vy;
const int64_t ib = i_padded / QK8_1; // block index
const int64_t iqs = i_padded % QK8_1; // quant index
const float xi = ix0 < kx ? x[ix1*kx + ix0] : 0.0f;
const float xi = ix < kx ? x[iy*kx + ix] : 0.0f;
float amax = fabsf(xi);
float sum = xi;
@@ -37,76 +36,10 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
reinterpret_cast<half&>(y[ib].ds.y) = sum;
}
template <bool need_sum>
static __global__ void quantize_mmq_q8_1(
const float * __restrict__ x, void * __restrict__ vy, const int64_t kx0, const int64_t kx1, const int64_t kx0_padded) {
const int64_t ix0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
if (ix0 >= kx0_padded) {
return;
}
const int64_t ix1 = kx1*blockIdx.z + blockIdx.y;
block_q8_1_mmq * y = (block_q8_1_mmq *) vy;
const int64_t ib0 = blockIdx.z*(gridDim.y*gridDim.x*blockDim.x/(4*QK8_1)); // first block of channel
const int64_t ib = ib0 + (ix0 / (4*QK8_1))*kx1 + blockIdx.y; // block index in channel
const int64_t iqs = ix0 % (4*QK8_1); // quant index in block
const float xi = ix0 < kx0 ? x[ix1*kx0 + ix0] : 0.0f;
float amax = fabsf(xi);
amax = warp_reduce_max(amax);
float sum;
if (need_sum) {
sum = warp_reduce_sum(xi);
}
const float d = amax / 127;
const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);
y[ib].qs[iqs] = q;
if (iqs % QK8_1 != 0) {
return;
}
if (need_sum) {
y[ib].ds[iqs/QK8_1] = make_half2(d, sum);
} else {
((float *) y[ib].ds)[iqs/QK8_1] = d;
}
}
void quantize_row_q8_1_cuda(
const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels,
const int64_t kx0_padded, const ggml_type type_x, cudaStream_t stream) {
GGML_ASSERT(kx0_padded % QK8_1 == 0);
const int64_t block_num_x = (kx0_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
const dim3 num_blocks(block_num_x, kx1*channels, 1);
void quantize_row_q8_1_cuda(const float * x, void * vy, const int64_t kx, const int64_t ky, const int64_t kx_padded, cudaStream_t stream) {
const int64_t block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
const dim3 num_blocks(block_num_x, ky, 1);
const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1);
quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx0_padded);
GGML_UNUSED(type_x);
quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
}
void quantize_mmq_q8_1_cuda(
const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels,
const int64_t kx0_padded, const ggml_type type_x, cudaStream_t stream) {
GGML_ASSERT(kx0_padded % (4*QK8_1) == 0);
const int64_t block_num_x = (kx0_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
const dim3 num_blocks(block_num_x, kx1, channels);
const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1);
if (mmq_need_sum(type_x)) {
quantize_mmq_q8_1<true><<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded);
} else {
quantize_mmq_q8_1<false><<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded);
}
}
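
Both quantize kernels above map a padded flat position to a q8_1 block index and a quant slot inside it. A host-side sketch of the same index arithmetic, assuming QK8_1 = 32 (from the quant headers, not restated in this hunk):

    #include <cassert>
    #include <cstdint>

    constexpr int64_t QK8_1 = 32; // assumed block size

    struct q8_1_pos { int64_t ib, iqs; };

    // Mirrors quantize_q8_1: (row, column) in a kx_padded-wide matrix -> block/slot.
    static q8_1_pos q8_1_index(int64_t iy, int64_t ix, int64_t kx_padded) {
        const int64_t i_padded = iy*kx_padded + ix;
        return { i_padded / QK8_1, i_padded % QK8_1 };
    }

    int main() {
        const q8_1_pos p = q8_1_index(1, 40, 128); // row 1, column 40, padded width 128
        assert(p.ib == 5 && p.iqs == 8);           // (128 + 40) = 168 -> block 5, slot 8
        return 0;
    }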

@@ -1,20 +1,5 @@
#pragma once
#include "common.cuh"
#include "mmq.cuh"
#include <cstdint>
#define CUDA_QUANTIZE_BLOCK_SIZE 256
typedef void (*quantize_cuda_t)(
const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded,
const ggml_type type_x, cudaStream_t stream);
void quantize_row_q8_1_cuda(
const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded,
const ggml_type type_x, cudaStream_t stream);
void quantize_mmq_q8_1_cuda(
const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded,
const ggml_type type_x, cudaStream_t stream);
void quantize_row_q8_1_cuda(const float * x, void * vy, const int64_t kx, const int64_t ky, const int64_t kx_padded, cudaStream_t stream);

llama/ggml-impl.h vendored (2 changes)

@@ -1,5 +1,5 @@
/**
* llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
* llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*

@@ -1,5 +1,5 @@
/**
* llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
* llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*

llama/ggml-metal.h vendored (2 changes)

@@ -1,5 +1,5 @@
/**
* llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
* llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*

@@ -1,5 +1,5 @@
/**
* llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
* llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*

llama/ggml-quants.c vendored (2 changes)

@@ -1,5 +1,5 @@
/**
* llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
* llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*

llama/ggml-quants.h vendored (2 changes)

@@ -1,5 +1,5 @@
/**
* llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
* llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*

llama/ggml.c vendored (2 changes)

@@ -1,5 +1,5 @@
/**
* llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
* llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*

llama/ggml.h vendored (2 changes)

@@ -1,5 +1,5 @@
/**
* llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
* llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*

@@ -1,5 +1,5 @@
/**
* llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
* llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*

@@ -1,5 +1,5 @@
/**
* llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
* llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*

@@ -1,5 +1,5 @@
/**
* llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
* llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*

@@ -1,5 +1,5 @@
/**
* llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
* llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*

llama/llama.cpp vendored (10 changes)

@@ -1,5 +1,5 @@
/**
* llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
* llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*
@@ -15255,14 +15255,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
if (imatrix_data) {
LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
qs.has_imatrix = true;
// check imatrix for nans or infs
for (const auto & kv : *imatrix_data) {
for (float f : kv.second) {
if (!std::isfinite(f)) {
throw std::runtime_error(format("imatrix contains non-finite value %f\n", f));
}
}
}
}
}

@@ -2,12 +2,12 @@ package llama
// #cgo CFLAGS: -std=c11 -DNDEBUG -DLOG_DISABLE_LOGS
// #cgo CXXFLAGS: -std=c++11 -DNDEBUG -DLOG_DISABLE_LOGS
// #cgo darwin,arm64 CFLAGS: -DGGML_USE_METAL -DGGML_USE_ACCELERATE -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
// #cgo darwin,arm64 CXXFLAGS: -DGGML_USE_METAL -DGGML_USE_ACCELERATE -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
// #cgo darwin,arm64 LDFLAGS: -framework Foundation -framework Metal -framework MetalKit -framework Accelerate
// #cgo darwin,arm64 CFLAGS: -DGGML_USE_METAL -DGGML_USE_ACCELERATE -DGGML_METAL_EMBED_LIBRARY -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
// #cgo darwin,arm64 CXXFLAGS: -DGGML_USE_METAL -DGGML_USE_ACCELERATE -DGGML_METAL_EMBED_LIBRARY -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
// #cgo darwin,arm64 LDFLAGS: ${SRCDIR}/ggml-metal.o -framework Foundation -framework Metal -framework MetalKit -framework Accelerate
// #cgo darwin,amd64 CFLAGS: -Wno-incompatible-pointer-types-discards-qualifiers
// #cgo darwin,amd64 CXXFLAGS: -Wno-incompatible-pointer-types-discards-qualifiers
// #cgo darwin,amd64 LDFLAGS: -framework Foundation
// #cgo darwin,amd64 LDFLAGS: ${SRCDIR}/ggml-metal.o -framework Foundation
// #cgo darwin,amd64,avx2 CFLAGS: -DGGML_USE_ACCELERATE -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
// #cgo darwin,amd64,avx2 CXXFLAGS: -DGGML_USE_ACCELERATE -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
// #cgo darwin,amd64,avx2 LDFLAGS: -framework Accelerate

llama/llama.h vendored (2 changes)

@@ -1,5 +1,5 @@
/**
* llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
* llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*

llama/llava.cpp vendored (2 changes)

@@ -1,5 +1,5 @@
/**
* llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
* llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*

llama/llava.h vendored (2 changes)

@@ -1,5 +1,5 @@
/**
* llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
* llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*

llama/log.h vendored (2 changes)

@@ -1,5 +1,5 @@
/**
* llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
* llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*

llama/sampling.cpp vendored (2 changes)

@@ -1,5 +1,5 @@
/**
* llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
* llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*

llama/sampling.h vendored (2 changes)

@@ -1,5 +1,5 @@
/**
* llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
* llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*

llama/stb_image.h vendored (2 changes)

@@ -1,5 +1,5 @@
/**
* llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
* llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*

@@ -1,5 +1,5 @@
/**
* llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
* llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*

@@ -1,5 +1,5 @@
/**
* llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
* llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*

llama/unicode.cpp vendored (2 changes)

@@ -1,5 +1,5 @@
/**
* llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
* llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*

llama/unicode.h vendored (2 changes)

@@ -1,5 +1,5 @@
/**
* llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
* llama.cpp - git ee459f40f65810a810151b24eba5b8bd174ceffe
*
* MIT License
*