From 542476682b1124971719e2f4b1cb10972b387e99 Mon Sep 17 00:00:00 2001 From: sentseven Date: Wed, 1 Apr 2026 11:34:21 -0500 Subject: [PATCH] feat: add PrismML Q1_0/Q1_0_G128 1-bit ternary quantization support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Port PrismML's custom Q1_0 (32-element blocks) and Q1_0_G128 (128-element blocks) 1-bit ternary quantization types into the turboquant fork. This enables loading and running Bonsai 1-bit models (1.7B, 4B, 8B) which use these types. Changes: - ggml.h: Add GGML_TYPE_Q1_0=42, GGML_TYPE_Q1_0_G128=43, bump COUNT to 44 Add GGML_FTYPE_MOSTLY_Q1_0=27, GGML_FTYPE_MOSTLY_Q1_0_G128=28 - ggml-common.h: Add block_q1_0 (6 bytes) and block_q1_0_g128 (18 bytes) struct definitions with static_asserts - ggml-quants.h/c: Add quantize_row, dequantize_row, and quantize functions for both Q1_0 and Q1_0_G128 - ggml.c: Add type_traits entries, ftype switch cases, quantize dispatch - ggml-cpu/ggml-cpu.c: Register CPU type traits with vec_dot support - ggml-cpu/quants.h/c: Add vec_dot implementations (Q1_0 x Q8_0 and Q1_0_G128 x Q8_0) and quantize_row wrappers - ggml-cpu/ops.cpp: Add Q1_0/Q1_0_G128 to the quantized-type switch statements in add, add1, acc, out_prod, set, get_rows and clamp - gguf.cpp: Add PrismML compatibility remap — when general.file_type indicates a PrismML model (ftype 40 or 41), remap tensor type IDs 40->42 (Q1_0) and 41->43 (Q1_0_G128) to avoid collision with NVFP4 (40) and TQ3_0 (41) - gguf-py/constants.py: Add Python enum entries and block size mappings for Q1_0, Q1_0_G128 and TQ3_0 Type ID conflict resolution: PrismML uses type 40=Q1_0, 41=Q1_0_g128 Turboquant uses type 40=NVFP4, 41=TQ3_0 This port assigns Q1_0=42, Q1_0_G128=43 and remaps at GGUF load time Algorithm: Despite the "ternary" label, each value is encoded as a single sign bit, giving two levels per block (1=+scale for values >= 0, 0=-scale otherwise), with scale = mean(abs(block_values)). 
Tested: Bonsai-4B and Bonsai-8B GGUFs load successfully and produce coherent output on CPU inference (0.7 t/s for 4B, 0.3 t/s for 8B). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- ggml/include/ggml.h | 6 ++- ggml/src/ggml-common.h | 18 ++++++++ ggml/src/ggml-cpu/ggml-cpu.c | 12 +++++ ggml/src/ggml-cpu/ops.cpp | 14 ++++++ ggml/src/ggml-cpu/quants.c | 77 ++++++++++++++++++++++++++++++++ ggml/src/ggml-cpu/quants.h | 6 +++ ggml/src/ggml-quants.c | 86 ++++++++++++++++++++++++++++++++++++ ggml/src/ggml-quants.h | 9 ++++ ggml/src/ggml.c | 20 +++++++++ ggml/src/gguf.cpp | 27 +++++++++++ gguf-py/gguf/constants.py | 8 ++++ 11 files changed, 282 insertions(+), 1 deletion(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 509ac1eb8..f0475320e 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -429,7 +429,9 @@ extern "C" { GGML_TYPE_MXFP4 = 39, // MXFP4 (1 block) GGML_TYPE_NVFP4 = 40, // NVFP4 (4 blocks, E4M3 scale) GGML_TYPE_TQ3_0 = 41, // TurboQuant 3-bit polar + QJL (no per-block scale) - GGML_TYPE_COUNT = 42, + GGML_TYPE_Q1_0 = 42, // PrismML 1-bit ternary (32-element blocks) + GGML_TYPE_Q1_0_G128 = 43, // PrismML 1-bit ternary (128-element blocks) + GGML_TYPE_COUNT = 44, }; // precision @@ -466,6 +468,8 @@ extern "C" { GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors GGML_FTYPE_MOSTLY_MXFP4 = 25, // except 1d tensors GGML_FTYPE_MOSTLY_NVFP4 = 26, // except 1d tensors + GGML_FTYPE_MOSTLY_Q1_0 = 27, // except 1d tensors (PrismML 1-bit) + GGML_FTYPE_MOSTLY_Q1_0_G128 = 28, // except 1d tensors (PrismML 1-bit g128) }; // available tensor operations: diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h index f8433333e..5ce668558 100644 --- a/ggml/src/ggml-common.h +++ b/ggml/src/ggml-common.h @@ -281,6 +281,24 @@ typedef struct { } block_tq3_0; static_assert(sizeof(block_tq3_0) == QK_TQ3_0/4 + QK_TQ3_0/8 + sizeof(ggml_half), "wrong tq3_0 block size/padding"); +// PrismML Q1_0: 1-bit ternary quantization (32-element 
blocks) +// Each value quantized as sign bit: bit=1 → +scale, bit=0 → −scale +// scale = mean(abs(values)) per block +#define QK1_0 32 +typedef struct { + ggml_half d; // scale (mean absolute value) + uint8_t qs[QK1_0 / 8]; // sign bits: 32 × 1 bit = 4 bytes +} block_q1_0; +static_assert(sizeof(block_q1_0) == sizeof(ggml_half) + QK1_0/8, "wrong q1_0 block size/padding"); + +// PrismML Q1_0_G128: 1-bit ternary quantization (128-element blocks) +#define QK1_0_G128 128 +typedef struct { + ggml_half d; // scale + uint8_t qs[QK1_0_G128 / 8]; // sign bits: 128 × 1 bit = 16 bytes +} block_q1_0_g128; +static_assert(sizeof(block_q1_0_g128) == sizeof(ggml_half) + QK1_0_G128/8, "wrong q1_0_g128 block size/padding"); + // // Super-block quantization structures // diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index e039c43a2..2dc7cc8a0 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -394,6 +394,18 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = { .from_float = quantize_row_tq3_0, .nrows = 1, }, + [GGML_TYPE_Q1_0] = { + .from_float = quantize_row_q1_0, + .vec_dot = ggml_vec_dot_q1_0_q8_0, + .vec_dot_type = GGML_TYPE_Q8_0, + .nrows = 1, + }, + [GGML_TYPE_Q1_0_G128] = { + .from_float = quantize_row_q1_0_g128, + .vec_dot = ggml_vec_dot_q1_0_g128_q8_0, + .vec_dot_type = GGML_TYPE_Q8_0, + .nrows = 1, + }, [GGML_TYPE_I32] = { .from_float = (ggml_from_float_t) ggml_cpu_fp32_to_i32, }, diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 9b8de3eea..10a9dbde7 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -679,6 +679,8 @@ void ggml_compute_forward_add( case GGML_TYPE_TQ1_0: case GGML_TYPE_TQ2_0: case GGML_TYPE_TQ3_0: + case GGML_TYPE_Q1_0: + case GGML_TYPE_Q1_0_G128: case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: @@ -1130,6 +1132,8 @@ void ggml_compute_forward_add1( case GGML_TYPE_TQ1_0: case GGML_TYPE_TQ2_0: case 
GGML_TYPE_TQ3_0: + case GGML_TYPE_Q1_0: + case GGML_TYPE_Q1_0_G128: case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: @@ -1260,6 +1264,8 @@ void ggml_compute_forward_acc( case GGML_TYPE_TQ1_0: case GGML_TYPE_TQ2_0: case GGML_TYPE_TQ3_0: + case GGML_TYPE_Q1_0: + case GGML_TYPE_Q1_0_G128: case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: @@ -4349,6 +4355,8 @@ void ggml_compute_forward_out_prod( case GGML_TYPE_TQ1_0: case GGML_TYPE_TQ2_0: case GGML_TYPE_TQ3_0: + case GGML_TYPE_Q1_0: + case GGML_TYPE_Q1_0_G128: case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: @@ -4626,6 +4634,8 @@ void ggml_compute_forward_set( case GGML_TYPE_TQ1_0: case GGML_TYPE_TQ2_0: case GGML_TYPE_TQ3_0: + case GGML_TYPE_Q1_0: + case GGML_TYPE_Q1_0_G128: case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: @@ -4850,6 +4860,8 @@ void ggml_compute_forward_get_rows( case GGML_TYPE_TQ1_0: case GGML_TYPE_TQ2_0: case GGML_TYPE_TQ3_0: + case GGML_TYPE_Q1_0: + case GGML_TYPE_Q1_0_G128: case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: @@ -5576,6 +5588,8 @@ void ggml_compute_forward_clamp( case GGML_TYPE_TQ1_0: case GGML_TYPE_TQ2_0: case GGML_TYPE_TQ3_0: + case GGML_TYPE_Q1_0: + case GGML_TYPE_Q1_0_G128: case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: diff --git a/ggml/src/ggml-cpu/quants.c b/ggml/src/ggml-cpu/quants.c index ccb3c6cbd..06134f032 100644 --- a/ggml/src/ggml-cpu/quants.c +++ b/ggml/src/ggml-cpu/quants.c @@ -114,6 +114,83 @@ void quantize_row_tq3_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, quantize_row_tq3_0_ref(x, y, k); } +void quantize_row_q1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + assert(k % QK1_0 == 0); + block_q1_0 * GGML_RESTRICT y = vy; + quantize_row_q1_0_ref(x, y, k); +} + +void quantize_row_q1_0_g128(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + assert(k % QK1_0_G128 == 0); + 
block_q1_0_g128 * GGML_RESTRICT y = vy; + quantize_row_q1_0_g128_ref(x, y, k); +} + +//===================================== Q1_0 vec_dot ================================= + +void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_0; + const int nb = n / qk; + + assert(n % qk == 0); + assert(nrc == 1); + UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs); + + const block_q1_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + float sumf = 0.0f; + + for (int i = 0; i < nb; i++) { + const float d0 = GGML_FP16_TO_FP32(x[i].d); + const float d1 = GGML_FP16_TO_FP32(y[i].d); + + int sumi = 0; + for (int j = 0; j < QK1_0; j++) { + const int xi = ((x[i].qs[j / 8] >> (j % 8)) & 1) ? 1 : -1; + sumi += xi * (int)y[i].qs[j]; + } + + sumf += d0 * d1 * (float)sumi; + } + + *s = sumf; +} + +void ggml_vec_dot_q1_0_g128_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK1_0_G128; + const int nb = n / qk; + + assert(n % qk == 0); + assert(nrc == 1); + UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs); + + const block_q1_0_g128 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + float sumf = 0.0f; + + for (int i = 0; i < nb; i++) { + const float d0 = GGML_FP16_TO_FP32(x[i].d); + + // Each Q1_0_g128 block spans 4 Q8_0 blocks (4 × 32 = 128) + for (int k = 0; k < 4; k++) { + const float d1 = GGML_FP16_TO_FP32(y[i * 4 + k].d); + int sumi = 0; + + for (int j = 0; j < QK8_0; j++) { + const int bit_index = k * QK8_0 + j; + const int xi = ((x[i].qs[bit_index / 8] >> (bit_index % 8)) & 1) ? 
1 : -1; + sumi += xi * (int)y[i * 4 + k].qs[j]; + } + + sumf += d0 * d1 * (float)sumi; + } + } + + *s = sumf; +} + //===================================== Q8_K ============================================== void quantize_row_q8_K_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { diff --git a/ggml/src/ggml-cpu/quants.h b/ggml/src/ggml-cpu/quants.h index f7060ebdd..1f76f40a5 100644 --- a/ggml/src/ggml-cpu/quants.h +++ b/ggml/src/ggml-cpu/quants.h @@ -33,6 +33,9 @@ void quantize_row_tq1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, i void quantize_row_tq2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); void quantize_row_tq3_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +void quantize_row_q1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +void quantize_row_q1_0_g128(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); + void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); @@ -55,6 +58,9 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q1_0_g128_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); + void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * 
GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_iq2_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 892306037..89af6037a 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -2496,6 +2496,92 @@ size_t quantize_tq3_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, return nrow * row_size; } +// ====================== PrismML Q1_0 1-bit ternary quantization ====================== + +void quantize_row_q1_0_ref(const float * GGML_RESTRICT x, block_q1_0 * GGML_RESTRICT y, int64_t k) { + assert(k % QK1_0 == 0); + const int64_t nb = k / QK1_0; + + for (int64_t i = 0; i < nb; i++) { + float amax = 0.0f; + for (int j = 0; j < QK1_0; j++) { + amax += fabsf(x[i * QK1_0 + j]); + } + const float d = amax / QK1_0; + y[i].d = GGML_FP32_TO_FP16(d); + + memset(y[i].qs, 0, sizeof(y[i].qs)); + for (int j = 0; j < QK1_0; j++) { + if (x[i * QK1_0 + j] >= 0.0f) { + y[i].qs[j / 8] |= (1 << (j % 8)); + } + } + } +} + +void dequantize_row_q1_0(const block_q1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { + assert(k % QK1_0 == 0); + const int64_t nb = k / QK1_0; + + for (int64_t i = 0; i < nb; i++) { + const float d = GGML_FP16_TO_FP32(x[i].d); + for (int j = 0; j < QK1_0; j++) { + const int bit = (x[i].qs[j / 8] >> (j % 8)) & 1; + y[i * QK1_0 + j] = bit ? 
d : -d; + } + } +} + +size_t quantize_q1_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { + (void)quant_weights; + const size_t row_size = ggml_row_size(GGML_TYPE_Q1_0, n_per_row); + quantize_row_q1_0_ref(src, dst, (int64_t)nrow * n_per_row); + return nrow * row_size; +} + +// ====================== PrismML Q1_0_G128 1-bit ternary (128-element blocks) ====================== + +void quantize_row_q1_0_g128_ref(const float * GGML_RESTRICT x, block_q1_0_g128 * GGML_RESTRICT y, int64_t k) { + assert(k % QK1_0_G128 == 0); + const int64_t nb = k / QK1_0_G128; + + for (int64_t i = 0; i < nb; i++) { + float amax = 0.0f; + for (int j = 0; j < QK1_0_G128; j++) { + amax += fabsf(x[i * QK1_0_G128 + j]); + } + const float d = amax / QK1_0_G128; + y[i].d = GGML_FP32_TO_FP16(d); + + memset(y[i].qs, 0, sizeof(y[i].qs)); + for (int j = 0; j < QK1_0_G128; j++) { + if (x[i * QK1_0_G128 + j] >= 0.0f) { + y[i].qs[j / 8] |= (1 << (j % 8)); + } + } + } +} + +void dequantize_row_q1_0_g128(const block_q1_0_g128 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { + assert(k % QK1_0_G128 == 0); + const int64_t nb = k / QK1_0_G128; + + for (int64_t i = 0; i < nb; i++) { + const float d = GGML_FP16_TO_FP32(x[i].d); + for (int j = 0; j < QK1_0_G128; j++) { + const int bit = (x[i].qs[j / 8] >> (j % 8)) & 1; + y[i * QK1_0_G128 + j] = bit ? 
d : -d; + } + } +} + +size_t quantize_q1_0_g128(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { + (void)quant_weights; + const size_t row_size = ggml_row_size(GGML_TYPE_Q1_0_G128, n_per_row); + quantize_row_q1_0_g128_ref(src, dst, (int64_t)nrow * n_per_row); + return nrow * row_size; +} + // ====================== "True" 2-bit (de)-quantization void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h index 59acb4132..ae0ef3bce 100644 --- a/ggml/src/ggml-quants.h +++ b/ggml/src/ggml-quants.h @@ -35,6 +35,9 @@ GGML_API void quantize_row_tq1_0_ref(const float * GGML_RESTRICT x, block_tq1_0 GGML_API void quantize_row_tq2_0_ref(const float * GGML_RESTRICT x, block_tq2_0 * GGML_RESTRICT y, int64_t k); GGML_API void quantize_row_tq3_0_ref(const float * GGML_RESTRICT x, block_tq3_0 * GGML_RESTRICT y, int64_t k); +GGML_API void quantize_row_q1_0_ref (const float * GGML_RESTRICT x, block_q1_0 * GGML_RESTRICT y, int64_t k); +GGML_API void quantize_row_q1_0_g128_ref(const float * GGML_RESTRICT x, block_q1_0_g128 * GGML_RESTRICT y, int64_t k); + GGML_API void quantize_row_iq3_xxs_ref(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k); GGML_API void quantize_row_iq4_nl_ref (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k); GGML_API void quantize_row_iq4_xs_ref (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k); @@ -63,6 +66,9 @@ GGML_API void dequantize_row_tq1_0(const block_tq1_0 * GGML_RESTRICT x, float * GGML_API void dequantize_row_tq2_0(const block_tq2_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); GGML_API void dequantize_row_tq3_0(const block_tq3_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +GGML_API void dequantize_row_q1_0 (const block_q1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t 
k); +GGML_API void dequantize_row_q1_0_g128(const block_q1_0_g128 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); + GGML_API void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); GGML_API void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); GGML_API void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); @@ -88,6 +94,9 @@ GGML_API size_t quantize_tq1_0(const float * GGML_RESTRICT src, void * GGML_REST GGML_API size_t quantize_tq2_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); GGML_API size_t quantize_tq3_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +GGML_API size_t quantize_q1_0 (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +GGML_API size_t quantize_q1_0_g128(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); + GGML_API size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); GGML_API size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); GGML_API size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index cf45a17ff..3a9dca5a2 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -912,6 +912,22 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .to_float = (ggml_to_float_t) dequantize_row_tq3_0, .from_float_ref = (ggml_from_float_t) quantize_row_tq3_0_ref, }, + [GGML_TYPE_Q1_0] = { + .type_name = "q1_0", + .blck_size = QK1_0, 
+ .type_size = sizeof(block_q1_0), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q1_0, + .from_float_ref = (ggml_from_float_t) quantize_row_q1_0_ref, + }, + [GGML_TYPE_Q1_0_G128] = { + .type_name = "q1_0_g128", + .blck_size = QK1_0_G128, + .type_size = sizeof(block_q1_0_g128), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q1_0_g128, + .from_float_ref = (ggml_from_float_t) quantize_row_q1_0_g128_ref, + }, }; const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) { @@ -1397,6 +1413,8 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { case GGML_FTYPE_MOSTLY_Q8_0: wtype = GGML_TYPE_Q8_0; break; case GGML_FTYPE_MOSTLY_MXFP4: wtype = GGML_TYPE_MXFP4; break; case GGML_FTYPE_MOSTLY_NVFP4: wtype = GGML_TYPE_NVFP4; break; + case GGML_FTYPE_MOSTLY_Q1_0: wtype = GGML_TYPE_Q1_0; break; + case GGML_FTYPE_MOSTLY_Q1_0_G128: wtype = GGML_TYPE_Q1_0_G128; break; case GGML_FTYPE_MOSTLY_Q2_K: wtype = GGML_TYPE_Q2_K; break; case GGML_FTYPE_MOSTLY_Q3_K: wtype = GGML_TYPE_Q3_K; break; case GGML_FTYPE_MOSTLY_Q4_K: wtype = GGML_TYPE_Q4_K; break; @@ -7673,6 +7691,8 @@ size_t ggml_quantize_chunk( case GGML_TYPE_TQ1_0: result = quantize_tq1_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_TQ2_0: result = quantize_tq2_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_TQ3_0: result = quantize_tq3_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_Q1_0: result = quantize_q1_0 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_Q1_0_G128: result = quantize_q1_0_g128(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ2_XXS: result = quantize_iq2_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ2_XS: 
result = quantize_iq2_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ3_XXS: result = quantize_iq3_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp index ab3cc9748..cd9a50909 100644 --- a/ggml/src/gguf.cpp +++ b/ggml/src/gguf.cpp @@ -568,6 +568,24 @@ struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_para } // read the tensor info + // PrismML Q1_0 compatibility: detect if this GGUF uses PrismML type IDs + // PrismML encodes Q1_0 as type 40 and Q1_0_g128 as type 41, which clash + // with NVFP4 (40) and TQ3_0 (41). Detect via general.file_type KV: + // ftype 40 = MOSTLY_Q1_0, ftype 41 = MOSTLY_Q1_0_g128 (PrismML) + // ftype 26 = MOSTLY_NVFP4 (this fork) + bool remap_prismml = false; + { + const int64_t ft_idx = gguf_find_key(ctx, "general.file_type"); + if (ft_idx >= 0) { + const uint32_t ftype = gguf_get_val_u32(ctx, ft_idx); + // PrismML uses ftype values 40 or 41 for Q1_0 models + if (ftype == 40 || ftype == 41) { + GGML_LOG_INFO("%s: detected PrismML Q1_0 file (file_type=%u), remapping type IDs\n", __func__, ftype); + remap_prismml = true; + } + } + } + for (int64_t i = 0; ok && i < n_tensors; ++i) { struct gguf_tensor_info info; @@ -648,6 +666,15 @@ struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_para { ok = ok && gr.read(info.t.type); + // PrismML Q1_0 compatibility remap + if (remap_prismml) { + if (info.t.type == (enum ggml_type)40) { + info.t.type = GGML_TYPE_Q1_0; + } else if (info.t.type == (enum ggml_type)41) { + info.t.type = GGML_TYPE_Q1_0_G128; + } + } + // check that tensor type is within defined range if (info.t.type < 0 || info.t.type >= GGML_TYPE_COUNT) { GGML_LOG_ERROR("%s: tensor '%s' has invalid ggml type %d. 
should be in [0, %d)\n", diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 9383644ab..0c7b115ce 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -3820,6 +3820,9 @@ class GGMLQuantizationType(IntEnum): TQ2_0 = 35 MXFP4 = 39 NVFP4 = 40 + TQ3_0 = 41 + Q1_0 = 42 + Q1_0_G128 = 43 class ExpertGatingFuncType(IntEnum): @@ -3873,6 +3876,8 @@ class LlamaFileType(IntEnum): MOSTLY_TQ2_0 = 37 # except 1d tensors MOSTLY_MXFP4_MOE = 38 # except 1d tensors MOSTLY_NVFP4 = 39 # except 1d tensors + MOSTLY_Q1_0 = 40 # except 1d tensors (PrismML 1-bit) + MOSTLY_Q1_0_G128 = 41 # except 1d tensors (PrismML 1-bit g128) GUESSED = 1024 # not specified in the model file @@ -3981,6 +3986,9 @@ class VisionProjectorType: GGMLQuantizationType.TQ2_0: (256, 2 + 64), GGMLQuantizationType.MXFP4: (32, 1 + 16), GGMLQuantizationType.NVFP4: (64, 4 + 32), + GGMLQuantizationType.TQ3_0: (32, 8 + 4 + 2), + GGMLQuantizationType.Q1_0: (32, 2 + 4), + GGMLQuantizationType.Q1_0_G128: (128, 2 + 16), }