From 542476682b1124971719e2f4b1cb10972b387e99 Mon Sep 17 00:00:00 2001
From: sentseven <sentsven@gmail.com>
Date: Wed, 1 Apr 2026 11:34:21 -0500
Subject: [PATCH] feat: add PrismML Q1_0/Q1_0_G128 1-bit ternary quantization
 support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Port PrismML's custom Q1_0 (32-element blocks) and Q1_0_G128 (128-element
blocks) 1-bit ternary quantization types into the turboquant fork. This
enables loading and running Bonsai 1-bit models (1.7B, 4B, 8B) which use
these types.

Changes:
- ggml.h: Add GGML_TYPE_Q1_0=42, GGML_TYPE_Q1_0_G128=43, bump COUNT to 44
  Add GGML_FTYPE_MOSTLY_Q1_0=27, GGML_FTYPE_MOSTLY_Q1_0_G128=28
- ggml-common.h: Add block_q1_0 (6 bytes) and block_q1_0_g128 (18 bytes)
  struct definitions with static_asserts
- ggml-quants.h/c: Add quantize_row, dequantize_row, and quantize
  functions for both Q1_0 and Q1_0_G128
- ggml.c: Add type_traits entries, ftype switch cases, quantize dispatch
- ggml-cpu/ggml-cpu.c: Register CPU type traits with vec_dot support
- ggml-cpu/quants.h/c: Add vec_dot implementations (Q1_0 x Q8_0 and
  Q1_0_G128 x Q8_0) and quantize_row wrappers
- ggml-cpu/ops.cpp: Add Q1_0/Q1_0_G128 to the quantized-type switch
  statements in add, add1, acc, out_prod, set, get_rows, and clamp
- gguf.cpp: Add PrismML compatibility remap — when general.file_type
  indicates a PrismML model (ftype 40 or 41), remap tensor type IDs
  40->42 (Q1_0) and 41->43 (Q1_0_G128) to avoid collision with
  NVFP4 (40) and TQ3_0 (41)
- gguf-py/gguf/constants.py: Add Python enum entries and block size mappings
  for Q1_0/Q1_0_G128 (and the previously missing TQ3_0 entries)

Type ID conflict resolution:
  PrismML uses type 40=Q1_0, 41=Q1_0_g128
  Turboquant uses type 40=NVFP4, 41=TQ3_0
  This port assigns Q1_0=42, Q1_0_G128=43 and remaps at GGUF load time

Algorithm: 1-bit sign quantization (two levels per block, so not ternary
in the strict sense): each value is encoded as a single sign bit
(1=+scale, 0=-scale), with scale = mean(abs(block_values)).

Tested: Bonsai-4B and Bonsai-8B GGUFs load successfully and produce
coherent output on CPU inference (0.7 t/s for 4B, 0.3 t/s for 8B).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 ggml/include/ggml.h          |  6 ++-
 ggml/src/ggml-common.h       | 18 ++++++++
 ggml/src/ggml-cpu/ggml-cpu.c | 12 +++++
 ggml/src/ggml-cpu/ops.cpp    | 14 ++++++
 ggml/src/ggml-cpu/quants.c   | 77 ++++++++++++++++++++++++++++++++
 ggml/src/ggml-cpu/quants.h   |  6 +++
 ggml/src/ggml-quants.c       | 86 ++++++++++++++++++++++++++++++++++++
 ggml/src/ggml-quants.h       |  9 ++++
 ggml/src/ggml.c              | 20 +++++++++
 ggml/src/gguf.cpp            | 27 +++++++++++
 gguf-py/gguf/constants.py    |  8 ++++
 11 files changed, 282 insertions(+), 1 deletion(-)

diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 509ac1eb8..f0475320e 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -429,7 +429,9 @@ extern "C" {
         GGML_TYPE_MXFP4   = 39, // MXFP4 (1 block)
         GGML_TYPE_NVFP4   = 40, // NVFP4 (4 blocks, E4M3 scale)
         GGML_TYPE_TQ3_0   = 41, // TurboQuant 3-bit polar + QJL (no per-block scale)
-        GGML_TYPE_COUNT   = 42,
+        GGML_TYPE_Q1_0      = 42, // PrismML 1-bit ternary (32-element blocks)
+        GGML_TYPE_Q1_0_G128 = 43, // PrismML 1-bit ternary (128-element blocks)
+        GGML_TYPE_COUNT   = 44,
     };
 
     // precision
@@ -466,6 +468,8 @@ extern "C" {
         GGML_FTYPE_MOSTLY_BF16    = 24, // except 1d tensors
         GGML_FTYPE_MOSTLY_MXFP4   = 25, // except 1d tensors
         GGML_FTYPE_MOSTLY_NVFP4   = 26, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q1_0      = 27, // except 1d tensors (PrismML 1-bit)
+        GGML_FTYPE_MOSTLY_Q1_0_G128 = 28, // except 1d tensors (PrismML 1-bit g128)
     };
 
     // available tensor operations:
diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h
index f8433333e..5ce668558 100644
--- a/ggml/src/ggml-common.h
+++ b/ggml/src/ggml-common.h
@@ -281,6 +281,24 @@ typedef struct {
 } block_tq3_0;
 static_assert(sizeof(block_tq3_0) == QK_TQ3_0/4 + QK_TQ3_0/8 + sizeof(ggml_half), "wrong tq3_0 block size/padding");
 
+// PrismML Q1_0: 1-bit sign quantization (32-element blocks); two levels per block, not ternary
+// Each value is stored as one sign bit: bit=1 → +d, bit=0 → −d (element j is bit j%8 of qs[j/8])
+// d = mean(abs(values)) over the block, stored as fp16
+#define QK1_0 32
+typedef struct {
+    ggml_half d;         // scale (mean absolute value of the block)
+    uint8_t   qs[QK1_0 / 8]; // sign bits, LSB-first within each byte: 32 × 1 bit = 4 bytes
+} block_q1_0;
+static_assert(sizeof(block_q1_0) == sizeof(ggml_half) + QK1_0/8, "wrong q1_0 block size/padding");
+
+// PrismML Q1_0_G128: 1-bit sign quantization with one fp16 scale per 128 elements (two levels, not ternary)
+#define QK1_0_G128 128
+typedef struct {
+    ggml_half d;                   // scale (mean absolute value of the block)
+    uint8_t   qs[QK1_0_G128 / 8]; // sign bits, LSB-first within each byte: 128 × 1 bit = 16 bytes
+} block_q1_0_g128;
+static_assert(sizeof(block_q1_0_g128) == sizeof(ggml_half) + QK1_0_G128/8, "wrong q1_0_g128 block size/padding");
+
 //
 // Super-block quantization structures
 //
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index e039c43a2..2dc7cc8a0 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -394,6 +394,18 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
         .from_float               = quantize_row_tq3_0,
         .nrows                    = 1,
     },
+    [GGML_TYPE_Q1_0] = {
+        .from_float               = quantize_row_q1_0,
+        .vec_dot                  = ggml_vec_dot_q1_0_q8_0,
+        .vec_dot_type             = GGML_TYPE_Q8_0,
+        .nrows                    = 1,
+    },
+    [GGML_TYPE_Q1_0_G128] = {
+        .from_float               = quantize_row_q1_0_g128,
+        .vec_dot                  = ggml_vec_dot_q1_0_g128_q8_0,
+        .vec_dot_type             = GGML_TYPE_Q8_0,
+        .nrows                    = 1,
+    },
     [GGML_TYPE_I32] = {
         .from_float               = (ggml_from_float_t) ggml_cpu_fp32_to_i32,
     },
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index 9b8de3eea..10a9dbde7 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -679,6 +679,8 @@ void ggml_compute_forward_add(
         case GGML_TYPE_TQ1_0:
         case GGML_TYPE_TQ2_0:
         case GGML_TYPE_TQ3_0:
+        case GGML_TYPE_Q1_0:
+        case GGML_TYPE_Q1_0_G128:
         case GGML_TYPE_IQ2_XXS:
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
@@ -1130,6 +1132,8 @@ void ggml_compute_forward_add1(
         case GGML_TYPE_TQ1_0:
         case GGML_TYPE_TQ2_0:
         case GGML_TYPE_TQ3_0:
+        case GGML_TYPE_Q1_0:
+        case GGML_TYPE_Q1_0_G128:
         case GGML_TYPE_IQ2_XXS:
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
@@ -1260,6 +1264,8 @@ void ggml_compute_forward_acc(
         case GGML_TYPE_TQ1_0:
         case GGML_TYPE_TQ2_0:
         case GGML_TYPE_TQ3_0:
+        case GGML_TYPE_Q1_0:
+        case GGML_TYPE_Q1_0_G128:
         case GGML_TYPE_IQ2_XXS:
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
@@ -4349,6 +4355,8 @@ void ggml_compute_forward_out_prod(
         case GGML_TYPE_TQ1_0:
         case GGML_TYPE_TQ2_0:
         case GGML_TYPE_TQ3_0:
+        case GGML_TYPE_Q1_0:
+        case GGML_TYPE_Q1_0_G128:
         case GGML_TYPE_IQ2_XXS:
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
@@ -4626,6 +4634,8 @@ void ggml_compute_forward_set(
         case GGML_TYPE_TQ1_0:
         case GGML_TYPE_TQ2_0:
         case GGML_TYPE_TQ3_0:
+        case GGML_TYPE_Q1_0:
+        case GGML_TYPE_Q1_0_G128:
         case GGML_TYPE_IQ2_XXS:
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
@@ -4850,6 +4860,8 @@ void ggml_compute_forward_get_rows(
         case GGML_TYPE_TQ1_0:
         case GGML_TYPE_TQ2_0:
         case GGML_TYPE_TQ3_0:
+        case GGML_TYPE_Q1_0:
+        case GGML_TYPE_Q1_0_G128:
         case GGML_TYPE_IQ2_XXS:
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
@@ -5576,6 +5588,8 @@ void ggml_compute_forward_clamp(
         case GGML_TYPE_TQ1_0:
         case GGML_TYPE_TQ2_0:
         case GGML_TYPE_TQ3_0:
+        case GGML_TYPE_Q1_0:
+        case GGML_TYPE_Q1_0_G128:
         case GGML_TYPE_IQ2_XXS:
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
diff --git a/ggml/src/ggml-cpu/quants.c b/ggml/src/ggml-cpu/quants.c
index ccb3c6cbd..06134f032 100644
--- a/ggml/src/ggml-cpu/quants.c
+++ b/ggml/src/ggml-cpu/quants.c
@@ -114,6 +114,83 @@ void quantize_row_tq3_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy,
     quantize_row_tq3_0_ref(x, y, k);
 }
 
+// CPU-backend wrapper: forwards to the reference Q1_0 quantizer (no arch-specific path here).
+void quantize_row_q1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
+    assert(k % QK1_0 == 0);
+    quantize_row_q1_0_ref(x, (block_q1_0 *) vy, k);
+}
+
+// CPU-backend wrapper: forwards to the reference Q1_0_G128 quantizer (no arch-specific path here).
+void quantize_row_q1_0_g128(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
+    assert(k % QK1_0_G128 == 0);
+    quantize_row_q1_0_g128_ref(x, (block_q1_0_g128 *) vy, k);
+}
+
+//===================================== Q1_0 vec_dot =================================
+
+// Dot product of n Q1_0 weights (vx) with n Q8_0 activations (vy); the result is written to *s.
+// Assumes QK8_0 == QK1_0: block i of vx is paired with block i of vy. Only nrc == 1 is handled.
+void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+    assert(n % QK8_0 == 0);
+    assert(nrc == 1);
+    UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs);
+
+    const block_q1_0 * GGML_RESTRICT wblk = vx;
+    const block_q8_0 * GGML_RESTRICT ablk = vy;
+    const int n_blocks = n / QK8_0;
+
+    float acc = 0.0f;
+    for (int ib = 0; ib < n_blocks; ib++) {
+        const uint8_t * signs = wblk[ib].qs;
+
+        // a set sign bit adds the Q8_0 quant, a clear bit subtracts it
+        int isum = 0;
+        for (int j = 0; j < QK1_0; j++) {
+            const int q = (int) ablk[ib].qs[j];
+            isum += ((signs[j >> 3] >> (j & 7)) & 1) ? q : -q;
+        }
+
+        // scale once per block: (weight scale * activation scale) * integer sum
+        acc += GGML_FP16_TO_FP32(wblk[ib].d) * GGML_FP16_TO_FP32(ablk[ib].d) * (float) isum;
+    }
+
+    *s = acc;
+}
+
+// Dot product of n Q1_0_G128 weights (vx) with n Q8_0 activations (vy); the result is written to *s.
+// Each 128-element weight block is matched against 4 consecutive Q8_0 blocks (assumes QK8_0 == 32).
+void ggml_vec_dot_q1_0_g128_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+    assert(n % QK1_0_G128 == 0);
+    assert(nrc == 1);
+    UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs);
+
+    const block_q1_0_g128 * GGML_RESTRICT wblk = vx;
+    const block_q8_0      * GGML_RESTRICT ablk = vy;
+    const int n_blocks = n / QK1_0_G128;
+
+    float acc = 0.0f;
+    for (int ib = 0; ib < n_blocks; ib++) {
+        const float d_w = GGML_FP16_TO_FP32(wblk[ib].d);
+        const uint8_t * signs = wblk[ib].qs;
+
+        for (int sub = 0; sub < 4; sub++) {
+            const block_q8_0 * a = ablk + ib * 4 + sub;
+            const float d_a = GGML_FP16_TO_FP32(a->d);
+            // sign bits for this sub-block start at bit sub * QK8_0 of the weight block
+            int isum = 0;
+            for (int j = 0; j < QK8_0; j++) {
+                const int bit = sub * QK8_0 + j;
+                const int q   = (int) a->qs[j];
+                isum += ((signs[bit >> 3] >> (bit & 7)) & 1) ? q : -q;
+            }
+
+            acc += d_w * d_a * (float) isum;
+        }
+    }
+
+    *s = acc;
+}
+
 //===================================== Q8_K ==============================================
 
 void quantize_row_q8_K_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
diff --git a/ggml/src/ggml-cpu/quants.h b/ggml/src/ggml-cpu/quants.h
index f7060ebdd..1f76f40a5 100644
--- a/ggml/src/ggml-cpu/quants.h
+++ b/ggml/src/ggml-cpu/quants.h
@@ -33,6 +33,9 @@ void quantize_row_tq1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, i
 void quantize_row_tq2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 void quantize_row_tq3_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 
+void quantize_row_q1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q1_0_g128(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+
 void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 
@@ -55,6 +58,9 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
+void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q1_0_g128_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+
 void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 void ggml_vec_dot_iq2_s_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 892306037..89af6037a 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -2496,6 +2496,92 @@ size_t quantize_tq3_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
     return nrow * row_size;
 }
 
+// ====================== PrismML Q1_0 1-bit ternary quantization ======================
+
+// Reference Q1_0 quantizer: for each 32-value block, store d = mean(|x|) as fp16 plus one sign bit
+// per value (bit j%8 of qs[j/8] is set when x[j] >= 0.0f, so zero maps to +d and NaN maps to -d).
+void quantize_row_q1_0_ref(const float * GGML_RESTRICT x, block_q1_0 * GGML_RESTRICT y, int64_t k) {
+    assert(k % QK1_0 == 0);
+    const int64_t n_blocks = k / QK1_0;
+
+    for (int64_t ib = 0; ib < n_blocks; ib++) {
+        const float * src = x + ib * QK1_0;
+        block_q1_0  * blk = y + ib;
+        float abs_sum = 0.0f;
+        memset(blk->qs, 0, sizeof(blk->qs));
+        for (int j = 0; j < QK1_0; j++) {
+            const float v = src[j];
+            abs_sum += fabsf(v);
+            blk->qs[j / 8] |= (uint8_t) ((v >= 0.0f) << (j % 8));
+        }
+        const float d = abs_sum / QK1_0;
+        blk->d = GGML_FP32_TO_FP16(d);
+    }
+}
+
+// Expands Q1_0 blocks back to floats: element j becomes +d when its sign bit is set, else -d.
+void dequantize_row_q1_0(const block_q1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
+    assert(k % QK1_0 == 0);
+    const int64_t n_blocks = k / QK1_0;
+    for (int64_t ib = 0; ib < n_blocks; ib++) {
+        const float d   = GGML_FP16_TO_FP32(x[ib].d);
+        float     * out = y + ib * QK1_0;
+        for (int j = 0; j < QK1_0; j++) {
+            out[j] = ((x[ib].qs[j / 8] >> (j % 8)) & 1) ? d : -d;
+        }
+    }
+}
+
+size_t quantize_q1_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+    (void) quant_weights; // importance weights are not used by this format
+    const size_t bytes_per_row = ggml_row_size(GGML_TYPE_Q1_0, n_per_row);
+    quantize_row_q1_0_ref(src, (block_q1_0 *) dst, nrow * n_per_row); // all rows handled as one contiguous run
+    return nrow * bytes_per_row; // total bytes written to dst
+}
+
+// ====================== PrismML Q1_0_G128 1-bit ternary (128-element blocks) ======================
+
+// Reference Q1_0_G128 quantizer: for each 128-value block, store d = mean(|x|) as fp16 plus one sign
+// bit per value (bit j%8 of qs[j/8] is set when x[j] >= 0.0f, so zero maps to +d and NaN maps to -d).
+void quantize_row_q1_0_g128_ref(const float * GGML_RESTRICT x, block_q1_0_g128 * GGML_RESTRICT y, int64_t k) {
+    assert(k % QK1_0_G128 == 0);
+    const int64_t n_blocks = k / QK1_0_G128;
+
+    for (int64_t ib = 0; ib < n_blocks; ib++) {
+        const float     * src = x + ib * QK1_0_G128;
+        block_q1_0_g128 * blk = y + ib;
+        float abs_sum = 0.0f;
+        memset(blk->qs, 0, sizeof(blk->qs));
+        for (int j = 0; j < QK1_0_G128; j++) {
+            const float v = src[j];
+            abs_sum += fabsf(v);
+            blk->qs[j / 8] |= (uint8_t) ((v >= 0.0f) << (j % 8));
+        }
+        const float d = abs_sum / QK1_0_G128;
+        blk->d = GGML_FP32_TO_FP16(d);
+    }
+}
+
+// Expands Q1_0_G128 blocks back to floats: element j becomes +d when its sign bit is set, else -d.
+void dequantize_row_q1_0_g128(const block_q1_0_g128 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
+    assert(k % QK1_0_G128 == 0);
+    const int64_t n_blocks = k / QK1_0_G128;
+    for (int64_t ib = 0; ib < n_blocks; ib++) {
+        const float d   = GGML_FP16_TO_FP32(x[ib].d);
+        float     * out = y + ib * QK1_0_G128;
+        for (int j = 0; j < QK1_0_G128; j++) {
+            out[j] = ((x[ib].qs[j / 8] >> (j % 8)) & 1) ? d : -d;
+        }
+    }
+}
+
+size_t quantize_q1_0_g128(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+    (void) quant_weights; // importance weights are not used by this format
+    const size_t bytes_per_row = ggml_row_size(GGML_TYPE_Q1_0_G128, n_per_row);
+    quantize_row_q1_0_g128_ref(src, (block_q1_0_g128 *) dst, nrow * n_per_row); // all rows handled as one contiguous run
+    return nrow * bytes_per_row; // total bytes written to dst
+}
+
 // ====================== "True" 2-bit (de)-quantization
 
 void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h
index 59acb4132..ae0ef3bce 100644
--- a/ggml/src/ggml-quants.h
+++ b/ggml/src/ggml-quants.h
@@ -35,6 +35,9 @@ GGML_API void quantize_row_tq1_0_ref(const float * GGML_RESTRICT x, block_tq1_0
 GGML_API void quantize_row_tq2_0_ref(const float * GGML_RESTRICT x, block_tq2_0 * GGML_RESTRICT y, int64_t k);
 GGML_API void quantize_row_tq3_0_ref(const float * GGML_RESTRICT x, block_tq3_0 * GGML_RESTRICT y, int64_t k);
 
+GGML_API void quantize_row_q1_0_ref    (const float * GGML_RESTRICT x, block_q1_0     * GGML_RESTRICT y, int64_t k);
+GGML_API void quantize_row_q1_0_g128_ref(const float * GGML_RESTRICT x, block_q1_0_g128 * GGML_RESTRICT y, int64_t k);
+
 GGML_API void quantize_row_iq3_xxs_ref(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k);
 GGML_API void quantize_row_iq4_nl_ref (const float * GGML_RESTRICT x, block_iq4_nl  * GGML_RESTRICT y, int64_t k);
 GGML_API void quantize_row_iq4_xs_ref (const float * GGML_RESTRICT x, block_iq4_xs  * GGML_RESTRICT y, int64_t k);
@@ -63,6 +66,9 @@ GGML_API void dequantize_row_tq1_0(const block_tq1_0 * GGML_RESTRICT x, float *
 GGML_API void dequantize_row_tq2_0(const block_tq2_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 GGML_API void dequantize_row_tq3_0(const block_tq3_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 
+GGML_API void dequantize_row_q1_0    (const block_q1_0     * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+GGML_API void dequantize_row_q1_0_g128(const block_q1_0_g128 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+
 GGML_API void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 GGML_API void dequantize_row_iq2_xs (const block_iq2_xs  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 GGML_API void dequantize_row_iq2_s  (const block_iq2_s   * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
@@ -88,6 +94,9 @@ GGML_API size_t quantize_tq1_0(const float * GGML_RESTRICT src, void * GGML_REST
 GGML_API size_t quantize_tq2_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 GGML_API size_t quantize_tq3_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 
+GGML_API size_t quantize_q1_0    (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+GGML_API size_t quantize_q1_0_g128(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+
 GGML_API size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 GGML_API size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 GGML_API size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index cf45a17ff..3a9dca5a2 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -912,6 +912,22 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .to_float                 = (ggml_to_float_t) dequantize_row_tq3_0,
         .from_float_ref           = (ggml_from_float_t) quantize_row_tq3_0_ref,
     },
+    [GGML_TYPE_Q1_0] = {
+        .type_name                = "q1_0",
+        .blck_size                = QK1_0,
+        .type_size                = sizeof(block_q1_0),
+        .is_quantized             = true,
+        .to_float                 = (ggml_to_float_t) dequantize_row_q1_0,
+        .from_float_ref           = (ggml_from_float_t) quantize_row_q1_0_ref,
+    },
+    [GGML_TYPE_Q1_0_G128] = {
+        .type_name                = "q1_0_g128",
+        .blck_size                = QK1_0_G128,
+        .type_size                = sizeof(block_q1_0_g128),
+        .is_quantized             = true,
+        .to_float                 = (ggml_to_float_t) dequantize_row_q1_0_g128,
+        .from_float_ref           = (ggml_from_float_t) quantize_row_q1_0_g128_ref,
+    },
 };
 
 const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) {
@@ -1397,6 +1413,8 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
         case GGML_FTYPE_MOSTLY_Q8_0:          wtype = GGML_TYPE_Q8_0;  break;
         case GGML_FTYPE_MOSTLY_MXFP4:         wtype = GGML_TYPE_MXFP4; break;
         case GGML_FTYPE_MOSTLY_NVFP4:         wtype = GGML_TYPE_NVFP4; break;
+        case GGML_FTYPE_MOSTLY_Q1_0:          wtype = GGML_TYPE_Q1_0; break;
+        case GGML_FTYPE_MOSTLY_Q1_0_G128:     wtype = GGML_TYPE_Q1_0_G128; break;
         case GGML_FTYPE_MOSTLY_Q2_K:          wtype = GGML_TYPE_Q2_K;  break;
         case GGML_FTYPE_MOSTLY_Q3_K:          wtype = GGML_TYPE_Q3_K;  break;
         case GGML_FTYPE_MOSTLY_Q4_K:          wtype = GGML_TYPE_Q4_K;  break;
@@ -7673,6 +7691,8 @@ size_t ggml_quantize_chunk(
         case GGML_TYPE_TQ1_0:   result = quantize_tq1_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_TQ2_0:   result = quantize_tq2_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_TQ3_0:   result = quantize_tq3_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_Q1_0:      result = quantize_q1_0    (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_Q1_0_G128: result = quantize_q1_0_g128(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ2_XXS: result = quantize_iq2_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ2_XS:  result = quantize_iq2_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ3_XXS: result = quantize_iq3_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp
index ab3cc9748..cd9a50909 100644
--- a/ggml/src/gguf.cpp
+++ b/ggml/src/gguf.cpp
@@ -568,6 +568,24 @@ struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_para
     }
 
     // read the tensor info
+    // PrismML Q1_0 compatibility: detect whether this GGUF uses PrismML tensor type IDs.
+    // PrismML encodes Q1_0 as type 40 and Q1_0_g128 as type 41, which clash
+    // with NVFP4 (40) and TQ3_0 (41) here. Detection keys off the general.file_type KV:
+    //   40 = MOSTLY_Q1_0, 41 = MOSTLY_Q1_0_g128 (PrismML numbering)
+    // NOTE(review): confirm this fork assigns no other meaning to file_type 40/41.
+    bool remap_prismml = false;
+    {
+        const int64_t ft_idx = gguf_find_key(ctx, "general.file_type");
+        // gguf_get_val_u32 asserts on a KV that is not a u32 scalar, so check the type of this file-supplied field first
+        if (ft_idx >= 0 && gguf_get_kv_type(ctx, ft_idx) == GGUF_TYPE_UINT32) {
+            const uint32_t ftype = gguf_get_val_u32(ctx, ft_idx);
+            if (ftype == 40 || ftype == 41) {
+                GGML_LOG_INFO("%s: detected PrismML Q1_0 file (file_type=%u), remapping type IDs\n", __func__, ftype);
+                remap_prismml = true;
+            }
+        }
+    }
+
     for (int64_t i = 0; ok && i < n_tensors; ++i) {
         struct gguf_tensor_info info;
 
@@ -648,6 +666,15 @@ struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_para
         {
             ok = ok && gr.read(info.t.type);
 
+            // PrismML files store Q1_0 / Q1_0_g128 as raw type IDs 40 / 41; translate them to this fork's enum values
+            if (remap_prismml) {
+                switch ((int) info.t.type) {
+                    case 40: info.t.type = GGML_TYPE_Q1_0;      break;
+                    case 41: info.t.type = GGML_TYPE_Q1_0_G128; break;
+                    default: break;
+                }
+            }
+
             // check that tensor type is within defined range
             if (info.t.type < 0 || info.t.type >= GGML_TYPE_COUNT) {
                 GGML_LOG_ERROR("%s: tensor '%s' has invalid ggml type %d. should be in [0, %d)\n",
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 9383644ab..0c7b115ce 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -3820,6 +3820,9 @@ class GGMLQuantizationType(IntEnum):
     TQ2_0   = 35
     MXFP4   = 39
     NVFP4   = 40
+    TQ3_0   = 41
+    Q1_0      = 42
+    Q1_0_G128 = 43
 
 
 class ExpertGatingFuncType(IntEnum):
@@ -3873,6 +3876,8 @@ class LlamaFileType(IntEnum):
     MOSTLY_TQ2_0         = 37  # except 1d tensors
     MOSTLY_MXFP4_MOE     = 38  # except 1d tensors
     MOSTLY_NVFP4         = 39  # except 1d tensors
+    MOSTLY_Q1_0          = 40  # except 1d tensors (PrismML 1-bit)
+    MOSTLY_Q1_0_G128     = 41  # except 1d tensors (PrismML 1-bit g128)
 
     GUESSED              = 1024  # not specified in the model file
 
@@ -3981,6 +3986,9 @@ class VisionProjectorType:
     GGMLQuantizationType.TQ2_0:   (256, 2 + 64),
     GGMLQuantizationType.MXFP4:   (32, 1 + 16),
     GGMLQuantizationType.NVFP4:   (64, 4 + 32),
+    GGMLQuantizationType.TQ3_0:   (32, 8 + 4 + 2),
+    GGMLQuantizationType.Q1_0:      (32, 2 + 4),
+    GGMLQuantizationType.Q1_0_G128: (128, 2 + 16),
 }