Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
ac49ac6
refactor GPU compact tower witness flow
hero78119 Apr 25, 2026
84a2631
Fix compact tower memory accounting
hero78119 Apr 26, 2026
12453f6
Optimize compact logup ones allocation
hero78119 Apr 26, 2026
7d60f01
update dep
hero78119 Apr 26, 2026
925de92
Merge branch 'master' into feat/prover_mle_zero_padding
hero78119 Apr 26, 2026
e9fbe9c
fix main mem estimation
hero78119 Apr 26, 2026
46e87bb
Merge branch 'master' of github.com:scroll-tech/ceno into feat/prover…
hero78119 Apr 26, 2026
b888fbb
Merge branch 'feat/prover_mle_zero_padding' of github.com:scroll-tech…
hero78119 Apr 26, 2026
5ecce04
fix mem estimator
hero78119 Apr 26, 2026
be14006
snapshot compact tower estimator state
hero78119 Apr 26, 2026
df88dec
rollback Cargo.toml, Cargo.lock change
hero78119 Apr 27, 2026
b57b692
fix memory estimation
hero78119 Apr 27, 2026
c50b793
verifier log
hero78119 Apr 27, 2026
89b8698
Pass tower input by value for GPU proving
hero78119 Apr 27, 2026
f210e1f
split tower layer by view
hero78119 Apr 27, 2026
99b7a94
Use dense tower build for compact GPU input
hero78119 Apr 27, 2026
f0d81b6
Pass logup shape to tower prove estimator
hero78119 Apr 27, 2026
917810c
Deduplicate borrowed tower input booking
hero78119 Apr 27, 2026
4fc8dae
fix logging
hero78119 Apr 27, 2026
ef9fa30
Check scheduler memory estimate in mem tracking
hero78119 Apr 27, 2026
011a898
Refine replay tower proof memory estimate
hero78119 Apr 27, 2026
f3ca1cf
clippy fix
hero78119 Apr 27, 2026
147f567
add missing synchronization, avoid race condition
hero78119 Apr 28, 2026
94fc7bf
Account ShardRam tower prove allocator overhead
hero78119 Apr 28, 2026
c9401d1
misc: clippy fix
hero78119 Apr 28, 2026
d14e66a
Fix GPU proof memory estimation
hero78119 Apr 28, 2026
ceced51
Fix GPU proof estimate row basis
hero78119 Apr 28, 2026
d1ab71a
Tune ShardRam tower proof estimate
hero78119 Apr 28, 2026
7c6e97c
Batch main constraints into single sumcheck
hero78119 Apr 29, 2026
505e258
Restore replay backing before batched main
hero78119 Apr 29, 2026
b2fba0f
Replay witness backing incrementally during PCS opening
hero78119 Apr 29, 2026
25d7f42
wip more log
hero78119 Apr 29, 2026
2128bf9
Improve GPU proof failure diagnostics
hero78119 Apr 29, 2026
d5513de
Compact ShardRAM main witness extraction
hero78119 Apr 29, 2026
2df2590
Log batched main MLE histograms
hero78119 Apr 29, 2026
b67c6b7
Fix batched main GPU verification
hero78119 Apr 30, 2026
a4d066f
Use legacy layout for batched main GPU sumcheck
hero78119 Apr 30, 2026
29ae6df
update gkr dependency
hero78119 Apr 30, 2026
7d1a9de
Merge branch 'master' of github.com:scroll-tech/ceno into feat/prover…
hero78119 Apr 30, 2026
7ebc0cf
Merge branch 'feat/prover_mle_zero_padding' into feat/batch_main_sumc…
hero78119 Apr 30, 2026
fb96061
perf(gpu): trim batched main sumcheck work
hero78119 May 1, 2026
be5a6f5
perf(gpu): use direct layout for batched main
hero78119 May 1, 2026
27ed865
Experiment staggered batched main sumcheck prover
hero78119 May 3, 2026
268025b
Use dedicated batched main sumcheck prover
hero78119 May 4, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 11 additions & 11 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

22 changes: 11 additions & 11 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,16 +27,16 @@ version = "0.1.0"
ceno_crypto_primitives = { git = "https://github.com/scroll-tech/ceno-patch.git", package = "ceno_crypto_primitives", branch = "main" }
ceno_syscall = { git = "https://github.com/scroll-tech/ceno-patch.git", package = "ceno_syscall", branch = "main" }

ff_ext = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "ff_ext", tag = "v1.0.0-alpha.24" }
mpcs = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "mpcs", tag = "v1.0.0-alpha.24" }
multilinear_extensions = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "multilinear_extensions", tag = "v1.0.0-alpha.24" }
p3 = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "p3", tag = "v1.0.0-alpha.24" }
poseidon = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "poseidon", tag = "v1.0.0-alpha.24" }
sp1-curves = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "sp1-curves", tag = "v1.0.0-alpha.24" }
sumcheck = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "sumcheck", tag = "v1.0.0-alpha.24" }
transcript = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "transcript", tag = "v1.0.0-alpha.24" }
whir = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "whir", tag = "v1.0.0-alpha.24" }
witness = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "witness", tag = "v1.0.0-alpha.24" }
ff_ext = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "ff_ext", tag = "v1.0.0-alpha.25" }
mpcs = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "mpcs", tag = "v1.0.0-alpha.25" }
multilinear_extensions = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "multilinear_extensions", tag = "v1.0.0-alpha.25" }
p3 = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "p3", tag = "v1.0.0-alpha.25" }
poseidon = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "poseidon", tag = "v1.0.0-alpha.25" }
sp1-curves = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "sp1-curves", tag = "v1.0.0-alpha.25" }
sumcheck = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "sumcheck", tag = "v1.0.0-alpha.25" }
transcript = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "transcript", tag = "v1.0.0-alpha.25" }
whir = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "whir", tag = "v1.0.0-alpha.25" }
witness = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "witness", tag = "v1.0.0-alpha.25" }

anyhow = { version = "1.0", default-features = false }
bincode = "1"
Expand Down Expand Up @@ -129,7 +129,7 @@ lto = "thin"

#[patch."https://github.com/scroll-tech/ceno-gpu-mock.git"]
#ceno_gpu = { path = "../ceno-gpu/cuda_hal", package = "cuda_hal", default-features = false, features = ["bb31"] }

#
#[patch."https://github.com/scroll-tech/gkr-backend"]
#ff_ext = { path = "../gkr-backend/crates/ff_ext", package = "ff_ext" }
#mpcs = { path = "../gkr-backend/crates/mpcs", package = "mpcs" }
Expand Down
16 changes: 4 additions & 12 deletions ceno_zkvm/src/bin/e2e.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,7 @@ use ceno_zkvm::print_allocated_bytes;
use ceno_zkvm::{
e2e::{
Checkpoint, FieldType, MultiProver, PcsKind, Preset, public_io_words_to_digest_words,
run_e2e_full_trace_verify, run_e2e_single_shard_debug_verify, run_e2e_with_checkpoint,
setup_platform, setup_platform_debug,
run_e2e_with_checkpoint, setup_platform, setup_platform_debug,
},
scheme::{
ZKVMProof, constants::MAX_NUM_VARIABLES, create_backend, create_prover, hal::ProverDevice,
Expand Down Expand Up @@ -352,17 +351,10 @@ fn run_inner<
fs::write(&vk_file, vk_bytes).unwrap();

if checkpoint > Checkpoint::PrepVerify {
// `run_e2e_with_checkpoint` already performs the real verification for the
// complete flow. Re-running it here without the emulation exit code causes
// a false "Unfinished execution" error to be logged.
let verifier = ZKVMVerifier::new(vk);
if target_shard_id.is_some() {
run_e2e_single_shard_debug_verify(
&verifier,
zkvm_proofs.first().cloned().expect("missing shard proof"),
None,
max_steps,
);
} else {
run_e2e_full_trace_verify(&verifier, zkvm_proofs.clone(), None, max_steps);
}
soundness_test(zkvm_proofs.first().cloned().unwrap(), &verifier);
}
}
Expand Down
31 changes: 25 additions & 6 deletions ceno_zkvm/src/e2e.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ use serde::Serialize;
use std::collections::{HashMap, HashSet};
use std::{
collections::{BTreeMap, BTreeSet},
io::Write,
marker::PhantomData,
ops::Range,
sync::Arc,
Expand Down Expand Up @@ -2193,9 +2194,18 @@ fn create_proofs_streaming<

let transcript = Transcript::new(b"riscv");
let start = std::time::Instant::now();
let zkvm_proof = prover
.create_proof(&shard_ctx, zkvm_witness, pi, transcript)
.expect("create_proof failed");
let zkvm_proof =
match prover.create_proof(&shard_ctx, zkvm_witness, pi, transcript) {
Ok(proof) => proof,
Err(err) => {
eprintln!(
"create_proof failed for shard {}: {err:?}",
shard_ctx.shard_id
);
let _ = std::io::stderr().flush();
std::process::exit(1);
}
};
tracing::debug!(
"{}th shard proof created in {:?}",
shard_ctx.shard_id,
Expand Down Expand Up @@ -2254,9 +2264,18 @@ fn create_proofs_streaming<

let transcript = Transcript::new(b"riscv");
let start = std::time::Instant::now();
let zkvm_proof = prover
.create_proof(&shard_ctx, zkvm_witness, pi, transcript)
.expect("create_proof failed");
let zkvm_proof =
match prover.create_proof(&shard_ctx, zkvm_witness, pi, transcript) {
Ok(proof) => proof,
Err(err) => {
eprintln!(
"create_proof failed for shard {}: {err:?}",
shard_ctx.shard_id
);
let _ = std::io::stderr().flush();
std::process::exit(1);
}
};
tracing::debug!(
"{}th shard proof created in {:?}",
shard_ctx.shard_id,
Expand Down
24 changes: 12 additions & 12 deletions ceno_zkvm/src/instructions/gpu/chips/keccak.rs
Original file line number Diff line number Diff line change
Expand Up @@ -348,8 +348,7 @@ fn replay_keccak_witness_only_from_packed<E: ExtensionField>(
) -> Result<RowMajorMatrix<E::BaseField>, ZKVMError> {
use crate::precompiles::KECCAK_ROUNDS_CEIL_LOG2;

let num_padded_instances = num_instances.next_power_of_two().max(2);
let num_padded_rows = num_padded_instances * 32;
let num_rows = num_instances * 32;
let rotation = KECCAK_ROUNDS_CEIL_LOG2;

let col_map = info_span!("col_map").in_scope(|| extract_keccak_column_map(config, num_witin));
Expand All @@ -358,7 +357,7 @@ fn replay_keccak_witness_only_from_packed<E: ExtensionField>(
.witgen_keccak(
&col_map,
packed_instances,
num_padded_rows,
num_rows,
shard_offset,
fetch_base_pc,
fetch_num_slots,
Expand All @@ -372,17 +371,18 @@ fn replay_keccak_witness_only_from_packed<E: ExtensionField>(
let raw_witin = if crate::instructions::gpu::config::is_debug_compare_enabled()
|| !should_materialize_witness_on_gpu()
{
info_span!("transpose_d2h", rows = num_padded_rows, cols = num_witin).in_scope(|| {
let produced_rows = gpu_result.witness.num_rows;
info_span!("transpose_d2h", rows = produced_rows, cols = num_witin).in_scope(|| {
let mut rmm_buffer = hal
.alloc_elems_on_device(num_padded_rows * num_witin, false, None)
.alloc_elems_on_device(produced_rows * num_witin, false, None)
.map_err(|e| {
ZKVMError::InvalidWitness(format!("GPU alloc for transpose failed: {e}").into())
})?;
matrix_transpose::<CudaHalBB31, ff_ext::BabyBearExt4, _>(
&hal.inner,
&mut rmm_buffer,
&gpu_result.witness.device_buffer,
num_padded_rows,
produced_rows,
num_witin,
)
.map_err(|e| ZKVMError::InvalidWitness(format!("GPU transpose failed: {e}").into()))?;
Expand Down Expand Up @@ -445,8 +445,7 @@ fn gpu_assign_keccak_inner<E: ExtensionField>(
use crate::precompiles::KECCAK_ROUNDS_CEIL_LOG2;

let num_instances = step_indices.len();
let num_padded_instances = num_instances.next_power_of_two().max(2);
let num_padded_rows = num_padded_instances * 32; // 2^5 = 32 rows per instance
let num_rows = num_instances * 32; // 2^5 = 32 rows per instance
let rotation = KECCAK_ROUNDS_CEIL_LOG2; // = 5
let materialize_initial_witness = crate::instructions::gpu::config::is_debug_compare_enabled()
|| should_materialize_witness_on_initial_assign();
Expand Down Expand Up @@ -479,7 +478,7 @@ fn gpu_assign_keccak_inner<E: ExtensionField>(
.witgen_keccak(
&col_map,
&packed_instances,
num_padded_rows,
num_rows,
shard_ctx.current_shard_offset_cycle(),
fetch_base_pc,
fetch_num_slots,
Expand Down Expand Up @@ -565,17 +564,18 @@ fn gpu_assign_keccak_inner<E: ExtensionField>(
} else if crate::instructions::gpu::config::is_debug_compare_enabled()
|| !should_materialize_witness_on_gpu()
{
info_span!("transpose_d2h", rows = num_padded_rows, cols = num_witin).in_scope(|| {
let produced_rows = gpu_result.witness.num_rows;
info_span!("transpose_d2h", rows = produced_rows, cols = num_witin).in_scope(|| {
let mut rmm_buffer = hal
.alloc_elems_on_device(num_padded_rows * num_witin, false, None)
.alloc_elems_on_device(produced_rows * num_witin, false, None)
.map_err(|e| {
ZKVMError::InvalidWitness(format!("GPU alloc for transpose failed: {e}").into())
})?;
matrix_transpose::<CudaHalBB31, ff_ext::BabyBearExt4, _>(
&hal.inner,
&mut rmm_buffer,
&gpu_result.witness.device_buffer,
num_padded_rows,
produced_rows,
num_witin,
)
.map_err(|e| ZKVMError::InvalidWitness(format!("GPU transpose failed: {e}").into()))?;
Expand Down
8 changes: 4 additions & 4 deletions ceno_zkvm/src/instructions/gpu/chips/shard_ram.rs
Original file line number Diff line number Diff line change
Expand Up @@ -439,11 +439,11 @@ pub(crate) fn try_gpu_assign_shard_ram<E: ExtensionField>(
{
let struct_data = tracing::info_span!(
"gpu_shard_ram_structural_transpose_d2h",
num_rows_padded,
rows = gpu_structural.num_rows,
num_structural_witin,
)
.in_scope(|| -> Result<_, ZKVMError> {
let wit_num_rows = num_rows_padded;
let wit_num_rows = gpu_structural.num_rows;
let struct_num_cols = num_structural_witin;
let mut struct_rmm_buf = hal
.witgen
Expand Down Expand Up @@ -684,11 +684,11 @@ pub(crate) fn try_gpu_assign_shard_ram_from_device<E: ExtensionField>(
{
let struct_data = tracing::info_span!(
"gpu_shard_ram_structural_transpose_d2h_from_device",
num_rows_padded,
rows = gpu_structural.num_rows,
num_structural_witin,
)
.in_scope(|| -> Result<_, ZKVMError> {
let wit_num_rows = num_rows_padded;
let wit_num_rows = gpu_structural.num_rows;
let struct_num_cols = num_structural_witin;
let mut struct_rmm_buf = hal
.witgen
Expand Down
5 changes: 3 additions & 2 deletions ceno_zkvm/src/instructions/gpu/dispatch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@ pub(crate) fn try_gpu_assign_instances<E: ExtensionField, I: Instruction<E>>(
let total_instances = step_indices.len();
if total_instances == 0 {
// Empty: just return empty matrices
let num_witin = num_witin.max(1);
let num_structural_witin = num_structural_witin.max(1);
let raw_witin = RowMajorMatrix::<E::BaseField>::new(0, num_witin, I::padding_strategy());
let raw_structural =
Expand Down Expand Up @@ -481,7 +482,7 @@ fn gpu_assign_instances_inner<E: ExtensionField, I: Instruction<E>>(
total_instances,
num_witin,
I::padding_strategy(),
)
)?
};
if materialize_initial_witness {
raw_witin.padding_by_strategy();
Expand Down Expand Up @@ -1484,7 +1485,7 @@ fn replay_gpu_witness_from_resident_raw<E: ExtensionField, I: Instruction<E>>(
total_instances,
replay.num_witin,
I::padding_strategy(),
);
)?;

// Keep replayed witness immutable after attaching the col-major device backing.
// Mutating/padding a RowMajorMatrix clears device metadata, but replay consumers
Expand Down
4 changes: 2 additions & 2 deletions ceno_zkvm/src/instructions/gpu/utils/d2h.rs
Original file line number Diff line number Diff line change
Expand Up @@ -303,9 +303,9 @@ pub(crate) fn gpu_witness_to_rmm<E: ExtensionField>(
num_rows: usize,
num_cols: usize,
padding: InstancePaddingStrategy,
) -> RowMajorMatrix<E::BaseField> {
) -> Result<RowMajorMatrix<E::BaseField>, ZKVMError> {
let mut rmm = RowMajorMatrix::<E::BaseField>::new(num_rows, num_cols, padding);
// Keep the original col-major witness buffer as the source of truth for GPU commit.
rmm.set_device_backing(gpu_result.device_buffer, DeviceMatrixLayout::ColMajor);
rmm
Ok(rmm)
}
Loading
Loading