diff --git a/bin/Index/IRGen.cpp b/bin/Index/IRGen.cpp index 0b85ce873..62a5f4393 100644 --- a/bin/Index/IRGen.cpp +++ b/bin/Index/IRGen.cpp @@ -299,7 +299,12 @@ std::optional IRGenerator::Generate( alloca_inst.object_index = obj_idx; alloca_inst.type_entity_id = TypeEntityIdOf(param.Type()); uint32_t alloca_idx = EmitTopLevel(std::move(alloca_inst)); - object_to_alloca_[obj_idx] = alloca_idx; + auto [it, inserted] = object_to_alloca_.try_emplace(obj_idx, alloca_idx); + CHECK(inserted) + << "param ALLOCA re-registered for obj_idx=" << obj_idx + << " (param entity " << EntityIdOf(param) + << "); first registration at inst " << it->second + << ", second at inst " << alloca_idx; } // Emit local variable ALLOCAs in the frame block. @@ -415,6 +420,7 @@ std::optional IRGenerator::Generate( // Verify block structure. VerifyBlocks(); + VerifyIR(); // Compute stack frame layout: assign offsets to non-dynamic objects. ComputeFrameLayout(); @@ -548,6 +554,7 @@ std::optional IRGenerator::GenerateGlobalInit( ComputeDominators(); ComputeRPO(); VerifyBlocks(); + VerifyIR(); ComputeFrameLayout(); DLOG(INFO) << "Generated global init IR for var entity " @@ -775,6 +782,21 @@ uint32_t IRGenerator::EmitLoadFromLValue(const pasta::Expr &e) { uint32_t addr_idx = EmitLValue(e); + // Hard trap at the codegen origin: if EmitLValue handed back a + // terminator's slot, this LOAD would emit `MEMORY/LOAD_* [%term]`. + // The substrate evaluates a terminator-as-value as 0, producing a + // load from address 0x0. The choke-point CHECK in EmitInstruction + // catches this too, but firing here gives a tighter stack focused + // on the lvalue site responsible. + if (addr_idx < func_.instructions.size()) { + CHECK(!mx::ir::IsTerminator(func_.instructions[addr_idx].opcode)) + << "EmitLoadFromLValue: EmitLValue returned terminator slot " + << addr_idx << " (opcode=" + << static_cast(func_.instructions[addr_idx].opcode) + << ") for expr " << eid << " (kind=" + << std::string(e.KindName()) << ")"; + } + // Check if this is a large object that can't be scalar-loaded. // For types > 8 bytes (structs, arrays, etc.), return the address directly — // the "value" of a large object is its pointer. Callers that need to copy @@ -945,6 +967,28 @@ uint32_t IRGenerator::GetOrMakeObject(const pasta::Decl &decl) { // --------------------------------------------------------------------------- uint32_t IRGenerator::EmitInstruction(InstructionIR inst) { + // Choke-point trap: a value-producing instruction must not consume a + // terminator (IMPLICIT_GOTO, RET, BREAK, CONTINUE, GOTO, UNREACHABLE, + // COND_BRANCH, SWITCH, ...) as a value operand. The + // DeclRefExpr→EnumConstantDecl miscompile produced exactly this + // shape: a LOAD whose address operand was the entry IMPLICIT_GOTO. + // Aborting here gives a live stack frame showing the AST node that + // lost the decl reference. + if (!mx::ir::IsTerminator(inst.opcode)) { + for (uint32_t op_idx : inst.operand_indices) { + if (op_idx >= func_.instructions.size()) continue; + auto operand_op = func_.instructions[op_idx].opcode; + CHECK(!mx::ir::IsTerminator(operand_op)) + << "EmitInstruction: opcode " + << static_cast(inst.opcode) + << " (source_entity_id=" << inst.source_entity_id + << ") would consume terminator " << op_idx + << " (opcode=" << static_cast(operand_op) + << ") as a value operand; a lvalue/rvalue lowering lost the " + "decl reference"; + } + } + inst.parent_block_index = current_block_index_; uint32_t idx = static_cast(func_.instructions.size()); func_.instructions.push_back(std::move(inst)); @@ -2274,6 +2318,27 @@ uint32_t IRGenerator::EmitRValue(const pasta::Expr &e) { return EmitRValue(ewc->SubExpression()); } + // DeclRefExpr to an EnumConstantDecl is *not* an lvalue — it's an + // integer constant. Lower it to CONST directly so it never hits the + // load-from-lvalue fallback (which would produce MEMORY/LOAD_* whose + // address operand is a phantom "alloca" that was never registered, + // misroutes through `object_to_alloca_[obj_idx]` -> 0, and reads from + // `%0` — the entry block's IMPLICIT_GOTO). + if (auto dre = pasta::DeclRefExpr::From(e)) { + if (auto ec = pasta::EnumConstantDecl::From(dre->Declaration())) { + InstructionIR inst; + inst.opcode = mx::ir::OpCode::CONST; + inst.source_entity_id = eid; + llvm::APSInt v = ec->InitializerValue(); + inst.int_value = v.getSExtValue(); + inst.uint_value = v.getZExtValue(); + inst.width = static_cast(v.getBitWidth()); + bool is_signed = v.isSigned(); + inst.const_op = static_cast(IntConstOp(inst.width, is_signed)); + return emit_typed(std::move(inst)); + } + } + // Lvalue expressions -- load from their address. if (pasta::DeclRefExpr::From(e) || pasta::MemberExpr::From(e) || @@ -4404,9 +4469,24 @@ uint32_t IRGenerator::EmitLValue(const pasta::Expr &e) { } } - // Local/parameter → reference the ALLOCA directly. + // Local/parameter → reference the ALLOCA directly. Refuse anything + // that doesn't look like a local/parameter VarDecl: an unrecognised + // decl kind (EnumConstantDecl, LabelDecl, BindingDecl, ...) silently + // misroutes through `object_to_alloca_[obj_idx]` and returns slot 0, + // which is the entry IMPLICIT_GOTO. A LOAD whose address operand is + // a terminator is not just invalid — it has no diagnostic at the + // call site, so callers see address `0x0` at runtime instead. + CHECK(pasta::VarDecl::From(decl).has_value()) + << "EmitLValue: DeclRefExpr to unsupported decl kind " + << std::string(decl.KindName()) + << " (entity " << EntityIdOf(decl) << ", expr " << eid << ")"; + uint32_t obj_idx = GetOrMakeObject(decl); - return object_to_alloca_[obj_idx]; + auto it = object_to_alloca_.find(obj_idx); + CHECK(it != object_to_alloca_.end()) + << "EmitLValue: VarDecl " << EntityIdOf(decl) + << " has no registered ALLOCA (obj_idx=" << obj_idx << ")"; + return it->second; } // MemberExpr. @@ -4974,6 +5054,129 @@ void IRGenerator::VerifyBlocks() { } } +void IRGenerator::VerifyIR() { + using OpCode = mx::ir::OpCode; + using MemOp = mx::ir::MemOp; + auto &instructions = func_.instructions; + auto &blocks = func_.blocks; + if (instructions.empty()) return; + + // Slot 0 is the entry block's first instruction (always IMPLICIT_GOTO, + // by construction). A value-producing instruction whose operand + // references slot 0 means a decl reference was lost during lowering. + auto is_scope_op = [](OpCode op) { + return op == OpCode::ENTER_SCOPE || op == OpCode::EXIT_SCOPE; + }; + auto fmt_inst = [&](uint32_t idx) -> std::string { + if (idx >= instructions.size()) return "?"; + return std::to_string(idx) + " (opcode=" + + std::to_string(static_cast(instructions[idx].opcode)) + ")"; + }; + + // Check #6: every block has a parent_structure_index. Orphaned blocks + // are the bug class fixed in #587 (compensation/unreachable/label + // forward-ref blocks); verifier guards against regression. + for (uint32_t bi = 0; bi < blocks.size(); ++bi) { + if (blocks[bi].parent_structure_index == UINT32_MAX) { + LOG(ERROR) << "VerifyIR: function entity " + << func_.func_decl_entity_id + << " block " << bi << " has no parent_structure_index"; + } + } + + for (uint32_t i = 0; i < instructions.size(); ++i) { + const auto &inst = instructions[i]; + + // Check #1 + #2 + #3: a value-producing instruction must not consume + // slot 0 (the entry IMPLICIT_GOTO), nor any other terminator/scope + // op, as a value operand. Skip terminators themselves — their + // "operands" can legitimately be the things they branch on, but the + // operand list of e.g. COND_BRANCH is the condition value, which + // isn't a terminator in well-formed IR. SWITCH consumes the + // selector via operand_indices[0], also a value. + for (uint32_t opi = 0; opi < inst.operand_indices.size(); ++opi) { + uint32_t op_idx = inst.operand_indices[opi]; + if (op_idx >= instructions.size()) { + LOG(ERROR) << "VerifyIR: function entity " + << func_.func_decl_entity_id + << " inst " << i << " operand " << opi + << " index " << op_idx << " out of range"; + continue; + } + OpCode op_op = instructions[op_idx].opcode; + + // A value operand referencing a terminator is the bug shape: + // the DeclRefExpr→EnumConstantDecl miscompile produced + // `MEMORY/LOAD_* [%term]` because EmitLValue handed back the + // entry IMPLICIT_GOTO's slot. Slot 0 specifically isn't the + // signal — it's whatever instruction was emitted first; only + // the terminator-as-value shape is unambiguously malformed. + if (mx::ir::IsTerminator(op_op)) { + LOG(ERROR) << "VerifyIR: function entity " + << func_.func_decl_entity_id + << " inst " << fmt_inst(i) + << " operand " << opi + << " is terminator " << fmt_inst(op_idx) + << "; terminators do not produce values"; + continue; + } + + if (is_scope_op(op_op)) { + LOG(ERROR) << "VerifyIR: function entity " + << func_.func_decl_entity_id + << " inst " << fmt_inst(i) + << " operand " << opi + << " is scope marker " << fmt_inst(op_idx) + << "; scope ops do not produce values"; + } + } + + // Check #4: MEMORY operand-count sanity vs. MemOp sub-opcode. + if (inst.opcode == OpCode::MEMORY) { + auto memop = static_cast(inst.mem_op); + auto n = inst.operand_indices.size(); + if (mx::ir::IsAnyLoad(memop)) { + // Plain load: address only. (BIT_READ_* also one operand.) + if (n != 1u) { + LOG(ERROR) << "VerifyIR: function entity " + << func_.func_decl_entity_id + << " inst " << fmt_inst(i) + << " (load mem_op=" << static_cast(memop) + << ") has " << n << " operands, expected 1"; + } + } else if (mx::ir::IsAnyStore(memop)) { + // Store: address + value (two operands). Bit-field stores are + // store_le/store_be with bit_offset/bit_width set; same shape. + if (n != 2u) { + LOG(ERROR) << "VerifyIR: function entity " + << func_.func_decl_entity_id + << " inst " << fmt_inst(i) + << " (store mem_op=" << static_cast(memop) + << ") has " << n << " operands, expected 2"; + } + } + // Bulk/string MemOps have variable shapes; left to per-op + // checks if a concrete miscompile motivates it later. + } + + // Check #5: SWITCH default sanity. With explicit-default codegen + // (#587), every SWITCH whose `default` is reachable sets + // is_default=true on its case entry and points to a real block. + // A switch whose only "default" path is implicit must not have a + // case entry pointing at slot 0. + if (inst.opcode == OpCode::SWITCH) { + for (const auto &c : inst.switch_cases) { + if (c.is_default && c.block_index == 0u) { + LOG(ERROR) << "VerifyIR: function entity " + << func_.func_decl_entity_id + << " inst " << fmt_inst(i) + << " switch default branch points to slot 0"; + } + } + } + } +} + void IRGenerator::ComputeFrameLayout() { uint32_t offset = 0; func_.has_dynamic_allocas = false; diff --git a/bin/Index/IRGen.h b/bin/Index/IRGen.h index f9c5386c3..f9bfaf191 100644 --- a/bin/Index/IRGen.h +++ b/bin/Index/IRGen.h @@ -360,6 +360,13 @@ class IRGenerator { void ComputeDominators(); void ComputeRPO(); void VerifyBlocks(); + // Semantic-level checks: catches malformed instructions that would + // pass VerifyBlocks (well-formed CFG, valid operand indices) but are + // semantically nonsense — e.g. a LOAD whose address operand is the + // entry IMPLICIT_GOTO, or a value-producing instruction whose operand + // points at a terminator/scope marker. Emits LOG(ERROR) per finding; + // does not throw — the partial IR is kept for downstream investigation. + void VerifyIR(); void ComputeFrameLayout(); }; diff --git a/bindings/Python/symex/dispatch.py b/bindings/Python/symex/dispatch.py index 13d70e38b..126e03827 100644 --- a/bindings/Python/symex/dispatch.py +++ b/bindings/Python/symex/dispatch.py @@ -305,13 +305,18 @@ def _shadow_write(shadow, addr, val, size): """ z3 = _z3_module() if z3 is not None and _is_z3(val): - # Simplify before per-byte decomposition: when ``val`` was - # constructed by re-concatenating bytes earlier read from the - # shadow (the common round-trip), z3 collapses the - # ``Extract(i+7, i, Concat(b_n, …, b_0))`` patterns back to the - # original byte expressions. Without this, every load-modify-store - # cycle accumulates Concat/Extract layers around the same bytes. - val = z3.simplify(val) + # Structural Concat-fold before per-byte decomposition: when + # ``val`` was built by re-concatenating bytes earlier read from + # the shadow (the load-modify-store round-trip), the fold + # collapses ``Concat(Extract(31,24,X),Extract(23,16,X),...)`` + # back to ``X`` (or a single Extract) so the per-byte + # decomposition below stores raw ``Extract(8i+7, 8i, X)`` + # entries. Using the structural fold here instead of + # ``z3.simplify`` avoids the asymmetric-byte-arith collapse + # that would otherwise produce ``212 + 255*b`` for the low + # byte of ``1492 - zext_32(b)`` and lose the parent identity + # for future loads. + val = _z3_concat_fold(z3, val) i = 0 while i < size: shadow[addr + i] = _z3_byte_at(val, i) @@ -369,7 +374,13 @@ def _shadow_read(shadow, addr, size, data, buf): while j >= 0: result = z3.Concat(result, buf[j]) j -= 1 - return z3.simplify(result) + # Structural fold only — collapses consecutive Extracts of the + # same parent back to the parent (or a single Extract). Avoids + # ``z3.simplify`` here on purpose: simplify folds the low byte + # through byte arith asymmetrically and prevents the parent from + # being recovered. A consumer op (compare, branch, cast) is the + # right place to run a full simplify. + return _z3_concat_fold(z3, result) def _make_default_mem_read(is_float, shadow=None, buf=None, byte_order="little"): @@ -569,6 +580,82 @@ def _z3_byte_at(val, i): return z3.Extract(8 * (i + 1) - 1, 8 * i, val) +def _z3_concat_fold(z3, expr): + """Structural fold for Concat-of-consecutive-Extracts-of-same-parent. + + Pure structure rewrite — never folds Extract through arithmetic. + `z3.simplify` would do that asymmetrically (the low byte of a + subtraction has no incoming borrow, so it folds into byte arith, + while high bytes don't), producing shapes like + `Concat(Extract(31, 8, X), 212 + 255*b)` where the low byte's + parent identity has been lost and the round-trip back to X is + impossible to recover. + + Steps: + + 1. Flatten nested Concats — z3 stores n-ary Concat as a + left-associated chain of binary applications, so + `Concat(a, b, c, d)` is `Concat(Concat(Concat(a, b), c), d)` + under the hood. The fold has to see it as a flat MSB→LSB + sequence to merge consecutive Extracts. + 2. Merge consecutive Extracts of the same parent at adjacent + bit ranges into a single Extract. + 3. If a single Extract spans the parent's full width, return + the parent itself. + + Rules 2 + 3 together recover the original parent expression on a + full-width round trip: store a 32-bit value as four byte shadows, + load all four, and the result is the original 32-bit term — no + z3.simplify required, and (more importantly) no asymmetric + arithmetic folding to the low byte. + """ + if not z3.is_app(expr) or expr.decl().kind() != z3.Z3_OP_CONCAT: + return expr + + # Step 1: flatten nested Concats into a single MSB→LSB list. + parts = [] + stack = [expr] + while stack: + node = stack.pop() + if z3.is_app(node) and node.decl().kind() == z3.Z3_OP_CONCAT: + # Children are MSB→LSB; push in reverse so the pop order + # walks them MSB-first. + kids = node.children() + for k in reversed(kids): + stack.append(k) + else: + parts.append(node) + + if len(parts) == 1: + return parts[0] + + # Step 2: left-to-right merge of adjacent same-parent Extracts. + out = [] + for p in parts: + if (out and + z3.is_app(p) and p.decl().kind() == z3.Z3_OP_EXTRACT and + z3.is_app(out[-1]) and + out[-1].decl().kind() == z3.Z3_OP_EXTRACT and + p.arg(0).eq(out[-1].arg(0))): + top_hi, top_lo = out[-1].params() + bot_hi, bot_lo = p.params() + if top_lo == bot_hi + 1: + out[-1] = z3.Extract(top_hi, bot_lo, p.arg(0)) + continue + out.append(p) + + if len(out) == 1: + only = out[0] + # Step 3: full-width Extract collapses to its parent. + if z3.is_app(only) and only.decl().kind() == z3.Z3_OP_EXTRACT: + hi, lo = only.params() + parent = only.arg(0) + if lo == 0 and hi == parent.size() - 1: + return parent + return only + return z3.Concat(*out) + + # Per-arity dispatch tables: each entry is `(opcode_range, builder)`. # Builders for ops that need a z3 module function take (z3, a[, b]) so # the cached `_z3_module()` reference flows through; the rest are pure diff --git a/tests/symex/c/symex_integration.c b/tests/symex/c/symex_integration.c index fa064cab6..a302ebec5 100644 --- a/tests/symex/c/symex_integration.c +++ b/tests/symex/c/symex_integration.c @@ -168,3 +168,51 @@ int32_t si_switch_constrained(int32_t sel) { default: return -1; } } + +// ----------------------------------------------------------------------- +// EnumConstantDecl reference regression corpus. +// +// A `DeclRefExpr` to an `EnumConstantDecl` is an rvalue that yields the +// enumerator's integer value — it must lower to a CONST, never to a +// MEMORY/LOAD. The functions and global below pin every shape of +// EnumConstantDecl reference we hit in C (aggregate initializer, +// scalar return, comparison, switch selector, function-call argument). +// ----------------------------------------------------------------------- + +enum si_color { si_red = 1, si_green = 2, si_blue = 3 }; + +struct si_color_entry { + const char *name; + enum si_color k; +}; + +// Aggregate initializer: each entry's `k` is a DeclRefExpr to an +// EnumConstantDecl. Without the fix these lower to MEMORY/LOAD_*. +static struct si_color_entry si_color_table[] = { + { "red", si_red }, + { "green", si_green }, + { "blue", si_blue }, +}; + +// Direct rvalue use: `return si_red;` is the bare DeclRefExpr. +int32_t si_enum_return(void) { + return (int32_t)si_red; +} + +// Comparison: scalar use of EnumConstantDecl on the rhs of ==. +int32_t si_enum_cmp(int32_t k) { + if ((enum si_color)k == si_blue) return 100; + return 0; +} + +// Pass an EnumConstantDecl as a function call argument. +int32_t si_enum_take(int32_t k) { return k + 1; } +int32_t si_enum_arg(void) { + return si_enum_take(si_green); +} + +// Read the table back so the global initializer is reachable. +const char *si_enum_table_name(int32_t i) { + if (i < 0 || i >= 3) return (const char *)0; + return si_color_table[i].name; +} diff --git a/tests/symex/test_concat_fold.py b/tests/symex/test_concat_fold.py new file mode 100644 index 000000000..ea15124b1 --- /dev/null +++ b/tests/symex/test_concat_fold.py @@ -0,0 +1,122 @@ +# Copyright (c) 2026-present, Trail of Bits, Inc. +# +# This source code is licensed in accordance with the terms specified in +# the LICENSE file found in the root directory of this source tree. + +"""Structural Concat-fold and shadow round-trip tests. + +The shadow store/load pipeline must recover the original parent +expression on a full-width round trip. `z3.simplify` can't do this — +it folds the low byte through byte arithmetic asymmetrically (the low +byte of `1492 - zext_32(b)` becomes `212 + 255*b`), which destroys +the parent identity for the high bytes. The structural Concat-fold +in `dispatch._z3_concat_fold` keeps the parent reachable. +""" + +import pytest + +z3 = pytest.importorskip("z3") + +from multiplier.symex.dispatch import ( + _z3_concat_fold, _shadow_write, _shadow_read, _z3_byte_at, +) + + +def _bv(val, w): + return z3.BitVecVal(val, w) + + +# --------------------------------------------------------------------------- +# Direct fold tests: structural rewrites only. +# --------------------------------------------------------------------------- + +def test_concat_fold_full_width_round_trip(): + """Concat of all four byte-Extracts of a 32-bit term collapses to + the term itself (rule: full-coverage Extract → identity).""" + b = z3.BitVec("b", 8) + x = z3.BitVecVal(1492, 32) - z3.ZeroExt(24, b) + parts = [z3.Extract(8 * (i + 1) - 1, 8 * i, x) for i in range(4)] + # MSB-first concat (matching _shadow_read's layout). + expr = z3.Concat(*reversed(parts)) + out = _z3_concat_fold(z3, expr) + assert out.eq(x), f"expected parent recovery, got {out}" + + +def test_concat_fold_partial_width_to_extract(): + """Concat of two consecutive Extracts of the same parent collapses + to one Extract spanning both.""" + x = z3.BitVec("x", 32) + expr = z3.Concat(z3.Extract(15, 8, x), z3.Extract(7, 0, x)) + out = _z3_concat_fold(z3, expr) + assert out.eq(z3.Extract(15, 0, x)), f"got {out}" + + +def test_concat_fold_mixed_parents_stays_split(): + """Different parents → fold leaves the Concat alone.""" + x = z3.BitVec("x", 32) + y = z3.BitVec("y", 32) + expr = z3.Concat(z3.Extract(31, 8, y), z3.Extract(7, 0, x)) + out = _z3_concat_fold(z3, expr) + # Must be a Concat with two operands (not collapsed). + assert z3.is_app(out) and out.decl().kind() == z3.Z3_OP_CONCAT + assert out.num_args() == 2 + + +def test_concat_fold_simplify_does_NOT_recover_parent(): + """Demonstrates the bug `_z3_concat_fold` exists to avoid: with + `z3.simplify`, the low byte is folded through arithmetic and the + full-Concat-collapse rule no longer matches.""" + b = z3.BitVec("b", 8) + x = z3.BitVecVal(1492, 32) - z3.ZeroExt(24, b) + parts = [z3.Extract(8 * (i + 1) - 1, 8 * i, x) for i in range(4)] + expr = z3.Concat(*reversed(parts)) + # Naive z3.simplify: bytes 1-3 collapse, byte 0 turns into byte arith. + naive = z3.simplify(expr) + assert not naive.eq(x), \ + "z3.simplify unexpectedly recovered the parent — the asymmetric" \ + " byte-fold isn't firing in this z3 build, so this test no" \ + " longer guards what it was written for." + # Our structural fold recovers the parent regardless. + structural = _z3_concat_fold(z3, expr) + assert structural.eq(x), f"structural fold lost the parent: {structural}" + + +# --------------------------------------------------------------------------- +# Shadow round-trip: STORE + LOAD via the actual helpers. +# --------------------------------------------------------------------------- + +def test_shadow_round_trip_recovers_parent_32bit(): + """Storing a 32-bit symbolic term then reading it back yields the + same term, not a byte-split residue.""" + b = z3.BitVec("b", 8) + x = z3.BitVecVal(1492, 32) - z3.ZeroExt(24, b) + + shadow = {} + _shadow_write(shadow, addr=0x100, val=x, size=4) + # Concrete fallback bytes — _shadow_read prefers shadow entries + # over these, so the actual values don't matter for this test. + data = bytes(4) + out = _shadow_read(shadow, addr=0x100, size=4, data=data, buf=[]) + assert out is not None + assert out.eq(x), f"round trip lost parent identity: {out}" + + +def test_shadow_partial_overwrite_keeps_minimal_concat(): + """Overwriting byte 0 of a 4-byte symbolic term with a new + 8-bit value yields `Concat(Extract(31, 8, X), new_byte)` — the + high bytes still reference X, the low byte is the new value, and + no further fold collapses across the parent boundary.""" + b = z3.BitVec("b", 8) + new_byte = z3.BitVec("new_byte", 8) + x = z3.BitVecVal(1492, 32) - z3.ZeroExt(24, b) + + shadow = {} + _shadow_write(shadow, addr=0x100, val=x, size=4) + # Now overwrite byte 0 with a fresh symbolic byte. + _shadow_write(shadow, addr=0x100, val=new_byte, size=1) + + data = bytes(4) + out = _shadow_read(shadow, addr=0x100, size=4, data=data, buf=[]) + assert out is not None + expected = z3.Concat(z3.Extract(31, 8, x), new_byte) + assert out.eq(expected), f"got {out}, expected {expected}" \ No newline at end of file diff --git a/tests/symex/test_irgen_decl_ref.py b/tests/symex/test_irgen_decl_ref.py new file mode 100644 index 000000000..141fcf84c --- /dev/null +++ b/tests/symex/test_irgen_decl_ref.py @@ -0,0 +1,182 @@ +# Copyright (c) 2026-present, Trail of Bits, Inc. +# +# This source code is licensed in accordance with the terms specified in +# the LICENSE file found in the root directory of this source tree. + +"""Regression tests for IRGen DeclRefExpr lowering. + +A `DeclRefExpr` to an `EnumConstantDecl` is an rvalue — it must lower +to a `CONST` instruction holding the enumerator's integer value. The +old code routed every `DeclRefExpr` through the lvalue fallback, which +turned an enum constant reference into `MEMORY/LOAD_LE_* [%0]` (a load +from instruction 0, the entry block's IMPLICIT_GOTO). + +These tests pin the corpus added at the bottom of +`tests/symex/c/symex_integration.c`. Without the IRGen fix every +function below produces at least one MEMORY instruction whose +source_statement is a DeclRefExpr to an EnumConstantDecl; with the fix +those references all produce CONST. +""" + +import pytest + +import multiplier as mx + +from conftest import find_ir_function + + +def _walk_instructions(ir_func): + """Yield every IRInstruction in `ir_func`, including nested sub- + instruction operands. `IRBlock.all_instructions` only enumerates + the block's root instructions; CONST nodes that materialise + EnumConstantDecl values live as operands of stores/calls/comparisons, + so the walk has to descend through `nth_operand`. + """ + if ir_func is None: + return + seen = set() + + def walk(inst): + if inst is None: + return + key = inst.id + if key in seen: + return + seen.add(key) + yield inst + for i in range(inst.num_operands): + child = inst.nth_operand(i) + if child is None: + continue + yield from walk(child) + + for block in ir_func.blocks: + for root in block.all_instructions: + yield from walk(root) + + +def _is_decl_ref_to_enum_constant(stmt): + if not isinstance(stmt, mx.ast.DeclRefExpr): + return False + return isinstance(stmt.declaration, mx.ast.EnumConstantDecl) + + +def _enum_const_refs(ir_func): + """All instructions whose source_statement is a DeclRefExpr to an + EnumConstantDecl, paired with the referenced enumerator name.""" + for inst in _walk_instructions(ir_func): + stmt = inst.source_statement + if _is_decl_ref_to_enum_constant(stmt): + yield inst, str(stmt.declaration.name) + + +def _find_var_decl(index, name): + for vd in mx.ast.VarDecl.IN(index): + if str(vd.name) == name: + return vd + return None + + +# --------------------------------------------------------------------------- +# Functional uses of EnumConstantDecl +# --------------------------------------------------------------------------- + +def _is_load(inst): + """True if `inst` is a MEMORY load. The bug shape is a LOAD whose + source_statement is a DeclRefExpr→EnumConstantDecl; wrapping STOREs + (e.g. into an argument alloca) legitimately inherit the same source + statement and are not part of the bug.""" + if inst.opcode != mx.ir.OpCode.MEMORY: + return False + s = inst.to_string + return "LOAD_" in s + + +@pytest.mark.parametrize("func_name,expected_names", [ + ("si_enum_return", {"si_red"}), + ("si_enum_cmp", {"si_blue"}), + ("si_enum_arg", {"si_green"}), +]) +def test_enum_constant_ref_emits_const(symex_index, func_name, expected_names): + """Every DeclRefExpr→EnumConstantDecl in `func_name` materialises as a + CONST, never as a MEMORY/LOAD.""" + ir = find_ir_function(symex_index, func_name) + assert ir is not None, f"{func_name} missing from index" + + const_names = set() + for inst, enum_name in _enum_const_refs(ir): + # The bug shape: a LOAD whose source statement is the + # DeclRefExpr to the EnumConstantDecl (no storage exists, so any + # LOAD is malformed by definition). + assert not _is_load(inst), ( + f"{func_name}: DeclRefExpr to EnumConstantDecl {enum_name!r} " + f"lowered to a load: {inst.to_string!r}" + ) + if inst.opcode == mx.ir.OpCode.CONST: + const_names.add(enum_name) + + # Every expected enumerator must materialise as a CONST somewhere + # in the function. (Wrapping STOREs that propagate the source + # statement are fine; the value sub-instruction must be a CONST.) + assert expected_names <= const_names, ( + f"{func_name}: expected CONST emissions for {expected_names}, " + f"saw {const_names}" + ) + + +# --------------------------------------------------------------------------- +# Aggregate initializer at file scope. +# --------------------------------------------------------------------------- + +def test_enum_constant_ref_in_global_aggregate(symex_index): + """`si_color_table`'s initializer references si_red/si_green/si_blue — + each must be a CONST, not a MEMORY load.""" + vd = _find_var_decl(symex_index, "si_color_table") + assert vd is not None, "si_color_table missing from index" + + ir = mx.ir.IRFunction.FROM(vd) + assert ir is not None, ( + "no global initializer IR for si_color_table — IRGen failed to emit " + "GenerateGlobalInit() output for the aggregate" + ) + + const_names = set() + for inst, enum_name in _enum_const_refs(ir): + assert not _is_load(inst), ( + f"si_color_table init: DeclRefExpr to {enum_name!r} lowered " + f"to a load: {inst.to_string!r}" + ) + if inst.opcode == mx.ir.OpCode.CONST: + const_names.add(enum_name) + + assert {"si_red", "si_green", "si_blue"} <= const_names, ( + f"expected CONSTs for si_red/si_green/si_blue in initializer, " + f"saw {const_names}" + ) + + +# --------------------------------------------------------------------------- +# Negative shape check: no instruction in the corpus has %0 as a +# value-producing operand. (`%0` is always the entry IMPLICIT_GOTO; a +# load/store/etc. that consumes it is the malformed shape we're guarding +# against.) +# --------------------------------------------------------------------------- + +@pytest.mark.parametrize("func_name", [ + "si_enum_return", "si_enum_cmp", "si_enum_arg", "si_enum_table_name", +]) +def test_no_value_consumer_of_entry_goto(symex_index, func_name): + ir = find_ir_function(symex_index, func_name) + assert ir is not None, f"{func_name} missing from index" + + for inst in _walk_instructions(ir): + if inst.is_terminator: + continue + for op_idx in range(inst.num_operands): + operand = inst.nth_operand(op_idx) + if operand is None: + continue + assert not operand.is_terminator, ( + f"{func_name}: instruction {inst.to_string!r} consumes a " + f"terminator ({operand.to_string!r}) as operand {op_idx}" + )