diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml index bb798eb93..1f83d5c42 100644 --- a/.github/workflows/macos.yml +++ b/.github/workflows/macos.yml @@ -12,10 +12,21 @@ jobs: - uses: actions/checkout@v3 with: path: cle + - name: Resolve binaries branch + id: binaries-ref + shell: bash + run: | + BRANCH="${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}" + if [ -n "$BRANCH" ] && git ls-remote --exit-code --heads https://github.com/angr/binaries.git "$BRANCH" >/dev/null 2>&1; then + echo "ref=$BRANCH" >> "$GITHUB_OUTPUT" + else + echo "ref=master" >> "$GITHUB_OUTPUT" + fi - uses: actions/checkout@v3 with: repository: angr/binaries path: binaries + ref: ${{ steps.binaries-ref.outputs.ref }} - uses: actions/setup-python@v4 with: python-version: "3.10" diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index f31803a06..d4448eda6 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -12,10 +12,21 @@ jobs: - uses: actions/checkout@v3 with: path: cle + - name: Resolve binaries branch + id: binaries-ref + shell: bash + run: | + BRANCH="${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}" + if [ -n "$BRANCH" ] && git ls-remote --exit-code --heads https://github.com/angr/binaries.git "$BRANCH" >/dev/null 2>&1; then + echo "ref=$BRANCH" >> "$GITHUB_OUTPUT" + else + echo "ref=master" >> "$GITHUB_OUTPUT" + fi - uses: actions/checkout@v3 with: repository: angr/binaries path: binaries + ref: ${{ steps.binaries-ref.outputs.ref }} - uses: actions/setup-python@v4 with: python-version: "3.10" diff --git a/cle/backends/macho/macho.py b/cle/backends/macho/macho.py index 97be4a1fd..de3920537 100644 --- a/cle/backends/macho/macho.py +++ b/cle/backends/macho/macho.py @@ -42,6 +42,28 @@ __all__ = ("MachO", "MachOSection", "MachOSegment", "SymbolList") +class _ChainStride(typing.NamedTuple): + """Stride layout for a dyld chained-pointer format: the byte multiplier for the `next` + field and whether to read the packed pointer 
through the Arm64e or Generic64 view.""" + + bytes: int + use_arm64e: bool + + +# Per dyld's fixup-chains.h: each pointer format defines a stride (the byte multiplier for the +# `next` field) and which packed-pointer layout to read (Arm64e vs Generic64). The two layouts give +# `next` different widths (12 bits for Generic64, 11 for Arm64e), so picking the wrong one yields garbage walks. +_CHAIN_STRIDE: dict[DyldChainedPtrFormats, _ChainStride] = { + DyldChainedPtrFormats.DYLD_CHAINED_PTR_ARM64E: _ChainStride(bytes=8, use_arm64e=True), + DyldChainedPtrFormats.DYLD_CHAINED_PTR_64: _ChainStride(bytes=4, use_arm64e=False), + DyldChainedPtrFormats.DYLD_CHAINED_PTR_64_OFFSET: _ChainStride(bytes=4, use_arm64e=False), + DyldChainedPtrFormats.DYLD_CHAINED_PTR_ARM64E_KERNEL: _ChainStride(bytes=4, use_arm64e=True), + DyldChainedPtrFormats.DYLD_CHAINED_PTR_ARM64E_USERLAND: _ChainStride(bytes=8, use_arm64e=True), + DyldChainedPtrFormats.DYLD_CHAINED_PTR_ARM64E_FIRMWARE: _ChainStride(bytes=4, use_arm64e=True), + DyldChainedPtrFormats.DYLD_CHAINED_PTR_ARM64E_USERLAND24: _ChainStride(bytes=8, use_arm64e=True), +} + + # pylint: disable=abstract-method class SymbolList(SortedKeyList): """ @@ -157,8 +179,12 @@ def __init__(self, *args, **kwargs): "7I", binary_file, 0, 28 ) - # Libraries are always implicitly PIC - self.pic = bool(self.flags & MH_flags.MH_PIE) or bool(self.filetype & MachoFiletype.MH_DYLIB) + # Libraries, bundles, and kexts are always implicitly PIC + self.pic = bool(self.flags & MH_flags.MH_PIE) or self.filetype in ( + MachoFiletype.MH_DYLIB, + MachoFiletype.MH_BUNDLE, + MachoFiletype.MH_KEXT_BUNDLE, + ) if not bool(self.flags & MH_flags.MH_TWOLEVEL): # ensure MH_TWOLEVEL log.error( @@ -207,6 +233,9 @@ def __init__(self, *args, **kwargs): # A Library is loaded as a dependency, this is fine, the loader will map it to somewhere above the main # binary, so we don't need to do anything pass + elif self.filetype in (MachoFiletype.MH_BUNDLE, MachoFiletype.MH_KEXT_BUNDLE): + if self.is_main_bin: + 
self._custom_base_addr = 0 else: # This case is not explicitly supported yet. # There are various other MachoFiletypes, which might have different quirks in their loading @@ -1154,60 +1183,88 @@ def _parse_dyld_chained_fixups(self): starts = self._get_struct(dyld_chained_starts_in_segment, starts_addr) seg = self.find_segment_containing(starts.segment_offset) - # There are weird binaries where the offsets inside the file - # and inside the virtual addr space don't match anymore. - # This isn't properly supported yet, and the only known case is the __PII section inside the __ETC segment - # of rare binaries, which isn't that important for most purposes - shift = seg.vaddr - (seg.offset) - if shift != 0: - assert isinstance(seg, MachOSegment) - assert seg.segname == "__ETC", ( - "Only __ETC segments are known to have this shift, please open an" - " issue for this binary so it can be investigated" - ) - log.error("Segment shift detected in, not handling fixups here for now") - continue + # In some binaries (kexts, __ETC segments) the segment's file offset and virtual + # address differ. Chain entries are read at *file* offsets but relocation addresses + # must be virtual (relative to the linked base). Compute the delta once here and + # add it when creating relocations below. + file_to_vaddr_shift = seg.vaddr - seg.offset if seg is not None else 0 page_starts_data = self._read(self._binary_stream, starts_addr + 22, starts.page_count * 2) page_starts = struct.unpack("<" + ("H" * starts.page_count), page_starts_data) pointer_format: DyldChainedPtrFormats = starts.pointer_format log.info("Page has pointer_format: %s", pointer_format) + # Each pointer format has its own (next, stride) layout. Both layouts start `next` at + # bit 51, but Generic64 makes it 12 bits wide while Arm64e makes it 11 bits (bit 62 is `bind`). Mixing them up reads + # garbage out of the chain header — see the kext path with DYLD_CHAINED_PTR_ARM64E_KERNEL. 
+ stride = _CHAIN_STRIDE.get(pointer_format) + if stride is None: + raise NotImplementedError(f"Chain stride for pointer format {pointer_format} not known") + is_arm64e = stride.use_arm64e for j, start in enumerate(page_starts): if start == DYLD_CHAINED_PTR_START_NONE: continue - chain_entry_addr = starts.segment_offset + (j * starts.page_size) + start - current_chain_addr = chain_entry_addr + page_base = starts.segment_offset + (j * starts.page_size) + page_end = page_base + starts.page_size + current_chain_addr = page_base + start log.info("Reading chain at %x", current_chain_addr) while True: - chained_rebase_ptr: ChainedFixupPointerOnDisk = self._get_struct( - ChainedFixupPointerOnDisk, current_chain_addr - ) + try: + chained_rebase_ptr: ChainedFixupPointerOnDisk = self._get_struct( + ChainedFixupPointerOnDisk, current_chain_addr + ) + except ValueError: + log.warning("Chain entry at %#x extends past end of file; stopping", current_chain_addr) + break bind = chained_rebase_ptr.isBind(pointer_format) rebase = chained_rebase_ptr.isRebase(pointer_format, self.mapped_base) if bind is not None: libOrdinal, _addend = bind + if libOrdinal >= len(self._dyld_imports): + log.error( + "Chained fixup bind ordinal %d out of range (have %d imports) at %#x; " + "stopping chain walk", + libOrdinal, + len(self._dyld_imports), + current_chain_addr, + ) + break import_symbol = self._dyld_imports[libOrdinal] - reloc = MachOSymbolRelocation(self, import_symbol, current_chain_addr, None) + reloc_addr = current_chain_addr + file_to_vaddr_shift + reloc = MachOSymbolRelocation(self, import_symbol, reloc_addr, None) self.relocs.append(reloc) - # Legacy Code uses bind_xrefs, explicitly add this to make this compatible for now import_symbol.bind_xrefs.append(reloc.dest_addr + self.linked_base) - log.debug("Binding for %s found at %x", import_symbol, current_chain_addr) + log.debug("Binding for %s found at %x", import_symbol, reloc_addr) elif rebase is not None: + reloc_addr = 
current_chain_addr + file_to_vaddr_shift target = self.linked_base + rebase - location: MemoryPointer = self.linked_base + current_chain_addr - anon_reloc = MachOPointerRelocation(owner=self, relative_addr=current_chain_addr, data=rebase) + location: MemoryPointer = self.linked_base + reloc_addr + anon_reloc = MachOPointerRelocation(owner=self, relative_addr=reloc_addr, data=rebase) self.relocs.append(anon_reloc) log.debug("Rebase to %x found at %x", target, location) else: raise CLEInvalidBinaryError("FixupPointer was neither bind nor rebase, that shouldn't happen") - skip = chained_rebase_ptr.generic64.rebase.next * 4 - current_chain_addr += skip + if is_arm64e: + next_count = chained_rebase_ptr.arm64e.rebase.next + else: + next_count = chained_rebase_ptr.generic64.rebase.next + skip = next_count * stride.bytes if skip == 0: break + current_chain_addr += skip + if current_chain_addr >= page_end: + # Chains are per-page; if a malformed chain would walk into the next page, + # stop rather than reinterpreting unrelated data as fixup entries. 
+ log.warning( + "Chain walked past page end at %#x (page %#x..%#x); stopping", + current_chain_addr, + page_base, + page_end, + ) + break def get_symbol_by_address_fuzzy(self, address): """ diff --git a/cle/backends/macho/structs.py b/cle/backends/macho/structs.py index b636931e8..351f31f48 100644 --- a/cle/backends/macho/structs.py +++ b/cle/backends/macho/structs.py @@ -130,6 +130,15 @@ class dyld_chained_ptr_arm64e_bind24(HelperStruct): https://github.com/apple-opensource/dyld/blob/852.2/include/mach-o/fixup-chains.h#L164-L173 """ + _fields_ = [ + ("ordinal", c_uint64, 24), + ("zero", c_uint64, 8), + ("addend", c_uint64, 19), + ("next", c_uint64, 11), + ("bind", c_uint64, 1), + ("auth", c_uint64, 1), + ] + # noinspection PyPep8Naming class dyld_chained_ptr_arm64e_auth_bind24(HelperStruct): diff --git a/cle/backends/universal2.py b/cle/backends/universal2.py index bf86fc36a..8042b8934 100644 --- a/cle/backends/universal2.py +++ b/cle/backends/universal2.py @@ -117,21 +117,32 @@ def __init__(self, *args, arch=None, **kwargs): slices.append((cputype, cpusubtype, offset, size, align)) self._fat_arches = list(slices) - # Filter to requested architecture if specified + # Pick which slice(s) to actually load. Loading every slice into memory at once produces + # conflicting placement requirements (multiple MH_EXECUTE slices all want 0x400000) and + # multiple is_main_bin objects, which break downstream consumers. The rules are: + # - If arch= was passed explicitly, honor it. + # - Otherwise, if we are the main binary, pick the first slice and warn. + # - Otherwise (loaded as a dependency), pick the slice matching the main binary's arch. 
if arch is not None: if not isinstance(arch, archinfo.Arch): raise TypeError(f"arch must be an archinfo.Arch instance, got {type(arch).__name__}") - filtered = [] - for cputype, cpusubtype, offset, size, align in slices: - slice_arch = _cputype_to_arch(cputype) - if slice_arch is not None and isinstance(arch, type(slice_arch)): - filtered.append((cputype, cpusubtype, offset, size, align)) - if not filtered: + slices = self._filter_slices_by_arch(slices, arch) + elif self._is_main_universal: + if len(slices) > 1: available = [CPU_TYPE_NAMES.get(s[0], f"unknown(0x{s[0]:X})") for s in slices] - raise KeyError( - f"Architecture {arch!r} not found in universal binary. Available architectures: {available}" + log.warning( + "Universal binary contains multiple architectures %s; " + "loading only the first (%s). Pass arch= to select a specific slice.", + available, + available[0], ) - slices = filtered + slices = slices[:1] + else: + main_arch = self.loader._main_object.arch if self.loader._main_object is not None else None + if main_arch is None: + slices = slices[:1] + else: + slices = self._filter_slices_by_arch(slices, main_arch) # Load each slice using _load_object_isolated. # Unlike StaticArchive (where children are .o files), universal binary slices @@ -172,6 +183,18 @@ def __init__(self, *args, arch=None, **kwargs): if self.loader._main_object is self: self.loader._main_object = None + @staticmethod + def _filter_slices_by_arch(slices, arch): + filtered = [] + for entry in slices: + slice_arch = _cputype_to_arch(entry[0]) + if slice_arch is not None and isinstance(arch, type(slice_arch)): + filtered.append(entry) + if not filtered: + available = [CPU_TYPE_NAMES.get(s[0], f"unknown(0x{s[0]:X})") for s in slices] + raise KeyError(f"Architecture {arch!r} not found in universal binary. 
Available architectures: {available}") + return filtered + @property def available_arches(self): """Return the list of architecture names present in the universal binary's fat header.""" diff --git a/tests/test_macho_kext.py b/tests/test_macho_kext.py new file mode 100644 index 000000000..d8238679e --- /dev/null +++ b/tests/test_macho_kext.py @@ -0,0 +1,93 @@ +from __future__ import annotations + +import os + +import cle +from cle import MachO +from cle.backends.macho.macho_enums import MachoFiletype +from cle.backends.macho.segment import MachOSegment + +TEST_BASE = os.path.join(os.path.dirname(os.path.realpath(__file__)), os.path.join("..", "..", "binaries")) +KEXT = os.path.join(TEST_BASE, "tests", "aarch64", "IPwnKit.macho.kext") + + +def test_kext_loads(): + ld = cle.Loader(KEXT, auto_load_libs=False) + assert isinstance(ld.main_object, MachO) + assert ld.main_object.filetype == MachoFiletype.MH_KEXT_BUNDLE + + +def test_kext_arch(): + ld = cle.Loader(KEXT, auto_load_libs=False) + assert ld.main_object.arch.name == "AARCH64" + + +def test_kext_pic(): + ld = cle.Loader(KEXT, auto_load_libs=False) + assert ld.main_object.pic is True + + +def test_kext_base_addr(): + ld = cle.Loader(KEXT, auto_load_libs=False) + assert ld.main_object.mapped_base == 0 + + +def test_kext_segments(): + ld = cle.Loader(KEXT, auto_load_libs=False) + mo = ld.main_object + assert isinstance(mo, MachO) + segnames = [s.segname for s in mo.segments] + assert "__TEXT" in segnames + assert "__TEXT_EXEC" in segnames + assert "__DATA" in segnames + assert "__DATA_CONST" in segnames + assert "__LINKEDIT" in segnames + + +def test_kext_sections(): + ld = cle.Loader(KEXT, auto_load_libs=False) + mo = ld.main_object + assert isinstance(mo, MachO) + section_names = set() + for seg in mo.segments: + assert isinstance(seg, MachOSegment) + for sec in seg.sections: + section_names.add((seg.segname, sec.sectname)) + assert ("__TEXT_EXEC", "__text") in section_names + assert ("__TEXT_EXEC", 
"__auth_stubs") in section_names + assert ("__DATA_CONST", "__auth_got") in section_names + assert ("__DATA_CONST", "__got") in section_names + + +def test_kext_symbols(): + ld = cle.Loader(KEXT, auto_load_libs=False) + mo = ld.main_object + assert len(mo.symbols) > 100 + sym_names = {s.name for s in mo.symbols} + assert "_kmod_info" in sym_names + assert "__realmain" in sym_names + assert "_IPwnKit_start" in sym_names + assert "_IPwnKit_stop" in sym_names + + +def test_kext_iokit_class_symbols(): + ld = cle.Loader(KEXT, auto_load_libs=False) + sym_names = {s.name for s in ld.main_object.symbols} + assert "__ZN21io_oooverflow_IPwnKit5startEP9IOService" in sym_names + assert "__ZN31io_oooverflow_IPwnKitUserClient10gMetaClassE" in sym_names + + +def test_kext_relocations(): + ld = cle.Loader(KEXT, auto_load_libs=False) + assert len(ld.main_object.relocs) > 0 + + +def test_kext_code_readable(): + ld = cle.Loader(KEXT, auto_load_libs=False) + mo = ld.main_object + start_sym = [s for s in mo.symbols if s.name == "_IPwnKit_start" and s.relative_addr != 0] + assert len(start_sym) > 0 + addr = start_sym[0].relative_addr + data = mo.memory.load(addr, 4) + assert len(data) == 4 + assert data != b"\x00\x00\x00\x00" diff --git a/tests/test_universal2.py b/tests/test_universal2.py index 6e3bed03d..583a281d9 100644 --- a/tests/test_universal2.py +++ b/tests/test_universal2.py @@ -29,8 +29,8 @@ def test_universal2_autodetect(): assert type(ld.main_object) is Universal2 -def test_universal2_load_all_slices(): - """Test loading all architecture slices from a universal binary.""" +def test_universal2_default_first_slice(): + """Test that loading without arch= picks only the first slice.""" ld = cle.Loader(FATBIN, auto_load_libs=False) main = ld.main_object @@ -38,19 +38,14 @@ def test_universal2_load_all_slices(): assert main.is_outer is True assert main.has_memory is False - # Should have two child objects (x86_64 + aarch64) - assert len(main.child_objects) == 2 - assert 
len(main.slices) == 2 - - # All children should be MachO objects parented to the Universal2 - for child in main.child_objects: - assert isinstance(child, MachO) - assert child.parent_object is main + # Should load only the first slice when no arch is specified + assert len(main.child_objects) == 1 + assert len(main.slices) == 1 - # Check that both expected architectures are present - arch_names = {child.arch.name for child in main.child_objects} - assert "AMD64" in arch_names - assert "AARCH64" in arch_names + # The child should be a MachO object parented to the Universal2 + child = main.child_objects[0] + assert isinstance(child, MachO) + assert child.parent_object is main def test_universal2_load_single_arch(): @@ -101,9 +96,27 @@ def test_universal2_available_arches(): def test_universal2_child_names(): """Test that child objects have descriptive names including architecture.""" - ld = cle.Loader(FATBIN, auto_load_libs=False) + ld = cle.Loader(FATBIN, auto_load_libs=False, main_opts={"arch": archinfo.ArchAMD64()}) main = ld.main_object names = {child.binary_basename for child in main.child_objects} assert any("[x64]" in n for n in names) - assert any("[aarch64]" in n for n in names) + + +def test_universal2_filter_slices_by_arch(): + """The slice-filter helper used by the dependency-loading path picks the matching arch.""" + # (cputype, cpusubtype, offset, size, align) tuples — only the cputype field matters here. + slices = [ + (0x1000007, 0, 0, 0, 0), # x86_64 + (0x100000C, 0, 0, 0, 0), # aarch64 + ] + aarch64 = Universal2._filter_slices_by_arch(slices, archinfo.ArchAArch64()) + assert len(aarch64) == 1 + assert aarch64[0][0] == 0x100000C + + amd64 = Universal2._filter_slices_by_arch(slices, archinfo.ArchAMD64()) + assert len(amd64) == 1 + assert amd64[0][0] == 0x1000007 + + with pytest.raises(KeyError, match="not found in universal binary"): + Universal2._filter_slices_by_arch(slices, archinfo.ArchMIPS32())