saltytrain2 · edmcman · Oct 10, 2023 · Oct 10, 2023 · Oct 10, 2023 · Oct 26, 2023
diff --git a/.github/actions/setup/action.yml b/.github/actions/setup/action.yml
@@ -0,0 +1,52 @@
+# .github/actions/setup/action.yml
+# Composite action shared by the workflows: installs Python, Java and Ghidra.
+name: Setup
+description: Shared logic for workflows
+runs:
+  using: "composite"
+  steps:
+    # The Python version is read from the calling job's matrix context, so
+    # callers are expected to define `matrix.python-version`.
+    - uses: actions/setup-python@v5
+      with:
+        python-version: ${{ matrix.python-version }}
+        cache: 'pip'
+    # Java 21 (Zulu distribution) is installed before the Ghidra setup step.
+    - uses: actions/setup-java@v4
+      with:
+        distribution: 'zulu' # See 'Supported distributions' for available options
+        java-version: '21'
+    # Later steps in this action read $GHIDRA_INSTALL_DIR; presumably this
+    # action exports it - confirm.
+    - uses: antoniovazquezblanco/setup-ghidra@v2.0.4
+
+    # Bring the packaging toolchain up to date before installing anything.
+    - name: Upgrade pip
+      shell: bash
+      run: python -m pip install --upgrade pip setuptools wheel
+
+    # Refresh the package index first: the index shipped in a runner image can
+    # be stale, which makes `apt-get install` fail on packages that have moved.
+    - name: Install apt dependencies
+      shell: bash
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y pkg-config libsentencepiece-dev libprotobuf-dev
+
+    # Python requirements of the repository itself.
+    - name: Install dependencies
+      shell: bash
+      run: pip install -r requirements.txt
+
+    # Fetch the Ghidrathon v4.0.0 release archive(s) into the workspace.
+    - name: Download Ghidrathon
+      uses: robinraju/release-downloader@v1
+      with:
+        # The source repository path.
+        # Expected format {owner}/{repo}
+        # Default: ${{ github.repository }}
+        repository: mandiant/Ghidrathon
+        tag: v4.0.0
+        fileName: '*.zip'
+
+    # Unpack the downloaded release, install its Python requirements, configure
+    # it against the Ghidra install, and unpack the inner extension archive into
+    # Ghidra's Extensions directory.
+    - name: Install Ghidrathon
+      shell: bash
+      run: |
+        mkdir -p ghidrathon-tmp
+        unzip Ghidrathon*.zip -d ghidrathon-tmp
+        pip install -r ghidrathon-tmp/requirements.txt
+        # Quote the install dir so a path containing spaces is not word-split.
+        python ghidrathon-tmp/ghidrathon_configure.py "$GHIDRA_INSTALL_DIR"
+        unzip ghidrathon-tmp/Ghidrathon*.zip -d "$GHIDRA_INSTALL_DIR/Ghidra/Extensions"
+        #$GHIDRA_INSTALL_DIR/support/analyzeHeadless projects TmpProject -import /bin/ls
+
+    # Directory passed to analyzeHeadless as the Ghidra project location.
+    - name: Make projects directory
+      shell: bash
+      run: mkdir -p projects
+
diff --git a/.github/workflows/install.yml b/.github/workflows/install.yml
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -1,48 +1,82 @@
-name: Test
+name: Test DIRTY Ghidra
 
 on:
   push:
-    branches: [ main ]
   pull_request:
-    branches: [ main ]
-
   workflow_dispatch:
 
 jobs:
-  install:
-    # The type of runner that the job will run on
+  test-train:
     runs-on: ${{ matrix.os }}
     strategy:
       fail-fast: false
       matrix:
-        os: [ubuntu-20.04]
-        python-version: [3.6, 3.7, 3.8]
+        os: [ubuntu-22.04, ubuntu-24.04]
+        python-version: ["3.10", "3.11"]
 
     steps:
-      # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
-      - uses: actions/checkout@v2
-      - uses: actions/setup-python@v2
-        with:
-          python-version: ${{ matrix.python-version }}
-
-      - name: Upgrade pip
-        run: python -m pip install --upgrade pip setuptools wheel
-      - name: Install dependencies
-        run: pip install -r requirements.txt
-      - name: Download data and model
-        working-directory: ./dirty
+      - uses: actions/checkout@v4
+      - name: Setup
+        uses: ./.github/actions/setup
+
+      # Builds a tiny throwaway corpus for the training smoke test: compiles 20
+      # small C programs with debug info (gcc -g), runs generate.py over them via
+      # Ghidra's analyzeHeadless, then runs gen_names.py on the generated output.
+      - name: Generate dataset
+        run: |
+          set -ex
+          mkdir -p $DATASET_DIR
+          cd $DATASET_DIR
+          # Create some dummy programs
+          for n in $(seq 20); do echo -e "#include <stdio.h>\nint main(int argc, const char** argv) { printf(\"%d %d\\\\n\", $n, argc); }" > s$n.c; gcc -g s$n.c -o s$n; rm s$n.c; done
+          cd $GITHUB_WORKSPACE/dataset-gen-ghidra
+          python generate.py --verbose --ghidra $GHIDRA_INSTALL_DIR/support/analyzeHeadless -t 1 -b $DATASET_DIR -o $DATA_DIR/unprocessed
+          cd $DATA_DIR/unprocessed && python $GITHUB_WORKSPACE/dataset-gen-ghidra/gen_names.py $DATA_DIR/unprocessed
+        # Both directories live under the runner's temp directory; DATASET_DIR
+        # holds the compiled binaries, DATA_DIR the generator output.
+        env:
+          DATASET_DIR: ${{ runner.temp }}/dataset
+          DATA_DIR: ${{ runner.temp }}/data
+
+      - name: Preprocess dataset
         run: |
-          wget -q cmu-itl.s3.amazonaws.com/dirty/dirt.tar.gz -O dirt.tar.gz
-          tar -xzf dirt.tar.gz
-          mkdir exp_runs/
-          wget -q cmu-itl.s3.amazonaws.com/dirty/dirty_mt.ckpt -O exp_runs/dirty_mt.ckpt
+          set -ex
+          cd $GITHUB_WORKSPACE/dirty
+          python -m utils.preprocess $DATA_DIR/unprocessed $DATA_DIR/unprocessed/files.txt $DATA_DIR/processed
+          ln -s $DATA_DIR/processed $(pwd)/data1
+          python -m utils.vocab --size=164 --use-bpe "$DATA_DIR/processed/"'train-*.tar' "$DATA_DIR/processed/typelib.json" data1/vocab.bpe10000
+        env:
+          DATA_DIR: ${{ runner.temp }}/data
+      - name: Train on dataset
+        run: |
+          set -ex
+          cd $GITHUB_WORKSPACE/dirty
           wandb offline
-      - name: Infer and evaluate
-        working-directory: ./dirty
+          python exp.py train multitask_test_ci.xfmr.jsonnet
+          find . -name '*.ckpt'
+  test-inference:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-22.04, ubuntu-24.04]
+        python-version: ["3.10", "3.11"]
+
+    steps:
+      - uses: actions/checkout@v4
+      - name: Setup
+        uses: ./.github/actions/setup
+
+      # The requirement is quoted so the shell can never treat `[cli]` as a
+      # glob character class and expand it against files in the workspace.
+      - name: Install huggingface-cli
+        run: pip install 'huggingface_hub[cli]'
+
+      # Caches the model download directory between runs.
+      # NOTE(review): the key is a constant, so it does not vary with OS, Python
+      # version or model revision - confirm that a stale cache is acceptable.
+      - name: Cache model files
+        uses: actions/cache@v4
+        with:
+          path: ${{ runner.temp }}/model-dl
+          key: hf-model-dl
+
+      # Downloads the model repository into the cached directory, then copies
+      # its data1 directory to dirty/data1 inside the checkout.
+      - name: Download model files
+        run: huggingface-cli download --repo-type model ejschwartz/dirty-ghidra --local-dir $MODEL_DL_DIR && cp -R $MODEL_DL_DIR/data1 $GITHUB_WORKSPACE/dirty/data1
+        env:
+          MODEL_DL_DIR: ${{ runner.temp }}/model-dl
+
+      - name: Run DIRTY inference
         run: |
-          python exp.py train --expname=eval_dirty_mt multitask_test_ci.xfmr.jsonnet --eval-ckpt exp_runs/dirty_mt.ckpt
-          cat test_result.json
-          cat test_result.json | jq ".test_retype_acc"
-          cat test_result.json | jq ".test_rename_acc"
-          cat test_result.json | jq ".test_retype_acc" | awk '{if ($1 < 0.6) exit 1}'
-          cat test_result.json | jq ".test_rename_acc" | awk '{if ($1 < 0.5) exit 1}'
+          $GHIDRA_INSTALL_DIR/support/analyzeHeadless projects MyProject -import /bin/ls -postScript $GITHUB_WORKSPACE/scripts/DIRTY_infer.py $(pwd)/infer_success.txt
+          test -f infer_success.txt
diff --git a/.gitignore b/.gitignore
@@ -141,3 +141,11 @@ cython_debug/
 *.c
 
 exp_runs/
+
+# wandb
+dirty/wandb/
+dirty/multitask-greedy.xfmr.jsonnet
+dirty/data1
+dirty/struct_files.txt
+dirty/eval.pkl
+dirty/forward.pkl
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,30 @@
+# docker build -t dirty-ghidra .
+# docker run -d --name dirty-ghidra --gpus '"device=3,4"' -it -v /path/to/data:/data dirty-ghidra
+
+# NOTE(review): `latest` is a moving tag; consider pinning a specific Ghidra
+# release so image builds are reproducible.
+FROM blacktop/ghidra:latest
+
+# System packages: Python tooling plus the native libraries the Python
+# requirements build against. The apt caches are kept in BuildKit cache mounts.
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
+  --mount=type=cache,target=/var/lib/apt,sharing=locked \
+  apt-get -y update && apt-get -y install python3-pip python-is-python3 \
+  git pkg-config libsentencepiece-dev libprotobuf-dev nano sudo unzip
+
+# Install Ghidrathon
+
+WORKDIR /tmp/
+
+RUN wget https://github.com/mandiant/Ghidrathon/releases/download/v4.0.0/Ghidrathon-v4.0.0.zip
+RUN unzip Ghidrathon-v4.0.0.zip -d ghidrathon
+RUN --mount=type=cache,target=/root/.cache pip install --break-system-packages -r ghidrathon/requirements.txt
+RUN python ghidrathon/ghidrathon_configure.py /ghidra
+RUN unzip ghidrathon/Ghidrathon-v4.0.0.zip -d /ghidra/Ghidra/Extensions
+
+# Install DIRTY Ghidra
+
+WORKDIR /
+
+COPY . /DIRTY
+
+RUN --mount=type=cache,target=/root/.cache pip install --break-system-packages --upgrade -r /DIRTY/requirements.txt
+
+# CMD elements are appended to the exec-form ENTRYPOINT as separate arguments,
+# and `sh -c` executes only its first argument as the command string (the rest
+# become $0, $1, ...). The keep-alive command therefore has to be ONE string;
+# split into three elements it would run a bare `tail` that reads stdin and
+# exits as soon as the container is started without an attached terminal.
+ENTRYPOINT ["/bin/sh", "-c"]
+CMD ["tail -f /dev/null"]
diff --git a/README.md b/README.md
@@ -8,17 +8,43 @@ While most of the model code remains identical, we add support for generating a
 The original README provides clear instructions on how to download and run their pre-trained DIRTY model, but the README's instructions are slightly unclear when describing how to train your own model.
 This README explicitly covers all the steps necessary to train a DIRTY model from scratch.
 
+This is @edmcman's fork of the original DIRTY-Ghidra repository.  It features a number of improvements and bug fixes, and also includes the ability to perform inference on new examples.
+
+## Getting Started with DIRTY-Ghidra Inference
+
+[![Test DIRTY Ghidra's inference ability](https://github.com/edmcman/DIRTY-Ghidra/actions/workflows/test.yml/badge.svg)](https://github.com/edmcman/DIRTY-Ghidra/actions/workflows/test.yml)
+
+Most people probably just want to use DIRTY-Ghidra to predict variable names and
+types for their own binaries.  If that is you, follow these instructions:
+
+1. Clone this repository to `DIRTY_DIR`
+2. Optional but highly recommended: Create a virtual environment (venv) with `python -m venv /path/to/venv; source /path/to/venv/bin/activate`. This will prevent DIRTY from interfering with your system python packages.
+3. Install the requirements via `pip install -r requirements.txt`
+4. [Install Ghidra](https://ghidra-sre.org/InstallationGuide.html)
+5. [Install Ghidrathon](https://github.com/mandiant/Ghidrathon/?tab=readme-ov-file#installing-ghidrathon).  Make sure you configure Ghidrathon (`python
+   ghidrathon_configure.py`) using the venv from step 2.
+6. Download the latest model from HF (`pip install 'huggingface_hub[cli]' && huggingface-cli download --repo-type model ejschwartz/dirty-ghidra --local-dir $DIRTY_DIR/dirty`)
+7. Run `mkdir -p ~/ghidra_scripts && ln -s $DIRTY_DIR/scripts/DIRTY_infer.py ~/ghidra_scripts/DIRTY_infer.py` if on Linux.
+8. Open a function in Ghidra.  Run the script `DIRTY_infer.py` in the script manager.
+9. Optionally assign the script to a keyboard shortcut.
+
 ## Requirements
 
-- Linux with Python 3.6/3.7/3.8
+- Linux with Python 3.10+
 - [PyTorch ≥ 1.5.1](https://pytorch.org/)
-- [Ghidrathon 1.0.0](https://github.com/mandiant/Ghidrathon)
+- [Ghidrathon >= 4.0.0](https://github.com/mandiant/Ghidrathon)
 - `pip install -r requirements.txt`
 
+### Libraries
+
+A few libraries are required by the Python packages.  On Ubuntu, you can install
+these with:
+- `apt install pkg-config libsentencepiece-dev libprotobuf-dev`
+
 ## Training a DIRTY model
 
 ### Dataset Generation
-The first step to train DIRTY is to obtain a unprocessed DIRT dataset. Instructions can be found in the `dataset-gen-ghidra` folder.
+The first step to train DIRTY is to obtain an unprocessed DIRT dataset. Instructions can be found in the [dataset-gen-ghidra](dataset-gen-ghidra) folder.
 
 ### Dataset Preprocessing
 
@@ -40,7 +66,7 @@ We also need to build a vocabulary of tokens that the model will understand
 
 ```bash
 # inside the `dirty` directory
-python3 -m utils.vocab [-h] [options] TRAIN_FILES_TAR PATH_TO_TYPELIB_JSON TARGET_DIRECTORY/vocab.bpe10000
+python3 -m utils.vocab [-h] --use-bpe [options] TRAIN_FILES_TAR PATH_TO_TYPELIB_JSON TARGET_DIRECTORY/vocab.bpe10000
 ```
 
 This script generates vocabulary files located in `TARGET_DIRECTORY`. It is recommended to prefix the vocab files with `vocab.bpe10000` to match the expected vocabulary filenames in the model config files.

diff --git a/binary/dire_types.py b/binary/dire_types.py
@@ -973,6 +973,29 @@ def __hash__(self) -> int:
     def __str__(self) -> str:
         return "void"
 
+class TypeDef(TypeInfo):
+    """Type entry for a typedef.
+
+    Holds the typedef's own ``name``, its ``size``, and ``other_type_name``
+    (presumably the name of the type being aliased - confirm against the
+    dataset generator). Equality and hashing use all three fields; the string
+    form is just ``name``. Serialized with the type tag ``"T": 11``.
+    """
+
+    def __init__(self, name: str, size, other_type_name) -> None:
+        # `size` is presumably an integer byte count and `other_type_name` a
+        # string; neither is checked here.
+        self.name = name
+        self.size = size
+        self.other_type_name = other_type_name
+
+    @classmethod
+    def _from_json(cls, d: t.Dict[str, t.Any]) -> "TypeDef":
+        """Build a TypeDef from the "name", "size" and "other_type_name" keys of ``d``."""
+        return cls(name=d["name"], size=d["size"], other_type_name=d["other_type_name"])
+
+    def _to_json(self) -> t.Dict[str, t.Any]:
+        """Return the JSON dict for this typedef, tagged with ``"T": 11``."""
+        return {"T": 11, "name": self.name, "size": self.size, "other_type_name": self.other_type_name}
+
+    def __eq__(self, other: t.Any) -> bool:
+        # Equal only to another TypeDef whose three fields all match.
+        return isinstance(other, TypeDef) and self.name == other.name and self.size == other.size and self.other_type_name == other.other_type_name
+
+    def __hash__(self) -> int:
+        # Hashes the same three fields that __eq__ compares.
+        return hash((self.name, self.size, self.other_type_name))
+
+    def __str__(self) -> str:
+        return self.name
+
 class Disappear(TypeInfo):
     """Target type for variables that don't appear in the ground truth function"""
     size = 0
@@ -1061,6 +1084,7 @@ def read_metadata(d: t.Dict[str, t.Any]) -> "TypeLibCodec.CodecTypes":
             8: Void,
             9: FunctionPointer,
             10: Disappear,
+            11: TypeDef
         }
         return classes[d["T"]]._from_json(d)
 

diff --git a/binary/ghidra_function.py b/binary/ghidra_function.py
@@ -162,21 +162,21 @@ class CollectedFunction:
     """
 
     def __init__(self, *, ea: int, debug: Function, decompiler: Function):
-        self.name: str = debug.name
+        # `debug` can be None (from_json passes None when the serialized "b"
+        # entry is null); any object without a `name` attribute falls back to
+        # the placeholder name "unknown".
+        self.name: str = debug.name if hasattr(debug, "name") else "unknown"
         self.ea = ea
         self.debug = debug
         self.decompiler = decompiler
 
     def to_json(self):
+        # Keys: "e" = ea, "b" = debug function, "c" = decompiler function.
+        # "b" is emitted as None when `self.debug` has no `to_json` method
+        # (e.g. when it is None); from_json accepts a None "b".
         return {
             "e": self.ea,
-            "b": self.debug.to_json(),
+            "b": self.debug.to_json() if hasattr(self.debug, "to_json") else None,
             "c": self.decompiler.to_json(),
         }
 
     @classmethod
     def from_json(cls, d):
-        debug = Function.from_json(d["b"])
+        # A null "b" entry yields debug=None instead of a Function; the "c"
+        # entry is always decoded.
+        debug = Function.from_json(d["b"]) if d["b"] is not None else None
         decompiler = Function.from_json(d["c"])
         return cls(ea=d["e"], debug=debug, decompiler=decompiler)