Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
194 commits
Select commit Hold shift + click to select a range
12ba632
Use currentProgram() for new Ghidrathon
sei-eschwartz Oct 10, 2023
2ce6e67
Bump Ghidrathon version
sei-eschwartz Oct 10, 2023
0fbd014
Also adjust currentProgram in dump_trees script
sei-eschwartz Oct 10, 2023
11816e7
Fix typo
sei-eschwartz Oct 26, 2023
5cfb2fa
Add tqdm logging
sei-eschwartz Oct 26, 2023
94e2efd
Unrestrict wandb version
sei-eschwartz Oct 26, 2023
36ee12c
Add gen-pred script
sei-eschwartz Nov 8, 2023
249eebe
Fix ghidra variable encoding bug
sei-eschwartz Nov 14, 2023
4d7dce5
Improve tqdm output in preprocess script
sei-eschwartz Nov 14, 2023
0ad07a7
update README
sei-eschwartz Dec 8, 2023
c5e77b6
add jsonlines
sei-eschwartz Dec 8, 2023
d9de554
gen-pred changes
sei-eschwartz Dec 8, 2023
3436e9a
Add prepare_vis script.
sei-eschwartz Dec 13, 2023
8385dce
Use strip insteaad of strip --strip-unneeded
sei-eschwartz Dec 13, 2023
452da89
Ignore thunks
sei-eschwartz Dec 13, 2023
b6ecf6f
Update gitignore
sei-eschwartz Dec 13, 2023
040d46d
prepare_vis changes
sei-eschwartz Dec 13, 2023
dfd1d27
No, we're not using the name.
sei-eschwartz Dec 19, 2023
d70d3f1
Fix bug in Ghidra type serialization
sei-eschwartz Feb 28, 2024
1f65afd
Add missing subtypes
sei-eschwartz Mar 4, 2024
559bdd3
A few changes to make testing easier
edmcman Apr 29, 2024
9dced3f
Fix a few typenames for subtypes
edmcman Apr 29, 2024
23be121
Fix encoding bug.
edmcman Apr 29, 2024
7994e27
Modify preprocessing to include subtypes.
edmcman May 3, 2024
549404c
Modify name of Structs.
edmcman May 3, 2024
d663a03
Don't print symbols.
edmcman May 3, 2024
8b6caee
minor readme note
edmcman May 3, 2024
125d6a4
TypeDef support in Ghidra
edmcman May 10, 2024
a02e03f
Add WIP DIRTY import script to repository
edmcman May 10, 2024
50480f7
Handle FunctionDefs
edmcman May 10, 2024
c73892a
randomize order of type construction
edmcman May 10, 2024
04d5570
Output type name instead of structure
edmcman Jun 13, 2024
1816600
ghidra 11.1 update
edmcman Jun 13, 2024
33043ab
Create a separate pruned typelib file.
edmcman Jun 21, 2024
c52f8e2
Slightly rename files.
edmcman Jul 1, 2024
3e9bde3
Add typedef
edmcman Jul 11, 2024
681abe5
Small changes to DIRTY import test script
edmcman Jul 11, 2024
0a18f35
Update types
edmcman Jul 30, 2024
3170e77
Add sample/eval script
edmcman Jul 30, 2024
f175b44
Update scripts
edmcman Jul 30, 2024
527f67f
Can't align on symbol version yet.
edmcman Jul 30, 2024
08e1315
hjson, add built-in types
edmcman Jul 30, 2024
18fdd7e
Continue without symbols
edmcman Sep 3, 2024
26b926d
update sample_test
edmcman Sep 3, 2024
3a10213
Don't require debug
edmcman Sep 3, 2024
0cc0359
unknown name
edmcman Sep 3, 2024
dcbfb95
Handle missing debug
edmcman Sep 3, 2024
ebdb4e1
These seem quite dangerous
edmcman Sep 9, 2024
fb46484
None vs empty
edmcman Sep 9, 2024
b7ff2ae
Fix some things
edmcman Sep 10, 2024
8ff588d
Merge remote-tracking branch 'origin/main'
edmcman Sep 10, 2024
6ed558d
target type mask
edmcman Sep 10, 2024
0c1a20e
comments
edmcman Sep 10, 2024
ac417d7
handle missing debug
edmcman Sep 12, 2024
e60f8f5
Add beginning of new infer command
edmcman Sep 12, 2024
569d878
more None handling.
edmcman Sep 12, 2024
7abfb5e
more infer.py
edmcman Sep 12, 2024
0fb1780
Actually do the prediction!
edmcman Sep 13, 2024
34a8705
Actually load weights!
edmcman Sep 17, 2024
13dfbc0
spelling
edmcman Sep 18, 2024
04be43a
refactor
edmcman Sep 18, 2024
5579e5a
add ghidra inference script (unfinished)
edmcman Sep 18, 2024
ab8cb92
It works!
edmcman Sep 18, 2024
ccc9a63
minor
edmcman Sep 18, 2024
6d167ee
Handle <unk>
edmcman Sep 18, 2024
efbbbdb
Fix a few bugs
edmcman Sep 18, 2024
423f136
packages in README
edmcman Sep 18, 2024
df67ed7
Compute paths
edmcman Sep 18, 2024
99d792a
remove comment
edmcman Sep 18, 2024
24a4bc4
Add rough directions.
edmcman Sep 18, 2024
2a16037
README
edmcman Sep 18, 2024
7a1a8b4
Don't print the vocab name
edmcman Sep 19, 2024
5ffae16
Update requirements.txt
edmcman Sep 19, 2024
f62b909
Update install.yml
edmcman Sep 19, 2024
ac85677
Update install.yml
edmcman Sep 19, 2024
f7481c0
Update install.yml
edmcman Sep 19, 2024
1632427
Update install.yml
edmcman Sep 19, 2024
cb4f2b8
Update install.yml
edmcman Sep 19, 2024
a6f6261
Update install.yml
edmcman Sep 19, 2024
b79787c
Update README
edmcman Sep 19, 2024
527b786
actions
edmcman Sep 19, 2024
edb2ec3
actions
edmcman Sep 19, 2024
6e80ac6
actions
edmcman Sep 19, 2024
2773284
Add CI testing DIRTY Ghidra inference ability. (#1)
edmcman Sep 19, 2024
13b8280
minor change to workflow
edmcman Sep 20, 2024
9421b42
Update README
edmcman Sep 20, 2024
68441fd
Adjust requirements.txt
edmcman Sep 20, 2024
3faf4e6
Pygments is needed for lexing
edmcman Sep 20, 2024
ab6d23a
Newer webdataset causes problems with python 3.11
edmcman Sep 20, 2024
0069da2
Add Assertion message
edmcman Sep 20, 2024
829c8eb
Update instructions
edmcman Sep 20, 2024
d0c1b80
Add check for python 3
edmcman Sep 20, 2024
70ef86a
A bunch of changes for the HF space
edmcman Sep 20, 2024
4b609fc
Minor changes to DIRTY_infer for HF space.
edmcman Sep 21, 2024
ce444a7
asm
edmcman Sep 21, 2024
a16120f
Move some code out of DIRTY_infer Ghidra script
edmcman Sep 22, 2024
e1d85f9
Adjust imports
edmcman Sep 22, 2024
9b392e7
Fix DIRTY_infer script.
edmcman Sep 23, 2024
f852d13
Update gen_names.py for slightly different naming
edmcman Sep 23, 2024
bba3d6b
Remove very bad comma
edmcman Sep 23, 2024
92c0bf7
Fix merging bug
edmcman Sep 23, 2024
f1bcc2f
Output some filtered variables
edmcman Sep 24, 2024
1a21fc5
Small change to filtered metadata format
edmcman Sep 24, 2024
f6f38e6
Don't include an analysis timeout
edmcman Sep 25, 2024
f50aa7b
Use a longer generation timeout
edmcman Sep 25, 2024
8f5697b
Add a time-out message
edmcman Sep 25, 2024
4911bba
Update README.md
edmcman Sep 26, 2024
8568c50
remove debug message
edmcman Sep 26, 2024
0eb24ab
Merge remote-tracking branch 'origin/main'
edmcman Sep 26, 2024
5bfa98d
Improve vocab for DIRTY-Ghidra
edmcman Sep 26, 2024
8212c33
Add libprotobuf-dev to list of packages
edmcman Oct 1, 2024
6b648a4
Fix some type mask stuff.
edmcman Oct 1, 2024
19277de
Misc.
edmcman Oct 1, 2024
3947821
Improve/fix a few things in evaluation script.
edmcman Oct 2, 2024
acd0fa1
filtered variables
edmcman Oct 2, 2024
6aee0da
Use 'disappear' instead of ''
edmcman Oct 2, 2024
0a9b70d
One more disappear change.
edmcman Oct 2, 2024
b582ce8
Optionally return multiple predictions (#2)
edmcman Oct 3, 2024
3e6c773
Remove a few unused imports
edmcman Oct 4, 2024
e97954c
Switch CI to new model
edmcman Oct 4, 2024
e46013f
Remove a few unused imports
edmcman Oct 4, 2024
b2093c9
Remove unused imports
edmcman Oct 4, 2024
d67c817
Properly set if a variable has a user defined name
edmcman Oct 4, 2024
3f862ba
Experiment: Try disabling disappear and remove non-user training vars
edmcman Oct 4, 2024
8c8e0fd
Fix use_disappear option
edmcman Oct 4, 2024
aed38e2
Try one more time to fix
edmcman Oct 4, 2024
7f4a336
Try yet again to fix disappear
edmcman Oct 4, 2024
224672b
Set use_disappear=False explicitly in preprocessing.
edmcman Oct 4, 2024
70bb241
As long as we're making progress, keep going...
edmcman Oct 12, 2024
f0dbf26
Minor output changes
edmcman Oct 14, 2024
382367d
Capture unknown storage locations
edmcman Oct 14, 2024
46ff8c6
Fix incorrect type name.
edmcman Oct 14, 2024
bf01586
Try to fix hash of variables
edmcman Oct 14, 2024
adb83d0
Rename variable.
edmcman Oct 14, 2024
f647551
Example refactor (#3)
edmcman Oct 16, 2024
f1363f1
Minor changes to scripts
edmcman Oct 23, 2024
1f5ecfd
Automatically find training batch size (#4)
edmcman Oct 24, 2024
1fe5724
Create Dockerfile
edmcman Oct 28, 2024
7fbb5e5
Set sync_dist=True
edmcman Oct 28, 2024
6c60896
Merge pull request #5 from edmcman/dockerfile
edmcman Oct 28, 2024
0aea262
Merge pull request #7 from edmcman/sync_dist
edmcman Oct 28, 2024
3171670
Define dataset __len__ to improve progress bar during training (#6)
edmcman Oct 28, 2024
db3fbdc
Don't print vocab file name
edmcman Oct 28, 2024
a0a8dfe
Update README.md instructions.
edmcman Oct 28, 2024
f60609a
Add Dockerfile reminder commands
edmcman Oct 29, 2024
07c6343
Allow pointers without subtypes
edmcman Oct 29, 2024
311e7ee
Add note
edmcman Oct 29, 2024
bc9af40
Disable tuner with DDP
edmcman Oct 29, 2024
b3b687c
Fix tuner
edmcman Oct 29, 2024
f799b24
value error
edmcman Oct 29, 2024
d35dbbf
Estimate dataset length (#13)
edmcman Oct 29, 2024
320d111
debug
edmcman Oct 29, 2024
cdfadbd
Handle dummy dataset
edmcman Oct 30, 2024
954c0ed
Merge branch 'feature/estimate-dataset-len' into development
edmcman Oct 30, 2024
eaee541
Wandb improvements (#14)
edmcman Oct 30, 2024
bea75b4
Upgrade webdataset version (#15)
edmcman Oct 30, 2024
86de2ec
Set shardshuffle to silence warning.
edmcman Oct 30, 2024
e111018
Tensor precision
edmcman Oct 30, 2024
70d80ec
Merge pull request #16 from edmcman/feature/tensoracc
edmcman Oct 30, 2024
d5d0faf
Small logging adjustments
edmcman Oct 30, 2024
59c94e4
Handle pointer without subtype
edmcman Nov 3, 2024
7ef7f85
Update preprocess.py
edmcman Nov 4, 2024
42c6fd3
Merge pull request #18 from edmcman/remove-no-filter
edmcman Nov 4, 2024
928659b
Only use 1 cpu per run
edmcman Nov 12, 2024
d0b01e2
Don't print so much during generation
edmcman Nov 12, 2024
477caf6
Don't print scary exception
edmcman Nov 12, 2024
4e48084
Fix stupid torchmetrics accuracy problem.
edmcman Nov 13, 2024
6383308
Merge branch 'release/0.1'
edmcman Nov 13, 2024
8cb66bd
Merge tag '0.1' into development
edmcman Nov 13, 2024
fb85a61
Log summed _loss and _acc metrics
edmcman Nov 18, 2024
a6ff2b1
Merge pull request #20 from edmcman/feature/log-changes
edmcman Nov 18, 2024
edf11f2
Learning rate monitor
edmcman Nov 18, 2024
ebcb705
Use overall accuracy
edmcman Nov 18, 2024
e0f259e
Plateau
edmcman Nov 18, 2024
3646ed9
Change precision to 32 bit
edmcman Nov 18, 2024
2e35077
Merge pull request #21 from edmcman/feature/train-misc
edmcman Nov 18, 2024
a28c3db
Revert precision
edmcman Nov 19, 2024
528b8b0
Slightly change dm
edmcman Nov 19, 2024
ad400ee
Merge commit '2e35077' into development
edmcman Nov 19, 2024
8c54b11
I think we resolved this
edmcman Nov 19, 2024
5b44650
Use BatchSizeFinder for testing too
edmcman Nov 19, 2024
4ef70fa
Merge branch 'development' into exp
edmcman Nov 19, 2024
e89e239
Merge pull request #22 from edmcman/exp
edmcman Nov 19, 2024
706d3fd
Do not include the full path of the submodel
edmcman Nov 19, 2024
498a189
format
edmcman Nov 20, 2024
0664544
Attempt to fix vocab loading
edmcman Nov 20, 2024
34a355c
Backward compatibility fix
edmcman Nov 20, 2024
3b5fe2e
Expand path before finding dir
edmcman Nov 20, 2024
c25acab
Attempt to fix vocab loading
edmcman Nov 20, 2024
acf04de
Merge pull request #25 from edmcman/feature/vocab-path
edmcman Nov 20, 2024
20515bf
log all
edmcman Nov 21, 2024
d7470f7
Add safety margin to batch scaling
edmcman Nov 25, 2024
7dfbb9b
Fix safety margin
edmcman Nov 25, 2024
5a226c9
Merge branch 'release/0.2'
edmcman Nov 25, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 52 additions & 0 deletions .github/actions/setup/action.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# .github/actions/setup/action.yml
name: Setup
description: Shared logic for workflows
runs:
using: "composite"
steps:
- uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
cache: 'pip'
- uses: actions/setup-java@v4
with:
distribution: 'zulu' # See 'Supported distributions' for available options
java-version: '21'
- uses: antoniovazquezblanco/setup-ghidra@v2.0.4

- name: Upgrade pip
shell: bash
run: python -m pip install --upgrade pip setuptools wheel

- name: Install apt dependencies
shell: bash
run: sudo apt-get install -y pkg-config libsentencepiece-dev libprotobuf-dev

- name: Install dependencies
shell: bash
run: pip install -r requirements.txt

- name: Download Ghidrathon
uses: robinraju/release-downloader@v1
with:
# The source repository path.
# Expected format {owner}/{repo}
# Default: ${{ github.repository }}
repository: mandiant/Ghidrathon
tag: v4.0.0
fileName: '*.zip'

- name: Install Ghidrathon
shell: bash
run: |
mkdir ghidrathon-tmp
unzip Ghidrathon*.zip -d ghidrathon-tmp
pip install -r ghidrathon-tmp/requirements.txt
python ghidrathon-tmp/ghidrathon_configure.py $GHIDRA_INSTALL_DIR
unzip ghidrathon-tmp/Ghidrathon*.zip -d $GHIDRA_INSTALL_DIR/Ghidra/Extensions
#$GHIDRA_INSTALL_DIR/support/analyzeHeadless projects TmpProject -import /bin/ls

- name: Make projects directory
shell: bash
run: mkdir -p projects

31 changes: 0 additions & 31 deletions .github/workflows/install.yml

This file was deleted.

98 changes: 66 additions & 32 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
@@ -1,48 +1,82 @@
name: Test
name: Test DIRTY Ghidra

on:
push:
branches: [ main ]
pull_request:
branches: [ main ]

workflow_dispatch:

jobs:
install:
# The type of runner that the job will run on
test-train:
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [ubuntu-20.04]
python-version: [3.6, 3.7, 3.8]
os: [ubuntu-22.04, ubuntu-24.04]
python-version: ["3.10", "3.11"]

steps:
# Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
- uses: actions/checkout@v2
- uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}

- name: Upgrade pip
run: python -m pip install --upgrade pip setuptools wheel
- name: Install dependencies
run: pip install -r requirements.txt
- name: Download data and model
working-directory: ./dirty
- uses: actions/checkout@v4
- name: Setup
uses: ./.github/actions/setup

- name: Generate dataset
run: |
set -ex
mkdir -p $DATASET_DIR
cd $DATASET_DIR
# Create some dummy programs
for n in $(seq 20); do echo -e "#include <stdio.h>\nint main(int argc, const char** argv) { printf(\"%d %d\\\\n\", $n, argc); }" > s$n.c; gcc -g s$n.c -o s$n; rm s$n.c; done
cd $GITHUB_WORKSPACE/dataset-gen-ghidra
python generate.py --verbose --ghidra $GHIDRA_INSTALL_DIR/support/analyzeHeadless -t 1 -b $DATASET_DIR -o $DATA_DIR/unprocessed
cd $DATA_DIR/unprocessed && python $GITHUB_WORKSPACE/dataset-gen-ghidra/gen_names.py $DATA_DIR/unprocessed
env:
DATASET_DIR: ${{ runner.temp }}/dataset
DATA_DIR: ${{ runner.temp }}/data

- name: Preprocess dataset
run: |
wget -q cmu-itl.s3.amazonaws.com/dirty/dirt.tar.gz -O dirt.tar.gz
tar -xzf dirt.tar.gz
mkdir exp_runs/
wget -q cmu-itl.s3.amazonaws.com/dirty/dirty_mt.ckpt -O exp_runs/dirty_mt.ckpt
set -ex
cd $GITHUB_WORKSPACE/dirty
python -m utils.preprocess $DATA_DIR/unprocessed $DATA_DIR/unprocessed/files.txt $DATA_DIR/processed
ln -s $DATA_DIR/processed $(pwd)/data1
python -m utils.vocab --size=164 --use-bpe "$DATA_DIR/processed/"'train-*.tar' "$DATA_DIR/processed/typelib.json" data1/vocab.bpe10000
env:
DATA_DIR: ${{ runner.temp }}/data
- name: Train on dataset
run: |
set -ex
cd $GITHUB_WORKSPACE/dirty
wandb offline
- name: Infer and evaluate
working-directory: ./dirty
python exp.py train multitask_test_ci.xfmr.jsonnet
find . -name '*.ckpt'
test-inference:
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [ubuntu-22.04, ubuntu-24.04]
python-version: ["3.10", "3.11"]

steps:
- uses: actions/checkout@v4
- name: Setup
uses: ./.github/actions/setup

- name: Install huggingface-cli
run: pip install huggingface_hub[cli]

- name: Cache model files
uses: actions/cache@v4
with:
path: ${{ runner.temp }}/model-dl
key: hf-model-dl

- name: Download model files
run: huggingface-cli download --repo-type model ejschwartz/dirty-ghidra --local-dir $MODEL_DL_DIR && cp -R $MODEL_DL_DIR/data1 $GITHUB_WORKSPACE/dirty/data1
env:
MODEL_DL_DIR: ${{ runner.temp }}/model-dl

- name: Run DIRTY inference
run: |
python exp.py train --expname=eval_dirty_mt multitask_test_ci.xfmr.jsonnet --eval-ckpt exp_runs/dirty_mt.ckpt
cat test_result.json
cat test_result.json | jq ".test_retype_acc"
cat test_result.json | jq ".test_rename_acc"
cat test_result.json | jq ".test_retype_acc" | awk '{if ($1 < 0.6) exit 1}'
cat test_result.json | jq ".test_rename_acc" | awk '{if ($1 < 0.5) exit 1}'
$GHIDRA_INSTALL_DIR/support/analyzeHeadless projects MyProject -import /bin/ls -postScript $GITHUB_WORKSPACE/scripts/DIRTY_infer.py $(pwd)/infer_success.txt
test -f infer_success.txt
8 changes: 8 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -141,3 +141,11 @@ cython_debug/
*.c

exp_runs/

# wandb
dirty/wandb/
dirty/multitask-greedy.xfmr.jsonnet
dirty/data1
dirty/struct_files.txt
dirty/eval.pkl
dirty/forward.pkl
30 changes: 30 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Image for running DIRTY-Ghidra: Ghidra + Ghidrathon + the Python requirements.
#
# Build:
#   docker build -t dirty-ghidra .
# Run (detached, kept alive by the default command below):
#   docker run -d --name dirty-ghidra --gpus '"device=3,4"' -it -v /path/to/data:/data dirty-ghidra

FROM blacktop/ghidra:latest

# System packages needed by the Python requirements (sentencepiece/protobuf
# headers, pkg-config) plus a few convenience tools.
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    --mount=type=cache,target=/var/lib/apt,sharing=locked \
    apt-get -y update && apt-get -y install python3-pip python-is-python3 \
    git pkg-config libsentencepiece-dev libprotobuf-dev nano sudo unzip

# Install Ghidrathon

WORKDIR /tmp/

RUN wget https://github.com/mandiant/Ghidrathon/releases/download/v4.0.0/Ghidrathon-v4.0.0.zip
# The release archive is unpacked into ./ghidrathon; the extension zip that is
# installed into Ghidra below is taken from inside that directory.
RUN unzip Ghidrathon-v4.0.0.zip -d ghidrathon
RUN --mount=type=cache,target=/root/.cache pip install --break-system-packages -r ghidrathon/requirements.txt
RUN python ghidrathon/ghidrathon_configure.py /ghidra
RUN unzip ghidrathon/Ghidrathon-v4.0.0.zip -d /ghidra/Ghidra/Extensions

# Install DIRTY Ghidra

WORKDIR /

COPY . /DIRTY

RUN --mount=type=cache,target=/root/.cache pip install --break-system-packages --upgrade -r /DIRTY/requirements.txt

# `sh -c` takes the command as ONE string; any further arguments become $0, $1, ...
# The previous exec-form CMD ["tail", "-f", "/dev/null"] therefore ran a bare
# `tail` reading stdin, which only kept the container alive when stdin was held
# open (-it). Passing the whole command as a single argument idles reliably.
ENTRYPOINT ["/bin/sh", "-c"]
CMD ["tail -f /dev/null"]
34 changes: 30 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,43 @@ While most of the model code remains identical, we add support for generating a
The original README provides clear instructions on how to download and run their pre-trained DIRTY model, but the README's instructions are slightly unclear when describing how to train your own model.
This README explicitly covers all the steps necessary to train a DIRTY model from scratch.

This is @edmcman's fork of the original DIRTY-Ghidra repository. It features a number of improvements and bug fixes, and also includes the ability to perform inference on new examples.

## Getting Started with DIRTY-Ghidra Inference

[![Test DIRTY Ghidra's inference ability](https://github.com/edmcman/DIRTY-Ghidra/actions/workflows/test.yml/badge.svg)](https://github.com/edmcman/DIRTY-Ghidra/actions/workflows/test.yml)

Most people probably just want to use DIRTY-Ghidra to predict variable names and
types for their own binaries. If that is you, follow these instructions:

1. Clone this repository to `DIRTY_DIR`
2. Optional but highly recommended: Create a virtual environment (venv) with `python -m venv /path/to/venv; source /path/to/venv/bin/activate`. This will prevent DIRTY from interfering with your system python packages.
3. Install the requirements via `pip install -r requirements.txt`
4. [Install Ghidra](https://ghidra-sre.org/InstallationGuide.html)
5. [Install Ghidrathon](https://github.com/mandiant/Ghidrathon/?tab=readme-ov-file#installing-ghidrathon). Make sure you configure Ghidrathon (`python
ghidrathon_configure.py`) using the venv from step 2.
6. Download the latest model from HF (`pip install huggingface_hub[cli] && huggingface-cli download --repo-type model ejschwartz/dirty-ghidra --local-dir $DIRTY_DIR/dirty`)
7. Run `mkdir -p ~/ghidra_scripts && ln -s $DIRTY_DIR/scripts/DIRTY_infer.py ~/ghidra_scripts/DIRTY_infer.py` if on Linux.
8. Open a function in Ghidra. Run the script `DIRTY_infer.py` in the script manager.
9. Optionally assign the script to a keyboard shortcut.

## Requirements

- Linux with Python 3.6/3.7/3.8
- Linux with Python 3.10+
- [PyTorch ≥ 1.5.1](https://pytorch.org/)
- [Ghidrathon 1.0.0](https://github.com/mandiant/Ghidrathon)
- [Ghidrathon >= 4.0.0](https://github.com/mandiant/Ghidrathon)
- `pip install -r requirements.txt`

### Libraries

A few libraries are required by the python packages. On ubuntu, you can install
these with:
- `apt install pkg-config libsentencepiece-dev libprotobuf-dev`

## Training a DIRTY model

### Dataset Generation
The first step to train DIRTY is to obtain a unprocessed DIRT dataset. Instructions can be found in the `dataset-gen-ghidra` folder.
The first step to train DIRTY is to obtain an unprocessed DIRT dataset. Instructions can be found in the [dataset-gen-ghidra](dataset-gen-ghidra) folder.

### Dataset Preprocessing

Expand All @@ -40,7 +66,7 @@ We also need to build a vocabulary of tokens that the model will understand

```bash
# inside the `dirty` directory
python3 -m utils.vocab [-h] [options] TRAIN_FILES_TAR PATH_TO_TYPELIB_JSON TARGET_DIRECTORY/vocab.bpe10000
python3 -m utils.vocab [-h] --use-bpe [options] TRAIN_FILES_TAR PATH_TO_TYPELIB_JSON TARGET_DIRECTORY/vocab.bpe10000
```

This script generates vocabulary files located in `TARGET_DIRECTORY`. It is recommended to prefix the vocab files with `vocab.bpe10000` to match the expected vocabulary filenames in the model config files.
Expand Down
24 changes: 24 additions & 0 deletions binary/dire_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -973,6 +973,29 @@ def __hash__(self) -> int:
def __str__(self) -> str:
return "void"

class TypeDef(TypeInfo):
    """A typedef: a named type that refers to another type by name.

    Only the name of the referenced type is stored (``other_type_name``);
    this class does not resolve it to a type object.

    NOTE(review): ``size`` is presumably the size in bytes, as for the other
    ``TypeInfo`` subclasses in this file -- confirm against the producer.
    """

    def __init__(self, name: str, size: int, other_type_name: str) -> None:
        """Store the typedef's own name, its size and the aliased type's name."""
        self.name = name
        self.size = size
        self.other_type_name = other_type_name

    @classmethod
    def _from_json(cls, d: t.Dict[str, t.Any]) -> "TypeDef":
        """Build a TypeDef from the ``name``/``size``/``other_type_name`` keys of ``d``."""
        return cls(
            name=d["name"],
            size=d["size"],
            other_type_name=d["other_type_name"],
        )

    def _to_json(self) -> t.Dict[str, t.Any]:
        """Serialize to a dict; ``"T": 11`` is the tag that identifies a TypeDef.

        The values are a mix of ints and strings, so the mapping is typed as
        ``Dict[str, Any]`` rather than ``Dict[str, int]``.
        """
        return {
            "T": 11,
            "name": self.name,
            "size": self.size,
            "other_type_name": self.other_type_name,
        }

    def __eq__(self, other: t.Any) -> bool:
        """Two TypeDefs are equal when name, size and aliased type name all match."""
        return (
            isinstance(other, TypeDef)
            and self.name == other.name
            and self.size == other.size
            and self.other_type_name == other.other_type_name
        )

    def __hash__(self) -> int:
        # Hash over exactly the fields compared in __eq__ so equal objects
        # always hash equally.
        return hash((self.name, self.size, self.other_type_name))

    def __str__(self) -> str:
        """Return the typedef's own name (not the aliased type's name)."""
        return self.name

class Disappear(TypeInfo):
"""Target type for variables that don't appear in the ground truth function"""
size = 0
Expand Down Expand Up @@ -1061,6 +1084,7 @@ def read_metadata(d: t.Dict[str, t.Any]) -> "TypeLibCodec.CodecTypes":
8: Void,
9: FunctionPointer,
10: Disappear,
11: TypeDef
}
return classes[d["T"]]._from_json(d)

Expand Down
6 changes: 3 additions & 3 deletions binary/ghidra_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,21 +162,21 @@ class CollectedFunction:
"""

def __init__(self, *, ea: int, debug: Function, decompiler: Function):
self.name: str = debug.name
self.name: str = debug.name if hasattr(debug, "name") else "unknown"
self.ea = ea
self.debug = debug
self.decompiler = decompiler

def to_json(self):
return {
"e": self.ea,
"b": self.debug.to_json(),
"b": self.debug.to_json() if hasattr(self.debug, "to_json") else None,
"c": self.decompiler.to_json(),
}

@classmethod
def from_json(cls, d):
debug = Function.from_json(d["b"])
debug = Function.from_json(d["b"]) if d["b"] is not None else None
decompiler = Function.from_json(d["c"])
return cls(ea=d["e"], debug=debug, decompiler=decompiler)

Expand Down
Loading