Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
11710a7
Modify quickrun to allow resuming
IAlibay Feb 16, 2026
322bc23
fix the gather tests
IAlibay Feb 16, 2026
a61598e
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 16, 2026
6579c04
Merge branch 'main' of github.com:OpenFreeEnergy/openfe into quickrun…
atravitz Mar 11, 2026
0f43f6f
add check for protocol_dag.json
atravitz Mar 11, 2026
5e1a21c
add basic test
atravitz Mar 11, 2026
182562f
clearer language, hopefully
atravitz Mar 11, 2026
3180b8c
store protocol dag using transformation key
atravitz Mar 11, 2026
1c0fdf7
another tmpdir -> tmp_path fix
atravitz Mar 12, 2026
156320b
add error handling check
atravitz Mar 12, 2026
360882a
fix naming in test
atravitz Mar 13, 2026
04d47bb
add news item
atravitz Mar 13, 2026
1055c1b
use assert_click_success
atravitz Mar 13, 2026
2b1000e
Merge branch 'main' into quickrun_resume
atravitz Mar 17, 2026
c8a03d8
add test for interrupted job
atravitz Mar 18, 2026
d735c7f
remove checkpoint when a job has completed successfully
atravitz Mar 18, 2026
6518499
add handling for checkpoint error handling without --resume
atravitz Mar 18, 2026
5cd437e
clean up logic
atravitz Mar 18, 2026
61a97b6
check for warning
atravitz Mar 18, 2026
48ab9c8
add docs
atravitz Mar 18, 2026
31a6589
make a cache dir
atravitz Mar 18, 2026
eaebaac
Merge branch 'main' of github.com:OpenFreeEnergy/openfe into quickrun…
atravitz Mar 20, 2026
fe746c8
use clickexception
atravitz Mar 20, 2026
fd9253b
update error message
atravitz Mar 20, 2026
4ed9b5f
update kwarg
atravitz Mar 20, 2026
494eb06
keep everything in the quickrun cache
atravitz Mar 20, 2026
d9a2d5e
clearer message
atravitz Mar 20, 2026
58b5642
it's ProtocolDAG not protocolDAG
atravitz Mar 20, 2026
e7de6b8
bump CI
atravitz Mar 20, 2026
ec44cce
bump CI
atravitz Mar 20, 2026
e1f742b
Merge branch 'main' into quickrun_resume
atravitz Mar 24, 2026
bd8efe6
Merge branch 'main' of github.com:OpenFreeEnergy/openfe into quickrun…
atravitz Mar 25, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions news/quickrun_resume.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
**Added:**

* Added ``--resume`` flag to ``openfe quickrun``.
Quickrun now temporarily caches ``ProtocolDAG`` information, and when used with the ``--resume`` flag, quickrun will attempt to resume execution of an incomplete transformation.

**Changed:**

* <news item>

**Deprecated:**

* <news item>

**Removed:**

* <news item>

**Fixed:**

* <news item>

**Security:**

* <news item>
45 changes: 42 additions & 3 deletions src/openfecli/commands/quickrun.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import json
import pathlib
import warnings

import click

Expand Down Expand Up @@ -30,8 +31,14 @@ def _format_exception(exception) -> str:
type=click.Path(dir_okay=False, file_okay=False, path_type=pathlib.Path),
help="Filepath at which to create and write the JSON-formatted results.",
) # fmt: skip
@click.option(
"--resume",
is_flag=True,
default=False,
help=("Attempt to resume this transformation's execution using the cache."),
)
@print_duration
def quickrun(transformation, work_dir, output):
def quickrun(transformation, work_dir, output, resume):
"""Run the transformation (edge) in the given JSON file.

Simulation JSON files can be created with the
Expand All @@ -51,7 +58,9 @@ def quickrun(transformation, work_dir, output):
import logging
import os
import sys
from json import JSONDecodeError

from gufe import ProtocolDAG
from gufe.protocols.protocoldag import execute_DAG
from gufe.tokenization import JSON_HANDLER
from gufe.transformations.transformation import Transformation
Expand Down Expand Up @@ -94,13 +103,40 @@ def quickrun(transformation, work_dir, output):
else:
output.parent.mkdir(exist_ok=True, parents=True)

write("Planning simulations for this edge...")
dag = trans.create()
# Attempt to either deserialize or freshly create DAG
cache_basedir = work_dir / "quickrun_cache"
trans_DAG_json = cache_basedir / f"{trans.key}-ProtocolDAG.json"
Copy link
Contributor

@atravitz atravitz Mar 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

right now the directory structure (with https://github.com/OpenFreeEnergy/gufe/pull/764/files) looks like:

quickrun_cache/
├── ProtocolDAG-8090c89c950e976b33829a15e662ed89-results_cache/
└── Transformation-dbea03c534737749bb413e01a382f3af-protocolDAG.json

is there a better way to indicate that Transformation-dbea03c534737749bb413e01a382f3af-ProtocolDAG.json is the ProtocolDAG corresponding to Transformation-dbea03c534737749bb413e01a382f3af?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You could move the dag creation up a little bit, and just use the dag key as part of the name?

I.e. of you define trans_DAG_json after you define dag, it could be f"ProtocolDAG-{dag.key}.json" instead.

Copy link
Contributor

@atravitz atravitz Mar 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

in offline conversation, @IAlibay and I agree that moving dag creation up will not work, because each time .create() is called, a new unique ProtocolDAG is created.

We also realize that, as of right now, the cache is only unique based on the transformation key and the -d argument. This means that if a user called the following two commands in succession:

openfe quickrun transformation.json -o result1.json -d results/
openfe quickrun transformation.json -o result2.json -d results/

openfe will treat these as a re-execution and raise an error telling the user that they should pass in --resume or delete the file and restart.

There are two possible solutions to this:

  1. Enforce user behavior to have separate -d values for each repeat (essentially what is implemented now)
  2. Build the hash based on the uniqueness of all 3 1. transformation key 2. -o, and 3. -d. Since -o can be any arbitrary filepath we may want to hash -o and store it as:
    [-d arg]/quickrun_cache/dagcache-[hash(transformation.key, -o arg)].

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

option 2 has been implemented in #1890


if trans_DAG_json.is_file():
if resume:
write(f"Attempting to resume execution using existing edges from '{trans_DAG_json}'")
try:
dag = ProtocolDAG.from_json(trans_DAG_json)
except JSONDecodeError:
errmsg = f"Recovery failed, please remove {trans_DAG_json} and any results from your working directory before continuing to create a new protocol."
raise click.ClickException(errmsg)
else:
errmsg = f"Transformation has been started but is incomplete. Please remove {trans_DAG_json} and rerun, or resume execution using the ``--resume`` flag."
raise click.ClickException(errmsg)

else:
if resume:
click.echo(
f"openfe quickrun was run with --resume, but no checkpoint found at {trans_DAG_json}. Starting new execution."
)

# Create the DAG instead and then serialize for later resuming
write("Planning simulations for this edge...")
dag = trans.create()
cache_basedir.mkdir(exist_ok=True)
dag.to_json(trans_DAG_json)

write("Starting the simulations for this edge...")
dagresult = execute_DAG(
dag,
shared_basedir=work_dir,
scratch_basedir=work_dir,
cache_basedir=cache_basedir,
keep_shared=True,
raise_error=False,
n_retries=2,
Expand All @@ -126,6 +162,9 @@ def quickrun(transformation, work_dir, output):
with open(output, mode="w") as outf:
json.dump(out_dict, outf, cls=JSON_HANDLER.encoder)

# remove the checkpoint since the job has completed
os.remove(trans_DAG_json)

write(f"Here is the result:\n\tdG = {estimate} ± {uncertainty}\n")
write("")

Expand Down
94 changes: 84 additions & 10 deletions src/openfecli/tests/commands/test_quickrun.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import json
import os
import pathlib
from importlib import resources
from unittest import mock

import pytest
from click.testing import CliRunner
Expand All @@ -9,6 +11,8 @@

from openfecli.commands.quickrun import quickrun

from ..utils import assert_click_success


@pytest.fixture
def json_file():
Expand All @@ -18,13 +22,7 @@ def json_file():
return json_file


@pytest.mark.parametrize(
"extra_args",
[
{},
{"-d": "foo_dir", "-o": "foo.json"},
],
)
@pytest.mark.parametrize("extra_args", [{}, {"-d": "foo_dir", "-o": "foo.json"}])
def test_quickrun(extra_args, json_file):
extras = sum([list(kv) for kv in extra_args.items()], [])

Expand All @@ -37,8 +35,14 @@ def test_quickrun(extra_args, json_file):
assert not pathlib.Path(outfile).exists()

result = runner.invoke(quickrun, [json_file] + extras)
assert result.exit_code == 0

assert_click_success(result)
assert "Here is the result" in result.output
trans = Transformation.from_json(json_file)
# checkpoint should be deleted when job is complete
assert not pathlib.Path(
extra_args.get("-d", ""), "quickrun_cache", f"{trans.key}-ProtocolDAG.json"
).exists()

# output json should exist and have results after execution
assert pathlib.Path(outfile).exists()
Expand All @@ -55,6 +59,23 @@ def test_quickrun(extra_args, json_file):
# assert len(list(dirpath.iterdir())) > 0


@pytest.mark.parametrize("extra_args", [{}, {"-d": "foo_dir", "-o": "foo.json"}])
def test_quickrun_interrupted(extra_args, json_file):
    """If a quickrun is unable to complete, the ProtocolDAG.json checkpoint should exist."""
    # Flatten {"-d": "foo_dir", ...} into ["-d", "foo_dir", ...] for the CLI call.
    extras = sum([list(kv) for kv in extra_args.items()], [])

    runner = CliRunner()
    with runner.isolated_filesystem():
        # Patch execute_DAG in its defining module: quickrun imports it at call
        # time (inside the command body), so the patched name is what it binds.
        with mock.patch("gufe.protocols.protocoldag.execute_DAG", side_effect=RuntimeError):
            result = runner.invoke(quickrun, [json_file] + extras)

        # The interrupted run must fail (non-zero exit) and not report success...
        assert result.exit_code != 0
        assert "Here is the result" not in result.output
        # ...and must leave the checkpoint behind so --resume can pick it up.
        trans = Transformation.from_json(json_file)
        assert pathlib.Path(
            extra_args.get("-d", ""), "quickrun_cache", f"{trans.key}-ProtocolDAG.json"
        ).exists()


def test_quickrun_output_file_exists(json_file):
"""Fail if the output file already exists."""
runner = CliRunner()
Expand All @@ -71,7 +92,7 @@ def test_quickrun_output_file_in_nonexistent_directory(json_file):
with runner.isolated_filesystem():
outfile = pathlib.Path("not_dir/foo.json")
result = runner.invoke(quickrun, [json_file, "-o", outfile])
assert result.exit_code == 0
assert_click_success(result)
assert outfile.parent.is_dir()


Expand All @@ -82,7 +103,7 @@ def test_quickrun_dir_created_at_runtime(json_file):
outdir = "not_dir"
outfile = outdir + "foo.json"
result = runner.invoke(quickrun, [json_file, "-d", outdir, "-o", outfile])
assert result.exit_code == 0
assert_click_success(result)


def test_quickrun_unit_error():
Expand All @@ -98,3 +119,56 @@ def test_quickrun_unit_error():
# to be stored in JSON
# not sure whether that means we should always be storing all
# protocol dag results maybe?


def test_quickrun_existing_checkpoint(json_file):
    """In the default case where resume=False, if the checkpoint exists, quickrun should error out and not attempt to execute."""
    transformation = Transformation.from_json(json_file)
    protocol_dag = transformation.create()

    runner = CliRunner()
    with runner.isolated_filesystem():
        # Pre-populate the cache so quickrun sees a leftover checkpoint.
        cache_dir = pathlib.Path("quickrun_cache")
        cache_dir.mkdir()
        protocol_dag.to_json(cache_dir / f"{transformation.key}-ProtocolDAG.json")

        result = runner.invoke(quickrun, [json_file])

        # Without --resume, an existing checkpoint must abort the run.
        assert result.exit_code == 1
        assert "Attempting to resume" not in result.output
        assert "Transformation has been started but is incomplete." in result.stderr


def test_quickrun_resume_from_checkpoint(json_file):
    """A valid cached checkpoint plus --resume should load the DAG and run to success."""
    transformation = Transformation.from_json(json_file)
    protocol_dag = transformation.create()

    runner = CliRunner()
    with runner.isolated_filesystem():
        # Seed the cache with a serialized ProtocolDAG, as an earlier run would.
        cache_dir = pathlib.Path("quickrun_cache")
        cache_dir.mkdir()
        protocol_dag.to_json(cache_dir / f"{transformation.key}-ProtocolDAG.json")

        result = runner.invoke(quickrun, [json_file, "--resume"])

    assert_click_success(result)
    assert "Attempting to resume" in result.output


def test_quickrun_resume_invalid_checkpoint(json_file):
    """Fail with a clear error if the cached checkpoint cannot be deserialized.

    With ``--resume``, quickrun should report a recovery failure (exit code 1)
    when the ``*-ProtocolDAG.json`` checkpoint exists but is not valid JSON.
    """
    trans = Transformation.from_json(json_file)

    runner = CliRunner()
    with runner.isolated_filesystem():
        pathlib.Path("quickrun_cache").mkdir()
        # touch() creates an empty file — not valid JSON, so loading must fail.
        pathlib.Path("quickrun_cache", f"{trans.key}-ProtocolDAG.json").touch()
        result = runner.invoke(quickrun, [json_file, "--resume"])

        assert result.exit_code == 1
        # Resume was attempted but recovery errored out on the corrupt file.
        assert "Attempting to resume" in result.output
        assert "Recovery failed" in result.stderr


def test_quickrun_resume_missing_checkpoint(json_file):
    """If --resume is passed but there's no checkpoint, just echo a message and keep going."""
    runner = CliRunner()
    with runner.isolated_filesystem():
        # No quickrun_cache exists here, so --resume falls back to a fresh run.
        result = runner.invoke(quickrun, [json_file, "--resume"])
        assert_click_success(result)
        assert "openfe quickrun was run with --resume, but no checkpoint found at" in result.output
Loading