From 5f41248696193a069b33e384c1b4275202b07f41 Mon Sep 17 00:00:00 2001 From: Michael McKinsey Date: Wed, 25 Mar 2026 16:05:18 -0700 Subject: [PATCH] 7.1.1 replacing 7.1.0 --- README.md | 6 +++--- ScaFFold/utils/create_restart_script.py | 4 ++-- scripts/install-rccl.sh | 2 +- scripts/install-tuolumne-torchpypi.sh | 2 +- scripts/install-tuolumne.sh | 2 +- scripts/scaffold-tuolumne-torchpypi.job | 2 +- scripts/scaffold-tuolumne.job | 4 ++-- 7 files changed, 11 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 63a4649..04bef50 100644 --- a/README.md +++ b/README.md @@ -40,9 +40,9 @@ The model is trained from a random initialization until convergence, which is de 1. `ml cuda/12.9.1 gcc/13.3.1 mvapich2/2.3.7` 1. `export LD_LIBRARY_PATH=/usr/lib64:$LD_LIBRARY_PATH` - ROCm (elcap): - 1. `ml cce/21.0.0 cray-mpich/9.1.0 rocm/7.1.0 rccl/fast-env-slows-mpi` + 1. `ml cce/21.0.0 cray-mpich/9.1.0 rocm/7.1.1 rccl/fast-env-slows-mpi` - If using WCI wheel: - 1. `export LD_PRELOAD=/opt/rocm-7.1.0/llvm/lib/libomp.so` # for libomp.so + 1. `export LD_PRELOAD=/opt/rocm-7.1.1/llvm/lib/libomp.so` # for libomp.so 1. Install the benchmark in the python venv: - CUDA: `pip install --no-binary=mpi4py .[cuda] --prefix=.venvs/scaffoldvenv --extra-index-url https://download.pytorch.org/whl/cu129 2>&1 | tee install.log` @@ -226,7 +226,7 @@ make && make install git clone https://github.com/LLNL/Caliper.git cd Caliper mkdir pybuild && cd pybuild -ml rocm/7.1.0 +ml rocm/7.1.1 ml cuda/12.9.1 cmake -DWITH_PYTHON_BINDINGS=ON \ -DWITH_ROCPROFILER=ON \ diff --git a/ScaFFold/utils/create_restart_script.py b/ScaFFold/utils/create_restart_script.py index 27a892a..cc8bbbc 100644 --- a/ScaFFold/utils/create_restart_script.py +++ b/ScaFFold/utils/create_restart_script.py @@ -98,7 +98,7 @@ def _get_env_setup() -> str: # --- Begin Environment Setup --- # Load Modules if command -v module &> /dev/null; then - ml cce/21.0.0 cray-mpich/9.1.0 rocm/7.1.0 rccl/fast-env-slows-mpi + ml cce/21.0.0 cray-mpich/9.1.0 rocm/7.1.1 rccl/fast-env-slows-mpi fi # Activate Virtual Environment @@ -111,7 +111,7 @@ def _get_env_setup() -> str: # Environment variables export SPINDLE_FLUXOPT=off -export LD_PRELOAD=/opt/rocm-7.1.0/llvm/lib/libomp.so +export LD_PRELOAD=/opt/rocm-7.1.1/llvm/lib/libomp.so export PROFILE_TORCH=ON # --- End Environment Setup --- diff --git a/scripts/install-rccl.sh b/scripts/install-rccl.sh index 306486a..a84add3 100644 --- a/scripts/install-rccl.sh +++ b/scripts/install-rccl.sh @@ -6,7 +6,7 @@ if [ -d "aws-ofi-nccl.git" ]; then return 1 2>/dev/null || exit 1 fi -rocm_version=7.1.0 +rocm_version=7.1.1 module swap PrgEnv-cray PrgEnv-gnu module load rocm/$rocm_version diff --git a/scripts/install-tuolumne-torchpypi.sh b/scripts/install-tuolumne-torchpypi.sh index 87c8473..26e7a22 100644 --- a/scripts/install-tuolumne-torchpypi.sh +++ b/scripts/install-tuolumne-torchpypi.sh @@ -1,4 +1,4 @@ . install-rccl.sh ml load python/3.11.5 && python3 -m venv .venvs/scaffoldvenv-tuo-pypi && source .venvs/scaffoldvenv-tuo-pypi/bin/activate && pip install --upgrade pip -ml cce/21.0.0 cray-mpich/9.1.0 rocm/7.1.0 rccl/fast-env-slows-mpi +ml cce/21.0.0 cray-mpich/9.1.0 rocm/7.1.1 rccl/fast-env-slows-mpi pip install -e .[rocm] --prefix=.venvs/scaffoldvenv-tuo-pypi --extra-index-url https://download.pytorch.org/whl/rocm7.1 2>&1 | tee install.log diff --git a/scripts/install-tuolumne.sh b/scripts/install-tuolumne.sh index 62760ca..339fd8f 100644 --- a/scripts/install-tuolumne.sh +++ b/scripts/install-tuolumne.sh @@ -1,5 +1,5 @@ ml load python/3.11.5 && python3 -m venv .venvs/scaffoldvenv-tuo && source .venvs/scaffoldvenv-tuo/bin/activate && pip install --upgrade pip -ml cce/21.0.0 cray-mpich/9.1.0 rocm/7.1.0 rccl/fast-env-slows-mpi +ml cce/21.0.0 cray-mpich/9.1.0 rocm/7.1.1 rccl/fast-env-slows-mpi pip install -e .[rocmwci] --prefix=.venvs/scaffoldvenv-tuo 2>&1 | tee install.log # Needed until new wheel exists for torch using mpich 9.1.0 TORCH_LIB_DIR=".venvs/scaffoldvenv-tuo/lib/python3.11/site-packages/torch/lib" diff --git a/scripts/scaffold-tuolumne-torchpypi.job b/scripts/scaffold-tuolumne-torchpypi.job index cc9b10e..2629c60 100644 --- a/scripts/scaffold-tuolumne-torchpypi.job +++ b/scripts/scaffold-tuolumne-torchpypi.job @@ -7,7 +7,7 @@ # flux: -qpdebug # flux: -B fractale -ml cce/21.0.0 cray-mpich/9.1.0 rocm/7.1.0 rccl/fast-env-slows-mpi +ml cce/21.0.0 cray-mpich/9.1.0 rocm/7.1.1 rccl/fast-env-slows-mpi . .venvs/scaffoldvenv-tuo-pypi/bin/activate diff --git a/scripts/scaffold-tuolumne.job b/scripts/scaffold-tuolumne.job index ce50b46..bd0d33a 100644 --- a/scripts/scaffold-tuolumne.job +++ b/scripts/scaffold-tuolumne.job @@ -7,13 +7,13 @@ # flux: -qpdebug # flux: -B fractale -ml cce/21.0.0 cray-mpich/9.1.0 rocm/7.1.0 rccl/fast-env-slows-mpi +ml cce/21.0.0 cray-mpich/9.1.0 rocm/7.1.1 rccl/fast-env-slows-mpi . .venvs/scaffoldvenv-tuo/bin/activate # (1) Avoid libmagma error # (2) Removing libmpi may cause segfault on mpi4py import -export LD_PRELOAD="/opt/rocm-7.1.0/llvm/lib/libomp.so /opt/cray/pe/mpich/9.1.0/ofi/gnu/11.2/lib/libmpi_gnu.so.12" +export LD_PRELOAD="/opt/rocm-7.1.1/llvm/lib/libomp.so /opt/cray/pe/mpich/9.1.0/ofi/gnu/11.2/lib/libmpi_gnu.so.12" torchrun-hpc -N 1 -n 1 $(which scaffold) generate_fractals -c $(pwd)/ScaFFold/configs/benchmark_default.yml