From faa76d8ddddb655eaf950cd5aeb612ae281bb5ae Mon Sep 17 00:00:00 2001
From: Roman Gruber <roman.gruber@unibe.ch>
Date: Mon, 9 Mar 2026 12:54:49 +0100
Subject: [PATCH 01/80] ci: added mi300 tests on beverin

---
 .ci/cscs_beverin_pipeline.yml                 | 25 +++++++++++++++++++
 .../lemonio/package.py                        |  0
 .../tmlqcd/beverin-mi300/compilers.yaml       |  2 ++
 .../tmlqcd/beverin-mi300/config.yaml          | 11 ++++++++
 .../tmlqcd/beverin-mi300/environments.yaml    | 25 +++++++++++++++++++
 .../tmlqcd/beverin-mi300/modules.yaml         | 23 +++++++++++++++++
 .../tmlqcd/beverin-mi300/repo/packages        |  1 +
 .../tmlqcd/beverin-mi300/repo/repo.yaml       |  2 ++
 .../tmlqcd/daint-gh200/repo/packages          |  1 +
 9 files changed, 90 insertions(+)
 create mode 100644 .ci/cscs_beverin_pipeline.yml
 rename .ci/{uenv-recipes/tmlqcd/daint-gh200/repo/packages => spack_packages}/lemonio/package.py (100%)
 create mode 100644 .ci/uenv-recipes/tmlqcd/beverin-mi300/compilers.yaml
 create mode 100644 .ci/uenv-recipes/tmlqcd/beverin-mi300/config.yaml
 create mode 100644 .ci/uenv-recipes/tmlqcd/beverin-mi300/environments.yaml
 create mode 100644 .ci/uenv-recipes/tmlqcd/beverin-mi300/modules.yaml
 create mode 120000 .ci/uenv-recipes/tmlqcd/beverin-mi300/repo/packages
 create mode 100644 .ci/uenv-recipes/tmlqcd/beverin-mi300/repo/repo.yaml
 create mode 120000 .ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages

diff --git a/.ci/cscs_beverin_pipeline.yml b/.ci/cscs_beverin_pipeline.yml
new file mode 100644
index 000000000..cbbe2b7c6
--- /dev/null
+++ b/.ci/cscs_beverin_pipeline.yml
@@ -0,0 +1,25 @@
+include:
+  - remote: 'https://gitlab.com/cscs-ci/recipes/-/raw/master/templates/v2/.ci-ext.yml'
+  - local: '/.ci/include/cscs/00-variables.yml'
+  - local: '/.ci/include/cscs/01-test-templates.yml'
+
+stages:
+  - build
+  - test
+
+build-quda/uenv/beverin-mi300:
+  stage: build
+  extends: .uenv-builder-beverin-mi300
+  variables:
+    UENV_RECIPE: .ci/uenv-recipes/tmlqcd/beverin-mi300
+
+test/beverin-mi300:
+  extends: .test/hmc
+  variables:
+    INPUT_FILE: "doc/sample-input/sample-hmc-quda-cscs.input"
+    REFPATH: "doc/sample-output/hmc-quda-cscs"
+    QUDA_ENABLE_TUNING: 0 # disable tuning
+    QUDA_ENABLE_GDR: 1 # enable GPU-Direct RDMA
+    SLURM_JOB_NUM_NODES: 2
+    SLURM_NTASKS: 8
+    SLURM_TIMELIMIT: "00:30:00"
diff --git a/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/lemonio/package.py b/.ci/spack_packages/lemonio/package.py
similarity index 100%
rename from .ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/lemonio/package.py
rename to .ci/spack_packages/lemonio/package.py
diff --git a/.ci/uenv-recipes/tmlqcd/beverin-mi300/compilers.yaml b/.ci/uenv-recipes/tmlqcd/beverin-mi300/compilers.yaml
new file mode 100644
index 000000000..840d9974d
--- /dev/null
+++ b/.ci/uenv-recipes/tmlqcd/beverin-mi300/compilers.yaml
@@ -0,0 +1,2 @@
+gcc:
+  version: "14.2"
diff --git a/.ci/uenv-recipes/tmlqcd/beverin-mi300/config.yaml b/.ci/uenv-recipes/tmlqcd/beverin-mi300/config.yaml
new file mode 100644
index 000000000..3ec694351
--- /dev/null
+++ b/.ci/uenv-recipes/tmlqcd/beverin-mi300/config.yaml
@@ -0,0 +1,11 @@
+name: tmlqcd
+store: /user-environment
+spack:
+  repo: https://github.com/spack/spack.git
+  commit: releases/v1.0
+  packages:
+    repo: https://github.com/spack/spack-packages.git
+    commit: releases/v2025.11
+modules: true
+description: "tmLQCD is a freely available software suite providing a set of tools to be used in lattice QCD simulations."
+version: 2
diff --git a/.ci/uenv-recipes/tmlqcd/beverin-mi300/environments.yaml b/.ci/uenv-recipes/tmlqcd/beverin-mi300/environments.yaml
new file mode 100644
index 000000000..07307771a
--- /dev/null
+++ b/.ci/uenv-recipes/tmlqcd/beverin-mi300/environments.yaml
@@ -0,0 +1,25 @@
+gcc-env:
+  compiler: [gcc]
+  network:
+      mpi: cray-mpich@8.1.32 +rocm
+      specs: [ 'libfabric@2.3 +rocm' ]
+  unify: true
+  specs:
+  - python@3.12
+  - numdiff
+  - quda@develop +qdp +multigrid +twisted_clover +twisted_mass
+  - lemonio
+  - c-lime
+  - openblas
+  - hip@6.3.3 ^mesa@23.3.6
+  variants:
+  - +mpi
+  - +rocm
+  - amdgpu_target=gfx942
+  views:
+    default:
+      link: roots
+      uenv:
+        add_compilers: true
+        prefix_paths:
+          LD_LIBRARY_PATH: [lib, lib64]
diff --git a/.ci/uenv-recipes/tmlqcd/beverin-mi300/modules.yaml b/.ci/uenv-recipes/tmlqcd/beverin-mi300/modules.yaml
new file mode 100644
index 000000000..623307b09
--- /dev/null
+++ b/.ci/uenv-recipes/tmlqcd/beverin-mi300/modules.yaml
@@ -0,0 +1,23 @@
+modules:
+  # Paths to check when creating modules for all module sets
+  prefix_inspections:
+    bin:
+      - PATH
+    lib:
+      - LD_LIBRARY_PATH
+    lib64:
+      - LD_LIBRARY_PATH
+
+  default:
+    arch_folder: false
+    # Where to install modules
+    roots:
+      tcl: /user-environment/modules
+    tcl:
+      all:
+        autoload: none
+      hash_length: 0
+      exclude_implicits: true
+      exclude: ['%gcc@7.5.0', 'gcc %gcc@7.5.0']
+      projections:
+        all: '{name}/{version}'
diff --git a/.ci/uenv-recipes/tmlqcd/beverin-mi300/repo/packages b/.ci/uenv-recipes/tmlqcd/beverin-mi300/repo/packages
new file mode 120000
index 000000000..1229fc196
--- /dev/null
+++ b/.ci/uenv-recipes/tmlqcd/beverin-mi300/repo/packages
@@ -0,0 +1 @@
+../../../../spack_packages
\ No newline at end of file
diff --git a/.ci/uenv-recipes/tmlqcd/beverin-mi300/repo/repo.yaml b/.ci/uenv-recipes/tmlqcd/beverin-mi300/repo/repo.yaml
new file mode 100644
index 000000000..f08fa46a4
--- /dev/null
+++ b/.ci/uenv-recipes/tmlqcd/beverin-mi300/repo/repo.yaml
@@ -0,0 +1,2 @@
+repo:
+  namespace: apps
diff --git a/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages b/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages
new file mode 120000
index 000000000..39e779607
--- /dev/null
+++ b/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages
@@ -0,0 +1 @@
+../../../../spack_packages/
\ No newline at end of file

From be4b0b12232b43824bfe45c578b290f2d43a6928 Mon Sep 17 00:00:00 2001
From: Mathieu Taillefumier <mathieu.taillefumier@free.fr>
Date: Wed, 11 Mar 2026 03:03:37 +0100
Subject: [PATCH 02/80] Beverin uses a different authentication mechanism

---
 .ci/cscs_beverin_pipeline.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.ci/cscs_beverin_pipeline.yml b/.ci/cscs_beverin_pipeline.yml
index cbbe2b7c6..35d0224ca 100644
--- a/.ci/cscs_beverin_pipeline.yml
+++ b/.ci/cscs_beverin_pipeline.yml
@@ -20,6 +20,8 @@ test/beverin-mi300:
     REFPATH: "doc/sample-output/hmc-quda-cscs"
     QUDA_ENABLE_TUNING: 0 # disable tuning
     QUDA_ENABLE_GDR: 1 # enable GPU-Direct RDMA
+    F7T_CLIENT_ID: $F7T_TDS_CONSUMER_KEY
+    F7T_CLIENT_SECRET: $F7T_TDS_CONSUMER_SECRET
     SLURM_JOB_NUM_NODES: 2
     SLURM_NTASKS: 8
     SLURM_TIMELIMIT: "00:30:00"

From 02c82ce39d47373446d7b3c046114feef5f2a207 Mon Sep 17 00:00:00 2001
From: chaoos <chaoos@users.noreply.github.com>
Date: Wed, 11 Mar 2026 11:30:52 +0100
Subject: [PATCH 03/80] Add client ID and secret to beverin pipeline

Add F7T_CLIENT_ID and F7T_CLIENT_SECRET variables for build stage.
---
 .ci/cscs_beverin_pipeline.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.ci/cscs_beverin_pipeline.yml b/.ci/cscs_beverin_pipeline.yml
index 35d0224ca..90d468270 100644
--- a/.ci/cscs_beverin_pipeline.yml
+++ b/.ci/cscs_beverin_pipeline.yml
@@ -11,6 +11,8 @@ build-quda/uenv/beverin-mi300:
   stage: build
   extends: .uenv-builder-beverin-mi300
   variables:
+    F7T_CLIENT_ID: $F7T_TDS_CONSUMER_KEY
+    F7T_CLIENT_SECRET: $F7T_TDS_CONSUMER_SECRET
     UENV_RECIPE: .ci/uenv-recipes/tmlqcd/beverin-mi300
 
 test/beverin-mi300:

From f6ba11da22cc10a36f3be7052fe59dbaf851321a Mon Sep 17 00:00:00 2001
From: Roman Gruber <roman.gruber@unibe.ch>
Date: Wed, 11 Mar 2026 12:04:45 +0100
Subject: [PATCH 04/80] pass right compiler flags for gh200 or mi300

---
 .ci/cscs_beverin_pipeline.yml          |  8 ++------
 .ci/cscs_default_pipeline.yml          |  2 +-
 .ci/include/cscs/00-variables.yml      | 20 ++++++++++++++++++++
 .ci/include/cscs/01-test-templates.yml |  4 ----
 4 files changed, 23 insertions(+), 11 deletions(-)

diff --git a/.ci/cscs_beverin_pipeline.yml b/.ci/cscs_beverin_pipeline.yml
index 90d468270..203ce78dd 100644
--- a/.ci/cscs_beverin_pipeline.yml
+++ b/.ci/cscs_beverin_pipeline.yml
@@ -9,21 +9,17 @@ stages:
 
 build-quda/uenv/beverin-mi300:
   stage: build
-  extends: .uenv-builder-beverin-mi300
+  extends: [.uenv-builder-beverin-mi300, .beverin-mi300-secrets]
   variables:
-    F7T_CLIENT_ID: $F7T_TDS_CONSUMER_KEY
-    F7T_CLIENT_SECRET: $F7T_TDS_CONSUMER_SECRET
     UENV_RECIPE: .ci/uenv-recipes/tmlqcd/beverin-mi300
 
 test/beverin-mi300:
-  extends: .test/hmc
+  extends: [.uenv-runner-beverin-mi300, .test/hmc, .beverin-mi300-vars, .beverin-mi300-secrets]
   variables:
     INPUT_FILE: "doc/sample-input/sample-hmc-quda-cscs.input"
     REFPATH: "doc/sample-output/hmc-quda-cscs"
     QUDA_ENABLE_TUNING: 0 # disable tuning
     QUDA_ENABLE_GDR: 1 # enable GPU-Direct RDMA
-    F7T_CLIENT_ID: $F7T_TDS_CONSUMER_KEY
-    F7T_CLIENT_SECRET: $F7T_TDS_CONSUMER_SECRET
     SLURM_JOB_NUM_NODES: 2
     SLURM_NTASKS: 8
     SLURM_TIMELIMIT: "00:30:00"
diff --git a/.ci/cscs_default_pipeline.yml b/.ci/cscs_default_pipeline.yml
index 0a059aa40..4bb78aca0 100644
--- a/.ci/cscs_default_pipeline.yml
+++ b/.ci/cscs_default_pipeline.yml
@@ -14,7 +14,7 @@ build-quda/uenv/daint-gh200:
     UENV_RECIPE: .ci/uenv-recipes/tmlqcd/daint-gh200
 
 test/daint-gh200:
-  extends: .test/hmc
+  extends: [.uenv-runner-daint-gh200, .test/hmc, .daint-gh200-vars]
   variables:
     INPUT_FILE: "doc/sample-input/sample-hmc-quda-cscs.input"
     REFPATH: "doc/sample-output/hmc-quda-cscs"
diff --git a/.ci/include/cscs/00-variables.yml b/.ci/include/cscs/00-variables.yml
index 27bb44047..d92b47c3c 100644
--- a/.ci/include/cscs/00-variables.yml
+++ b/.ci/include/cscs/00-variables.yml
@@ -11,3 +11,23 @@ variables:
   UENV_VERSION: experimental
   UENV_TAG: v0.0.6
 
+
+# These are the firecrest id and secret for the beverin pipeline
+.beverin-mi300-secrets:
+  variables:
+    F7T_CLIENT_ID: $F7T_TDS_CLIENT_ID
+    F7T_CLIENT_SECRET: $F7T_TDS_CLIENT_SECRET
+
+# Compiler flags for the GH200 nodes
+.daint-gh200-vars:
+  variables:
+    CFLAGS: "-O3 -fopenmp -mtune=neoverse-v2 -mcpu=neoverse-v2"
+    CXXFLAGS: "-O3 -fopenmp -mtune=neoverse-v2 -mcpu=neoverse-v2"
+    LDFLAGS: "-fopenmp"
+
+# Compiler flags for the Mi300A nodes
+.beverin-mi300-vars:
+  variables:
+    CFLAGS: "-O3 -fopenmp -mtune=znver4 -mcpu=znver4"
+    CXXFLAGS: "-O3 -fopenmp -mtune=znver4 -mcpu=znver4"
+    LDFLAGS: "-fopenmp"
diff --git a/.ci/include/cscs/01-test-templates.yml b/.ci/include/cscs/01-test-templates.yml
index 9a4a8da45..45e49b0fa 100644
--- a/.ci/include/cscs/01-test-templates.yml
+++ b/.ci/include/cscs/01-test-templates.yml
@@ -4,13 +4,9 @@ include:
 
 .test/base:
   stage: test
-  extends: .uenv-runner-daint-gh200
   image: ${UENV_NAME}/${UENV_VERSION}:${UENV_TAG}
   variables:
     WITH_UENV_VIEW: "default"
-    CFLAGS: "-O3 -fopenmp -mtune=neoverse-v2 -mcpu=neoverse-v2"
-    CXXFLAGS: "-O3 -fopenmp -mtune=neoverse-v2 -mcpu=neoverse-v2"
-    LDFLAGS: "-fopenmp"
   before_script:
     - |
       if test "${SLURM_PROCID}" -eq "0"; then

From 4737a9df875fb0194e5a31fed68f0fdefd535da9 Mon Sep 17 00:00:00 2001
From: Taillefumier Mathieu <29380261+mtaillefumier@users.noreply.github.com>
Date: Wed, 11 Mar 2026 12:36:06 +0100
Subject: [PATCH 05/80] Update 00-variables.yml

fix typo
---
 .ci/include/cscs/00-variables.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.ci/include/cscs/00-variables.yml b/.ci/include/cscs/00-variables.yml
index d92b47c3c..b8debbd2a 100644
--- a/.ci/include/cscs/00-variables.yml
+++ b/.ci/include/cscs/00-variables.yml
@@ -15,8 +15,8 @@ variables:
 # These are the firecrest id and secret for the beverin pipeline
 .beverin-mi300-secrets:
   variables:
-    F7T_CLIENT_ID: $F7T_TDS_CLIENT_ID
-    F7T_CLIENT_SECRET: $F7T_TDS_CLIENT_SECRET
+    F7T_CLIENT_ID: $F7T_TDS_COMSUMER_KEY
+    F7T_CLIENT_SECRET: $F7T_TDS_COMSUMER_SECRET
 
 # Compiler flags for the GH200 nodes
 .daint-gh200-vars:

From 0a5445f41c5201ecc032e27e64c5eee79b2cc1dc Mon Sep 17 00:00:00 2001
From: Taillefumier Mathieu <29380261+mtaillefumier@users.noreply.github.com>
Date: Wed, 11 Mar 2026 13:21:10 +0100
Subject: [PATCH 06/80] Fix variable names for beverin pipeline secrets

---
 .ci/include/cscs/00-variables.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.ci/include/cscs/00-variables.yml b/.ci/include/cscs/00-variables.yml
index b8debbd2a..bddb06752 100644
--- a/.ci/include/cscs/00-variables.yml
+++ b/.ci/include/cscs/00-variables.yml
@@ -15,8 +15,8 @@ variables:
 # These are the firecrest id and secret for the beverin pipeline
 .beverin-mi300-secrets:
   variables:
-    F7T_CLIENT_ID: $F7T_TDS_COMSUMER_KEY
-    F7T_CLIENT_SECRET: $F7T_TDS_COMSUMER_SECRET
+    F7T_CLIENT_ID: $F7T_TDS_CONSUMER_KEY
+    F7T_CLIENT_SECRET: $F7T_TDS_CONSUMER_SECRET
 
 # Compiler flags for the GH200 nodes
 .daint-gh200-vars:

From 6a2e926723c3a955eea7f901f7693d22a5dfee72 Mon Sep 17 00:00:00 2001
From: Taillefumier Mathieu <29380261+mtaillefumier@users.noreply.github.com>
Date: Wed, 11 Mar 2026 19:48:56 +0100
Subject: [PATCH 07/80] Add SLURM_TIMELIMIT variable to build stage

Increases the time limit to 8 hours
---
 .ci/cscs_beverin_pipeline.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.ci/cscs_beverin_pipeline.yml b/.ci/cscs_beverin_pipeline.yml
index 203ce78dd..e06b39867 100644
--- a/.ci/cscs_beverin_pipeline.yml
+++ b/.ci/cscs_beverin_pipeline.yml
@@ -12,7 +12,7 @@ build-quda/uenv/beverin-mi300:
   extends: [.uenv-builder-beverin-mi300, .beverin-mi300-secrets]
   variables:
     UENV_RECIPE: .ci/uenv-recipes/tmlqcd/beverin-mi300
-
+    SLURM_TIMELIMIT: "08:00:00"
 test/beverin-mi300:
   extends: [.uenv-runner-beverin-mi300, .test/hmc, .beverin-mi300-vars, .beverin-mi300-secrets]
   variables:

From d2ff7b89af4a73950c7a3e9253280d9efe4938c8 Mon Sep 17 00:00:00 2001
From: Mathieu Taillefumier <mathieu.taillefumier@free.fr>
Date: Tue, 14 Oct 2025 15:32:53 +0200
Subject: [PATCH 08/80] CMake support

---
 .gitignore                                    |    1 -
 CMakeLists.txt                                |  390 +++
 cmake/FindCLime.cmake                         |   27 +
 cmake/FindLemon.cmake                         |   25 +
 cmake/git_hash.h.in                           |    6 +
 .../tmlqcd_config_internal.h.in               |  130 +-
 cmake_includes.txt                            |  425 ++++
 io/Makefile.in                                |  135 -
 src/bin/CMakeLists.txt                        |   19 +
 LapH_ev.c => src/bin/LapH_ev.c                |    0
 benchmark.c => src/bin/benchmark.c            |    0
 .../bin/check_locallity.c                     |    4 +-
 deriv_mg_tune.c => src/bin/deriv_mg_tune.c    |    0
 gen_sources.c => src/bin/gen_sources.c        |    0
 hmc_tm.c => src/bin/hmc_tm.c                  |    0
 hopping_test.c => src/bin/hopping_test.c      |    0
 invert.c => src/bin/invert.c                  |    0
 {util => src/bin}/main_ildg2uk.c              |    0
 .../bin/offline_measurement.c                 |    0
 .../bin/qphix_test_Dslash.c                   |    0
 {test => src/bin}/scalar_prod_r_test.c        |    0
 {test => src/bin}/test_eigenvalues.c          |    0
 test_lemon.c => src/bin/test_lemon.c          |    0
 src/lib/CMakeLists.txt                        |  457 ++++
 .../lib/DDalphaAMG_interface.c                |    0
 .../lib/DDalphaAMG_interface.h                |    0
 Ptilde_nd.c => src/lib/Ptilde_nd.c            |    0
 Ptilde_nd.h => src/lib/Ptilde_nd.h            |    0
 aligned_malloc.c => src/lib/aligned_malloc.c  |    0
 aligned_malloc.h => src/lib/aligned_malloc.h  |    0
 block.c => src/lib/block.c                    |    0
 block.h => src/lib/block.h                    |    0
 boundary.c => src/lib/boundary.c              |    0
 boundary.h => src/lib/boundary.h              |    0
 {buffers => src/lib/buffers}/Makefile.in      |    0
 {buffers => src/lib/buffers}/gauge.c          |    0
 {buffers => src/lib/buffers}/gauge.h          |    0
 {buffers => src/lib/buffers}/gauge.ih         |    0
 .../buffers}/gauge_allocate_gauge_buffers.c   |    0
 .../buffers}/gauge_finalize_gauge_buffers.c   |    0
 .../gauge_free_unused_gauge_buffers.c         |    0
 .../lib/buffers}/gauge_get_gauge_field.c      |    0
 .../buffers}/gauge_get_gauge_field_array.c    |    0
 .../buffers}/gauge_initialize_gauge_buffers.c |    0
 .../lib/buffers}/gauge_return_gauge_field.c   |    0
 .../buffers}/gauge_return_gauge_field_array.c |    0
 {buffers => src/lib/buffers}/utils.h          |    0
 {buffers => src/lib/buffers}/utils.ih         |    0
 .../utils_generic_exchange.blocking.inc       |    0
 .../lib/buffers}/utils_generic_exchange.c     |    0
 .../utils_generic_exchange.nonblocking.inc    |    0
 .../lib/chebyshev_polynomial.c                |    4 +-
 .../lib/chebyshev_polynomial.h                |    0
 .../lib/chebyshev_polynomial_nd.c             |    0
 .../lib/chebyshev_polynomial_nd.h             |    0
 clenshaw_coef.c => src/lib/clenshaw_coef.c    |    0
 clenshaw_coef.h => src/lib/clenshaw_coef.h    |    0
 .../lib/compare_derivative.c                  |    0
 .../lib/compare_derivative.h                  |    0
 {cu => src/lib/cu}/COPYING                    |    0
 {cu => src/lib/cu}/COPYING.LESSER             |    0
 {cu => src/lib/cu}/Makefile.in                |    0
 {cu => src/lib/cu}/check-regressions          |    0
 {cu => src/lib/cu}/cu.c                       |    0
 {cu => src/lib/cu}/cu.h                       |    0
 .../lib/default_input_values.h                |    0
 deriv_Sb.c => src/lib/deriv_Sb.c              |    0
 deriv_Sb.h => src/lib/deriv_Sb.h              |    0
 deriv_Sb_D_psi.c => src/lib/deriv_Sb_D_psi.c  |    0
 deriv_Sb_D_psi.h => src/lib/deriv_Sb_D_psi.h  |    0
 expo.c => src/lib/expo.c                      |    0
 expo.h => src/lib/expo.h                      |    0
 fatal_error.c => src/lib/fatal_error.c        |    0
 fatal_error.h => src/lib/fatal_error.h        |    0
 gamma.c => src/lib/gamma.c                    |    0
 gamma.h => src/lib/gamma.h                    |    0
 geometry_eo.c => src/lib/geometry_eo.c        |    0
 geometry_eo.h => src/lib/geometry_eo.h        |    0
 .../lib/get_rectangle_staples.c               |    0
 .../lib/get_rectangle_staples.h               |    0
 get_staples.c => src/lib/get_staples.c        |    0
 get_staples.h => src/lib/get_staples.h        |    0
 getopt.c => src/lib/getopt.c                  |    0
 getopt.h => src/lib/getopt.h                  |    0
 gettime.c => src/lib/gettime.c                |    0
 gettime.h => src/lib/gettime.h                |    0
 global.h => src/lib/global.h                  |    0
 .../lib/hamiltonian_field.h                   |    0
 {include => src/lib/include}/tmLQCD.h         |    0
 {include => src/lib/include}/tmlqcd_config.h  |    0
 {init => src/lib/init}/Makefile.in            |    0
 {init => src/lib/init}/init.h                 |    0
 {init => src/lib/init}/init_bispinor_field.c  |    0
 {init => src/lib/init}/init_bispinor_field.h  |    0
 .../lib/init}/init_chi_spinor_field.c         |    0
 .../lib/init}/init_chi_spinor_field.h         |    0
 .../lib/init}/init_critical_globals.c         |    0
 .../lib/init}/init_critical_globals.h         |    0
 .../lib/init}/init_dirac_halfspinor.c         |    0
 .../lib/init}/init_dirac_halfspinor.h         |    0
 {init => src/lib/init}/init_gauge_fg.c        |    0
 {init => src/lib/init}/init_gauge_fg.h        |    0
 {init => src/lib/init}/init_gauge_field.c     |    0
 {init => src/lib/init}/init_gauge_field.h     |    0
 {init => src/lib/init}/init_gauge_tmp.c       |    0
 {init => src/lib/init}/init_gauge_tmp.h       |    0
 .../lib/init}/init_geometry_indices.c         |    0
 .../lib/init}/init_geometry_indices.h         |    0
 {init => src/lib/init}/init_global_states.c   |    0
 {init => src/lib/init}/init_global_states.h   |    0
 {init => src/lib/init}/init_moment_field.c    |    0
 {init => src/lib/init}/init_moment_field.h    |    0
 .../lib/init}/init_omp_accumulators.c         |    0
 .../lib/init}/init_omp_accumulators.h         |    0
 {init => src/lib/init}/init_openmp.c          |    0
 {init => src/lib/init}/init_openmp.h          |    0
 {init => src/lib/init}/init_parallel.c        |    0
 {init => src/lib/init}/init_parallel.h        |    0
 {init => src/lib/init}/init_spinor_field.c    |    0
 {init => src/lib/init}/init_spinor_field.h    |    0
 .../lib/init}/init_stout_smear_vars.c         |    0
 .../lib/init}/init_stout_smear_vars.h         |    0
 integrator.c => src/lib/integrator.c          |    0
 integrator.h => src/lib/integrator.h          |    0
 .../lib/invert_clover_eo.c                    |    0
 .../lib/invert_clover_eo.h                    |    0
 .../lib/invert_doublet_eo.c                   |    0
 .../lib/invert_doublet_eo.h                   |    0
 invert_eo.c => src/lib/invert_eo.c            |    1 +
 invert_eo.h => src/lib/invert_eo.h            |    0
 invert_overlap.c => src/lib/invert_overlap.c  |    0
 invert_overlap.h => src/lib/invert_overlap.h  |    0
 {io => src/lib/io}/DML_crc32.c                |    0
 {io => src/lib/io}/deri_write_stdout.c        |    0
 {io => src/lib/io}/deri_write_stdout.h        |    0
 {io => src/lib/io}/dml.c                      |    0
 {io => src/lib/io}/dml.h                      |    0
 {io => src/lib/io}/eospinor.h                 |    0
 {io => src/lib/io}/eospinor.ih                |    0
 {io => src/lib/io}/eospinor_read.c            |    0
 {io => src/lib/io}/eospinor_write.c           |    0
 {io => src/lib/io}/gauge.h                    |    0
 {io => src/lib/io}/gauge.ih                   |    0
 {io => src/lib/io}/gauge_read.c               |    0
 {io => src/lib/io}/gauge_read_binary.c        |    0
 {io => src/lib/io}/gauge_write.c              |    0
 {io => src/lib/io}/gauge_write_binary.c       |    0
 .../lib/io}/gauge_write_luscher_binary.c      |    0
 .../lib/io}/gauge_write_luscher_binary.h      |    0
 {io => src/lib/io}/io_cm.c                    |    0
 {io => src/lib/io}/io_cm.h                    |    0
 {io => src/lib/io}/params.h                   |    0
 {io => src/lib/io}/params.ih                  |    0
 .../lib/io}/params_construct_InverterInfo.c   |    0
 .../lib/io}/params_construct_ildgFormat.c     |    0
 .../io}/params_construct_propagatorFormat.c   |    0
 .../lib/io}/params_construct_sourceFormat.c   |    0
 {io => src/lib/io}/params_construct_xlfInfo.c |    0
 {io => src/lib/io}/selector.h                 |    0
 {io => src/lib/io}/spinor.h                   |    0
 {io => src/lib/io}/spinor.ih                  |    0
 {io => src/lib/io}/spinor_read.c              |    0
 {io => src/lib/io}/spinor_read_binary.c       |    0
 {io => src/lib/io}/spinor_write.c             |    0
 {io => src/lib/io}/spinor_write_binary.c      |    0
 {io => src/lib/io}/spinor_write_info.c        |    0
 .../lib/io}/spinor_write_propagator_format.c  |    0
 .../lib/io}/spinor_write_propagator_type.c    |    0
 .../lib/io}/spinor_write_source_format.c      |    0
 {io => src/lib/io}/spinor_write_stdout.c      |    0
 {io => src/lib/io}/spinor_write_stdout.h      |    0
 {io => src/lib/io}/sw_write_stdout.c          |    0
 {io => src/lib/io}/sw_write_stdout.h          |    0
 {io => src/lib/io}/utils.c                    |    0
 {io => src/lib/io}/utils.h                    |    2 -
 {io => src/lib/io}/utils.ih                   |    4 +-
 .../lib/io}/utils_close_reader_record.c       |    0
 .../lib/io}/utils_close_writer_record.c       |    0
 {io => src/lib/io}/utils_construct_reader.c   |    2 +
 {io => src/lib/io}/utils_construct_writer.c   |    0
 {io => src/lib/io}/utils_destruct_reader.c    |    0
 {io => src/lib/io}/utils_destruct_writer.c    |    0
 {io => src/lib/io}/utils_engineering.c        |    0
 {io => src/lib/io}/utils_kill_with_error.c    |    0
 {io => src/lib/io}/utils_parse_checksum_xml.c |    0
 .../lib/io}/utils_parse_ildgformat_xml.c      |    0
 .../lib/io}/utils_parse_propagator_type.c     |    0
 {io => src/lib/io}/utils_read_message.c       |    0
 {io => src/lib/io}/utils_write_checksum.c     |    0
 .../lib/io}/utils_write_first_message.c       |    0
 {io => src/lib/io}/utils_write_header.c       |    0
 {io => src/lib/io}/utils_write_ildg_format.c  |    0
 .../lib/io}/utils_write_inverter_info.c       |    0
 {io => src/lib/io}/utils_write_message.c      |    0
 {io => src/lib/io}/utils_write_xlf.c          |    0
 {io => src/lib/io}/utils_write_xlf_xml.c      |    0
 .../lib/kahan_summation.h                     |    0
 {linalg => src/lib/linalg}/Makefile.in        |    0
 {linalg => src/lib/linalg}/add.c              |    0
 {linalg => src/lib/linalg}/add.h              |    0
 {linalg => src/lib/linalg}/addto_32.c         |    0
 {linalg => src/lib/linalg}/addto_32.h         |    0
 {linalg => src/lib/linalg}/assign.c           |    0
 {linalg => src/lib/linalg}/assign.h           |    0
 {linalg => src/lib/linalg}/assign_add_mul.c   |    0
 {linalg => src/lib/linalg}/assign_add_mul.h   |    0
 .../lib/linalg}/assign_add_mul_add_mul.c      |    0
 .../lib/linalg}/assign_add_mul_add_mul.h      |    0
 .../lib/linalg}/assign_add_mul_add_mul_r.c    |    0
 .../lib/linalg}/assign_add_mul_add_mul_r.h    |    0
 .../lib/linalg}/assign_add_mul_body.c         |    0
 {linalg => src/lib/linalg}/assign_add_mul_r.c |    0
 {linalg => src/lib/linalg}/assign_add_mul_r.h |    0
 .../lib/linalg}/assign_add_mul_r_32.c         |    5 +-
 .../lib/linalg}/assign_add_mul_r_32.h         |    0
 .../lib/linalg}/assign_add_mul_r_add_mul.c    |    0
 .../lib/linalg}/assign_add_mul_r_add_mul.h    |    0
 {linalg => src/lib/linalg}/assign_diff_mul.c  |    0
 {linalg => src/lib/linalg}/assign_diff_mul.h  |    0
 {linalg => src/lib/linalg}/assign_mul_add.c   |    0
 {linalg => src/lib/linalg}/assign_mul_add.h   |    0
 .../lib/linalg}/assign_mul_add_mul.c          |    0
 .../lib/linalg}/assign_mul_add_mul.h          |    0
 .../assign_mul_add_mul_add_mul_add_mul_r.c    |    0
 .../assign_mul_add_mul_add_mul_add_mul_r.h    |    0
 .../linalg}/assign_mul_add_mul_add_mul_r.c    |    0
 .../linalg}/assign_mul_add_mul_add_mul_r.h    |    0
 .../lib/linalg}/assign_mul_add_mul_r.c        |    0
 .../lib/linalg}/assign_mul_add_mul_r.h        |    0
 .../lib/linalg}/assign_mul_add_mul_r_32.c     |    0
 .../lib/linalg}/assign_mul_add_mul_r_32.h     |    0
 {linalg => src/lib/linalg}/assign_mul_add_r.c |    0
 {linalg => src/lib/linalg}/assign_mul_add_r.h |    0
 .../lib/linalg}/assign_mul_add_r_32.c         |    0
 .../lib/linalg}/assign_mul_add_r_32.h         |    0
 .../lib/linalg}/assign_mul_add_r_and_square.c |    0
 .../lib/linalg}/assign_mul_add_r_and_square.h |    0
 .../linalg}/assign_mul_bra_add_mul_ket_add.c  |    0
 .../linalg}/assign_mul_bra_add_mul_ket_add.h  |    0
 .../assign_mul_bra_add_mul_ket_add_r.c        |    0
 .../assign_mul_bra_add_mul_ket_add_r.h        |    0
 .../lib/linalg}/assign_mul_bra_add_mul_r.c    |    0
 .../lib/linalg}/assign_mul_bra_add_mul_r.h    |    0
 {linalg => src/lib/linalg}/assign_to_32.c     |    0
 {linalg => src/lib/linalg}/assign_to_32.h     |    0
 {linalg => src/lib/linalg}/blas.h             |    0
 {linalg => src/lib/linalg}/comp_decomp.c      |    0
 {linalg => src/lib/linalg}/comp_decomp.h      |    0
 .../lib/linalg}/convert_eo_to_lexic.c         |    0
 .../lib/linalg}/convert_eo_to_lexic.h         |    0
 .../lib/linalg}/convert_even_to_lexic.c       |    0
 .../lib/linalg}/convert_even_to_lexic.h       |    0
 .../lib/linalg}/convert_odd_to_lexic.c        |    0
 .../lib/linalg}/convert_odd_to_lexic.h        |    0
 {linalg => src/lib/linalg}/diff.c             |    0
 {linalg => src/lib/linalg}/diff.h             |    0
 {linalg => src/lib/linalg}/diff_32.c          |    0
 {linalg => src/lib/linalg}/diff_32.h          |    0
 .../lib/linalg}/diff_and_square_norm.c        |    0
 .../lib/linalg}/diff_and_square_norm.h        |    0
 {linalg => src/lib/linalg}/fortran.h          |    0
 {linalg => src/lib/linalg}/lapack.h           |    0
 {linalg => src/lib/linalg}/map_to_blas.h      |    0
 {linalg => src/lib/linalg}/mattimesvec.c      |    0
 {linalg => src/lib/linalg}/mattimesvec.h      |    0
 {linalg => src/lib/linalg}/mul.c              |    0
 {linalg => src/lib/linalg}/mul.h              |    0
 {linalg => src/lib/linalg}/mul_add_mul.c      |    0
 {linalg => src/lib/linalg}/mul_add_mul.h      |    0
 {linalg => src/lib/linalg}/mul_add_mul_r.c    |    0
 {linalg => src/lib/linalg}/mul_add_mul_r.h    |    0
 {linalg => src/lib/linalg}/mul_diff_mul.c     |    0
 {linalg => src/lib/linalg}/mul_diff_mul.h     |    0
 {linalg => src/lib/linalg}/mul_diff_mul_r.c   |    0
 {linalg => src/lib/linalg}/mul_diff_mul_r.h   |    0
 {linalg => src/lib/linalg}/mul_diff_r.c       |    0
 {linalg => src/lib/linalg}/mul_diff_r.h       |    0
 {linalg => src/lib/linalg}/mul_gamma5.c       |    0
 {linalg => src/lib/linalg}/mul_gamma5.h       |    0
 {linalg => src/lib/linalg}/mul_r.c            |    0
 {linalg => src/lib/linalg}/mul_r.h            |    0
 {linalg => src/lib/linalg}/mul_r_32.c         |    0
 {linalg => src/lib/linalg}/mul_r_32.h         |    0
 {linalg => src/lib/linalg}/mul_r_gamma5.c     |    0
 {linalg => src/lib/linalg}/mul_r_gamma5.h     |    0
 {linalg => src/lib/linalg}/print_spinor.c     |    0
 {linalg => src/lib/linalg}/print_spinor.h     |    0
 .../linalg}/print_spinor_similar_components.c |    0
 .../linalg}/print_spinor_similar_components.h |    0
 {linalg => src/lib/linalg}/ratio.c            |    0
 {linalg => src/lib/linalg}/ratio.h            |    0
 {linalg => src/lib/linalg}/scalar_prod.c      |    0
 {linalg => src/lib/linalg}/scalar_prod.h      |    0
 {linalg => src/lib/linalg}/scalar_prod_body.c |    0
 {linalg => src/lib/linalg}/scalar_prod_i.c    |    0
 {linalg => src/lib/linalg}/scalar_prod_i.h    |    0
 {linalg => src/lib/linalg}/scalar_prod_r.c    |    0
 {linalg => src/lib/linalg}/scalar_prod_r.h    |    0
 {linalg => src/lib/linalg}/scalar_prod_r_32.c |    0
 {linalg => src/lib/linalg}/scalar_prod_r_32.h |    0
 {linalg => src/lib/linalg}/set_even_to_zero.c |    0
 {linalg => src/lib/linalg}/set_even_to_zero.h |    0
 .../lib/linalg}/square_and_minmax.c           |    0
 .../lib/linalg}/square_and_minmax.h           |    0
 .../lib/linalg}/square_and_prod_r.c           |    0
 .../lib/linalg}/square_and_prod_r.h           |    0
 {linalg => src/lib/linalg}/square_norm.c      |    0
 {linalg => src/lib/linalg}/square_norm.h      |    0
 {linalg => src/lib/linalg}/square_norm_32.c   |    0
 {linalg => src/lib/linalg}/square_norm_32.h   |    0
 linalg_eo.h => src/lib/linalg_eo.h            |    0
 little_D.c => src/lib/little_D.c              |    0
 little_D.h => src/lib/little_D.h              |    0
 little_D_body.c => src/lib/little_D_body.c    |    0
 matrix_utils.c => src/lib/matrix_utils.c      |    0
 matrix_utils.h => src/lib/matrix_utils.h      |    0
 {meas => src/lib/meas}/Makefile.in            |    0
 {meas => src/lib/meas}/correlators.c          |    0
 {meas => src/lib/meas}/correlators.h          |    0
 {meas => src/lib/meas}/field_strength_types.h |    0
 {meas => src/lib/meas}/gradient_flow.c        |    0
 {meas => src/lib/meas}/gradient_flow.h        |    0
 ...easure_clover_field_strength_observables.c |    0
 ...easure_clover_field_strength_observables.h |    0
 {meas => src/lib/meas}/measurements.c         |    0
 {meas => src/lib/meas}/measurements.h         |    0
 {meas => src/lib/meas}/oriented_plaquettes.c  |    0
 {meas => src/lib/meas}/oriented_plaquettes.h  |    0
 {meas => src/lib/meas}/pion_norm.c            |    0
 {meas => src/lib/meas}/pion_norm.h            |    0
 {meas => src/lib/meas}/polyakov_loop.c        |    0
 {meas => src/lib/meas}/polyakov_loop.h        |    0
 .../lib/measure_gauge_action.c                |    0
 .../lib/measure_gauge_action.h                |    0
 .../lib/measure_rectangles.c                  |    0
 .../lib/measure_rectangles.h                  |    0
 misc_types.h => src/lib/misc_types.h          |    2 +-
 {monomial => src/lib/monomial}/Makefile.in    |    0
 .../lib/monomial}/clover_trlog_monomial.c     |    0
 .../lib/monomial}/clover_trlog_monomial.h     |    0
 .../lib/monomial}/cloverdet_monomial.c        |    0
 .../lib/monomial}/cloverdet_monomial.h        |    0
 .../lib/monomial}/cloverdetratio_monomial.c   |    0
 .../lib/monomial}/cloverdetratio_monomial.h   |    0
 .../lib/monomial}/cloverdetratio_rwmonomial.c |    0
 .../lib/monomial}/cloverdetratio_rwmonomial.h |    0
 .../lib/monomial}/clovernd_trlog_monomial.c   |    0
 .../lib/monomial}/clovernd_trlog_monomial.h   |    0
 .../lib/monomial}/cloverndpoly_monomial.c     |    0
 .../lib/monomial}/cloverndpoly_monomial.h     |    0
 {monomial => src/lib/monomial}/det_monomial.c |    0
 {monomial => src/lib/monomial}/det_monomial.h |    0
 .../lib/monomial}/detratio_monomial.c         |    0
 .../lib/monomial}/detratio_monomial.h         |    0
 .../lib/monomial}/gauge_monomial.c            |    0
 .../lib/monomial}/gauge_monomial.h            |    0
 .../lib/monomial}/moment_energy.c             |    0
 .../lib/monomial}/moment_energy.h             |    0
 .../lib/monomial}/monitor_forces.c            |    0
 .../lib/monomial}/monitor_forces.h            |    0
 {monomial => src/lib/monomial}/monomial.c     |    0
 {monomial => src/lib/monomial}/monomial.h     |    0
 .../lib/monomial}/nddetratio_monomial.c       |    0
 .../lib/monomial}/nddetratio_monomial.h       |    0
 .../lib/monomial}/ndpoly_monomial.c           |    0
 .../lib/monomial}/ndpoly_monomial.h           |    0
 .../lib/monomial}/ndrat_monomial.c            |    0
 .../lib/monomial}/ndrat_monomial.h            |    0
 .../lib/monomial}/ndratcor_monomial.c         |    0
 .../lib/monomial}/ndratcor_monomial.h         |    0
 .../lib/monomial}/poly_monomial.c             |    0
 .../lib/monomial}/poly_monomial.h             |    0
 {monomial => src/lib/monomial}/rat_monomial.c |    0
 {monomial => src/lib/monomial}/rat_monomial.h |    0
 .../lib/monomial}/ratcor_monomial.c           |    0
 .../lib/monomial}/ratcor_monomial.h           |    0
 mpi_init.c => src/lib/mpi_init.c              |    0
 mpi_init.h => src/lib/mpi_init.h              |    0
 .../lib/omp_accumulator.h                     |    0
 operator.c => src/lib/operator.c              |    0
 operator.h => src/lib/operator.h              |    0
 .../lib/operator}/Block_D_psi_body.c          |    0
 {operator => src/lib/operator}/D_psi.c        |    0
 {operator => src/lib/operator}/D_psi.h        |    0
 {operator => src/lib/operator}/D_psi_body.c   |    0
 {operator => src/lib/operator}/Dov_proj.c     |    0
 {operator => src/lib/operator}/Dov_proj.h     |    0
 {operator => src/lib/operator}/Dov_psi.c      |    0
 {operator => src/lib/operator}/Dov_psi.h      |    0
 .../lib/operator}/Hopping_Matrix.c            |    0
 .../lib/operator}/Hopping_Matrix.h            |    0
 .../lib/operator}/Hopping_Matrix_32.c         |    0
 .../lib/operator}/Hopping_Matrix_32.h         |    0
 .../lib/operator}/Hopping_Matrix_32_nocom.c   |    0
 .../lib/operator}/Hopping_Matrix_nocom.c      |    0
 .../lib/operator}/Hopping_Matrix_nocom.h      |    0
 {operator => src/lib/operator}/Makefile.in    |    0
 .../assign_mul_one_sw_pm_imu_inv_block_body.c |    0
 ...assign_mul_one_sw_pm_imu_site_lexic_body.c |    0
 .../lib/operator}/clover_accumulate_deriv.c   |    0
 {operator => src/lib/operator}/clover_deriv.c |    0
 {operator => src/lib/operator}/clover_det.c   |    0
 .../lib/operator}/clover_inline.h             |    0
 .../lib/operator}/clover_invert.c             |    0
 {operator => src/lib/operator}/clover_leaf.c  |    0
 {operator => src/lib/operator}/clover_leaf.h  |    0
 {operator => src/lib/operator}/clover_term.c  |    0
 .../lib/operator}/clovertm_operators.c        |    0
 .../lib/operator}/clovertm_operators.h        |    0
 .../lib/operator}/clovertm_operators_32.c     |    0
 .../lib/operator}/clovertm_operators_32.h     |    0
 .../lib/operator}/halfspinor_body.c           |    0
 .../lib/operator}/halfspinor_body_32.c        |    0
 .../lib/operator}/halfspinor_hopping.h        |    0
 .../lib/operator}/halfspinor_hopping_32.h     |    0
 {operator => src/lib/operator}/hopping.h      |    0
 .../lib/operator}/hopping_body_dbl.c          |    0
 {operator => src/lib/operator}/hopping_sgl.c  |    0
 .../lib/operator}/mul_one_pm_imu_inv_body.c   |    0
 .../operator}/mul_one_pm_imu_sub_mul_body.c   |    0
 {operator => src/lib/operator}/tm_operators.c |    0
 {operator => src/lib/operator}/tm_operators.h |    0
 .../lib/operator}/tm_operators_32.c           |    0
 .../lib/operator}/tm_operators_32.h           |    0
 .../lib/operator}/tm_operators_nd.c           |    0
 .../lib/operator}/tm_operators_nd.h           |    0
 .../lib/operator}/tm_operators_nd_32.c        |    0
 .../lib/operator}/tm_operators_nd_32.h        |    0
 .../lib/operator}/tm_sub_Hopping_Matrix.c     |    0
 .../lib/operator}/tm_sub_Hopping_Matrix.h     |    0
 .../lib/operator}/tm_times_Hopping_Matrix.c   |    0
 .../lib/operator}/tm_times_Hopping_Matrix.h   |    0
 operator_types.h => src/lib/operator_types.h  |    0
 overrelaxation.c => src/lib/overrelaxation.c  |    2 +-
 overrelaxation.h => src/lib/overrelaxation.h  |    0
 parallel_io.h => src/lib/parallel_io.h        |    0
 phmc.c => src/lib/phmc.c                      |    0
 phmc.h => src/lib/phmc.h                      |    0
 prepare_source.c => src/lib/prepare_source.c  |    0
 prepare_source.h => src/lib/prepare_source.h  |    0
 .../lib/profiling}/hmc/Readme.md              |    0
 .../lib/profiling}/hmc/example_profile.pdf    |  Bin
 .../lib/profiling}/hmc/profile.Rmd            |    0
 {profiling => src/lib/profiling}/hmc/timing.R |    0
 .../lib/profiling}/hmc_mk2/.gitignore         |    0
 .../lib/profiling}/hmc_mk2/README.md          |    0
 .../profiling}/hmc_mk2/logs/example_log.out   |    0
 .../lib/profiling}/hmc_mk2/make_profile.R     |    0
 .../lib/profiling}/hmc_mk2/profile.Rmd        |    0
 src/lib/qphix/qphix_base_classes.hpp          |  771 ++++++
 src/lib/qphix/qphix_interface.cpp             | 2192 +++++++++++++++++
 src/lib/qphix/qphix_interface.hpp             |   51 +
 src/lib/qphix/qphix_interface_utils.hpp       |   33 +
 .../lib/qphix_interface.h                     |    0
 qphix_types.h => src/lib/qphix_types.h        |    0
 qphix_veclen.h => src/lib/qphix_veclen.h      |    0
 .../lib/quda_dummy_types.h                    |    0
 src/lib/quda_gauge_paths.inc                  |  158 ++
 quda_interface.c => src/lib/quda_interface.c  |    0
 quda_interface.h => src/lib/quda_interface.h  |    0
 quda_types.h => src/lib/quda_types.h          |    0
 ranlxd.c => src/lib/ranlxd.c                  |    0
 ranlxd.h => src/lib/ranlxd.h                  |    0
 ranlxs.c => src/lib/ranlxs.c                  |    0
 ranlxs.h => src/lib/ranlxs.h                  |    0
 {rational => src/lib/rational}/Makefile.in    |    0
 {rational => src/lib/rational}/elliptic.c     |    0
 {rational => src/lib/rational}/elliptic.h     |    0
 {rational => src/lib/rational}/rational.c     |    0
 {rational => src/lib/rational}/rational.h     |    0
 {rational => src/lib/rational}/zolotarev.c    |    0
 {rational => src/lib/rational}/zolotarev.h    |    0
 read_input.h => src/lib/read_input.h          |    0
 read_input.l => src/lib/read_input.l          |    0
 .../lib/reweighting_factor.c                  |    0
 .../lib/reweighting_factor.h                  |    0
 .../lib/reweighting_factor_nd.c               |    0
 .../lib/reweighting_factor_nd.h               |    0
 .../lib/rnd_gauge_trafo.c                     |    0
 .../lib/rnd_gauge_trafo.h                     |    0
 sighandler.c => src/lib/sighandler.c          |    0
 sighandler.h => src/lib/sighandler.h          |    0
 {smearing => src/lib/smearing}/Makefile.in    |    0
 {smearing => src/lib/smearing}/ape.h          |    0
 {smearing => src/lib/smearing}/ape.ih         |    0
 .../lib/smearing}/ape_ape_smear.c             |    0
 {smearing => src/lib/smearing}/hex.h          |    0
 {smearing => src/lib/smearing}/hex.ih         |    0
 .../lib/smearing}/hex_hex_smear.c             |    0
 .../lib/smearing}/hex_stout_exclude_none.c    |    0
 .../lib/smearing}/hex_stout_exclude_one.c     |    0
 .../lib/smearing}/hex_stout_exclude_two.c     |    0
 {smearing => src/lib/smearing}/hyp.h          |    0
 {smearing => src/lib/smearing}/hyp.ih         |    0
 .../smearing}/hyp_APE_project_exclude_none.c  |    0
 .../smearing}/hyp_APE_project_exclude_one.c   |    0
 .../smearing}/hyp_APE_project_exclude_two.c   |    0
 .../lib/smearing}/hyp_hyp_smear.c             |    0
 .../smearing}/hyp_hyp_staples_exclude_none.c  |    0
 .../smearing}/hyp_hyp_staples_exclude_one.c   |    0
 .../smearing}/hyp_hyp_staples_exclude_two.c   |    0
 {smearing => src/lib/smearing}/stout.h        |    0
 {smearing => src/lib/smearing}/stout.ih       |    0
 .../lib/smearing}/stout_stout_smear.c         |    0
 .../smearing}/uils_print_config_to_screen.c   |    0
 {smearing => src/lib/smearing}/utils.h        |    0
 {smearing => src/lib/smearing}/utils.ih       |    0
 .../lib/smearing}/utils_generic_staples.c     |    0
 .../smearing}/utils_print_config_to_screen.c  |    0
 .../lib/smearing}/utils_print_su3.c           |    0
 .../lib/smearing}/utils_project_antiherm.c    |    0
 .../lib/smearing}/utils_project_herm.c        |    0
 .../lib/smearing}/utils_reunitarize.c         |    0
 .../lib/smearing}/utils_reunitarize_MILC.c    |    8 +-
 .../lib/solver}/M_plus_block_psi_body.c       |    0
 {solver => src/lib/solver}/Makefile.in        |    0
 {solver => src/lib/solver}/Msap.c             |    0
 {solver => src/lib/solver}/Msap.h             |    0
 {solver => src/lib/solver}/bicg_complex.c     |    0
 {solver => src/lib/solver}/bicg_complex.h     |    0
 {solver => src/lib/solver}/bicgstab2.c        |    0
 {solver => src/lib/solver}/bicgstab2.h        |    0
 {solver => src/lib/solver}/bicgstab_complex.c |    0
 {solver => src/lib/solver}/bicgstab_complex.h |    0
 .../lib/solver}/bicgstab_complex_bi.c         |    0
 .../lib/solver}/bicgstab_complex_bi.h         |    0
 {solver => src/lib/solver}/bicgstabell.c      |    0
 {solver => src/lib/solver}/bicgstabell.h      |    0
 {solver => src/lib/solver}/cg_her.c           |    0
 {solver => src/lib/solver}/cg_her.h           |    0
 {solver => src/lib/solver}/cg_her_bi.c        |    0
 {solver => src/lib/solver}/cg_her_bi.h        |    0
 {solver => src/lib/solver}/cg_her_nd.c        |    0
 {solver => src/lib/solver}/cg_her_nd.h        |    0
 {solver => src/lib/solver}/cg_mms_tm.c        |    0
 {solver => src/lib/solver}/cg_mms_tm.h        |    0
 {solver => src/lib/solver}/cg_mms_tm_nd.c     |    0
 {solver => src/lib/solver}/cg_mms_tm_nd.h     |    0
 {solver => src/lib/solver}/cgne4complex.c     |    0
 {solver => src/lib/solver}/cgne4complex.h     |    0
 {solver => src/lib/solver}/cgs_real.c         |    0
 {solver => src/lib/solver}/cgs_real.h         |    0
 {solver => src/lib/solver}/chrono_guess.c     |    0
 {solver => src/lib/solver}/chrono_guess.h     |    0
 {solver => src/lib/solver}/cr.c               |    0
 {solver => src/lib/solver}/cr.h               |    0
 {solver => src/lib/solver}/dfl_projector.c    |    0
 {solver => src/lib/solver}/dfl_projector.h    |    0
 .../lib/solver}/diagonalise_general_matrix.c  |    0
 .../lib/solver}/diagonalise_general_matrix.h  |    0
 .../lib/solver}/dirac_operator_eigenvectors.c |    0
 .../lib/solver}/dirac_operator_eigenvectors.h |    0
 {solver => src/lib/solver}/eigcg.c            |    0
 {solver => src/lib/solver}/eigcg.h            |    0
 {solver => src/lib/solver}/eigenvalues.c      |    0
 {solver => src/lib/solver}/eigenvalues.h      |    0
 {solver => src/lib/solver}/eigenvalues_bi.c   |    0
 {solver => src/lib/solver}/eigenvalues_bi.h   |    0
 {solver => src/lib/solver}/fgmres.c           |    0
 {solver => src/lib/solver}/fgmres.h           |    0
 {solver => src/lib/solver}/fgmres4complex.c   |    0
 {solver => src/lib/solver}/fgmres4complex.h   |    0
 .../lib/solver}/fgmres4complex_body.c         |    0
 {solver => src/lib/solver}/gcr.c              |    0
 {solver => src/lib/solver}/gcr.h              |    0
 {solver => src/lib/solver}/gcr4complex.c      |    0
 {solver => src/lib/solver}/gcr4complex.h      |    0
 {solver => src/lib/solver}/gcr4complex_body.c |    0
 {solver => src/lib/solver}/gcr4complex_body.h |    0
 .../lib/solver}/generate_dfl_subspace.c       |    0
 .../lib/solver}/generate_dfl_subspace.h       |    0
 {solver => src/lib/solver}/gmres.c            |    0
 {solver => src/lib/solver}/gmres.h            |    0
 {solver => src/lib/solver}/gmres_dr.c         |    0
 {solver => src/lib/solver}/gmres_dr.h         |    0
 {solver => src/lib/solver}/gmres_precon.c     |    0
 {solver => src/lib/solver}/gmres_precon.h     |    0
 {solver => src/lib/solver}/gram-schmidt.c     |    0
 {solver => src/lib/solver}/gram-schmidt.h     |    0
 {solver => src/lib/solver}/incr_eigcg.c       |    0
 {solver => src/lib/solver}/incr_eigcg.h       |    0
 {solver => src/lib/solver}/index_jd.c         |    0
 {solver => src/lib/solver}/index_jd.h         |    0
 {solver => src/lib/solver}/init_guess.c       |    0
 {solver => src/lib/solver}/init_guess.h       |    0
 {solver => src/lib/solver}/jdher.c            |    0
 {solver => src/lib/solver}/jdher.h            |    0
 {solver => src/lib/solver}/jdher_bi.c         |    0
 {solver => src/lib/solver}/jdher_bi.h         |    0
 .../lib/solver}/little_mg_precon_body.c       |    0
 .../lib/solver}/little_project_eo_body.c      |    0
 {solver => src/lib/solver}/lu_solve.c         |    0
 {solver => src/lib/solver}/lu_solve.h         |    0
 .../lib/solver}/matrix_mult_typedef.h         |    0
 .../lib/solver}/matrix_mult_typedef_bi.h      |    0
 .../lib/solver}/matrix_mult_typedef_nd.h      |    0
 {solver => src/lib/solver}/mcr.c              |    0
 {solver => src/lib/solver}/mcr.h              |    0
 {solver => src/lib/solver}/mcr4complex.c      |    0
 {solver => src/lib/solver}/mcr4complex.h      |    0
 {solver => src/lib/solver}/mixed_cg_her.c     |    0
 {solver => src/lib/solver}/mixed_cg_her.h     |    0
 .../lib/solver}/mixed_cg_mms_tm_nd.c          |    0
 .../lib/solver}/mixed_cg_mms_tm_nd.h          |    0
 {solver => src/lib/solver}/monomial_solve.c   |    0
 {solver => src/lib/solver}/monomial_solve.h   |    0
 {solver => src/lib/solver}/mr.c               |    0
 {solver => src/lib/solver}/mr.h               |    0
 {solver => src/lib/solver}/mr4complex.c       |    0
 {solver => src/lib/solver}/mr4complex.h       |    0
 {solver => src/lib/solver}/mrblk_body.c       |    0
 {solver => src/lib/solver}/ortho.c            |    0
 {solver => src/lib/solver}/ortho.h            |    0
 {solver => src/lib/solver}/pcg_her.c          |    0
 {solver => src/lib/solver}/pcg_her.h          |    0
 {solver => src/lib/solver}/poly_precon.c      |    0
 {solver => src/lib/solver}/poly_precon.h      |    0
 {solver => src/lib/solver}/quicksort.c        |    0
 {solver => src/lib/solver}/quicksort.h        |    0
 {solver => src/lib/solver}/restart_X.c        |    0
 {solver => src/lib/solver}/restart_X.h        |    0
 {solver => src/lib/solver}/rg_mixed_cg_her.c  |    0
 {solver => src/lib/solver}/rg_mixed_cg_her.h  |    0
 .../lib/solver}/rg_mixed_cg_her_nd.c          |    0
 .../lib/solver}/rg_mixed_cg_her_nd.h          |    0
 .../lib/solver}/rg_mixed_cg_typedef.h         |    0
 {solver => src/lib/solver}/solver.h           |    0
 {solver => src/lib/solver}/solver_field.c     |    0
 {solver => src/lib/solver}/solver_field.h     |    0
 {solver => src/lib/solver}/solver_params.h    |    0
 {solver => src/lib/solver}/solver_types.c     |    0
 {solver => src/lib/solver}/solver_types.h     |    0
 {solver => src/lib/solver}/sub_low_ev.c       |    0
 {solver => src/lib/solver}/sub_low_ev.h       |    0
 {solver => src/lib/solver}/sumr.c             |    0
 {solver => src/lib/solver}/sumr.h             |    0
 .../lib/source_generation.c                   |    0
 .../lib/source_generation.h                   |    0
 spinor_fft.c => src/lib/spinor_fft.c          |    0
 spinor_fft.h => src/lib/spinor_fft.h          |    0
 start.c => src/lib/start.c                    |    0
 start.h => src/lib/start.h                    |    0
 .../lib/struct_accessors.h                    |    0
 su3.h => src/lib/su3.h                        |    0
 su3adj.h => src/lib/su3adj.h                  |    0
 su3spinor.h => src/lib/su3spinor.h            |    0
 tensors.h => src/lib/tensors.h                |    0
 {test => src/lib/test}/Makefile               |    0
 {test => src/lib/test}/check_geometry.c       |    0
 {test => src/lib/test}/check_geometry.h       |    0
 {test => src/lib/test}/check_nan.c            |    0
 {test => src/lib/test}/check_nan.h            |    0
 {test => src/lib/test}/check_overlap.c        |    0
 {test => src/lib/test}/check_xchange.c        |    0
 {test => src/lib/test}/hopping_test.README    |    0
 .../lib/test}/hopping_test.input.compare      |    0
 {test => src/lib/test}/hopping_test.input.new |    0
 .../lib/test}/hopping_test.input.start        |    0
 .../lib/test}/hopping_test_generate_script    |    0
 {test => src/lib/test}/hopping_test_qscript   |    0
 .../lib/test}/measure_rectangles.debug.c      |    0
 {test => src/lib/test}/overlaptests.c         |    0
 {test => src/lib/test}/overlaptests.h         |    0
 {test => src/lib/test}/qdran64.h              |    0
 .../lib/tm_debug_printf.c                     |    0
 .../lib/tm_debug_printf.h                     |    0
 .../lib/update_backward_gauge.c               |    0
 .../lib/update_backward_gauge.h               |    0
 update_gauge.c => src/lib/update_gauge.c      |    0
 update_gauge.h => src/lib/update_gauge.h      |    0
 update_momenta.c => src/lib/update_momenta.c  |    0
 update_momenta.h => src/lib/update_momenta.h  |    0
 .../lib/update_momenta_fg.c                   |    0
 .../lib/update_momenta_fg.h                   |    0
 update_tm.c => src/lib/update_tm.c            |    0
 update_tm.h => src/lib/update_tm.h            |    0
 {util => src/lib/util}/io.c                   |    0
 {util => src/lib/util}/io.h                   |    0
 {util => src/lib/util}/laguer/Makefile        |    0
 {util => src/lib/util}/laguer/chebyRoot.C     |    0
 {util => src/lib/util}/laguer/chebyRoot.H     |    0
 {util => src/lib/util}/laguer/laguer.c        |    0
 {util => src/lib/util}/laguer/quadroptRoot.C  |    0
 {util => src/lib/util}/oox/Makefile           |    0
 {util => src/lib/util}/oox/oox.c              |    0
 {util => src/lib/util}/oox/oox_gawrapper.cxx  |    0
 {util => src/lib/util}/oox/oox_gawrapper.h    |    0
 {util => src/lib/util}/swapendian.c           |    0
 {util => src/lib/util}/tmlqcd-indent          |    0
 {wrapper => src/lib/wrapper}/Makefile.in      |    0
 {wrapper => src/lib/wrapper}/lib_wrapper.c    |    0
 {xchange => src/lib/xchange}/Makefile.in      |    0
 .../lib/xchange}/little_field_gather.c        |    0
 .../lib/xchange}/little_field_gather.h        |    0
 .../lib/xchange}/little_field_gather_body.c   |    0
 {xchange => src/lib/xchange}/xchange.h        |    0
 .../lib/xchange}/xchange_2fields.c            |    0
 .../lib/xchange}/xchange_2fields.h            |    0
 {xchange => src/lib/xchange}/xchange_deri.c   |    0
 {xchange => src/lib/xchange}/xchange_deri.h   |    0
 {xchange => src/lib/xchange}/xchange_field.c  |    0
 {xchange => src/lib/xchange}/xchange_field.h  |    0
 {xchange => src/lib/xchange}/xchange_gauge.c  |    0
 {xchange => src/lib/xchange}/xchange_gauge.h  |    0
 .../lib/xchange}/xchange_halffield.c          |    0
 .../lib/xchange}/xchange_halffield.h          |    0
 .../lib/xchange}/xchange_lexicfield.c         |    0
 .../lib/xchange}/xchange_lexicfield.h         |    0
 708 files changed, 4614 insertions(+), 240 deletions(-)
 create mode 100644 CMakeLists.txt
 create mode 100644 cmake/FindCLime.cmake
 create mode 100644 cmake/FindLemon.cmake
 create mode 100644 cmake/git_hash.h.in
 rename {include => cmake}/tmlqcd_config_internal.h.in (56%)
 create mode 100644 cmake_includes.txt
 delete mode 100644 io/Makefile.in
 create mode 100644 src/bin/CMakeLists.txt
 rename LapH_ev.c => src/bin/LapH_ev.c (100%)
 rename benchmark.c => src/bin/benchmark.c (100%)
 rename check_locallity.c => src/bin/check_locallity.c (99%)
 rename deriv_mg_tune.c => src/bin/deriv_mg_tune.c (100%)
 rename gen_sources.c => src/bin/gen_sources.c (100%)
 rename hmc_tm.c => src/bin/hmc_tm.c (100%)
 rename hopping_test.c => src/bin/hopping_test.c (100%)
 rename invert.c => src/bin/invert.c (100%)
 rename {util => src/bin}/main_ildg2uk.c (100%)
 rename offline_measurement.c => src/bin/offline_measurement.c (100%)
 rename qphix_test_Dslash.c => src/bin/qphix_test_Dslash.c (100%)
 rename {test => src/bin}/scalar_prod_r_test.c (100%)
 rename {test => src/bin}/test_eigenvalues.c (100%)
 rename test_lemon.c => src/bin/test_lemon.c (100%)
 create mode 100644 src/lib/CMakeLists.txt
 rename DDalphaAMG_interface.c => src/lib/DDalphaAMG_interface.c (100%)
 rename DDalphaAMG_interface.h => src/lib/DDalphaAMG_interface.h (100%)
 rename Ptilde_nd.c => src/lib/Ptilde_nd.c (100%)
 rename Ptilde_nd.h => src/lib/Ptilde_nd.h (100%)
 rename aligned_malloc.c => src/lib/aligned_malloc.c (100%)
 rename aligned_malloc.h => src/lib/aligned_malloc.h (100%)
 rename block.c => src/lib/block.c (100%)
 rename block.h => src/lib/block.h (100%)
 rename boundary.c => src/lib/boundary.c (100%)
 rename boundary.h => src/lib/boundary.h (100%)
 rename {buffers => src/lib/buffers}/Makefile.in (100%)
 rename {buffers => src/lib/buffers}/gauge.c (100%)
 rename {buffers => src/lib/buffers}/gauge.h (100%)
 rename {buffers => src/lib/buffers}/gauge.ih (100%)
 rename {buffers => src/lib/buffers}/gauge_allocate_gauge_buffers.c (100%)
 rename {buffers => src/lib/buffers}/gauge_finalize_gauge_buffers.c (100%)
 rename {buffers => src/lib/buffers}/gauge_free_unused_gauge_buffers.c (100%)
 rename {buffers => src/lib/buffers}/gauge_get_gauge_field.c (100%)
 rename {buffers => src/lib/buffers}/gauge_get_gauge_field_array.c (100%)
 rename {buffers => src/lib/buffers}/gauge_initialize_gauge_buffers.c (100%)
 rename {buffers => src/lib/buffers}/gauge_return_gauge_field.c (100%)
 rename {buffers => src/lib/buffers}/gauge_return_gauge_field_array.c (100%)
 rename {buffers => src/lib/buffers}/utils.h (100%)
 rename {buffers => src/lib/buffers}/utils.ih (100%)
 rename {buffers => src/lib/buffers}/utils_generic_exchange.blocking.inc (100%)
 rename {buffers => src/lib/buffers}/utils_generic_exchange.c (100%)
 rename {buffers => src/lib/buffers}/utils_generic_exchange.nonblocking.inc (100%)
 rename chebyshev_polynomial.c => src/lib/chebyshev_polynomial.c (98%)
 rename chebyshev_polynomial.h => src/lib/chebyshev_polynomial.h (100%)
 rename chebyshev_polynomial_nd.c => src/lib/chebyshev_polynomial_nd.c (100%)
 rename chebyshev_polynomial_nd.h => src/lib/chebyshev_polynomial_nd.h (100%)
 rename clenshaw_coef.c => src/lib/clenshaw_coef.c (100%)
 rename clenshaw_coef.h => src/lib/clenshaw_coef.h (100%)
 rename compare_derivative.c => src/lib/compare_derivative.c (100%)
 rename compare_derivative.h => src/lib/compare_derivative.h (100%)
 rename {cu => src/lib/cu}/COPYING (100%)
 rename {cu => src/lib/cu}/COPYING.LESSER (100%)
 rename {cu => src/lib/cu}/Makefile.in (100%)
 rename {cu => src/lib/cu}/check-regressions (100%)
 rename {cu => src/lib/cu}/cu.c (100%)
 rename {cu => src/lib/cu}/cu.h (100%)
 rename default_input_values.h => src/lib/default_input_values.h (100%)
 rename deriv_Sb.c => src/lib/deriv_Sb.c (100%)
 rename deriv_Sb.h => src/lib/deriv_Sb.h (100%)
 rename deriv_Sb_D_psi.c => src/lib/deriv_Sb_D_psi.c (100%)
 rename deriv_Sb_D_psi.h => src/lib/deriv_Sb_D_psi.h (100%)
 rename expo.c => src/lib/expo.c (100%)
 rename expo.h => src/lib/expo.h (100%)
 rename fatal_error.c => src/lib/fatal_error.c (100%)
 rename fatal_error.h => src/lib/fatal_error.h (100%)
 rename gamma.c => src/lib/gamma.c (100%)
 rename gamma.h => src/lib/gamma.h (100%)
 rename geometry_eo.c => src/lib/geometry_eo.c (100%)
 rename geometry_eo.h => src/lib/geometry_eo.h (100%)
 rename get_rectangle_staples.c => src/lib/get_rectangle_staples.c (100%)
 rename get_rectangle_staples.h => src/lib/get_rectangle_staples.h (100%)
 rename get_staples.c => src/lib/get_staples.c (100%)
 rename get_staples.h => src/lib/get_staples.h (100%)
 rename getopt.c => src/lib/getopt.c (100%)
 rename getopt.h => src/lib/getopt.h (100%)
 rename gettime.c => src/lib/gettime.c (100%)
 rename gettime.h => src/lib/gettime.h (100%)
 rename global.h => src/lib/global.h (100%)
 rename hamiltonian_field.h => src/lib/hamiltonian_field.h (100%)
 rename {include => src/lib/include}/tmLQCD.h (100%)
 rename {include => src/lib/include}/tmlqcd_config.h (100%)
 rename {init => src/lib/init}/Makefile.in (100%)
 rename {init => src/lib/init}/init.h (100%)
 rename {init => src/lib/init}/init_bispinor_field.c (100%)
 rename {init => src/lib/init}/init_bispinor_field.h (100%)
 rename {init => src/lib/init}/init_chi_spinor_field.c (100%)
 rename {init => src/lib/init}/init_chi_spinor_field.h (100%)
 rename {init => src/lib/init}/init_critical_globals.c (100%)
 rename {init => src/lib/init}/init_critical_globals.h (100%)
 rename {init => src/lib/init}/init_dirac_halfspinor.c (100%)
 rename {init => src/lib/init}/init_dirac_halfspinor.h (100%)
 rename {init => src/lib/init}/init_gauge_fg.c (100%)
 rename {init => src/lib/init}/init_gauge_fg.h (100%)
 rename {init => src/lib/init}/init_gauge_field.c (100%)
 rename {init => src/lib/init}/init_gauge_field.h (100%)
 rename {init => src/lib/init}/init_gauge_tmp.c (100%)
 rename {init => src/lib/init}/init_gauge_tmp.h (100%)
 rename {init => src/lib/init}/init_geometry_indices.c (100%)
 rename {init => src/lib/init}/init_geometry_indices.h (100%)
 rename {init => src/lib/init}/init_global_states.c (100%)
 rename {init => src/lib/init}/init_global_states.h (100%)
 rename {init => src/lib/init}/init_moment_field.c (100%)
 rename {init => src/lib/init}/init_moment_field.h (100%)
 rename {init => src/lib/init}/init_omp_accumulators.c (100%)
 rename {init => src/lib/init}/init_omp_accumulators.h (100%)
 rename {init => src/lib/init}/init_openmp.c (100%)
 rename {init => src/lib/init}/init_openmp.h (100%)
 rename {init => src/lib/init}/init_parallel.c (100%)
 rename {init => src/lib/init}/init_parallel.h (100%)
 rename {init => src/lib/init}/init_spinor_field.c (100%)
 rename {init => src/lib/init}/init_spinor_field.h (100%)
 rename {init => src/lib/init}/init_stout_smear_vars.c (100%)
 rename {init => src/lib/init}/init_stout_smear_vars.h (100%)
 rename integrator.c => src/lib/integrator.c (100%)
 rename integrator.h => src/lib/integrator.h (100%)
 rename invert_clover_eo.c => src/lib/invert_clover_eo.c (100%)
 rename invert_clover_eo.h => src/lib/invert_clover_eo.h (100%)
 rename invert_doublet_eo.c => src/lib/invert_doublet_eo.c (100%)
 rename invert_doublet_eo.h => src/lib/invert_doublet_eo.h (100%)
 rename invert_eo.c => src/lib/invert_eo.c (99%)
 rename invert_eo.h => src/lib/invert_eo.h (100%)
 rename invert_overlap.c => src/lib/invert_overlap.c (100%)
 rename invert_overlap.h => src/lib/invert_overlap.h (100%)
 rename {io => src/lib/io}/DML_crc32.c (100%)
 rename {io => src/lib/io}/deri_write_stdout.c (100%)
 rename {io => src/lib/io}/deri_write_stdout.h (100%)
 rename {io => src/lib/io}/dml.c (100%)
 rename {io => src/lib/io}/dml.h (100%)
 rename {io => src/lib/io}/eospinor.h (100%)
 rename {io => src/lib/io}/eospinor.ih (100%)
 rename {io => src/lib/io}/eospinor_read.c (100%)
 rename {io => src/lib/io}/eospinor_write.c (100%)
 rename {io => src/lib/io}/gauge.h (100%)
 rename {io => src/lib/io}/gauge.ih (100%)
 rename {io => src/lib/io}/gauge_read.c (100%)
 rename {io => src/lib/io}/gauge_read_binary.c (100%)
 rename {io => src/lib/io}/gauge_write.c (100%)
 rename {io => src/lib/io}/gauge_write_binary.c (100%)
 rename {io => src/lib/io}/gauge_write_luscher_binary.c (100%)
 rename {io => src/lib/io}/gauge_write_luscher_binary.h (100%)
 rename {io => src/lib/io}/io_cm.c (100%)
 rename {io => src/lib/io}/io_cm.h (100%)
 rename {io => src/lib/io}/params.h (100%)
 rename {io => src/lib/io}/params.ih (100%)
 rename {io => src/lib/io}/params_construct_InverterInfo.c (100%)
 rename {io => src/lib/io}/params_construct_ildgFormat.c (100%)
 rename {io => src/lib/io}/params_construct_propagatorFormat.c (100%)
 rename {io => src/lib/io}/params_construct_sourceFormat.c (100%)
 rename {io => src/lib/io}/params_construct_xlfInfo.c (100%)
 rename {io => src/lib/io}/selector.h (100%)
 rename {io => src/lib/io}/spinor.h (100%)
 rename {io => src/lib/io}/spinor.ih (100%)
 rename {io => src/lib/io}/spinor_read.c (100%)
 rename {io => src/lib/io}/spinor_read_binary.c (100%)
 rename {io => src/lib/io}/spinor_write.c (100%)
 rename {io => src/lib/io}/spinor_write_binary.c (100%)
 rename {io => src/lib/io}/spinor_write_info.c (100%)
 rename {io => src/lib/io}/spinor_write_propagator_format.c (100%)
 rename {io => src/lib/io}/spinor_write_propagator_type.c (100%)
 rename {io => src/lib/io}/spinor_write_source_format.c (100%)
 rename {io => src/lib/io}/spinor_write_stdout.c (100%)
 rename {io => src/lib/io}/spinor_write_stdout.h (100%)
 rename {io => src/lib/io}/sw_write_stdout.c (100%)
 rename {io => src/lib/io}/sw_write_stdout.h (100%)
 rename {io => src/lib/io}/utils.c (100%)
 rename {io => src/lib/io}/utils.h (99%)
 rename {io => src/lib/io}/utils.ih (96%)
 rename {io => src/lib/io}/utils_close_reader_record.c (100%)
 rename {io => src/lib/io}/utils_close_writer_record.c (100%)
 rename {io => src/lib/io}/utils_construct_reader.c (97%)
 rename {io => src/lib/io}/utils_construct_writer.c (100%)
 rename {io => src/lib/io}/utils_destruct_reader.c (100%)
 rename {io => src/lib/io}/utils_destruct_writer.c (100%)
 rename {io => src/lib/io}/utils_engineering.c (100%)
 rename {io => src/lib/io}/utils_kill_with_error.c (100%)
 rename {io => src/lib/io}/utils_parse_checksum_xml.c (100%)
 rename {io => src/lib/io}/utils_parse_ildgformat_xml.c (100%)
 rename {io => src/lib/io}/utils_parse_propagator_type.c (100%)
 rename {io => src/lib/io}/utils_read_message.c (100%)
 rename {io => src/lib/io}/utils_write_checksum.c (100%)
 rename {io => src/lib/io}/utils_write_first_message.c (100%)
 rename {io => src/lib/io}/utils_write_header.c (100%)
 rename {io => src/lib/io}/utils_write_ildg_format.c (100%)
 rename {io => src/lib/io}/utils_write_inverter_info.c (100%)
 rename {io => src/lib/io}/utils_write_message.c (100%)
 rename {io => src/lib/io}/utils_write_xlf.c (100%)
 rename {io => src/lib/io}/utils_write_xlf_xml.c (100%)
 rename kahan_summation.h => src/lib/kahan_summation.h (100%)
 rename {linalg => src/lib/linalg}/Makefile.in (100%)
 rename {linalg => src/lib/linalg}/add.c (100%)
 rename {linalg => src/lib/linalg}/add.h (100%)
 rename {linalg => src/lib/linalg}/addto_32.c (100%)
 rename {linalg => src/lib/linalg}/addto_32.h (100%)
 rename {linalg => src/lib/linalg}/assign.c (100%)
 rename {linalg => src/lib/linalg}/assign.h (100%)
 rename {linalg => src/lib/linalg}/assign_add_mul.c (100%)
 rename {linalg => src/lib/linalg}/assign_add_mul.h (100%)
 rename {linalg => src/lib/linalg}/assign_add_mul_add_mul.c (100%)
 rename {linalg => src/lib/linalg}/assign_add_mul_add_mul.h (100%)
 rename {linalg => src/lib/linalg}/assign_add_mul_add_mul_r.c (100%)
 rename {linalg => src/lib/linalg}/assign_add_mul_add_mul_r.h (100%)
 rename {linalg => src/lib/linalg}/assign_add_mul_body.c (100%)
 rename {linalg => src/lib/linalg}/assign_add_mul_r.c (100%)
 rename {linalg => src/lib/linalg}/assign_add_mul_r.h (100%)
 rename {linalg => src/lib/linalg}/assign_add_mul_r_32.c (93%)
 rename {linalg => src/lib/linalg}/assign_add_mul_r_32.h (100%)
 rename {linalg => src/lib/linalg}/assign_add_mul_r_add_mul.c (100%)
 rename {linalg => src/lib/linalg}/assign_add_mul_r_add_mul.h (100%)
 rename {linalg => src/lib/linalg}/assign_diff_mul.c (100%)
 rename {linalg => src/lib/linalg}/assign_diff_mul.h (100%)
 rename {linalg => src/lib/linalg}/assign_mul_add.c (100%)
 rename {linalg => src/lib/linalg}/assign_mul_add.h (100%)
 rename {linalg => src/lib/linalg}/assign_mul_add_mul.c (100%)
 rename {linalg => src/lib/linalg}/assign_mul_add_mul.h (100%)
 rename {linalg => src/lib/linalg}/assign_mul_add_mul_add_mul_add_mul_r.c (100%)
 rename {linalg => src/lib/linalg}/assign_mul_add_mul_add_mul_add_mul_r.h (100%)
 rename {linalg => src/lib/linalg}/assign_mul_add_mul_add_mul_r.c (100%)
 rename {linalg => src/lib/linalg}/assign_mul_add_mul_add_mul_r.h (100%)
 rename {linalg => src/lib/linalg}/assign_mul_add_mul_r.c (100%)
 rename {linalg => src/lib/linalg}/assign_mul_add_mul_r.h (100%)
 rename {linalg => src/lib/linalg}/assign_mul_add_mul_r_32.c (100%)
 rename {linalg => src/lib/linalg}/assign_mul_add_mul_r_32.h (100%)
 rename {linalg => src/lib/linalg}/assign_mul_add_r.c (100%)
 rename {linalg => src/lib/linalg}/assign_mul_add_r.h (100%)
 rename {linalg => src/lib/linalg}/assign_mul_add_r_32.c (100%)
 rename {linalg => src/lib/linalg}/assign_mul_add_r_32.h (100%)
 rename {linalg => src/lib/linalg}/assign_mul_add_r_and_square.c (100%)
 rename {linalg => src/lib/linalg}/assign_mul_add_r_and_square.h (100%)
 rename {linalg => src/lib/linalg}/assign_mul_bra_add_mul_ket_add.c (100%)
 rename {linalg => src/lib/linalg}/assign_mul_bra_add_mul_ket_add.h (100%)
 rename {linalg => src/lib/linalg}/assign_mul_bra_add_mul_ket_add_r.c (100%)
 rename {linalg => src/lib/linalg}/assign_mul_bra_add_mul_ket_add_r.h (100%)
 rename {linalg => src/lib/linalg}/assign_mul_bra_add_mul_r.c (100%)
 rename {linalg => src/lib/linalg}/assign_mul_bra_add_mul_r.h (100%)
 rename {linalg => src/lib/linalg}/assign_to_32.c (100%)
 rename {linalg => src/lib/linalg}/assign_to_32.h (100%)
 rename {linalg => src/lib/linalg}/blas.h (100%)
 rename {linalg => src/lib/linalg}/comp_decomp.c (100%)
 rename {linalg => src/lib/linalg}/comp_decomp.h (100%)
 rename {linalg => src/lib/linalg}/convert_eo_to_lexic.c (100%)
 rename {linalg => src/lib/linalg}/convert_eo_to_lexic.h (100%)
 rename {linalg => src/lib/linalg}/convert_even_to_lexic.c (100%)
 rename {linalg => src/lib/linalg}/convert_even_to_lexic.h (100%)
 rename {linalg => src/lib/linalg}/convert_odd_to_lexic.c (100%)
 rename {linalg => src/lib/linalg}/convert_odd_to_lexic.h (100%)
 rename {linalg => src/lib/linalg}/diff.c (100%)
 rename {linalg => src/lib/linalg}/diff.h (100%)
 rename {linalg => src/lib/linalg}/diff_32.c (100%)
 rename {linalg => src/lib/linalg}/diff_32.h (100%)
 rename {linalg => src/lib/linalg}/diff_and_square_norm.c (100%)
 rename {linalg => src/lib/linalg}/diff_and_square_norm.h (100%)
 rename {linalg => src/lib/linalg}/fortran.h (100%)
 rename {linalg => src/lib/linalg}/lapack.h (100%)
 rename {linalg => src/lib/linalg}/map_to_blas.h (100%)
 rename {linalg => src/lib/linalg}/mattimesvec.c (100%)
 rename {linalg => src/lib/linalg}/mattimesvec.h (100%)
 rename {linalg => src/lib/linalg}/mul.c (100%)
 rename {linalg => src/lib/linalg}/mul.h (100%)
 rename {linalg => src/lib/linalg}/mul_add_mul.c (100%)
 rename {linalg => src/lib/linalg}/mul_add_mul.h (100%)
 rename {linalg => src/lib/linalg}/mul_add_mul_r.c (100%)
 rename {linalg => src/lib/linalg}/mul_add_mul_r.h (100%)
 rename {linalg => src/lib/linalg}/mul_diff_mul.c (100%)
 rename {linalg => src/lib/linalg}/mul_diff_mul.h (100%)
 rename {linalg => src/lib/linalg}/mul_diff_mul_r.c (100%)
 rename {linalg => src/lib/linalg}/mul_diff_mul_r.h (100%)
 rename {linalg => src/lib/linalg}/mul_diff_r.c (100%)
 rename {linalg => src/lib/linalg}/mul_diff_r.h (100%)
 rename {linalg => src/lib/linalg}/mul_gamma5.c (100%)
 rename {linalg => src/lib/linalg}/mul_gamma5.h (100%)
 rename {linalg => src/lib/linalg}/mul_r.c (100%)
 rename {linalg => src/lib/linalg}/mul_r.h (100%)
 rename {linalg => src/lib/linalg}/mul_r_32.c (100%)
 rename {linalg => src/lib/linalg}/mul_r_32.h (100%)
 rename {linalg => src/lib/linalg}/mul_r_gamma5.c (100%)
 rename {linalg => src/lib/linalg}/mul_r_gamma5.h (100%)
 rename {linalg => src/lib/linalg}/print_spinor.c (100%)
 rename {linalg => src/lib/linalg}/print_spinor.h (100%)
 rename {linalg => src/lib/linalg}/print_spinor_similar_components.c (100%)
 rename {linalg => src/lib/linalg}/print_spinor_similar_components.h (100%)
 rename {linalg => src/lib/linalg}/ratio.c (100%)
 rename {linalg => src/lib/linalg}/ratio.h (100%)
 rename {linalg => src/lib/linalg}/scalar_prod.c (100%)
 rename {linalg => src/lib/linalg}/scalar_prod.h (100%)
 rename {linalg => src/lib/linalg}/scalar_prod_body.c (100%)
 rename {linalg => src/lib/linalg}/scalar_prod_i.c (100%)
 rename {linalg => src/lib/linalg}/scalar_prod_i.h (100%)
 rename {linalg => src/lib/linalg}/scalar_prod_r.c (100%)
 rename {linalg => src/lib/linalg}/scalar_prod_r.h (100%)
 rename {linalg => src/lib/linalg}/scalar_prod_r_32.c (100%)
 rename {linalg => src/lib/linalg}/scalar_prod_r_32.h (100%)
 rename {linalg => src/lib/linalg}/set_even_to_zero.c (100%)
 rename {linalg => src/lib/linalg}/set_even_to_zero.h (100%)
 rename {linalg => src/lib/linalg}/square_and_minmax.c (100%)
 rename {linalg => src/lib/linalg}/square_and_minmax.h (100%)
 rename {linalg => src/lib/linalg}/square_and_prod_r.c (100%)
 rename {linalg => src/lib/linalg}/square_and_prod_r.h (100%)
 rename {linalg => src/lib/linalg}/square_norm.c (100%)
 rename {linalg => src/lib/linalg}/square_norm.h (100%)
 rename {linalg => src/lib/linalg}/square_norm_32.c (100%)
 rename {linalg => src/lib/linalg}/square_norm_32.h (100%)
 rename linalg_eo.h => src/lib/linalg_eo.h (100%)
 rename little_D.c => src/lib/little_D.c (100%)
 rename little_D.h => src/lib/little_D.h (100%)
 rename little_D_body.c => src/lib/little_D_body.c (100%)
 rename matrix_utils.c => src/lib/matrix_utils.c (100%)
 rename matrix_utils.h => src/lib/matrix_utils.h (100%)
 rename {meas => src/lib/meas}/Makefile.in (100%)
 rename {meas => src/lib/meas}/correlators.c (100%)
 rename {meas => src/lib/meas}/correlators.h (100%)
 rename {meas => src/lib/meas}/field_strength_types.h (100%)
 rename {meas => src/lib/meas}/gradient_flow.c (100%)
 rename {meas => src/lib/meas}/gradient_flow.h (100%)
 rename {meas => src/lib/meas}/measure_clover_field_strength_observables.c (100%)
 rename {meas => src/lib/meas}/measure_clover_field_strength_observables.h (100%)
 rename {meas => src/lib/meas}/measurements.c (100%)
 rename {meas => src/lib/meas}/measurements.h (100%)
 rename {meas => src/lib/meas}/oriented_plaquettes.c (100%)
 rename {meas => src/lib/meas}/oriented_plaquettes.h (100%)
 rename {meas => src/lib/meas}/pion_norm.c (100%)
 rename {meas => src/lib/meas}/pion_norm.h (100%)
 rename {meas => src/lib/meas}/polyakov_loop.c (100%)
 rename {meas => src/lib/meas}/polyakov_loop.h (100%)
 rename measure_gauge_action.c => src/lib/measure_gauge_action.c (100%)
 rename measure_gauge_action.h => src/lib/measure_gauge_action.h (100%)
 rename measure_rectangles.c => src/lib/measure_rectangles.c (100%)
 rename measure_rectangles.h => src/lib/measure_rectangles.h (100%)
 rename misc_types.h => src/lib/misc_types.h (99%)
 rename {monomial => src/lib/monomial}/Makefile.in (100%)
 rename {monomial => src/lib/monomial}/clover_trlog_monomial.c (100%)
 rename {monomial => src/lib/monomial}/clover_trlog_monomial.h (100%)
 rename {monomial => src/lib/monomial}/cloverdet_monomial.c (100%)
 rename {monomial => src/lib/monomial}/cloverdet_monomial.h (100%)
 rename {monomial => src/lib/monomial}/cloverdetratio_monomial.c (100%)
 rename {monomial => src/lib/monomial}/cloverdetratio_monomial.h (100%)
 rename {monomial => src/lib/monomial}/cloverdetratio_rwmonomial.c (100%)
 rename {monomial => src/lib/monomial}/cloverdetratio_rwmonomial.h (100%)
 rename {monomial => src/lib/monomial}/clovernd_trlog_monomial.c (100%)
 rename {monomial => src/lib/monomial}/clovernd_trlog_monomial.h (100%)
 rename {monomial => src/lib/monomial}/cloverndpoly_monomial.c (100%)
 rename {monomial => src/lib/monomial}/cloverndpoly_monomial.h (100%)
 rename {monomial => src/lib/monomial}/det_monomial.c (100%)
 rename {monomial => src/lib/monomial}/det_monomial.h (100%)
 rename {monomial => src/lib/monomial}/detratio_monomial.c (100%)
 rename {monomial => src/lib/monomial}/detratio_monomial.h (100%)
 rename {monomial => src/lib/monomial}/gauge_monomial.c (100%)
 rename {monomial => src/lib/monomial}/gauge_monomial.h (100%)
 rename {monomial => src/lib/monomial}/moment_energy.c (100%)
 rename {monomial => src/lib/monomial}/moment_energy.h (100%)
 rename {monomial => src/lib/monomial}/monitor_forces.c (100%)
 rename {monomial => src/lib/monomial}/monitor_forces.h (100%)
 rename {monomial => src/lib/monomial}/monomial.c (100%)
 rename {monomial => src/lib/monomial}/monomial.h (100%)
 rename {monomial => src/lib/monomial}/nddetratio_monomial.c (100%)
 rename {monomial => src/lib/monomial}/nddetratio_monomial.h (100%)
 rename {monomial => src/lib/monomial}/ndpoly_monomial.c (100%)
 rename {monomial => src/lib/monomial}/ndpoly_monomial.h (100%)
 rename {monomial => src/lib/monomial}/ndrat_monomial.c (100%)
 rename {monomial => src/lib/monomial}/ndrat_monomial.h (100%)
 rename {monomial => src/lib/monomial}/ndratcor_monomial.c (100%)
 rename {monomial => src/lib/monomial}/ndratcor_monomial.h (100%)
 rename {monomial => src/lib/monomial}/poly_monomial.c (100%)
 rename {monomial => src/lib/monomial}/poly_monomial.h (100%)
 rename {monomial => src/lib/monomial}/rat_monomial.c (100%)
 rename {monomial => src/lib/monomial}/rat_monomial.h (100%)
 rename {monomial => src/lib/monomial}/ratcor_monomial.c (100%)
 rename {monomial => src/lib/monomial}/ratcor_monomial.h (100%)
 rename mpi_init.c => src/lib/mpi_init.c (100%)
 rename mpi_init.h => src/lib/mpi_init.h (100%)
 rename omp_accumulator.h => src/lib/omp_accumulator.h (100%)
 rename operator.c => src/lib/operator.c (100%)
 rename operator.h => src/lib/operator.h (100%)
 rename {operator => src/lib/operator}/Block_D_psi_body.c (100%)
 rename {operator => src/lib/operator}/D_psi.c (100%)
 rename {operator => src/lib/operator}/D_psi.h (100%)
 rename {operator => src/lib/operator}/D_psi_body.c (100%)
 rename {operator => src/lib/operator}/Dov_proj.c (100%)
 rename {operator => src/lib/operator}/Dov_proj.h (100%)
 rename {operator => src/lib/operator}/Dov_psi.c (100%)
 rename {operator => src/lib/operator}/Dov_psi.h (100%)
 rename {operator => src/lib/operator}/Hopping_Matrix.c (100%)
 rename {operator => src/lib/operator}/Hopping_Matrix.h (100%)
 rename {operator => src/lib/operator}/Hopping_Matrix_32.c (100%)
 rename {operator => src/lib/operator}/Hopping_Matrix_32.h (100%)
 rename {operator => src/lib/operator}/Hopping_Matrix_32_nocom.c (100%)
 rename {operator => src/lib/operator}/Hopping_Matrix_nocom.c (100%)
 rename {operator => src/lib/operator}/Hopping_Matrix_nocom.h (100%)
 rename {operator => src/lib/operator}/Makefile.in (100%)
 rename {operator => src/lib/operator}/assign_mul_one_sw_pm_imu_inv_block_body.c (100%)
 rename {operator => src/lib/operator}/assign_mul_one_sw_pm_imu_site_lexic_body.c (100%)
 rename {operator => src/lib/operator}/clover_accumulate_deriv.c (100%)
 rename {operator => src/lib/operator}/clover_deriv.c (100%)
 rename {operator => src/lib/operator}/clover_det.c (100%)
 rename {operator => src/lib/operator}/clover_inline.h (100%)
 rename {operator => src/lib/operator}/clover_invert.c (100%)
 rename {operator => src/lib/operator}/clover_leaf.c (100%)
 rename {operator => src/lib/operator}/clover_leaf.h (100%)
 rename {operator => src/lib/operator}/clover_term.c (100%)
 rename {operator => src/lib/operator}/clovertm_operators.c (100%)
 rename {operator => src/lib/operator}/clovertm_operators.h (100%)
 rename {operator => src/lib/operator}/clovertm_operators_32.c (100%)
 rename {operator => src/lib/operator}/clovertm_operators_32.h (100%)
 rename {operator => src/lib/operator}/halfspinor_body.c (100%)
 rename {operator => src/lib/operator}/halfspinor_body_32.c (100%)
 rename {operator => src/lib/operator}/halfspinor_hopping.h (100%)
 rename {operator => src/lib/operator}/halfspinor_hopping_32.h (100%)
 rename {operator => src/lib/operator}/hopping.h (100%)
 rename {operator => src/lib/operator}/hopping_body_dbl.c (100%)
 rename {operator => src/lib/operator}/hopping_sgl.c (100%)
 rename {operator => src/lib/operator}/mul_one_pm_imu_inv_body.c (100%)
 rename {operator => src/lib/operator}/mul_one_pm_imu_sub_mul_body.c (100%)
 rename {operator => src/lib/operator}/tm_operators.c (100%)
 rename {operator => src/lib/operator}/tm_operators.h (100%)
 rename {operator => src/lib/operator}/tm_operators_32.c (100%)
 rename {operator => src/lib/operator}/tm_operators_32.h (100%)
 rename {operator => src/lib/operator}/tm_operators_nd.c (100%)
 rename {operator => src/lib/operator}/tm_operators_nd.h (100%)
 rename {operator => src/lib/operator}/tm_operators_nd_32.c (100%)
 rename {operator => src/lib/operator}/tm_operators_nd_32.h (100%)
 rename {operator => src/lib/operator}/tm_sub_Hopping_Matrix.c (100%)
 rename {operator => src/lib/operator}/tm_sub_Hopping_Matrix.h (100%)
 rename {operator => src/lib/operator}/tm_times_Hopping_Matrix.c (100%)
 rename {operator => src/lib/operator}/tm_times_Hopping_Matrix.h (100%)
 rename operator_types.h => src/lib/operator_types.h (100%)
 rename overrelaxation.c => src/lib/overrelaxation.c (99%)
 rename overrelaxation.h => src/lib/overrelaxation.h (100%)
 rename parallel_io.h => src/lib/parallel_io.h (100%)
 rename phmc.c => src/lib/phmc.c (100%)
 rename phmc.h => src/lib/phmc.h (100%)
 rename prepare_source.c => src/lib/prepare_source.c (100%)
 rename prepare_source.h => src/lib/prepare_source.h (100%)
 rename {profiling => src/lib/profiling}/hmc/Readme.md (100%)
 rename {profiling => src/lib/profiling}/hmc/example_profile.pdf (100%)
 rename {profiling => src/lib/profiling}/hmc/profile.Rmd (100%)
 rename {profiling => src/lib/profiling}/hmc/timing.R (100%)
 rename {profiling => src/lib/profiling}/hmc_mk2/.gitignore (100%)
 rename {profiling => src/lib/profiling}/hmc_mk2/README.md (100%)
 rename {profiling => src/lib/profiling}/hmc_mk2/logs/example_log.out (100%)
 rename {profiling => src/lib/profiling}/hmc_mk2/make_profile.R (100%)
 rename {profiling => src/lib/profiling}/hmc_mk2/profile.Rmd (100%)
 create mode 100644 src/lib/qphix/qphix_base_classes.hpp
 create mode 100644 src/lib/qphix/qphix_interface.cpp
 create mode 100644 src/lib/qphix/qphix_interface.hpp
 create mode 100644 src/lib/qphix/qphix_interface_utils.hpp
 rename qphix_interface.h => src/lib/qphix_interface.h (100%)
 rename qphix_types.h => src/lib/qphix_types.h (100%)
 rename qphix_veclen.h => src/lib/qphix_veclen.h (100%)
 rename quda_dummy_types.h => src/lib/quda_dummy_types.h (100%)
 create mode 100644 src/lib/quda_gauge_paths.inc
 rename quda_interface.c => src/lib/quda_interface.c (100%)
 rename quda_interface.h => src/lib/quda_interface.h (100%)
 rename quda_types.h => src/lib/quda_types.h (100%)
 rename ranlxd.c => src/lib/ranlxd.c (100%)
 rename ranlxd.h => src/lib/ranlxd.h (100%)
 rename ranlxs.c => src/lib/ranlxs.c (100%)
 rename ranlxs.h => src/lib/ranlxs.h (100%)
 rename {rational => src/lib/rational}/Makefile.in (100%)
 rename {rational => src/lib/rational}/elliptic.c (100%)
 rename {rational => src/lib/rational}/elliptic.h (100%)
 rename {rational => src/lib/rational}/rational.c (100%)
 rename {rational => src/lib/rational}/rational.h (100%)
 rename {rational => src/lib/rational}/zolotarev.c (100%)
 rename {rational => src/lib/rational}/zolotarev.h (100%)
 rename read_input.h => src/lib/read_input.h (100%)
 rename read_input.l => src/lib/read_input.l (100%)
 rename reweighting_factor.c => src/lib/reweighting_factor.c (100%)
 rename reweighting_factor.h => src/lib/reweighting_factor.h (100%)
 rename reweighting_factor_nd.c => src/lib/reweighting_factor_nd.c (100%)
 rename reweighting_factor_nd.h => src/lib/reweighting_factor_nd.h (100%)
 rename rnd_gauge_trafo.c => src/lib/rnd_gauge_trafo.c (100%)
 rename rnd_gauge_trafo.h => src/lib/rnd_gauge_trafo.h (100%)
 rename sighandler.c => src/lib/sighandler.c (100%)
 rename sighandler.h => src/lib/sighandler.h (100%)
 rename {smearing => src/lib/smearing}/Makefile.in (100%)
 rename {smearing => src/lib/smearing}/ape.h (100%)
 rename {smearing => src/lib/smearing}/ape.ih (100%)
 rename {smearing => src/lib/smearing}/ape_ape_smear.c (100%)
 rename {smearing => src/lib/smearing}/hex.h (100%)
 rename {smearing => src/lib/smearing}/hex.ih (100%)
 rename {smearing => src/lib/smearing}/hex_hex_smear.c (100%)
 rename {smearing => src/lib/smearing}/hex_stout_exclude_none.c (100%)
 rename {smearing => src/lib/smearing}/hex_stout_exclude_one.c (100%)
 rename {smearing => src/lib/smearing}/hex_stout_exclude_two.c (100%)
 rename {smearing => src/lib/smearing}/hyp.h (100%)
 rename {smearing => src/lib/smearing}/hyp.ih (100%)
 rename {smearing => src/lib/smearing}/hyp_APE_project_exclude_none.c (100%)
 rename {smearing => src/lib/smearing}/hyp_APE_project_exclude_one.c (100%)
 rename {smearing => src/lib/smearing}/hyp_APE_project_exclude_two.c (100%)
 rename {smearing => src/lib/smearing}/hyp_hyp_smear.c (100%)
 rename {smearing => src/lib/smearing}/hyp_hyp_staples_exclude_none.c (100%)
 rename {smearing => src/lib/smearing}/hyp_hyp_staples_exclude_one.c (100%)
 rename {smearing => src/lib/smearing}/hyp_hyp_staples_exclude_two.c (100%)
 rename {smearing => src/lib/smearing}/stout.h (100%)
 rename {smearing => src/lib/smearing}/stout.ih (100%)
 rename {smearing => src/lib/smearing}/stout_stout_smear.c (100%)
 rename {smearing => src/lib/smearing}/uils_print_config_to_screen.c (100%)
 rename {smearing => src/lib/smearing}/utils.h (100%)
 rename {smearing => src/lib/smearing}/utils.ih (100%)
 rename {smearing => src/lib/smearing}/utils_generic_staples.c (100%)
 rename {smearing => src/lib/smearing}/utils_print_config_to_screen.c (100%)
 rename {smearing => src/lib/smearing}/utils_print_su3.c (100%)
 rename {smearing => src/lib/smearing}/utils_project_antiherm.c (100%)
 rename {smearing => src/lib/smearing}/utils_project_herm.c (100%)
 rename {smearing => src/lib/smearing}/utils_reunitarize.c (100%)
 rename {smearing => src/lib/smearing}/utils_reunitarize_MILC.c (88%)
 rename {solver => src/lib/solver}/M_plus_block_psi_body.c (100%)
 rename {solver => src/lib/solver}/Makefile.in (100%)
 rename {solver => src/lib/solver}/Msap.c (100%)
 rename {solver => src/lib/solver}/Msap.h (100%)
 rename {solver => src/lib/solver}/bicg_complex.c (100%)
 rename {solver => src/lib/solver}/bicg_complex.h (100%)
 rename {solver => src/lib/solver}/bicgstab2.c (100%)
 rename {solver => src/lib/solver}/bicgstab2.h (100%)
 rename {solver => src/lib/solver}/bicgstab_complex.c (100%)
 rename {solver => src/lib/solver}/bicgstab_complex.h (100%)
 rename {solver => src/lib/solver}/bicgstab_complex_bi.c (100%)
 rename {solver => src/lib/solver}/bicgstab_complex_bi.h (100%)
 rename {solver => src/lib/solver}/bicgstabell.c (100%)
 rename {solver => src/lib/solver}/bicgstabell.h (100%)
 rename {solver => src/lib/solver}/cg_her.c (100%)
 rename {solver => src/lib/solver}/cg_her.h (100%)
 rename {solver => src/lib/solver}/cg_her_bi.c (100%)
 rename {solver => src/lib/solver}/cg_her_bi.h (100%)
 rename {solver => src/lib/solver}/cg_her_nd.c (100%)
 rename {solver => src/lib/solver}/cg_her_nd.h (100%)
 rename {solver => src/lib/solver}/cg_mms_tm.c (100%)
 rename {solver => src/lib/solver}/cg_mms_tm.h (100%)
 rename {solver => src/lib/solver}/cg_mms_tm_nd.c (100%)
 rename {solver => src/lib/solver}/cg_mms_tm_nd.h (100%)
 rename {solver => src/lib/solver}/cgne4complex.c (100%)
 rename {solver => src/lib/solver}/cgne4complex.h (100%)
 rename {solver => src/lib/solver}/cgs_real.c (100%)
 rename {solver => src/lib/solver}/cgs_real.h (100%)
 rename {solver => src/lib/solver}/chrono_guess.c (100%)
 rename {solver => src/lib/solver}/chrono_guess.h (100%)
 rename {solver => src/lib/solver}/cr.c (100%)
 rename {solver => src/lib/solver}/cr.h (100%)
 rename {solver => src/lib/solver}/dfl_projector.c (100%)
 rename {solver => src/lib/solver}/dfl_projector.h (100%)
 rename {solver => src/lib/solver}/diagonalise_general_matrix.c (100%)
 rename {solver => src/lib/solver}/diagonalise_general_matrix.h (100%)
 rename {solver => src/lib/solver}/dirac_operator_eigenvectors.c (100%)
 rename {solver => src/lib/solver}/dirac_operator_eigenvectors.h (100%)
 rename {solver => src/lib/solver}/eigcg.c (100%)
 rename {solver => src/lib/solver}/eigcg.h (100%)
 rename {solver => src/lib/solver}/eigenvalues.c (100%)
 rename {solver => src/lib/solver}/eigenvalues.h (100%)
 rename {solver => src/lib/solver}/eigenvalues_bi.c (100%)
 rename {solver => src/lib/solver}/eigenvalues_bi.h (100%)
 rename {solver => src/lib/solver}/fgmres.c (100%)
 rename {solver => src/lib/solver}/fgmres.h (100%)
 rename {solver => src/lib/solver}/fgmres4complex.c (100%)
 rename {solver => src/lib/solver}/fgmres4complex.h (100%)
 rename {solver => src/lib/solver}/fgmres4complex_body.c (100%)
 rename {solver => src/lib/solver}/gcr.c (100%)
 rename {solver => src/lib/solver}/gcr.h (100%)
 rename {solver => src/lib/solver}/gcr4complex.c (100%)
 rename {solver => src/lib/solver}/gcr4complex.h (100%)
 rename {solver => src/lib/solver}/gcr4complex_body.c (100%)
 rename {solver => src/lib/solver}/gcr4complex_body.h (100%)
 rename {solver => src/lib/solver}/generate_dfl_subspace.c (100%)
 rename {solver => src/lib/solver}/generate_dfl_subspace.h (100%)
 rename {solver => src/lib/solver}/gmres.c (100%)
 rename {solver => src/lib/solver}/gmres.h (100%)
 rename {solver => src/lib/solver}/gmres_dr.c (100%)
 rename {solver => src/lib/solver}/gmres_dr.h (100%)
 rename {solver => src/lib/solver}/gmres_precon.c (100%)
 rename {solver => src/lib/solver}/gmres_precon.h (100%)
 rename {solver => src/lib/solver}/gram-schmidt.c (100%)
 rename {solver => src/lib/solver}/gram-schmidt.h (100%)
 rename {solver => src/lib/solver}/incr_eigcg.c (100%)
 rename {solver => src/lib/solver}/incr_eigcg.h (100%)
 rename {solver => src/lib/solver}/index_jd.c (100%)
 rename {solver => src/lib/solver}/index_jd.h (100%)
 rename {solver => src/lib/solver}/init_guess.c (100%)
 rename {solver => src/lib/solver}/init_guess.h (100%)
 rename {solver => src/lib/solver}/jdher.c (100%)
 rename {solver => src/lib/solver}/jdher.h (100%)
 rename {solver => src/lib/solver}/jdher_bi.c (100%)
 rename {solver => src/lib/solver}/jdher_bi.h (100%)
 rename {solver => src/lib/solver}/little_mg_precon_body.c (100%)
 rename {solver => src/lib/solver}/little_project_eo_body.c (100%)
 rename {solver => src/lib/solver}/lu_solve.c (100%)
 rename {solver => src/lib/solver}/lu_solve.h (100%)
 rename {solver => src/lib/solver}/matrix_mult_typedef.h (100%)
 rename {solver => src/lib/solver}/matrix_mult_typedef_bi.h (100%)
 rename {solver => src/lib/solver}/matrix_mult_typedef_nd.h (100%)
 rename {solver => src/lib/solver}/mcr.c (100%)
 rename {solver => src/lib/solver}/mcr.h (100%)
 rename {solver => src/lib/solver}/mcr4complex.c (100%)
 rename {solver => src/lib/solver}/mcr4complex.h (100%)
 rename {solver => src/lib/solver}/mixed_cg_her.c (100%)
 rename {solver => src/lib/solver}/mixed_cg_her.h (100%)
 rename {solver => src/lib/solver}/mixed_cg_mms_tm_nd.c (100%)
 rename {solver => src/lib/solver}/mixed_cg_mms_tm_nd.h (100%)
 rename {solver => src/lib/solver}/monomial_solve.c (100%)
 rename {solver => src/lib/solver}/monomial_solve.h (100%)
 rename {solver => src/lib/solver}/mr.c (100%)
 rename {solver => src/lib/solver}/mr.h (100%)
 rename {solver => src/lib/solver}/mr4complex.c (100%)
 rename {solver => src/lib/solver}/mr4complex.h (100%)
 rename {solver => src/lib/solver}/mrblk_body.c (100%)
 rename {solver => src/lib/solver}/ortho.c (100%)
 rename {solver => src/lib/solver}/ortho.h (100%)
 rename {solver => src/lib/solver}/pcg_her.c (100%)
 rename {solver => src/lib/solver}/pcg_her.h (100%)
 rename {solver => src/lib/solver}/poly_precon.c (100%)
 rename {solver => src/lib/solver}/poly_precon.h (100%)
 rename {solver => src/lib/solver}/quicksort.c (100%)
 rename {solver => src/lib/solver}/quicksort.h (100%)
 rename {solver => src/lib/solver}/restart_X.c (100%)
 rename {solver => src/lib/solver}/restart_X.h (100%)
 rename {solver => src/lib/solver}/rg_mixed_cg_her.c (100%)
 rename {solver => src/lib/solver}/rg_mixed_cg_her.h (100%)
 rename {solver => src/lib/solver}/rg_mixed_cg_her_nd.c (100%)
 rename {solver => src/lib/solver}/rg_mixed_cg_her_nd.h (100%)
 rename {solver => src/lib/solver}/rg_mixed_cg_typedef.h (100%)
 rename {solver => src/lib/solver}/solver.h (100%)
 rename {solver => src/lib/solver}/solver_field.c (100%)
 rename {solver => src/lib/solver}/solver_field.h (100%)
 rename {solver => src/lib/solver}/solver_params.h (100%)
 rename {solver => src/lib/solver}/solver_types.c (100%)
 rename {solver => src/lib/solver}/solver_types.h (100%)
 rename {solver => src/lib/solver}/sub_low_ev.c (100%)
 rename {solver => src/lib/solver}/sub_low_ev.h (100%)
 rename {solver => src/lib/solver}/sumr.c (100%)
 rename {solver => src/lib/solver}/sumr.h (100%)
 rename source_generation.c => src/lib/source_generation.c (100%)
 rename source_generation.h => src/lib/source_generation.h (100%)
 rename spinor_fft.c => src/lib/spinor_fft.c (100%)
 rename spinor_fft.h => src/lib/spinor_fft.h (100%)
 rename start.c => src/lib/start.c (100%)
 rename start.h => src/lib/start.h (100%)
 rename struct_accessors.h => src/lib/struct_accessors.h (100%)
 rename su3.h => src/lib/su3.h (100%)
 rename su3adj.h => src/lib/su3adj.h (100%)
 rename su3spinor.h => src/lib/su3spinor.h (100%)
 rename tensors.h => src/lib/tensors.h (100%)
 rename {test => src/lib/test}/Makefile (100%)
 rename {test => src/lib/test}/check_geometry.c (100%)
 rename {test => src/lib/test}/check_geometry.h (100%)
 rename {test => src/lib/test}/check_nan.c (100%)
 rename {test => src/lib/test}/check_nan.h (100%)
 rename {test => src/lib/test}/check_overlap.c (100%)
 rename {test => src/lib/test}/check_xchange.c (100%)
 rename {test => src/lib/test}/hopping_test.README (100%)
 rename {test => src/lib/test}/hopping_test.input.compare (100%)
 rename {test => src/lib/test}/hopping_test.input.new (100%)
 rename {test => src/lib/test}/hopping_test.input.start (100%)
 rename {test => src/lib/test}/hopping_test_generate_script (100%)
 rename {test => src/lib/test}/hopping_test_qscript (100%)
 rename {test => src/lib/test}/measure_rectangles.debug.c (100%)
 rename {test => src/lib/test}/overlaptests.c (100%)
 rename {test => src/lib/test}/overlaptests.h (100%)
 rename {test => src/lib/test}/qdran64.h (100%)
 rename tm_debug_printf.c => src/lib/tm_debug_printf.c (100%)
 rename tm_debug_printf.h => src/lib/tm_debug_printf.h (100%)
 rename update_backward_gauge.c => src/lib/update_backward_gauge.c (100%)
 rename update_backward_gauge.h => src/lib/update_backward_gauge.h (100%)
 rename update_gauge.c => src/lib/update_gauge.c (100%)
 rename update_gauge.h => src/lib/update_gauge.h (100%)
 rename update_momenta.c => src/lib/update_momenta.c (100%)
 rename update_momenta.h => src/lib/update_momenta.h (100%)
 rename update_momenta_fg.c => src/lib/update_momenta_fg.c (100%)
 rename update_momenta_fg.h => src/lib/update_momenta_fg.h (100%)
 rename update_tm.c => src/lib/update_tm.c (100%)
 rename update_tm.h => src/lib/update_tm.h (100%)
 rename {util => src/lib/util}/io.c (100%)
 rename {util => src/lib/util}/io.h (100%)
 rename {util => src/lib/util}/laguer/Makefile (100%)
 rename {util => src/lib/util}/laguer/chebyRoot.C (100%)
 rename {util => src/lib/util}/laguer/chebyRoot.H (100%)
 rename {util => src/lib/util}/laguer/laguer.c (100%)
 rename {util => src/lib/util}/laguer/quadroptRoot.C (100%)
 rename {util => src/lib/util}/oox/Makefile (100%)
 rename {util => src/lib/util}/oox/oox.c (100%)
 rename {util => src/lib/util}/oox/oox_gawrapper.cxx (100%)
 rename {util => src/lib/util}/oox/oox_gawrapper.h (100%)
 rename {util => src/lib/util}/swapendian.c (100%)
 rename {util => src/lib/util}/tmlqcd-indent (100%)
 rename {wrapper => src/lib/wrapper}/Makefile.in (100%)
 rename {wrapper => src/lib/wrapper}/lib_wrapper.c (100%)
 rename {xchange => src/lib/xchange}/Makefile.in (100%)
 rename {xchange => src/lib/xchange}/little_field_gather.c (100%)
 rename {xchange => src/lib/xchange}/little_field_gather.h (100%)
 rename {xchange => src/lib/xchange}/little_field_gather_body.c (100%)
 rename {xchange => src/lib/xchange}/xchange.h (100%)
 rename {xchange => src/lib/xchange}/xchange_2fields.c (100%)
 rename {xchange => src/lib/xchange}/xchange_2fields.h (100%)
 rename {xchange => src/lib/xchange}/xchange_deri.c (100%)
 rename {xchange => src/lib/xchange}/xchange_deri.h (100%)
 rename {xchange => src/lib/xchange}/xchange_field.c (100%)
 rename {xchange => src/lib/xchange}/xchange_field.h (100%)
 rename {xchange => src/lib/xchange}/xchange_gauge.c (100%)
 rename {xchange => src/lib/xchange}/xchange_gauge.h (100%)
 rename {xchange => src/lib/xchange}/xchange_halffield.c (100%)
 rename {xchange => src/lib/xchange}/xchange_halffield.h (100%)
 rename {xchange => src/lib/xchange}/xchange_lexicfield.c (100%)
 rename {xchange => src/lib/xchange}/xchange_lexicfield.h (100%)

diff --git a/.gitignore b/.gitignore
index 79e2bc1b2..0a2e35fba 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,7 +18,6 @@ tags*
 hmc_tm
 invert
 offline_measurement
-lib/
 benchmark
 *.data
 *.para
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 000000000..9dc9f71f2
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,390 @@
+cmake_minimum_required(VERSION 3.24)
+
+project(
+  tmlqcd
+  VERSION "6.0.0"
+  DESCRIPTION "tmlQCD"
+  HOMEPAGE_URL "http://www.itkp.uni-bonn.de/~urbach/software.html"
+  LANGUAGES C CXX)
+
+# Make the project's own Find*.cmake modules and snippets discoverable.
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
+
+# =================================================================================================
+# Out-of-source builds only: a CMakeLists.txt inside the binary directory
+# means the build tree is (or overlaps) a source tree.
+file(TO_CMAKE_PATH "${PROJECT_BINARY_DIR}/CMakeLists.txt" LOC_PATH)
+if(EXISTS "${LOC_PATH}")
+  message(FATAL_ERROR
+    "You cannot build in a source directory (or any directory with a CMakeLists.txt file). Please make a build subdirectory."
+  )
+endif()
+
+# =================================================================================================
+# HELPER MODULES, POLICIES AND LANGUAGE STANDARDS
+include(CheckFunctionExists)
+include(CheckLibraryExists)
+include(CheckSymbolExists)
+include(CMakeDependentOption)
+include(GNUInstallDirs)
+
+# project() manages the VERSION variables (CMP0048)
+cmake_policy(SET CMP0048 NEW)
+
+# find_package() honours upper-case <PACKAGENAME>_ROOT variables (CMP0144)
+if(POLICY CMP0144)
+  cmake_policy(SET CMP0144 NEW)
+endif()
+
+# Default language standards. Each entry is "<LANG>=<level>"; a default is
+# applied (and made mandatory) only when CMAKE_<LANG>_STANDARD has not been
+# defined already, e.g. on the command line or by a toolchain file.
+foreach(_tm_std IN ITEMS CUDA=14 CXX=17 C=11 HIP=14)
+  string(REPLACE "=" ";" _tm_std "${_tm_std}")
+  list(GET _tm_std 0 _tm_std_lang)
+  list(GET _tm_std 1 _tm_std_level)
+  if(NOT DEFINED CMAKE_${_tm_std_lang}_STANDARD)
+    set(CMAKE_${_tm_std_lang}_STANDARD ${_tm_std_level})
+    set(CMAKE_${_tm_std_lang}_STANDARD_REQUIRED ON)
+  endif()
+endforeach()
+unset(_tm_std)
+unset(_tm_std_lang)
+unset(_tm_std_level)
+
+# pkg-config is looked up without REQUIRED; only the pkg-config based
+# dependency lookups further down need it.
+find_package(PkgConfig)
+
+# ##############################################################################
+# Define the paths for static libraries and executables
+# ##############################################################################
+set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY
+    "${PROJECT_BINARY_DIR}/lib"
+    CACHE PATH "Single output directory for building all libraries.")
+
+# Search for rocm in common locations. Every environment variable that is set
+# extends CMAKE_PREFIX_PATH; ROCM_PATH keeps the first hit (no FORCE).
+foreach(__var ROCM_ROOT CRAY_ROCM_ROOT ORNL_ROCM_ROOT CRAY_ROCM_PREFIX
+              ROCM_PREFIX CRAY_ROCM_DIR)
+  if(DEFINED ENV{${__var}} AND NOT "$ENV{${__var}}" STREQUAL "")
+    list(APPEND CMAKE_PREFIX_PATH "$ENV{${__var}}")
+    set(ROCM_PATH "$ENV{${__var}}"
+        CACHE PATH "Path to ROCm installation")
+  endif()
+endforeach()
+
+option(CMAKE_POSITION_INDEPENDENT_CODE "Enable position independent code" ON)
+option(BUILD_SHARED_LIBS "Enable shared library" ON)
+option(TM_USE_FFTW "Enable fftw support" OFF)
+option(TM_USE_MPI "Enable MPI support" OFF)
+option(TM_USE_CUDA "Enable CUDA support" OFF)
+option(TM_USE_HIP "Enable HIP support" OFF)
+option(TM_USE_DDALPHAAMG "Enable DDalphaAMG support" OFF)
+option(TM_USE_OPENMP "Enable openMP" ON)
+option(TM_USE_HDF5 "Enable HDF5 support" OFF)
+option(TM_FIXED_VOLUME "fix volume at compile time" OFF)
+set(TM_ENABLE_ALIGNMENT
+    "auto"
+    CACHE STRING "Automatically or explicitly align arrays to byte number. auto, none, 16, 32, 64")
+
+set_property(
+  CACHE TM_ENABLE_ALIGNMENT
+  PROPERTY STRINGS
+  "auto"
+  "none"
+  "16"
+  "32"
+  "64")
+
+option(TM_BGL_DRAM "use BGL dram window (BGL only!)" ON)
+option(TM_USE_OPTIMIZATION "enable optimisation" ON)
+option(TM_USE_GAUGE_COPY "Enable use of a copy of the gauge field" ON)
+option(TM_USE_HALFSPINOR "Use a Dirac Op. with halfspinor exchange" ON)
+option(TM_USE_TSPLITPAR "Enable timeslice-splitted communications" ON)
+option(TM_USE_QPHIX "enable QPhiX" OFF)
+option(TM_USE_SHMEM "Use shmem API" OFF)
+option(TM_USE_QUDA "Enable QUDA support" OFF)
+option(TM_USE_GPROF "Enable gprof profiler" OFF)
+option(TM_ENABLE_WARNINGS "Enable all warnings" ON)
+
+# MPI dependent options
+cmake_dependent_option(
+  TM_PERSISTENT_MPI "Use persistent MPI calls for halfspinor [default=no]"
+  OFF "TM_USE_MPI" OFF)
+cmake_dependent_option(
+  TM_NONBLOCKING_MPI "Use non-blocking MPI calls for spinor and gauge" ON
+  "TM_USE_MPI" OFF)
+
+# parallelisation layout; a string cache entry because X/XY/XYZ are not booleans
+set(TM_MPI_DIMENSION "4" CACHE STRING
+    "Use n dimensional parallelisation: 1, 2, 3, 4, X, XY or XYZ [default=4]")
+set_property(CACHE TM_MPI_DIMENSION PROPERTY STRINGS 1 2 3 4 X XY XYZ)
+
+# HIP dependent options
+cmake_dependent_option(TM_USE_CUDA_HIP "Enable CUDA support in HIP" OFF
+                       "TM_USE_HIP" OFF)
+
+# lemon is only offered together with MPI; it is forced OFF otherwise
+cmake_dependent_option(TM_USE_LEMON "Use the lemon io library" OFF
+                       "TM_USE_MPI" OFF)
+
+# GPU dependent options
+cmake_dependent_option(TM_USE_QUDA_EXPERIMENTAL "Enable experimental QUDA support" ON
+                       "TM_USE_QUDA" OFF)
+cmake_dependent_option(
+  TM_QUDA_FERMIONIC_FORCES "Enable support for fermionic forces using QUDA"
+  ON "TM_USE_QUDA" OFF)
+
+cmake_dependent_option(TM_USE_NVHPC "Enable Nvidia HPC toolkit" OFF
+                       "TM_USE_CUDA" OFF)
+
+# Mandatory external tools and libraries: dense linear algebra first,
+# then the lexer/parser generators.
+find_package(BLAS REQUIRED)
+find_package(LAPACK REQUIRED)
+set(HAVE_LAPACK ON) # LAPACK is REQUIRED above, so it is always available here
+find_package(FLEX REQUIRED)
+# do we need bison ?
+find_package(BISON REQUIRED)
+
+# autotools-style package identification
+set(PACKAGE_TARNAME "tmlqcd")
+set(PACKAGE_BUGREPORT "curbach@gmx.de")
+set(PACKAGE_NAME ${PROJECT_DESCRIPTION})
+set(PACKAGE_VERSION ${PROJECT_VERSION})
+set(PACKAGE_STRING "${PROJECT_DESCRIPTION} ${PROJECT_VERSION}")
+
+# Drop any normal-scope value of the variables substituted into the config
+# header; the blocks below set them again as required.
+foreach(
+  _tm_config_var
+  TM_USE_MPI TM_USE_OMP
+  HAVE_LIBLEMON HAVE_LIBLIME
+  FIXEDVOLUME
+  _PERSISTENT _NON_BLOCKING
+  HAVE_LIBQUDA TM_USE_QUDA TM_QUDA_EXPERIMENTAL TM_QUDA_FERMIONIC_FORCES
+  DDalphaAMG
+  TM_USE_QPHIX QPHIX_SOALEN
+  _NEW_GEOMETRY _USE_SHMEM _USE_HALFSPINOR)
+  unset(${_tm_config_var})
+endforeach()
+
+# alignment defaults: zero mask and a blank attribute; the alignment
+# selection that follows overrides them
+set(ALIGN_BASE "0")
+set(ALIGN " ")
+set(ALIGN_BASE32 "0")
+set(ALIGN32 " ")
+
+message(STATUS "Array alignment: ${TM_ENABLE_ALIGNMENT}")
+if (TM_ENABLE_ALIGNMENT MATCHES "^(auto|none)$") # both mean: no forced alignment
+  set(ALIGN_BASE "0x00")
+  set(ALIGN " ")
+  set(ALIGN_BASE32 "0x00")
+  set(ALIGN32 " ")
+elseif (TM_ENABLE_ALIGNMENT EQUAL 16)
+  set(ALIGN_BASE "0x0F")
+  set(ALIGN "__attribute__ ((aligned (16)))")
+  set(ALIGN_BASE32 "0x0F")
+  set(ALIGN32 "__attribute__ ((aligned (16)))")
+elseif (TM_ENABLE_ALIGNMENT EQUAL 32)
+  set(ALIGN_BASE "0x1F") # mask is alignment - 1
+  set(ALIGN "__attribute__ ((aligned (32)))")
+  set(ALIGN_BASE32 "0x1F")
+  set(ALIGN32 "__attribute__ ((aligned (32)))")
+elseif (TM_ENABLE_ALIGNMENT EQUAL 64)
+  set(ALIGN_BASE "0x3F")
+  set(ALIGN "__attribute__ ((aligned (64)))")
+  set(ALIGN_BASE32 "0x3F")
+  set(ALIGN32 "__attribute__ ((aligned (64)))")
+else()
+  message(FATAL_ERROR "Unusable value for array alignment. Allowed values are: auto, none, 16, 32, 64")
+endif()
+
+if(TM_USE_HALFSPINOR)
+  set(_USE_HALFSPINOR ON) # consumed by #cmakedefine _USE_HALFSPINOR
+endif()
+if(TM_USE_GAUGE_COPY)
+  set(_GAUGE_COPY ON) # consumed by #cmakedefine _GAUGE_COPY
+endif()
+if(TM_FIXED_VOLUME)
+  set(FIXEDVOLUME ON) # consumed by #cmakedefine FIXEDVOLUME
+endif()
+if(TM_PERSISTENT_MPI)
+  set(_PERSISTENT ON) # consumed by #cmakedefine _PERSISTENT
+endif()
+if(TM_USE_MPI)
+  find_package(MPI REQUIRED)
+  set(TM_USE_MPI ON) # normal variable for the config header substitution
+  if(TM_NONBLOCKING_MPI)
+    set(_NON_BLOCKING ON)
+  endif()
+endif()
+
+if(TM_USE_OPENMP)
+  find_package(OpenMP REQUIRED COMPONENTS C CXX)
+  set(TM_USE_OMP ON) # the config header macro is TM_USE_OMP, not TM_USE_OPENMP
+endif()
+
+if(TM_USE_HDF5)
+  find_package(HDF5 REQUIRED COMPONENTS C)
+endif()
+
+if(TM_USE_LEMON)
+  find_package(Lemon REQUIRED) # resolved by cmake/FindLemon.cmake
+  set(HAVE_LIBLEMON ON)
+endif()
+
+find_package(CLime REQUIRED) # resolved by cmake/FindCLime.cmake
+set(HAVE_LIBLIME ON)
+
+if(TM_USE_QUDA)
+  find_package(QUDA REQUIRED CONFIG) # keyword is case-sensitive
+  set(HAVE_LIBQUDA ON)
+  if(TM_USE_QUDA_EXPERIMENTAL)
+    set(TM_QUDA_EXPERIMENTAL ON)
+  endif()
+  if(TM_QUDA_FERMIONIC_FORCES)
+    set(TM_QUDA_FERMIONIC_FORCES ON)
+  endif()
+  if(TM_USE_CUDA OR TM_USE_HIP)
+    set(TM_USE_QUDA ON)
+  endif()
+endif()
+
+if(TM_USE_CUDA AND TM_USE_HIP)
+  message(
+    FATAL_ERROR
+    "HIP and CUDA are mutually exclusive. Please choose one GPU support only")
+endif()
+
+if(TM_USE_CUDA OR QUDA_TARGET_CUDA) # QUDA_TARGET_* presumably come from QUDA's package config -- TODO confirm
+  enable_language(CUDA)
+  if(TM_USE_NVHPC)
+    find_package(NVHPC REQUIRED COMPONENTS CUDA MATH HOSTUTILS NCCL)
+  else()
+    find_package(CUDAToolkit REQUIRED)
+  endif()
+endif()
+
+message("QUDA_TARGET: ${QUDA_TARGET_CUDA}")
+if(TM_USE_HIP OR QUDA_TARGET_HIP)
+  enable_language(HIP) # language names are case-sensitive
+
+  # we may want to use hip-cuda for development or debugging purposes especially
+  # if AMD GPU access is not possible. So allow it
+  if(TM_USE_CUDA_HIP)
+    find_package(CUDA)
+  endif()
+
+  if(CMAKE_HIP_PLATFORM MATCHES "amd")
+    set(TM_GPU_PLATFORM_DFLAGS "__HIP_PLATFORM_AMD__")
+  else()
+    set(TM_GPU_PLATFORM_DFLAGS "__HIP_PLATFORM_NVIDIA__")
+  endif()
+endif()
+
+if(TM_USE_SHMEM)
+  set(_USE_SHMEM ON)
+endif()
+
+if(TM_USE_QPHIX) # must match the option name declared with option()
+  find_package(QPhiX REQUIRED)
+  if(NOT TARGET tmlqcd::qphix)
+    add_library(tmlqcd::qphix INTERFACE IMPORTED)
+    set_target_properties(tmlqcd::qphix PROPERTIES INTERFACE_LINK_LIBRARIES
+      "${QPHIX_LIBRARIES}")
+    set_target_properties(tmlqcd::qphix PROPERTIES INTERFACE_INCLUDE_DIRECTORIES
+      "${QPHIX_INCLUDE_DIRS}")
+  endif()
+  set(TM_USE_QPHIX ON) # normal variable for the config header substitution
+endif()
+
+# check for fftw3 (rely on pkgconfig).
+if(TM_USE_FFTW)
+  pkg_search_module(tmlqcd_fftw3 IMPORTED_TARGET GLOBAL fftw3)
+  if(tmlqcd_fftw3_FOUND)
+    add_library(tmlqcd::fftw3 ALIAS PkgConfig::tmlqcd_fftw3)
+  endif()
+endif()
+
+# gprofiler
+
+if (TM_USE_GPROF)
+  set(PROFILE_FLAGS "-pg;-g")
+  if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|powerpc64)$")
+    list(APPEND PROFILE_FLAGS "-qfullpath")
+  endif()
+  add_compile_options("$<$<COMPILE_LANGUAGE:C>:${PROFILE_FLAGS}>") # quoted: the list lives inside one expression
+endif()
+
+if (TM_ENABLE_WARNINGS)
+  add_compile_options(
+    $<$<COMPILE_LANG_AND_ID:C,GNU>:-Wall>
+    $<$<COMPILE_LANG_AND_ID:CXX,GNU>:-Wall>)
+endif()
+
+# check for the presence of clock_gettime in libc or librt
+check_symbol_exists(clock_gettime "time.h" HAVE_CLOCK_GETTIME)
+check_library_exists(rt clock_gettime "" HAVE_CLOCK_GETTIME_IN_RT)
+check_function_exists(fseeko HAVE_FSEEKO)
+
+# set the parallelization (string comparison: X, XY and XYZ are not numbers)
+
+if(TM_USE_MPI)
+  if(TM_MPI_DIMENSION STREQUAL "1")
+    # T parallelisation
+    set(PARALLELT ON)
+  elseif(TM_MPI_DIMENSION STREQUAL "2")
+    # XT parallelisation
+    set(PARALLELXT ON)
+  elseif(TM_MPI_DIMENSION STREQUAL "3")
+    set(PARALLELXYT ON)
+    # XYT parallelisation
+  elseif(TM_MPI_DIMENSION STREQUAL "4")
+    # XYZT parallelisation
+    set(PARALLELXYZT ON)
+  elseif(TM_MPI_DIMENSION STREQUAL "X")
+    set(PARALLELX ON)
+  elseif(TM_MPI_DIMENSION STREQUAL "XY")
+    set(PARALLELXY ON)
+  elseif(TM_MPI_DIMENSION STREQUAL "XYZ")
+    set(PARALLELXYZ ON)
+  else()
+    set(PARALLELXYZT ON)
+  endif()
+endif()
+
+# keep the autotool config.h header.
+configure_file("${PROJECT_SOURCE_DIR}/cmake/tmlqcd_config_internal.h.in"
+               "${PROJECT_BINARY_DIR}/tmlqcd_config_internal.h" @ONLY)
+configure_file("${PROJECT_SOURCE_DIR}/fixed_volume.h.in"
+               "${PROJECT_BINARY_DIR}/fixed_volume.h" @ONLY)
+# check if git command exists (GIT_EXE becomes GIT_EXE-NOTFOUND when it does not)
+find_program(GIT_EXE NAMES git)
+
+# generate version header
+string(TIMESTAMP TM_TIMESTAMP "%Y-%m-%d %H:%M:%S")
+if(GIT_EXE AND EXISTS "${PROJECT_SOURCE_DIR}/.git")
+  execute_process(
+    COMMAND "${GIT_EXE}" rev-parse HEAD
+    OUTPUT_VARIABLE TM_SHA
+    WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}"
+    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+  execute_process(
+    COMMAND "${GIT_EXE}" describe --all
+    OUTPUT_VARIABLE TM_GIT_BRANCH
+    WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}"
+    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+  message(STATUS "git hash ${TM_SHA}")
+else()
+  # set(TM_GIT_BRANCH "release v${PROJECT_VERSION}")
+  set(TM_SHA
+      "https://github.com/etmc/tmLQCD/releases/tag/rel-${PROJECT_VERSION_MAJOR}-${PROJECT_VERSION_MINOR}"
+  )
+endif()
+
+configure_file(cmake/git_hash.h.in git_hash.h @ONLY)
+add_subdirectory(src/lib)
+add_subdirectory(src/bin)
diff --git a/cmake/FindCLime.cmake b/cmake/FindCLime.cmake
new file mode 100644
index 000000000..0c3eabe48
--- /dev/null
+++ b/cmake/FindCLime.cmake
@@ -0,0 +1,27 @@
+include(FindPackageHandleStandardArgs)
+
+find_library(
+  TMLQCD_CLIME_LIBRARIES
+  NAMES lime
+  PATH_SUFFIXES "lib" "lib64")
+
+find_path(
+  TMLQCD_CLIME_INCLUDE_DIRS
+  NAMES lime.h
+  PATH_SUFFIXES "include")
+
+message(STATUS "c-lime include directory: ${TMLQCD_CLIME_INCLUDE_DIRS}")
+find_package_handle_standard_args(CLime DEFAULT_MSG TMLQCD_CLIME_LIBRARIES
+                                  TMLQCD_CLIME_INCLUDE_DIRS)
+
+if(CLime_FOUND AND NOT TARGET tmlqcd::clime) # never wrap *-NOTFOUND paths in a target
+  add_library(tmlqcd::clime INTERFACE IMPORTED)
+  set_target_properties(tmlqcd::clime PROPERTIES INTERFACE_LINK_LIBRARIES
+                                                 "${TMLQCD_CLIME_LIBRARIES}")
+  set_target_properties(tmlqcd::clime PROPERTIES INTERFACE_INCLUDE_DIRECTORIES
+                                                 "${TMLQCD_CLIME_INCLUDE_DIRS}")
+endif()
+
+set(TMLQCD_CLIME_FOUND ${CLime_FOUND})
+mark_as_advanced(TMLQCD_CLIME_FOUND TMLQCD_CLIME_LIBRARIES
+                 TMLQCD_CLIME_INCLUDE_DIRS)
diff --git a/cmake/FindLemon.cmake b/cmake/FindLemon.cmake
new file mode 100644
index 000000000..cdeca5e42
--- /dev/null
+++ b/cmake/FindLemon.cmake
@@ -0,0 +1,25 @@
+include(FindPackageHandleStandardArgs)
+
+find_library(
+  TMLQCD_LEMON_LIBRARIES
+  NAMES lemon
+  PATH_SUFFIXES "lib" "lib64")
+
+find_path(
+  TMLQCD_LEMON_INCLUDE_DIRS
+  NAMES lemon.h
+  PATH_SUFFIXES "include")
+
+find_package_handle_standard_args(Lemon DEFAULT_MSG TMLQCD_LEMON_LIBRARIES
+                                  TMLQCD_LEMON_INCLUDE_DIRS)
+
+if(Lemon_FOUND AND NOT TARGET tmlqcd::lemon) # never wrap *-NOTFOUND paths in a target
+  add_library(tmlqcd::lemon INTERFACE IMPORTED)
+  set_target_properties(tmlqcd::lemon PROPERTIES INTERFACE_LINK_LIBRARIES
+                                                 "${TMLQCD_LEMON_LIBRARIES}")
+  set_target_properties(tmlqcd::lemon PROPERTIES INTERFACE_INCLUDE_DIRECTORIES
+                                                 "${TMLQCD_LEMON_INCLUDE_DIRS}")
+endif()
+
+set(TMLQCD_LEMON_FOUND ${Lemon_FOUND})
+mark_as_advanced(TMLQCD_LEMON_LIBRARIES TMLQCD_LEMON_INCLUDE_DIRS)
diff --git a/cmake/git_hash.h.in b/cmake/git_hash.h.in
new file mode 100644
index 000000000..23f624742
--- /dev/null
+++ b/cmake/git_hash.h.in
@@ -0,0 +1,6 @@
+#ifndef _GIT_HASH_H
+#define _GIT_HASH_H
+
+const char git_hash[] = "@TM_SHA@"; /* TM_SHA is filled in by the top-level CMakeLists.txt */
+
+#endif /* _GIT_HASH_H */
diff --git a/include/tmlqcd_config_internal.h.in b/cmake/tmlqcd_config_internal.h.in
similarity index 56%
rename from include/tmlqcd_config_internal.h.in
rename to cmake/tmlqcd_config_internal.h.in
index 037ad84a5..5dd9c7096 100644
--- a/include/tmlqcd_config_internal.h.in
+++ b/cmake/tmlqcd_config_internal.h.in
@@ -4,177 +4,139 @@
  * systems, for example. */
 
 /* We are on a CRAY */
-#undef CRAY
+#cmakedefine CRAY
 
 /* lapack available */
-#undef HAVE_LAPACK
+#cmakedefine HAVE_LAPACK 
 
 /* Define to 1 if you have the `lime' library (-llime). */
-#undef HAVE_LIBLIME
+#cmakedefine HAVE_LIBLIME 
 
 /* Define to 1 if you have the `lemon' library (-llemon). */
-#undef HAVE_LIBLEMON
+#cmakedefine HAVE_LIBLEMON 
 
 /* 1 if clock_gettime is available for use in benchmark */
-#undef HAVE_CLOCK_GETTIME
+#cmakedefine HAVE_CLOCK_GETTIME 
 
 /* Compile with MPI support */
-#undef TM_USE_MPI
+#cmakedefine TM_USE_MPI
 
 /* Compile with OpenMP support */
-#undef TM_USE_OMP
+#cmakedefine TM_USE_OMP
 
 /* Compile with FFTW support */
-#undef HAVE_FFTW
+#cmakedefine HAVE_FFTW 
 
 /* Fortran has not extra _ */
-#undef NOF77_
+#cmakedefine NOF77_
 
 /* Define to the address where bug reports for this package should be sent. */
-#undef PACKAGE_BUGREPORT
+#define PACKAGE_BUGREPORT "@PACKAGE_BUGREPORT@"
 
 /* Define to the full name of this package. */
-#undef PACKAGE_NAME
-
+#define PACKAGE_NAME "@PROJECT_DESCRIPTION@"
 /* Define to the full name and version of this package. */
-#undef PACKAGE_STRING
+#define PACKAGE_STRING "@PACKAGE_STRING@"
 
 /* Define to the one symbol short name of this package. */
-#undef PACKAGE_TARNAME
+#define PACKAGE_TARNAME "@PACKAGE_TARNAME@"
 
 /* Define to the version of this package. */
-#undef PACKAGE_VERSION
+#define PACKAGE_VERSION "@PACKAGE_VERSION@"
 
 /* X parallelisation */
-#undef PARALLELX
+#cmakedefine PARALLELX 
 
 /* XY parallelisation */
-#undef PARALLELXY
+#cmakedefine PARALLELXY 
 
 /* XYZ parallelisation */
-#undef PARALLELXYZ
+#cmakedefine PARALLELXYZ
 
 /* One dimensional parallelisation */
-#undef PARALLELT
+#cmakedefine PARALLELT
 
 /* Two dimensional parallelisation */
-#undef PARALLELXT
+#cmakedefine PARALLELXT
 
 /* Three dimensional parallelisation */
-#undef PARALLELXYT
+#cmakedefine PARALLELXYT
 
 /* Four dimensional parallelisation */
-#undef PARALLELXYZT
+#cmakedefine PARALLELXYZT
 
 /* Fixed volume at compiletime */
-#undef FIXEDVOLUME
+#cmakedefine FIXEDVOLUME
 
 /* Define to 1 if fseeko (and presumably ftello) exists and is declared. */
-#undef HAVE_FSEEKO
+#cmakedefine HAVE_FSEEKO
 
 /* Alignment for arrays -- necessary for SSE and automated vectorization */
-#undef ALIGN_BASE
+#define ALIGN_BASE @ALIGN_BASE@
 
 /* Alignment compiler hint macro */
-#undef ALIGN
+#define ALIGN @ALIGN@
 
 /* Alignment for 32bit arrays -- necessary for SSE and automated vectorization */
-#undef ALIGN_BASE32
+#define ALIGN_BASE32 @ALIGN_BASE32@
 
 /* Alignment of 32bit fields, compiler hint macro */
-#undef ALIGN32
+#define ALIGN32 @ALIGN32@
 
 /* Define to 1 if `lex' declares `yytext' as a `char *' by default, not a
    `char[]'. */
-#undef YYTEXT_POINTER
+#cmakedefine YYTEXT_POINTER
 
 /* Number of bits in a file offset, on hosts where this is settable. */
-#undef _FILE_OFFSET_BITS
+#cmakedefine _FILE_OFFSET_BITS @TMLQCD_FILE_OFFSET_BITS@
 
 /* Construct an extra copy of the gauge fields */
-#undef _GAUGE_COPY
+#cmakedefine _GAUGE_COPY
 
 /* Define to 1 to make fseeko visible on some hosts (e.g. glibc 2.2). */
-#undef _LARGEFILE_SOURCE
+#cmakedefine _LARGEFILE_SOURCE
 
 /* Define for large files, on AIX-style hosts. */
-#undef _LARGE_FILES
+#cmakedefine _LARGE_FILES 
 
 /* Use even/odd geometry in the gauge fields */
-#undef _NEW_GEOMETRY
+#cmakedefine _NEW_GEOMETRY
 
 /* x86 64 Bit architecture */
-#undef _x86_64
-
-/* Define to empty if `const' does not conform to ANSI C. */
-#undef const
-
-/* Define to `__inline__' or `__inline' if that's what the C compiler
-   calls it, or to nothing if 'inline' is not supported under any name.  */
-#ifndef __cplusplus
-#undef inline
-#endif
-
-/* Define to `long' if <sys/types.h> does not define. */
-#undef off_t
-
-/* Define to `unsigned' if <sys/types.h> does not define. */
-#undef size_t
-
-/* Define to 1 if you have the <stdint.h> header file. */
-#undef HAVE_STDINT_H
-
-/* Define to 1 if you have the <sys/types.h> header file. */
-#undef HAVE_SYS_TYPES_H
-
-/* Define to 1 if the system has the type `uint16_t'. */
-#undef HAVE_UINT16_T
-
-/* Define to 1 if the system has the type `uint32_t'. */
-#undef HAVE_UINT32_T
-
-/* Define to 1 if the system has the type `uint64_t'. */
-#undef HAVE_UINT64_T
-
-/* Define to 1 if you have the <unistd.h> header file. */
-#undef HAVE_UNISTD_H
+#cmakedefine _x86_64
 
 /* Define to 1 if Dirac operator with halfspinor should be used */
-#undef _USE_HALFSPINOR
+#cmakedefine _USE_HALFSPINOR 
 
 /* Define to 1 if shmem API should be used */
-#undef _USE_SHMEM
+#cmakedefine _USE_SHMEM
 
 /* Define to 1 if KOJAK instrumentalisation should be done*/
-#undef _KOJAK_INST
-
-/* Define to equivalent of C99 restrict keyword, or to nothing if this is not
-   supported. Do not define if restrict is supported directly. */
-#undef restrict
+#cmakedefine _KOJAK_INST
 
 /* Define to 1 if persistent MPI calls for halfspinor should be used */
-#undef _PERSISTENT
+#cmakedefine _PERSISTENT
 
 /* Define to 1 if non-blocking MPI calls for spinor and gauge should be used */
-#undef _NON_BLOCKING
+#cmakedefine _NON_BLOCKING
 
 /* Define to 1 if you have the `quda' library (-lquda). */
-#undef HAVE_LIBQUDA
+#cmakedefine HAVE_LIBQUDA
 
 /* Using QUDA GPU */
-#undef TM_USE_QUDA
+#cmakedefine TM_USE_QUDA 
 
 /* Using experimental QUDA version */
-#undef TM_QUDA_EXPERIMENTAL
+#cmakedefine TM_QUDA_EXPERIMENTAL
 
 /* Using QUDA fermionic forces */
-#undef TM_QUDA_FERMIONIC_FORCES
+#cmakedefine TM_QUDA_FERMIONIC_FORCES
 
 /* Using DDalphaAMG */
-#undef DDalphaAMG
+#cmakedefine DDalphaAMG
 
 /* Using QPHIX */
-#undef TM_USE_QPHIX
+#cmakedefine TM_USE_QPHIX 
 
 /* Structure of Array length to use with QPhiX */
-#undef QPHIX_SOALEN
+#cmakedefine QPHIX_SOALEN @TMLQCD_QPHIX_SOALEN@
diff --git a/cmake_includes.txt b/cmake_includes.txt
new file mode 100644
index 000000000..b8e105cc0
--- /dev/null
+++ b/cmake_includes.txt
@@ -0,0 +1,425 @@
+list(APPEND IO_SRC_C io/utils_write_inverter_info.c
+io/gauge_read.c
+io/utils_write_xlf.c
+io/utils_construct_reader.c
+io/params_construct_xlfInfo.c
+io/utils_kill_with_error.c
+io/DML_crc32.c
+io/spinor_write_source_format.c
+io/deri_write_stdout.c
+io/spinor_write_propagator_format.c
+io/utils_engineering.c
+io/utils_parse_propagator_type.c
+io/io_cm.c
+io/utils_parse_ildgformat_xml.c
+io/utils_read_message.c
+io/utils_write_ildg_format.c
+io/utils_destruct_writer.c
+io/gauge_write.c
+io/utils_write_message.c
+io/params_construct_ildgFormat.c
+io/spinor_read.c
+io/utils_close_reader_record.c
+io/spinor_read_binary.c
+io/utils.c
+io/spinor_write_stdout.c
+io/spinor_write_info.c
+io/utils_write_checksum.c
+io/utils_write_header.c
+io/eospinor_read.c
+io/utils_write_first_message.c
+io/params_construct_InverterInfo.c
+io/utils_parse_checksum_xml.c
+io/utils_construct_writer.c
+io/sw_write_stdout.c
+io/spinor_write_propagator_type.c
+io/gauge_write_binary.c
+io/spinor_write.c
+io/utils_write_xlf_xml.c
+io/params_construct_propagatorFormat.c
+io/gauge_read_binary.c
+io/dml.c
+io/spinor_write_binary.c
+io/utils_destruct_reader.c
+io/utils_close_writer_record.c
+io/eospinor_write.c
+io/gauge_write_luscher_binary.c
+io/params_construct_sourceFormat.c)
+
+list(APPEND INIT_SRC_C init/init_dirac_halfspinor.c
+     init/init_geometry_indices.c
+     init/init_openmp.c
+     init/init_gauge_field.c
+     init/init_parallel.c
+     init/init_chi_spinor_field.c
+     init/init_gauge_fg.c
+     init/init_spinor_field.c
+     init/init_global_states.c
+     init/init_bispinor_field.c
+     init/init_gauge_tmp.c
+     init/init_critical_globals.c
+     init/init_omp_accumulators.c
+     init/init_jacobi_field.c
+     init/init_stout_smear_vars.c
+     init/init_moment_field.c)
+
+list(APPEND SOLVER_SRC_C
+solver/bicg_complex.c
+solver/dfl_projector.c
+solver/eigenvalues_Jacobi.c
+solver/gcr.c
+solver/gmres_precon.c
+solver/chrono_guess.c
+solver/gcr4complex.c
+solver/jdher.c
+solver/gcr4complex_body.c
+solver/gmres_dr.c
+solver/fgmres4complex_body.c
+solver/cg_her_bi.c
+solver/solver_field.c
+solver/quicksort.c
+solver/bicgstab2.c
+solver/cgs_real.c
+solver/M_plus_block_psi_body.c
+solver/little_mg_precon_body.c
+solver/cg_her_su3vect.c
+solver/little_project_eo_body.c
+solver/monomial_solve.c
+solver/cr.c
+solver/gram-schmidt.c
+solver/solver_types.c
+solver/mode_number.c
+solver/cg_her.c
+solver/jdher_bi.c
+solver/mrblk_body.c
+solver/eigcg.c
+solver/jdher_su3vect.c
+solver/poly_precon.c
+solver/Msap.c
+solver/fgmres.c
+solver/dirac_operator_eigenvectors.c
+solver/incr_eigcg.c
+solver/index_jd.c
+solver/sumr.c
+solver/cgne4complex.c
+solver/eigenvalues_bi.c
+solver/gmres.c
+solver/lu_solve.c
+solver/diagonalise_general_matrix.c
+solver/mcr.c
+solver/bicgstabell.c
+solver/rg_mixed_cg_her.c
+solver/mixed_cg_her.c
+solver/mixed_cg_mms_tm_nd.c
+solver/rg_mixed_cg_her_nd.c
+solver/spectral_proj.c
+solver/restart_X.c
+solver/generate_dfl_subspace.c
+solver/eigenvalues.c
+solver/mcr4complex.c
+solver/mr4complex.c
+solver/bicgstab_complex.c
+solver/cg_mms_tm_nd.c
+solver/mr.c
+solver/cg_her_nd.c
+solver/bicgstab_complex_bi.c
+solver/sub_low_ev.c
+solver/ortho.c
+solver/pcg_her.c
+solver/fgmres4complex.c
+solver/cg_mms_tm.c
+solver/init_guess.c)
+
+list(APPEND LINALG_SRC_C linalg/assign_mul_bra_add_mul_r.c
+     linalg/mul_r_gamma5.c
+     linalg/convert_eo_to_lexic.c
+     linalg/print_spinor.c
+     linalg/assign_add_mul_body.c
+     linalg/mul_diff_mul_r.c
+     linalg/square_norm_32.c
+     linalg/mul.c
+     linalg/mul_r.c
+     linalg/mul_gamma5.c
+     linalg/ratio.c
+     linalg/square_norm.c
+     linalg/mul_diff_mul.c
+     linalg/square_and_minmax.c
+     linalg/add.c
+     linalg/assign_add_mul_add_mul_r.c
+     linalg/comp_decomp.c
+     linalg/mul_add_mul.c
+     linalg/diff_32.c
+     linalg/assign_add_mul.c
+     linalg/addto_32.c
+     linalg/assign_mul_add_mul_add_mul_add_mul_r.c
+     linalg/assign_add_mul_r.c
+     linalg/diff.c
+     linalg/assign_mul_add_mul_r.c
+     linalg/scalar_prod_r.c
+     linalg/assign_to_32.c
+     linalg/assign_add_mul_add_mul.c
+     linalg/mul_diff_r.c
+     linalg/assign_mul_add_r_and_square.c
+     linalg/assign_mul_add_mul_r_32.c
+     linalg/assign_mul_add_mul.c
+     linalg/assign_mul_add_mul_add_mul_r.c
+     linalg/scalar_prod_r_32.c
+     linalg/assign_mul_add_r.c
+     linalg/assign_mul_add_r_32.c
+     linalg/scalar_prod_su3spinor.c
+     linalg/convert_even_to_lexic.c
+     linalg/mul_r_32.c
+     linalg/assign_add_mul_r_add_mul.c
+     linalg/convert_odd_to_lexic.c
+     linalg/diff_and_square_norm.c
+     linalg/scalar_prod_i.c
+     linalg/mul_add_mul_r.c
+     linalg/assign_diff_mul.c
+     linalg/assign_mul_bra_add_mul_ket_add_r.c
+     linalg/set_even_to_zero.c
+     linalg/assign_mul_add.c
+     linalg/square_and_prod_r.c
+     linalg/scalar_prod_body.c
+     linalg/assign_mul_bra_add_mul_ket_add.c
+     linalg/assign_add_mul_r_32.c
+     linalg/scalar_prod.c
+     linalg/mattimesvec.c
+     linalg/assign.c
+     linalg/print_spinor_similar_components.c)
+
+list(APPEND RATIONAL_SRC_C rational/zolotarev.c
+     rational/rational.c
+     rational/elliptic.c)
+
+list(APPEND OPERATOR_SRC_C operator/clover_invert.c
+     operator/hopping_body_dbl.c
+     operator/tm_operators_nd_32.c
+     operator/hopping_sse_dbl.c
+     operator/halfspinor_body.c
+     operator/Block_D_psi_body.c
+     operator/mul_one_pm_imu_sub_mul_body.c
+     operator/assign_mul_one_sw_pm_imu_site_lexic_body.c
+     operator/assign_mul_one_sw_pm_imu_inv_block_body.c
+     operator/clover_accumulate_deriv.c
+     operator/Hopping_Matrix.c
+     operator/hopping_bg_dbl.c
+     operator/tm_operators.c
+     operator/tm_times_Hopping_Matrix.c
+     operator/clovertm_operators_32.c
+     operator/hopping_sgl.c
+     operator/Dov_proj.c
+     operator/clover_deriv.c
+     operator/halfspinor_bg_dbl.c
+     operator/clover_det.c
+     operator/clover_leaf.c
+     operator/D_psi_body.c
+     operator/clovertm_operators.c
+     operator/hopping_sse_sgl.c
+     operator/halfspinor_sse_dbl.c
+     operator/Dov_psi.c
+     operator/tm_operators_nd.c
+     operator/tm_sub_Hopping_Matrix.c
+     operator/Hopping_Matrix_nocom.c
+     operator/clover_term.c
+     operator/halfspinor_bgq_dbl.c
+     operator/Hopping_Matrix_32_nocom.c
+     operator/D_psi.c
+     operator/tm_operators_32.c
+     operator/Hopping_Matrix_32.c
+     operator/halfspinor_body_32.c
+     operator/mul_one_pm_imu_inv_body.c)
+
+list(APPEND SMEARING_SRC_C
+     smearing/hex_stout_exclude_two.c
+     smearing/hex_hex_smear.c
+     smearing/utils_print_su3.c
+     smearing/hyp_APE_project_exclude_none.c
+     smearing/hyp_hyp_staples_exclude_one.c
+     smearing/hyp_APE_project_exclude_one.c
+     smearing/hex_stout_exclude_one.c
+     smearing/hyp_hyp_staples_exclude_two.c
+     smearing/hex_stout_exclude_none.c
+     smearing/stout_stout_smear.c
+     smearing/hyp_hyp_smear.c
+     smearing/hyp_APE_project_exclude_two.c
+     smearing/utils_project_herm.c
+     smearing/utils_reunitarize.c
+     smearing/utils_generic_staples.c
+     smearing/hyp_hyp_staples_exclude_none.c
+     smearing/ape_ape_smear.c
+     smearing/utils_project_antiherm.c
+     smearing/utils_print_config_to_screen.c
+     smearing/utils_reunitarize_MILC.c)
+
+list(APPEND BUFFER_SRC_C
+     buffers/gauge_return_gauge_field.c
+     buffers/gauge_get_gauge_field.c
+     buffers/gauge_finalize_gauge_buffers.c
+     buffers/gauge_initialize_gauge_buffers.c
+     buffers/gauge.c
+     buffers/gauge_free_unused_gauge_buffers.c
+     buffers/gauge_get_gauge_field_array.c
+     buffers/utils_generic_exchange.c
+     buffers/gauge_allocate_gauge_buffers.c
+     buffers/gauge_return_gauge_field_array.c)
+
+list(APPEND MONOMIAL_SRC_C
+     monomial/detratio_monomial.c
+     monomial/sf_gauge_monomial.c
+     monomial/poly_monomial.c
+     monomial/cloverdetratio_monomial.c
+     monomial/ndrat_monomial.c
+     monomial/cloverdet_monomial.c
+     monomial/clover_trlog_monomial.c
+     monomial/cloverndpoly_monomial.c
+     monomial/monitor_forces.c
+     monomial/ndpoly_monomial.c
+     monomial/det_monomial.c
+     monomial/monomial.c
+     monomial/cloverdetratio_rwmonomial.c
+     monomial/gauge_monomial.c
+     monomial/clovernd_trlog_monomial.c
+     monomial/ratcor_monomial.c
+     monomial/nddetratio_monomial.c
+     monomial/rat_monomial.c
+     monomial/ndratcor_monomial.c
+     monomial/moment_energy.c)
+
+list(APPEND EXCHANGE_SRC_C xchange/xchange_lexicfield.c
+xchange/xchange_2fields.c
+xchange/xchange_gauge.c
+xchange/xchange_halffield.c
+xchange/xchange_jacobi.c
+xchange/little_field_gather_body.c
+xchange/little_field_gather.c
+xchange/xchange_deri.c
+xchange/xchange_field.c
+xchange/xchange_field_tslice.c)
+
+list(APPEND MEAS_SRC_C
+meas/pion_norm.c
+meas/correlators.c
+meas/polyakov_loop.c
+meas/measurements.c
+meas/oriented_plaquettes.c
+meas/gradient_flow.c
+meas/measure_clover_field_strength_observables.c)
+
+list(APPEND SF_SRC_C sf/sf_calc_action.c
+     sf/sf_get_rectangle_staples.c
+     sf/sf_get_staples.c
+     sf/sf_observables.c
+     sf/sf_utils.c
+     )
+
+list(APPEND MAIN_SRC_C
+measure_gauge_action.c
+start.c
+deriv_Sb.c
+reweighting_factor_nd.c
+ranlxs.c
+source_generation.c
+read_input.c
+invert_doublet_eo.c
+geometry_eo.c
+getopt.c
+offline_measurement.c
+tm_debug_printf.c
+chebyshev_polynomial_nd.c
+invert_eo.c
+little_D.c
+get_rectangle_staples.c
+gen_sources.c
+rnd_gauge_trafo.c
+test_lemon.c
+LapH_ev.c
+benchmark.c
+measure_rectangles.c
+check_locallity.c
+invert.c
+deriv_Sb_D_psi.c
+deriv_mg_tune.c
+mpi_init.c
+update_momenta_fg.c
+gamma.c
+matrix_utils.c
+reweighting_factor.c
+update_tm.c
+jacobi.c
+invert_overlap.c
+phmc.c
+get_staples.c
+clenshaw_coef.c
+block.c
+spinor_fft.c
+boundary.c
+little_D_body.c
+X_psi.c
+prepare_source.c
+DDalphaAMG_interface.c
+update_backward_gauge.c
+invert_clover_eo.c
+gettime.c
+hmc_tm.c
+update_momenta.c
+sighandler.c
+compare_derivative.c
+ranlxd.c
+DirectPut.c
+aligned_malloc.c
+fatal_error.c
+operator.c
+cu/cu.c
+chebyshev_polynomial.c
+qphix_test_Dslash.c
+expo.c
+overrelaxation.c
+Ptilde_nd.c
+update_gauge.c
+hopping_test.c
+integrator.c
+P_M_eta.c)
+
+if (TM_USE_QPHIX) # options are declared with the TM_ prefix
+list(APPEND MAIN_SRC_C qphix_interface.cpp)
+endif()
+
+if (TM_USE_QUDA)
+list(APPEND MAIN_SRC_C quda_interface.c)
+endif()
+
+list(APPEND ALL_SRC ${MAIN_SRC_C} ${SF_SRC_C} ${EXCHANGE_SRC_C} ${MEAS_SRC_C} ${MONOMIAL_SRC_C} ${BUFFER_SRC_C} ${SMEARING_SRC_C} ${OPERATOR_SRC_C} ${RATIONAL_SRC_C} ${LINALG_SRC_C} ${IO_SRC_C} ${INIT_SRC_C} ${SOLVER_SRC_C})
+
+include_directories(${CMAKE_CURRENT_BINARY_DIR})
+
+# COMPILE_FLAGS is used directly: flex_target() consumes its arguments at
+# configure time, so generator expressions cannot select the keyword here.
+flex_target(tmlqcd_input_read input_read.l input_read.c
+            COMPILE_FLAGS "-Ca -Ptmlqcd")
+
+# "::" is only valid in ALIAS/IMPORTED target names: build tmlqcd_hmc, alias it
+add_library(tmlqcd_hmc ${ALL_SRC} ${FLEX_tmlqcd_input_read_OUTPUTS})
+add_library(tmlqcd::hmc ALIAS tmlqcd_hmc)
+set_target_properties(tmlqcd_hmc PROPERTIES VERSION ${PROJECT_VERSION} SOVERSION 1)
+# link dependencies; expressions naming several targets are quoted ";"-lists
+target_link_libraries(tmlqcd_hmc
+                      $<$<BOOL:${HAVE_CLOCK_GETTIME_IN_RT}>:rt>
+                      tmlqcd::clime
+                      $<$<BOOL:${TM_USE_LEMON}>:tmlqcd::lemon>
+                      $<$<BOOL:${TM_USE_QPHIX}>:tmlqcd::qphix>
+                      $<$<BOOL:${TM_USE_FFTW}>:tmlqcd::fftw3>
+                      "$<$<BOOL:${TM_USE_MPI}>:MPI::MPI_C;MPI::MPI_CXX>"
+                      $<$<BOOL:${TM_USE_QUDA}>:quda::quda>
+                      "$<$<BOOL:${TM_USE_CUDA}>:CUDA::cufft;CUDA::cufftw;CUDA::cublas;CUDA::cudart;CUDA::cuda_driver>"
+                      "$<$<BOOL:${TM_USE_HIP}>:hip::hipfft;roc::hipblas;hip::host>"
+                      ${LAPACK_LIBRARIES}
+                      ${BLAS_LIBRARIES}
+                      "$<$<BOOL:${TM_USE_OPENMP}>:OpenMP::OpenMP_C;OpenMP::OpenMP_CXX>"
+                      m)
+
+target_compile_definitions(tmlqcd_hmc PRIVATE
+                           $<$<BOOL:${TM_USE_HIP}>:${TM_GPU_PLATFORM_DFLAGS}>
+                           )
+
+target_include_directories(tmlqcd_hmc PUBLIC $<INSTALL_INTERFACE:include>
+                           PRIVATE init io linalg meas monomial operator profiling rational sf smearing solver util xchange wrapper)
diff --git a/io/Makefile.in b/io/Makefile.in
deleted file mode 100644
index 41b5b78ce..000000000
--- a/io/Makefile.in
+++ /dev/null
@@ -1,135 +0,0 @@
-
-srcdir = @srcdir@
-top_builddir =  @top_builddir@
-abs_top_builddir = @abs_top_builddir@
-top_srcdir = @top_srcdir@
-abs_top_srcdir = @abs_top_srcdir@
-subdir = io
-builddir = @builddir@
-
-CFLAGS = @CFLAGS@
-DEPFLAGS = @DEPFLAGS@
-LDFLAGS = @LDFLAGS@
-DEFS = @DEFS@
-OPTARGS = @OPTARGS@
-
-AR = @AR@
-RANLIB = @RANLIB@
-CC = @CC@
-CCDEP = @CCDEP@
-CCLD = $(CC)
-LINK = $(CCLD) $(CFLAGS) $(LDFLAGS) ${OPTARGS} -o $@
-LEX = @LEX@
-AUTOCONF = @AUTOCONF@
-DEFS = @DEFS@
-
-LEMON_AVAILABLE = @LEMON_AVAILABLE@
-
-INCLUDES = @INCLUDES@
-LDADD =
-COMPILE = ${CC} ${DEFS} ${INCLUDES} ${CFLAGS} ${OPTARGS}
-
-LIBRARIES = libio
-
-libio_TARGETS = utils_engineering \
-		utils_parse_checksum_xml \
-		utils_write_message \
-		utils_read_message \
-		gauge_write_binary \
-		gauge_read_binary \
-		gauge_read \
-		gauge_write \
-		utils_write_xlf \
-		utils_write_xlf_xml \
-		utils_write_ildg_format \
-		utils_write_header \
-		utils_write_checksum \
-		utils_write_inverter_info \
-		utils_kill_with_error \
-		utils_construct_reader \
-		utils_destruct_reader \
-		utils_construct_writer \
-		utils_destruct_writer \
-		utils_close_writer_record \
-		utils_close_reader_record \
-		utils_write_first_message \
-		utils_parse_propagator_type \
-		utils_parse_ildgformat_xml \
-		params_construct_ildgFormat \
-		params_construct_propagatorFormat \
-		params_construct_sourceFormat \
-		params_construct_xlfInfo \
-		params_construct_InverterInfo \
-		spinor_write \
-		spinor_read \
-		spinor_write_binary \
-		spinor_read_binary \
-		spinor_write_info \
-		spinor_write_source_format \
-		spinor_write_propagator_format \
-		spinor_write_propagator_type \
-		utils DML_crc32 dml \
-		eospinor_write \
-		eospinor_read \
-		io_cm \
-		deri_write_stdout spinor_write_stdout sw_write_stdout \
-		gauge_write_luscher_binary
-
-libio_OBJECTS = $(addsuffix .o, ${libio_TARGETS})
-
-# default rule
-
-all: Makefile dep libio.a
-
-# rules for debugging
-debug all-debug: CFLAGS := $(CFLAGS) @DEBUG_FLAG@
-debug all-debug: all
-
-# rules for profiling information
-profile all-profile: CFLAGS := $(filter-out -fomit-frame-pointer,${CFLAGS}) @PROFILE_FLAG@
-profile all-profile: all
-
-
-#include dep rules
-
-
--include $(addsuffix .d,${libio_TARGETS})
-
-include ${top_srcdir}/Makefile.global
-
-# rule to compile objects
-
-%.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/include/tmlqcd_config_internal.h
-	$(COMPILE) -c $<
-
-
-# rule to make libio
-libio.a: ${libio_OBJECTS} Makefile
-	@rm -f libio.a
-	@${AR} cru libio.a $(libio_OBJECTS)
-	@$(RANLIB) libio.a
-	@cp libio.a ${top_builddir}/lib/libio.a
-
-# rule to generate .d files
-
-$(addsuffix .d,$(libio_TARGETS)): %.d: ${srcdir}/%.c Makefile
-	@$(CCDEP) ${DEFS} ${DEPFLAGS} ${INCLUDES} $< > $@
-
-# rule to make dependencies
-
-dep: ${addsuffix .d, ${libio_TARGETS}}
-
-# rules to clean
-
-compile-clean: Makefile
-	rm -f ${$(addsuffix _OBJECTS, ${LIBRARIES})} *.d
-
-clean: compile-clean
-	rm -f $(addsuffix .a, ${LIBRARIES})
-	rm -f ../lib/libio.a
-
-distclean: clean
-	rm -f Makefile
-
-
-.PHONY: all dep clean compile-clean distclean debug all-debug profile all-profile
diff --git a/src/bin/CMakeLists.txt b/src/bin/CMakeLists.txt
new file mode 100644
index 000000000..29c9c1d8a
--- /dev/null
+++ b/src/bin/CMakeLists.txt
@@ -0,0 +1,19 @@
+# Programs to build; each one comes from the like-named .c file in this directory.
+list(APPEND tmlqcd_prog benchmark deriv_mg_tune hmc_tm offline_measurement)
+
+# Directory-scope header search path used by every program below.
+include_directories(
+  $<BUILD_INTERFACE:${CMAKE_BINARY_DIR}>
+  $<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}/src/lib/include>
+  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
+  $<$<BOOL:${TM_USE_LEMON}>:${TM_LEMON_INCLUDE_DIRS}>
+  ${TM_CLIME_INCLUDE_DIRS})
+
+foreach(_prog IN LISTS tmlqcd_prog)
+  add_executable(${_prog} ${_prog}.c)
+  target_link_libraries(${_prog} PUBLIC hmc)
+  # Force the C++ linker driver and position-independent code for every program.
+  set_target_properties(
+    ${_prog} PROPERTIES IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
+                        POSITION_INDEPENDENT_CODE ON LINKER_LANGUAGE "CXX")
+endforeach()
diff --git a/LapH_ev.c b/src/bin/LapH_ev.c
similarity index 100%
rename from LapH_ev.c
rename to src/bin/LapH_ev.c
diff --git a/benchmark.c b/src/bin/benchmark.c
similarity index 100%
rename from benchmark.c
rename to src/bin/benchmark.c
diff --git a/check_locallity.c b/src/bin/check_locallity.c
similarity index 99%
rename from check_locallity.c
rename to src/bin/check_locallity.c
index 9ed46daee..52ea21209 100644
--- a/check_locallity.c
+++ b/src/bin/check_locallity.c
@@ -17,10 +17,8 @@
  * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
  ***********************************************************************/
 
-#include "lime.h"
-#ifdef HAVE_CONFIG_H
+#include <lime.h>
 #include <tmlqcd_config.h>
-#endif
 #include <math.h>
 #include <signal.h>
 #include <stdio.h>
diff --git a/deriv_mg_tune.c b/src/bin/deriv_mg_tune.c
similarity index 100%
rename from deriv_mg_tune.c
rename to src/bin/deriv_mg_tune.c
diff --git a/gen_sources.c b/src/bin/gen_sources.c
similarity index 100%
rename from gen_sources.c
rename to src/bin/gen_sources.c
diff --git a/hmc_tm.c b/src/bin/hmc_tm.c
similarity index 100%
rename from hmc_tm.c
rename to src/bin/hmc_tm.c
diff --git a/hopping_test.c b/src/bin/hopping_test.c
similarity index 100%
rename from hopping_test.c
rename to src/bin/hopping_test.c
diff --git a/invert.c b/src/bin/invert.c
similarity index 100%
rename from invert.c
rename to src/bin/invert.c
diff --git a/util/main_ildg2uk.c b/src/bin/main_ildg2uk.c
similarity index 100%
rename from util/main_ildg2uk.c
rename to src/bin/main_ildg2uk.c
diff --git a/offline_measurement.c b/src/bin/offline_measurement.c
similarity index 100%
rename from offline_measurement.c
rename to src/bin/offline_measurement.c
diff --git a/qphix_test_Dslash.c b/src/bin/qphix_test_Dslash.c
similarity index 100%
rename from qphix_test_Dslash.c
rename to src/bin/qphix_test_Dslash.c
diff --git a/test/scalar_prod_r_test.c b/src/bin/scalar_prod_r_test.c
similarity index 100%
rename from test/scalar_prod_r_test.c
rename to src/bin/scalar_prod_r_test.c
diff --git a/test/test_eigenvalues.c b/src/bin/test_eigenvalues.c
similarity index 100%
rename from test/test_eigenvalues.c
rename to src/bin/test_eigenvalues.c
diff --git a/test_lemon.c b/src/bin/test_lemon.c
similarity index 100%
rename from test_lemon.c
rename to src/bin/test_lemon.c
diff --git a/src/lib/CMakeLists.txt b/src/lib/CMakeLists.txt
new file mode 100644
index 000000000..4ace6c997
--- /dev/null
+++ b/src/lib/CMakeLists.txt
@@ -0,0 +1,457 @@
+list(
+  APPEND
+  IO_SRC_C
+  io/utils_write_inverter_info.c
+  io/gauge_read.c
+  io/utils_write_xlf.c
+  io/utils_construct_reader.c
+  io/params_construct_xlfInfo.c
+  io/utils_kill_with_error.c
+  io/DML_crc32.c
+  io/spinor_write_source_format.c
+  io/deri_write_stdout.c
+  io/spinor_write_propagator_format.c
+  io/utils_engineering.c
+  io/utils_parse_propagator_type.c
+  io/io_cm.c
+  io/utils_parse_ildgformat_xml.c
+  io/utils_read_message.c
+  io/utils_write_ildg_format.c
+  io/utils_destruct_writer.c
+  io/gauge_write.c
+  io/utils_write_message.c
+  io/params_construct_ildgFormat.c
+  io/spinor_read.c
+  io/utils_close_reader_record.c
+  io/spinor_read_binary.c
+  io/utils.c
+  io/spinor_write_stdout.c
+  io/spinor_write_info.c
+  io/utils_write_checksum.c
+  io/utils_write_header.c
+  io/eospinor_read.c
+  io/utils_write_first_message.c
+  io/params_construct_InverterInfo.c
+  io/utils_parse_checksum_xml.c
+  io/utils_construct_writer.c
+  io/sw_write_stdout.c
+  io/spinor_write_propagator_type.c
+  io/gauge_write_binary.c
+  io/spinor_write.c
+  io/utils_write_xlf_xml.c
+  io/params_construct_propagatorFormat.c
+  io/gauge_read_binary.c
+  io/dml.c
+  io/spinor_write_binary.c
+  io/utils_destruct_reader.c
+  io/utils_close_writer_record.c
+  io/eospinor_write.c
+  io/gauge_write_luscher_binary.c
+  io/params_construct_sourceFormat.c)
+
+list(
+  APPEND
+  INIT_SRC_C
+  init/init_dirac_halfspinor.c
+  init/init_geometry_indices.c
+  init/init_openmp.c
+  init/init_gauge_field.c
+  init/init_parallel.c
+  init/init_chi_spinor_field.c
+  init/init_gauge_fg.c
+  init/init_spinor_field.c
+  init/init_global_states.c
+  init/init_bispinor_field.c
+  init/init_gauge_tmp.c
+  init/init_critical_globals.c
+  init/init_omp_accumulators.c
+  # init/init_stout_smear_vars.c
+  init/init_moment_field.c)
+
+list(
+  APPEND
+  SOLVER_SRC_C
+  solver/bicg_complex.c
+  solver/dfl_projector.c
+  solver/gcr.c
+  # solver/gmres_precon.c
+  solver/chrono_guess.c
+  solver/gcr4complex.c
+  solver/jdher.c
+  # solver/gcr4complex_body.c
+  solver/gmres_dr.c
+  # solver/fgmres4complex_body.c
+  solver/cg_her_bi.c
+  solver/solver_field.c
+  solver/quicksort.c
+  solver/bicgstab2.c
+  solver/cgs_real.c
+  # solver/M_plus_block_psi_body.c solver/little_mg_precon_body.c
+  # solver/little_project_eo_body.c
+  solver/monomial_solve.c
+  solver/cr.c
+  solver/gram-schmidt.c
+  solver/solver_types.c
+  solver/cg_her.c
+  solver/jdher_bi.c
+  # solver/mrblk_body.c
+  solver/eigcg.c
+  solver/poly_precon.c
+  solver/Msap.c
+  solver/fgmres.c
+  solver/dirac_operator_eigenvectors.c
+  solver/incr_eigcg.c
+  solver/index_jd.c
+  solver/sumr.c
+  solver/cgne4complex.c
+  solver/eigenvalues_bi.c
+  solver/gmres.c
+  solver/lu_solve.c
+  solver/diagonalise_general_matrix.c
+  solver/mcr.c
+  solver/bicgstabell.c
+  solver/rg_mixed_cg_her.c
+  solver/mixed_cg_her.c
+  solver/mixed_cg_mms_tm_nd.c
+  solver/rg_mixed_cg_her_nd.c
+  solver/restart_X.c
+  solver/generate_dfl_subspace.c
+  solver/eigenvalues.c
+  solver/mcr4complex.c
+  solver/mr4complex.c
+  solver/bicgstab_complex.c
+  solver/cg_mms_tm_nd.c
+  solver/mr.c
+  solver/cg_her_nd.c
+  solver/bicgstab_complex_bi.c
+  solver/sub_low_ev.c
+  solver/ortho.c
+  solver/pcg_her.c
+  solver/fgmres4complex.c
+  solver/cg_mms_tm.c
+  solver/init_guess.c)
+
+# Sources taken from linalg/ for the hmc library; list each file exactly once.
+list(
+  APPEND
+  LINALG_SRC_C
+  linalg/assign_mul_bra_add_mul_r.c
+  linalg/mul_r_gamma5.c
+  linalg/convert_eo_to_lexic.c
+  linalg/print_spinor.c
+  # linalg/assign_add_mul_body.c
+  linalg/mul_diff_mul_r.c
+  linalg/square_norm_32.c
+  linalg/mul.c
+  linalg/mul_r.c
+  linalg/mul_gamma5.c
+  linalg/ratio.c
+  linalg/square_norm.c
+  linalg/mul_diff_mul.c
+  linalg/square_and_minmax.c
+  linalg/add.c
+  linalg/assign_add_mul_add_mul_r.c
+  linalg/comp_decomp.c
+  linalg/mul_add_mul.c
+  linalg/diff_32.c
+  linalg/assign_add_mul.c
+  linalg/addto_32.c
+  linalg/assign_mul_add_mul_add_mul_add_mul_r.c
+  linalg/assign_add_mul_r.c
+  linalg/diff.c
+  linalg/assign_mul_add_mul_r.c
+  linalg/scalar_prod_r.c
+  linalg/assign_to_32.c
+  linalg/assign_add_mul_add_mul.c
+  linalg/mul_diff_r.c
+  linalg/assign_mul_add_r_and_square.c
+  linalg/assign_mul_add_mul_r_32.c
+  linalg/assign_mul_add_mul.c
+  linalg/assign_mul_add_mul_add_mul_r.c
+  linalg/scalar_prod_r_32.c
+  linalg/assign_mul_add_r.c
+  linalg/assign_mul_add_r_32.c
+  linalg/assign_add_mul_r_32.c
+  linalg/convert_even_to_lexic.c
+  linalg/mul_r_32.c
+  linalg/assign_add_mul_r_add_mul.c
+  linalg/convert_odd_to_lexic.c
+  linalg/diff_and_square_norm.c
+  linalg/scalar_prod_i.c
+  linalg/mul_add_mul_r.c
+  linalg/assign_diff_mul.c
+  linalg/assign_mul_bra_add_mul_ket_add_r.c
+  linalg/set_even_to_zero.c
+  linalg/assign_mul_add.c
+  linalg/square_and_prod_r.c
+  # linalg/scalar_prod_body.c
+  linalg/assign_mul_bra_add_mul_ket_add.c
+  linalg/scalar_prod.c
+  linalg/mattimesvec.c
+  linalg/assign.c
+  linalg/print_spinor_similar_components.c)
+
+list(APPEND RATIONAL_SRC_C rational/zolotarev.c rational/rational.c
+     rational/elliptic.c)
+
+list(
+  APPEND
+  OPERATOR_SRC_C
+  operator/clover_invert.c
+  # operator/hopping_body_dbl.c
+  operator/tm_operators_nd_32.c
+  # operator/halfspinor_body.c operator/Block_D_psi_body.c
+  # operator/mul_one_pm_imu_sub_mul_body.c
+  # operator/assign_mul_one_sw_pm_imu_site_lexic_body.c
+  # operator/assign_mul_one_sw_pm_imu_inv_block_body.c
+  operator/clover_accumulate_deriv.c
+  operator/Hopping_Matrix.c
+  operator/tm_operators.c
+  operator/tm_times_Hopping_Matrix.c
+  operator/clovertm_operators_32.c
+  # operator/hopping_sgl.c
+  operator/Dov_proj.c
+  operator/clover_deriv.c
+  operator/clover_det.c
+  operator/clover_leaf.c
+  # operator/D_psi_body.c
+  operator/clovertm_operators.c
+  operator/Dov_psi.c
+  operator/tm_operators_nd.c
+  operator/tm_sub_Hopping_Matrix.c
+  operator/Hopping_Matrix_nocom.c
+  operator/clover_term.c
+  operator/Hopping_Matrix_32_nocom.c
+  operator/D_psi.c
+  operator/tm_operators_32.c
+  operator/Hopping_Matrix_32.c)
+# operator/halfspinor_body_32.c operator/mul_one_pm_imu_inv_body.c)
+
+list(
+  APPEND
+  SMEARING_SRC_C
+  smearing/hex_stout_exclude_two.c
+  smearing/hex_hex_smear.c
+  smearing/utils_print_su3.c
+  smearing/hyp_APE_project_exclude_none.c
+  smearing/hyp_hyp_staples_exclude_one.c
+  smearing/hyp_APE_project_exclude_one.c
+  smearing/hex_stout_exclude_one.c
+  smearing/hyp_hyp_staples_exclude_two.c
+  smearing/hex_stout_exclude_none.c
+  smearing/stout_stout_smear.c
+  smearing/hyp_hyp_smear.c
+  smearing/hyp_APE_project_exclude_two.c
+  smearing/utils_project_herm.c
+  smearing/utils_reunitarize.c
+  smearing/utils_generic_staples.c
+  smearing/hyp_hyp_staples_exclude_none.c
+  smearing/ape_ape_smear.c
+  smearing/utils_print_config_to_screen.c
+  smearing/utils_project_antiherm.c)
+# not built: smearing/utils_reunitarize_MILC.c
+
+list(
+  APPEND
+  BUFFER_SRC_C
+  buffers/gauge_return_gauge_field.c
+  buffers/gauge_get_gauge_field.c
+  buffers/gauge_finalize_gauge_buffers.c
+  buffers/gauge_initialize_gauge_buffers.c
+  buffers/gauge.c
+  buffers/gauge_free_unused_gauge_buffers.c
+  buffers/gauge_get_gauge_field_array.c
+  buffers/utils_generic_exchange.c
+  buffers/gauge_allocate_gauge_buffers.c
+  buffers/gauge_return_gauge_field_array.c)
+
+list(
+  APPEND
+  MONOMIAL_SRC_C
+  monomial/detratio_monomial.c
+  monomial/poly_monomial.c
+  monomial/cloverdetratio_monomial.c
+  monomial/ndrat_monomial.c
+  monomial/cloverdet_monomial.c
+  monomial/clover_trlog_monomial.c
+  monomial/cloverndpoly_monomial.c
+  monomial/monitor_forces.c
+  monomial/ndpoly_monomial.c
+  monomial/det_monomial.c
+  monomial/monomial.c
+  monomial/cloverdetratio_rwmonomial.c
+  monomial/gauge_monomial.c
+  monomial/clovernd_trlog_monomial.c
+  monomial/ratcor_monomial.c
+  monomial/nddetratio_monomial.c
+  monomial/rat_monomial.c
+  monomial/ndratcor_monomial.c
+  monomial/moment_energy.c)
+
+list(
+  APPEND
+  EXCHANGE_SRC_C
+  xchange/xchange_lexicfield.c
+  xchange/xchange_2fields.c
+  xchange/xchange_gauge.c
+  xchange/xchange_halffield.c
+  # xchange/xchange_jacobi.c xchange/little_field_gather_body.c
+  xchange/little_field_gather.c
+  xchange/xchange_deri.c
+  xchange/xchange_field.c)
+# xchange/xchange_field_tslice.c)
+
+list(
+  APPEND
+  MEAS_SRC_C
+  meas/pion_norm.c
+  meas/correlators.c
+  meas/polyakov_loop.c
+  meas/measurements.c
+  meas/oriented_plaquettes.c
+  meas/gradient_flow.c
+  meas/measure_clover_field_strength_observables.c)
+
+list(
+  APPEND
+  MAIN_SRC_C
+  # cu/cu.c
+  measure_gauge_action.c
+  start.c
+  deriv_Sb.c
+  reweighting_factor_nd.c
+  ranlxs.c
+  source_generation.c
+  invert_doublet_eo.c
+  geometry_eo.c
+  getopt.c
+  tm_debug_printf.c
+  chebyshev_polynomial_nd.c
+  invert_eo.c
+  little_D.c
+  get_rectangle_staples.c
+  rnd_gauge_trafo.c
+  measure_rectangles.c
+  #invert.c
+  deriv_Sb_D_psi.c
+  mpi_init.c
+  update_momenta_fg.c
+  gamma.c
+  matrix_utils.c
+  reweighting_factor.c
+  update_tm.c
+  invert_overlap.c
+  phmc.c
+  get_staples.c
+  clenshaw_coef.c
+  block.c
+  spinor_fft.c
+  boundary.c
+  prepare_source.c
+  DDalphaAMG_interface.c
+  update_backward_gauge.c
+  invert_clover_eo.c
+  gettime.c
+  update_momenta.c
+  sighandler.c
+  compare_derivative.c
+  ranlxd.c
+  aligned_malloc.c
+  fatal_error.c
+  operator.c
+  # cu/cu.c chebyshev_polynomial.c qphix_test_Dslash.c
+  expo.c
+  overrelaxation.c
+  Ptilde_nd.c
+  update_gauge.c
+  # hopping_test.c
+  integrator.c)
+
+list(APPEND TEST_SRC_C test/check_xchange.c test/check_geometry.c
+     test/overlaptests.c)
+# Optional interfaces, gated by the same TM_USE_* switches as their link targets.
+if(TM_USE_QPHIX)
+  list(APPEND MAIN_SRC_C QphiX/qphix_interface.cpp)
+endif()
+if(TM_USE_QUDA)
+  list(APPEND MAIN_SRC_C quda_interface.c)
+endif()
+
+list(
+  APPEND
+  ALL_SRC
+  ${MAIN_SRC_C}
+  ${EXCHANGE_SRC_C}
+  ${MONOMIAL_SRC_C}
+  ${BUFFER_SRC_C}
+  ${SMEARING_SRC_C}
+  ${OPERATOR_SRC_C}
+  ${RATIONAL_SRC_C}
+  ${LINALG_SRC_C}
+  ${IO_SRC_C}
+  ${INIT_SRC_C}
+  ${SOLVER_SRC_C}
+  ${TEST_SRC_C}
+  ${MEAS_SRC_C})
+
+# Directory-scope header search path; uses the TM_* names tested for hmc's links.
+include_directories(
+  $<BUILD_INTERFACE:${CMAKE_BINARY_DIR}>
+  $<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}/src/lib/include>
+  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
+  "$<$<BOOL:${TM_USE_LEMON}>:${TM_LEMON_INCLUDE_DIRS}>" ${TM_CLIME_INCLUDE_DIRS})
+
+# FindFLEX in CMake >= 4.0 replaces COMPILE_FLAGS (one string) by OPTIONS (a list).
+if(CMAKE_VERSION VERSION_LESS 4.0)
+  flex_target(tmlqcd_input_read read_input.l read_input.c
+              COMPILE_FLAGS "-Ca -Ptmlqcd")
+else()
+  # One list element per flag: a quoted "-Ca -Ptmlqcd" reaches flex as one argument.
+  flex_target(tmlqcd_input_read read_input.l read_input.c OPTIONS -Ca -Ptmlqcd)
+endif()
+
+# Core library target: every file collected in ALL_SRC plus the scanner source
+# produced by flex_target(tmlqcd_input_read ...).
+#
+# add_library() without an explicit type builds SHARED when BUILD_SHARED_LIBS
+# is true and STATIC otherwise, so no if()/else() on that variable is needed.
+add_library(hmc ${ALL_SRC} ${FLEX_tmlqcd_input_read_OUTPUTS})
+
+# VERSION is the library file version; SOVERSION is the ABI number used in the
+# soname of a shared build.
+set_target_properties(hmc PROPERTIES VERSION ${PROJECT_VERSION} SOVERSION 1)
+
+# Link dependencies of hmc, PUBLIC so they propagate to targets that link hmc.
+# A generator expression that yields several targets is quoted and uses ';' as
+# separator: left unquoted, the whitespace inside it splits the expression into
+# separate arguments before it can be evaluated.
+target_link_libraries(
+  hmc
+  PUBLIC $<$<BOOL:${HAVE_CLOCK_GETTIME_IN_RT}>:rt>
+         # I/O libraries. NOTE(review): clime is gated on TM_USE_LEMON - confirm.
+         $<$<BOOL:${TM_USE_LEMON}>:tmlqcd::clime>
+         $<$<BOOL:${TM_USE_LEMON}>:clemon::lemon>
+         # optional back ends
+         $<$<BOOL:${TM_USE_QPHIX}>:tmlqcd::qphix>
+         $<$<BOOL:${TM_USE_FFTW}>:tmlqcd::fftw3>
+         "$<$<BOOL:${TM_USE_MPI}>:MPI::MPI_C;MPI::MPI_CXX>"
+         $<$<BOOL:${TM_USE_QUDA}>:QUDA::quda>
+         # GPU runtime and math libraries
+         "$<$<BOOL:${TM_USE_CUDA}>:CUDA::cufft;CUDA::cufftw;CUDA::cublas>"
+         "$<$<BOOL:${TM_USE_CUDA}>:CUDA::cudart;CUDA::cuda_driver>"
+         "$<$<BOOL:${TM_USE_HIP}>:hip::hipfft;roc::hipblas;hip::host>"
+         # host linear algebra, OpenMP and libm
+         ${LAPACK_LIBRARIES}
+         ${BLAS_LIBRARIES}
+         "$<$<BOOL:${TM_USE_OPENMP}>:OpenMP::OpenMP_C;OpenMP::OpenMP_CXX>"
+         m)
+
+# Quoted so a multi-entry TM_GPU_PLATFORM_DFLAGS stays inside one expression.
+target_compile_definitions(
+  hmc PUBLIC HAVE_CONFIG_H
+             "$<$<BOOL:${TM_USE_HIP}>:${TM_GPU_PLATFORM_DFLAGS}>")
+
+target_include_directories(
+  hmc PUBLIC $<INSTALL_INTERFACE:include>
+             $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
+             $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>)
diff --git a/DDalphaAMG_interface.c b/src/lib/DDalphaAMG_interface.c
similarity index 100%
rename from DDalphaAMG_interface.c
rename to src/lib/DDalphaAMG_interface.c
diff --git a/DDalphaAMG_interface.h b/src/lib/DDalphaAMG_interface.h
similarity index 100%
rename from DDalphaAMG_interface.h
rename to src/lib/DDalphaAMG_interface.h
diff --git a/Ptilde_nd.c b/src/lib/Ptilde_nd.c
similarity index 100%
rename from Ptilde_nd.c
rename to src/lib/Ptilde_nd.c
diff --git a/Ptilde_nd.h b/src/lib/Ptilde_nd.h
similarity index 100%
rename from Ptilde_nd.h
rename to src/lib/Ptilde_nd.h
diff --git a/aligned_malloc.c b/src/lib/aligned_malloc.c
similarity index 100%
rename from aligned_malloc.c
rename to src/lib/aligned_malloc.c
diff --git a/aligned_malloc.h b/src/lib/aligned_malloc.h
similarity index 100%
rename from aligned_malloc.h
rename to src/lib/aligned_malloc.h
diff --git a/block.c b/src/lib/block.c
similarity index 100%
rename from block.c
rename to src/lib/block.c
diff --git a/block.h b/src/lib/block.h
similarity index 100%
rename from block.h
rename to src/lib/block.h
diff --git a/boundary.c b/src/lib/boundary.c
similarity index 100%
rename from boundary.c
rename to src/lib/boundary.c
diff --git a/boundary.h b/src/lib/boundary.h
similarity index 100%
rename from boundary.h
rename to src/lib/boundary.h
diff --git a/buffers/Makefile.in b/src/lib/buffers/Makefile.in
similarity index 100%
rename from buffers/Makefile.in
rename to src/lib/buffers/Makefile.in
diff --git a/buffers/gauge.c b/src/lib/buffers/gauge.c
similarity index 100%
rename from buffers/gauge.c
rename to src/lib/buffers/gauge.c
diff --git a/buffers/gauge.h b/src/lib/buffers/gauge.h
similarity index 100%
rename from buffers/gauge.h
rename to src/lib/buffers/gauge.h
diff --git a/buffers/gauge.ih b/src/lib/buffers/gauge.ih
similarity index 100%
rename from buffers/gauge.ih
rename to src/lib/buffers/gauge.ih
diff --git a/buffers/gauge_allocate_gauge_buffers.c b/src/lib/buffers/gauge_allocate_gauge_buffers.c
similarity index 100%
rename from buffers/gauge_allocate_gauge_buffers.c
rename to src/lib/buffers/gauge_allocate_gauge_buffers.c
diff --git a/buffers/gauge_finalize_gauge_buffers.c b/src/lib/buffers/gauge_finalize_gauge_buffers.c
similarity index 100%
rename from buffers/gauge_finalize_gauge_buffers.c
rename to src/lib/buffers/gauge_finalize_gauge_buffers.c
diff --git a/buffers/gauge_free_unused_gauge_buffers.c b/src/lib/buffers/gauge_free_unused_gauge_buffers.c
similarity index 100%
rename from buffers/gauge_free_unused_gauge_buffers.c
rename to src/lib/buffers/gauge_free_unused_gauge_buffers.c
diff --git a/buffers/gauge_get_gauge_field.c b/src/lib/buffers/gauge_get_gauge_field.c
similarity index 100%
rename from buffers/gauge_get_gauge_field.c
rename to src/lib/buffers/gauge_get_gauge_field.c
diff --git a/buffers/gauge_get_gauge_field_array.c b/src/lib/buffers/gauge_get_gauge_field_array.c
similarity index 100%
rename from buffers/gauge_get_gauge_field_array.c
rename to src/lib/buffers/gauge_get_gauge_field_array.c
diff --git a/buffers/gauge_initialize_gauge_buffers.c b/src/lib/buffers/gauge_initialize_gauge_buffers.c
similarity index 100%
rename from buffers/gauge_initialize_gauge_buffers.c
rename to src/lib/buffers/gauge_initialize_gauge_buffers.c
diff --git a/buffers/gauge_return_gauge_field.c b/src/lib/buffers/gauge_return_gauge_field.c
similarity index 100%
rename from buffers/gauge_return_gauge_field.c
rename to src/lib/buffers/gauge_return_gauge_field.c
diff --git a/buffers/gauge_return_gauge_field_array.c b/src/lib/buffers/gauge_return_gauge_field_array.c
similarity index 100%
rename from buffers/gauge_return_gauge_field_array.c
rename to src/lib/buffers/gauge_return_gauge_field_array.c
diff --git a/buffers/utils.h b/src/lib/buffers/utils.h
similarity index 100%
rename from buffers/utils.h
rename to src/lib/buffers/utils.h
diff --git a/buffers/utils.ih b/src/lib/buffers/utils.ih
similarity index 100%
rename from buffers/utils.ih
rename to src/lib/buffers/utils.ih
diff --git a/buffers/utils_generic_exchange.blocking.inc b/src/lib/buffers/utils_generic_exchange.blocking.inc
similarity index 100%
rename from buffers/utils_generic_exchange.blocking.inc
rename to src/lib/buffers/utils_generic_exchange.blocking.inc
diff --git a/buffers/utils_generic_exchange.c b/src/lib/buffers/utils_generic_exchange.c
similarity index 100%
rename from buffers/utils_generic_exchange.c
rename to src/lib/buffers/utils_generic_exchange.c
diff --git a/buffers/utils_generic_exchange.nonblocking.inc b/src/lib/buffers/utils_generic_exchange.nonblocking.inc
similarity index 100%
rename from buffers/utils_generic_exchange.nonblocking.inc
rename to src/lib/buffers/utils_generic_exchange.nonblocking.inc
diff --git a/chebyshev_polynomial.c b/src/lib/chebyshev_polynomial.c
similarity index 98%
rename from chebyshev_polynomial.c
rename to src/lib/chebyshev_polynomial.c
index 501937b80..f67055cda 100644
--- a/chebyshev_polynomial.c
+++ b/src/lib/chebyshev_polynomial.c
@@ -280,11 +280,11 @@ void degree_of_polynomial(const int repro) {
           printf("||auxc_3||=%e\n",temp); */
 
     diff(&auxs[0], &auxs[0], &aux3s[0], VOLUME / 2);
-    temp = square_norm(&auxs[0], VOLUME / 2) / square_norm(&aux3s[0], VOLUME / 2, 1) / 4.0;
+    temp = square_norm(&auxs[0], VOLUME / 2, 1) / square_norm(&aux3s[0], VOLUME / 2, 1) / 4.0;
     if (g_proc_id == g_stdio_proc) {
       printf("difference=%e\n", temp);
       diff(&auxc[0], &auxc[0], &aux3c[0], VOLUME / 2);
-      temp = square_norm(&auxc[0], VOLUME / 2) / square_norm(&aux3c[0], VOLUME / 2, 1) / 4.0;
+      temp = square_norm(&auxc[0], VOLUME / 2, 1) / square_norm(&aux3c[0], VOLUME / 2, 1) / 4.0;
       printf("difference=%e\n", temp);
     }
     if (temp < stopeps) break;
diff --git a/chebyshev_polynomial.h b/src/lib/chebyshev_polynomial.h
similarity index 100%
rename from chebyshev_polynomial.h
rename to src/lib/chebyshev_polynomial.h
diff --git a/chebyshev_polynomial_nd.c b/src/lib/chebyshev_polynomial_nd.c
similarity index 100%
rename from chebyshev_polynomial_nd.c
rename to src/lib/chebyshev_polynomial_nd.c
diff --git a/chebyshev_polynomial_nd.h b/src/lib/chebyshev_polynomial_nd.h
similarity index 100%
rename from chebyshev_polynomial_nd.h
rename to src/lib/chebyshev_polynomial_nd.h
diff --git a/clenshaw_coef.c b/src/lib/clenshaw_coef.c
similarity index 100%
rename from clenshaw_coef.c
rename to src/lib/clenshaw_coef.c
diff --git a/clenshaw_coef.h b/src/lib/clenshaw_coef.h
similarity index 100%
rename from clenshaw_coef.h
rename to src/lib/clenshaw_coef.h
diff --git a/compare_derivative.c b/src/lib/compare_derivative.c
similarity index 100%
rename from compare_derivative.c
rename to src/lib/compare_derivative.c
diff --git a/compare_derivative.h b/src/lib/compare_derivative.h
similarity index 100%
rename from compare_derivative.h
rename to src/lib/compare_derivative.h
diff --git a/cu/COPYING b/src/lib/cu/COPYING
similarity index 100%
rename from cu/COPYING
rename to src/lib/cu/COPYING
diff --git a/cu/COPYING.LESSER b/src/lib/cu/COPYING.LESSER
similarity index 100%
rename from cu/COPYING.LESSER
rename to src/lib/cu/COPYING.LESSER
diff --git a/cu/Makefile.in b/src/lib/cu/Makefile.in
similarity index 100%
rename from cu/Makefile.in
rename to src/lib/cu/Makefile.in
diff --git a/cu/check-regressions b/src/lib/cu/check-regressions
similarity index 100%
rename from cu/check-regressions
rename to src/lib/cu/check-regressions
diff --git a/cu/cu.c b/src/lib/cu/cu.c
similarity index 100%
rename from cu/cu.c
rename to src/lib/cu/cu.c
diff --git a/cu/cu.h b/src/lib/cu/cu.h
similarity index 100%
rename from cu/cu.h
rename to src/lib/cu/cu.h
diff --git a/default_input_values.h b/src/lib/default_input_values.h
similarity index 100%
rename from default_input_values.h
rename to src/lib/default_input_values.h
diff --git a/deriv_Sb.c b/src/lib/deriv_Sb.c
similarity index 100%
rename from deriv_Sb.c
rename to src/lib/deriv_Sb.c
diff --git a/deriv_Sb.h b/src/lib/deriv_Sb.h
similarity index 100%
rename from deriv_Sb.h
rename to src/lib/deriv_Sb.h
diff --git a/deriv_Sb_D_psi.c b/src/lib/deriv_Sb_D_psi.c
similarity index 100%
rename from deriv_Sb_D_psi.c
rename to src/lib/deriv_Sb_D_psi.c
diff --git a/deriv_Sb_D_psi.h b/src/lib/deriv_Sb_D_psi.h
similarity index 100%
rename from deriv_Sb_D_psi.h
rename to src/lib/deriv_Sb_D_psi.h
diff --git a/expo.c b/src/lib/expo.c
similarity index 100%
rename from expo.c
rename to src/lib/expo.c
diff --git a/expo.h b/src/lib/expo.h
similarity index 100%
rename from expo.h
rename to src/lib/expo.h
diff --git a/fatal_error.c b/src/lib/fatal_error.c
similarity index 100%
rename from fatal_error.c
rename to src/lib/fatal_error.c
diff --git a/fatal_error.h b/src/lib/fatal_error.h
similarity index 100%
rename from fatal_error.h
rename to src/lib/fatal_error.h
diff --git a/gamma.c b/src/lib/gamma.c
similarity index 100%
rename from gamma.c
rename to src/lib/gamma.c
diff --git a/gamma.h b/src/lib/gamma.h
similarity index 100%
rename from gamma.h
rename to src/lib/gamma.h
diff --git a/geometry_eo.c b/src/lib/geometry_eo.c
similarity index 100%
rename from geometry_eo.c
rename to src/lib/geometry_eo.c
diff --git a/geometry_eo.h b/src/lib/geometry_eo.h
similarity index 100%
rename from geometry_eo.h
rename to src/lib/geometry_eo.h
diff --git a/get_rectangle_staples.c b/src/lib/get_rectangle_staples.c
similarity index 100%
rename from get_rectangle_staples.c
rename to src/lib/get_rectangle_staples.c
diff --git a/get_rectangle_staples.h b/src/lib/get_rectangle_staples.h
similarity index 100%
rename from get_rectangle_staples.h
rename to src/lib/get_rectangle_staples.h
diff --git a/get_staples.c b/src/lib/get_staples.c
similarity index 100%
rename from get_staples.c
rename to src/lib/get_staples.c
diff --git a/get_staples.h b/src/lib/get_staples.h
similarity index 100%
rename from get_staples.h
rename to src/lib/get_staples.h
diff --git a/getopt.c b/src/lib/getopt.c
similarity index 100%
rename from getopt.c
rename to src/lib/getopt.c
diff --git a/getopt.h b/src/lib/getopt.h
similarity index 100%
rename from getopt.h
rename to src/lib/getopt.h
diff --git a/gettime.c b/src/lib/gettime.c
similarity index 100%
rename from gettime.c
rename to src/lib/gettime.c
diff --git a/gettime.h b/src/lib/gettime.h
similarity index 100%
rename from gettime.h
rename to src/lib/gettime.h
diff --git a/global.h b/src/lib/global.h
similarity index 100%
rename from global.h
rename to src/lib/global.h
diff --git a/hamiltonian_field.h b/src/lib/hamiltonian_field.h
similarity index 100%
rename from hamiltonian_field.h
rename to src/lib/hamiltonian_field.h
diff --git a/include/tmLQCD.h b/src/lib/include/tmLQCD.h
similarity index 100%
rename from include/tmLQCD.h
rename to src/lib/include/tmLQCD.h
diff --git a/include/tmlqcd_config.h b/src/lib/include/tmlqcd_config.h
similarity index 100%
rename from include/tmlqcd_config.h
rename to src/lib/include/tmlqcd_config.h
diff --git a/init/Makefile.in b/src/lib/init/Makefile.in
similarity index 100%
rename from init/Makefile.in
rename to src/lib/init/Makefile.in
diff --git a/init/init.h b/src/lib/init/init.h
similarity index 100%
rename from init/init.h
rename to src/lib/init/init.h
diff --git a/init/init_bispinor_field.c b/src/lib/init/init_bispinor_field.c
similarity index 100%
rename from init/init_bispinor_field.c
rename to src/lib/init/init_bispinor_field.c
diff --git a/init/init_bispinor_field.h b/src/lib/init/init_bispinor_field.h
similarity index 100%
rename from init/init_bispinor_field.h
rename to src/lib/init/init_bispinor_field.h
diff --git a/init/init_chi_spinor_field.c b/src/lib/init/init_chi_spinor_field.c
similarity index 100%
rename from init/init_chi_spinor_field.c
rename to src/lib/init/init_chi_spinor_field.c
diff --git a/init/init_chi_spinor_field.h b/src/lib/init/init_chi_spinor_field.h
similarity index 100%
rename from init/init_chi_spinor_field.h
rename to src/lib/init/init_chi_spinor_field.h
diff --git a/init/init_critical_globals.c b/src/lib/init/init_critical_globals.c
similarity index 100%
rename from init/init_critical_globals.c
rename to src/lib/init/init_critical_globals.c
diff --git a/init/init_critical_globals.h b/src/lib/init/init_critical_globals.h
similarity index 100%
rename from init/init_critical_globals.h
rename to src/lib/init/init_critical_globals.h
diff --git a/init/init_dirac_halfspinor.c b/src/lib/init/init_dirac_halfspinor.c
similarity index 100%
rename from init/init_dirac_halfspinor.c
rename to src/lib/init/init_dirac_halfspinor.c
diff --git a/init/init_dirac_halfspinor.h b/src/lib/init/init_dirac_halfspinor.h
similarity index 100%
rename from init/init_dirac_halfspinor.h
rename to src/lib/init/init_dirac_halfspinor.h
diff --git a/init/init_gauge_fg.c b/src/lib/init/init_gauge_fg.c
similarity index 100%
rename from init/init_gauge_fg.c
rename to src/lib/init/init_gauge_fg.c
diff --git a/init/init_gauge_fg.h b/src/lib/init/init_gauge_fg.h
similarity index 100%
rename from init/init_gauge_fg.h
rename to src/lib/init/init_gauge_fg.h
diff --git a/init/init_gauge_field.c b/src/lib/init/init_gauge_field.c
similarity index 100%
rename from init/init_gauge_field.c
rename to src/lib/init/init_gauge_field.c
diff --git a/init/init_gauge_field.h b/src/lib/init/init_gauge_field.h
similarity index 100%
rename from init/init_gauge_field.h
rename to src/lib/init/init_gauge_field.h
diff --git a/init/init_gauge_tmp.c b/src/lib/init/init_gauge_tmp.c
similarity index 100%
rename from init/init_gauge_tmp.c
rename to src/lib/init/init_gauge_tmp.c
diff --git a/init/init_gauge_tmp.h b/src/lib/init/init_gauge_tmp.h
similarity index 100%
rename from init/init_gauge_tmp.h
rename to src/lib/init/init_gauge_tmp.h
diff --git a/init/init_geometry_indices.c b/src/lib/init/init_geometry_indices.c
similarity index 100%
rename from init/init_geometry_indices.c
rename to src/lib/init/init_geometry_indices.c
diff --git a/init/init_geometry_indices.h b/src/lib/init/init_geometry_indices.h
similarity index 100%
rename from init/init_geometry_indices.h
rename to src/lib/init/init_geometry_indices.h
diff --git a/init/init_global_states.c b/src/lib/init/init_global_states.c
similarity index 100%
rename from init/init_global_states.c
rename to src/lib/init/init_global_states.c
diff --git a/init/init_global_states.h b/src/lib/init/init_global_states.h
similarity index 100%
rename from init/init_global_states.h
rename to src/lib/init/init_global_states.h
diff --git a/init/init_moment_field.c b/src/lib/init/init_moment_field.c
similarity index 100%
rename from init/init_moment_field.c
rename to src/lib/init/init_moment_field.c
diff --git a/init/init_moment_field.h b/src/lib/init/init_moment_field.h
similarity index 100%
rename from init/init_moment_field.h
rename to src/lib/init/init_moment_field.h
diff --git a/init/init_omp_accumulators.c b/src/lib/init/init_omp_accumulators.c
similarity index 100%
rename from init/init_omp_accumulators.c
rename to src/lib/init/init_omp_accumulators.c
diff --git a/init/init_omp_accumulators.h b/src/lib/init/init_omp_accumulators.h
similarity index 100%
rename from init/init_omp_accumulators.h
rename to src/lib/init/init_omp_accumulators.h
diff --git a/init/init_openmp.c b/src/lib/init/init_openmp.c
similarity index 100%
rename from init/init_openmp.c
rename to src/lib/init/init_openmp.c
diff --git a/init/init_openmp.h b/src/lib/init/init_openmp.h
similarity index 100%
rename from init/init_openmp.h
rename to src/lib/init/init_openmp.h
diff --git a/init/init_parallel.c b/src/lib/init/init_parallel.c
similarity index 100%
rename from init/init_parallel.c
rename to src/lib/init/init_parallel.c
diff --git a/init/init_parallel.h b/src/lib/init/init_parallel.h
similarity index 100%
rename from init/init_parallel.h
rename to src/lib/init/init_parallel.h
diff --git a/init/init_spinor_field.c b/src/lib/init/init_spinor_field.c
similarity index 100%
rename from init/init_spinor_field.c
rename to src/lib/init/init_spinor_field.c
diff --git a/init/init_spinor_field.h b/src/lib/init/init_spinor_field.h
similarity index 100%
rename from init/init_spinor_field.h
rename to src/lib/init/init_spinor_field.h
diff --git a/init/init_stout_smear_vars.c b/src/lib/init/init_stout_smear_vars.c
similarity index 100%
rename from init/init_stout_smear_vars.c
rename to src/lib/init/init_stout_smear_vars.c
diff --git a/init/init_stout_smear_vars.h b/src/lib/init/init_stout_smear_vars.h
similarity index 100%
rename from init/init_stout_smear_vars.h
rename to src/lib/init/init_stout_smear_vars.h
diff --git a/integrator.c b/src/lib/integrator.c
similarity index 100%
rename from integrator.c
rename to src/lib/integrator.c
diff --git a/integrator.h b/src/lib/integrator.h
similarity index 100%
rename from integrator.h
rename to src/lib/integrator.h
diff --git a/invert_clover_eo.c b/src/lib/invert_clover_eo.c
similarity index 100%
rename from invert_clover_eo.c
rename to src/lib/invert_clover_eo.c
diff --git a/invert_clover_eo.h b/src/lib/invert_clover_eo.h
similarity index 100%
rename from invert_clover_eo.h
rename to src/lib/invert_clover_eo.h
diff --git a/invert_doublet_eo.c b/src/lib/invert_doublet_eo.c
similarity index 100%
rename from invert_doublet_eo.c
rename to src/lib/invert_doublet_eo.c
diff --git a/invert_doublet_eo.h b/src/lib/invert_doublet_eo.h
similarity index 100%
rename from invert_doublet_eo.h
rename to src/lib/invert_doublet_eo.h
diff --git a/invert_eo.c b/src/lib/invert_eo.c
similarity index 99%
rename from invert_eo.c
rename to src/lib/invert_eo.c
index 25ee4a297..997cab021 100644
--- a/invert_eo.c
+++ b/src/lib/invert_eo.c
@@ -34,6 +34,7 @@
 #ifdef HAVE_CONFIG_H
 #include <tmlqcd_config.h>
 #endif
+
 #include <stdlib.h>
 #include "gamma.h"
 #include "global.h"
diff --git a/invert_eo.h b/src/lib/invert_eo.h
similarity index 100%
rename from invert_eo.h
rename to src/lib/invert_eo.h
diff --git a/invert_overlap.c b/src/lib/invert_overlap.c
similarity index 100%
rename from invert_overlap.c
rename to src/lib/invert_overlap.c
diff --git a/invert_overlap.h b/src/lib/invert_overlap.h
similarity index 100%
rename from invert_overlap.h
rename to src/lib/invert_overlap.h
diff --git a/io/DML_crc32.c b/src/lib/io/DML_crc32.c
similarity index 100%
rename from io/DML_crc32.c
rename to src/lib/io/DML_crc32.c
diff --git a/io/deri_write_stdout.c b/src/lib/io/deri_write_stdout.c
similarity index 100%
rename from io/deri_write_stdout.c
rename to src/lib/io/deri_write_stdout.c
diff --git a/io/deri_write_stdout.h b/src/lib/io/deri_write_stdout.h
similarity index 100%
rename from io/deri_write_stdout.h
rename to src/lib/io/deri_write_stdout.h
diff --git a/io/dml.c b/src/lib/io/dml.c
similarity index 100%
rename from io/dml.c
rename to src/lib/io/dml.c
diff --git a/io/dml.h b/src/lib/io/dml.h
similarity index 100%
rename from io/dml.h
rename to src/lib/io/dml.h
diff --git a/io/eospinor.h b/src/lib/io/eospinor.h
similarity index 100%
rename from io/eospinor.h
rename to src/lib/io/eospinor.h
diff --git a/io/eospinor.ih b/src/lib/io/eospinor.ih
similarity index 100%
rename from io/eospinor.ih
rename to src/lib/io/eospinor.ih
diff --git a/io/eospinor_read.c b/src/lib/io/eospinor_read.c
similarity index 100%
rename from io/eospinor_read.c
rename to src/lib/io/eospinor_read.c
diff --git a/io/eospinor_write.c b/src/lib/io/eospinor_write.c
similarity index 100%
rename from io/eospinor_write.c
rename to src/lib/io/eospinor_write.c
diff --git a/io/gauge.h b/src/lib/io/gauge.h
similarity index 100%
rename from io/gauge.h
rename to src/lib/io/gauge.h
diff --git a/io/gauge.ih b/src/lib/io/gauge.ih
similarity index 100%
rename from io/gauge.ih
rename to src/lib/io/gauge.ih
diff --git a/io/gauge_read.c b/src/lib/io/gauge_read.c
similarity index 100%
rename from io/gauge_read.c
rename to src/lib/io/gauge_read.c
diff --git a/io/gauge_read_binary.c b/src/lib/io/gauge_read_binary.c
similarity index 100%
rename from io/gauge_read_binary.c
rename to src/lib/io/gauge_read_binary.c
diff --git a/io/gauge_write.c b/src/lib/io/gauge_write.c
similarity index 100%
rename from io/gauge_write.c
rename to src/lib/io/gauge_write.c
diff --git a/io/gauge_write_binary.c b/src/lib/io/gauge_write_binary.c
similarity index 100%
rename from io/gauge_write_binary.c
rename to src/lib/io/gauge_write_binary.c
diff --git a/io/gauge_write_luscher_binary.c b/src/lib/io/gauge_write_luscher_binary.c
similarity index 100%
rename from io/gauge_write_luscher_binary.c
rename to src/lib/io/gauge_write_luscher_binary.c
diff --git a/io/gauge_write_luscher_binary.h b/src/lib/io/gauge_write_luscher_binary.h
similarity index 100%
rename from io/gauge_write_luscher_binary.h
rename to src/lib/io/gauge_write_luscher_binary.h
diff --git a/io/io_cm.c b/src/lib/io/io_cm.c
similarity index 100%
rename from io/io_cm.c
rename to src/lib/io/io_cm.c
diff --git a/io/io_cm.h b/src/lib/io/io_cm.h
similarity index 100%
rename from io/io_cm.h
rename to src/lib/io/io_cm.h
diff --git a/io/params.h b/src/lib/io/params.h
similarity index 100%
rename from io/params.h
rename to src/lib/io/params.h
diff --git a/io/params.ih b/src/lib/io/params.ih
similarity index 100%
rename from io/params.ih
rename to src/lib/io/params.ih
diff --git a/io/params_construct_InverterInfo.c b/src/lib/io/params_construct_InverterInfo.c
similarity index 100%
rename from io/params_construct_InverterInfo.c
rename to src/lib/io/params_construct_InverterInfo.c
diff --git a/io/params_construct_ildgFormat.c b/src/lib/io/params_construct_ildgFormat.c
similarity index 100%
rename from io/params_construct_ildgFormat.c
rename to src/lib/io/params_construct_ildgFormat.c
diff --git a/io/params_construct_propagatorFormat.c b/src/lib/io/params_construct_propagatorFormat.c
similarity index 100%
rename from io/params_construct_propagatorFormat.c
rename to src/lib/io/params_construct_propagatorFormat.c
diff --git a/io/params_construct_sourceFormat.c b/src/lib/io/params_construct_sourceFormat.c
similarity index 100%
rename from io/params_construct_sourceFormat.c
rename to src/lib/io/params_construct_sourceFormat.c
diff --git a/io/params_construct_xlfInfo.c b/src/lib/io/params_construct_xlfInfo.c
similarity index 100%
rename from io/params_construct_xlfInfo.c
rename to src/lib/io/params_construct_xlfInfo.c
diff --git a/io/selector.h b/src/lib/io/selector.h
similarity index 100%
rename from io/selector.h
rename to src/lib/io/selector.h
diff --git a/io/spinor.h b/src/lib/io/spinor.h
similarity index 100%
rename from io/spinor.h
rename to src/lib/io/spinor.h
diff --git a/io/spinor.ih b/src/lib/io/spinor.ih
similarity index 100%
rename from io/spinor.ih
rename to src/lib/io/spinor.ih
diff --git a/io/spinor_read.c b/src/lib/io/spinor_read.c
similarity index 100%
rename from io/spinor_read.c
rename to src/lib/io/spinor_read.c
diff --git a/io/spinor_read_binary.c b/src/lib/io/spinor_read_binary.c
similarity index 100%
rename from io/spinor_read_binary.c
rename to src/lib/io/spinor_read_binary.c
diff --git a/io/spinor_write.c b/src/lib/io/spinor_write.c
similarity index 100%
rename from io/spinor_write.c
rename to src/lib/io/spinor_write.c
diff --git a/io/spinor_write_binary.c b/src/lib/io/spinor_write_binary.c
similarity index 100%
rename from io/spinor_write_binary.c
rename to src/lib/io/spinor_write_binary.c
diff --git a/io/spinor_write_info.c b/src/lib/io/spinor_write_info.c
similarity index 100%
rename from io/spinor_write_info.c
rename to src/lib/io/spinor_write_info.c
diff --git a/io/spinor_write_propagator_format.c b/src/lib/io/spinor_write_propagator_format.c
similarity index 100%
rename from io/spinor_write_propagator_format.c
rename to src/lib/io/spinor_write_propagator_format.c
diff --git a/io/spinor_write_propagator_type.c b/src/lib/io/spinor_write_propagator_type.c
similarity index 100%
rename from io/spinor_write_propagator_type.c
rename to src/lib/io/spinor_write_propagator_type.c
diff --git a/io/spinor_write_source_format.c b/src/lib/io/spinor_write_source_format.c
similarity index 100%
rename from io/spinor_write_source_format.c
rename to src/lib/io/spinor_write_source_format.c
diff --git a/io/spinor_write_stdout.c b/src/lib/io/spinor_write_stdout.c
similarity index 100%
rename from io/spinor_write_stdout.c
rename to src/lib/io/spinor_write_stdout.c
diff --git a/io/spinor_write_stdout.h b/src/lib/io/spinor_write_stdout.h
similarity index 100%
rename from io/spinor_write_stdout.h
rename to src/lib/io/spinor_write_stdout.h
diff --git a/io/sw_write_stdout.c b/src/lib/io/sw_write_stdout.c
similarity index 100%
rename from io/sw_write_stdout.c
rename to src/lib/io/sw_write_stdout.c
diff --git a/io/sw_write_stdout.h b/src/lib/io/sw_write_stdout.h
similarity index 100%
rename from io/sw_write_stdout.h
rename to src/lib/io/sw_write_stdout.h
diff --git a/io/utils.c b/src/lib/io/utils.c
similarity index 100%
rename from io/utils.c
rename to src/lib/io/utils.c
diff --git a/io/utils.h b/src/lib/io/utils.h
similarity index 99%
rename from io/utils.h
rename to src/lib/io/utils.h
index afcca1553..85e98a5e2 100644
--- a/io/utils.h
+++ b/src/lib/io/utils.h
@@ -20,9 +20,7 @@
 #ifndef _UTILS_H
 #define _UTILS_H
 
-#ifdef HAVE_CONFIG_H
 #include <tmlqcd_config.h>
-#endif
 
 #include "io/dml.h"
 #include "io/params.h"
diff --git a/io/utils.ih b/src/lib/io/utils.ih
similarity index 96%
rename from io/utils.ih
rename to src/lib/io/utils.ih
index 073bd64b5..dd963b5b9 100644
--- a/io/utils.ih
+++ b/src/lib/io/utils.ih
@@ -18,9 +18,7 @@
 ***********************************************************************/
 
 #include <lime.h>
-#ifdef HAVE_CONFIG_H
 #include "tmlqcd_config.h"
-#endif
 
 #include <stdlib.h>
 #include <stdio.h>
@@ -29,7 +27,7 @@
 #include <endian.h>
 #include <sys/time.h>
 #include <sys/types.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 #include <mpi.h>
 #endif
 #include <unistd.h>
diff --git a/io/utils_close_reader_record.c b/src/lib/io/utils_close_reader_record.c
similarity index 100%
rename from io/utils_close_reader_record.c
rename to src/lib/io/utils_close_reader_record.c
diff --git a/io/utils_close_writer_record.c b/src/lib/io/utils_close_writer_record.c
similarity index 100%
rename from io/utils_close_writer_record.c
rename to src/lib/io/utils_close_writer_record.c
diff --git a/io/utils_construct_reader.c b/src/lib/io/utils_construct_reader.c
similarity index 97%
rename from io/utils_construct_reader.c
rename to src/lib/io/utils_construct_reader.c
index 085206786..2714455b2 100644
--- a/io/utils_construct_reader.c
+++ b/src/lib/io/utils_construct_reader.c
@@ -1,5 +1,7 @@
 #include "utils.ih"
 
+extern MPI_Comm g_cart_grid;
+
 void construct_reader(READER **reader, char *filename) {
   LIME_FILE *fh = NULL;
   int status = 0;
diff --git a/io/utils_construct_writer.c b/src/lib/io/utils_construct_writer.c
similarity index 100%
rename from io/utils_construct_writer.c
rename to src/lib/io/utils_construct_writer.c
diff --git a/io/utils_destruct_reader.c b/src/lib/io/utils_destruct_reader.c
similarity index 100%
rename from io/utils_destruct_reader.c
rename to src/lib/io/utils_destruct_reader.c
diff --git a/io/utils_destruct_writer.c b/src/lib/io/utils_destruct_writer.c
similarity index 100%
rename from io/utils_destruct_writer.c
rename to src/lib/io/utils_destruct_writer.c
diff --git a/io/utils_engineering.c b/src/lib/io/utils_engineering.c
similarity index 100%
rename from io/utils_engineering.c
rename to src/lib/io/utils_engineering.c
diff --git a/io/utils_kill_with_error.c b/src/lib/io/utils_kill_with_error.c
similarity index 100%
rename from io/utils_kill_with_error.c
rename to src/lib/io/utils_kill_with_error.c
diff --git a/io/utils_parse_checksum_xml.c b/src/lib/io/utils_parse_checksum_xml.c
similarity index 100%
rename from io/utils_parse_checksum_xml.c
rename to src/lib/io/utils_parse_checksum_xml.c
diff --git a/io/utils_parse_ildgformat_xml.c b/src/lib/io/utils_parse_ildgformat_xml.c
similarity index 100%
rename from io/utils_parse_ildgformat_xml.c
rename to src/lib/io/utils_parse_ildgformat_xml.c
diff --git a/io/utils_parse_propagator_type.c b/src/lib/io/utils_parse_propagator_type.c
similarity index 100%
rename from io/utils_parse_propagator_type.c
rename to src/lib/io/utils_parse_propagator_type.c
diff --git a/io/utils_read_message.c b/src/lib/io/utils_read_message.c
similarity index 100%
rename from io/utils_read_message.c
rename to src/lib/io/utils_read_message.c
diff --git a/io/utils_write_checksum.c b/src/lib/io/utils_write_checksum.c
similarity index 100%
rename from io/utils_write_checksum.c
rename to src/lib/io/utils_write_checksum.c
diff --git a/io/utils_write_first_message.c b/src/lib/io/utils_write_first_message.c
similarity index 100%
rename from io/utils_write_first_message.c
rename to src/lib/io/utils_write_first_message.c
diff --git a/io/utils_write_header.c b/src/lib/io/utils_write_header.c
similarity index 100%
rename from io/utils_write_header.c
rename to src/lib/io/utils_write_header.c
diff --git a/io/utils_write_ildg_format.c b/src/lib/io/utils_write_ildg_format.c
similarity index 100%
rename from io/utils_write_ildg_format.c
rename to src/lib/io/utils_write_ildg_format.c
diff --git a/io/utils_write_inverter_info.c b/src/lib/io/utils_write_inverter_info.c
similarity index 100%
rename from io/utils_write_inverter_info.c
rename to src/lib/io/utils_write_inverter_info.c
diff --git a/io/utils_write_message.c b/src/lib/io/utils_write_message.c
similarity index 100%
rename from io/utils_write_message.c
rename to src/lib/io/utils_write_message.c
diff --git a/io/utils_write_xlf.c b/src/lib/io/utils_write_xlf.c
similarity index 100%
rename from io/utils_write_xlf.c
rename to src/lib/io/utils_write_xlf.c
diff --git a/io/utils_write_xlf_xml.c b/src/lib/io/utils_write_xlf_xml.c
similarity index 100%
rename from io/utils_write_xlf_xml.c
rename to src/lib/io/utils_write_xlf_xml.c
diff --git a/kahan_summation.h b/src/lib/kahan_summation.h
similarity index 100%
rename from kahan_summation.h
rename to src/lib/kahan_summation.h
diff --git a/linalg/Makefile.in b/src/lib/linalg/Makefile.in
similarity index 100%
rename from linalg/Makefile.in
rename to src/lib/linalg/Makefile.in
diff --git a/linalg/add.c b/src/lib/linalg/add.c
similarity index 100%
rename from linalg/add.c
rename to src/lib/linalg/add.c
diff --git a/linalg/add.h b/src/lib/linalg/add.h
similarity index 100%
rename from linalg/add.h
rename to src/lib/linalg/add.h
diff --git a/linalg/addto_32.c b/src/lib/linalg/addto_32.c
similarity index 100%
rename from linalg/addto_32.c
rename to src/lib/linalg/addto_32.c
diff --git a/linalg/addto_32.h b/src/lib/linalg/addto_32.h
similarity index 100%
rename from linalg/addto_32.h
rename to src/lib/linalg/addto_32.h
diff --git a/linalg/assign.c b/src/lib/linalg/assign.c
similarity index 100%
rename from linalg/assign.c
rename to src/lib/linalg/assign.c
diff --git a/linalg/assign.h b/src/lib/linalg/assign.h
similarity index 100%
rename from linalg/assign.h
rename to src/lib/linalg/assign.h
diff --git a/linalg/assign_add_mul.c b/src/lib/linalg/assign_add_mul.c
similarity index 100%
rename from linalg/assign_add_mul.c
rename to src/lib/linalg/assign_add_mul.c
diff --git a/linalg/assign_add_mul.h b/src/lib/linalg/assign_add_mul.h
similarity index 100%
rename from linalg/assign_add_mul.h
rename to src/lib/linalg/assign_add_mul.h
diff --git a/linalg/assign_add_mul_add_mul.c b/src/lib/linalg/assign_add_mul_add_mul.c
similarity index 100%
rename from linalg/assign_add_mul_add_mul.c
rename to src/lib/linalg/assign_add_mul_add_mul.c
diff --git a/linalg/assign_add_mul_add_mul.h b/src/lib/linalg/assign_add_mul_add_mul.h
similarity index 100%
rename from linalg/assign_add_mul_add_mul.h
rename to src/lib/linalg/assign_add_mul_add_mul.h
diff --git a/linalg/assign_add_mul_add_mul_r.c b/src/lib/linalg/assign_add_mul_add_mul_r.c
similarity index 100%
rename from linalg/assign_add_mul_add_mul_r.c
rename to src/lib/linalg/assign_add_mul_add_mul_r.c
diff --git a/linalg/assign_add_mul_add_mul_r.h b/src/lib/linalg/assign_add_mul_add_mul_r.h
similarity index 100%
rename from linalg/assign_add_mul_add_mul_r.h
rename to src/lib/linalg/assign_add_mul_add_mul_r.h
diff --git a/linalg/assign_add_mul_body.c b/src/lib/linalg/assign_add_mul_body.c
similarity index 100%
rename from linalg/assign_add_mul_body.c
rename to src/lib/linalg/assign_add_mul_body.c
diff --git a/linalg/assign_add_mul_r.c b/src/lib/linalg/assign_add_mul_r.c
similarity index 100%
rename from linalg/assign_add_mul_r.c
rename to src/lib/linalg/assign_add_mul_r.c
diff --git a/linalg/assign_add_mul_r.h b/src/lib/linalg/assign_add_mul_r.h
similarity index 100%
rename from linalg/assign_add_mul_r.h
rename to src/lib/linalg/assign_add_mul_r.h
diff --git a/linalg/assign_add_mul_r_32.c b/src/lib/linalg/assign_add_mul_r_32.c
similarity index 93%
rename from linalg/assign_add_mul_r_32.c
rename to src/lib/linalg/assign_add_mul_r_32.c
index 8df54858b..9f6b1a72f 100644
--- a/linalg/assign_add_mul_r_32.c
+++ b/src/lib/linalg/assign_add_mul_r_32.c
@@ -28,16 +28,13 @@
 #ifdef HAVE_CONFIG_H
 #include <tmlqcd_config.h>
 #endif
-#ifdef TM_USE_OMP
-#include <omp.h>
-#endif
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include "assign_add_mul_r_32.h"
 #include "su3.h"
 
-inline void assign_add_mul_r_32_orphaned(spinor32 *const R, spinor32 *const S, const float c,
+void assign_add_mul_r_32_orphaned(spinor32 *const R, spinor32 *const S, const float c,
                                          const int N) {
 #ifdef TM_USE_OMP
 #pragma omp parallel for
diff --git a/linalg/assign_add_mul_r_32.h b/src/lib/linalg/assign_add_mul_r_32.h
similarity index 100%
rename from linalg/assign_add_mul_r_32.h
rename to src/lib/linalg/assign_add_mul_r_32.h
diff --git a/linalg/assign_add_mul_r_add_mul.c b/src/lib/linalg/assign_add_mul_r_add_mul.c
similarity index 100%
rename from linalg/assign_add_mul_r_add_mul.c
rename to src/lib/linalg/assign_add_mul_r_add_mul.c
diff --git a/linalg/assign_add_mul_r_add_mul.h b/src/lib/linalg/assign_add_mul_r_add_mul.h
similarity index 100%
rename from linalg/assign_add_mul_r_add_mul.h
rename to src/lib/linalg/assign_add_mul_r_add_mul.h
diff --git a/linalg/assign_diff_mul.c b/src/lib/linalg/assign_diff_mul.c
similarity index 100%
rename from linalg/assign_diff_mul.c
rename to src/lib/linalg/assign_diff_mul.c
diff --git a/linalg/assign_diff_mul.h b/src/lib/linalg/assign_diff_mul.h
similarity index 100%
rename from linalg/assign_diff_mul.h
rename to src/lib/linalg/assign_diff_mul.h
diff --git a/linalg/assign_mul_add.c b/src/lib/linalg/assign_mul_add.c
similarity index 100%
rename from linalg/assign_mul_add.c
rename to src/lib/linalg/assign_mul_add.c
diff --git a/linalg/assign_mul_add.h b/src/lib/linalg/assign_mul_add.h
similarity index 100%
rename from linalg/assign_mul_add.h
rename to src/lib/linalg/assign_mul_add.h
diff --git a/linalg/assign_mul_add_mul.c b/src/lib/linalg/assign_mul_add_mul.c
similarity index 100%
rename from linalg/assign_mul_add_mul.c
rename to src/lib/linalg/assign_mul_add_mul.c
diff --git a/linalg/assign_mul_add_mul.h b/src/lib/linalg/assign_mul_add_mul.h
similarity index 100%
rename from linalg/assign_mul_add_mul.h
rename to src/lib/linalg/assign_mul_add_mul.h
diff --git a/linalg/assign_mul_add_mul_add_mul_add_mul_r.c b/src/lib/linalg/assign_mul_add_mul_add_mul_add_mul_r.c
similarity index 100%
rename from linalg/assign_mul_add_mul_add_mul_add_mul_r.c
rename to src/lib/linalg/assign_mul_add_mul_add_mul_add_mul_r.c
diff --git a/linalg/assign_mul_add_mul_add_mul_add_mul_r.h b/src/lib/linalg/assign_mul_add_mul_add_mul_add_mul_r.h
similarity index 100%
rename from linalg/assign_mul_add_mul_add_mul_add_mul_r.h
rename to src/lib/linalg/assign_mul_add_mul_add_mul_add_mul_r.h
diff --git a/linalg/assign_mul_add_mul_add_mul_r.c b/src/lib/linalg/assign_mul_add_mul_add_mul_r.c
similarity index 100%
rename from linalg/assign_mul_add_mul_add_mul_r.c
rename to src/lib/linalg/assign_mul_add_mul_add_mul_r.c
diff --git a/linalg/assign_mul_add_mul_add_mul_r.h b/src/lib/linalg/assign_mul_add_mul_add_mul_r.h
similarity index 100%
rename from linalg/assign_mul_add_mul_add_mul_r.h
rename to src/lib/linalg/assign_mul_add_mul_add_mul_r.h
diff --git a/linalg/assign_mul_add_mul_r.c b/src/lib/linalg/assign_mul_add_mul_r.c
similarity index 100%
rename from linalg/assign_mul_add_mul_r.c
rename to src/lib/linalg/assign_mul_add_mul_r.c
diff --git a/linalg/assign_mul_add_mul_r.h b/src/lib/linalg/assign_mul_add_mul_r.h
similarity index 100%
rename from linalg/assign_mul_add_mul_r.h
rename to src/lib/linalg/assign_mul_add_mul_r.h
diff --git a/linalg/assign_mul_add_mul_r_32.c b/src/lib/linalg/assign_mul_add_mul_r_32.c
similarity index 100%
rename from linalg/assign_mul_add_mul_r_32.c
rename to src/lib/linalg/assign_mul_add_mul_r_32.c
diff --git a/linalg/assign_mul_add_mul_r_32.h b/src/lib/linalg/assign_mul_add_mul_r_32.h
similarity index 100%
rename from linalg/assign_mul_add_mul_r_32.h
rename to src/lib/linalg/assign_mul_add_mul_r_32.h
diff --git a/linalg/assign_mul_add_r.c b/src/lib/linalg/assign_mul_add_r.c
similarity index 100%
rename from linalg/assign_mul_add_r.c
rename to src/lib/linalg/assign_mul_add_r.c
diff --git a/linalg/assign_mul_add_r.h b/src/lib/linalg/assign_mul_add_r.h
similarity index 100%
rename from linalg/assign_mul_add_r.h
rename to src/lib/linalg/assign_mul_add_r.h
diff --git a/linalg/assign_mul_add_r_32.c b/src/lib/linalg/assign_mul_add_r_32.c
similarity index 100%
rename from linalg/assign_mul_add_r_32.c
rename to src/lib/linalg/assign_mul_add_r_32.c
diff --git a/linalg/assign_mul_add_r_32.h b/src/lib/linalg/assign_mul_add_r_32.h
similarity index 100%
rename from linalg/assign_mul_add_r_32.h
rename to src/lib/linalg/assign_mul_add_r_32.h
diff --git a/linalg/assign_mul_add_r_and_square.c b/src/lib/linalg/assign_mul_add_r_and_square.c
similarity index 100%
rename from linalg/assign_mul_add_r_and_square.c
rename to src/lib/linalg/assign_mul_add_r_and_square.c
diff --git a/linalg/assign_mul_add_r_and_square.h b/src/lib/linalg/assign_mul_add_r_and_square.h
similarity index 100%
rename from linalg/assign_mul_add_r_and_square.h
rename to src/lib/linalg/assign_mul_add_r_and_square.h
diff --git a/linalg/assign_mul_bra_add_mul_ket_add.c b/src/lib/linalg/assign_mul_bra_add_mul_ket_add.c
similarity index 100%
rename from linalg/assign_mul_bra_add_mul_ket_add.c
rename to src/lib/linalg/assign_mul_bra_add_mul_ket_add.c
diff --git a/linalg/assign_mul_bra_add_mul_ket_add.h b/src/lib/linalg/assign_mul_bra_add_mul_ket_add.h
similarity index 100%
rename from linalg/assign_mul_bra_add_mul_ket_add.h
rename to src/lib/linalg/assign_mul_bra_add_mul_ket_add.h
diff --git a/linalg/assign_mul_bra_add_mul_ket_add_r.c b/src/lib/linalg/assign_mul_bra_add_mul_ket_add_r.c
similarity index 100%
rename from linalg/assign_mul_bra_add_mul_ket_add_r.c
rename to src/lib/linalg/assign_mul_bra_add_mul_ket_add_r.c
diff --git a/linalg/assign_mul_bra_add_mul_ket_add_r.h b/src/lib/linalg/assign_mul_bra_add_mul_ket_add_r.h
similarity index 100%
rename from linalg/assign_mul_bra_add_mul_ket_add_r.h
rename to src/lib/linalg/assign_mul_bra_add_mul_ket_add_r.h
diff --git a/linalg/assign_mul_bra_add_mul_r.c b/src/lib/linalg/assign_mul_bra_add_mul_r.c
similarity index 100%
rename from linalg/assign_mul_bra_add_mul_r.c
rename to src/lib/linalg/assign_mul_bra_add_mul_r.c
diff --git a/linalg/assign_mul_bra_add_mul_r.h b/src/lib/linalg/assign_mul_bra_add_mul_r.h
similarity index 100%
rename from linalg/assign_mul_bra_add_mul_r.h
rename to src/lib/linalg/assign_mul_bra_add_mul_r.h
diff --git a/linalg/assign_to_32.c b/src/lib/linalg/assign_to_32.c
similarity index 100%
rename from linalg/assign_to_32.c
rename to src/lib/linalg/assign_to_32.c
diff --git a/linalg/assign_to_32.h b/src/lib/linalg/assign_to_32.h
similarity index 100%
rename from linalg/assign_to_32.h
rename to src/lib/linalg/assign_to_32.h
diff --git a/linalg/blas.h b/src/lib/linalg/blas.h
similarity index 100%
rename from linalg/blas.h
rename to src/lib/linalg/blas.h
diff --git a/linalg/comp_decomp.c b/src/lib/linalg/comp_decomp.c
similarity index 100%
rename from linalg/comp_decomp.c
rename to src/lib/linalg/comp_decomp.c
diff --git a/linalg/comp_decomp.h b/src/lib/linalg/comp_decomp.h
similarity index 100%
rename from linalg/comp_decomp.h
rename to src/lib/linalg/comp_decomp.h
diff --git a/linalg/convert_eo_to_lexic.c b/src/lib/linalg/convert_eo_to_lexic.c
similarity index 100%
rename from linalg/convert_eo_to_lexic.c
rename to src/lib/linalg/convert_eo_to_lexic.c
diff --git a/linalg/convert_eo_to_lexic.h b/src/lib/linalg/convert_eo_to_lexic.h
similarity index 100%
rename from linalg/convert_eo_to_lexic.h
rename to src/lib/linalg/convert_eo_to_lexic.h
diff --git a/linalg/convert_even_to_lexic.c b/src/lib/linalg/convert_even_to_lexic.c
similarity index 100%
rename from linalg/convert_even_to_lexic.c
rename to src/lib/linalg/convert_even_to_lexic.c
diff --git a/linalg/convert_even_to_lexic.h b/src/lib/linalg/convert_even_to_lexic.h
similarity index 100%
rename from linalg/convert_even_to_lexic.h
rename to src/lib/linalg/convert_even_to_lexic.h
diff --git a/linalg/convert_odd_to_lexic.c b/src/lib/linalg/convert_odd_to_lexic.c
similarity index 100%
rename from linalg/convert_odd_to_lexic.c
rename to src/lib/linalg/convert_odd_to_lexic.c
diff --git a/linalg/convert_odd_to_lexic.h b/src/lib/linalg/convert_odd_to_lexic.h
similarity index 100%
rename from linalg/convert_odd_to_lexic.h
rename to src/lib/linalg/convert_odd_to_lexic.h
diff --git a/linalg/diff.c b/src/lib/linalg/diff.c
similarity index 100%
rename from linalg/diff.c
rename to src/lib/linalg/diff.c
diff --git a/linalg/diff.h b/src/lib/linalg/diff.h
similarity index 100%
rename from linalg/diff.h
rename to src/lib/linalg/diff.h
diff --git a/linalg/diff_32.c b/src/lib/linalg/diff_32.c
similarity index 100%
rename from linalg/diff_32.c
rename to src/lib/linalg/diff_32.c
diff --git a/linalg/diff_32.h b/src/lib/linalg/diff_32.h
similarity index 100%
rename from linalg/diff_32.h
rename to src/lib/linalg/diff_32.h
diff --git a/linalg/diff_and_square_norm.c b/src/lib/linalg/diff_and_square_norm.c
similarity index 100%
rename from linalg/diff_and_square_norm.c
rename to src/lib/linalg/diff_and_square_norm.c
diff --git a/linalg/diff_and_square_norm.h b/src/lib/linalg/diff_and_square_norm.h
similarity index 100%
rename from linalg/diff_and_square_norm.h
rename to src/lib/linalg/diff_and_square_norm.h
diff --git a/linalg/fortran.h b/src/lib/linalg/fortran.h
similarity index 100%
rename from linalg/fortran.h
rename to src/lib/linalg/fortran.h
diff --git a/linalg/lapack.h b/src/lib/linalg/lapack.h
similarity index 100%
rename from linalg/lapack.h
rename to src/lib/linalg/lapack.h
diff --git a/linalg/map_to_blas.h b/src/lib/linalg/map_to_blas.h
similarity index 100%
rename from linalg/map_to_blas.h
rename to src/lib/linalg/map_to_blas.h
diff --git a/linalg/mattimesvec.c b/src/lib/linalg/mattimesvec.c
similarity index 100%
rename from linalg/mattimesvec.c
rename to src/lib/linalg/mattimesvec.c
diff --git a/linalg/mattimesvec.h b/src/lib/linalg/mattimesvec.h
similarity index 100%
rename from linalg/mattimesvec.h
rename to src/lib/linalg/mattimesvec.h
diff --git a/linalg/mul.c b/src/lib/linalg/mul.c
similarity index 100%
rename from linalg/mul.c
rename to src/lib/linalg/mul.c
diff --git a/linalg/mul.h b/src/lib/linalg/mul.h
similarity index 100%
rename from linalg/mul.h
rename to src/lib/linalg/mul.h
diff --git a/linalg/mul_add_mul.c b/src/lib/linalg/mul_add_mul.c
similarity index 100%
rename from linalg/mul_add_mul.c
rename to src/lib/linalg/mul_add_mul.c
diff --git a/linalg/mul_add_mul.h b/src/lib/linalg/mul_add_mul.h
similarity index 100%
rename from linalg/mul_add_mul.h
rename to src/lib/linalg/mul_add_mul.h
diff --git a/linalg/mul_add_mul_r.c b/src/lib/linalg/mul_add_mul_r.c
similarity index 100%
rename from linalg/mul_add_mul_r.c
rename to src/lib/linalg/mul_add_mul_r.c
diff --git a/linalg/mul_add_mul_r.h b/src/lib/linalg/mul_add_mul_r.h
similarity index 100%
rename from linalg/mul_add_mul_r.h
rename to src/lib/linalg/mul_add_mul_r.h
diff --git a/linalg/mul_diff_mul.c b/src/lib/linalg/mul_diff_mul.c
similarity index 100%
rename from linalg/mul_diff_mul.c
rename to src/lib/linalg/mul_diff_mul.c
diff --git a/linalg/mul_diff_mul.h b/src/lib/linalg/mul_diff_mul.h
similarity index 100%
rename from linalg/mul_diff_mul.h
rename to src/lib/linalg/mul_diff_mul.h
diff --git a/linalg/mul_diff_mul_r.c b/src/lib/linalg/mul_diff_mul_r.c
similarity index 100%
rename from linalg/mul_diff_mul_r.c
rename to src/lib/linalg/mul_diff_mul_r.c
diff --git a/linalg/mul_diff_mul_r.h b/src/lib/linalg/mul_diff_mul_r.h
similarity index 100%
rename from linalg/mul_diff_mul_r.h
rename to src/lib/linalg/mul_diff_mul_r.h
diff --git a/linalg/mul_diff_r.c b/src/lib/linalg/mul_diff_r.c
similarity index 100%
rename from linalg/mul_diff_r.c
rename to src/lib/linalg/mul_diff_r.c
diff --git a/linalg/mul_diff_r.h b/src/lib/linalg/mul_diff_r.h
similarity index 100%
rename from linalg/mul_diff_r.h
rename to src/lib/linalg/mul_diff_r.h
diff --git a/linalg/mul_gamma5.c b/src/lib/linalg/mul_gamma5.c
similarity index 100%
rename from linalg/mul_gamma5.c
rename to src/lib/linalg/mul_gamma5.c
diff --git a/linalg/mul_gamma5.h b/src/lib/linalg/mul_gamma5.h
similarity index 100%
rename from linalg/mul_gamma5.h
rename to src/lib/linalg/mul_gamma5.h
diff --git a/linalg/mul_r.c b/src/lib/linalg/mul_r.c
similarity index 100%
rename from linalg/mul_r.c
rename to src/lib/linalg/mul_r.c
diff --git a/linalg/mul_r.h b/src/lib/linalg/mul_r.h
similarity index 100%
rename from linalg/mul_r.h
rename to src/lib/linalg/mul_r.h
diff --git a/linalg/mul_r_32.c b/src/lib/linalg/mul_r_32.c
similarity index 100%
rename from linalg/mul_r_32.c
rename to src/lib/linalg/mul_r_32.c
diff --git a/linalg/mul_r_32.h b/src/lib/linalg/mul_r_32.h
similarity index 100%
rename from linalg/mul_r_32.h
rename to src/lib/linalg/mul_r_32.h
diff --git a/linalg/mul_r_gamma5.c b/src/lib/linalg/mul_r_gamma5.c
similarity index 100%
rename from linalg/mul_r_gamma5.c
rename to src/lib/linalg/mul_r_gamma5.c
diff --git a/linalg/mul_r_gamma5.h b/src/lib/linalg/mul_r_gamma5.h
similarity index 100%
rename from linalg/mul_r_gamma5.h
rename to src/lib/linalg/mul_r_gamma5.h
diff --git a/linalg/print_spinor.c b/src/lib/linalg/print_spinor.c
similarity index 100%
rename from linalg/print_spinor.c
rename to src/lib/linalg/print_spinor.c
diff --git a/linalg/print_spinor.h b/src/lib/linalg/print_spinor.h
similarity index 100%
rename from linalg/print_spinor.h
rename to src/lib/linalg/print_spinor.h
diff --git a/linalg/print_spinor_similar_components.c b/src/lib/linalg/print_spinor_similar_components.c
similarity index 100%
rename from linalg/print_spinor_similar_components.c
rename to src/lib/linalg/print_spinor_similar_components.c
diff --git a/linalg/print_spinor_similar_components.h b/src/lib/linalg/print_spinor_similar_components.h
similarity index 100%
rename from linalg/print_spinor_similar_components.h
rename to src/lib/linalg/print_spinor_similar_components.h
diff --git a/linalg/ratio.c b/src/lib/linalg/ratio.c
similarity index 100%
rename from linalg/ratio.c
rename to src/lib/linalg/ratio.c
diff --git a/linalg/ratio.h b/src/lib/linalg/ratio.h
similarity index 100%
rename from linalg/ratio.h
rename to src/lib/linalg/ratio.h
diff --git a/linalg/scalar_prod.c b/src/lib/linalg/scalar_prod.c
similarity index 100%
rename from linalg/scalar_prod.c
rename to src/lib/linalg/scalar_prod.c
diff --git a/linalg/scalar_prod.h b/src/lib/linalg/scalar_prod.h
similarity index 100%
rename from linalg/scalar_prod.h
rename to src/lib/linalg/scalar_prod.h
diff --git a/linalg/scalar_prod_body.c b/src/lib/linalg/scalar_prod_body.c
similarity index 100%
rename from linalg/scalar_prod_body.c
rename to src/lib/linalg/scalar_prod_body.c
diff --git a/linalg/scalar_prod_i.c b/src/lib/linalg/scalar_prod_i.c
similarity index 100%
rename from linalg/scalar_prod_i.c
rename to src/lib/linalg/scalar_prod_i.c
diff --git a/linalg/scalar_prod_i.h b/src/lib/linalg/scalar_prod_i.h
similarity index 100%
rename from linalg/scalar_prod_i.h
rename to src/lib/linalg/scalar_prod_i.h
diff --git a/linalg/scalar_prod_r.c b/src/lib/linalg/scalar_prod_r.c
similarity index 100%
rename from linalg/scalar_prod_r.c
rename to src/lib/linalg/scalar_prod_r.c
diff --git a/linalg/scalar_prod_r.h b/src/lib/linalg/scalar_prod_r.h
similarity index 100%
rename from linalg/scalar_prod_r.h
rename to src/lib/linalg/scalar_prod_r.h
diff --git a/linalg/scalar_prod_r_32.c b/src/lib/linalg/scalar_prod_r_32.c
similarity index 100%
rename from linalg/scalar_prod_r_32.c
rename to src/lib/linalg/scalar_prod_r_32.c
diff --git a/linalg/scalar_prod_r_32.h b/src/lib/linalg/scalar_prod_r_32.h
similarity index 100%
rename from linalg/scalar_prod_r_32.h
rename to src/lib/linalg/scalar_prod_r_32.h
diff --git a/linalg/set_even_to_zero.c b/src/lib/linalg/set_even_to_zero.c
similarity index 100%
rename from linalg/set_even_to_zero.c
rename to src/lib/linalg/set_even_to_zero.c
diff --git a/linalg/set_even_to_zero.h b/src/lib/linalg/set_even_to_zero.h
similarity index 100%
rename from linalg/set_even_to_zero.h
rename to src/lib/linalg/set_even_to_zero.h
diff --git a/linalg/square_and_minmax.c b/src/lib/linalg/square_and_minmax.c
similarity index 100%
rename from linalg/square_and_minmax.c
rename to src/lib/linalg/square_and_minmax.c
diff --git a/linalg/square_and_minmax.h b/src/lib/linalg/square_and_minmax.h
similarity index 100%
rename from linalg/square_and_minmax.h
rename to src/lib/linalg/square_and_minmax.h
diff --git a/linalg/square_and_prod_r.c b/src/lib/linalg/square_and_prod_r.c
similarity index 100%
rename from linalg/square_and_prod_r.c
rename to src/lib/linalg/square_and_prod_r.c
diff --git a/linalg/square_and_prod_r.h b/src/lib/linalg/square_and_prod_r.h
similarity index 100%
rename from linalg/square_and_prod_r.h
rename to src/lib/linalg/square_and_prod_r.h
diff --git a/linalg/square_norm.c b/src/lib/linalg/square_norm.c
similarity index 100%
rename from linalg/square_norm.c
rename to src/lib/linalg/square_norm.c
diff --git a/linalg/square_norm.h b/src/lib/linalg/square_norm.h
similarity index 100%
rename from linalg/square_norm.h
rename to src/lib/linalg/square_norm.h
diff --git a/linalg/square_norm_32.c b/src/lib/linalg/square_norm_32.c
similarity index 100%
rename from linalg/square_norm_32.c
rename to src/lib/linalg/square_norm_32.c
diff --git a/linalg/square_norm_32.h b/src/lib/linalg/square_norm_32.h
similarity index 100%
rename from linalg/square_norm_32.h
rename to src/lib/linalg/square_norm_32.h
diff --git a/linalg_eo.h b/src/lib/linalg_eo.h
similarity index 100%
rename from linalg_eo.h
rename to src/lib/linalg_eo.h
diff --git a/little_D.c b/src/lib/little_D.c
similarity index 100%
rename from little_D.c
rename to src/lib/little_D.c
diff --git a/little_D.h b/src/lib/little_D.h
similarity index 100%
rename from little_D.h
rename to src/lib/little_D.h
diff --git a/little_D_body.c b/src/lib/little_D_body.c
similarity index 100%
rename from little_D_body.c
rename to src/lib/little_D_body.c
diff --git a/matrix_utils.c b/src/lib/matrix_utils.c
similarity index 100%
rename from matrix_utils.c
rename to src/lib/matrix_utils.c
diff --git a/matrix_utils.h b/src/lib/matrix_utils.h
similarity index 100%
rename from matrix_utils.h
rename to src/lib/matrix_utils.h
diff --git a/meas/Makefile.in b/src/lib/meas/Makefile.in
similarity index 100%
rename from meas/Makefile.in
rename to src/lib/meas/Makefile.in
diff --git a/meas/correlators.c b/src/lib/meas/correlators.c
similarity index 100%
rename from meas/correlators.c
rename to src/lib/meas/correlators.c
diff --git a/meas/correlators.h b/src/lib/meas/correlators.h
similarity index 100%
rename from meas/correlators.h
rename to src/lib/meas/correlators.h
diff --git a/meas/field_strength_types.h b/src/lib/meas/field_strength_types.h
similarity index 100%
rename from meas/field_strength_types.h
rename to src/lib/meas/field_strength_types.h
diff --git a/meas/gradient_flow.c b/src/lib/meas/gradient_flow.c
similarity index 100%
rename from meas/gradient_flow.c
rename to src/lib/meas/gradient_flow.c
diff --git a/meas/gradient_flow.h b/src/lib/meas/gradient_flow.h
similarity index 100%
rename from meas/gradient_flow.h
rename to src/lib/meas/gradient_flow.h
diff --git a/meas/measure_clover_field_strength_observables.c b/src/lib/meas/measure_clover_field_strength_observables.c
similarity index 100%
rename from meas/measure_clover_field_strength_observables.c
rename to src/lib/meas/measure_clover_field_strength_observables.c
diff --git a/meas/measure_clover_field_strength_observables.h b/src/lib/meas/measure_clover_field_strength_observables.h
similarity index 100%
rename from meas/measure_clover_field_strength_observables.h
rename to src/lib/meas/measure_clover_field_strength_observables.h
diff --git a/meas/measurements.c b/src/lib/meas/measurements.c
similarity index 100%
rename from meas/measurements.c
rename to src/lib/meas/measurements.c
diff --git a/meas/measurements.h b/src/lib/meas/measurements.h
similarity index 100%
rename from meas/measurements.h
rename to src/lib/meas/measurements.h
diff --git a/meas/oriented_plaquettes.c b/src/lib/meas/oriented_plaquettes.c
similarity index 100%
rename from meas/oriented_plaquettes.c
rename to src/lib/meas/oriented_plaquettes.c
diff --git a/meas/oriented_plaquettes.h b/src/lib/meas/oriented_plaquettes.h
similarity index 100%
rename from meas/oriented_plaquettes.h
rename to src/lib/meas/oriented_plaquettes.h
diff --git a/meas/pion_norm.c b/src/lib/meas/pion_norm.c
similarity index 100%
rename from meas/pion_norm.c
rename to src/lib/meas/pion_norm.c
diff --git a/meas/pion_norm.h b/src/lib/meas/pion_norm.h
similarity index 100%
rename from meas/pion_norm.h
rename to src/lib/meas/pion_norm.h
diff --git a/meas/polyakov_loop.c b/src/lib/meas/polyakov_loop.c
similarity index 100%
rename from meas/polyakov_loop.c
rename to src/lib/meas/polyakov_loop.c
diff --git a/meas/polyakov_loop.h b/src/lib/meas/polyakov_loop.h
similarity index 100%
rename from meas/polyakov_loop.h
rename to src/lib/meas/polyakov_loop.h
diff --git a/measure_gauge_action.c b/src/lib/measure_gauge_action.c
similarity index 100%
rename from measure_gauge_action.c
rename to src/lib/measure_gauge_action.c
diff --git a/measure_gauge_action.h b/src/lib/measure_gauge_action.h
similarity index 100%
rename from measure_gauge_action.h
rename to src/lib/measure_gauge_action.h
diff --git a/measure_rectangles.c b/src/lib/measure_rectangles.c
similarity index 100%
rename from measure_rectangles.c
rename to src/lib/measure_rectangles.c
diff --git a/measure_rectangles.h b/src/lib/measure_rectangles.h
similarity index 100%
rename from measure_rectangles.h
rename to src/lib/measure_rectangles.h
diff --git a/misc_types.h b/src/lib/misc_types.h
similarity index 99%
rename from misc_types.h
rename to src/lib/misc_types.h
index 91ceda0a8..fee62159f 100644
--- a/misc_types.h
+++ b/src/lib/misc_types.h
@@ -101,7 +101,7 @@ typedef enum tm_mpi_thread_level_t {
   TM_MPI_THREAD_SINGLE = QMP_THREAD_SINGLE,
   TM_MPI_THREAD_MULTIPLE = QMP_THREAD_MULTIPLE
 } tm_mpi_thread_level_t;
-#elif TM_USE_MPI
+#elif defined(TM_USE_MPI) 
 typedef enum tm_mpi_thread_level_t {
   TM_MPI_THREAD_SINGLE = MPI_THREAD_SERIALIZED,
   TM_MPI_THREAD_MULTIPLE = MPI_THREAD_MULTIPLE
diff --git a/monomial/Makefile.in b/src/lib/monomial/Makefile.in
similarity index 100%
rename from monomial/Makefile.in
rename to src/lib/monomial/Makefile.in
diff --git a/monomial/clover_trlog_monomial.c b/src/lib/monomial/clover_trlog_monomial.c
similarity index 100%
rename from monomial/clover_trlog_monomial.c
rename to src/lib/monomial/clover_trlog_monomial.c
diff --git a/monomial/clover_trlog_monomial.h b/src/lib/monomial/clover_trlog_monomial.h
similarity index 100%
rename from monomial/clover_trlog_monomial.h
rename to src/lib/monomial/clover_trlog_monomial.h
diff --git a/monomial/cloverdet_monomial.c b/src/lib/monomial/cloverdet_monomial.c
similarity index 100%
rename from monomial/cloverdet_monomial.c
rename to src/lib/monomial/cloverdet_monomial.c
diff --git a/monomial/cloverdet_monomial.h b/src/lib/monomial/cloverdet_monomial.h
similarity index 100%
rename from monomial/cloverdet_monomial.h
rename to src/lib/monomial/cloverdet_monomial.h
diff --git a/monomial/cloverdetratio_monomial.c b/src/lib/monomial/cloverdetratio_monomial.c
similarity index 100%
rename from monomial/cloverdetratio_monomial.c
rename to src/lib/monomial/cloverdetratio_monomial.c
diff --git a/monomial/cloverdetratio_monomial.h b/src/lib/monomial/cloverdetratio_monomial.h
similarity index 100%
rename from monomial/cloverdetratio_monomial.h
rename to src/lib/monomial/cloverdetratio_monomial.h
diff --git a/monomial/cloverdetratio_rwmonomial.c b/src/lib/monomial/cloverdetratio_rwmonomial.c
similarity index 100%
rename from monomial/cloverdetratio_rwmonomial.c
rename to src/lib/monomial/cloverdetratio_rwmonomial.c
diff --git a/monomial/cloverdetratio_rwmonomial.h b/src/lib/monomial/cloverdetratio_rwmonomial.h
similarity index 100%
rename from monomial/cloverdetratio_rwmonomial.h
rename to src/lib/monomial/cloverdetratio_rwmonomial.h
diff --git a/monomial/clovernd_trlog_monomial.c b/src/lib/monomial/clovernd_trlog_monomial.c
similarity index 100%
rename from monomial/clovernd_trlog_monomial.c
rename to src/lib/monomial/clovernd_trlog_monomial.c
diff --git a/monomial/clovernd_trlog_monomial.h b/src/lib/monomial/clovernd_trlog_monomial.h
similarity index 100%
rename from monomial/clovernd_trlog_monomial.h
rename to src/lib/monomial/clovernd_trlog_monomial.h
diff --git a/monomial/cloverndpoly_monomial.c b/src/lib/monomial/cloverndpoly_monomial.c
similarity index 100%
rename from monomial/cloverndpoly_monomial.c
rename to src/lib/monomial/cloverndpoly_monomial.c
diff --git a/monomial/cloverndpoly_monomial.h b/src/lib/monomial/cloverndpoly_monomial.h
similarity index 100%
rename from monomial/cloverndpoly_monomial.h
rename to src/lib/monomial/cloverndpoly_monomial.h
diff --git a/monomial/det_monomial.c b/src/lib/monomial/det_monomial.c
similarity index 100%
rename from monomial/det_monomial.c
rename to src/lib/monomial/det_monomial.c
diff --git a/monomial/det_monomial.h b/src/lib/monomial/det_monomial.h
similarity index 100%
rename from monomial/det_monomial.h
rename to src/lib/monomial/det_monomial.h
diff --git a/monomial/detratio_monomial.c b/src/lib/monomial/detratio_monomial.c
similarity index 100%
rename from monomial/detratio_monomial.c
rename to src/lib/monomial/detratio_monomial.c
diff --git a/monomial/detratio_monomial.h b/src/lib/monomial/detratio_monomial.h
similarity index 100%
rename from monomial/detratio_monomial.h
rename to src/lib/monomial/detratio_monomial.h
diff --git a/monomial/gauge_monomial.c b/src/lib/monomial/gauge_monomial.c
similarity index 100%
rename from monomial/gauge_monomial.c
rename to src/lib/monomial/gauge_monomial.c
diff --git a/monomial/gauge_monomial.h b/src/lib/monomial/gauge_monomial.h
similarity index 100%
rename from monomial/gauge_monomial.h
rename to src/lib/monomial/gauge_monomial.h
diff --git a/monomial/moment_energy.c b/src/lib/monomial/moment_energy.c
similarity index 100%
rename from monomial/moment_energy.c
rename to src/lib/monomial/moment_energy.c
diff --git a/monomial/moment_energy.h b/src/lib/monomial/moment_energy.h
similarity index 100%
rename from monomial/moment_energy.h
rename to src/lib/monomial/moment_energy.h
diff --git a/monomial/monitor_forces.c b/src/lib/monomial/monitor_forces.c
similarity index 100%
rename from monomial/monitor_forces.c
rename to src/lib/monomial/monitor_forces.c
diff --git a/monomial/monitor_forces.h b/src/lib/monomial/monitor_forces.h
similarity index 100%
rename from monomial/monitor_forces.h
rename to src/lib/monomial/monitor_forces.h
diff --git a/monomial/monomial.c b/src/lib/monomial/monomial.c
similarity index 100%
rename from monomial/monomial.c
rename to src/lib/monomial/monomial.c
diff --git a/monomial/monomial.h b/src/lib/monomial/monomial.h
similarity index 100%
rename from monomial/monomial.h
rename to src/lib/monomial/monomial.h
diff --git a/monomial/nddetratio_monomial.c b/src/lib/monomial/nddetratio_monomial.c
similarity index 100%
rename from monomial/nddetratio_monomial.c
rename to src/lib/monomial/nddetratio_monomial.c
diff --git a/monomial/nddetratio_monomial.h b/src/lib/monomial/nddetratio_monomial.h
similarity index 100%
rename from monomial/nddetratio_monomial.h
rename to src/lib/monomial/nddetratio_monomial.h
diff --git a/monomial/ndpoly_monomial.c b/src/lib/monomial/ndpoly_monomial.c
similarity index 100%
rename from monomial/ndpoly_monomial.c
rename to src/lib/monomial/ndpoly_monomial.c
diff --git a/monomial/ndpoly_monomial.h b/src/lib/monomial/ndpoly_monomial.h
similarity index 100%
rename from monomial/ndpoly_monomial.h
rename to src/lib/monomial/ndpoly_monomial.h
diff --git a/monomial/ndrat_monomial.c b/src/lib/monomial/ndrat_monomial.c
similarity index 100%
rename from monomial/ndrat_monomial.c
rename to src/lib/monomial/ndrat_monomial.c
diff --git a/monomial/ndrat_monomial.h b/src/lib/monomial/ndrat_monomial.h
similarity index 100%
rename from monomial/ndrat_monomial.h
rename to src/lib/monomial/ndrat_monomial.h
diff --git a/monomial/ndratcor_monomial.c b/src/lib/monomial/ndratcor_monomial.c
similarity index 100%
rename from monomial/ndratcor_monomial.c
rename to src/lib/monomial/ndratcor_monomial.c
diff --git a/monomial/ndratcor_monomial.h b/src/lib/monomial/ndratcor_monomial.h
similarity index 100%
rename from monomial/ndratcor_monomial.h
rename to src/lib/monomial/ndratcor_monomial.h
diff --git a/monomial/poly_monomial.c b/src/lib/monomial/poly_monomial.c
similarity index 100%
rename from monomial/poly_monomial.c
rename to src/lib/monomial/poly_monomial.c
diff --git a/monomial/poly_monomial.h b/src/lib/monomial/poly_monomial.h
similarity index 100%
rename from monomial/poly_monomial.h
rename to src/lib/monomial/poly_monomial.h
diff --git a/monomial/rat_monomial.c b/src/lib/monomial/rat_monomial.c
similarity index 100%
rename from monomial/rat_monomial.c
rename to src/lib/monomial/rat_monomial.c
diff --git a/monomial/rat_monomial.h b/src/lib/monomial/rat_monomial.h
similarity index 100%
rename from monomial/rat_monomial.h
rename to src/lib/monomial/rat_monomial.h
diff --git a/monomial/ratcor_monomial.c b/src/lib/monomial/ratcor_monomial.c
similarity index 100%
rename from monomial/ratcor_monomial.c
rename to src/lib/monomial/ratcor_monomial.c
diff --git a/monomial/ratcor_monomial.h b/src/lib/monomial/ratcor_monomial.h
similarity index 100%
rename from monomial/ratcor_monomial.h
rename to src/lib/monomial/ratcor_monomial.h
diff --git a/mpi_init.c b/src/lib/mpi_init.c
similarity index 100%
rename from mpi_init.c
rename to src/lib/mpi_init.c
diff --git a/mpi_init.h b/src/lib/mpi_init.h
similarity index 100%
rename from mpi_init.h
rename to src/lib/mpi_init.h
diff --git a/omp_accumulator.h b/src/lib/omp_accumulator.h
similarity index 100%
rename from omp_accumulator.h
rename to src/lib/omp_accumulator.h
diff --git a/operator.c b/src/lib/operator.c
similarity index 100%
rename from operator.c
rename to src/lib/operator.c
diff --git a/operator.h b/src/lib/operator.h
similarity index 100%
rename from operator.h
rename to src/lib/operator.h
diff --git a/operator/Block_D_psi_body.c b/src/lib/operator/Block_D_psi_body.c
similarity index 100%
rename from operator/Block_D_psi_body.c
rename to src/lib/operator/Block_D_psi_body.c
diff --git a/operator/D_psi.c b/src/lib/operator/D_psi.c
similarity index 100%
rename from operator/D_psi.c
rename to src/lib/operator/D_psi.c
diff --git a/operator/D_psi.h b/src/lib/operator/D_psi.h
similarity index 100%
rename from operator/D_psi.h
rename to src/lib/operator/D_psi.h
diff --git a/operator/D_psi_body.c b/src/lib/operator/D_psi_body.c
similarity index 100%
rename from operator/D_psi_body.c
rename to src/lib/operator/D_psi_body.c
diff --git a/operator/Dov_proj.c b/src/lib/operator/Dov_proj.c
similarity index 100%
rename from operator/Dov_proj.c
rename to src/lib/operator/Dov_proj.c
diff --git a/operator/Dov_proj.h b/src/lib/operator/Dov_proj.h
similarity index 100%
rename from operator/Dov_proj.h
rename to src/lib/operator/Dov_proj.h
diff --git a/operator/Dov_psi.c b/src/lib/operator/Dov_psi.c
similarity index 100%
rename from operator/Dov_psi.c
rename to src/lib/operator/Dov_psi.c
diff --git a/operator/Dov_psi.h b/src/lib/operator/Dov_psi.h
similarity index 100%
rename from operator/Dov_psi.h
rename to src/lib/operator/Dov_psi.h
diff --git a/operator/Hopping_Matrix.c b/src/lib/operator/Hopping_Matrix.c
similarity index 100%
rename from operator/Hopping_Matrix.c
rename to src/lib/operator/Hopping_Matrix.c
diff --git a/operator/Hopping_Matrix.h b/src/lib/operator/Hopping_Matrix.h
similarity index 100%
rename from operator/Hopping_Matrix.h
rename to src/lib/operator/Hopping_Matrix.h
diff --git a/operator/Hopping_Matrix_32.c b/src/lib/operator/Hopping_Matrix_32.c
similarity index 100%
rename from operator/Hopping_Matrix_32.c
rename to src/lib/operator/Hopping_Matrix_32.c
diff --git a/operator/Hopping_Matrix_32.h b/src/lib/operator/Hopping_Matrix_32.h
similarity index 100%
rename from operator/Hopping_Matrix_32.h
rename to src/lib/operator/Hopping_Matrix_32.h
diff --git a/operator/Hopping_Matrix_32_nocom.c b/src/lib/operator/Hopping_Matrix_32_nocom.c
similarity index 100%
rename from operator/Hopping_Matrix_32_nocom.c
rename to src/lib/operator/Hopping_Matrix_32_nocom.c
diff --git a/operator/Hopping_Matrix_nocom.c b/src/lib/operator/Hopping_Matrix_nocom.c
similarity index 100%
rename from operator/Hopping_Matrix_nocom.c
rename to src/lib/operator/Hopping_Matrix_nocom.c
diff --git a/operator/Hopping_Matrix_nocom.h b/src/lib/operator/Hopping_Matrix_nocom.h
similarity index 100%
rename from operator/Hopping_Matrix_nocom.h
rename to src/lib/operator/Hopping_Matrix_nocom.h
diff --git a/operator/Makefile.in b/src/lib/operator/Makefile.in
similarity index 100%
rename from operator/Makefile.in
rename to src/lib/operator/Makefile.in
diff --git a/operator/assign_mul_one_sw_pm_imu_inv_block_body.c b/src/lib/operator/assign_mul_one_sw_pm_imu_inv_block_body.c
similarity index 100%
rename from operator/assign_mul_one_sw_pm_imu_inv_block_body.c
rename to src/lib/operator/assign_mul_one_sw_pm_imu_inv_block_body.c
diff --git a/operator/assign_mul_one_sw_pm_imu_site_lexic_body.c b/src/lib/operator/assign_mul_one_sw_pm_imu_site_lexic_body.c
similarity index 100%
rename from operator/assign_mul_one_sw_pm_imu_site_lexic_body.c
rename to src/lib/operator/assign_mul_one_sw_pm_imu_site_lexic_body.c
diff --git a/operator/clover_accumulate_deriv.c b/src/lib/operator/clover_accumulate_deriv.c
similarity index 100%
rename from operator/clover_accumulate_deriv.c
rename to src/lib/operator/clover_accumulate_deriv.c
diff --git a/operator/clover_deriv.c b/src/lib/operator/clover_deriv.c
similarity index 100%
rename from operator/clover_deriv.c
rename to src/lib/operator/clover_deriv.c
diff --git a/operator/clover_det.c b/src/lib/operator/clover_det.c
similarity index 100%
rename from operator/clover_det.c
rename to src/lib/operator/clover_det.c
diff --git a/operator/clover_inline.h b/src/lib/operator/clover_inline.h
similarity index 100%
rename from operator/clover_inline.h
rename to src/lib/operator/clover_inline.h
diff --git a/operator/clover_invert.c b/src/lib/operator/clover_invert.c
similarity index 100%
rename from operator/clover_invert.c
rename to src/lib/operator/clover_invert.c
diff --git a/operator/clover_leaf.c b/src/lib/operator/clover_leaf.c
similarity index 100%
rename from operator/clover_leaf.c
rename to src/lib/operator/clover_leaf.c
diff --git a/operator/clover_leaf.h b/src/lib/operator/clover_leaf.h
similarity index 100%
rename from operator/clover_leaf.h
rename to src/lib/operator/clover_leaf.h
diff --git a/operator/clover_term.c b/src/lib/operator/clover_term.c
similarity index 100%
rename from operator/clover_term.c
rename to src/lib/operator/clover_term.c
diff --git a/operator/clovertm_operators.c b/src/lib/operator/clovertm_operators.c
similarity index 100%
rename from operator/clovertm_operators.c
rename to src/lib/operator/clovertm_operators.c
diff --git a/operator/clovertm_operators.h b/src/lib/operator/clovertm_operators.h
similarity index 100%
rename from operator/clovertm_operators.h
rename to src/lib/operator/clovertm_operators.h
diff --git a/operator/clovertm_operators_32.c b/src/lib/operator/clovertm_operators_32.c
similarity index 100%
rename from operator/clovertm_operators_32.c
rename to src/lib/operator/clovertm_operators_32.c
diff --git a/operator/clovertm_operators_32.h b/src/lib/operator/clovertm_operators_32.h
similarity index 100%
rename from operator/clovertm_operators_32.h
rename to src/lib/operator/clovertm_operators_32.h
diff --git a/operator/halfspinor_body.c b/src/lib/operator/halfspinor_body.c
similarity index 100%
rename from operator/halfspinor_body.c
rename to src/lib/operator/halfspinor_body.c
diff --git a/operator/halfspinor_body_32.c b/src/lib/operator/halfspinor_body_32.c
similarity index 100%
rename from operator/halfspinor_body_32.c
rename to src/lib/operator/halfspinor_body_32.c
diff --git a/operator/halfspinor_hopping.h b/src/lib/operator/halfspinor_hopping.h
similarity index 100%
rename from operator/halfspinor_hopping.h
rename to src/lib/operator/halfspinor_hopping.h
diff --git a/operator/halfspinor_hopping_32.h b/src/lib/operator/halfspinor_hopping_32.h
similarity index 100%
rename from operator/halfspinor_hopping_32.h
rename to src/lib/operator/halfspinor_hopping_32.h
diff --git a/operator/hopping.h b/src/lib/operator/hopping.h
similarity index 100%
rename from operator/hopping.h
rename to src/lib/operator/hopping.h
diff --git a/operator/hopping_body_dbl.c b/src/lib/operator/hopping_body_dbl.c
similarity index 100%
rename from operator/hopping_body_dbl.c
rename to src/lib/operator/hopping_body_dbl.c
diff --git a/operator/hopping_sgl.c b/src/lib/operator/hopping_sgl.c
similarity index 100%
rename from operator/hopping_sgl.c
rename to src/lib/operator/hopping_sgl.c
diff --git a/operator/mul_one_pm_imu_inv_body.c b/src/lib/operator/mul_one_pm_imu_inv_body.c
similarity index 100%
rename from operator/mul_one_pm_imu_inv_body.c
rename to src/lib/operator/mul_one_pm_imu_inv_body.c
diff --git a/operator/mul_one_pm_imu_sub_mul_body.c b/src/lib/operator/mul_one_pm_imu_sub_mul_body.c
similarity index 100%
rename from operator/mul_one_pm_imu_sub_mul_body.c
rename to src/lib/operator/mul_one_pm_imu_sub_mul_body.c
diff --git a/operator/tm_operators.c b/src/lib/operator/tm_operators.c
similarity index 100%
rename from operator/tm_operators.c
rename to src/lib/operator/tm_operators.c
diff --git a/operator/tm_operators.h b/src/lib/operator/tm_operators.h
similarity index 100%
rename from operator/tm_operators.h
rename to src/lib/operator/tm_operators.h
diff --git a/operator/tm_operators_32.c b/src/lib/operator/tm_operators_32.c
similarity index 100%
rename from operator/tm_operators_32.c
rename to src/lib/operator/tm_operators_32.c
diff --git a/operator/tm_operators_32.h b/src/lib/operator/tm_operators_32.h
similarity index 100%
rename from operator/tm_operators_32.h
rename to src/lib/operator/tm_operators_32.h
diff --git a/operator/tm_operators_nd.c b/src/lib/operator/tm_operators_nd.c
similarity index 100%
rename from operator/tm_operators_nd.c
rename to src/lib/operator/tm_operators_nd.c
diff --git a/operator/tm_operators_nd.h b/src/lib/operator/tm_operators_nd.h
similarity index 100%
rename from operator/tm_operators_nd.h
rename to src/lib/operator/tm_operators_nd.h
diff --git a/operator/tm_operators_nd_32.c b/src/lib/operator/tm_operators_nd_32.c
similarity index 100%
rename from operator/tm_operators_nd_32.c
rename to src/lib/operator/tm_operators_nd_32.c
diff --git a/operator/tm_operators_nd_32.h b/src/lib/operator/tm_operators_nd_32.h
similarity index 100%
rename from operator/tm_operators_nd_32.h
rename to src/lib/operator/tm_operators_nd_32.h
diff --git a/operator/tm_sub_Hopping_Matrix.c b/src/lib/operator/tm_sub_Hopping_Matrix.c
similarity index 100%
rename from operator/tm_sub_Hopping_Matrix.c
rename to src/lib/operator/tm_sub_Hopping_Matrix.c
diff --git a/operator/tm_sub_Hopping_Matrix.h b/src/lib/operator/tm_sub_Hopping_Matrix.h
similarity index 100%
rename from operator/tm_sub_Hopping_Matrix.h
rename to src/lib/operator/tm_sub_Hopping_Matrix.h
diff --git a/operator/tm_times_Hopping_Matrix.c b/src/lib/operator/tm_times_Hopping_Matrix.c
similarity index 100%
rename from operator/tm_times_Hopping_Matrix.c
rename to src/lib/operator/tm_times_Hopping_Matrix.c
diff --git a/operator/tm_times_Hopping_Matrix.h b/src/lib/operator/tm_times_Hopping_Matrix.h
similarity index 100%
rename from operator/tm_times_Hopping_Matrix.h
rename to src/lib/operator/tm_times_Hopping_Matrix.h
diff --git a/operator_types.h b/src/lib/operator_types.h
similarity index 100%
rename from operator_types.h
rename to src/lib/operator_types.h
diff --git a/overrelaxation.c b/src/lib/overrelaxation.c
similarity index 99%
rename from overrelaxation.c
rename to src/lib/overrelaxation.c
index 2c2e486f7..91d95fa30 100644
--- a/overrelaxation.c
+++ b/src/lib/overrelaxation.c
@@ -205,7 +205,7 @@ void overrel_sweep() {
   static su3 v;
   for (mu = 0; mu < 4; mu++) {
     for (ix = 0; ix < VOLUME; ix++) {
-      get_staples(&v, ix, mu, g_gauge_field);
+      get_staples(&v, ix, mu, (const su3 **)g_gauge_field);
       flip_subgroup(ix, mu, v, 1);
       flip_subgroup(ix, mu, v, 2);
       flip_subgroup(ix, mu, v, 3);
diff --git a/overrelaxation.h b/src/lib/overrelaxation.h
similarity index 100%
rename from overrelaxation.h
rename to src/lib/overrelaxation.h
diff --git a/parallel_io.h b/src/lib/parallel_io.h
similarity index 100%
rename from parallel_io.h
rename to src/lib/parallel_io.h
diff --git a/phmc.c b/src/lib/phmc.c
similarity index 100%
rename from phmc.c
rename to src/lib/phmc.c
diff --git a/phmc.h b/src/lib/phmc.h
similarity index 100%
rename from phmc.h
rename to src/lib/phmc.h
diff --git a/prepare_source.c b/src/lib/prepare_source.c
similarity index 100%
rename from prepare_source.c
rename to src/lib/prepare_source.c
diff --git a/prepare_source.h b/src/lib/prepare_source.h
similarity index 100%
rename from prepare_source.h
rename to src/lib/prepare_source.h
diff --git a/profiling/hmc/Readme.md b/src/lib/profiling/hmc/Readme.md
similarity index 100%
rename from profiling/hmc/Readme.md
rename to src/lib/profiling/hmc/Readme.md
diff --git a/profiling/hmc/example_profile.pdf b/src/lib/profiling/hmc/example_profile.pdf
similarity index 100%
rename from profiling/hmc/example_profile.pdf
rename to src/lib/profiling/hmc/example_profile.pdf
diff --git a/profiling/hmc/profile.Rmd b/src/lib/profiling/hmc/profile.Rmd
similarity index 100%
rename from profiling/hmc/profile.Rmd
rename to src/lib/profiling/hmc/profile.Rmd
diff --git a/profiling/hmc/timing.R b/src/lib/profiling/hmc/timing.R
similarity index 100%
rename from profiling/hmc/timing.R
rename to src/lib/profiling/hmc/timing.R
diff --git a/profiling/hmc_mk2/.gitignore b/src/lib/profiling/hmc_mk2/.gitignore
similarity index 100%
rename from profiling/hmc_mk2/.gitignore
rename to src/lib/profiling/hmc_mk2/.gitignore
diff --git a/profiling/hmc_mk2/README.md b/src/lib/profiling/hmc_mk2/README.md
similarity index 100%
rename from profiling/hmc_mk2/README.md
rename to src/lib/profiling/hmc_mk2/README.md
diff --git a/profiling/hmc_mk2/logs/example_log.out b/src/lib/profiling/hmc_mk2/logs/example_log.out
similarity index 100%
rename from profiling/hmc_mk2/logs/example_log.out
rename to src/lib/profiling/hmc_mk2/logs/example_log.out
diff --git a/profiling/hmc_mk2/make_profile.R b/src/lib/profiling/hmc_mk2/make_profile.R
similarity index 100%
rename from profiling/hmc_mk2/make_profile.R
rename to src/lib/profiling/hmc_mk2/make_profile.R
diff --git a/profiling/hmc_mk2/profile.Rmd b/src/lib/profiling/hmc_mk2/profile.Rmd
similarity index 100%
rename from profiling/hmc_mk2/profile.Rmd
rename to src/lib/profiling/hmc_mk2/profile.Rmd
diff --git a/src/lib/qphix/qphix_base_classes.hpp b/src/lib/qphix/qphix_base_classes.hpp
new file mode 100644
index 000000000..26015e3a2
--- /dev/null
+++ b/src/lib/qphix/qphix_base_classes.hpp
@@ -0,0 +1,771 @@
+// Copyright © 2017 Martin Ueding <dev@martin-ueding.de>
+// Licensed under the [BSD-3-Clause](https://opensource.org/licenses/BSD-3-Clause).
+
+// Due to github issue #404, the helper functions to apply the full QPhiX operator
+// are currently disabled because they conflict with the new interfaces in QPhiX
+// itself. If required, these should be rewritten to use these interfaces
+// rather than the base classes in qphix_base_classes.hpp
+
+// This file should be deprecated or updated to provide any functionality
+// not covered by QPhiX itself.
+
+/**
+  \file Additions to QPhiX that are only needed for tmLQCD.
+
+  In the original QPhiX, there are only Wilson fermions and Wilson clover
+  fermions. The Dslash operators have a different call signature (the latter
+  requiring a clover term), so there is no common base class. With the addition
+  of Wilson twisted mass (Mario) and Wilson twisted clover (Peter), there are
+  now two instances of the Dslash that have the same signature. In order to
+  write a more general even-odd source preparation and solution reconstruction
+  code, a common base class for non-clover and clover is desired. In order to
+  leave the QPhiX code untouched (for now), this code lives here in tmLQCD.
+  */
+
+#pragma once
+
+#include <qphix/blas_new_c.h>
+#include <qphix/clover_dslash_def.h>
+#include <qphix/dslash_def.h>
+#include <qphix/geometry.h>
+#include <qphix/tm_clov_dslash_def.h>
+#include <qphix/tm_dslash_def.h>
+
+#include <cassert>
+
+namespace tmlqcd {
+
+namespace {
+size_t constexpr re = 0;
+size_t constexpr im = 1;
+int const n_blas_simt = 1;
+
+// The even checkerboard is given by ( (x + y + z + t ) & 1 == 0 ) -> cb0 is even
+int constexpr cb_even = 0;
+int constexpr cb_odd = 1;
+}
+
+/**
+  Complex multiplication accumulate.
+
+  Computes \f$ (r + \mathrm i i) += (a + \mathrm i b) * (c + \mathrm i d) \f$.
+  */
+template <typename FT>
+void cplx_mul_acc(FT &r_out, FT &i_out, FT const &a, FT const &b, FT const &c, FT const &d) {
+  r_out += a * c - b * d;
+  i_out += a * d + b * c;
+}
+
+/**
+  Wrapper for the clover multiplication function.
+
+  The `struct` is needed in order to allow for partial template specialization in the `Clover`
+  parameter.
+
+  \tparam Clover Type of clover block to use, must be a type from Geometry such that there exists a
+  specialization for it.
+  */
+template <typename FT, int veclen, int soalen, bool compress12, typename Clover>
+struct InnerCloverProduct {
+  /**
+  Multiplies the clover term for a single lattice site with a spinor.
+
+  This function is intended to be used in a loop over all lattice sites. It is expected from the
+  caller to have figured out all the correct indices. There are template specializations for the two
+  different types of clover term that are used in QPhiX.
+
+  \param[out] out Output spinor block. It is assumed to be zeroed properly, the function will just
+  accumulate values into that output variable. Use \ref QPhiX::zeroSpinor for that.
+  \param[in] in Input spinor block.
+  \param[in] clover Single clover block that contains the lattice site of the spinor.
+  \param[in] xi SIMD index for the arrays with length `soalen`, as in the spinors.
+  \param[in] veclen_idx SIMD index for the arrays with length `veclen`, as in the clover term.
+  */
+  static void multiply(
+      typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock &out,
+      typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock const &in,
+      Clover const &clover, int const xi, int const veclen_idx);
+};
+
+template <typename FT, int veclen, int soalen, bool compress12>
+struct InnerCloverProduct<FT, veclen, soalen, compress12,
+                          typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::CloverBlock> {
+  static void multiply(
+      typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock &spinor_out,
+      typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock const &spinor_in,
+      typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::CloverBlock const &clov_block,
+      int const xi, int const veclen_idx) {
+    // The clover term is block-diagonal in spin. Therefore we need
+    // to iterate over the two blocks of spin.
+    for (auto s_block : {0, 1}) {
+      // Extract the diagonal and triangular parts.
+      auto const &diag_in = s_block == 0 ? clov_block.diag1 : clov_block.diag2;
+      auto const &off_diag_in = s_block == 0 ? clov_block.off_diag1 : clov_block.off_diag2;
+      // Input two-spinor component.
+      for (auto two_s_in : {0, 1}) {
+        // Reconstruct four spinor index.
+        auto const four_s_in = 2 * s_block + two_s_in;
+        // Output two-spinor component.
+        for (auto two_s_out : {0, 1}) {
+          // Reconstruct four spinor index.
+          auto const four_s_out = 2 * s_block + two_s_out;
+          // Input color.
+          for (auto c_in : {0, 1, 2}) {
+            // Spin-color index (0, ..., 5).
+            auto const sc_in = 3 * two_s_in + c_in;
+            // Output color.
+            for (auto c_out : {0, 1, 2}) {
+              // Spin-color index (0, ..., 5).
+              auto const sc_out = 3 * two_s_out + c_out;
+
+              // See `qphix-codegen` file `dslash_common.cc`
+              // function
+              // `clover_term` for the index manipulations done
+              // here.
+
+              // Using separate loops over the actual indices is
+              // probably
+              // faster than the branching in the innermost loop.
+
+              if (sc_out == sc_in) {
+                cplx_mul_acc(spinor_out[c_out][four_s_out][re][xi],
+                             spinor_out[c_out][four_s_out][im][xi], diag_in[sc_in][veclen_idx],
+                             QPhiX::rep<FT,double>(0.0), spinor_in[c_in][four_s_in][re][xi],
+                             spinor_in[c_in][four_s_in][im][xi]);
+              } else if (sc_out < sc_in) {
+                auto const idx15 = sc_in * (sc_in - 1) / 2 + sc_out;
+                cplx_mul_acc(
+                    spinor_out[c_out][four_s_out][re][xi], spinor_out[c_out][four_s_out][im][xi],
+                    off_diag_in[idx15][re][veclen_idx],
+                    // aww hell, maybe one should just add negation to QPhiX::half ?
+                    QPhiX::rep<FT,double>(-QPhiX::rep<double,FT>(off_diag_in[idx15][im][veclen_idx])),
+                    spinor_in[c_in][four_s_in][re][xi], spinor_in[c_in][four_s_in][im][xi]);
+              } else {
+                auto const idx15 = sc_out * (sc_out - 1) / 2 + sc_in;
+                cplx_mul_acc(
+                    spinor_out[c_out][four_s_out][re][xi], spinor_out[c_out][four_s_out][im][xi],
+                    off_diag_in[idx15][re][veclen_idx], off_diag_in[idx15][im][veclen_idx],
+                    spinor_in[c_in][four_s_in][re][xi], spinor_in[c_in][four_s_in][im][xi]);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+template <typename FT, int veclen, int soalen, bool compress12>
+struct InnerCloverProduct<
+    FT, veclen, soalen, compress12,
+    typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FullCloverBlock> {
+  static void multiply(
+      typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock &spinor_out,
+      typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock const &spinor_in,
+      typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FullCloverBlock const &clov_block,
+      int const xi, int const veclen_idx) {
+    // The clover term is block-diagonal in spin. Therefore we need
+    // to iterate over the two blocks of spin.
+    for (auto s_block : {0, 1}) {
+      // handy reference to half-spinor block
+      auto const &block_in = s_block == 0 ? clov_block.block1 : clov_block.block2;
+      // Input two-spinor component.
+      for (auto two_s_in : {0, 1}) {
+        // Reconstruct four spinor index.
+        auto const four_s_in = 2 * s_block + two_s_in;
+        // Output two-spinor component.
+        for (auto two_s_out : {0, 1}) {
+          // Reconstruct four spinor index.
+          auto const four_s_out = 2 * s_block + two_s_out;
+          // Input color.
+          for (auto c_in : {0, 1, 2}) {
+            // Spin-color index (0, ..., 5).
+            auto const sc_in = 3 * two_s_in + c_in;
+            // Output color.
+            for (auto c_out : {0, 1, 2}) {
+              // Spin-color index (0, ..., 5).
+              auto const sc_out = 3 * two_s_out + c_out;
+
+              cplx_mul_acc(
+                  spinor_out[c_out][four_s_out][re][xi], spinor_out[c_out][four_s_out][im][xi],
+                  block_in[sc_out][sc_in][re][veclen_idx], block_in[sc_out][sc_in][im][veclen_idx],
+                  spinor_in[c_in][four_s_in][re][xi], spinor_in[c_in][four_s_in][im][xi]);
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+/**
+  Multiplies a checkerboarded QPhiX Clover term with a checkerboarded QPhiX spinor.
+
+  Padding is taken care of. A test case for (a copy of) this function exists in QPhiX.
+
+  If the preprocessor macro `PRINT_MAPPING` is defined, it will print out the mapping of `(x, y, z,
+  t)` coordinates to block indices. Also it will check that each block is accessed the proper number
+  of times, that is `soalen` for spinors and `veclen` for clover blocks.
+
+  \param[out] out Output spinor
+  \param[in] in Input spinor
+  \param[in] clover Clover block
+  \param[in] geom Geometry object holding the dimension of clover and spinor
+  */
+template <typename FT, int veclen, int soalen, bool compress12, typename Clover>
+void clover_product(
+    typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock *const out,
+    typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock const *const in,
+    Clover *clover, ::QPhiX::Geometry<FT, veclen, soalen, compress12> &geom) {
+  ::QPhiX::zeroSpinor<FT, veclen, soalen, compress12>(out, geom, n_blas_simt);
+
+#ifdef PRINT_MAPPING
+  std::vector<int> spin_touches(geom.getPxyz() * geom.Nt(), 0);
+  std::vector<int> clover_touches(geom.getPxyz() * geom.Nt() * soalen / veclen, 0);
+
+  std::cout << std::setw(3) << "x" << std::setw(3) << "y" << std::setw(3) << "z" << std::setw(3)
+            << "t"
+            << ":" << std::setw(5) << "spin" << std::setw(5) << "clov"
+            << "\n";
+#endif
+
+  // Iterate through all the blocks.
+  for (int t = 0; t < geom.Nt(); ++t) {
+    for (int z = 0; z < geom.Nz(); ++z) {
+      for (int y = 0; y < geom.Ny(); ++y) {
+        for (int x = 0; x < geom.Nxh(); ++x) {
+          // First element in the current XY plane at desired Z and T.
+          auto const xyBase = t * geom.getPxyz() + z * geom.getPxy();
+          // Index of the SoA along the X direction.
+          auto const xb = x / soalen;
+          // Index within the SoA.
+          auto const xi = x % soalen;
+          // Global spin block index.
+          auto const spin_block_idx = xb + geom.Nxh() / soalen * y + xyBase;
+          // Global clover/gauge block index.
+          auto const clov_block_idx =
+              xb + (y / geom.nGY()) * geom.Nxh() / soalen + xyBase / geom.nGY();
+          // Index of the SoA structure within the current tile.
+          // auto const tile = (geom.Nxh() / soalen * y + xyBase) % geom.nGY();
+          auto const tile = y % geom.nGY();
+          // Vector index for clover/gauge. The SoA index only runs to
+          // `soalen`, this index needs to run to `veclen`, that is across the
+          // various SoA within the tile.
+          auto const veclen_idx = soalen * tile + xi;
+
+#ifdef PRINT_MAPPING
+          ++spin_touches[spin_block_idx];
+          ++clover_touches[clov_block_idx];
+
+          std::cout << std::setw(3) << x << std::setw(3) << y << std::setw(3) << z << std::setw(3)
+                    << t << ":" << std::setw(5) << spin_block_idx << std::setw(5) << clov_block_idx
+                    << "\n";
+#endif
+
+          assert(xi + xb * soalen == x);
+
+          // References to the objects at desired block.
+          auto const &clov_block = clover[clov_block_idx];
+          auto const &spinor_in = in[spin_block_idx];
+          auto &spinor_out = out[spin_block_idx];
+
+          InnerCloverProduct<FT, veclen, soalen, compress12, Clover>::multiply(
+              spinor_out, spinor_in, clov_block, xi, veclen_idx);
+        }
+      }
+    }
+  }
+
+#ifdef PRINT_MAPPING
+  std::cout << std::flush;
+
+  // Make sure that each block got touched the correct number of times.
+  for (int i = 0; i != spin_touches.size(); ++i) {
+    if (spin_touches[i] != soalen) {
+      std::cout << "Spin missmatch: Block " << std::setw(4) << i << " accessed " << std::setw(4)
+                << spin_touches[i] << " times instead of " << soalen << "\n";
+    }
+  }
+
+  for (int i = 0; i != clover_touches.size(); ++i) {
+    if (clover_touches[i] != veclen) {
+      std::cout << "Clover missmatch: Block " << std::setw(4) << i << " accessed " << std::setw(4)
+                << clover_touches[i] << " times instead of " << veclen << "\n";
+    }
+  }
+
+  std::cout << std::flush;
+#endif
+}
+
+/**
+  Abstract base class for all single-flavor Dslash variants.
+
+  There are four Dslash operators which are implemented in QPhiX:
+
+  - Wilson
+  - Wilson clover
+  - Wilson twisted mass
+  - Wilson clover with twisted mass
+
+  Each of these has the actual Dslash operation and a so-called “achimbdpsi” operation. These act
+  on four-spinors given a gauge field. This base class provides a uniform interface to all four
+  kinds.
+
+  This code should eventually be migrated into the QPhiX repository. Currently these classes are
+  mere delegators. In the QPhiX repository, the actual classes there should be used as concrete
+  classes.
+  */
+template <typename FT, int veclen, int soalen, bool compress12>
+class Dslash {
+ public:
+  typedef ::QPhiX::Geometry<FT, veclen, soalen, compress12> Geom;
+  typedef typename Geom::FourSpinorBlock Spinor;
+  typedef typename Geom::SU3MatrixBlock SU3MatrixBlock;
+
+  explicit Dslash(Geom *geom, double const t_boundary_, double const aniso_coeff_S_,
+                  double const aniso_coeff_T_, double const mass_, bool use_tbc_[4] = nullptr,
+                  double tbc_phases_[4][2] = nullptr)
+      : geom(geom), t_boundary(t_boundary_), aniso_coeff_S(aniso_coeff_S_),
+        aniso_coeff_T(aniso_coeff_T_), mass(mass_) {}
+
+  /// Virtual destructor so that derived operators can be destroyed through a `Dslash` pointer.
+  virtual ~Dslash() = default;
+
+  /**
+    Computes \f$ \psi_\mathrm o = A_\mathrm{oo} \chi_\mathrm o \f$.
+
+    The actual definition of the matrix \f$ A_\mathrm{oo} \f$ is
+    implementation dependent and can be the mass factor \f$ \alpha = 4 + m
+    \f$ for plain Wilson or something more complicated for twisted mass.
+
+    \param[out] out Output spinor \f$ \psi \f$.
+    \param[in] in Input spinor \f$ \chi \f$.
+    */
+  virtual void A_chi(Spinor *const out, Spinor const *const in, int const isign, int const cb) = 0;
+
+  /**
+    Computes \f$ \psi_\mathrm e = A_\mathrm{ee}^{-1} \chi_\mathrm e \f$.
+
+    \param[out] out Output spinor \f$ \psi \f$.
+    \param[in] in Input spinor \f$ \chi \f$.
+    */
+  virtual void A_inv_chi(Spinor *const out, Spinor const *const in, int const isign,
+                         int const cb) = 0;
+
+  /**
+    Forwarder for the `dslash`.
+
+    This will call the `dslash` function of the respective QPhiX dslash class. There is a subtle
+    difference between the Wilson and all other cases. The Wilson dslash is just the hopping matrix,
+    just the operator \f$ D \f$. For every other case (clover, twisted mass, twisted mass clover),
+    the `dslash` member function will compute \f$ A^{-1} D \f$. In the Wilson case, this \f$ A =
+    \alpha = 4 + m = 1/(2 \kappa) \f$. Since that is _not_ included in the Wilson `dslash`, you will
+    obtain different results when using WilsonDslash::dslash and WilsonTMDslash::dslash with \f$
+    \mu = 0 \f$.
+
+    \todo Make this member function `const`. For this the member function in
+    QPhiX that is called internally must be marked `const` as well.
+    */
+  virtual void dslash(Spinor *const res, const Spinor *const psi, const SU3MatrixBlock *const u,
+                      int const isign, int const cb) = 0;
+
+  /**
+    Always plain Wilson dslash.
+
+    In contrast to the \ref dslash member function which just forwards the implementation of QPhiX,
+    this will always give you the “naked” plain Wilson dslash without any factors of \f$ A^{-1} \f$
+    applied.
+    */
+  virtual void plain_dslash(Spinor *const res, const Spinor *const psi,
+                            const SU3MatrixBlock *const u, int const isign, int const cb) {
+    // XXX Perhaps rather implement this with an instance of the WilsonDslash instead?
+
+    auto tmp = QPhiX::makeFourSpinorHandle(*geom);
+    dslash(tmp.get(), psi, u, isign, cb);
+    A_chi(res, tmp.get(), isign, cb);
+  }
+
+  /**
+    Always “dressed” dslash.
+
+    This computes \f$ A^{-1} D \f$ for all variants. In the Wilson case, this will give \f$
+    \alpha^{-1} D \f$.
+    */
+  virtual void A_inv_dslash(Spinor *const res, const Spinor *const psi,
+                            const SU3MatrixBlock *const u, int const isign, int const cb) {
+    dslash(res, psi, u, isign, cb);
+  }
+
+  /**
+    Forwarder for the `achimbdpsi`.
+
+    \todo Make this member function `const`. For this the member function in QPhiX that is called
+    internally must be marked `const` as well.
+    */
+  virtual void achimbdpsi(Spinor *const res, const Spinor *const psi, const Spinor *const chi,
+                          const SU3MatrixBlock *const u, double const alpha, double const beta,
+                          int const isign, int const cb) = 0;
+
+  /**
+    Prepares the sources on the odd checkerboard.
+
+    This computes
+    \f[
+        \tilde b_o = \frac 12 D_{oe} M_{ee}^{-1} b_e + b_o \,.
+    \f]
+
+    \param[out] tilde_b_odd Prepared source
+    \param[in] b_even Source (right hand side) on the even lattice sites
+    \param[in] b_odd Source on the odd lattice sites
+    \param[in] u Gauge field on the odd lattice sites
+    */
+  virtual void prepare_source(Spinor *const tilde_b_odd, Spinor const *const b_even,
+                              Spinor const *const b_odd, SU3MatrixBlock const *const u);
+
+  /**
+    Reconstructs the solution on the even lattice sites.
+
+    This computes
+    \f[
+        x_e = M_{ee}^{-1} \left( b_e - \frac 12 D_{eo} x_o \right) \,.
+    \f]
+
+    \param[out] x_even Solution on the even lattice sites
+    \param[in] b_even Source (right hand side) on the even lattice sites
+    \param[in] x_odd Solution on the odd lattice sites
+    \param[in] u Gauge field on the even lattice sites
+    */
+  virtual void reconstruct_solution(Spinor *const x_even, Spinor const *const b_even,
+                                    Spinor const *const x_odd, SU3MatrixBlock const *const u);
+
+  Geom *getGeometry() const { return geom; }
+
+ private:
+  Geom *const geom;
+
+  double const t_boundary;
+  double const aniso_coeff_S;
+  double const aniso_coeff_T;
+  double const mass;
+};
+
+template <typename FT, int veclen, int soalen, bool compress12>
+class WilsonDslash : public Dslash<FT, veclen, soalen, compress12> {
+ public:
+  typedef typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock Spinor;
+  typedef typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::SU3MatrixBlock SU3MatrixBlock;
+
+  WilsonDslash(::QPhiX::Geometry<FT, veclen, soalen, compress12> *geom_, double const t_boundary_,
+               double const aniso_coeff_S_, double const aniso_coeff_T_, double const mass_,
+               bool use_tbc_[4] = nullptr, double tbc_phases_[4][2] = nullptr)
+      : Dslash<FT, veclen, soalen, compress12>(geom_, t_boundary_, aniso_coeff_S_, aniso_coeff_T_,
+                                               mass_, use_tbc_, tbc_phases_),
+        upstream_dslash(geom_, t_boundary_, aniso_coeff_S_, aniso_coeff_T_, use_tbc_, tbc_phases_),
+        mass_factor_alpha(4.0 + mass_),
+        mass_factor_beta(1.0 / (4.0 * mass_factor_alpha)) {}
+
+  void A_chi(Spinor *const out, Spinor const *const in, int const isign_ignored,
+             int const cb_ignored) override {
+    int const n_blas_simt = 1;
+    ::QPhiX::axy(mass_factor_alpha, in, out, upstream_dslash.getGeometry(), n_blas_simt);
+  }
+
+  void A_inv_chi(Spinor *const out, Spinor const *const in, int const isign_ignored,
+                 int const cb_ignored) override {
+    int const n_blas_simt = 1;
+    ::QPhiX::axy(1.0 / mass_factor_alpha, in, out, upstream_dslash.getGeometry(), n_blas_simt);
+  }
+
+  void dslash(Spinor *const res, const Spinor *const psi, const SU3MatrixBlock *const u,
+              int const isign, int const cb) override {
+    upstream_dslash.dslash(res, psi, u, isign, cb);
+  }
+
+  void plain_dslash(Spinor *const res, const Spinor *const psi, const SU3MatrixBlock *const u,
+                    int const isign, int const cb) override {
+    dslash(res, psi, u, isign, cb);
+  };
+
+  void A_inv_dslash(Spinor *const res, const Spinor *const psi, const SU3MatrixBlock *const u,
+                    int const isign, int const cb) override {
+    auto tmp = QPhiX::makeFourSpinorHandle(upstream_dslash.getGeometry());
+    dslash(tmp.get(), psi, u, isign, cb);
+    A_inv_chi(res, tmp.get(), isign, cb);
+  };
+
+  void achimbdpsi(Spinor *const res, const Spinor *const psi, const Spinor *const chi,
+                  const SU3MatrixBlock *const u, double const alpha, double const beta,
+                  int const isign, int const cb) override {
+    upstream_dslash.dslashAChiMinusBDPsi(res, psi, chi, u, alpha, beta, isign, cb);
+  }
+
+ private:
+  ::QPhiX::Dslash<FT, veclen, soalen, compress12> upstream_dslash;
+
+  double const mass_factor_alpha;
+  double const mass_factor_beta;
+};
+
+template <typename FT, int veclen, int soalen, bool compress12>
+class WilsonTMDslash : public Dslash<FT, veclen, soalen, compress12> {
+ public:
+  typedef typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock Spinor;
+  typedef typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::SU3MatrixBlock SU3MatrixBlock;
+
+  WilsonTMDslash(::QPhiX::Geometry<FT, veclen, soalen, compress12> *geom_, double const t_boundary_,
+                 double const aniso_coeff_S_, double const aniso_coeff_T_, double const mass_,
+                 double const twisted_mass_, bool use_tbc_[4] = nullptr,
+                 double tbc_phases_[4][2] = nullptr)
+      : Dslash<FT, veclen, soalen, compress12>(geom_, t_boundary_, aniso_coeff_S_, aniso_coeff_T_,
+                                               mass_, use_tbc_, tbc_phases_),
+        upstream_dslash(geom_, t_boundary_, aniso_coeff_S_, aniso_coeff_T_, mass_, twisted_mass_,
+                        use_tbc_, tbc_phases_),
+        mass_factor_alpha(4.0 + mass_),
+        mass_factor_beta(0.25),
+        derived_mu(twisted_mass_ / mass_factor_alpha),
+        derived_mu_inv(mass_factor_alpha /
+                       (mass_factor_alpha * mass_factor_alpha + twisted_mass_ * twisted_mass_)) {}
+
+  void A_chi(Spinor *const out, Spinor const *const in, int const isign,
+             int const cb_ignored) override {
+    helper_A_chi(out, in, -derived_mu * isign, mass_factor_alpha);
+  }
+
+  void A_inv_chi(Spinor *const out, Spinor const *const in, int const isign,
+                 int const cb_ignored) override {
+    helper_A_chi(out, in, derived_mu * isign, derived_mu_inv);
+  }
+
+  void dslash(Spinor *const res, const Spinor *const psi, const SU3MatrixBlock *const u,
+              int const isign, int const cb) override {
+    upstream_dslash.dslash(res, psi, u, isign, cb);
+  }
+
+  void achimbdpsi(Spinor *const res, const Spinor *const psi, const Spinor *const chi,
+                  const SU3MatrixBlock *const u, double const alpha, double const beta,
+                  int const isign, int const cb) override {
+    upstream_dslash.dslashAChiMinusBDPsi(res, psi, chi, u, alpha, beta, isign, cb);
+  }
+
+ private:
+  void helper_A_chi(Spinor *const out, Spinor const *const in, double const factor_a,
+                    double const factor_b);
+
+  ::QPhiX::TMDslash<FT, veclen, soalen, compress12> upstream_dslash;
+
+  double const mass_factor_alpha;
+  double const mass_factor_beta;
+  double const derived_mu;
+  double const derived_mu_inv;
+};
+
+template <typename FT, int veclen, int soalen, bool compress12>
+class WilsonClovDslash : public Dslash<FT, veclen, soalen, compress12> {
+ public:
+  typedef typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock Spinor;
+  typedef typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::SU3MatrixBlock SU3MatrixBlock;
+  typedef typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::CloverBlock CloverBlock;
+
+  WilsonClovDslash(::QPhiX::Geometry<FT, veclen, soalen, compress12> *geom_,
+                   double const t_boundary_, double const aniso_coeff_S_,
+                   double const aniso_coeff_T_, double const mass_,
+                   CloverBlock *const (&clover_)[2], CloverBlock *const (&inv_clover_)[2],
+                   bool use_tbc_[4] = nullptr, double tbc_phases_[4][2] = nullptr)
+      : Dslash<FT, veclen, soalen, compress12>(geom_, t_boundary_, aniso_coeff_S_, aniso_coeff_T_,
+                                               mass_, use_tbc_, tbc_phases_),
+        upstream_dslash(geom_, t_boundary_, aniso_coeff_S_, aniso_coeff_T_, use_tbc_, tbc_phases_),
+        mass_factor_alpha(4.0 + mass_),
+        mass_factor_beta(1.0 / (4.0 * mass_factor_alpha)) {
+    for (int cb : {0, 1}) {
+      clover[cb] = clover_[cb];
+      inv_clover[cb] = inv_clover_[cb];
+    }
+  }
+
+  void A_chi(Spinor *const out, Spinor const *const in, int const isign_ignored,
+             int const cb) override {
+    clover_product(out, in, clover[cb], upstream_dslash.getGeometry());
+  }
+
+  void A_inv_chi(Spinor *const out, Spinor const *const in, int const isign_ignored,
+                 int const cb) override {
+    clover_product(out, in, inv_clover[cb], upstream_dslash.getGeometry());
+  }
+
+  void dslash(Spinor *const res, const Spinor *const psi, const SU3MatrixBlock *const u,
+              int const isign, int const cb) override {
+    upstream_dslash.dslash(res, psi, u, inv_clover[cb], isign, cb);
+  }
+
+  void achimbdpsi(Spinor *const res, const Spinor *const psi, const Spinor *const chi,
+                  const SU3MatrixBlock *const u, double const alpha, double const beta,
+                  int const isign, int const cb) override {
+    upstream_dslash.dslashAChiMinusBDPsi(res, psi, chi, u, clover[cb], mass_factor_beta, isign, cb);
+  }
+
+ private:
+  ::QPhiX::ClovDslash<FT, veclen, soalen, compress12> upstream_dslash;
+
+  double const mass_factor_alpha;
+  double const mass_factor_beta;
+
+  /**
+    Reference to the clover term.
+
+    This class has to provide a `dslash` and `achimbdpsi` member function with the prescribed
+    argument list which does not contain the clover term. The user of these classes should not have
+    to differentiate between non-clover and clover variants. In order to provide the function
+    signature, the clover term is a member. This means that the user has to construct a new operator
+    if the pointers to the clover field need to be changed. Separate pointers are kept for the fields
+    on the even and odd checkerboards, hence the array dimension.
+    */
+  CloverBlock *clover[2];
+
+  /// See \ref clover.
+  CloverBlock *inv_clover[2];
+};
+
+template <typename FT, int veclen, int soalen, bool compress12>
+class WilsonClovTMDslash : public Dslash<FT, veclen, soalen, compress12> {
+ public:
+  typedef typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock Spinor;
+  typedef typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::SU3MatrixBlock SU3MatrixBlock;
+  typedef
+      typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FullCloverBlock FullCloverBlock;
+  typedef
+      typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::CloverBlock CloverBlock;
+
+  WilsonClovTMDslash(::QPhiX::Geometry<FT, veclen, soalen, compress12> *geom_,
+                     double const t_boundary_, double const aniso_coeff_S_,
+                     double const aniso_coeff_T_, double const mass_, double const twisted_mass_,
+                     CloverBlock *const (&clover_)[2],
+                     FullCloverBlock *const (&inv_clover_)[2][2], bool use_tbc_[4] = nullptr,
+                     double tbc_phases_[4][2] = nullptr)
+      : Dslash<FT, veclen, soalen, compress12>(geom_, t_boundary_, aniso_coeff_S_, aniso_coeff_T_,
+                                               mass_, use_tbc_, tbc_phases_),
+        upstream_dslash(geom_, t_boundary_, aniso_coeff_S_, aniso_coeff_T_, use_tbc_, tbc_phases_),
+        mass_factor_alpha(4.0 + mass_),
+        mass_factor_beta(0.25),
+        derived_mu(twisted_mass_ / mass_factor_alpha),
+        derived_mu_inv(mass_factor_alpha /
+                       (mass_factor_alpha * mass_factor_alpha + twisted_mass_ * twisted_mass_)) {
+    for (int cb : {0, 1}) {
+      clover[cb] = clover_[cb];
+      for (int fl : {0, 1}) {
+        inv_clover[cb][fl] = inv_clover_[cb][fl];
+      }
+    }
+  }
+
+  void A_chi(Spinor *const out, Spinor const *const in, int const isign, int const cb) override {
+    clover_product(out, in, clover[cb], upstream_dslash.getGeometry());
+    // TODO: add twisted mass here
+  }
+
+  void A_inv_chi(Spinor *const out, Spinor const *const in, int const isign,
+                 int const cb) override {
+    if (isign == -1) {
+      clover_product(out, in, inv_clover[cb][1], upstream_dslash.getGeometry());
+    } else {
+      clover_product(out, in, inv_clover[cb][0], upstream_dslash.getGeometry());
+    }
+  }
+
+  void dslash(Spinor *const res, const Spinor *const psi, const SU3MatrixBlock *const u,
+              int const isign, int const cb) override {
+    upstream_dslash.dslash(res, psi, u, (const FullCloverBlock **)inv_clover[cb], isign, cb);
+  }
+
+  void achimbdpsi(Spinor *const res, const Spinor *const psi, const Spinor *const chi,
+                  const SU3MatrixBlock *const u, double const alpha, double const beta,
+                  int const isign, int const cb) override {
+    upstream_dslash.dslashAChiMinusBDPsi(res, psi, chi, u, clover[cb],
+                                         mass_factor_beta, isign, cb);
+  }
+
+ private:
+  ::QPhiX::TMClovDslash<FT, veclen, soalen, compress12> upstream_dslash;
+
+  double const mass_factor_alpha;
+  double const mass_factor_beta;
+  double const derived_mu;
+  double const derived_mu_inv;
+
+  CloverBlock *clover[2];
+  /* For twisted clover, there are two fields on each checkerboard which differ in the sign
+   * of the twisted quark mass. In effect then, the inner index can be thought of as being
+   * in flavour space while the outer index is the checkerboard index. 
+   */
+  FullCloverBlock *inv_clover[2][2];
+};
+
+template <typename FT, int veclen, int soalen, bool compress12>
+void WilsonTMDslash<FT, veclen, soalen, compress12>::helper_A_chi(Spinor *const out,
+                                                                  Spinor const *const in,
+                                                                  double const factor_a,
+                                                                  double const factor_b) {
+  auto const nVecs = upstream_dslash.getGeometry().nVecs();
+  auto const Pxy = upstream_dslash.getGeometry().getPxy();
+  auto const Pxyz = upstream_dslash.getGeometry().getPxyz();
+
+  for (uint64_t t = 0; t < T; t++)
+    for (uint64_t x = 0; x < LX / 2; x++)
+      for (uint64_t y = 0; y < LY; y++)
+        for (uint64_t z = 0; z < LZ; z++) {
+          uint64_t const SIMD_vector = x / soalen;
+          uint64_t const x_internal = x % soalen;
+          uint64_t const qphix_idx = t * Pxyz + z * Pxy + y * nVecs + SIMD_vector;
+
+          for (int color = 0; color < 3; ++color) {
+            for (int spin_block = 0; spin_block < 2; ++spin_block) {
+              // Implement the $\gamma_5$ structure.
+              auto const signed_factor_a = factor_a * (spin_block == 0 ? 1.0 : -1.0);
+
+              for (int half_spin = 0; half_spin < 2; ++half_spin) {
+                auto const four_spin = 2 * spin_block + half_spin;
+                for (int v = 0; v < soalen; ++v) {
+                  auto &out_bcs = out[qphix_idx][color][four_spin];
+                  auto const &in_bcs = in[qphix_idx][color][four_spin];
+
+                  out_bcs[re][v] = factor_b * (in_bcs[re][v] + signed_factor_a * in_bcs[im][v]);
+                  out_bcs[im][v] = factor_b * (in_bcs[im][v] - signed_factor_a * in_bcs[re][v]);
+                }
+              }
+            }
+          }
+
+        }  // volume
+};
+
+template <typename FT, int veclen, int soalen, bool compress12>
+void Dslash<FT, veclen, soalen, compress12>::prepare_source(Spinor *const tilde_b_odd,
+                                                            Spinor const *const b_even,
+                                                            Spinor const *const b_odd,
+                                                            SU3MatrixBlock const *const u) {
+  auto Mee_be = QPhiX::makeFourSpinorHandle(*geom);
+  WilsonDslash<FT, veclen, soalen, compress12> plain_dslash(geom, t_boundary, aniso_coeff_S,
+                                                            aniso_coeff_T, mass);
+
+  A_inv_chi(Mee_be.get(), b_even, 1, cb_even);
+  plain_dslash.dslash(tilde_b_odd, Mee_be.get(), u, 1, cb_odd);
+
+  // tilde_b_odd = 0.5 * D_oe M_ee^{-1} b_e + b_odd, assuming aypx(a, x, y) sets y = a * y + x.
+  // FIXME Perhaps use a variable number of BLAS threads here (last parameter).
+  QPhiX::aypx(0.5, b_odd, tilde_b_odd, *geom, 1);
+}
+
+template <typename FT, int veclen, int soalen, bool compress12>
+void Dslash<FT, veclen, soalen, compress12>::reconstruct_solution(Spinor *const x_even,
+                                                                  Spinor const *const b_even,
+                                                                  Spinor const *const x_odd,
+                                                                  SU3MatrixBlock const *const u) {
+  auto tmp = QPhiX::makeFourSpinorHandle(*geom);
+  WilsonDslash<FT, veclen, soalen, compress12> plain_dslash(geom, t_boundary, aniso_coeff_S,
+                                                            aniso_coeff_T, mass);
+
+  plain_dslash.dslash(tmp.get(), x_odd, u, 1, cb_even);
+  QPhiX::aypx(0.5, b_even, tmp.get(), *geom, 1);
+  A_inv_chi(x_even, tmp.get(), 1, cb_even);
+}
+}
diff --git a/src/lib/qphix/qphix_interface.cpp b/src/lib/qphix/qphix_interface.cpp
new file mode 100644
index 000000000..2c61427dd
--- /dev/null
+++ b/src/lib/qphix/qphix_interface.cpp
@@ -0,0 +1,2192 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2015 Mario Schroeck
+ *               2016 Peter Labus
+ *               2017 Peter Labus, Martin Ueding, Bartosz Kostrzewa
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ***********************************************************************/
+
+#include "qphix_interface.h"
+#include "qphix_interface.hpp"
+#include "qphix_interface_utils.hpp"
+#include "qphix_types.h"
+#include "qphix_veclen.h"
+
+#ifdef TM_USE_MPI
+#include <mpi.h>
+#endif
+
+extern "C" {
+#ifdef HAVE_CONFIG_H
+#include "tmlqcd_config.h"
+#endif
+#include "boundary.h"
+#include "geometry_eo.h"
+#include "gettime.h"
+#include "global.h"
+#include "linalg/convert_eo_to_lexic.h"
+#include "linalg/diff.h"
+#include "linalg/square_norm.h"
+#include "misc_types.h"
+#include "operator/Hopping_Matrix.h"
+#include "operator/clover_leaf.h"
+#include "operator/clovertm_operators.h"
+#include "operator_types.h"
+#include "struct_accessors.h"
+
+// for the normalisation of the heavy doublet when running
+// RHMC
+#include "phmc.h"
+
+#include "solver/matrix_mult_typedef.h"
+#include "solver/solver.h"
+#include "solver/solver_field.h"
+#include "solver/solver_params.h"
+#include "solver/solver_types.h"
+#include "start.h"
+#include "xchange/xchange_gauge.h"
+}
+#ifdef TM_USE_OMP
+#include <omp.h>
+#endif
+#include <qphix/blas_new_c.h>
+#include <qphix/clover.h>
+#include <qphix/inv_dummy_hermtest.h>
+#include <qphix/inv_richardson_multiprec.h>
+#include <qphix/invbicgstab.h>
+#include <qphix/invcg.h>
+#include <qphix/minvcg.h>
+#include <qphix/ndtm_reuse_operator.h>
+#include <qphix/ndtm_reuse_operator_clover.h>
+#include <qphix/print_utils.h>
+#include <qphix/qphix_config.h>
+#include <qphix/twisted_mass.h>
+#include <qphix/twisted_mass_clover.h>
+#include <qphix/wilson.h>
+#include <cfloat>
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
+#include <vector>
+
+using namespace tmlqcd;
+
+tm_QPhiXParams_t qphix_input;
+// QPhiX tuning parameters; _initQphix copies them from its params argument
+int By;
+int Bz;
+int NCores;
+int Sy;
+int Sz;
+int PadXY;
+int PadXYZ;
+int MinCt;
+int N_simt;  // set to Sy * Sz in _initQphix
+bool compress12;  // true when 12-parameter (two-row) gauge compression is selected
+QphixPrec_t qphix_precision;
+QphixPrec_t qphix_inner_precision;
+// lattice extents and process-grid description, filled in _initQphix
+int subLattSize[4];  // local extents {LX, LY, LZ, T}
+int lattSize[4];     // global extents: local extent times g_nproc_* per direction
+int qmp_geom[4];     // {g_nproc_x, g_nproc_y, g_nproc_z, g_nproc_t}, set under QPHIX_QMP_COMMS
+int qmp_tm_map[4];   // {2, 1, 0, 3}, passed to QMP_declare_logical_topology_map
+
+// angles for boundary phases, values come from read_input
+extern double X0, X1, X2, X3;
+
+bool use_tbc[4];          // [dim]: |angle| > DBL_EPSILON, dim ordered as in lattSize
+double tbc_phases[4][2];  // [dim][0/1]: the two doubles of -phase_N / g_kappa
+// we always use twisted boundary conditions, which means that we are always
+// periodic in time and any possible anti-periodicity is implemented via
+// the phase
+double constexpr t_boundary = 1.0;
+
+template <typename T>
+struct rsdTarget {
+  static const double value;  // defined per type by the explicit specialisations below
+};
+// NOTE(review): by its name presumably a per-precision residual target - confirm at the use site
+template <>
+const double rsdTarget<QPhiX::half>::value = 1.0e-3;
+
+template <>
+const double rsdTarget<float>::value = 1.0e-8;
+
+void _initQphix(int argc, char **argv, tm_QPhiXParams_t params, int c12, QphixPrec_t precision_,
+                QphixPrec_t inner_precision_) {
+  static bool qmp_topo_initialised = false;  // guards the one-time QMP topology setup below
+  // Fills this file's lattice, boundary-phase and tuning globals; argc and argv are not used
+  // Global Lattice Size
+  lattSize[0] = LX * g_nproc_x;
+  lattSize[1] = LY * g_nproc_y;
+  lattSize[2] = LZ * g_nproc_z;
+  lattSize[3] = T * g_nproc_t;
+
+  // Local Lattice Size
+  subLattSize[0] = LX;
+  subLattSize[1] = LY;
+  subLattSize[2] = LZ;
+  subLattSize[3] = T;
+
+  // extract twisted boundary conditions: dim 0..2 use X1..X3 / phase_1..3, dim 3 uses X0 / phase_0
+  for (int dim = 0; dim < 4; dim++) {
+    bool dim_tbc = false;
+    double dim_phase[2] = {1.0, 0.0};  // overwritten below for every dim in 0..3
+    if (dim == 0) {
+      dim_tbc = (fabs(X1) > DBL_EPSILON);
+      dim_phase[0] = -((double *)(&phase_1))[0] / g_kappa;
+      dim_phase[1] = -((double *)(&phase_1))[1] / g_kappa;
+    } else if (dim == 1) {
+      dim_tbc = (fabs(X2) > DBL_EPSILON);
+      dim_phase[0] = -((double *)(&phase_2))[0] / g_kappa;
+      dim_phase[1] = -((double *)(&phase_2))[1] / g_kappa;
+    } else if (dim == 2) {
+      dim_tbc = (fabs(X3) > DBL_EPSILON);
+      dim_phase[0] = -((double *)(&phase_3))[0] / g_kappa;
+      dim_phase[1] = -((double *)(&phase_3))[1] / g_kappa;
+    } else if (dim == 3) {
+      dim_tbc = (fabs(X0) > DBL_EPSILON);
+      dim_phase[0] = -((double *)(&phase_0))[0] / g_kappa;
+      dim_phase[1] = -((double *)(&phase_0))[1] / g_kappa;
+    }
+    use_tbc[dim] = dim_tbc;
+    tbc_phases[dim][0] = dim_phase[0];
+    tbc_phases[dim][1] = dim_phase[1];
+  }
+  // copy the tuning parameters from params into the file-level globals
+  By = params.By;
+  Bz = params.Bz;
+  NCores = params.NCores;
+  Sy = params.Sy;
+  Sz = params.Sz;
+  PadXY = params.PadXY;
+  PadXYZ = params.PadXYZ;
+  MinCt = params.MinCt;
+  N_simt = Sy * Sz;
+  if (c12 == 8) {
+    QPhiX::masterPrintf(
+        "# INFO QphiX: 8-parameter gauge compression not supported, using two row compression "
+        "instead!\n");
+    c12 = 12;
+  }
+  compress12 = c12 == 12 ? true : false;  // false for every value other than 12
+  qphix_precision = precision_;
+  qphix_inner_precision = inner_precision_;
+
+#ifdef QPHIX_QMP_COMMS
+  // Declare the logical topology
+  if (!qmp_topo_initialised) {
+    // the QMP topology is the one implied by the number of processes in each
+    // dimension as required by QPHIX ( x fastest to t slowest running )
+    qmp_geom[0] = g_nproc_x;
+    qmp_geom[1] = g_nproc_y;
+    qmp_geom[2] = g_nproc_z;
+    qmp_geom[3] = g_nproc_t;
+
+    // in order for the topologies to agree between tmLQCD and QPhiX, the dimensions need to be
+    // permuted
+    // since Z is fastest in tmLQCD and X is second-slowest
+    qmp_tm_map[0] = 2;
+    qmp_tm_map[1] = 1;
+    qmp_tm_map[2] = 0;
+    qmp_tm_map[3] = 3;
+    if (QMP_declare_logical_topology_map(qmp_geom, 4, qmp_tm_map, 4) != QMP_SUCCESS) {
+      QMP_error("Failed to declare QMP Logical Topology\n");
+      abort();
+    }
+    // longish test to check if the logical coordinates are correctly mapped
+    if (g_debug_level >= 5) {
+      for (int proc = 0; proc < g_nproc; proc++) {
+        if (proc == g_proc_id) {
+          const int coordinates[4] = {g_proc_coords[1], g_proc_coords[2], g_proc_coords[3],
+                                      g_proc_coords[0]};
+          int id = QMP_get_node_number_from(coordinates);
+          int *qmp_coords = QMP_get_logical_coordinates_from(id);
+          fflush(stdout);
+          printf("QMP id: %3d x:%3d y:%3d z:%3d t:%3d\n", id, qmp_coords[0], qmp_coords[1],
+                 qmp_coords[2], qmp_coords[3]);
+          printf("MPI id: %3d x:%3d y:%3d z:%3d t:%3d\n\n", g_proc_id, g_proc_coords[1],
+                 g_proc_coords[2], g_proc_coords[3], g_proc_coords[0]);
+          free(qmp_coords);
+          fflush(stdout);
+          MPI_Barrier(MPI_COMM_WORLD);
+        } else {
+          MPI_Barrier(MPI_COMM_WORLD);
+        }
+      }
+    }
+    qmp_topo_initialised = true;
+  }
+#endif
+  // NOTE(review): thread_bind and the *_user names below are not declared in this function
+#ifdef QPHIX_QPX_SOURCE
+  if (thread_bind) {
+    QPhiX::setThreadAffinity(NCores_user, Sy_user * Sz_user);
+  }
+  QPhiX::reportAffinity();
+#endif
+}
+
+void _initQphix(int argc, char **argv, tm_QPhiXParams_t params, int c12, QphixPrec_t precision_) {
+  _initQphix(argc, argv, params, c12, precision_, precision_);  // inner precision = precision_
+}
+
+// Finalize the QPhiX library (currently nothing to do: the body is empty)
+void _endQphix() {}
+
+template <typename FT, int VECLEN, int SOALEN, bool compress12>
+void reorder_clover_to_QPhiX(
+    QPhiX::Geometry<FT, VECLEN, SOALEN, compress12> &geom,
+    typename QPhiX::Geometry<FT, VECLEN, SOALEN, compress12>::CloverBlock *qphix_clover, int cb,
+    bool inverse, bool fl_offdiag = false) {
+  const double startTime = gettime();
+
+  /* the spin-colour clover term in sw_term and the corresponding inverse
+   * in sw_inv are stored in the tmLQCD gamma basis.
+   * When we translate spinors to QPhiX, we apply a transformation V to the tmLQCD
+   * spinor and then apply the same transformation to the output spinor
+   * ( we have V^dagger = V and V*V = 1 )
+   * Thus, in order to translate the clover field, we need to copy
+   *   (1+T)' = V*(1+T)*V, where T is the spin-colour clover-term
+   * This way, the clover term will be in the correct gamma basis.
+   *
+   * The tmLQCD clover term is stored in half-spinor blocks of colour matrices
+   * for which we need to work out what (1+T)'=V*(1+T)*V implies.
+   * Below, each sAB represents one 3x3 colour matrix
+   *
+   *                +s33 -s32    0    0
+   *  T' = V*T*V =  -s23 +s22    0    0
+   *                   0    0 +s11 -s10
+   *                   0    0 -s01 +s00
+   *
+   * Such that the half-spinor blocks are inverted and within these, the ordering is
+   * reversed. Note that the off-diagonal 3x3 colour blocks are hermitian conjugate to
+   * each other and this is preserved by the transformation.
+   *
+   * The QPhiX (Wilson) clover term is stored as 12 reals on the diagonal
+   * in two 6-element vectors, one for each half-spinor spin pair
+   * and two sets of off-diagonal complex components.
+   *
+   * In addition, colour matrices are transposed in QPhiX.
+   *
+   * The tmLQCD clover term is stored as:
+   *
+   *      s00 s01
+   *          s11
+   * T =          s22 s23
+   *                  s33
+   *
+   * with indexing
+   *
+   *     sw[0][0] sw[1][0]
+   *              sw[2][0]
+   *                       sw[0][1] sw[1][1]
+   *                                sw[2][1]
+   *
+   * The inverse has four su3 blocks instead and is indexed
+   *     sw_inv[0][0] sw_inv[1][0]
+   *     sw_inv[3][0] sw_inv[2][0]
+   *                               sw_inv[0][1] sw_inv[1][1]
+   *                               sw_inv[3][1] sw_inv[2][1]
+   *
+   * where blocks sw_inv[3][0] and sw_inv[3][1] are relevant only when mu > 0
+   *
+   * There is a special case for the non-degenerate twisted clover operator. The
+   * flavour-off-diagonal components of the inverse clover term do not have an imaginary part on the
+   * spin-colour diagonal. They can thus be stored as CloverBlock, which is done in the QPhiX
+   * implementation of the ND tmclover operator.
+   *
+   * As a hack, this inverse is prepared by sw_invert_epsbar and placed into the last
+   * VOLUME/2 sites of sw_inv. Reading from there is triggered by the boolean
+   * fl_offdiag.
+   */
+
+  // rescale to get clover term (or its inverse) in the physical normalisation
+  // rather than the kappa normalisation
+  const double scale = inverse ? 2.0 * g_kappa : 1.0 / (2.0 * g_kappa);
+  su3 ***tm_clover = inverse ? sw_inv : sw;
+
+  // Number of elements in spin, color & complex
+  const int Ns = 4;  // not referenced in this function
+  const int Nc = 3;
+  const int Nz = 2;
+
+  // Geometric parameters for QPhiX data layout
+  const auto ngy = geom.nGY();
+  const auto nVecs = geom.nVecs();
+  const auto Pxy = geom.getPxy();
+  const auto Pxyz = geom.getPxyz();
+
+  // packer for Wilson clover (real diagonal + complex upper-triangular)
+  /* for the index in the off_diagN arrays, we map to an index in the su3 struct
+   * keeping in mind complex conjugation
+   * The off-diagonal in QPhiX is stored as follows:
+   *
+   * 0 1 3 6 10
+   *   2 4 7 11
+   *     5 8 12
+   *       9 13
+   *         14
+   *
+   * which we are going to map to su3 in blocks
+   *
+   *     0* 1*
+   *        2*
+   *
+   * 3   4  5
+   * 6   7  8
+   * 10 11 12
+   *
+   *   9* 13*
+   *      14*
+   *
+   * where the asterisk indicates complex conjugation. As a linear array then,
+   * these mappings are:
+   * (offsets are counted in doubles from the element c00 of the su3 struct)
+   */
+  const int od_su3_offsets[15] = {Nz,
+                                  2 * Nz,            //     0 1
+                                  Nc * Nz + 2 * Nz,  //       2
+
+                                  0,
+                                  Nz,
+                                  2 * Nz,  // 3  4  5
+                                  Nc * Nz,
+                                  Nc * Nz + Nz,
+                                  Nc * Nz + 2 * Nz,  // 6  7  8
+
+                                  Nz,  //     9
+
+                                  2 * Nc * Nz,
+                                  2 * Nc * Nz + Nz,
+                                  2 * Nc * Nz + 2 * Nz,  // 10 11 12
+
+                                  2 * Nz,
+                                  Nc * Nz + 2 * Nz};  // 13 14
+
+#pragma omp parallel for collapse(4)
+  for (int64_t t = 0; t < T; t++) {
+    for (int64_t z = 0; z < LZ; z++) {
+      for (int64_t y = 0; y < LY; y++) {
+        for (int64_t v = 0; v < nVecs; v++) {
+          int64_t block = (t * Pxyz + z * Pxy) / ngy + (y / ngy) * nVecs + v;
+
+          for (int64_t x_soa = 0; x_soa < SOALEN; x_soa++) {
+            int64_t xx = (y % ngy) * SOALEN + x_soa;
+            int64_t q_cb_x_coord = x_soa + v * SOALEN;
+            int64_t tm_x_coord = q_cb_x_coord * 2 + (((t + y + z) & 1) ^ cb);
+
+            // the inverse of the clover term is in even-odd ordering
+            // while the clover term itself is lexicographically ordered
+            // for the special case of the nd tmclover operator, the inverse of the flavour
+            // off-diagonal components is stored in the last VOLUME/2 elements of sw_inv
+            int64_t tm_idx =
+                (inverse ? g_lexic2eosub[g_ipt[t][tm_x_coord][y][z]] : g_ipt[t][tm_x_coord][y][z]) +
+                ((inverse && fl_offdiag) ? VOLUME / 2 : 0);
+            // b_idx: second index of tm_clover, i.e. which su3 block of the site is read
+            int b_idx;
+
+            // we begin with the diagonal elements in CloverBlock
+            for (int d = 0; d < 6; d++) {
+              // choose the block in sw which corresponds to the block in T'
+              b_idx = d < 3 ? 2 : 0;
+              // get the right colour components: c00, c11, c22 are (Nc * Nz + Nz) doubles apart
+              qphix_clover[block].diag1[d][xx] = QPhiX::rep<FT, double>(
+                  *(reinterpret_cast<double const *const>(&tm_clover[tm_idx][b_idx][1].c00) +
+                    (Nc * Nz + Nz) * (d % 3)) *
+                  scale);
+
+              qphix_clover[block].diag2[d][xx] = QPhiX::rep<FT, double>(
+                  *(reinterpret_cast<double const *const>(&tm_clover[tm_idx][b_idx][0].c00) +
+                    (Nc * Nz + Nz) * (d % 3)) *
+                  scale);
+            }
+            // off-diagonal entries 0-2: read from su3 block 2, imaginary part negated
+            b_idx = 2;  // s33 and s11
+            for (int od : {0, 1, 2}) {
+              for (int reim : {0, 1}) {
+                qphix_clover[block].off_diag1[od][reim][xx] = QPhiX::rep<FT, double>(
+                    (reim == 1 ? -1.0 : 1.0) *
+                    *(reinterpret_cast<double const *const>(&tm_clover[tm_idx][b_idx][1].c00) +
+                      od_su3_offsets[od] + reim) *
+                    scale);
+
+                qphix_clover[block].off_diag2[od][reim][xx] = QPhiX::rep<FT, double>(
+                    (reim == 1 ? -1.0 : 1.0) *
+                    *(reinterpret_cast<double const *const>(&tm_clover[tm_idx][b_idx][0].c00) +
+                      od_su3_offsets[od] + reim) *
+                    scale);
+              }
+            }
+            // off-diagonal entries 3-8 and 10-12: read from su3 block 1, multiplied by -scale
+            b_idx = 1;  // s32 and s10
+            for (int od : {3, 4, 5, 6, 7, 8, 10, 11, 12}) {
+              for (int reim : {0, 1}) {
+                qphix_clover[block].off_diag1[od][reim][xx] = QPhiX::rep<FT, double>(
+                    *(reinterpret_cast<double const *const>(&tm_clover[tm_idx][b_idx][1].c00) +
+                      od_su3_offsets[od] + reim) *
+                    (-scale));
+
+                qphix_clover[block].off_diag2[od][reim][xx] = QPhiX::rep<FT, double>(
+                    *(reinterpret_cast<double const *const>(&tm_clover[tm_idx][b_idx][0].c00) +
+                      od_su3_offsets[od] + reim) *
+                    (-scale));
+              }
+            }
+            // off-diagonal entries 9, 13, 14: read from su3 block 0, imaginary part negated
+            b_idx = 0;  // s22 and s00
+            for (int od : {9, 13, 14}) {
+              for (int reim : {0, 1}) {
+                qphix_clover[block].off_diag1[od][reim][xx] = QPhiX::rep<FT, double>(
+                    (reim == 1 ? -1.0 : 1.0) *
+                    *(reinterpret_cast<double const *const>(&tm_clover[tm_idx][b_idx][1].c00) +
+                      od_su3_offsets[od] + reim) *
+                    scale);
+
+                qphix_clover[block].off_diag2[od][reim][xx] = QPhiX::rep<FT, double>(
+                    (reim == 1 ? -1.0 : 1.0) *
+                    *(reinterpret_cast<double const *const>(&tm_clover[tm_idx][b_idx][0].c00) +
+                      od_su3_offsets[od] + reim) *
+                    scale);
+              }
+            }
+
+          }  // x_soa
+        }  // for(v)
+      }  // for(y)
+    }  // for(z)
+  }  // for(t)
+
+  const double diffTime = gettime() - startTime;
+  if (g_debug_level > 1) {
+    QPhiX::masterPrintf(
+        "# QPHIX-interface: time spent in reorder_clover_to_QPhiX (CloverBlock): %f secs\n",
+        diffTime);
+  }
+}
+
+template <typename FT, int VECLEN, int SOALEN, bool compress12>
+void reorder_clover_to_QPhiX(
+    QPhiX::Geometry<FT, VECLEN, SOALEN, compress12> &geom,
+    typename QPhiX::Geometry<FT, VECLEN, SOALEN, compress12>::FullCloverBlock *qphix_clover[2],
+    int cb, bool inverse) {
+  const double startTime = gettime();
+
+  /* the spin-colour clover term in sw_term and the corresponding inverse
+   * in sw_inv are stored in the tmLQCD gamma basis.
+   * When we translate spinors to QPhiX, we apply a transformation V to the tmLQCD
+   * spinor and then apply the same transformation to the output spinor
+   * ( we have V^dagger = V and V*V = 1 )
+   * Thus, in order to translate the clover field, we need to copy
+   *   (1+T)' = V*(1+T)*V, where T is the spin-colour clover-term
+   * This way, the clover term will be in the correct gamma basis.
+   *
+   * The tmLQCD clover term is stored in half-spinor blocks of colour matrices
+   * for which we need to work out what (1+T)'=V*(1+T)*V implies.
+   * Below, each sAB represents one 3x3 colour matrix
+   *
+   *                +s33 -s32    0    0
+   *  T' = V*T*V =  -s23 +s22    0    0
+   *                   0    0 +s11 -s10
+   *                   0    0 -s01 +s00
+   *
+   * Such that the half-spinor blocks are inverted and within these, the ordering is
+   * reversed. Note that the off-diagonal 3x3 colour blocks are hermitian conjugate to
+   * each other and this is preserved by the transformation.
+   *
+   * The QPhiX (tmclover) clover term and its inverse are stored as a pair of full
+   * 6x6 complex matrices which are multiplied with the spinor in exactly the same way
+   * as in tmLQCD.
+   *
+   * The tmLQCD clover term is stored as:
+   *
+   *      s00 s01
+   *          s11
+   * T =          s22 s23
+   *                  s33
+   *
+   * with indexing
+   *
+   *     sw[0][0] sw[1][0]
+   *              sw[2][0]
+   *                       sw[0][1] sw[1][1]
+   *                                sw[2][1]
+   *
+   * The inverse has four su3 blocks instead and is indexed
+   *     sw_inv[0][0] sw_inv[1][0]
+   *     sw_inv[3][0] sw_inv[2][0]
+   *                               sw_inv[0][1] sw_inv[1][1]
+   *                               sw_inv[3][1] sw_inv[2][1]
+   *
+   * where blocks sw_inv[3][0] and sw_inv[3][1] are relevant only when mu > 0
+   */
+
+  // rescale to get clover term (or its inverse) in the physical normalisation
+  // rather than the kappa normalisation
+  const double scale = inverse ? 2.0 * g_kappa : 1.0 / (2.0 * g_kappa);
+  su3 ***tm_clover = inverse ? sw_inv : sw;
+
+  // Number of elements in spin, color & complex
+  const int Ns = 4;  // not referenced in this function
+  const int Nc = 3;
+  const int Nz = 2;
+  // g_mu divided by 2 * g_kappa; enters the imaginary diagonal of the non-inverse term below
+  const double amu = g_mu / (2.0 * g_kappa);
+
+  // Geometric parameters for QPhiX data layout
+  const auto ngy = geom.nGY();
+  const auto nVecs = geom.nVecs();
+  const auto Pxy = geom.getPxy();
+  const auto Pxyz = geom.getPxyz();
+
+#pragma omp parallel for collapse(4)
+  for (int64_t t = 0; t < T; t++) {
+    for (int64_t z = 0; z < LZ; z++) {
+      for (int64_t y = 0; y < LY; y++) {
+        for (int64_t v = 0; v < nVecs; v++) {
+          int64_t block = (t * Pxyz + z * Pxy) / ngy + (y / ngy) * nVecs + v;
+
+          for (int64_t x_soa = 0; x_soa < SOALEN; x_soa++) {
+            int64_t xx = (y % ngy) * SOALEN + x_soa;
+            int64_t q_cb_x_coord = x_soa + v * SOALEN;
+            int64_t tm_x_coord = q_cb_x_coord * 2 + (((t + y + z) & 1) ^ cb);
+
+            // the inverse of the clover term is in even-odd ordering
+            // while the clover term itself is lexicographically ordered
+            int64_t tm_idx =
+                inverse ? g_lexic2eosub[g_ipt[t][tm_x_coord][y][z]] : g_ipt[t][tm_x_coord][y][z];
+
+            for (int fl : {0, 1}) {
+              if (inverse && fl == 1) {
+                // the inverse clover term for the second flavour is stored at an offset
+                tm_idx += VOLUME / 2;
+              }
+              for (int q_hs : {0, 1}) {
+                auto &hs_block =
+                    ((q_hs == 0) ? qphix_clover[fl][block].block1 : qphix_clover[fl][block].block2);
+                for (int q_sc1 = 0; q_sc1 < 6; q_sc1++) {
+                  for (int q_sc2 = 0; q_sc2 < 6; q_sc2++) {
+                    const int q_s1 = q_sc1 / 3;
+                    const int q_s2 = q_sc2 / 3;
+                    const int q_c1 = q_sc1 % 3;
+                    const int q_c2 = q_sc2 % 3;
+
+                    // invert in spin as required by V*T*V
+                    const int t_hs = 1 - q_hs;
+                    // the indices inside the half-spinor are also inverted
+                    // (which transposes them, of course)
+                    const int t_s1 = 1 - q_s1;
+                    const int t_s2 = 1 - q_s2;
+                    // carry out the mapping from T' to T, keeping in mind that for the inverse
+                    // there are four blocks also on the tmLQCD side, otherwise there are just three
+                    const int t_b_idx = t_s1 + t_s2 + ((inverse && t_s1 == 1 && t_s2 == 0) ? 2 : 0);
+                    for (int reim : {0, 1}) {
+                      hs_block[q_sc1][q_sc2][reim][xx] = QPhiX::rep<FT, double>(
+                          scale *
+                              // off-diagonal (odd-numbered) blocks change sign
+                              (t_b_idx & 1 ? (-1.0) : 1.0) *
+                              // if not doing the inverse and in the bottom-left block, need to
+                              // complex conjugate
+                              ((!inverse && (t_s1 == 1 && t_s2 == 0) && reim == 1) ? -1.0 : 1.0) *
+                              *(reinterpret_cast<double const *const>(
+                                    &(tm_clover[tm_idx][t_b_idx][t_hs].c00)) +
+                                // if not doing the inverse and in the bottom-left block, transpose
+                                // in colour
+                                // because we're actually reading out of the top-right block
+                                Nz * ((!inverse && (t_s1 == 1 && t_s2 == 0)) ? Nc * q_c2 + q_c1
+                                                                             : Nc * q_c1 + q_c2) +
+                                reim) +
+                          // in the QPhiX gamma basis, the twisted quark mass enters with the
+                          // opposite sign for consistency; it is added (unscaled) only to the
+                          // imaginary part of the diagonal and only when not doing the inverse
+                          ((!inverse && q_sc1 == q_sc2 && q_hs == 0 && reim == 1)
+                               ? -amu * (1 - 2 * fl)
+                               : 0) +
+                          ((!inverse && q_sc1 == q_sc2 && q_hs == 1 && reim == 1)
+                               ? amu * (1 - 2 * fl)
+                               : 0));
+                    }
+                  }  // q_sc2
+                }  // q_sc1
+              }  // q_hs
+            }  // fl
+
+          }  // x_soa
+        }  // for(v)
+      }  // for(y)
+    }  // for(z)
+  }  // for(t)
+
+  const double diffTime = gettime() - startTime;
+  if (g_debug_level > 1) {
+    QPhiX::masterPrintf(
+        "# QPHIX-interface: time spent in reorder_clover_to_QPhiX (FullCloverBlock): %f secs\n",
+        diffTime);
+  }
+}
+
+template <typename FT, int VECLEN, int SOALEN, bool compress12>
+void reorder_gauge_to_QPhiX(
+    QPhiX::Geometry<FT, VECLEN, SOALEN, compress12> &geom,
+    typename QPhiX::Geometry<FT, VECLEN, SOALEN, compress12>::SU3MatrixBlock *qphix_gauge_cb0,
+    typename QPhiX::Geometry<FT, VECLEN, SOALEN, compress12>::SU3MatrixBlock *qphix_gauge_cb1) {
+  const double startTime = gettime();
+
+  // Copies g_gauge_field into the QPhiX gauge fields of both checkerboards.
+  // Number of elements in color & complex
+  // Here c1 is QPhiX's outer color, and c2 the inner one
+  const int Nc1 = compress12 ? 2 : 3;
+  const int Nc2 = 3;
+  const int Nz = 2;
+
+  // Geometric parameters for QPhiX data layout
+  const auto ngy = geom.nGY();
+  const auto nVecs = geom.nVecs();
+  const auto Pxy = geom.getPxy();
+  const auto Pxyz = geom.getPxyz();
+
+  // This is needed to translate between the different
+  // orderings of the direction index "\mu" in tmLQCD
+  // and QPhiX, respectively
+  // in qphix, the Dirac operator is applied in the order
+  //   -+x -> -+y -> -+z -> -+t
+  // while tmlqcd does
+  //   -+t -> -+x -> -+y -> -+z
+  // same as the lattice ordering
+  // The mapping between the application dimensions is thus:
+  //  tmlqcd_dim(t(0) -> x(1) -> y(2) -> z(3)) = qphix_dim( t(3) -> x(0) -> y(1) -> z(2) )
+  const int change_dim[4] = {1, 2, 3, 0};
+
+  // xchange_gauge is called before any link is read; the backward links below are looked up
+  // through g_idn (presumably these can lie in the halo which the exchange refreshes - confirm)
+  xchange_gauge(g_gauge_field);
+
+#pragma omp parallel for collapse(4)
+  for (int64_t t = 0; t < T; t++)
+    for (int64_t z = 0; z < LZ; z++)
+      for (int64_t y = 0; y < LY; y++)
+        for (int64_t v = 0; v < nVecs; v++) {
+          int64_t block = (t * Pxyz + z * Pxy) / ngy + (y / ngy) * nVecs + v;
+
+          for (int dim = 0; dim < 4; dim++)     // dimension == QPhiX \mu
+            for (int c1 = 0; c1 < Nc1; c1++)    // QPhiX convention color 1 (runs up to 2 or 3)
+              for (int c2 = 0; c2 < Nc2; c2++)  // QPhiX convention color 2 (always runs up to 3)
+                for (int x_soa = 0; x_soa < SOALEN; x_soa++) {
+                  int64_t xx = (y % ngy) * SOALEN + x_soa;
+                  int64_t q_cb_x_coord = x_soa + v * SOALEN;
+                  int64_t tm_x_coord_cb0 = q_cb_x_coord * 2 + (((t + y + z) & 1) ^ 0);
+                  int64_t tm_x_coord_cb1 = q_cb_x_coord * 2 + (((t + y + z) & 1) ^ 1);
+
+                  int64_t tm_idx_cb0;
+                  int64_t tm_idx_cb1;
+
+                  // backward / forward
+                  for (int dir = 0; dir < 2; dir++) {
+                    if (dir == 0) {
+                      tm_idx_cb0 = g_idn[g_ipt[t][tm_x_coord_cb0][y][z]][change_dim[dim]];
+                      tm_idx_cb1 = g_idn[g_ipt[t][tm_x_coord_cb1][y][z]][change_dim[dim]];
+                    } else {
+                      tm_idx_cb0 = g_ipt[t][tm_x_coord_cb0][y][z];
+                      tm_idx_cb1 = g_ipt[t][tm_x_coord_cb1][y][z];
+                    }
+                    for (int reim = 0; reim < Nz; reim++) {
+                      // Note:
+                      // -----
+                      // 1. \mu in QPhiX runs from 0..7 for all eight neighbouring
+                      // links.
+                      //    Here, the ordering of the direction (backward/forward)
+                      //    is the same
+                      //    for tmLQCD and QPhiX, but we have to change the
+                      //    ordering of the dimensions.
+                      int q_mu = 2 * dim + dir;
+
+                      qphix_gauge_cb0[block][q_mu][c1][c2][reim][xx] =
+                          QPhiX::rep<FT, double>(su3_get_elem(
+                              &(g_gauge_field[tm_idx_cb0][change_dim[dim]]), c2, c1, reim));
+                      qphix_gauge_cb1[block][q_mu][c1][c2][reim][xx] =
+                          QPhiX::rep<FT, double>(su3_get_elem(
+                              &(g_gauge_field[tm_idx_cb1][change_dim[dim]]), c2, c1, reim));
+                    }
+                  }
+                }  // for(dim,c1,c2,x_soa)
+        }  // outer loop (t,z,y,v)
+
+  const double diffTime = gettime() - startTime;
+  if (g_debug_level > 1) {
+    QPhiX::masterPrintf("# QPHIX-interface: time spent in reorder_gauge_to_QPhiX: %f secs\n",
+                        diffTime);
+  }
+}
+
+// Copy a tmLQCD even-odd spinor into a QPhiX FourSpinorBlock field on checkerboard cb
+template <typename FT, int VECLEN, int SOALEN, bool compress12>
+void reorder_eo_spinor_to_QPhiX(
+    QPhiX::Geometry<FT, VECLEN, SOALEN, compress12> &geom, spinor const *const tm_eo_spinor,
+    typename QPhiX::Geometry<FT, VECLEN, SOALEN, compress12>::FourSpinorBlock *qphix_spinor,
+    const int cb) {
+  const double startTime = gettime();
+
+  // extents of the spin, colour and complex indices
+  const int Ns = 4;
+  const int Nc = 3;
+  const int Nz = 2;
+
+  // QPhiX layout parameters
+  const auto nVecs = geom.nVecs();
+  const auto Pxy = geom.getPxy();
+  const auto Pxyz = geom.getPxyz();
+  const auto Nxh = geom.Nxh();
+
+  // Basis change between the tmLQCD and QPhiX gamma conventions: QPhiX spin component s
+  // is read from tmLQCD spin component 3 - s, with a sign flip for s = 1, 2
+  const int spin_sign[4] = {1, -1, -1, 1};
+  const int spin_src[4] = {3, 2, 1, 0};
+
+#pragma omp parallel for collapse(4)
+  for (int64_t t = 0; t < T; t++) {
+    for (int64_t z = 0; z < LZ; z++) {
+      for (int64_t y = 0; y < LY; y++) {
+        for (int64_t v = 0; v < nVecs; v++) {
+          // all SOALEN sites of this vector live in one QPhiX block
+          const int64_t q_block = t * Pxyz + z * Pxy + y * nVecs + v;
+          // when the parity of t+y+z equals cb the full x coordinate is 2*x_cb,
+          // otherwise it is 2*x_cb+1
+          const int64_t x_shift = ((t + y + z) & 1) ^ cb;
+
+          for (int x_soa = 0; x_soa < SOALEN; x_soa++) {
+            // full-lattice x coordinate, then the tmLQCD even-odd site index
+            const int64_t x_full = (v * SOALEN + x_soa) * 2 + x_shift;
+            const spinor *const site = &(tm_eo_spinor[g_lexic2eosub[g_ipt[t][x_full][y][z]]]);
+
+            for (int col = 0; col < Nc; col++) {
+              for (int q_spin = 0; q_spin < Ns; q_spin++) {
+                for (int reim = 0; reim < Nz; reim++) {
+                  qphix_spinor[q_block][col][q_spin][reim][x_soa] = QPhiX::rep<FT, double>(
+                      spin_sign[q_spin] * spinor_get_elem(site, spin_src[q_spin], col, reim));
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  const double diffTime = gettime() - startTime;
+  if (g_debug_level > 1) {
+    QPhiX::masterPrintf("# QPHIX-interface: time spent in reorder_eo_spinor_to_QPhiX: %f secs\n",
+                        diffTime);
+  }
+}
+
+template <typename FT, int VECLEN, int SOALEN, bool compress12>
+void reorder_eo_spinor_from_QPhiX(
+    QPhiX::Geometry<FT, VECLEN, SOALEN, compress12> &geom, spinor *tm_eo_spinor,
+    typename QPhiX::Geometry<FT, VECLEN, SOALEN, compress12>::FourSpinorBlock *qphix_spinor,
+    const int cb, double normFac = 1.0) {
+  // Copy the QPhiX field on checkerboard cb into tm_eo_spinor, scaling every entry by normFac
+  const double startTime = gettime();
+
+  // extents of the spin, colour and complex indices
+  const int Ns = 4;
+  const int Nc = 3;
+  const int Nz = 2;
+
+  const auto nVecs = geom.nVecs();
+  const auto Pxy = geom.getPxy();
+  const auto Pxyz = geom.getPxyz();
+  const auto Nxh = geom.Nxh();
+
+  // Basis change between the QPhiX and tmLQCD gamma conventions: QPhiX spin component s
+  // is written to tmLQCD spin component 3 - s, with a sign flip for s = 1, 2
+  const int spin_sign[4] = {1, -1, -1, 1};
+  const int spin_dst[4] = {3, 2, 1, 0};
+
+#pragma omp parallel for collapse(4)
+  for (int64_t t = 0; t < T; t++) {
+    for (int64_t z = 0; z < LZ; z++) {
+      for (int64_t y = 0; y < LY; y++) {
+        for (int64_t v = 0; v < nVecs; v++) {
+          // all SOALEN sites of this vector live in one QPhiX block
+          const int64_t q_block = t * Pxyz + z * Pxy + y * nVecs + v;
+          // when the parity of t+y+z equals cb the full x coordinate is 2*x_cb, else 2*x_cb+1
+          const int64_t x_shift = ((t + y + z) & 1) ^ cb;
+
+          for (int x_soa = 0; x_soa < SOALEN; x_soa++) {
+            // full-lattice x coordinate, then the tmLQCD even-odd site index
+            const int64_t x_full = (v * SOALEN + x_soa) * 2 + x_shift;
+            spinor *const site = &(tm_eo_spinor[g_lexic2eosub[g_ipt[t][x_full][y][z]]]);
+
+            for (int col = 0; col < Nc; col++) {
+              for (int q_spin = 0; q_spin < Ns; q_spin++) {
+                // basis-change sign and the caller's normalisation as one factor
+                const double factor = spin_sign[q_spin] * normFac;
+                spinor_set_elem(
+                    site, spin_dst[q_spin], col,
+                    factor * QPhiX::rep<double, FT>(qphix_spinor[q_block][col][q_spin][0][x_soa]),
+                    factor * QPhiX::rep<double, FT>(qphix_spinor[q_block][col][q_spin][1][x_soa]));
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  const double diffTime = gettime() - startTime;
+  if (g_debug_level > 1) {
+    QPhiX::masterPrintf("# QPHIX-interface: time spent in reorder_eo_spinor_from_QPhiX: %f secs\n",
+                        diffTime);
+  }
+}
+
+// Reorder a full (non-checkerboarded) tmLQCD spinor field into two QPhiX
+// spinors: sites with even t+x+y+z are written to qphix_spinor_cb0, sites
+// with odd t+x+y+z are written to qphix_spinor_cb1
+template <typename FT, int VECLEN, int SOALEN, bool compress12>
+void reorder_spinor_to_QPhiX(QPhiX::Geometry<FT, VECLEN, SOALEN, compress12> &geom,
+                             double const *tm_spinor, FT *qphix_spinor_cb0, FT *qphix_spinor_cb1) {
+  const double startTime = gettime();
+
+  // extents of the spin, colour and complex (re/im) indices
+  const int Ns = 4;
+  const int Nc = 3;
+  const int Nz = 2;
+
+  // strides of the QPhiX block index: nVecs per step in y, Pxy in z, Pxyz in t
+  const auto nVecs = geom.nVecs();
+  const auto Pxy = geom.getPxy();
+  const auto Pxyz = geom.getPxyz();
+
+  // sign and spin permutation relating the gamma bases of tmLQCD and QPhiX
+  const int change_sign[4] = {1, -1, -1, 1};
+  const int change_spin[4] = {3, 2, 1, 0};
+
+  // number of FT elements in one FourSpinorBlock (one SIMD vector of spinors)
+  const uint64_t block_len = SOALEN * Nz * Nc * Ns;
+
+// Visit every lattice site (t,x,y,z) once and work out where it lives
+// in the tmLQCD array and in the QPhiX array
+#pragma omp parallel for collapse(4)
+  for (uint64_t t = 0; t < T; t++)
+    for (uint64_t x = 0; x < LX; x++)
+      for (uint64_t y = 0; y < LY; y++)
+        for (uint64_t z = 0; z < LZ; z++) {
+          // the checkerboarded x coordinate is x/2; split it into the SIMD
+          // vector number and the position inside that vector
+          const uint64_t x_half = x / 2;
+          const uint64_t vec = x_half / SOALEN;
+          const uint64_t lane = x_half % SOALEN;
+
+          // linear indices of the site in the QPhiX and the tmLQCD layout
+          const uint64_t block = t * Pxyz + z * Pxy + y * nVecs + vec;
+          const uint64_t site = g_ipt[t][x][y][z];
+
+          // base pointers: odd sites live in cb1, even sites in cb0
+          const bool odd = ((t + x + y + z) & 1) != 0;
+          FT *const dst = (odd ? qphix_spinor_cb1 : qphix_spinor_cb0) + block_len * block;
+          const double *const src = tm_spinor + Ns * Nc * Nz * site;
+
+          // copy the Ns*Nc*Nz real numbers of this site; spin is the QPhiX
+          // spin index, the matching tmLQCD spin index is change_spin[spin]
+          for (int spin = 0; spin < Ns; spin++) {
+            for (int color = 0; color < Nc; color++) {
+              for (int reim = 0; reim < Nz; reim++) {
+                const uint64_t q_off =
+                    lane + reim * SOALEN + spin * SOALEN * Nz + color * SOALEN * Nz * Ns;
+                const uint64_t t_off = reim + color * Nz + change_spin[spin] * Nz * Nc;
+                // apply the sign in double precision, then convert to FT
+                dst[q_off] = QPhiX::rep<FT, double>(change_sign[spin] * src[t_off]);
+              }
+            }
+          }
+        }  // volume
+
+  const double elapsed = gettime() - startTime;
+  if (g_debug_level > 1) {
+    QPhiX::masterPrintf("# QPHIX-interface: time spent in reorder_spinor_to_QPhiX: %f secs\n",
+                        elapsed);
+  }
+}
+
+// Reorder a cb0 and cb1 QPhiX spinor to a full tmLQCD spinor; every element is
+// converted from FT to double and then multiplied by normFac
+template <typename FT, int VECLEN, int SOALEN, bool compress12>
+void reorder_spinor_from_QPhiX(QPhiX::Geometry<FT, VECLEN, SOALEN, compress12> &geom,
+                               double *tm_spinor, FT const *qphix_spinor_cb0,
+                               FT const *qphix_spinor_cb1, double normFac = 1.0) {
+  const double startTime = gettime();
+
+  // Number of elements in spin, color & complex
+  const int Ns = 4;
+  const int Nc = 3;
+  const int Nz = 2;
+
+  // Geometric parameters for QPhiX data layout
+  const auto nVecs = geom.nVecs();
+  const auto Pxy = geom.getPxy();
+  const auto Pxyz = geom.getPxyz();
+
+  // This is needed to translate between the different
+  // gamma bases tmlQCD and QPhiX are using
+  const int change_sign[4] = {1, -1, -1, 1};
+  const int change_spin[4] = {3, 2, 1, 0};
+
+// This will loop over the entire lattice and calculate
+// the array and internal indices for both tmlQCD & QPhiX
+#pragma omp parallel for collapse(4)
+  for (uint64_t t = 0; t < T; t++)
+    for (uint64_t x = 0; x < LX; x++)
+      for (uint64_t y = 0; y < LY; y++)
+        for (uint64_t z = 0; z < LZ; z++) {
+          // These are the QPhiX SIMD vector in checkerboarded x direction
+          // (up to LX/2) and the internal position inside the SIMD vector
+          const uint64_t SIMD_vector = (x / 2) / SOALEN;
+          const uint64_t x_internal = (x / 2) % SOALEN;
+
+          // Calculate the array index in tmlQCD & QPhiX,
+          // given a global lattice index (t,x,y,z)
+          const uint64_t qphix_idx = t * Pxyz + z * Pxy + y * nVecs + SIMD_vector;
+          const uint64_t tm_idx = g_ipt[t][x][y][z];
+
+          // Calculate base point for every spinor field element (tmlQCD) or
+          // for every SIMD vector of spinors, a.k.a FourSpinorBlock (QPhiX),
+          // which will depend on the checkerboard (cb)
+          const FT *in;
+          if ((t + x + y + z) & 1)
+            in = qphix_spinor_cb1 + SOALEN * Nz * Nc * Ns * qphix_idx;  // cb1
+          else
+            in = qphix_spinor_cb0 + SOALEN * Nz * Nc * Ns * qphix_idx;  // cb0
+          double *out = tm_spinor + Ns * Nc * Nz * tm_idx;
+
+          // Copy the internal elements, performing a gamma basis transformation
+          for (int spin = 0; spin < Ns; spin++)  // tmlQCD spin index
+            for (int color = 0; color < Nc; color++)
+              for (int reim = 0; reim < Nz; reim++)  // RE or IM
+              {
+                const uint64_t qId = x_internal + reim * SOALEN + change_spin[spin] * SOALEN * Nz +
+                                     color * SOALEN * Nz * Ns;
+                const uint64_t tId = reim + color * Nz + spin * Nz * Nc;
+                // convert FT -> double first and scale in double, as reorder_eo_spinor_from_QPhiX does
+                out[tId] = normFac * change_sign[spin] * QPhiX::rep<double, FT>(in[qId]);
+              }
+        }  // volume
+
+  const double diffTime = gettime() - startTime;
+  if (g_debug_level > 1) {
+    QPhiX::masterPrintf("# QPHIX-interface: time spent in reorder_spinor_from_QPhiX: %f secs\n",
+                        diffTime);
+  }
+}
+
+// Pack the clover term and its inverses for the non-degenerate (ND) twisted
+// clover operator into QPhiX layout: clov on checkerboard cb, invclov_odiag and
+// full_invclov on checkerboard 1 - cb. If pack_inner is true, the same three
+// fields are also packed for the inner geometry (the *_inner arguments).
+template <typename FT, int V, int S, bool compress12, typename FT_inner, int V_inner, int S_inner,
+          bool compress12_inner>
+void pack_nd_clover(
+    QPhiX::Geometry<FT, V, S, compress12> &geom,
+    QPhiX::Geometry<FT_inner, V_inner, S_inner, compress12_inner> &geom_inner,
+    typename QPhiX::Geometry<FT, V, S, compress12>::FullCloverBlock *full_invclov[2],
+    typename QPhiX::Geometry<FT, V, S, compress12>::CloverBlock *invclov_odiag,
+    typename QPhiX::Geometry<FT, V, S, compress12>::CloverBlock *clov,
+    typename QPhiX::Geometry<FT_inner, V_inner, S_inner, compress12_inner>::FullCloverBlock
+        *full_invclov_inner[2],
+    typename QPhiX::Geometry<FT_inner, V_inner, S_inner, compress12_inner>::CloverBlock
+        *invclov_odiag_inner,
+    typename QPhiX::Geometry<FT_inner, V_inner, S_inner, compress12_inner>::CloverBlock *clov_inner,
+    const int cb, bool pack_inner) {
+  const double start = gettime();
+
+  reorder_clover_to_QPhiX(geom, clov, cb, false);
+  if (pack_inner) {
+    reorder_clover_to_QPhiX(geom_inner, clov_inner, cb, false);
+  }
+
+  // NOTE(review): each sw_invert_* call presumably refreshes tmLQCD's global inverse
+  // clover field that the following reorder calls read, so keep this order - confirm
+  sw_invert_epsbar(g_epsbar);
+  reorder_clover_to_QPhiX(geom, invclov_odiag, 1 - cb, true, true);
+  if (pack_inner) {
+    reorder_clover_to_QPhiX(geom_inner, invclov_odiag_inner, 1 - cb, true, true);
+  }
+
+  // no minus sign here, the difference in the sign of gamma5
+  // is taken care of internally
+  sw_invert_mubar(g_mubar);
+  reorder_clover_to_QPhiX(geom, full_invclov, 1 - cb, true);
+  if (pack_inner) {
+    reorder_clover_to_QPhiX(geom_inner, full_invclov_inner, 1 - cb, true);
+  }
+
+  sw_invert_nd(g_mubar * g_mubar - g_epsbar * g_epsbar);
+
+  if (g_debug_level > 1) {
+    QPhiX::masterPrintf("# QPHIX-interface: ND TMClover clover-field packing took %.4lf seconds\n",
+                        gettime() - start);
+  }
+}
+
+// Due to github issue #404, the helper functions to apply the full QPhiX operator
+// are currently disabled because they conflict with the new interfaces in QPhiX
+// itself. If required, these should be rewritten to use these interfaces
+// rather than the base classes in qphix_base_classes.hpp
+
+// Apply the full QPhiX fermion matrix to checkerboarded tm spinors
+// template <typename FT, int V, int S, bool compress>
+// void Mfull_helper(spinor *Even_out, spinor *Odd_out, const spinor *Even_in, const spinor *Odd_in,
+//                  const op_type_t op_type) {
+//  // TODO: this should use handles for gauge and spinors because these are definitely temporary
+//  // objects
+//  typedef typename QPhiX::Geometry<FT, V, S, compress>::SU3MatrixBlock QGauge;
+//  typedef typename QPhiX::Geometry<FT, V, S, compress>::FourSpinorBlock QSpinor;
+//  typedef typename QPhiX::Geometry<FT, V, S, compress>::CloverBlock QClover;
+//  typedef typename QPhiX::Geometry<FT, V, S, compress>::FullCloverBlock QFullClover;
+//
+//  if (g_debug_level > 1) tmlqcd::printQphixDiagnostics(V, S, compress, V, S, compress);
+//
+//  double coeff_s = (FT)(1);
+//  double coeff_t = (FT)(1);
+//
+//  QPhiX::Geometry<FT, V, S, compress> geom(subLattSize, By, Bz, NCores, Sy, Sz, PadXY, PadXYZ,
+//                                           MinCt);
+//
+//  // Wilson mass
+//  double mass = 1 / (2.0 * g_kappa) - 4;
+//
+//  tmlqcd::Dslash<FT, V, S, compress> *polymorphic_dslash;
+//
+//  QGauge *u_packed[2];
+//  QSpinor *qphix_in[2];
+//  QSpinor *qphix_out[2];
+//
+//  QClover *clover[2];
+//  QClover *inv_clover[2];
+//
+//  QFullClover *inv_fullclover[2][2];
+//
+//  QSpinor *tmp_spinor = (QSpinor *)geom.allocCBFourSpinor();
+//  for (int cb : {0, 1}) {
+//    u_packed[cb] = (QGauge *)geom.allocCBGauge();
+//    qphix_in[cb] = (QSpinor *)geom.allocCBFourSpinor();
+//    qphix_out[cb] = (QSpinor *)geom.allocCBFourSpinor();
+//    clover[cb] = nullptr;
+//    inv_clover[cb] = nullptr;
+//    for (int fl : {0, 1}) {
+//      inv_fullclover[cb][fl] = nullptr;
+//    }
+//  }
+//  reorder_gauge_to_QPhiX(geom, u_packed[cb_even], u_packed[cb_odd]);
+//
+//  if (op_type == WILSON) {
+//    polymorphic_dslash = new tmlqcd::WilsonDslash<FT, V, S, compress>(
+//        &geom, t_boundary, coeff_s, coeff_t, mass, use_tbc, tbc_phases);
+//  } else if (op_type == TMWILSON) {
+//    polymorphic_dslash = new tmlqcd::WilsonTMDslash<FT, V, S, compress>(
+//        &geom, t_boundary, coeff_s, coeff_t, mass, -g_mu / (2.0 * g_kappa), use_tbc, tbc_phases);
+//  } else if (op_type == CLOVER && fabs(g_mu) <= DBL_EPSILON) {
+//    for (int cb : {0, 1}) {
+//      clover[cb] = (QClover *)geom.allocCBClov();
+//      inv_clover[cb] = (QClover *)geom.allocCBClov();
+//
+//      reorder_clover_to_QPhiX(geom, clover[cb], cb, false);
+//      sw_invert(cb, 0);
+//      reorder_clover_to_QPhiX(geom, inv_clover[cb], cb, true);
+//    }
+//
+//    polymorphic_dslash = new tmlqcd::WilsonClovDslash<FT, V, S, compress>(
+//        &geom, t_boundary, coeff_s, coeff_t, mass, clover, inv_clover, use_tbc, tbc_phases);
+//
+//  } else if (op_type == CLOVER && fabs(g_mu) > DBL_EPSILON) {
+//    for (int cb : {0, 1}) {
+//      clover[cb] = (QClover *)geom.allocCBClov();
+//      for (int fl : {0, 1}) {
+//        inv_fullclover[cb][fl] = (QFullClover *)geom.allocCBFullClov();
+//      }
+//      reorder_clover_to_QPhiX(geom, clover[cb], cb, false);
+//      sw_invert(cb, g_mu);
+//      reorder_clover_to_QPhiX(geom, inv_fullclover[cb], cb, true);
+//    }
+//
+//    polymorphic_dslash = new tmlqcd::WilsonClovTMDslash<FT, V, S, compress>(
+//        &geom, t_boundary, coeff_s, coeff_t, mass, -g_mu / (2.0 * g_kappa), clover,
+//        inv_fullclover, use_tbc, tbc_phases);
+//
+//  } else {
+//    QPhiX::masterPrintf("tmlqcd::Mfull_helper; No such operator type: %d\n", op_type);
+//    abort();
+//  }
+//
+////   reorder_eo_spinor_to_QPhiX(geom, reinterpret_cast<double const *const>(Even_in),
+////                              qphix_in[cb_even], cb_even);
+////   reorder_eo_spinor_to_QPhiX(geom, reinterpret_cast<double const *const>(Odd_in),
+////                              qphix_in[cb_odd], cb_odd);
+//  reorder_eo_spinor_to_QPhiX(geom, Even_in,
+//                             qphix_in[cb_even], cb_even);
+//  reorder_eo_spinor_to_QPhiX(geom, Odd_in, qphix_in[cb_odd],
+//                             cb_odd);
+//  // Apply QPhiX Mfull
+//  polymorphic_dslash->plain_dslash(qphix_out[cb_odd], qphix_in[cb_even], u_packed[cb_odd],
+//                                   /* isign == non-conjugate */ 1, cb_odd);
+//  polymorphic_dslash->plain_dslash(qphix_out[cb_even], qphix_in[cb_odd], u_packed[cb_even],
+//                                   /* isign == non-conjugate */ 1, cb_even);
+//  for (int cb : {0, 1}) {
+//    polymorphic_dslash->A_chi(tmp_spinor, qphix_in[cb], 1, cb);
+//    QPhiX::aypx(-0.5, tmp_spinor, qphix_out[cb], geom, 1);
+//  }
+//
+//  reorder_eo_spinor_from_QPhiX(geom, Even_out, qphix_out[cb_even],
+//                               cb_even, 2.0 * g_kappa);
+//  reorder_eo_spinor_from_QPhiX(geom, Odd_out, qphix_out[cb_odd], cb_odd,
+//                               2.0 * g_kappa);
+//
+//  geom.free(tmp_spinor);
+//  for (int cb : {0, 1}) {
+//    geom.free(u_packed[cb]);
+//    geom.free(qphix_in[cb]);
+//    geom.free(qphix_out[cb]);
+//    geom.free(clover[cb]);
+//    geom.free(inv_clover[cb]);
+//    for (int fl : {0, 1}) {
+//      geom.free(inv_fullclover[cb][fl]);
+//    }
+//  };
+//  delete (polymorphic_dslash);
+//}
+
+// Templated even-odd preconditioned solver using QPhiX Library
+template <typename FT, int V, int S, bool compress, typename FT_inner = FT, int V_inner = V,
+          int S_inner = S, bool compress_inner = compress>
+int invert_eo_qphix_helper(std::vector<std::vector<spinor *> > &tmlqcd_odd_out,
+                           std::vector<std::vector<spinor *> > &tmlqcd_odd_in,
+                           const double target_precision, const int max_iter, const int solver_flag,
+                           solver_params_t solver_params, const int num_flavour) {
+  // TODO: it would perhaps be beneficial to keep the fields resident
+  typedef typename QPhiX::Geometry<FT, V, S, compress>::SU3MatrixBlock QGauge;
+  typedef typename QPhiX::Geometry<FT, V, S, compress>::FourSpinorBlock QSpinor;
+  typedef typename QPhiX::FourSpinorHandle<FT, V, S, compress> QSpinorHandle;
+  typedef typename QPhiX::Geometry<FT, V, S, compress>::CloverBlock QClover;
+  typedef typename QPhiX::Geometry<FT, V, S, compress>::FullCloverBlock QFullClover;
+
+  typedef typename QPhiX::Geometry<FT_inner, V_inner, S_inner, compress_inner>::SU3MatrixBlock
+      QGauge_inner;
+  typedef typename QPhiX::Geometry<FT_inner, V_inner, S_inner, compress_inner>::FourSpinorBlock
+      QSpinor_inner;
+  typedef typename QPhiX::FourSpinorHandle<FT_inner, V_inner, S_inner, compress_inner>
+      QSpinorHandle_inner;
+  typedef typename QPhiX::Geometry<FT_inner, V_inner, S_inner, compress_inner>::CloverBlock
+      QClover_inner;
+  typedef typename QPhiX::Geometry<FT_inner, V_inner, S_inner, compress_inner>::FullCloverBlock
+      QFullClover_inner;
+
+  /************************
+   *                      *
+   *    SETUP GEOMETRY    *
+   *                      *
+   ************************/
+
+  if (g_debug_level > 1) {
+    tmlqcd::printQphixDiagnostics(V, S, compress, V_inner, S_inner, compress_inner);
+  }
+
+  QPhiX::Geometry<FT, V, S, compress> geom(subLattSize, By, Bz, NCores, Sy, Sz, PadXY, PadXYZ,
+                                           MinCt);
+
+  // we always create the inner geometry, the overhead should be small...
+  QPhiX::Geometry<FT_inner, V_inner, S_inner, compress_inner> geom_inner(
+      subLattSize, By, Bz, NCores, Sy, Sz, PadXY, PadXYZ, MinCt);
+
+  // Set the number of BLAS threads by hand.
+  // If someone implements the tune routines in QPhiX,
+  // this may be updated...
+  QPhiX::masterPrintf("# Setting number of BLAS threads...\n");
+  const int n_blas_simt = N_simt;
+  QPhiX::masterPrintf("# ...done.\n");
+
+  // Anisotropy Coefficients
+  const double coeff_s = 1.0;
+  const double coeff_t = 1.0;
+
+  // The Wilson mass
+  const double mass = 1.0 / (2.0 * g_kappa) - 4.0;
+
+  // Set variables needed for the solve
+  bool verbose = g_debug_level > 2 ? true : false;
+  int niters = -1;
+  int niters2 = 0;
+  double rsd_final = -1.0;
+  uint64_t site_flops = 0;
+  uint64_t site_flops2 = 0;
+  uint64_t mv_apps = 0;
+  uint64_t mv_apps2 = 0;
+
+  double start_time;
+  double end_time;
+
+  // support for multi-shift solves via the length of the output vector,
+  // which counts the shifts on the outer index and the flavour on the inner index
+  const int num_shifts = tmlqcd_odd_out.size();
+  std::vector<double> shifts;
+  shifts.resize(num_shifts);
+  std::vector<double> RsdTargetArr;
+  RsdTargetArr.resize(num_shifts);
+  std::vector<double> RsdFinalArr;
+  RsdFinalArr.resize(num_shifts);
+
+  double rescale = 0.5 / g_kappa;
+  // the inverse of M M^dag, as required for the HMC, comes with a factor of alpha^2
+  if (solver_params.solution_type == TM_SOLUTION_M_MDAG) {
+    rescale *= rescale;
+  }
+
+  std::vector<QSpinorHandle> q_spinor_handles;
+
+  QGauge *u_packed[2] = {nullptr, nullptr};
+  QGauge_inner *u_packed_inner[2] = {nullptr, nullptr};
+  for (int cb : {0, 1}) {
+    u_packed[cb] = (QGauge *)geom.allocCBGauge();
+  }
+  // Reorder (global) input gauge field from tmLQCD to QPhiX
+  reorder_gauge_to_QPhiX(geom, u_packed[cb_even], u_packed[cb_odd]);
+
+  // for mixed solvers, we also need the gauge field in the inner precision
+  if (solver_is_mixed(solver_flag)) {
+    for (int cb : {0, 1}) {
+      u_packed_inner[cb] = (QGauge_inner *)geom_inner.allocCBGauge();
+    }
+    reorder_gauge_to_QPhiX(geom_inner, u_packed_inner[cb_even], u_packed_inner[cb_odd]);
+  }
+
+  if (num_flavour == 1) {
+    constexpr int nf = 1;
+    std::vector<QSpinor *> qphix_in;
+    qphix_in.resize(1);
+    std::vector<QSpinor *> qphix_out;
+    qphix_out.resize(num_shifts);
+    QSpinor *qphix_buffer;
+
+    QClover *qphix_clover = nullptr;
+    QClover *qphix_inv_clover = nullptr;
+
+    QClover_inner *qphix_clover_inner = nullptr;
+    QClover_inner *qphix_inv_clover_inner = nullptr;
+
+    QFullClover *qphix_inv_fullclover[2] = {nullptr, nullptr};
+
+    QFullClover_inner *qphix_inv_fullclover_inner[2] = {nullptr, nullptr};
+
+    q_spinor_handles.push_back(makeFourSpinorHandle(geom));
+    qphix_in[0] = q_spinor_handles.back().get();
+
+    for (int shift = 0; shift < num_shifts; shift++) {
+      q_spinor_handles.push_back(makeFourSpinorHandle(geom));
+      qphix_out[shift] = q_spinor_handles.back().get();
+    }
+
+    q_spinor_handles.push_back(makeFourSpinorHandle(geom));
+    qphix_buffer = q_spinor_handles.back().get();
+
+    QPhiX::EvenOddLinearOperator<FT, V, S, compress> *FermionMatrixQPhiX = nullptr;
+    QPhiX::EvenOddLinearOperator<FT_inner, V_inner, S_inner, compress_inner>
+        *InnerFermionMatrixQPhiX = nullptr;
+    if ((fabs(g_mu) > DBL_EPSILON) && g_c_sw > DBL_EPSILON) {  // TWISTED-MASS-CLOVER
+      qphix_clover = (QClover *)geom.allocCBClov();
+      for (int fl : {0, 1}) {
+        qphix_inv_fullclover[fl] = (QFullClover *)geom.allocCBFullClov();
+      }
+      reorder_clover_to_QPhiX(geom, qphix_clover, cb_odd, false);
+      reorder_clover_to_QPhiX(geom, qphix_inv_fullclover, cb_even, true);
+
+      QPhiX::masterPrintf("# Creating QPhiX Twisted Clover Fermion Matrix...\n");
+      FermionMatrixQPhiX = new QPhiX::EvenOddTMCloverOperator<FT, V, S, compress>(
+          u_packed, qphix_clover, qphix_inv_fullclover, &geom, t_boundary, coeff_s, coeff_t,
+          use_tbc, tbc_phases, -0.5 * (g_mu3 + g_mu) / g_kappa);
+      if (solver_is_mixed(solver_flag)) {
+        qphix_clover_inner = (QClover_inner *)geom_inner.allocCBClov();
+        for (int fl : {0, 1}) {
+          qphix_inv_fullclover_inner[fl] = (QFullClover_inner *)geom_inner.allocCBFullClov();
+        }
+        reorder_clover_to_QPhiX(geom_inner, qphix_clover_inner, cb_odd, false);
+        reorder_clover_to_QPhiX(geom_inner, qphix_inv_fullclover_inner, cb_even, true);
+        InnerFermionMatrixQPhiX =
+            new QPhiX::EvenOddTMCloverOperator<FT_inner, V_inner, S_inner, compress_inner>(
+                u_packed_inner, qphix_clover_inner, qphix_inv_fullclover_inner, &geom_inner,
+                t_boundary, coeff_s, coeff_t, use_tbc, tbc_phases, -0.5 * (g_mu3 + g_mu) / g_kappa);
+      }
+      QPhiX::masterPrintf("# ...done.\n");
+    } else if (fabs(g_mu) > DBL_EPSILON) {  // TWISTED-MASS
+      const double TwistedMass = -g_mu / (2.0 * g_kappa);
+      QPhiX::masterPrintf("# Creating QPhiX Twisted Mass Wilson Fermion Matrix...\n");
+      FermionMatrixQPhiX = new QPhiX::EvenOddTMWilsonOperator<FT, V, S, compress>(
+          mass, TwistedMass, u_packed, &geom, t_boundary, coeff_s, coeff_t, use_tbc, tbc_phases);
+      QPhiX::masterPrintf("# ...done.\n");
+      if (solver_is_mixed(solver_flag)) {
+        InnerFermionMatrixQPhiX =
+            new QPhiX::EvenOddTMWilsonOperator<FT_inner, V_inner, S_inner, compress_inner>(
+                mass, TwistedMass, u_packed_inner, &geom_inner, t_boundary, coeff_s, coeff_t,
+                use_tbc, tbc_phases);
+      }
+    } else if (g_c_sw > DBL_EPSILON) {  // WILSON CLOVER
+      qphix_clover = (QClover *)geom.allocCBClov();
+      qphix_inv_clover = (QClover *)geom.allocCBClov();
+
+      reorder_clover_to_QPhiX(geom, qphix_clover, cb_odd, false);
+      reorder_clover_to_QPhiX(geom, qphix_inv_clover, cb_even, true);
+
+      QPhiX::masterPrintf("# Creating QPhiX Wilson Clover Fermion Matrix...\n");
+      FermionMatrixQPhiX = new QPhiX::EvenOddCloverOperator<FT, V, S, compress>(
+          u_packed, qphix_clover, qphix_inv_clover, &geom, t_boundary, coeff_s, coeff_t, use_tbc,
+          tbc_phases, -0.5 * g_mu3 / g_kappa);
+      if (solver_is_mixed(solver_flag)) {
+        qphix_clover_inner = (QClover_inner *)geom_inner.allocCBClov();
+        qphix_inv_clover_inner = (QClover_inner *)geom_inner.allocCBClov();
+        reorder_clover_to_QPhiX(geom_inner, qphix_clover_inner, cb_odd, false);
+        reorder_clover_to_QPhiX(geom_inner, qphix_inv_clover_inner, cb_even, true);
+        InnerFermionMatrixQPhiX =
+            new QPhiX::EvenOddCloverOperator<FT_inner, V_inner, S_inner, compress_inner>(
+                u_packed_inner, qphix_clover_inner, qphix_inv_clover_inner, &geom_inner, t_boundary,
+                coeff_s, coeff_t, use_tbc, tbc_phases, -0.5 * g_mu3 / g_kappa);
+      }
+      QPhiX::masterPrintf("# ...done.\n");
+
+    } else {  // WILSON
+      QPhiX::masterPrintf("# Creating QPhiX Wilson Fermion Matrix...\n");
+      FermionMatrixQPhiX = new QPhiX::EvenOddWilsonOperator<FT, V, S, compress>(
+          mass, u_packed, &geom, t_boundary, coeff_s, coeff_t, use_tbc, tbc_phases);
+      if (solver_is_mixed(solver_flag)) {
+        InnerFermionMatrixQPhiX =
+            new QPhiX::EvenOddWilsonOperator<FT_inner, V_inner, S_inner, compress_inner>(
+                mass, u_packed_inner, &geom_inner, t_boundary, coeff_s, coeff_t, use_tbc,
+                tbc_phases);
+      }
+      QPhiX::masterPrintf("# ...done.\n");
+    }
+
+    // Create a Linear Solver Object
+    QPhiX::AbstractSolver<FT, V, S, compress> *SolverQPhiX = nullptr;
+    QPhiX::AbstractSolver<FT_inner, V_inner, S_inner, compress_inner> *InnerSolverQPhiX = nullptr;
+    QPhiX::AbstractMultiSolver<FT, V, S, compress, nf> *MultiSolverQPhiX = nullptr;
+    if (solver_flag == DUMMYHERMTEST) {
+      QPhiX::masterPrintf("# QPHIX: Creating dummy solver for hermiticity test...\n");
+      SolverQPhiX =
+          new QPhiX::InvDummyHermTest<FT, V, S, compress,
+                                      typename QPhiX::EvenOddLinearOperator<FT, V, S, compress> >(
+              *FermionMatrixQPhiX, max_iter);
+    } else if (solver_flag == CG) {
+      QPhiX::masterPrintf("# QPHIX: Creating CG solver...\n");
+      SolverQPhiX = new QPhiX::InvCG<FT, V, S, compress>(*FermionMatrixQPhiX, max_iter);
+    } else if (solver_flag == BICGSTAB) {
+      QPhiX::masterPrintf("# QPHIX: Creating BiCGStab solver...\n");
+      SolverQPhiX = new QPhiX::InvBiCGStab<FT, V, S, compress>(*FermionMatrixQPhiX, max_iter);
+    } else if (solver_flag == MIXEDCG) {
+      // TODO: probably need to adjust inner solver iterations here...
+      QPhiX::masterPrintf("# QPHIX: Creating mixed-precision CG solver...\n");
+      InnerSolverQPhiX = new QPhiX::InvCG<FT_inner, V_inner, S_inner, compress_inner>(
+          *InnerFermionMatrixQPhiX, max_iter);
+      const bool MMdag = true;
+      SolverQPhiX = new QPhiX::InvRichardsonMultiPrec<FT, V, S, compress, FT_inner, V_inner,
+                                                      S_inner, compress_inner, MMdag>(
+          *FermionMatrixQPhiX, *InnerSolverQPhiX, solver_params.mcg_delta, max_iter);
+    } else if (solver_flag == MIXEDBICGSTAB) {
+      QPhiX::masterPrintf("# QPHIX: Creating mixed-precision BICGCGSTAB solver...\n");
+      InnerSolverQPhiX = new QPhiX::InvBiCGStab<FT_inner, V_inner, S_inner, compress_inner>(
+          *InnerFermionMatrixQPhiX, max_iter);
+      const bool MMdag = false;
+      SolverQPhiX = new QPhiX::InvRichardsonMultiPrec<FT, V, S, compress, FT_inner, V_inner,
+                                                      S_inner, compress_inner, MMdag>(
+          *FermionMatrixQPhiX, *InnerSolverQPhiX, solver_params.mcg_delta, max_iter);
+    } else if (solver_flag == CGMMS) {
+      QPhiX::masterPrintf("# QPHIX: Creating multi-shift CG solver ...\n");
+      MultiSolverQPhiX =
+          new QPhiX::MInvCG<FT, V, S, compress>(*FermionMatrixQPhiX, max_iter, num_shifts);
+    } else {
+      QPhiX::masterPrintf(" Solver not yet supported by QPhiX!\n");
+      QPhiX::masterPrintf(" Aborting...\n");
+      abort();
+    }
+    QPhiX::masterPrintf("# ...done.\n");
+
+    //     reorder_eo_spinor_to_QPhiX(geom, reinterpret_cast<double const
+    //     *const>(tmlqcd_odd_in[0][0]),
+    //                                qphix_in[0], cb_odd);
+    reorder_eo_spinor_to_QPhiX(geom, tmlqcd_odd_in[0][0], qphix_in[0], cb_odd);
+    QPhiX::masterPrintf("# Calling the solver...\n");
+
+    // Set the right precision for the QPhiX solver.
+    // We get target_precision externally, and it is given such that it is either
+    // already relative or absolute.
+    // Most QPhiX solvers allow setting absolute or relative residual
+    // by passing an appropriate flag, but this is not true for the multi-shift solver.
+    // As a result, we follow that solver and call ALL solvers with
+    // QPhiX::RELATIVE, which gives results consistent with tmLQCD in all cases.
+    double rhs_norm2 = 1.0;
+    QPhiX::norm2Spinor(rhs_norm2, qphix_in[0], geom, n_blas_simt);
+    const double RsdTarget = sqrt(target_precision / rhs_norm2);
+
+    // Calling the solver
+    start_time = gettime();
+    if (solver_flag == DUMMYHERMTEST) {
+      random_spinor_field_eo(tmlqcd_odd_out[0][0], 0, RN_GAUSS);
+      reorder_eo_spinor_to_QPhiX(geom, tmlqcd_odd_out[0][0], qphix_buffer, cb_odd);
+      for (int isign : {-1, 1}) {
+        (*SolverQPhiX)(qphix_buffer, qphix_in[0], RsdTarget, niters, rsd_final, site_flops, mv_apps,
+                       isign, verbose, cb_odd, QPhiX::RELATIVE);
+      }
+      QPhiX::copySpinor(qphix_out[0], qphix_buffer, geom, n_blas_simt);
+    } else if (solver_flag == CG || solver_flag == MIXEDCG || solver_flag == RGMIXEDCG) {
+      // USING CG:
+      // We are solving
+      //   M M^dagger qphix_buffer = qphix_in_prepared
+      // here, that is, isign = -1 for the QPhiX CG solver.
+      (*SolverQPhiX)(qphix_buffer, qphix_in[0], RsdTarget, niters, rsd_final, site_flops, mv_apps,
+                     -1, verbose, cb_odd, QPhiX::RELATIVE);
+      // After that, if required by the solution type, multiply with M^dagger:
+      //   qphix_out[0] = M^dagger ( M^dagger^-1 M^-1 ) qphix_in_prepared
+      if (solver_params.solution_type == TM_SOLUTION_M) {
+        (*FermionMatrixQPhiX)(qphix_out[0], qphix_buffer, /* conjugate */ -1);
+        mv_apps++;
+      } else {
+        QPhiX::copySpinor(qphix_out[0], qphix_buffer, geom, n_blas_simt);
+      }
+    } else if (solver_flag == CGMMS) {
+      // TODO: handle the residuals properly
+      if (g_debug_level > 2) QPhiX::masterPrintf("# QPHIX CGMMS: shifts: \n");
+      for (int shift = 0; shift < num_shifts; shift++) {
+        RsdTargetArr[shift] = RsdTarget;
+        RsdFinalArr[shift] = -1.0;
+        shifts[shift] =
+            solver_params.shifts[shift] * solver_params.shifts[shift] / (4 * g_kappa * g_kappa);
+        if (g_debug_level > 2)
+          QPhiX::masterPrintf("# QPHIX CGMMS: shift[%d] = %.6e\n", shift, shifts[shift]);
+      }
+      if (g_debug_level > 2) QPhiX::masterPrintf("\n");
+      (*MultiSolverQPhiX)(qphix_out.data(), qphix_in[0], num_shifts, shifts.data(),
+                          RsdTargetArr.data(), niters, RsdFinalArr.data(), site_flops, mv_apps, -1,
+                          verbose);
+      rsd_final = RsdFinalArr[0];
+    } else if (solver_flag == BICGSTAB || solver_flag == MIXEDBICGSTAB) {
+      (*SolverQPhiX)(qphix_buffer, qphix_in[0], RsdTarget, niters, rsd_final, site_flops, mv_apps,
+                     1, verbose, cb_odd, QPhiX::RELATIVE);
+      // for M^dagger^-1 M^-1 solution type, need to call BiCGstab twice
+      if (solver_params.solution_type == TM_SOLUTION_M_MDAG) {
+        (*SolverQPhiX)(qphix_out[0], qphix_buffer, RsdTarget, niters2, rsd_final, site_flops,
+                       mv_apps2, -1, verbose, cb_odd, QPhiX::RELATIVE);
+      } else {
+        QPhiX::copySpinor(qphix_out[0], qphix_buffer, geom, n_blas_simt);
+      }
+    }
+    end_time = gettime();
+
+    for (int shift = 0; shift < num_shifts; shift++) {
+      reorder_eo_spinor_from_QPhiX(geom, tmlqcd_odd_out[shift][0], qphix_out[shift], cb_odd,
+                                   rescale);
+    }
+
+    QPhiX::masterPrintf("# QPHIX: ...done.\n");
+    QPhiX::masterPrintf("# QPHIX: Cleaning up\n");
+    delete (FermionMatrixQPhiX);
+    delete (InnerFermionMatrixQPhiX);
+    delete (SolverQPhiX);
+    delete (InnerSolverQPhiX);
+    delete (MultiSolverQPhiX);
+    // on KNL, it seems that munmap is problematic, so we check for nullptr
+    if (qphix_clover) geom.free(qphix_clover);
+    if (qphix_inv_clover) geom.free(qphix_inv_clover);
+    if (qphix_clover_inner) geom_inner.free(qphix_clover_inner);
+    if (qphix_inv_clover_inner) geom_inner.free(qphix_inv_clover_inner);
+    for (int fl : {0, 1}) {
+      if (qphix_inv_fullclover[fl]) geom.free(qphix_inv_fullclover[fl]);
+      if (qphix_inv_fullclover_inner[fl]) geom_inner.free(qphix_inv_fullclover_inner[fl]);
+    }
+    QPhiX::masterPrintf("# QPHIX: ...done.\n\n");
+
+  } else if (num_flavour == 2) {
+    // for explicit template arguments
+    constexpr int nf = 2;
+
+    QSpinor *qphix_in[2];
+    std::vector<QSpinor **> qphix_out;
+    qphix_out.resize(num_shifts);
+    for (int shift = 0; shift < num_shifts; shift++) {
+      qphix_out[shift] = new QSpinor *[2];
+      for (int fl : {0, 1}) {
+        q_spinor_handles.push_back(makeFourSpinorHandle(geom));
+        qphix_out[shift][fl] = q_spinor_handles.back().get();
+      }
+    }
+
+    QSpinor *qphix_buffer[2];
+    for (int fl : {0, 1}) {
+      q_spinor_handles.push_back(makeFourSpinorHandle(geom));
+      qphix_in[fl] = q_spinor_handles.back().get();
+      q_spinor_handles.push_back(makeFourSpinorHandle(geom));
+      qphix_buffer[fl] = q_spinor_handles.back().get();
+    }
+
+    QClover *qphix_clover = nullptr;
+    QClover_inner *qphix_clover_inner = nullptr;
+
+    QClover *qphix_invclov_odiag = nullptr;
+    QClover_inner *qphix_invclov_odiag_inner = nullptr;
+
+    QFullClover *qphix_inv_fullclover[2] = {nullptr, nullptr};
+    QFullClover_inner *qphix_inv_fullclover_inner[2] = {nullptr, nullptr};
+
+    QPhiX::TwoFlavEvenOddLinearOperator<FT, V, S, compress> *TwoFlavFermionMatrixQPhiX = nullptr;
+    QPhiX::TwoFlavEvenOddLinearOperator<FT_inner, V_inner, S_inner, compress_inner>
+        *InnerTwoFlavFermionMatrixQPhiX = nullptr;
+
+    if (g_c_sw > DBL_EPSILON) {  // DBCLOVER
+      qphix_clover = (QClover *)geom.allocCBClov();
+      qphix_invclov_odiag = (QClover *)geom.allocCBClov();
+      if (solver_is_mixed(solver_flag)) {
+        qphix_clover_inner = (QClover_inner *)geom_inner.allocCBClov();
+        qphix_invclov_odiag_inner = (QClover_inner *)geom_inner.allocCBClov();
+      }
+
+      for (int fl : {0, 1}) {
+        qphix_inv_fullclover[fl] = (QFullClover *)geom.allocCBFullClov();
+        if (solver_is_mixed(solver_flag)) {
+          qphix_inv_fullclover_inner[fl] = (QFullClover_inner *)geom_inner.allocCBFullClov();
+        }
+      }
+
+      pack_nd_clover(geom, geom_inner, qphix_inv_fullclover, qphix_invclov_odiag, qphix_clover,
+                     qphix_inv_fullclover_inner, qphix_invclov_odiag_inner, qphix_clover_inner,
+                     cb_odd, solver_is_mixed(solver_flag));
+
+      QPhiX::masterPrintf(
+          "# QPHIX: Creating two-flavour QPhiX Wilson Twisted Clover Fermion Matrix...\n");
+      TwoFlavFermionMatrixQPhiX = new QPhiX::EvenOddNDTMCloverReuseOperator<FT, V, S, compress>(
+          -0.5 * g_mubar / g_kappa, 0.5 * g_epsbar / g_kappa, u_packed, qphix_clover,
+          qphix_invclov_odiag, qphix_inv_fullclover, &geom, t_boundary, coeff_s, coeff_t, use_tbc,
+          tbc_phases);
+      if (solver_is_mixed(solver_flag)) {
+        InnerTwoFlavFermionMatrixQPhiX =
+            new QPhiX::EvenOddNDTMCloverReuseOperator<FT_inner, V_inner, S_inner, compress_inner>(
+                -0.5 * g_mubar / g_kappa, 0.5 * g_epsbar / g_kappa, u_packed_inner,
+                qphix_clover_inner, qphix_invclov_odiag_inner, qphix_inv_fullclover_inner,
+                &geom_inner, t_boundary, coeff_s, coeff_t, use_tbc, tbc_phases);
+      }
+    } else {  // DBTMWILSON
+      QPhiX::masterPrintf(
+          "# QPHIX: Creating two-flavour QPhiX Wilson Twisted Mass Fermion Matrix...\n");
+      TwoFlavFermionMatrixQPhiX = new QPhiX::EvenOddNDTMWilsonReuseOperator<FT, V, S, compress>(
+          mass, -0.5 * g_mubar / g_kappa, 0.5 * g_epsbar / g_kappa, u_packed, &geom, t_boundary,
+          coeff_s, coeff_t, use_tbc, tbc_phases);
+      if (solver_is_mixed(solver_flag)) {
+        InnerTwoFlavFermionMatrixQPhiX =
+            new QPhiX::EvenOddNDTMWilsonReuseOperator<FT_inner, V_inner, S_inner, compress_inner>(
+                mass, -0.5 * g_mubar / g_kappa, 0.5 * g_epsbar / g_kappa, u_packed_inner,
+                &geom_inner, t_boundary, coeff_s, coeff_t, use_tbc, tbc_phases);
+      }
+    }
+
+    //
+    QPhiX::AbstractSolver<FT, V, S, compress, nf> *TwoFlavSolverQPhiX = nullptr;
+    QPhiX::AbstractSolver<FT_inner, V_inner, S_inner, compress_inner, nf> *InnerTwoFlavSolverQPhiX =
+        nullptr;
+    QPhiX::AbstractMultiSolver<FT, V, S, compress, nf> *TwoFlavMultiSolverQPhiX = nullptr;
+    if (solver_flag == DUMMYHERMTEST) {
+      QPhiX::masterPrintf("# QPHIX: Creating dummy solver for hermiticity test...\n");
+      TwoFlavSolverQPhiX = new QPhiX::InvDummyHermTest<
+          FT, V, S, compress, typename QPhiX::TwoFlavEvenOddLinearOperator<FT, V, S, compress> >(
+          *TwoFlavFermionMatrixQPhiX, max_iter);
+    } else if (solver_flag == CG) {
+      QPhiX::masterPrintf("# QPHIX: Creating CG solver...\n");
+      TwoFlavSolverQPhiX =
+          new QPhiX::InvCG<FT, V, S, compress,
+                           typename QPhiX::TwoFlavEvenOddLinearOperator<FT, V, S, compress> >(
+              *TwoFlavFermionMatrixQPhiX, max_iter);
+    } else if (solver_flag == BICGSTAB) {
+      QPhiX::masterPrintf("# QPHIX: Creating BiCGstab solver...\n");
+      TwoFlavSolverQPhiX =
+          new QPhiX::InvBiCGStab<FT, V, S, compress,
+                                 typename QPhiX::TwoFlavEvenOddLinearOperator<FT, V, S, compress> >(
+              *TwoFlavFermionMatrixQPhiX, max_iter);
+    } else if (solver_flag == MIXEDCG) {
+      QPhiX::masterPrintf("# QPHIX: Creating mixed-precision CG solver...\n");
+      InnerTwoFlavSolverQPhiX =
+          new QPhiX::InvCG<FT_inner, V_inner, S_inner, compress_inner,
+                           typename QPhiX::TwoFlavEvenOddLinearOperator<FT_inner, V_inner, S_inner,
+                                                                        compress_inner> >(
+              *InnerTwoFlavFermionMatrixQPhiX, max_iter);
+      const bool MMdag = true;
+      TwoFlavSolverQPhiX = new QPhiX::InvRichardsonMultiPrec<
+          FT, V, S, compress, FT_inner, V_inner, S_inner, compress_inner, MMdag,
+          typename QPhiX::TwoFlavEvenOddLinearOperator<FT, V, S, compress> >(
+          *TwoFlavFermionMatrixQPhiX, *InnerTwoFlavSolverQPhiX, solver_params.mcg_delta, max_iter);
+    } else if (solver_flag == CGMMSND) {
+      QPhiX::masterPrintf("# QPHIX: Creating multi-shift CG solver...\n");
+      TwoFlavMultiSolverQPhiX =
+          new QPhiX::MInvCG<FT, V, S, compress,
+                            typename QPhiX::TwoFlavEvenOddLinearOperator<FT, V, S, compress> >(
+              *TwoFlavFermionMatrixQPhiX, max_iter, num_shifts);
+    } else {
+      QPhiX::masterPrintf(" Solver not yet supported by QPhiX!\n");
+      QPhiX::masterPrintf(" Aborting...\n");
+      abort();
+    }
+    QPhiX::masterPrintf("# QPHIX: ...done.\n");
+
+    for (int fl : {0, 1}) {
+      //       reorder_eo_spinor_to_QPhiX(geom, reinterpret_cast<double const
+      //       *const>(tmlqcd_odd_in[0][fl]),
+      //                                  qphix_in[fl], cb_odd);
+      reorder_eo_spinor_to_QPhiX(geom, tmlqcd_odd_in[0][fl], qphix_in[fl], cb_odd);
+    }
+
+    QPhiX::masterPrintf("# QPHIX: Calling the solver...\n");
+
+    // Set the right precision for the QPhiX solver
+    // we get target_precision externally and and is given such, that it's either
+    // already relative or absolute
+    // Most QPhiX solvers allow setting absolute or relative residual
+    // by passing an appropriate flag, but this is not true for the multi-shift solver.
+    // As a result, we follow that solver and call ALL solvers with
+    // QPhiX::RELATIVE, which gives results consistent with tmLQCD in all cases.
+    double rhs_norm2 = 1.0;
+    QPhiX::norm2Spinor<FT, V, S, compress, nf>(rhs_norm2, qphix_in, geom, n_blas_simt);
+    const double RsdTarget = sqrt(target_precision / rhs_norm2);
+
+    // Calling the solver
+    start_time = gettime();
+    if (solver_flag == DUMMYHERMTEST) {
+      for (int fl : {0, 1}) {
+        random_spinor_field_eo(tmlqcd_odd_out[0][fl], 0, RN_GAUSS);
+        reorder_eo_spinor_to_QPhiX(geom, tmlqcd_odd_out[0][fl], qphix_buffer[fl], cb_odd);
+      }
+      for (int isign : {-1, 1}) {
+        (*TwoFlavSolverQPhiX)(qphix_buffer, qphix_in, RsdTarget, niters, rsd_final, site_flops,
+                              mv_apps, isign, verbose, cb_odd, QPhiX::RELATIVE);
+      }
+      QPhiX::copySpinor<FT, V, S, compress, nf>(qphix_out[0], qphix_buffer, geom, n_blas_simt);
+    } else if (solver_flag == CG || solver_flag == MIXEDCG) {
+      // USING CG:
+      // We are solving
+      //   M M^dagger qphix_buffer = qphix_in_prepared
+      // here, that is, isign = -1 for the QPhiX CG solver.
+      (*TwoFlavSolverQPhiX)(qphix_buffer, qphix_in, RsdTarget, niters, rsd_final, site_flops,
+                            mv_apps, -1, verbose, cb_odd, QPhiX::RELATIVE);
+      // After that. if required by the solution type, multiply with M^dagger:
+      //   qphix_out[1] = M^dagger M^dagger^-1 M^-1 qphix_in_prepared
+      if (solver_params.solution_type == TM_SOLUTION_M) {
+        (*TwoFlavFermionMatrixQPhiX)(qphix_out[0], qphix_buffer, /* conjugate */ -1);
+        mv_apps++;
+      } else {
+        QPhiX::copySpinor<FT, V, S, compress, nf>(qphix_out[0], qphix_buffer, geom, n_blas_simt);
+      }
+    } else if (solver_flag == BICGSTAB || solver_flag == MIXEDBICGSTAB) {
+      (*TwoFlavSolverQPhiX)(qphix_buffer, qphix_in, RsdTarget, niters, rsd_final, site_flops,
+                            mv_apps, 1, verbose, cb_odd, QPhiX::RELATIVE);
+      // for M^dagger^-1 M^-1 solution type, need to call BiCGstab twice
+      if (solver_params.solution_type == TM_SOLUTION_M_MDAG) {
+        (*TwoFlavSolverQPhiX)(qphix_out[0], qphix_buffer, RsdTarget, niters2, rsd_final, site_flops,
+                              mv_apps2, -1, verbose, cb_odd, QPhiX::RELATIVE);
+      } else {
+        QPhiX::copySpinor<FT, V, S, compress, nf>(qphix_out[0], qphix_buffer, geom, n_blas_simt);
+      }
+    } else if (solver_flag == CGMMSND) {
+      // TODO: handle the residuals properly
+      if (g_debug_level > 2) QPhiX::masterPrintf("# QPHIX CGMMSND: shifts: \n");
+      // tmLQCD weights the operator with 1/maxev in the RHMC relative to the shifts
+      // we will do this externally on the inverse (in monomial_solve) and thus need to weight
+      // the shifts by maxev^2
+      const double maxev_sq = (1.0 / phmc_invmaxev) * (1.0 / phmc_invmaxev);
+      for (int shift = 0; shift < num_shifts; shift++) {
+        RsdTargetArr[shift] = RsdTarget;
+        RsdFinalArr[shift] = -1.0;
+        shifts[shift] = maxev_sq * solver_params.shifts[shift] * solver_params.shifts[shift] /
+                        (4 * g_kappa * g_kappa);
+        if (g_debug_level > 2) QPhiX::masterPrintf("# [%d] = %lf\n", shift, shifts[shift]);
+      }
+      if (g_debug_level > 2) QPhiX::masterPrintf("\n");
+      (*TwoFlavMultiSolverQPhiX)(qphix_out.data(), qphix_in, num_shifts, shifts.data(),
+                                 RsdTargetArr.data(), niters, RsdFinalArr.data(), site_flops,
+                                 mv_apps, -1, verbose);
+      rsd_final = RsdFinalArr[0];
+    }
+    end_time = gettime();
+
+    for (int shift = 0; shift < num_shifts; shift++) {
+      for (int fl : {0, 1}) {
+        reorder_eo_spinor_from_QPhiX(geom, tmlqcd_odd_out[shift][fl], qphix_out[shift][fl], cb_odd,
+                                     rescale);
+      }
+    }
+
+    delete TwoFlavFermionMatrixQPhiX;
+    delete InnerTwoFlavFermionMatrixQPhiX;
+    delete InnerTwoFlavSolverQPhiX;
+    delete TwoFlavMultiSolverQPhiX;
+    delete TwoFlavSolverQPhiX;
+    for (int shift = 0; shift < num_shifts; shift++) {
+      delete[] qphix_out[shift];
+    }
+
+    if (qphix_clover) geom.free(qphix_clover);
+    if (qphix_invclov_odiag) geom.free(qphix_invclov_odiag);
+    if (qphix_clover_inner) geom_inner.free(qphix_clover_inner);
+    if (qphix_invclov_odiag_inner) geom_inner.free(qphix_invclov_odiag_inner);
+    for (int fl : {0, 1}) {
+      if (qphix_inv_fullclover[fl]) geom.free(qphix_inv_fullclover[fl]);
+      if (qphix_inv_fullclover_inner[fl]) geom_inner.free(qphix_inv_fullclover_inner[fl]);
+    }
+
+  } else {  // if(num_flavour)
+    // complain, this number of flavours is not valid
+  }  // if(num_flavour)
+
+  for (int cb : {0, 1}) {
+    if (u_packed[cb]) geom.free(u_packed[cb]);
+    if (u_packed_inner[cb]) geom_inner.free(u_packed_inner[cb]);
+  }
+
+  // FIXME: This should be called properly somewhere else
+  _endQphix();
+
+  QPhiX::masterPrintf("# ...done.\n\n");
+
+  uint64_t num_cb_sites = lattSize[0] / 2 * lattSize[1] * lattSize[2] * lattSize[3];
+  // FIXME: this needs to be adjusted depending on the operator used
+  uint64_t op_flops_per_site = 1320;
+  uint64_t total_flops =
+      (site_flops + site_flops2 + (2 * num_flavour * op_flops_per_site) * (mv_apps + mv_apps2)) *
+      num_cb_sites;
+  QPhiX::masterPrintf("# QPHIX: Solver Time = %g sec\n", (end_time - start_time));
+  QPhiX::masterPrintf("# QPHIX: Performance in GFLOPS = %g\n\n",
+                      1.0e-9 * total_flops / (end_time - start_time));
+
+  if (solver_is_mixed(solver_flag)) {
+    // the mixed solver reports the outer iterations, we would like to get
+    // some better total
+    niters = mv_apps / 2;
+    if (solver_flag == MIXEDBICGSTAB && solver_params.solution_type == TM_SOLUTION_M_MDAG) {
+      niters2 = mv_apps2 / 2;
+    }
+  }
+  // solver did not converge in maximum number of iterations
+  // FIXME: non-convergence does not work correctly yet
+  if ((niters + niters2) > max_iter) {
+    niters = -1;
+    niters2 = 0;
+  }
+  return (niters + niters2);
+}
+
+// Due to github issue #404, the helper functions to apply the full QPhiX operator
+// are currently disabled because they conflict with the new interfaces in QPhiX
+// itself. If required, these should be rewritten to use these interfaces
+// rather than the base classes in qphix_base_classes.hpp
+
+// Template wrapper for the Dslash operator, callable from C code
+// void Mfull_qphix(spinor *Even_out, spinor *Odd_out, const spinor *Even_in, const spinor *Odd_in,
+//                 const op_type_t op_type) {
+//  tmlqcd::checkQphixInputParameters(qphix_input);
+//  // FIXME: two-row gauge compression and double precision hard-coded
+//  _initQphix(0, nullptr, qphix_input, 12, QPHIX_DOUBLE_PREC);
+//
+//  if (qphix_precision == QPHIX_DOUBLE_PREC) {
+//    if (QPHIX_SOALEN > VECLEN_DP) {
+//      QPhiX::masterPrintf("SOALEN=%d is greater than the double prec VECLEN=%d\n", QPHIX_SOALEN,
+//                          VECLEN_DP);
+//      abort();
+//    }
+//    QPhiX::masterPrintf("TESTING IN DOUBLE PRECISION \n");
+//    if (compress12) {
+//      Mfull_helper<double, VECLEN_DP, QPHIX_SOALEN, true>(Even_out, Odd_out, Even_in, Odd_in,
+//                                                          op_type);
+//    } else {
+//      Mfull_helper<double, VECLEN_DP, QPHIX_SOALEN, false>(Even_out, Odd_out, Even_in, Odd_in,
+//                                                           op_type);
+//    }
+//  } else if (qphix_precision == QPHIX_FLOAT_PREC) {
+//    if (QPHIX_SOALEN > VECLEN_SP) {
+//      QPhiX::masterPrintf("SOALEN=%d is greater than the single prec VECLEN=%d\n", QPHIX_SOALEN,
+//                          VECLEN_SP);
+//      abort();
+//    }
+//    QPhiX::masterPrintf("TESTING IN SINGLE PRECISION \n");
+//    if (compress12) {
+//      Mfull_helper<float, VECLEN_SP, QPHIX_SOALEN, true>(Even_out, Odd_out, Even_in, Odd_in,
+//                                                         op_type);
+//    } else {
+//      Mfull_helper<float, VECLEN_SP, QPHIX_SOALEN, false>(Even_out, Odd_out, Even_in, Odd_in,
+//                                                          op_type);
+//    }
+//  }
+// #if (defined(QPHIX_MIC_SOURCE) || defined(QPHIX_AVX512_SOURCE))
+//  else if (qphix_precision == QPHIX_HALF_PREC) {
+//    if (QPHIX_SOALEN > VECLEN_HP) {
+//      QPhiX::masterPrintf("SOALEN=%d is greater than the half prec VECLEN=%d\n", QPHIX_SOALEN,
+//                          VECLEN_HP);
+//      abort();
+//    }
+//    QPhiX::masterPrintf("TESTING IN HALF PRECISION \n");
+//    if (compress12) {
+//      Mfull_helper<QPhiX::half, VECLEN_HP, QPHIX_SOALEN, true>(Even_out, Odd_out, Even_in, Odd_in,
+//                                                               op_type);
+//    } else {
+//      Mfull_helper<QPhiX::half, VECLEN_HP, QPHIX_SOALEN, false>(Even_out, Odd_out, Even_in,
+//      Odd_in,
+//                                                                op_type);
+//    }
+//  }
+// #endif
+//}
+
+// we have a unified interface for n-flavour inversions, but we need to provide wrappers
+// which can be called by the tmLQCD solver drivers for one and two-flavour inversions
+int invert_eo_qphix_oneflavour(spinor *Odd_out_1f, spinor *Odd_in_1f, const int max_iter,
+                               const double precision, const int solver_flag, const int rel_prec,
+                               const solver_params_t solver_params, const SloppyPrecision sloppy,
+                               const CompressionType compression) {
+  // Single-flavour, single-shift solve: wrap the bare spinor pointers in the nested
+  // [shift][flavour] (output) and [rhs][flavour] (input) containers of the unified interface.
+  const int num_flavour = 1;
+  const int num_shifts = 1;
+  typedef std::vector<spinor *> flavour_fields_t;
+  std::vector<flavour_fields_t> Odd_out(num_shifts, flavour_fields_t(num_flavour));
+  std::vector<flavour_fields_t> Odd_in(1, flavour_fields_t(num_flavour));
+
+  Odd_out.front().front() = Odd_out_1f;
+  Odd_in.front().front() = Odd_in_1f;
+
+  const int iterations = invert_eo_qphix_nflavour_mshift(
+      Odd_out, Odd_in, precision, max_iter, solver_flag, rel_prec, solver_params, sloppy,
+      compression, num_flavour);
+  return iterations;
+}
+
+int invert_eo_qphix_oneflavour_mshift(spinor **Odd_out_1f, spinor *Odd_in_1f, const int max_iter,
+                                      const double precision, const int solver_flag,
+                                      const int rel_prec, const solver_params_t solver_params,
+                                      const SloppyPrecision sloppy,
+                                      const CompressionType compression) {
+  // A shift count of zero is promoted to one, so that a single system is always solved.
+  const int num_flavour = 1;
+  const int num_shifts = (solver_params.no_shifts != 0) ? solver_params.no_shifts : 1;
+  typedef std::vector<spinor *> flavour_fields_t;
+
+  // one right-hand side shared by all shifts, one solution field per shift
+  std::vector<flavour_fields_t> Odd_in(1, flavour_fields_t(num_flavour));
+  Odd_in[0][0] = Odd_in_1f;
+
+  std::vector<flavour_fields_t> Odd_out(num_shifts);
+  int shift = 0;
+  for (flavour_fields_t &solution : Odd_out) {
+    solution.assign(num_flavour, Odd_out_1f[shift]);
+    ++shift;
+  }
+
+  return invert_eo_qphix_nflavour_mshift(Odd_out, Odd_in, precision, max_iter, solver_flag,
+                                         rel_prec, solver_params, sloppy, compression, num_flavour);
+}
+
+// Two-flavour (s, c), single-shift wrapper around the n-flavour solver; returns its result
+int invert_eo_qphix_twoflavour(spinor *Odd_out_s, spinor *Odd_out_c, spinor *Odd_in_s,
+                               spinor *Odd_in_c, const int max_iter, const double precision,
+                               const int solver_flag, const int rel_prec,
+                               const solver_params_t solver_params, const SloppyPrecision sloppy,
+                               const CompressionType compression) {
+  const int num_flavour = 2;
+  const int num_shifts = 1;
+  typedef std::vector<spinor *> flavour_fields_t;
+
+  // flavour index 0 carries the "_s" field, flavour index 1 the "_c" field
+  std::vector<flavour_fields_t> Odd_in(1, flavour_fields_t(num_flavour));
+  Odd_in[0][0] = Odd_in_s;
+  Odd_in[0][1] = Odd_in_c;
+
+  // a single shift, hence a single pair of solution fields
+  std::vector<flavour_fields_t> Odd_out(num_shifts, flavour_fields_t(num_flavour));
+  Odd_out[0][0] = Odd_out_s;
+  Odd_out[0][1] = Odd_out_c;
+
+  const int iterations = invert_eo_qphix_nflavour_mshift(
+      Odd_out, Odd_in, precision, max_iter, solver_flag, rel_prec, solver_params, sloppy,
+      compression, num_flavour);
+  return iterations;
+}
+
+int invert_eo_qphix_twoflavour_mshift(spinor **Odd_out_s, spinor **Odd_out_c, spinor *Odd_in_s,
+                                      spinor *Odd_in_c, const int max_iter, const double precision,
+                                      const int solver_flag, const int rel_prec,
+                                      const solver_params_t solver_params,
+                                      const SloppyPrecision sloppy,
+                                      const CompressionType compression) {
+  // Two-flavour multi-shift wrapper. A shift count of zero is promoted to one, so that a
+  // single system is always solved.
+  const int num_flavour = 2;
+  const int num_shifts = (solver_params.no_shifts != 0) ? solver_params.no_shifts : 1;
+  typedef std::vector<spinor *> flavour_fields_t;
+
+  // one two-flavour right-hand side shared by all shifts
+  std::vector<flavour_fields_t> Odd_in(1, flavour_fields_t(num_flavour));
+  Odd_in[0][0] = Odd_in_s;
+  Odd_in[0][1] = Odd_in_c;
+
+  // one (s, c) pair of solution fields per shift
+  std::vector<flavour_fields_t> Odd_out(num_shifts, flavour_fields_t(num_flavour));
+  for (int shift = 0; shift < num_shifts; ++shift) {
+    flavour_fields_t &solution = Odd_out[shift];
+    solution[0] = Odd_out_s[shift];
+    solution[1] = Odd_out_c[shift];
+  }
+
+  return invert_eo_qphix_nflavour_mshift(Odd_out, Odd_in, precision, max_iter, solver_flag,
+                                         rel_prec, solver_params, sloppy, compression, num_flavour);
+}
+
+// Selects the QPhiX floating-point type(s) and gauge compression, then runs the templated helper
+// and returns its result. Odd_in is nested for future multi-rhs use; only Odd_in[0] is read here.
+int invert_eo_qphix_nflavour_mshift(std::vector<std::vector<spinor *> > &Odd_out,
+                                    std::vector<std::vector<spinor *> > &Odd_in,
+                                    const double precision, const int max_iter,
+                                    const int solver_flag, const int rel_prec,
+                                    solver_params_t solver_params, const SloppyPrecision sloppy,
+                                    const CompressionType compression, const int num_flavour) {
+  tmlqcd::checkQphixInputParameters(qphix_input);
+  double target_precision = precision;
+  double src_norm = 0.0;  // accumulates square_norm over all flavours of the source
+  for (int f = 0; f < num_flavour; ++f) {
+    src_norm += square_norm(Odd_in[0][f], VOLUME / 2, 1);
+  }
+  // "precision_lambda" is the target relative to the source norm; it decides whether a
+  // fixed-precision (non-mixed) solver may run in half or single instead of double precision
+  double precision_lambda = target_precision / src_norm;
+  if (rel_prec == 1) {
+    QPhiX::masterPrintf("# QPHIX: Using relative precision\n");
+    target_precision = precision * src_norm;  // scale the relative target by the source norm
+    precision_lambda = precision;
+  }
+  QPhiX::masterPrintf("# QPHIX: precision_lambda: %g, target_precision: %g\n\n", precision_lambda,
+                      target_precision);
+
+  // mixed solvers require inner and outer precisions, which we specify explicitly here
+  if (solver_is_mixed(solver_flag)) {
+#if (defined(QPHIX_MIC_SOURCE) || defined(QPHIX_AVX512_SOURCE))
+    if (sloppy == SLOPPY_HALF) {
+      if (QPHIX_SOALEN > VECLEN_DP || QPHIX_SOALEN > VECLEN_HP) {
+        QPhiX::masterPrintf(
+            "SOALEN=%d is greater than the half prec VECLEN=%d or the double prec VECLEN=%d\n",
+            QPHIX_SOALEN, VECLEN_HP, VECLEN_DP);
+        abort();
+      }
+      QPhiX::masterPrintf("# INITIALIZING QPHIX MIXED SOLVER\n");
+      QPhiX::masterPrintf("# USING DOUBLE-HALF PRECISION\n");
+      _initQphix(0, nullptr, qphix_input, compression, QPHIX_DOUBLE_PREC, QPHIX_HALF_PREC);
+      if (compress12) {
+        return invert_eo_qphix_helper<double, VECLEN_DP, QPHIX_SOALEN, true, QPhiX::half, VECLEN_HP,
+                                      QPHIX_SOALEN, true>(
+            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
+      } else {
+        return invert_eo_qphix_helper<double, VECLEN_DP, QPHIX_SOALEN, false, QPhiX::half,
+                                      VECLEN_HP, QPHIX_SOALEN, false>(
+            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
+      }
+    } else  // continued by the SLOPPY_SINGLE test after the #endif
+#else
+    if (sloppy == SLOPPY_HALF) {
+      QPhiX::masterPrintf("QPHIX interface: half precision not supported on this architecture!\n");
+      abort();
+    } else  // continued by the SLOPPY_SINGLE test after the #endif
+#endif
+        if (sloppy == SLOPPY_SINGLE) {
+      if (QPHIX_SOALEN > VECLEN_DP || QPHIX_SOALEN > VECLEN_SP) {
+        QPhiX::masterPrintf(
+            "SOALEN=%d is greater than the single prec VECLEN=%d or the double prec VECLEN=%d\n",
+            QPHIX_SOALEN, VECLEN_SP, VECLEN_DP);
+        abort();
+      }
+      QPhiX::masterPrintf("# INITIALIZING QPHIX MIXED SOLVER\n");
+      QPhiX::masterPrintf("# USING DOUBLE-SINGLE PRECISION\n");
+      _initQphix(0, nullptr, qphix_input, compression, QPHIX_DOUBLE_PREC, QPHIX_FLOAT_PREC);
+      if (compress12) {
+        return invert_eo_qphix_helper<double, VECLEN_DP, QPHIX_SOALEN, true, float, VECLEN_SP,
+                                      QPHIX_SOALEN, true>(
+            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
+      } else {
+        return invert_eo_qphix_helper<double, VECLEN_DP, QPHIX_SOALEN, false, float, VECLEN_SP,
+                                      QPHIX_SOALEN, false>(
+            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
+      }
+    } else {  // neither SLOPPY_HALF nor SLOPPY_SINGLE: double-double mixed solver
+      if (QPHIX_SOALEN > VECLEN_DP) {
+        QPhiX::masterPrintf("SOALEN=%d is greater than the double prec VECLEN=%d\n", QPHIX_SOALEN,
+                            VECLEN_DP);
+        abort();
+      }
+      QPhiX::masterPrintf("# INITIALIZING QPHIX MIXED SOLVER\n");
+      QPhiX::masterPrintf("# USING DOUBLE-DOUBLE PRECISION\n");
+      _initQphix(0, nullptr, qphix_input, compression, QPHIX_DOUBLE_PREC, QPHIX_DOUBLE_PREC);
+      if (compress12) {
+        return invert_eo_qphix_helper<double, VECLEN_DP, QPHIX_SOALEN, true>(
+            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
+      } else {
+        return invert_eo_qphix_helper<double, VECLEN_DP, QPHIX_SOALEN, false>(
+            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
+      }
+    }  // if( sloppy )
+  } else {  // if( solver_is_mixed ): fixed-precision solver, precision chosen below
+#if (defined(QPHIX_MIC_SOURCE) || defined(QPHIX_AVX512_SOURCE))
+    if (sloppy == SLOPPY_HALF || precision_lambda >= rsdTarget<QPhiX::half>::value) {
+      if (QPHIX_SOALEN > VECLEN_HP) {
+        QPhiX::masterPrintf("SOALEN=%d is greater than the half prec VECLEN=%d\n", QPHIX_SOALEN,
+                            VECLEN_HP);
+        abort();
+      }
+      QPhiX::masterPrintf("# INITIALIZING QPHIX SOLVER\n");
+      QPhiX::masterPrintf("# USING HALF PRECISION\n");
+      _initQphix(0, nullptr, qphix_input, compression, QPHIX_HALF_PREC);
+
+      if (compress12) {
+        return invert_eo_qphix_helper<QPhiX::half, VECLEN_HP, QPHIX_SOALEN, true>(
+            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
+      } else {
+        return invert_eo_qphix_helper<QPhiX::half, VECLEN_HP, QPHIX_SOALEN, false>(
+            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
+      }
+    } else  // continued by the single-precision test after the #endif
+#else
+    if (sloppy == SLOPPY_HALF) {
+      QPhiX::masterPrintf("QPHIX interface: half precision not supported on this architecture!\n");
+      abort();
+    } else  // continued by the single-precision test after the #endif
+#endif
+        if (sloppy == SLOPPY_SINGLE || precision_lambda >= rsdTarget<float>::value) {
+      if (QPHIX_SOALEN > VECLEN_SP) {
+        QPhiX::masterPrintf("SOALEN=%d is greater than the single prec VECLEN=%d\n", QPHIX_SOALEN,
+                            VECLEN_SP);
+        abort();
+      }
+      QPhiX::masterPrintf("# INITIALIZING QPHIX SOLVER\n");
+      QPhiX::masterPrintf("# USING SINGLE PRECISION\n");
+      _initQphix(0, nullptr, qphix_input, compression, QPHIX_FLOAT_PREC);
+
+      if (compress12) {
+        return invert_eo_qphix_helper<float, VECLEN_SP, QPHIX_SOALEN, true>(
+            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
+      } else {
+        return invert_eo_qphix_helper<float, VECLEN_SP, QPHIX_SOALEN, false>(
+            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
+      }
+    } else {
+      if (QPHIX_SOALEN > VECLEN_DP) {
+        QPhiX::masterPrintf("SOALEN=%d is greater than the double prec VECLEN=%d\n", QPHIX_SOALEN,
+                            VECLEN_DP);
+        abort();
+      }
+      QPhiX::masterPrintf("# INITIALIZING QPHIX SOLVER\n");
+      QPhiX::masterPrintf("# USING DOUBLE PRECISION\n");
+      _initQphix(0, nullptr, qphix_input, compression, QPHIX_DOUBLE_PREC);
+
+      if (compress12) {
+        return invert_eo_qphix_helper<double, VECLEN_DP, QPHIX_SOALEN, true>(
+            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
+      } else {
+        return invert_eo_qphix_helper<double, VECLEN_DP, QPHIX_SOALEN, false>(
+            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
+      }
+    }  // if( sloppy || precision_lambda )
+  }  // if( solver_is_mixed )
+  return -1;  // not reached: every branch above either returns or aborts
+}
+
+void tmlqcd::checkQphixInputParameters(const tm_QPhiXParams_t &params) {
+  // Sanity-check the QPhiX blocking / threading parameters; the first violation found aborts.
+  const auto threads_requested = params.NCores * params.Sy * params.Sz;
+  if (params.MinCt == 0) {
+    QPhiX::masterPrintf("QPHIX Error: MinCt cannot be 0! Minimal value: 1. Aborting.\n");
+    abort();
+  } else if (params.By == 0 || params.Bz == 0) {
+    QPhiX::masterPrintf("QPHIX Error: By and Bz may not be 0! Minimal value: 1. Aborting.\n");
+    abort();
+  } else if (threads_requested != omp_num_threads) {
+    QPhiX::masterPrintf("QPHIX Error: NCores * Sy * Sz != ompnumthreads ! Aborting.\n");
+    abort();
+  }
+}
+
+void tmlqcd::printQphixDiagnostics(int VECLEN, int SOALEN, bool compress, int VECLEN_inner,
+                                   int SOALEN_inner, bool compress_inner) {
+  QPhiX::masterPrintf("# QphiX: VECLEN=%d SOALEN=%d VECLEN_inner=%d, SOALEN_inner=%d\n", VECLEN,
+                      SOALEN, VECLEN_inner, SOALEN_inner);
+
+  // Per-direction tables, each printed in (xyzt) order on a single output line.
+  QPhiX::masterPrintf("# QphiX: Declared QMP Topology (xyzt):");
+  for (int dir = 0; dir < 4; ++dir) {
+    QPhiX::masterPrintf(" %d", qmp_geom[dir]);
+  }
+  QPhiX::masterPrintf("\n");
+
+  QPhiX::masterPrintf("# QphiX: Mapping of dimensions QMP -> tmLQCD (xyzt):");
+  for (int dir = 0; dir < 4; ++dir) {
+    QPhiX::masterPrintf(" %d->%d", dir, qmp_tm_map[dir]);
+  }
+  QPhiX::masterPrintf("\n");
+
+  QPhiX::masterPrintf("# QphiX: Global Lattice Size (xyzt) = ");
+  for (int dir = 0; dir < 4; ++dir) QPhiX::masterPrintf(" %d", lattSize[dir]);
+  QPhiX::masterPrintf("\n");
+  QPhiX::masterPrintf("# QphiX: Local Lattice Size (xyzt) = ");
+  for (int dir = 0; dir < 4; ++dir) QPhiX::masterPrintf(" %d", subLattSize[dir]);
+  QPhiX::masterPrintf("\n");
+
+  // Blocking / threading values below are non-local state, not arguments of this function.
+  QPhiX::masterPrintf("# QphiX: Block Sizes: By= %d Bz=%d\n", By, Bz);
+  QPhiX::masterPrintf("# QphiX: Cores = %d\n", NCores);
+  QPhiX::masterPrintf("# QphiX: SMT Grid: Sy=%d Sz=%d\n", Sy, Sz);
+  QPhiX::masterPrintf("# QphiX: Pad Factors: PadXY=%d PadXYZ=%d\n", PadXY, PadXYZ);
+  QPhiX::masterPrintf("# QphiX: Threads_per_core = %d\n", N_simt);
+  QPhiX::masterPrintf("# QphiX: MinCt = %d\n", MinCt);
+  if (compress) QPhiX::masterPrintf("# QphiX: Using two-row gauge compression (compress12)\n");
+  if (compress_inner)
+    QPhiX::masterPrintf("# QphiX: Inner solver using two-row gauge compression (compress12)\n");
+}
+
+void testSpinorPackers(spinor *Even_out, spinor *Odd_out, const spinor *const Even_in,
+                       const spinor *const Odd_in) {
+  tmlqcd::checkQphixInputParameters(qphix_input);
+  // FIXME: two-row gauge compression and double precision hard-coded
+  _initQphix(0, nullptr, qphix_input, 12, QPHIX_DOUBLE_PREC);
+  // a Geometry over double must be built with the double-precision vector length VECLEN_DP
+  QPhiX::Geometry<double, VECLEN_DP, QPHIX_SOALEN, true> geom(subLattSize, By, Bz, NCores, Sy, Sz,
+                                                              PadXY, PadXYZ, MinCt);
+
+  auto qphix_cb_even = QPhiX::makeFourSpinorHandle(geom);
+  auto qphix_cb_odd = QPhiX::makeFourSpinorHandle(geom);
+
+  spinor **tmp;  // two scratch fields receiving the per-checkerboard diff() results
+  init_solver_field(&tmp, VOLUME / 2, 2);
+
+  // Self-test of the spinor packers: both checkerboards are packed into the QPhiX layout and
+  // unpacked again with a rescale factor of 1.0 into Even_out / Odd_out (which are overwritten).
+  // The summed square_norm of the two diff() results is then printed as the round-trip error.
+  // NOTE(review): diff() presumably stores output - input in tmp[i] -- confirm.
+  reorder_eo_spinor_to_QPhiX(geom, Even_in, qphix_cb_even.get(), cb_even);
+  reorder_eo_spinor_to_QPhiX(geom, Odd_in, qphix_cb_odd.get(), cb_odd);
+
+  reorder_eo_spinor_from_QPhiX(geom, Even_out, qphix_cb_even.get(), cb_even, 1.0);
+  reorder_eo_spinor_from_QPhiX(geom, Odd_out, qphix_cb_odd.get(), cb_odd, 1.0);
+
+  diff(tmp[0], Even_out, Even_in, VOLUME / 2);
+  diff(tmp[1], Odd_out, Odd_in, VOLUME / 2);
+  double l2norm = square_norm(tmp[0], VOLUME / 2, 1) + square_norm(tmp[1], VOLUME / 2, 1);
+  QPhiX::masterPrintf("QPHIX eo spinor packer back and forth difference L2 norm: %lf\n", l2norm);
+  finalize_solver(tmp, 2);
+}
diff --git a/src/lib/qphix/qphix_interface.hpp b/src/lib/qphix/qphix_interface.hpp
new file mode 100644
index 000000000..b487eda66
--- /dev/null
+++ b/src/lib/qphix/qphix_interface.hpp
@@ -0,0 +1,51 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2017 Bartosz Kostrzewa
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ***********************************************************************/
+
+#pragma once
+
+#include "global.h"
+#include "qphix_types.h"
+
+#ifdef __cplusplus /* If this is a C++ compiler, use C linkage */
+extern "C" {
+#endif
+
+#include "misc_types.h"
+#include "operator_types.h"
+#include "solver/matrix_mult_typedef.h"
+#include "solver/solver_params.h"
+#include "su3.h"
+
+#ifdef __cplusplus
+}
+#endif
+
+#include <vector>
+
+// Declaration only: QPhiX inverter entry point working on nested vectors of spinor pointers.
+// NOTE(review): the meaning of the return value and of the two vector index levels is not
+// visible in this header -- confirm against the definition before relying on either.
+int invert_eo_qphix_nflavour_mshift(std::vector<std::vector<spinor *> > &Odd_out,
+                                    std::vector<std::vector<spinor *> > &Odd_in,
+                                    const double precision, const int max_iter,
+                                    const int solver_flag, const int rel_prec,
+                                    solver_params_t solver_params, const SloppyPrecision sloppy,
+                                    const CompressionType compression, const int num_flavour);
\ No newline at end of file
diff --git a/src/lib/qphix/qphix_interface_utils.hpp b/src/lib/qphix/qphix_interface_utils.hpp
new file mode 100644
index 000000000..56d8afe56
--- /dev/null
+++ b/src/lib/qphix/qphix_interface_utils.hpp
@@ -0,0 +1,33 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2015 Mario Schroeck
+ *               2016 Peter Labus
+ *               2017 Peter Labus, Martin Ueding, Bartosz Kostrzewa
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ***********************************************************************/
+
+#pragma once
+
+#include "qphix_types.h"
+
+namespace tmlqcd {
+// QPhiX interface helpers (by name): an input-parameter check and a diagnostics printout.
+void checkQphixInputParameters(const tm_QPhiXParams_t &params);
+void printQphixDiagnostics(int VECLEN, int SOALEN, bool compress, int VECLEN_inner,
+                           int SOALEN_inner, bool compress_inner);
+}  // namespace tmlqcd
diff --git a/qphix_interface.h b/src/lib/qphix_interface.h
similarity index 100%
rename from qphix_interface.h
rename to src/lib/qphix_interface.h
diff --git a/qphix_types.h b/src/lib/qphix_types.h
similarity index 100%
rename from qphix_types.h
rename to src/lib/qphix_types.h
diff --git a/qphix_veclen.h b/src/lib/qphix_veclen.h
similarity index 100%
rename from qphix_veclen.h
rename to src/lib/qphix_veclen.h
diff --git a/quda_dummy_types.h b/src/lib/quda_dummy_types.h
similarity index 100%
rename from quda_dummy_types.h
rename to src/lib/quda_dummy_types.h
diff --git a/src/lib/quda_gauge_paths.inc b/src/lib/quda_gauge_paths.inc
new file mode 100644
index 000000000..d2c898e6c
--- /dev/null
+++ b/src/lib/quda_gauge_paths.inc
@@ -0,0 +1,158 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2021 Bartosz Kostrzewa, Ferenc Pittler, Simone Bacchio
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ ***********************************************************************/
+
+// Number of initialised entries in row i of every plaq_rect_path[mu] table:
+// rows 0-5 list three values, rows 6-23 list five.
+const int plaq_rect_length[24] = {
+    3, 3, 3, 3, 3, 3, 5, 5, 5, 5, 5, 5,
+    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+};
+
+const int plaq_rect_path[4][24][5] = {  // [table 0-3][row 0-23][entry]; 3-entry rows zero-filled
+    { {1, 7, 6 },  // table 0, rows 0-5: three-entry rows of the form {d, 7, 7-d}
+      {6, 7, 1 },
+      {2, 7, 5 },
+      {5, 7, 2 },
+      {3, 7, 4 },
+      {4, 7, 3 }, 
+      {1, 1, 7, 6, 6 },  // rows 6-23: five-entry rows
+      {6, 6, 7, 1, 1 },
+      {2, 2, 7, 5, 5 },
+      {5, 5, 7, 2, 2 },
+      {3, 3, 7, 4, 4 },
+      {4, 4, 7, 3, 3 },
+      {0, 1, 7, 7, 6 },
+      {6, 7, 7, 1, 0 },
+      {0, 2, 7, 7, 5 },
+      {5, 7, 7, 2, 0 },
+      {0, 3, 7, 7, 4 },
+      {4, 7, 7, 3, 0 },
+      {0, 4, 7, 7, 3 },
+      {3, 7, 7, 4, 0 },
+      {0, 5, 7, 7, 2 },
+      {2, 7, 7, 5, 0 },
+      {0, 6, 7, 7, 1 },
+      {1, 7, 7, 6, 0 } },
+    { { 2, 6, 5 },  // table 1, rows 0-5: three-entry rows of the form {d, 6, 7-d}
+      { 5, 6, 2 },
+      { 3, 6, 4 },
+      { 4, 6, 3 },
+      { 0, 6, 7 },
+      { 7, 6, 0 },
+      { 1, 2, 6, 6, 5 },  // rows 6-23: five-entry rows
+      { 2, 6, 6, 5, 1 },
+      { 5, 6, 6, 2, 1 },
+      { 1, 5, 6, 6, 2 },
+      { 1, 3, 6, 6, 4 },
+      { 3, 6, 6, 4, 1 },
+      { 4, 6, 6, 3, 1 },
+      { 1, 4, 6, 6, 3 },
+      { 1, 0, 6, 6, 7 },
+      { 0, 6, 6, 7, 1 },
+      { 7, 6, 6, 0, 1 },
+      { 1, 7, 6, 6, 0 },
+      { 5, 5, 6, 2, 2 },
+      { 2, 2, 6, 5, 5 },
+      { 4, 4, 6, 3, 3 },
+      { 3, 3, 6, 4, 4 },
+      { 7, 7, 6, 0, 0 },
+      { 0, 0, 6, 7, 7 } },
+    { {3, 5, 4},  // table 2, rows 0-5: three-entry rows of the form {d, 5, 7-d}
+      {4, 5, 3},
+      {0, 5, 7},
+      {7, 5, 0},
+      {1, 5, 6},
+      {6, 5, 1},
+      {2, 3, 5, 5, 4},  // rows 6-23: five-entry rows
+      {3, 5, 5, 4, 2}, 
+      {4, 5, 5, 3, 2}, 
+      {2, 4, 5, 5, 3}, 
+      {2, 0, 5, 5, 7}, 
+      {0, 5, 5, 7, 2}, 
+      {7, 5, 5, 0, 2}, 
+      {2, 7, 5, 5, 0},
+      {2, 1, 5, 5, 6}, 
+      {1, 5, 5, 6, 2}, 
+      {6, 5, 5, 1, 2}, 
+      {2, 6, 5, 5, 1}, 
+      {4, 4, 5, 3, 3}, 
+      {3, 3, 5, 4, 4}, 
+      {7, 7, 5, 0, 0},
+      {0, 0, 5, 7, 7}, 
+      {6, 6, 5, 1, 1}, 
+      {1, 1, 5, 6, 6} }, 
+    { { 0, 4, 7 },  // table 3, rows 0-5: three-entry rows of the form {d, 4, 7-d}
+      { 7, 4, 0 },
+      { 1, 4, 6 },
+      { 6, 4, 1 },
+      { 2, 4, 5 },
+      { 5, 4, 2 },
+      { 3, 0, 4, 4, 7 },  // rows 6-23: five-entry rows
+      { 0, 4, 4, 7, 3 },
+      { 7, 4, 4, 0, 3 },
+      { 3, 7, 4, 4, 0 },
+      { 3, 1, 4, 4, 6 },
+      { 1, 4, 4, 6, 3 },
+      { 6, 4, 4, 1, 3 },
+      { 3, 6, 4, 4, 1 },
+      { 3, 2, 4, 4, 5 },
+      { 2, 4, 4, 5, 3 },
+      { 5, 4, 4, 2, 3 },
+      { 3, 5, 4, 4, 2 },
+      { 7, 7, 4, 0, 0 },
+      { 0, 0, 4, 7, 7 },
+      { 6, 6, 4, 1, 1 },
+      { 1, 1, 4, 6, 6 },
+      { 5, 5, 4, 2, 2 },
+      { 2, 2, 4, 5, 5 } } 
+  };  // NOTE(review): d and 7-d presumably encode opposite hops along one axis (QUDA) -- confirm
+
+// Every plaq_path[mu][i] row lists three entries.
+const int plaq_length[] = {3, 3, 3, 3, 3, 3};
+
+const int plaq_path[4][6][3] = {  // [table 0-3][row 0-5][entry]
+    { { 1, 7, 6 },  // table 0: same values as rows 0-5 of plaq_rect_path[0]
+      { 6, 7, 1 },
+      { 2, 7, 5 },
+      { 5, 7, 2 },
+      { 3, 7, 4 },
+      { 4, 7, 3 } },
+    { { 2, 6, 5 },  // table 1: same values as rows 0-5 of plaq_rect_path[1]
+      { 5, 6, 2 },
+      { 3, 6, 4 },
+      { 4, 6, 3 },
+      { 0, 6, 7 },
+      { 7, 6, 0 } },
+    { { 3, 5, 4},  // table 2: same values as rows 0-5 of plaq_rect_path[2]
+      { 4, 5, 3},
+      { 0, 5, 7},
+      { 7, 5, 0},
+      { 1, 5, 6},
+      { 6, 5, 1} },
+    { { 0, 4, 7 },  // table 3: same values as rows 0-5 of plaq_rect_path[3]
+      { 7, 4, 0 },
+      { 1, 4, 6 },
+      { 6, 4, 1 },
+      { 2, 4, 5 },
+      { 5, 4, 2 } } 
+  };  // each row has the form {d, 7-table, 7-d}
+
diff --git a/quda_interface.c b/src/lib/quda_interface.c
similarity index 100%
rename from quda_interface.c
rename to src/lib/quda_interface.c
diff --git a/quda_interface.h b/src/lib/quda_interface.h
similarity index 100%
rename from quda_interface.h
rename to src/lib/quda_interface.h
diff --git a/quda_types.h b/src/lib/quda_types.h
similarity index 100%
rename from quda_types.h
rename to src/lib/quda_types.h
diff --git a/ranlxd.c b/src/lib/ranlxd.c
similarity index 100%
rename from ranlxd.c
rename to src/lib/ranlxd.c
diff --git a/ranlxd.h b/src/lib/ranlxd.h
similarity index 100%
rename from ranlxd.h
rename to src/lib/ranlxd.h
diff --git a/ranlxs.c b/src/lib/ranlxs.c
similarity index 100%
rename from ranlxs.c
rename to src/lib/ranlxs.c
diff --git a/ranlxs.h b/src/lib/ranlxs.h
similarity index 100%
rename from ranlxs.h
rename to src/lib/ranlxs.h
diff --git a/rational/Makefile.in b/src/lib/rational/Makefile.in
similarity index 100%
rename from rational/Makefile.in
rename to src/lib/rational/Makefile.in
diff --git a/rational/elliptic.c b/src/lib/rational/elliptic.c
similarity index 100%
rename from rational/elliptic.c
rename to src/lib/rational/elliptic.c
diff --git a/rational/elliptic.h b/src/lib/rational/elliptic.h
similarity index 100%
rename from rational/elliptic.h
rename to src/lib/rational/elliptic.h
diff --git a/rational/rational.c b/src/lib/rational/rational.c
similarity index 100%
rename from rational/rational.c
rename to src/lib/rational/rational.c
diff --git a/rational/rational.h b/src/lib/rational/rational.h
similarity index 100%
rename from rational/rational.h
rename to src/lib/rational/rational.h
diff --git a/rational/zolotarev.c b/src/lib/rational/zolotarev.c
similarity index 100%
rename from rational/zolotarev.c
rename to src/lib/rational/zolotarev.c
diff --git a/rational/zolotarev.h b/src/lib/rational/zolotarev.h
similarity index 100%
rename from rational/zolotarev.h
rename to src/lib/rational/zolotarev.h
diff --git a/read_input.h b/src/lib/read_input.h
similarity index 100%
rename from read_input.h
rename to src/lib/read_input.h
diff --git a/read_input.l b/src/lib/read_input.l
similarity index 100%
rename from read_input.l
rename to src/lib/read_input.l
diff --git a/reweighting_factor.c b/src/lib/reweighting_factor.c
similarity index 100%
rename from reweighting_factor.c
rename to src/lib/reweighting_factor.c
diff --git a/reweighting_factor.h b/src/lib/reweighting_factor.h
similarity index 100%
rename from reweighting_factor.h
rename to src/lib/reweighting_factor.h
diff --git a/reweighting_factor_nd.c b/src/lib/reweighting_factor_nd.c
similarity index 100%
rename from reweighting_factor_nd.c
rename to src/lib/reweighting_factor_nd.c
diff --git a/reweighting_factor_nd.h b/src/lib/reweighting_factor_nd.h
similarity index 100%
rename from reweighting_factor_nd.h
rename to src/lib/reweighting_factor_nd.h
diff --git a/rnd_gauge_trafo.c b/src/lib/rnd_gauge_trafo.c
similarity index 100%
rename from rnd_gauge_trafo.c
rename to src/lib/rnd_gauge_trafo.c
diff --git a/rnd_gauge_trafo.h b/src/lib/rnd_gauge_trafo.h
similarity index 100%
rename from rnd_gauge_trafo.h
rename to src/lib/rnd_gauge_trafo.h
diff --git a/sighandler.c b/src/lib/sighandler.c
similarity index 100%
rename from sighandler.c
rename to src/lib/sighandler.c
diff --git a/sighandler.h b/src/lib/sighandler.h
similarity index 100%
rename from sighandler.h
rename to src/lib/sighandler.h
diff --git a/smearing/Makefile.in b/src/lib/smearing/Makefile.in
similarity index 100%
rename from smearing/Makefile.in
rename to src/lib/smearing/Makefile.in
diff --git a/smearing/ape.h b/src/lib/smearing/ape.h
similarity index 100%
rename from smearing/ape.h
rename to src/lib/smearing/ape.h
diff --git a/smearing/ape.ih b/src/lib/smearing/ape.ih
similarity index 100%
rename from smearing/ape.ih
rename to src/lib/smearing/ape.ih
diff --git a/smearing/ape_ape_smear.c b/src/lib/smearing/ape_ape_smear.c
similarity index 100%
rename from smearing/ape_ape_smear.c
rename to src/lib/smearing/ape_ape_smear.c
diff --git a/smearing/hex.h b/src/lib/smearing/hex.h
similarity index 100%
rename from smearing/hex.h
rename to src/lib/smearing/hex.h
diff --git a/smearing/hex.ih b/src/lib/smearing/hex.ih
similarity index 100%
rename from smearing/hex.ih
rename to src/lib/smearing/hex.ih
diff --git a/smearing/hex_hex_smear.c b/src/lib/smearing/hex_hex_smear.c
similarity index 100%
rename from smearing/hex_hex_smear.c
rename to src/lib/smearing/hex_hex_smear.c
diff --git a/smearing/hex_stout_exclude_none.c b/src/lib/smearing/hex_stout_exclude_none.c
similarity index 100%
rename from smearing/hex_stout_exclude_none.c
rename to src/lib/smearing/hex_stout_exclude_none.c
diff --git a/smearing/hex_stout_exclude_one.c b/src/lib/smearing/hex_stout_exclude_one.c
similarity index 100%
rename from smearing/hex_stout_exclude_one.c
rename to src/lib/smearing/hex_stout_exclude_one.c
diff --git a/smearing/hex_stout_exclude_two.c b/src/lib/smearing/hex_stout_exclude_two.c
similarity index 100%
rename from smearing/hex_stout_exclude_two.c
rename to src/lib/smearing/hex_stout_exclude_two.c
diff --git a/smearing/hyp.h b/src/lib/smearing/hyp.h
similarity index 100%
rename from smearing/hyp.h
rename to src/lib/smearing/hyp.h
diff --git a/smearing/hyp.ih b/src/lib/smearing/hyp.ih
similarity index 100%
rename from smearing/hyp.ih
rename to src/lib/smearing/hyp.ih
diff --git a/smearing/hyp_APE_project_exclude_none.c b/src/lib/smearing/hyp_APE_project_exclude_none.c
similarity index 100%
rename from smearing/hyp_APE_project_exclude_none.c
rename to src/lib/smearing/hyp_APE_project_exclude_none.c
diff --git a/smearing/hyp_APE_project_exclude_one.c b/src/lib/smearing/hyp_APE_project_exclude_one.c
similarity index 100%
rename from smearing/hyp_APE_project_exclude_one.c
rename to src/lib/smearing/hyp_APE_project_exclude_one.c
diff --git a/smearing/hyp_APE_project_exclude_two.c b/src/lib/smearing/hyp_APE_project_exclude_two.c
similarity index 100%
rename from smearing/hyp_APE_project_exclude_two.c
rename to src/lib/smearing/hyp_APE_project_exclude_two.c
diff --git a/smearing/hyp_hyp_smear.c b/src/lib/smearing/hyp_hyp_smear.c
similarity index 100%
rename from smearing/hyp_hyp_smear.c
rename to src/lib/smearing/hyp_hyp_smear.c
diff --git a/smearing/hyp_hyp_staples_exclude_none.c b/src/lib/smearing/hyp_hyp_staples_exclude_none.c
similarity index 100%
rename from smearing/hyp_hyp_staples_exclude_none.c
rename to src/lib/smearing/hyp_hyp_staples_exclude_none.c
diff --git a/smearing/hyp_hyp_staples_exclude_one.c b/src/lib/smearing/hyp_hyp_staples_exclude_one.c
similarity index 100%
rename from smearing/hyp_hyp_staples_exclude_one.c
rename to src/lib/smearing/hyp_hyp_staples_exclude_one.c
diff --git a/smearing/hyp_hyp_staples_exclude_two.c b/src/lib/smearing/hyp_hyp_staples_exclude_two.c
similarity index 100%
rename from smearing/hyp_hyp_staples_exclude_two.c
rename to src/lib/smearing/hyp_hyp_staples_exclude_two.c
diff --git a/smearing/stout.h b/src/lib/smearing/stout.h
similarity index 100%
rename from smearing/stout.h
rename to src/lib/smearing/stout.h
diff --git a/smearing/stout.ih b/src/lib/smearing/stout.ih
similarity index 100%
rename from smearing/stout.ih
rename to src/lib/smearing/stout.ih
diff --git a/smearing/stout_stout_smear.c b/src/lib/smearing/stout_stout_smear.c
similarity index 100%
rename from smearing/stout_stout_smear.c
rename to src/lib/smearing/stout_stout_smear.c
diff --git a/smearing/uils_print_config_to_screen.c b/src/lib/smearing/uils_print_config_to_screen.c
similarity index 100%
rename from smearing/uils_print_config_to_screen.c
rename to src/lib/smearing/uils_print_config_to_screen.c
diff --git a/smearing/utils.h b/src/lib/smearing/utils.h
similarity index 100%
rename from smearing/utils.h
rename to src/lib/smearing/utils.h
diff --git a/smearing/utils.ih b/src/lib/smearing/utils.ih
similarity index 100%
rename from smearing/utils.ih
rename to src/lib/smearing/utils.ih
diff --git a/smearing/utils_generic_staples.c b/src/lib/smearing/utils_generic_staples.c
similarity index 100%
rename from smearing/utils_generic_staples.c
rename to src/lib/smearing/utils_generic_staples.c
diff --git a/smearing/utils_print_config_to_screen.c b/src/lib/smearing/utils_print_config_to_screen.c
similarity index 100%
rename from smearing/utils_print_config_to_screen.c
rename to src/lib/smearing/utils_print_config_to_screen.c
diff --git a/smearing/utils_print_su3.c b/src/lib/smearing/utils_print_su3.c
similarity index 100%
rename from smearing/utils_print_su3.c
rename to src/lib/smearing/utils_print_su3.c
diff --git a/smearing/utils_project_antiherm.c b/src/lib/smearing/utils_project_antiherm.c
similarity index 100%
rename from smearing/utils_project_antiherm.c
rename to src/lib/smearing/utils_project_antiherm.c
diff --git a/smearing/utils_project_herm.c b/src/lib/smearing/utils_project_herm.c
similarity index 100%
rename from smearing/utils_project_herm.c
rename to src/lib/smearing/utils_project_herm.c
diff --git a/smearing/utils_reunitarize.c b/src/lib/smearing/utils_reunitarize.c
similarity index 100%
rename from smearing/utils_reunitarize.c
rename to src/lib/smearing/utils_reunitarize.c
diff --git a/smearing/utils_reunitarize_MILC.c b/src/lib/smearing/utils_reunitarize_MILC.c
similarity index 88%
rename from smearing/utils_reunitarize_MILC.c
rename to src/lib/smearing/utils_reunitarize_MILC.c
index 757a797df..b5efa2936 100644
--- a/smearing/utils_reunitarize_MILC.c
+++ b/src/lib/smearing/utils_reunitarize_MILC.c
@@ -1,4 +1,5 @@
 #include "utils.ih"
+#include <complex.h>
 
 /* No reunitarization code seems to be available, so I've adapted (stolen) this routine from the
  * MILC code (who stole it elsewhere, I think ;]) -- AD. */
@@ -35,12 +36,11 @@ void reunitarize(su3 *omega) {
   bj2 = omega->c02;
 
   omega->c20 = bj1 * omega->c12;
-  omega->c20 -= bj2 *omega
-                    ->c11
+  omega->c20 -= bj2 *omega->c11;
 
-                        omega->c21 = bj2 * omega->c10;
+  omega->c21 = bj2 * omega->c10;
   omega->c21 -= bj0 * omega->c12;
 
   omega->c22 = bj0 * omega->c11;
-  omega->c22 -= bj1r * omega->c10;
+  omega->c22 -= bj1 * omega->c10;
 }
diff --git a/solver/M_plus_block_psi_body.c b/src/lib/solver/M_plus_block_psi_body.c
similarity index 100%
rename from solver/M_plus_block_psi_body.c
rename to src/lib/solver/M_plus_block_psi_body.c
diff --git a/solver/Makefile.in b/src/lib/solver/Makefile.in
similarity index 100%
rename from solver/Makefile.in
rename to src/lib/solver/Makefile.in
diff --git a/solver/Msap.c b/src/lib/solver/Msap.c
similarity index 100%
rename from solver/Msap.c
rename to src/lib/solver/Msap.c
diff --git a/solver/Msap.h b/src/lib/solver/Msap.h
similarity index 100%
rename from solver/Msap.h
rename to src/lib/solver/Msap.h
diff --git a/solver/bicg_complex.c b/src/lib/solver/bicg_complex.c
similarity index 100%
rename from solver/bicg_complex.c
rename to src/lib/solver/bicg_complex.c
diff --git a/solver/bicg_complex.h b/src/lib/solver/bicg_complex.h
similarity index 100%
rename from solver/bicg_complex.h
rename to src/lib/solver/bicg_complex.h
diff --git a/solver/bicgstab2.c b/src/lib/solver/bicgstab2.c
similarity index 100%
rename from solver/bicgstab2.c
rename to src/lib/solver/bicgstab2.c
diff --git a/solver/bicgstab2.h b/src/lib/solver/bicgstab2.h
similarity index 100%
rename from solver/bicgstab2.h
rename to src/lib/solver/bicgstab2.h
diff --git a/solver/bicgstab_complex.c b/src/lib/solver/bicgstab_complex.c
similarity index 100%
rename from solver/bicgstab_complex.c
rename to src/lib/solver/bicgstab_complex.c
diff --git a/solver/bicgstab_complex.h b/src/lib/solver/bicgstab_complex.h
similarity index 100%
rename from solver/bicgstab_complex.h
rename to src/lib/solver/bicgstab_complex.h
diff --git a/solver/bicgstab_complex_bi.c b/src/lib/solver/bicgstab_complex_bi.c
similarity index 100%
rename from solver/bicgstab_complex_bi.c
rename to src/lib/solver/bicgstab_complex_bi.c
diff --git a/solver/bicgstab_complex_bi.h b/src/lib/solver/bicgstab_complex_bi.h
similarity index 100%
rename from solver/bicgstab_complex_bi.h
rename to src/lib/solver/bicgstab_complex_bi.h
diff --git a/solver/bicgstabell.c b/src/lib/solver/bicgstabell.c
similarity index 100%
rename from solver/bicgstabell.c
rename to src/lib/solver/bicgstabell.c
diff --git a/solver/bicgstabell.h b/src/lib/solver/bicgstabell.h
similarity index 100%
rename from solver/bicgstabell.h
rename to src/lib/solver/bicgstabell.h
diff --git a/solver/cg_her.c b/src/lib/solver/cg_her.c
similarity index 100%
rename from solver/cg_her.c
rename to src/lib/solver/cg_her.c
diff --git a/solver/cg_her.h b/src/lib/solver/cg_her.h
similarity index 100%
rename from solver/cg_her.h
rename to src/lib/solver/cg_her.h
diff --git a/solver/cg_her_bi.c b/src/lib/solver/cg_her_bi.c
similarity index 100%
rename from solver/cg_her_bi.c
rename to src/lib/solver/cg_her_bi.c
diff --git a/solver/cg_her_bi.h b/src/lib/solver/cg_her_bi.h
similarity index 100%
rename from solver/cg_her_bi.h
rename to src/lib/solver/cg_her_bi.h
diff --git a/solver/cg_her_nd.c b/src/lib/solver/cg_her_nd.c
similarity index 100%
rename from solver/cg_her_nd.c
rename to src/lib/solver/cg_her_nd.c
diff --git a/solver/cg_her_nd.h b/src/lib/solver/cg_her_nd.h
similarity index 100%
rename from solver/cg_her_nd.h
rename to src/lib/solver/cg_her_nd.h
diff --git a/solver/cg_mms_tm.c b/src/lib/solver/cg_mms_tm.c
similarity index 100%
rename from solver/cg_mms_tm.c
rename to src/lib/solver/cg_mms_tm.c
diff --git a/solver/cg_mms_tm.h b/src/lib/solver/cg_mms_tm.h
similarity index 100%
rename from solver/cg_mms_tm.h
rename to src/lib/solver/cg_mms_tm.h
diff --git a/solver/cg_mms_tm_nd.c b/src/lib/solver/cg_mms_tm_nd.c
similarity index 100%
rename from solver/cg_mms_tm_nd.c
rename to src/lib/solver/cg_mms_tm_nd.c
diff --git a/solver/cg_mms_tm_nd.h b/src/lib/solver/cg_mms_tm_nd.h
similarity index 100%
rename from solver/cg_mms_tm_nd.h
rename to src/lib/solver/cg_mms_tm_nd.h
diff --git a/solver/cgne4complex.c b/src/lib/solver/cgne4complex.c
similarity index 100%
rename from solver/cgne4complex.c
rename to src/lib/solver/cgne4complex.c
diff --git a/solver/cgne4complex.h b/src/lib/solver/cgne4complex.h
similarity index 100%
rename from solver/cgne4complex.h
rename to src/lib/solver/cgne4complex.h
diff --git a/solver/cgs_real.c b/src/lib/solver/cgs_real.c
similarity index 100%
rename from solver/cgs_real.c
rename to src/lib/solver/cgs_real.c
diff --git a/solver/cgs_real.h b/src/lib/solver/cgs_real.h
similarity index 100%
rename from solver/cgs_real.h
rename to src/lib/solver/cgs_real.h
diff --git a/solver/chrono_guess.c b/src/lib/solver/chrono_guess.c
similarity index 100%
rename from solver/chrono_guess.c
rename to src/lib/solver/chrono_guess.c
diff --git a/solver/chrono_guess.h b/src/lib/solver/chrono_guess.h
similarity index 100%
rename from solver/chrono_guess.h
rename to src/lib/solver/chrono_guess.h
diff --git a/solver/cr.c b/src/lib/solver/cr.c
similarity index 100%
rename from solver/cr.c
rename to src/lib/solver/cr.c
diff --git a/solver/cr.h b/src/lib/solver/cr.h
similarity index 100%
rename from solver/cr.h
rename to src/lib/solver/cr.h
diff --git a/solver/dfl_projector.c b/src/lib/solver/dfl_projector.c
similarity index 100%
rename from solver/dfl_projector.c
rename to src/lib/solver/dfl_projector.c
diff --git a/solver/dfl_projector.h b/src/lib/solver/dfl_projector.h
similarity index 100%
rename from solver/dfl_projector.h
rename to src/lib/solver/dfl_projector.h
diff --git a/solver/diagonalise_general_matrix.c b/src/lib/solver/diagonalise_general_matrix.c
similarity index 100%
rename from solver/diagonalise_general_matrix.c
rename to src/lib/solver/diagonalise_general_matrix.c
diff --git a/solver/diagonalise_general_matrix.h b/src/lib/solver/diagonalise_general_matrix.h
similarity index 100%
rename from solver/diagonalise_general_matrix.h
rename to src/lib/solver/diagonalise_general_matrix.h
diff --git a/solver/dirac_operator_eigenvectors.c b/src/lib/solver/dirac_operator_eigenvectors.c
similarity index 100%
rename from solver/dirac_operator_eigenvectors.c
rename to src/lib/solver/dirac_operator_eigenvectors.c
diff --git a/solver/dirac_operator_eigenvectors.h b/src/lib/solver/dirac_operator_eigenvectors.h
similarity index 100%
rename from solver/dirac_operator_eigenvectors.h
rename to src/lib/solver/dirac_operator_eigenvectors.h
diff --git a/solver/eigcg.c b/src/lib/solver/eigcg.c
similarity index 100%
rename from solver/eigcg.c
rename to src/lib/solver/eigcg.c
diff --git a/solver/eigcg.h b/src/lib/solver/eigcg.h
similarity index 100%
rename from solver/eigcg.h
rename to src/lib/solver/eigcg.h
diff --git a/solver/eigenvalues.c b/src/lib/solver/eigenvalues.c
similarity index 100%
rename from solver/eigenvalues.c
rename to src/lib/solver/eigenvalues.c
diff --git a/solver/eigenvalues.h b/src/lib/solver/eigenvalues.h
similarity index 100%
rename from solver/eigenvalues.h
rename to src/lib/solver/eigenvalues.h
diff --git a/solver/eigenvalues_bi.c b/src/lib/solver/eigenvalues_bi.c
similarity index 100%
rename from solver/eigenvalues_bi.c
rename to src/lib/solver/eigenvalues_bi.c
diff --git a/solver/eigenvalues_bi.h b/src/lib/solver/eigenvalues_bi.h
similarity index 100%
rename from solver/eigenvalues_bi.h
rename to src/lib/solver/eigenvalues_bi.h
diff --git a/solver/fgmres.c b/src/lib/solver/fgmres.c
similarity index 100%
rename from solver/fgmres.c
rename to src/lib/solver/fgmres.c
diff --git a/solver/fgmres.h b/src/lib/solver/fgmres.h
similarity index 100%
rename from solver/fgmres.h
rename to src/lib/solver/fgmres.h
diff --git a/solver/fgmres4complex.c b/src/lib/solver/fgmres4complex.c
similarity index 100%
rename from solver/fgmres4complex.c
rename to src/lib/solver/fgmres4complex.c
diff --git a/solver/fgmres4complex.h b/src/lib/solver/fgmres4complex.h
similarity index 100%
rename from solver/fgmres4complex.h
rename to src/lib/solver/fgmres4complex.h
diff --git a/solver/fgmres4complex_body.c b/src/lib/solver/fgmres4complex_body.c
similarity index 100%
rename from solver/fgmres4complex_body.c
rename to src/lib/solver/fgmres4complex_body.c
diff --git a/solver/gcr.c b/src/lib/solver/gcr.c
similarity index 100%
rename from solver/gcr.c
rename to src/lib/solver/gcr.c
diff --git a/solver/gcr.h b/src/lib/solver/gcr.h
similarity index 100%
rename from solver/gcr.h
rename to src/lib/solver/gcr.h
diff --git a/solver/gcr4complex.c b/src/lib/solver/gcr4complex.c
similarity index 100%
rename from solver/gcr4complex.c
rename to src/lib/solver/gcr4complex.c
diff --git a/solver/gcr4complex.h b/src/lib/solver/gcr4complex.h
similarity index 100%
rename from solver/gcr4complex.h
rename to src/lib/solver/gcr4complex.h
diff --git a/solver/gcr4complex_body.c b/src/lib/solver/gcr4complex_body.c
similarity index 100%
rename from solver/gcr4complex_body.c
rename to src/lib/solver/gcr4complex_body.c
diff --git a/solver/gcr4complex_body.h b/src/lib/solver/gcr4complex_body.h
similarity index 100%
rename from solver/gcr4complex_body.h
rename to src/lib/solver/gcr4complex_body.h
diff --git a/solver/generate_dfl_subspace.c b/src/lib/solver/generate_dfl_subspace.c
similarity index 100%
rename from solver/generate_dfl_subspace.c
rename to src/lib/solver/generate_dfl_subspace.c
diff --git a/solver/generate_dfl_subspace.h b/src/lib/solver/generate_dfl_subspace.h
similarity index 100%
rename from solver/generate_dfl_subspace.h
rename to src/lib/solver/generate_dfl_subspace.h
diff --git a/solver/gmres.c b/src/lib/solver/gmres.c
similarity index 100%
rename from solver/gmres.c
rename to src/lib/solver/gmres.c
diff --git a/solver/gmres.h b/src/lib/solver/gmres.h
similarity index 100%
rename from solver/gmres.h
rename to src/lib/solver/gmres.h
diff --git a/solver/gmres_dr.c b/src/lib/solver/gmres_dr.c
similarity index 100%
rename from solver/gmres_dr.c
rename to src/lib/solver/gmres_dr.c
diff --git a/solver/gmres_dr.h b/src/lib/solver/gmres_dr.h
similarity index 100%
rename from solver/gmres_dr.h
rename to src/lib/solver/gmres_dr.h
diff --git a/solver/gmres_precon.c b/src/lib/solver/gmres_precon.c
similarity index 100%
rename from solver/gmres_precon.c
rename to src/lib/solver/gmres_precon.c
diff --git a/solver/gmres_precon.h b/src/lib/solver/gmres_precon.h
similarity index 100%
rename from solver/gmres_precon.h
rename to src/lib/solver/gmres_precon.h
diff --git a/solver/gram-schmidt.c b/src/lib/solver/gram-schmidt.c
similarity index 100%
rename from solver/gram-schmidt.c
rename to src/lib/solver/gram-schmidt.c
diff --git a/solver/gram-schmidt.h b/src/lib/solver/gram-schmidt.h
similarity index 100%
rename from solver/gram-schmidt.h
rename to src/lib/solver/gram-schmidt.h
diff --git a/solver/incr_eigcg.c b/src/lib/solver/incr_eigcg.c
similarity index 100%
rename from solver/incr_eigcg.c
rename to src/lib/solver/incr_eigcg.c
diff --git a/solver/incr_eigcg.h b/src/lib/solver/incr_eigcg.h
similarity index 100%
rename from solver/incr_eigcg.h
rename to src/lib/solver/incr_eigcg.h
diff --git a/solver/index_jd.c b/src/lib/solver/index_jd.c
similarity index 100%
rename from solver/index_jd.c
rename to src/lib/solver/index_jd.c
diff --git a/solver/index_jd.h b/src/lib/solver/index_jd.h
similarity index 100%
rename from solver/index_jd.h
rename to src/lib/solver/index_jd.h
diff --git a/solver/init_guess.c b/src/lib/solver/init_guess.c
similarity index 100%
rename from solver/init_guess.c
rename to src/lib/solver/init_guess.c
diff --git a/solver/init_guess.h b/src/lib/solver/init_guess.h
similarity index 100%
rename from solver/init_guess.h
rename to src/lib/solver/init_guess.h
diff --git a/solver/jdher.c b/src/lib/solver/jdher.c
similarity index 100%
rename from solver/jdher.c
rename to src/lib/solver/jdher.c
diff --git a/solver/jdher.h b/src/lib/solver/jdher.h
similarity index 100%
rename from solver/jdher.h
rename to src/lib/solver/jdher.h
diff --git a/solver/jdher_bi.c b/src/lib/solver/jdher_bi.c
similarity index 100%
rename from solver/jdher_bi.c
rename to src/lib/solver/jdher_bi.c
diff --git a/solver/jdher_bi.h b/src/lib/solver/jdher_bi.h
similarity index 100%
rename from solver/jdher_bi.h
rename to src/lib/solver/jdher_bi.h
diff --git a/solver/little_mg_precon_body.c b/src/lib/solver/little_mg_precon_body.c
similarity index 100%
rename from solver/little_mg_precon_body.c
rename to src/lib/solver/little_mg_precon_body.c
diff --git a/solver/little_project_eo_body.c b/src/lib/solver/little_project_eo_body.c
similarity index 100%
rename from solver/little_project_eo_body.c
rename to src/lib/solver/little_project_eo_body.c
diff --git a/solver/lu_solve.c b/src/lib/solver/lu_solve.c
similarity index 100%
rename from solver/lu_solve.c
rename to src/lib/solver/lu_solve.c
diff --git a/solver/lu_solve.h b/src/lib/solver/lu_solve.h
similarity index 100%
rename from solver/lu_solve.h
rename to src/lib/solver/lu_solve.h
diff --git a/solver/matrix_mult_typedef.h b/src/lib/solver/matrix_mult_typedef.h
similarity index 100%
rename from solver/matrix_mult_typedef.h
rename to src/lib/solver/matrix_mult_typedef.h
diff --git a/solver/matrix_mult_typedef_bi.h b/src/lib/solver/matrix_mult_typedef_bi.h
similarity index 100%
rename from solver/matrix_mult_typedef_bi.h
rename to src/lib/solver/matrix_mult_typedef_bi.h
diff --git a/solver/matrix_mult_typedef_nd.h b/src/lib/solver/matrix_mult_typedef_nd.h
similarity index 100%
rename from solver/matrix_mult_typedef_nd.h
rename to src/lib/solver/matrix_mult_typedef_nd.h
diff --git a/solver/mcr.c b/src/lib/solver/mcr.c
similarity index 100%
rename from solver/mcr.c
rename to src/lib/solver/mcr.c
diff --git a/solver/mcr.h b/src/lib/solver/mcr.h
similarity index 100%
rename from solver/mcr.h
rename to src/lib/solver/mcr.h
diff --git a/solver/mcr4complex.c b/src/lib/solver/mcr4complex.c
similarity index 100%
rename from solver/mcr4complex.c
rename to src/lib/solver/mcr4complex.c
diff --git a/solver/mcr4complex.h b/src/lib/solver/mcr4complex.h
similarity index 100%
rename from solver/mcr4complex.h
rename to src/lib/solver/mcr4complex.h
diff --git a/solver/mixed_cg_her.c b/src/lib/solver/mixed_cg_her.c
similarity index 100%
rename from solver/mixed_cg_her.c
rename to src/lib/solver/mixed_cg_her.c
diff --git a/solver/mixed_cg_her.h b/src/lib/solver/mixed_cg_her.h
similarity index 100%
rename from solver/mixed_cg_her.h
rename to src/lib/solver/mixed_cg_her.h
diff --git a/solver/mixed_cg_mms_tm_nd.c b/src/lib/solver/mixed_cg_mms_tm_nd.c
similarity index 100%
rename from solver/mixed_cg_mms_tm_nd.c
rename to src/lib/solver/mixed_cg_mms_tm_nd.c
diff --git a/solver/mixed_cg_mms_tm_nd.h b/src/lib/solver/mixed_cg_mms_tm_nd.h
similarity index 100%
rename from solver/mixed_cg_mms_tm_nd.h
rename to src/lib/solver/mixed_cg_mms_tm_nd.h
diff --git a/solver/monomial_solve.c b/src/lib/solver/monomial_solve.c
similarity index 100%
rename from solver/monomial_solve.c
rename to src/lib/solver/monomial_solve.c
diff --git a/solver/monomial_solve.h b/src/lib/solver/monomial_solve.h
similarity index 100%
rename from solver/monomial_solve.h
rename to src/lib/solver/monomial_solve.h
diff --git a/solver/mr.c b/src/lib/solver/mr.c
similarity index 100%
rename from solver/mr.c
rename to src/lib/solver/mr.c
diff --git a/solver/mr.h b/src/lib/solver/mr.h
similarity index 100%
rename from solver/mr.h
rename to src/lib/solver/mr.h
diff --git a/solver/mr4complex.c b/src/lib/solver/mr4complex.c
similarity index 100%
rename from solver/mr4complex.c
rename to src/lib/solver/mr4complex.c
diff --git a/solver/mr4complex.h b/src/lib/solver/mr4complex.h
similarity index 100%
rename from solver/mr4complex.h
rename to src/lib/solver/mr4complex.h
diff --git a/solver/mrblk_body.c b/src/lib/solver/mrblk_body.c
similarity index 100%
rename from solver/mrblk_body.c
rename to src/lib/solver/mrblk_body.c
diff --git a/solver/ortho.c b/src/lib/solver/ortho.c
similarity index 100%
rename from solver/ortho.c
rename to src/lib/solver/ortho.c
diff --git a/solver/ortho.h b/src/lib/solver/ortho.h
similarity index 100%
rename from solver/ortho.h
rename to src/lib/solver/ortho.h
diff --git a/solver/pcg_her.c b/src/lib/solver/pcg_her.c
similarity index 100%
rename from solver/pcg_her.c
rename to src/lib/solver/pcg_her.c
diff --git a/solver/pcg_her.h b/src/lib/solver/pcg_her.h
similarity index 100%
rename from solver/pcg_her.h
rename to src/lib/solver/pcg_her.h
diff --git a/solver/poly_precon.c b/src/lib/solver/poly_precon.c
similarity index 100%
rename from solver/poly_precon.c
rename to src/lib/solver/poly_precon.c
diff --git a/solver/poly_precon.h b/src/lib/solver/poly_precon.h
similarity index 100%
rename from solver/poly_precon.h
rename to src/lib/solver/poly_precon.h
diff --git a/solver/quicksort.c b/src/lib/solver/quicksort.c
similarity index 100%
rename from solver/quicksort.c
rename to src/lib/solver/quicksort.c
diff --git a/solver/quicksort.h b/src/lib/solver/quicksort.h
similarity index 100%
rename from solver/quicksort.h
rename to src/lib/solver/quicksort.h
diff --git a/solver/restart_X.c b/src/lib/solver/restart_X.c
similarity index 100%
rename from solver/restart_X.c
rename to src/lib/solver/restart_X.c
diff --git a/solver/restart_X.h b/src/lib/solver/restart_X.h
similarity index 100%
rename from solver/restart_X.h
rename to src/lib/solver/restart_X.h
diff --git a/solver/rg_mixed_cg_her.c b/src/lib/solver/rg_mixed_cg_her.c
similarity index 100%
rename from solver/rg_mixed_cg_her.c
rename to src/lib/solver/rg_mixed_cg_her.c
diff --git a/solver/rg_mixed_cg_her.h b/src/lib/solver/rg_mixed_cg_her.h
similarity index 100%
rename from solver/rg_mixed_cg_her.h
rename to src/lib/solver/rg_mixed_cg_her.h
diff --git a/solver/rg_mixed_cg_her_nd.c b/src/lib/solver/rg_mixed_cg_her_nd.c
similarity index 100%
rename from solver/rg_mixed_cg_her_nd.c
rename to src/lib/solver/rg_mixed_cg_her_nd.c
diff --git a/solver/rg_mixed_cg_her_nd.h b/src/lib/solver/rg_mixed_cg_her_nd.h
similarity index 100%
rename from solver/rg_mixed_cg_her_nd.h
rename to src/lib/solver/rg_mixed_cg_her_nd.h
diff --git a/solver/rg_mixed_cg_typedef.h b/src/lib/solver/rg_mixed_cg_typedef.h
similarity index 100%
rename from solver/rg_mixed_cg_typedef.h
rename to src/lib/solver/rg_mixed_cg_typedef.h
diff --git a/solver/solver.h b/src/lib/solver/solver.h
similarity index 100%
rename from solver/solver.h
rename to src/lib/solver/solver.h
diff --git a/solver/solver_field.c b/src/lib/solver/solver_field.c
similarity index 100%
rename from solver/solver_field.c
rename to src/lib/solver/solver_field.c
diff --git a/solver/solver_field.h b/src/lib/solver/solver_field.h
similarity index 100%
rename from solver/solver_field.h
rename to src/lib/solver/solver_field.h
diff --git a/solver/solver_params.h b/src/lib/solver/solver_params.h
similarity index 100%
rename from solver/solver_params.h
rename to src/lib/solver/solver_params.h
diff --git a/solver/solver_types.c b/src/lib/solver/solver_types.c
similarity index 100%
rename from solver/solver_types.c
rename to src/lib/solver/solver_types.c
diff --git a/solver/solver_types.h b/src/lib/solver/solver_types.h
similarity index 100%
rename from solver/solver_types.h
rename to src/lib/solver/solver_types.h
diff --git a/solver/sub_low_ev.c b/src/lib/solver/sub_low_ev.c
similarity index 100%
rename from solver/sub_low_ev.c
rename to src/lib/solver/sub_low_ev.c
diff --git a/solver/sub_low_ev.h b/src/lib/solver/sub_low_ev.h
similarity index 100%
rename from solver/sub_low_ev.h
rename to src/lib/solver/sub_low_ev.h
diff --git a/solver/sumr.c b/src/lib/solver/sumr.c
similarity index 100%
rename from solver/sumr.c
rename to src/lib/solver/sumr.c
diff --git a/solver/sumr.h b/src/lib/solver/sumr.h
similarity index 100%
rename from solver/sumr.h
rename to src/lib/solver/sumr.h
diff --git a/source_generation.c b/src/lib/source_generation.c
similarity index 100%
rename from source_generation.c
rename to src/lib/source_generation.c
diff --git a/source_generation.h b/src/lib/source_generation.h
similarity index 100%
rename from source_generation.h
rename to src/lib/source_generation.h
diff --git a/spinor_fft.c b/src/lib/spinor_fft.c
similarity index 100%
rename from spinor_fft.c
rename to src/lib/spinor_fft.c
diff --git a/spinor_fft.h b/src/lib/spinor_fft.h
similarity index 100%
rename from spinor_fft.h
rename to src/lib/spinor_fft.h
diff --git a/start.c b/src/lib/start.c
similarity index 100%
rename from start.c
rename to src/lib/start.c
diff --git a/start.h b/src/lib/start.h
similarity index 100%
rename from start.h
rename to src/lib/start.h
diff --git a/struct_accessors.h b/src/lib/struct_accessors.h
similarity index 100%
rename from struct_accessors.h
rename to src/lib/struct_accessors.h
diff --git a/su3.h b/src/lib/su3.h
similarity index 100%
rename from su3.h
rename to src/lib/su3.h
diff --git a/su3adj.h b/src/lib/su3adj.h
similarity index 100%
rename from su3adj.h
rename to src/lib/su3adj.h
diff --git a/su3spinor.h b/src/lib/su3spinor.h
similarity index 100%
rename from su3spinor.h
rename to src/lib/su3spinor.h
diff --git a/tensors.h b/src/lib/tensors.h
similarity index 100%
rename from tensors.h
rename to src/lib/tensors.h
diff --git a/test/Makefile b/src/lib/test/Makefile
similarity index 100%
rename from test/Makefile
rename to src/lib/test/Makefile
diff --git a/test/check_geometry.c b/src/lib/test/check_geometry.c
similarity index 100%
rename from test/check_geometry.c
rename to src/lib/test/check_geometry.c
diff --git a/test/check_geometry.h b/src/lib/test/check_geometry.h
similarity index 100%
rename from test/check_geometry.h
rename to src/lib/test/check_geometry.h
diff --git a/test/check_nan.c b/src/lib/test/check_nan.c
similarity index 100%
rename from test/check_nan.c
rename to src/lib/test/check_nan.c
diff --git a/test/check_nan.h b/src/lib/test/check_nan.h
similarity index 100%
rename from test/check_nan.h
rename to src/lib/test/check_nan.h
diff --git a/test/check_overlap.c b/src/lib/test/check_overlap.c
similarity index 100%
rename from test/check_overlap.c
rename to src/lib/test/check_overlap.c
diff --git a/test/check_xchange.c b/src/lib/test/check_xchange.c
similarity index 100%
rename from test/check_xchange.c
rename to src/lib/test/check_xchange.c
diff --git a/test/hopping_test.README b/src/lib/test/hopping_test.README
similarity index 100%
rename from test/hopping_test.README
rename to src/lib/test/hopping_test.README
diff --git a/test/hopping_test.input.compare b/src/lib/test/hopping_test.input.compare
similarity index 100%
rename from test/hopping_test.input.compare
rename to src/lib/test/hopping_test.input.compare
diff --git a/test/hopping_test.input.new b/src/lib/test/hopping_test.input.new
similarity index 100%
rename from test/hopping_test.input.new
rename to src/lib/test/hopping_test.input.new
diff --git a/test/hopping_test.input.start b/src/lib/test/hopping_test.input.start
similarity index 100%
rename from test/hopping_test.input.start
rename to src/lib/test/hopping_test.input.start
diff --git a/test/hopping_test_generate_script b/src/lib/test/hopping_test_generate_script
similarity index 100%
rename from test/hopping_test_generate_script
rename to src/lib/test/hopping_test_generate_script
diff --git a/test/hopping_test_qscript b/src/lib/test/hopping_test_qscript
similarity index 100%
rename from test/hopping_test_qscript
rename to src/lib/test/hopping_test_qscript
diff --git a/test/measure_rectangles.debug.c b/src/lib/test/measure_rectangles.debug.c
similarity index 100%
rename from test/measure_rectangles.debug.c
rename to src/lib/test/measure_rectangles.debug.c
diff --git a/test/overlaptests.c b/src/lib/test/overlaptests.c
similarity index 100%
rename from test/overlaptests.c
rename to src/lib/test/overlaptests.c
diff --git a/test/overlaptests.h b/src/lib/test/overlaptests.h
similarity index 100%
rename from test/overlaptests.h
rename to src/lib/test/overlaptests.h
diff --git a/test/qdran64.h b/src/lib/test/qdran64.h
similarity index 100%
rename from test/qdran64.h
rename to src/lib/test/qdran64.h
diff --git a/tm_debug_printf.c b/src/lib/tm_debug_printf.c
similarity index 100%
rename from tm_debug_printf.c
rename to src/lib/tm_debug_printf.c
diff --git a/tm_debug_printf.h b/src/lib/tm_debug_printf.h
similarity index 100%
rename from tm_debug_printf.h
rename to src/lib/tm_debug_printf.h
diff --git a/update_backward_gauge.c b/src/lib/update_backward_gauge.c
similarity index 100%
rename from update_backward_gauge.c
rename to src/lib/update_backward_gauge.c
diff --git a/update_backward_gauge.h b/src/lib/update_backward_gauge.h
similarity index 100%
rename from update_backward_gauge.h
rename to src/lib/update_backward_gauge.h
diff --git a/update_gauge.c b/src/lib/update_gauge.c
similarity index 100%
rename from update_gauge.c
rename to src/lib/update_gauge.c
diff --git a/update_gauge.h b/src/lib/update_gauge.h
similarity index 100%
rename from update_gauge.h
rename to src/lib/update_gauge.h
diff --git a/update_momenta.c b/src/lib/update_momenta.c
similarity index 100%
rename from update_momenta.c
rename to src/lib/update_momenta.c
diff --git a/update_momenta.h b/src/lib/update_momenta.h
similarity index 100%
rename from update_momenta.h
rename to src/lib/update_momenta.h
diff --git a/update_momenta_fg.c b/src/lib/update_momenta_fg.c
similarity index 100%
rename from update_momenta_fg.c
rename to src/lib/update_momenta_fg.c
diff --git a/update_momenta_fg.h b/src/lib/update_momenta_fg.h
similarity index 100%
rename from update_momenta_fg.h
rename to src/lib/update_momenta_fg.h
diff --git a/update_tm.c b/src/lib/update_tm.c
similarity index 100%
rename from update_tm.c
rename to src/lib/update_tm.c
diff --git a/update_tm.h b/src/lib/update_tm.h
similarity index 100%
rename from update_tm.h
rename to src/lib/update_tm.h
diff --git a/util/io.c b/src/lib/util/io.c
similarity index 100%
rename from util/io.c
rename to src/lib/util/io.c
diff --git a/util/io.h b/src/lib/util/io.h
similarity index 100%
rename from util/io.h
rename to src/lib/util/io.h
diff --git a/util/laguer/Makefile b/src/lib/util/laguer/Makefile
similarity index 100%
rename from util/laguer/Makefile
rename to src/lib/util/laguer/Makefile
diff --git a/util/laguer/chebyRoot.C b/src/lib/util/laguer/chebyRoot.C
similarity index 100%
rename from util/laguer/chebyRoot.C
rename to src/lib/util/laguer/chebyRoot.C
diff --git a/util/laguer/chebyRoot.H b/src/lib/util/laguer/chebyRoot.H
similarity index 100%
rename from util/laguer/chebyRoot.H
rename to src/lib/util/laguer/chebyRoot.H
diff --git a/util/laguer/laguer.c b/src/lib/util/laguer/laguer.c
similarity index 100%
rename from util/laguer/laguer.c
rename to src/lib/util/laguer/laguer.c
diff --git a/util/laguer/quadroptRoot.C b/src/lib/util/laguer/quadroptRoot.C
similarity index 100%
rename from util/laguer/quadroptRoot.C
rename to src/lib/util/laguer/quadroptRoot.C
diff --git a/util/oox/Makefile b/src/lib/util/oox/Makefile
similarity index 100%
rename from util/oox/Makefile
rename to src/lib/util/oox/Makefile
diff --git a/util/oox/oox.c b/src/lib/util/oox/oox.c
similarity index 100%
rename from util/oox/oox.c
rename to src/lib/util/oox/oox.c
diff --git a/util/oox/oox_gawrapper.cxx b/src/lib/util/oox/oox_gawrapper.cxx
similarity index 100%
rename from util/oox/oox_gawrapper.cxx
rename to src/lib/util/oox/oox_gawrapper.cxx
diff --git a/util/oox/oox_gawrapper.h b/src/lib/util/oox/oox_gawrapper.h
similarity index 100%
rename from util/oox/oox_gawrapper.h
rename to src/lib/util/oox/oox_gawrapper.h
diff --git a/util/swapendian.c b/src/lib/util/swapendian.c
similarity index 100%
rename from util/swapendian.c
rename to src/lib/util/swapendian.c
diff --git a/util/tmlqcd-indent b/src/lib/util/tmlqcd-indent
similarity index 100%
rename from util/tmlqcd-indent
rename to src/lib/util/tmlqcd-indent
diff --git a/wrapper/Makefile.in b/src/lib/wrapper/Makefile.in
similarity index 100%
rename from wrapper/Makefile.in
rename to src/lib/wrapper/Makefile.in
diff --git a/wrapper/lib_wrapper.c b/src/lib/wrapper/lib_wrapper.c
similarity index 100%
rename from wrapper/lib_wrapper.c
rename to src/lib/wrapper/lib_wrapper.c
diff --git a/xchange/Makefile.in b/src/lib/xchange/Makefile.in
similarity index 100%
rename from xchange/Makefile.in
rename to src/lib/xchange/Makefile.in
diff --git a/xchange/little_field_gather.c b/src/lib/xchange/little_field_gather.c
similarity index 100%
rename from xchange/little_field_gather.c
rename to src/lib/xchange/little_field_gather.c
diff --git a/xchange/little_field_gather.h b/src/lib/xchange/little_field_gather.h
similarity index 100%
rename from xchange/little_field_gather.h
rename to src/lib/xchange/little_field_gather.h
diff --git a/xchange/little_field_gather_body.c b/src/lib/xchange/little_field_gather_body.c
similarity index 100%
rename from xchange/little_field_gather_body.c
rename to src/lib/xchange/little_field_gather_body.c
diff --git a/xchange/xchange.h b/src/lib/xchange/xchange.h
similarity index 100%
rename from xchange/xchange.h
rename to src/lib/xchange/xchange.h
diff --git a/xchange/xchange_2fields.c b/src/lib/xchange/xchange_2fields.c
similarity index 100%
rename from xchange/xchange_2fields.c
rename to src/lib/xchange/xchange_2fields.c
diff --git a/xchange/xchange_2fields.h b/src/lib/xchange/xchange_2fields.h
similarity index 100%
rename from xchange/xchange_2fields.h
rename to src/lib/xchange/xchange_2fields.h
diff --git a/xchange/xchange_deri.c b/src/lib/xchange/xchange_deri.c
similarity index 100%
rename from xchange/xchange_deri.c
rename to src/lib/xchange/xchange_deri.c
diff --git a/xchange/xchange_deri.h b/src/lib/xchange/xchange_deri.h
similarity index 100%
rename from xchange/xchange_deri.h
rename to src/lib/xchange/xchange_deri.h
diff --git a/xchange/xchange_field.c b/src/lib/xchange/xchange_field.c
similarity index 100%
rename from xchange/xchange_field.c
rename to src/lib/xchange/xchange_field.c
diff --git a/xchange/xchange_field.h b/src/lib/xchange/xchange_field.h
similarity index 100%
rename from xchange/xchange_field.h
rename to src/lib/xchange/xchange_field.h
diff --git a/xchange/xchange_gauge.c b/src/lib/xchange/xchange_gauge.c
similarity index 100%
rename from xchange/xchange_gauge.c
rename to src/lib/xchange/xchange_gauge.c
diff --git a/xchange/xchange_gauge.h b/src/lib/xchange/xchange_gauge.h
similarity index 100%
rename from xchange/xchange_gauge.h
rename to src/lib/xchange/xchange_gauge.h
diff --git a/xchange/xchange_halffield.c b/src/lib/xchange/xchange_halffield.c
similarity index 100%
rename from xchange/xchange_halffield.c
rename to src/lib/xchange/xchange_halffield.c
diff --git a/xchange/xchange_halffield.h b/src/lib/xchange/xchange_halffield.h
similarity index 100%
rename from xchange/xchange_halffield.h
rename to src/lib/xchange/xchange_halffield.h
diff --git a/xchange/xchange_lexicfield.c b/src/lib/xchange/xchange_lexicfield.c
similarity index 100%
rename from xchange/xchange_lexicfield.c
rename to src/lib/xchange/xchange_lexicfield.c
diff --git a/xchange/xchange_lexicfield.h b/src/lib/xchange/xchange_lexicfield.h
similarity index 100%
rename from xchange/xchange_lexicfield.h
rename to src/lib/xchange/xchange_lexicfield.h

From 44b5c0b443850813894e93a21435ea9eb774563f Mon Sep 17 00:00:00 2001
From: Mathieu Taillefumier <mathieu.taillefumier@free.fr>
Date: Mon, 9 Feb 2026 09:11:11 +0100
Subject: [PATCH 09/80] Use TM_BLA for #ifdef flags

- Moved the git hash string to a C file
- Removed the CRAY keyword
- Moved test files into a separate directory
- Namespaced all #ifdef variables as TM_XXX
- Moved the profiling directory to the root
- Updated the homepage URL
---
 CMakeLists.txt                                |  160 +-
 Makefile.global                               |   64 -
 Makefile.in                                   |  167 --
 Makefile.tests                                |   64 -
 cmake/FindDDAlphaAMG.cmake                    |   29 +
 cmake/{git_hash.h.in => git_hash.c.in}        |    2 +-
 cmake/tmlqcd_config_internal.h.in             |   58 +-
 cmake_includes.txt                            |  425 ----
 config.guess                                  | 1701 -------------
 config.sub                                    | 1855 --------------
 configure.in                                  |  737 ------
 .../lib/profiling => profiling}/hmc/Readme.md |    0
 .../hmc/example_profile.pdf                   |  Bin
 .../profiling => profiling}/hmc/profile.Rmd   |    0
 {src/lib/profiling => profiling}/hmc/timing.R |    0
 .../hmc_mk2/.gitignore                        |    0
 .../profiling => profiling}/hmc_mk2/README.md |    0
 .../hmc_mk2/logs/example_log.out              |    4 +-
 .../hmc_mk2/make_profile.R                    |    0
 .../hmc_mk2/profile.Rmd                       |    0
 qphix_base_classes.hpp                        |  771 ------
 qphix_interface.cpp                           | 2192 -----------------
 qphix_interface.hpp                           |   51 -
 qphix_interface_utils.hpp                     |   33 -
 src/bin/LapH_ev.c                             |   20 +-
 src/bin/benchmark.c                           |   46 +-
 src/bin/deriv_mg_tune.c                       |   12 +-
 src/bin/hmc_tm.c                              |   16 +-
 src/bin/invert.c                              |   14 +-
 src/bin/offline_measurement.c                 |   10 +-
 src/bin/{ => tests}/check_locallity.c         |   12 +-
 src/bin/{ => tests}/hopping_test.c            |   40 +-
 src/bin/{ => tests}/qphix_test_Dslash.c       |   12 +-
 src/bin/{ => tests}/scalar_prod_r_test.c      |    0
 src/bin/{ => tests}/test_eigenvalues.c        |   12 +-
 src/bin/{ => tests}/test_lemon.c              |    4 +-
 src/lib/CMakeLists.txt                        |   20 +-
 src/lib/DDalphaAMG_interface.c                |   60 +-
 src/lib/DDalphaAMG_interface.h                |    2 +-
 .../utils_generic_exchange.blocking.inc       |   12 +-
 src/lib/buffers/utils_generic_exchange.c      |   12 +-
 .../utils_generic_exchange.nonblocking.inc    |   16 +-
 src/lib/deriv_Sb.c                            |   22 +-
 src/lib/deriv_Sb_D_psi.c                      |    4 +-
 .../lib/fixed_volume.h.in                     |    0
 src/lib/geometry_eo.c                         |  148 +-
 src/lib/get_rectangle_staples.c               |    4 +-
 src/lib/get_staples.c                         |   12 +-
 src/lib/gettime.c                             |    4 +-
 src/lib/git_hash.h                            |    6 +
 src/lib/global.h                              |    6 +-
 src/lib/init/init_dirac_halfspinor.c          |   48 +-
 src/lib/init/init_gauge_field.c               |    8 +-
 src/lib/init/init_geometry_indices.c          |    4 +-
 src/lib/init/init_parallel.h                  |    4 +-
 src/lib/init/init_spinor_field.c              |   16 +-
 src/lib/invert_clover_eo.c                    |    6 +-
 src/lib/invert_doublet_eo.c                   |    6 +-
 src/lib/invert_eo.c                           |    4 +-
 src/lib/io/gauge_read.c                       |    4 +-
 src/lib/io/gauge_read_binary.c                |    6 +-
 src/lib/io/gauge_write_binary.c               |    6 +-
 src/lib/io/selector.h                         |    8 +-
 src/lib/io/spinor_read_binary.c               |   12 +-
 src/lib/io/spinor_write_binary.c              |   12 +-
 src/lib/io/spinor_write_propagator_type.c     |    8 +-
 src/lib/io/spinor_write_source_format.c       |    8 +-
 src/lib/io/utils_construct_reader.c           |   14 +-
 src/lib/io/utils_construct_writer.c           |    8 +-
 src/lib/io/utils_destruct_reader.c            |    6 +-
 src/lib/io/utils_destruct_writer.c            |    6 +-
 src/lib/io/utils_kill_with_error.c            |    4 +-
 src/lib/io/utils_write_first_message.c        |   24 +-
 src/lib/io/utils_write_header.c               |    8 +-
 src/lib/io/utils_write_message.c              |    8 +-
 src/lib/linalg/blas.h                         |    4 +-
 src/lib/linalg/lapack.h                       |    2 +-
 src/lib/little_D.c                            |    8 +-
 src/lib/meas/polyakov_loop.c                  |   10 +-
 src/lib/measure_gauge_action.c                |    2 -
 src/lib/mpi_init.c                            |   74 +-
 src/lib/mpi_init.h                            |    4 +-
 src/lib/operator.c                            |    2 +-
 src/lib/operator/D_psi_body.c                 |    2 +-
 src/lib/operator/Hopping_Matrix.c             |   14 +-
 src/lib/operator/Hopping_Matrix_32.c          |   10 +-
 src/lib/operator/Hopping_Matrix_nocom.c       |    4 +-
 src/lib/operator/halfspinor_body.c            |    4 +-
 src/lib/operator/hopping_body_dbl.c           |   20 +-
 src/lib/operator/hopping_sgl.c                |   18 +-
 src/lib/operator/tm_sub_Hopping_Matrix.c      |    8 +-
 src/lib/operator/tm_times_Hopping_Matrix.c    |   10 +-
 src/lib/overrelaxation.c                      |    2 +-
 src/lib/parallel_io.h                         |    4 +-
 src/lib/read_input.l                          |   44 +-
 src/lib/solver/cg_her.c                       |    2 +-
 src/lib/solver/cg_her_nd.c                    |    2 +-
 src/lib/solver/cr.c                           |    2 +-
 src/lib/solver/diagonalise_general_matrix.c   |    2 +-
 src/lib/solver/dirac_operator_eigenvectors.c  |   20 +-
 src/lib/solver/dirac_operator_eigenvectors.h  |    6 +-
 src/lib/solver/eigenvalues.c                  |    2 +-
 src/lib/solver/fgmres.c                       |    2 +-
 src/lib/solver/fgmres4complex_body.c          |    2 +-
 src/lib/solver/gmres_dr.c                     |    2 +-
 src/lib/solver/gram-schmidt.c                 |    6 +-
 src/lib/solver/mcr.c                          |    2 +-
 src/lib/solver/monomial_solve.c               |   10 +-
 src/lib/solver/solver_field.c                 |    8 +-
 src/lib/spinor_fft.c                          |   10 +-
 src/lib/test/Makefile                         |   88 -
 src/lib/test/check_geometry.c                 |   30 +-
 src/lib/test/check_overlap.c                  |   18 +-
 src/lib/test/check_xchange.c                  |   68 +-
 src/lib/test/measure_rectangles.debug.c       |    4 +-
 src/lib/update_backward_gauge.c               |    2 +-
 src/lib/update_gauge.c                        |    8 +-
 src/lib/update_momenta_fg.c                   |    8 +-
 src/lib/update_tm.c                           |    8 +-
 src/lib/util/io.c                             |    2 +-
 src/lib/util/laguer/Makefile                  |    9 -
 src/lib/util/oox/Makefile                     |   46 -
 src/lib/wrapper/lib_wrapper.c                 |    6 +-
 src/lib/xchange/xchange_2fields.c             |   16 +-
 src/lib/xchange/xchange_2fields.h             |    2 +-
 src/lib/xchange/xchange_deri.c                |   24 +-
 src/lib/xchange/xchange_field.c               |   70 +-
 src/lib/xchange/xchange_gauge.c               |   36 +-
 src/lib/xchange/xchange_halffield.c           |   68 +-
 src/lib/xchange/xchange_lexicfield.c          |   80 +-
 130 files changed, 893 insertions(+), 9101 deletions(-)
 delete mode 100644 Makefile.global
 delete mode 100644 Makefile.in
 delete mode 100644 Makefile.tests
 create mode 100644 cmake/FindDDAlphaAMG.cmake
 rename cmake/{git_hash.h.in => git_hash.c.in} (62%)
 delete mode 100644 cmake_includes.txt
 delete mode 100644 config.guess
 delete mode 100644 config.sub
 delete mode 100644 configure.in
 rename {src/lib/profiling => profiling}/hmc/Readme.md (100%)
 rename {src/lib/profiling => profiling}/hmc/example_profile.pdf (100%)
 rename {src/lib/profiling => profiling}/hmc/profile.Rmd (100%)
 rename {src/lib/profiling => profiling}/hmc/timing.R (100%)
 rename {src/lib/profiling => profiling}/hmc_mk2/.gitignore (100%)
 rename {src/lib/profiling => profiling}/hmc_mk2/README.md (100%)
 rename {src/lib/profiling => profiling}/hmc_mk2/logs/example_log.out (99%)
 rename {src/lib/profiling => profiling}/hmc_mk2/make_profile.R (100%)
 rename {src/lib/profiling => profiling}/hmc_mk2/profile.Rmd (100%)
 delete mode 100644 qphix_base_classes.hpp
 delete mode 100644 qphix_interface.cpp
 delete mode 100644 qphix_interface.hpp
 delete mode 100644 qphix_interface_utils.hpp
 rename src/bin/{ => tests}/check_locallity.c (98%)
 rename src/bin/{ => tests}/hopping_test.c (94%)
 rename src/bin/{ => tests}/qphix_test_Dslash.c (99%)
 rename src/bin/{ => tests}/scalar_prod_r_test.c (100%)
 rename src/bin/{ => tests}/test_eigenvalues.c (98%)
 rename src/bin/{ => tests}/test_lemon.c (99%)
 rename fixed_volume.h.in => src/lib/fixed_volume.h.in (100%)
 create mode 100644 src/lib/git_hash.h
 delete mode 100644 src/lib/test/Makefile
 delete mode 100644 src/lib/util/laguer/Makefile
 delete mode 100644 src/lib/util/oox/Makefile

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9dc9f71f2..39adba1c5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 3.24)
 project(
   tmlqcd
   DESCRIPTION "tmlQCD"
-  HOMEPAGE_URL "http://www.itkp.uni-bonn.de/~urbach/software.html"
+  HOMEPAGE_URL "https://github.com/etmc/tmLQCD"
   VERSION "6.0.0"
   LANGUAGES C CXX)
 
@@ -80,26 +80,22 @@ option(TM_USE_FFTW "Enable fftw support" OFF)
 option(TM_USE_MPI "Enable MPI support" OFF)
 option(TM_USE_CUDA "Enable QUDA support" OFF)
 option(TM_USE_HIP "Enable HIP support" OFF)
-option(TM_USE_DDALPHAAMG "Enable DDalphaAMG support" OFF)
-option(TM_USE_OPENMP "Enable openMP" ON)
-option(TM_FIXED_VOLUME "fix volume at compile time" OFF)
-set(
-  TM_ENABLE_ALIGNMENT
-  "auto"
-  CACHE STRING   "Automatically or expliclty align arrays to byte number. auto, none, 16, 32, 64")
-
-set_property(
-  CACHE TM_ENABLE_ALIGNMENT
-  PROPERTY STRINGS
-  "auto"
-  "none"
-  "16"
-  "32"
-  "64")
+option(TM_USE_DDalphaAMG "Enable DDalphaAMG support" OFF)
+option(TM_USE_OMP "Enable openMP" ON)
+option(TM_FIXEDVOLUME "fix volume at compile time" OFF)
+set(TM_ENABLE_ALIGNMENT
+    "auto"
+    CACHE
+      STRING
+      "Automatically or expliclty align arrays to byte number. auto, none, 16, 32, 64"
+)
+
+set_property(CACHE TM_ENABLE_ALIGNMENT PROPERTY STRINGS "auto" "none" "16" "32"
+                                                "64")
 
 option(TM_BGL_DRAM "use BGL dram window (BGL only!)" ON)
 option(TM_USE_OPTIMIZATION "enable optimisation" ON)
-option(TM_USE_GAUGE_COPY "Enable use of a copy of the gauge field" ON)
+option(TM_USE_GAUGECOPY "Enable use of a copy of the gauge field" ON)
 option(TM_USE_HALFSPINOR "Use a Dirac Op. with halfspinor exchange" ON)
 option(TM_USE_TSPLITPAR "Enable timeslice-splitted communications" ON)
 option(TM_USE_QPHIX "enable QPhiX" OFF)
@@ -110,8 +106,8 @@ option(TM_ENABLE_WARNINGS "Enable all warnings" ON)
 
 # MPI dependent options
 cmake_dependent_option(
-  TM_PERSISTENT_MPI "Use persistent MPI calls for halfspinor [default=no]"
-  OFF "TM_USE_MPI" OFF)
+  TM_PERSISTENT_MPI "Use persistent MPI calls for halfspinor [default=no]" OFF
+  "TM_USE_MPI" OFF)
 cmake_dependent_option(
   TM_NONBLOCKING_MPI "Use non-blocking MPI calls for spinor and gaug" ON
   "TM_USE_MPI" OFF)
@@ -126,15 +122,15 @@ cmake_dependent_option(TM_USE_CUDA_HIP "Enable CUDA support in HIP" OFF
                        "TM_USE_HIP" OFF)
 
 # clime and lemon depend on MPI
-cmake_dependent_option(TM_USE_LEMON "Use the lemon io library" OFF
-                       "TM_USE_MPI" ON)
+cmake_dependent_option(TM_USE_LEMON "Use the lemon io library" OFF "TM_USE_MPI"
+                       ON)
 
 # GPU dependent options
 cmake_dependent_option(TM_USE_QUDA_EXPERIMENTAL "Enable QUDA support" ON
                        "TM_USE_QUDA" OFF)
 cmake_dependent_option(
-  TM_QUDA_FERMIONIC_FORCES "Enable support for fermionic forces using QUDA"
-  ON "TM_USE_QUDA" OFF)
+  TM_QUDA_FERMIONIC_FORCES "Enable support for fermionic forces using QUDA" ON
+  "TM_USE_QUDA" OFF)
 
 cmake_dependent_option(TM_USE_NVHPC "Enable Nvidia HPC toolkit" OFF
                        "TM_USE_CUDA" OFF)
@@ -143,7 +139,7 @@ cmake_dependent_option(TM_USE_NVHPC "Enable Nvidia HPC toolkit" OFF
 find_package(BLAS REQUIRED)
 #
 find_package(LAPACK REQUIRED)
-set(HAVE_LAPACK ON)
+set(TM_LAPACK ON)
 find_package(FLEX REQUIRED)
 # do we need bison ?
 find_package(BISON REQUIRED)
@@ -154,77 +150,51 @@ set(PACKAGE_TARNAME "tmlqcd")
 set(PACKAGE_BUGREPORT "curbach@gmx.de")
 set(PACKAGE_STRING "${PROJECT_DESCRIPTION} ${PROJECT_VERSION}")
 
-unset(TM_USE_MPI)
-unset(TM_USE_OMP)
-unset(HAVE_LIBLEMON)
-unset(HAVE_LIBLIME)
-unset(FIXEDVOLUME)
-unset(_PERSISTENT)
-unset(_NON_BLOCKING)
-unset(HAVE_LIBQUDA)
-unset(TM_USE_QUDA)
-unset(TM_QUDA_EXPERIMENTAL)
-unset(TM_QUDA_FERMIONIC_FORCES)
-unset(DDalphaAMG)
-unset(TM_USE_QPHIX)
-unset(QPHIX_SOALEN)
-unset(_NEW_GEOMETRY)
-unset(_NON_BLOCKING)
-unset(_USE_SHMEM)
-unset(_USE_HALFSPINOR)
 set(ALIGN " ")
 set(ALIGN_BASE "0")
 set(ALIGN_BASE32 "0")
 set(ALIGN32 " ")
 
 message("${TM_ENABLE_ALIGNMENT}")
-if (${TM_ENABLE_ALIGNMENT} STREQUAL "auto")
+if(${TM_ENABLE_ALIGNMENT} STREQUAL "auto")
   set(ALIGN_BASE "0x00")
   set(ALIGN " ")
   set(ALIGN_BASE32 "0x00")
   set(ALIGN32 " ")
-elseif (TM_ENABLE_ALIGNMENT EQUAL 16)
+elseif(TM_ENABLE_ALIGNMENT EQUAL 16)
   set(ALIGN_BASE "0x0F")
   set(ALIGN "__attribute__ ((aligned (16)))")
   set(ALIGN_BASE32 "0x0F")
   set(ALIGN32 "__attribute__ ((aligned (16)))")
-elseif (TM_ENABLE_ALIGNMENT EQUAL 32)
+elseif(TM_ENABLE_ALIGNMENT EQUAL 32)
   set(ALIGN_BASE "0x2F")
   set(ALIGN "__attribute__ ((aligned (32)))")
   set(ALIGN_BASE32 "0x2F")
   set(ALIGN32 "__attribute__ ((aligned (32)))")
-elseif (TM_ENABLE_ALIGNMENT EQUAL 64)
+elseif(TM_ENABLE_ALIGNMENT EQUAL 64)
   set(ALIGN_BASE "0x3F")
   set(ALIGN "__attribute__ ((aligned (64)))")
   set(ALIGN_BASE32 "0x3F")
   set(ALIGN32 "__attribute__ ((aligned (64)))")
 else()
-  message(FATAL_ERROR "Unusable value for array alignment. Allowed values are: auto, none, 16, 32, 64")
-endif()
-
-if(TM_USE_HALFSPINOR)
-  set(_USE_HALFSPINOR ON)
-endif()
-
-if(TM_FIXED_VOLUME)
-  set(FIXEDVOLUME ON)
-endif()
-
-if(TM_PERSISTENT_MPI)
-  set(_PERSISTENT ON)
+  message(
+    FATAL_ERROR
+      "Unusable value for array alignment. Allowed values are: auto, none, 16, 32, 64"
+  )
 endif()
 
 if(TM_USE_MPI)
   find_package(MPI REQUIRED)
-  set(TM_USE_MPI ON)
   if(TM_NONBLOCKING_MPI)
-    set(_NON_BLOCKING ON)
+    set(TM_NONBLOCKING ON)
+  endif()
+  if(TM_PERSISTENT_MPI)
+    set(TM_PERSISTENT ON)
   endif()
 endif()
 
-if(TM_USE_OPENMP)
+if(TM_USE_OMP)
   find_package(OpenMP REQUIRED COMPONENTS C CXX)
-  set(TM_USE_OMP ON)
 endif()
 
 if(TM_USE_HDF5)
@@ -233,24 +203,23 @@ endif()
 
 if(TM_USE_LEMON)
   find_package(Clemon REQUIRED)
-  set(HAVE_LIBLEMON ON)
 endif()
 
 find_package(CLime REQUIRED)
-set(HAVE_LIBLIME ON)
+set(TM_USE_LIME ON)
 
 if(TM_USE_QUDA)
   find_package(QUDA REQUIRED config)
-  set(HAVE_LIBQUDA ON)
   if(TM_USE_QUDA_EXPERIMENTAL)
     set(TM_QUDA_EXPERIMENTAL ON)
   endif()
   if(TM_QUDA_FERMIONIC_FORCES)
     set(TM_QUDA_FERMIONIC_FORCES ON)
   endif()
-  if(TM_USE_CUDA OR TM_USE_HIP)
-    set(TM_USE_QUDA ON)
-  endif()
+endif()
+
+if(TM_USE_SHMEM) # reminder only: this block does not locate or link SHMEM
+  message(STATUS "SHMEM needs to be included")
 endif()
 
 if(TM_USE_CUDA AND TM_USE_HIP)
@@ -268,7 +237,6 @@ if(TM_USE_CUDA OR QUDA_TARGET_CUDA)
   endif()
 endif()
 
-message("QUDA_TARGET: ${QUDA_TARGET_CUDA}")
 if(TM_USE_HIP OR QUDA_TARGET_HIP)
   enable_language(hip)
 
@@ -285,20 +253,15 @@ if(TM_USE_HIP OR QUDA_TARGET_HIP)
   endif()
 endif()
 
-if(TM_USE_SHMEM)
-  set(_USE_SHMEM ON)
-endif()
-
 if(TM_USE_QPIHX)
   find_package(QPhiX REQUIRED)
   if(NOT TARGET tmlqcd::qphix)
     add_library(tmlqcd::qphix INTERFACE IMPORTED)
     set_target_properties(tmlqcd::qphix PROPERTIES INTERFACE_LINK_LIBRARIES
-      "${QPHIX_LIBRARIES}")
+                                                   "${QPHIX_LIBRARIES}")
     set_target_properties(tmlqcd::qphix PROPERTIES INTERFACE_INCLUDE_DIRECTORIES
-      "${QPHIX_INCLUDE_DIRS}")
+                                                   "${QPHIX_INCLUDE_DIRS}")
   endif()
-  set(TM_USE_QPHIX ON)
 endif()
 
 # check for fftw3 (rely on pkgconfig).
@@ -309,57 +272,60 @@ if(TM_USE_FFTW)
   endif()
 endif()
 
+if(TM_USE_DDalphaAMG)
+  find_package(DDAlphaAMG REQUIRED)
+endif()
+
 # gprofiler
 
-if (TM_USE_GPROF)
+if(TM_USE_GPROF)
   set(PROFILE_FLAGS "-pg;-g")
-  if (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "powerpc|powerpc64")
+  if("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "^(powerpc|powerpc64)$") # alternation needs MATCHES
     list(APPEND PROFILE_FLAGS "-qfullpath")
   endif()
   add_compile_options($<BOOL:$<COMPILE_LANGUAGE:C>:$PROFILE_FLAGS>)
 endif()
 
-if (TM_ENABLE_WARNINGS)
-  add_compile_options(
-    $<$<COMPILE_LANG_AND_ID:C,GNU>:-Wall>
-    $<$<COMPILE_LANG_AND_ID:CXX,GNU>:-Wall>)
+if(TM_ENABLE_WARNINGS)
+  add_compile_options($<$<COMPILE_LANG_AND_ID:C,GNU>:-Wall>
+                      $<$<COMPILE_LANG_AND_ID:CXX,GNU>:-Wall>)
 endif()
 
 # check for the presence of clock_gettime in libc or librt
-check_symbol_exists(clock_gettime "time.h" HAVE_CLOCK_GETTIME)
-check_library_exists(rt clock_gettime "" HAVE_CLOCK_GETTIME_IN_RT)
-check_function_exists(fseeko HAVE_FSEEKO)
+check_symbol_exists(clock_gettime "time.h" TM_CLOCK_GETTIME)
+check_library_exists(rt clock_gettime "" TM_CLOCK_GETTIME_IN_RT)
+check_function_exists(fseeko TM_FSEEKO)
 
 # set the parallelization
 
 if(TM_USE_MPI)
   if(TM_MPI_DIMENSION EQUAL "1")
     # T parallelisation
-    set(PARALLELT ON)
+    set(TM_PARALLELT ON)
   elseif(TM_MPI_DIMENSION EQUAL "2")
     # XT parallelisation
-    set(PARALLELXT ON)
+    set(TM_PARALLELXT ON)
   elseif(TM_MPI_DIMENSION EQUAL "3")
-    set(PARALLELXYT ON)
+    set(TM_PARALLELXYT ON)
     # XYZ parallelisation
   elseif(TM_MPI_DIMENSION EQUAL "4")
     # timeslice-splitted communications
-    set(PARALLELXYZT ON)
+    set(TM_PARALLELXYZT ON)
   elseif(TM_MPI_DIMENSION EQUAL "X")
-    set(PARALLELX ON)
+    set(TM_PARALLELX ON)
   elseif(TM_MPI_DIMENSION EQUAL "XY")
-    set(PARALLELXY ON)
+    set(TM_PARALLELXY ON)
   elseif(TM_MPI_DIMENSION EQUAL "XYZ")
-    set(PARALLELXYZ ON)
+    set(TM_PARALLELXYZ ON)
   else()
-    set(PARALLELXYZT ON)
+    set(TM_PARALLELXYZT ON)
   endif()
 endif()
 
 # keep the autotool config.h header.
 configure_file("${PROJECT_SOURCE_DIR}/cmake/tmlqcd_config_internal.h.in"
                "${PROJECT_BINARY_DIR}/tmlqcd_config_internal.h" @ONLY)
-configure_file("${PROJECT_SOURCE_DIR}/fixed_volume.h.in"
+configure_file("${PROJECT_SOURCE_DIR}/src/lib/fixed_volume.h.in"
                "${PROJECT_BINARY_DIR}/fixed_volume.h" @ONLY)
 # check if git command exists
 find_program(GIT_EXE NAMES git)
@@ -385,6 +351,6 @@ else()
   )
 endif()
 
-configure_file(cmake/git_hash.h.in git_hash.h @ONLY)
+configure_file(cmake/git_hash.c.in git_hash.c @ONLY)
 add_subdirectory(src/lib)
 add_subdirectory(src/bin)
diff --git a/Makefile.global b/Makefile.global
deleted file mode 100644
index dc1eefcf1..000000000
--- a/Makefile.global
+++ /dev/null
@@ -1,64 +0,0 @@
-# This Makefile is included from the other Makefiles
-# It contains some overall targets...
-
-# refresh Makefile and other stuff
-
-
-
-PROGRAMS_WITH_GIT_HASH := hmc_tm invert offline_measurement test_Dslash deriv_mg_tune
-
-.SUFFIXES:
-
-Makefile: ${top_srcdir}/Makefile.global $(srcdir)/Makefile.in $(abs_top_builddir)/config.status 
-	cd $(abs_top_builddir) \
-	  && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status
-
-$(abs_top_builddir)/config.status: $(top_srcdir)/configure
-	( cd ${abs_top_builddir} && $(SHELL) ./config.status --recheck ) 
-
-$(abs_top_builddir)/include/tmlqcd_config_internal.h: $(top_srcdir)/include/tmlqcd_config_internal.h.in $(abs_top_builddir)/config.status $(top_srcdir)/configure
-	( cd ${abs_top_builddir} && $(SHELL) ./config.status --header=include/tmlqcd_config_internal.h )
-
-# rebuild configure if configure.in changes but ignore errors
-# on many machines some of the macros fail to be recognized
-# but the resulting configure still works
-$(top_srcdir)/configure: $(top_srcdir)/configure.in 
-	-( cd $(top_srcdir) && $(AUTOCONF) )
-
-#dep rules
-
-# PROGRAMS_WITH_GIT_HASH require git_hash.h which is dynamically built by a phony make target
-# to prevent too frequent building of git_hash (slowing down the build)
-# we filter the list of all objects and treat these separately
-$(addsuffix .d, $(filter-out ${PROGRAMS_WITH_GIT_HASH},${ALLOBJ})): %.d: ${srcdir}/%.c Makefile
-	@ $(CCDEP) ${DEPFLAGS} ${DEFS} ${INCLUDES} $< > $@
-$(addsuffix .d, $(filter-out ${PROGRAMS_WITH_GIT_HASH},${CXXMODULES})): %.d: ${srcdir}/%.cpp Makefile
-	@ $(CXXDEP) ${CXXDEPFLAGS} ${DEFS} ${INCLUDES} $< > $@
-	
-# dirty hack to prevent make from entering an infinite loop because a phony target is given as a real
-# dependency (make will build invert.d and hmc_tm.d indefinitely)
-# when git_hash.h does not exist (as checked using wildcard) it is given as a dependency of invert.d and hmc_tm.d
-# once it exists, this is no longer the case
-# while this does break updating of git_hash.h while the dependencies are built, this is quite
-# irrelevant because it will be rebuilt during the compilation of either invert or hmc_tm
-ifneq (git_hash.h, $(findstring git_hash.h,$(wildcard $(top_srcdir)/git_has*.h)))
-$(addsuffix .d, $(filter ${PROGRAMS_WITH_GIT_HASH},${ALLOBJ})): %.d: ${srcdir}/%.c ${top_srcdir}/git_hash.h Makefile
-	@ $(CCDEP) ${DEPFLAGS} ${DEFS} ${INCLUDES} $< > $@
-else
-$(addsuffix .d, $(filter ${PROGRAMS_WITH_GIT_HASH},${ALLOBJ})): %.d: ${srcdir}/%.c Makefile
-	@ $(CCDEP) ${DEPFLAGS} ${DEFS} ${INCLUDES} $< > $@
-endif
-
-${top_builddir}/fixed_volume.h: ${top_srcdir}/fixed_volume.h.in ${top_builddir}/config.status
-	cd ${abs_top_builddir} && CONFIG_FILES=fixed_volume.h CONFIG_HEADERS= $(SHELL) ${top_builddir}/config.status
-
-all-recursive all-debug-recursive all-profile-recursive clean-recursive distclean-recursive compile-clean-recursive: Makefile
-	@set fnord ${MAKEFLAGS}; amf=$$2; \
-	dot_seen=no; \
-	target=`echo $@ | sed s/-recursive//`; \
-	list='$(SUBDIRS)'; for subdir in $$list; do \
-	  echo "Making $$target in $$subdir"; \
-	  local_target="$$target"; \
-	  ( cd $$subdir && $(MAKE) $$local_target ) \
-	    || case "$$amf" in *=*) exit 1;; *k*) fail=yes;; *) exit 1;; esac; \
-	done; test -z "$$fail";
diff --git a/Makefile.in b/Makefile.in
deleted file mode 100644
index 51437ff05..000000000
--- a/Makefile.in
+++ /dev/null
@@ -1,167 +0,0 @@
-srcdir = @srcdir@
-top_srcdir = @top_srcdir@
-abs_top_srcdir = @abs_top_srcdir@
-top_builddir =  .
-abs_top_builddir = @abs_top_builddir@
-builddir = @builddir@
-prefix = @prefix@
-exec_prefix = @exec_prefix@
-bindir = @bindir@
-program_transform_name = @program_transform_name@
-subdir = .
-
-AR = @AR@
-RANLIB = @RANLIB@
-CC = @CC@
-CXX = @CXX@
-CCDEP = @CCDEP@
-CXXDEP = @CXXDEP@
-CFLAGS = @CFLAGS@
-CXXFLAGS = @CXXFLAGS@
-LDFLAGS = @LDFLAGS@
-DEPFLAGS = @DEPFLAGS@
-CXXDEPFLAGS = @CXXDEPFLAGS@
-CPPFLAGS = @CPPFLAGS@
-CCLD = @CCLD@
-LEX = @LEX@
-AUTOCONF = @AUTOCONF@
-LIBS = @LIBS@
-SHELL = @SHELL@
-OPTARGS = @OPTARGS@
-SOPTARGS = @SOPTARGS@
-DEFS = @DEFS@
-USESUBDIRS = @USESUBDIRS@
-NVCC = @NVCC@
-GPUMPICOMPILER = @GPUMPICOMPILER@
-
-INCLUDES = @INCLUDES@
-LINK = $(CCLD) -o $@ ${LDFLAGS}
-
-COMPILE = ${CC} ${DEFS} ${INCLUDES} -o $@ ${CFLAGS}
-CXXCOMPILE = ${CXX} ${DEFS} ${INCLUDES} -o $@ ${CXXFLAGS} ${LDFLAGS}
-
-SMODULES = 
-
-MODULES = read_input gamma measure_gauge_action start \
-	expo matrix_utils get_staples update_backward_gauge \
-	measure_rectangles get_rectangle_staples  \
-	test/check_geometry test/check_xchange \
-	test/overlaptests \
-	invert_eo invert_doublet_eo update_gauge \
-	getopt sighandler reweighting_factor \
-	source_generation boundary update_tm ranlxd  \
-	mpi_init deriv_Sb deriv_Sb_D_psi ranlxs \
-	geometry_eo invert_overlap aligned_malloc \
-	prepare_source chebyshev_polynomial_nd Ptilde_nd  \
-	reweighting_factor_nd rnd_gauge_trafo \
-        update_momenta update_momenta_fg integrator  phmc \
-	little_D block operator \
-	spinor_fft \
-	fatal_error invert_clover_eo gettime \
-	tm_debug_printf compare_derivative \
-        @QUDA_INTERFACE@ @DDalphaAMG_INTERFACE@
-
-CXXMODULES = @QPHIX_INTERFACE@
-
-NOOPTMOD = test/check_xchange test/check_geometry
-
-PROGRAMS = hmc_tm benchmark invert gen_sources  \
-	check_locallity test_lemon hopping_test \
-	offline_measurement deriv_mg_tune @QPHIX_PROGRAMS@
-
-ALLOBJ = ${MODULES} ${PROGRAMS} ${SMODULES}
-SUBDIRS = ${USESUBDIRS}
-
-# delete the default suffix rules
-.SUFFIXES:
-
-# need to build modules before subdirs!
-all: Makefile dep $(SUBDIRS) hmc_tm invert benchmark offline_measurement deriv_mg_tune @QPHIX_PROGRAMS@
-
-$(SUBDIRS):
-	$(MAKE) --directory=$@
-
-# run the GIT-VERSION-GEN script to generate version information in git_hash.h
-# making sure that we run in the correct directory
-${top_srcdir}/git_hash.h:
-	@ ( cd @srcdir@ && sh GIT-VERSION-GEN )
-
--include $(addsuffix .d,$(ALLOBJ))
--include $(addsuffix .d,$(CXXMODULES))
-
-include ${top_srcdir}/Makefile.global
-
-# follow https://www.owlfolio.org/possibly-useful/flex-input-scanner-rules-are-too-complicated/
-# and pass the -Ca option such that more than 32k "NFA" states are allowed
-# our ruleset is so complicated that this has become necessary!
-${top_srcdir}/read_input.c: ${top_srcdir}/read_input.l
-ifneq (,$(findstring lex,${LEX}))
-	${LEX} -Ca -Ptmlqcd -i -t ${top_srcdir}/read_input.l > ${top_srcdir}/read_input.c
-else
-	$(error Unable to find (f)lex, read_input.c not built. Please install (f)lex!)
-endif
-
-libhmc.a: ${addsuffix .o, ${MODULES} ${SMODULES}} Makefile
-	@rm -f libhmc.a
-	@${AR} cru libhmc.a ${addsuffix .o, ${MODULES} ${SMODULES}}
-	@$(RANLIB) libhmc.a
-	@cp libhmc.a ${top_builddir}/lib/libhmc.a
-
-$(addsuffix .o,$(filter-out ${NOOPTMOD},${MODULES})): %.o: ${srcdir}/%.c %.d Makefile $(abs_top_builddir)/include/tmlqcd_config_internal.h
-	${COMPILE} ${OPTARGS} -c $<
-
-#here we don't need optimisation
-$(addsuffix .o,$(filter ${NOOPTMOD},${MODULES})): %.o: ${srcdir}/%.c %.d Makefile $(abs_top_builddir)/include/tmlqcd_config_internal.h
-	${COMPILE} -c $<
-
-${addsuffix .o, ${SMODULES}}: %.o: ${srcdir}/%.c %.d Makefile $(abs_top_builddir)/include/tmlqcd_config_internal.h
-	${COMPILE} ${SOPTARGS} -c $<
-
-# C++ modules
-$(addsuffix .o,${CXXMODULES}): %.o: ${srcdir}/%.cpp %.d Makefile $(abs_top_builddir)/include/tmlqcd_config_internal.h
-	${CXXCOMPILE} -c $<
-	
-${addsuffix .o, ${PROGRAMS}}: %.o: ${srcdir}/%.c %.d Makefile $(abs_top_builddir)/include/tmlqcd_config_internal.h ${top_srcdir}/git_hash.h
-	${COMPILE} ${OPTARGS} -c $<
-
-${PROGRAMS}: %: %.o libhmc.a $(SUBDIRS) $(addsuffix .o,${CXXMODULES})
-	 ${LINK} $@.o $(addsuffix .o,${CXXMODULES}) $(GPUOBJECTS) $(GPUOBJECTS_C) $(LIBS) ${LDFLAGS}
-
-
-# The rules for unit tests are kept in a separate file for tidyness
-include ${top_srcdir}/Makefile.tests
-
-dep: $(addsuffix .d,$(ALLOBJ)) $(addsuffix .d,$(CXXMODULES))
-	@ echo "...dependency files built"
-
-install: Makefile
-	@mkdir -p $(bindir); \
-	for p in hmc_tm invert benchmark offline_measurement deriv_mg_tune; do \
-	  progname=`echo $$p | sed '$(program_transform_name)'`; \
-	  echo "Installing $$p as $$progname in $(bindir)..."; \
-	  cp $$p $(bindir)/$$progname; \
-	done; \
-	echo "done";
-
-uninstall: Makefile
-	for p in hmc_tm invert benchmark offline_measurement deriv_mg_tune; do \
-	  progname=`echo $$p | sed '$(program_transform_name)'`; \
-	  echo "Un-Installing $$progname in $(bindir)..."; \
-	  rm $(bindir)/$$progname; \
-	done; \
-	echo "done";
-
-compile-clean: compile-clean-recursive Makefile
-	rm -f *.o *.d test/*.o test/*.d tests/*.o tests/*.d
-
-clean: clean-recursive Makefile
-	rm -f benchmark hmc_tm invert offline_measurement test_Dslash deriv_mg_tune @QPHIX_PROGRAMS@ *.o *.d test/*.o test/*.d tests/*.o tests/*.d
-
-distclean: distclean-recursive Makefile
-	rm -f benchmark hmc_tm invert offline_measurement *.o *.d *~ Makefile config.log config.status fixed_volume.h
-	rm -f include/tmlqcd_config_internal.h
-
-.PHONY: all ${SUBDIRS} ${top_srcdir}/git_hash.h clean compile-clean distclean dep install \
-	all-recursive all-debug-recursive all-profile-recursive \
-	clean-recursive distclean-recursive \
-	compile-clean-recursive
diff --git a/Makefile.tests b/Makefile.tests
deleted file mode 100644
index a9a393ac6..000000000
--- a/Makefile.tests
+++ /dev/null
@@ -1,64 +0,0 @@
-TESTS = tests/test_sample tests/test_su3 tests/test_buffers tests/test_qpx tests/test_linalg tests/test_clover tests/test_rat
-
-TEMP = $(patsubst %.c,%,$(wildcard $(top_srcdir)/tests/*.c))
-TESTMODULES = $(patsubst $(top_srcdir)/%,%,$(TEMP))
-
-TESTFLAGS = -L$(top_builddir)/cu/ -lcu
-
-$(addsuffix .o,$(TESTMODULES)): %.o : $(top_srcdir)/%.c
-	${COMPILE} -c $(OPTARGS) ${DEFS} $<
-
-# The linking stage needs to be differentiated because different tests rely on
-# different modules from the codebase
-# Each test itself consists of a number of modules that need to be linked.
-
-# when used as a prerequisite, the wildcard with "tests/test_sample*.c" replaced by "$@*.c" is not evaluated
-# correctly, even though it works perfectly in an echo statement, it results in make
-# trying to compile all objects in top_srcdir
-# we therefore evaluate the wildcard into a variable
-
-TEST_SAMPLE_OBJECTS:=$(patsubst $(top_srcdir)/%.c,%.o,$(wildcard $(top_srcdir)/tests/test_sample*.c))
-TEST_SAMPLE_FLAGS:=
-TEST_SAMPLE_LIBS:=$(top_builddir)/cu/libcu.a
-tests/test_sample: $(TEST_SAMPLE_OBJECTS) $(TEST_SAMPLE_LIBS)
-	${LINK} $(TEST_SAMPLE_OBJECTS) $(TESTFLAGS) $(TEST_SAMPLE_FLAGS)
-
-TEST_SU3_OBJECTS:=$(patsubst $(top_srcdir)/%.c,%.o,$(wildcard $(top_srcdir)/tests/test_su3*.c)) expo.o
-TEST_SU3_FLAGS:=-lm
-TEST_SU3_LIBS:=$(top_builddir)/cu/libcu.a
-tests/test_su3: $(TEST_SU3_OBJECTS) $(TEST_SU3_LIBS)
-	${LINK} $(TEST_SU3_OBJECTS) $(TESTFLAGS) $(TEST_SU3_FLAGS)
-
-TEST_QPX_OBJECTS:=$(patsubst $(top_srcdir)/%.c,%.o,$(wildcard $(top_srcdir)/tests/test_qpx*.c)) 
-TEST_QPX_FLAGS:=-lm
-TEST_QPX_LIBS:=$(top_builddir)/cu/libcu.a
-tests/test_qpx: $(TEST_QPX_OBJECTS) $(TEST_QPX_LIBS)
-	${LINK} $(TEST_QPX_OBJECTS) $(TESTFLAGS) $(TEST_QPX_FLAGS)
-
-TEST_LINALG_OBJECTS:=$(patsubst $(top_srcdir)/%.c,%.o,$(wildcard $(top_srcdir)/tests/test_linalg*.c)) 
-TEST_LINALG_FLAGS:=-lm
-TEST_LINALG_LIBS:=$(top_builddir)/cu/libcu.a $(top_builddir)/linalg/liblinalg.a
-tests/test_linalg: $(TEST_LINALG_OBJECTS) $(TEST_LINALG_LIBS)
-	${LINK} $(TEST_LINALG_OBJECTS) $(TEST_LINALG_LIBS) $(TESTFLAGS) $(TEST_LINALG_FLAGS)
-
-TEST_BUFFERS_OBJECTS:=$(patsubst $(top_srcdir)/%.c,%.o,$(wildcard $(top_srcdir)/tests/test_buffers*.c)) fatal_error.o
-TEST_BUFFERS_FLAGS:=-lbuffers -L$(top_builddir)/buffers/
-TEST_BUFFERS_LIBS:=$(top_builddir)/cu/libcu.a $(top_builddir)/buffers/libbuffers.a
-tests/test_buffers: $(TEST_BUFFERS_OBJECTS) $(TEST_BUFFERS_LIBS)
-	${LINK} $(TEST_BUFFERS_OBJECTS) $(TESTFLAGS) $(TEST_BUFFERS_FLAGS)
-
-TEST_CLOVER_OBJECTS:=$(patsubst $(top_srcdir)/%.c,%.o,$(wildcard $(top_srcdir)/tests/test_clover*.c)) operator/clover_leaf.o
-TEST_CLOVER_FLAGS:=-lm -lhmc -llinalg
-TEST_CLOVER_LIBS:=$(top_builddir)/cu/libcu.a
-tests/test_clover: $(TEST_CLOVER_OBJECTS) $(TEST_CLOVER_LIBS)
-	${LINK} $(TEST_CLOVER_OBJECTS) $(TESTFLAGS) $(TEST_CLOVER_FLAGS)
-
-TEST_RAT_OBJECTS:=$(patsubst $(top_srcdir)/%.c,%.o,$(wildcard $(top_srcdir)/tests/test_rat*.c)) 
-TEST_RAT_FLAGS:=-lm -lrational
-TEST_RAT_LIBS:=$(top_builddir)/cu/libcu.a
-tests/test_rat: $(TEST_RAT_OBJECTS) $(TEST_RAT_LIBS)
-	${LINK} $(TEST_RAT_OBJECTS) $(TESTFLAGS) $(TEST_RAT_FLAGS)
-
-
-tests: ${TESTS}
-
diff --git a/cmake/FindDDAlphaAMG.cmake b/cmake/FindDDAlphaAMG.cmake
new file mode 100644
index 000000000..f42c943cc
--- /dev/null
+++ b/cmake/FindDDAlphaAMG.cmake
@@ -0,0 +1,29 @@
+# Locate DDalphaAMG and expose it as the imported target tmlqcd::DDalphaAMG.
+include(FindPackageHandleStandardArgs)
+
+find_library(
+  TM_DDALPHAAMG_LIBRARIES
+  NAMES DDalphaAMG DDalphaAMG_devel
+  PATH_SUFFIXES "lib" "lib64")
+
+find_path(
+  TM_DDALPHAAMG_INCLUDE_DIRS
+  NAMES DDalphaAMG.h
+  PATH_SUFFIXES "include")
+
+# Validate the very variables filled in by find_library()/find_path() above.
+find_package_handle_standard_args(
+  DDAlphaAMG DEFAULT_MSG TM_DDALPHAAMG_LIBRARIES TM_DDALPHAAMG_INCLUDE_DIRS)
+
+# Only create the target when both the library and the header were found.
+if(DDAlphaAMG_FOUND AND NOT TARGET tmlqcd::DDalphaAMG)
+  add_library(tmlqcd::DDalphaAMG INTERFACE IMPORTED)
+  set_target_properties(
+    tmlqcd::DDalphaAMG
+    PROPERTIES INTERFACE_LINK_LIBRARIES "${TM_DDALPHAAMG_LIBRARIES}"
+               INTERFACE_INCLUDE_DIRECTORIES "${TM_DDALPHAAMG_INCLUDE_DIRS}")
+endif()
+
+# Mirror the real detection result instead of unconditionally reporting success.
+set(TM_DDALPHAAMG_FOUND ${DDAlphaAMG_FOUND})
+mark_as_advanced(TM_DDALPHAAMG_LIBRARIES TM_DDALPHAAMG_INCLUDE_DIRS)
diff --git a/cmake/git_hash.h.in b/cmake/git_hash.c.in
similarity index 62%
rename from cmake/git_hash.h.in
rename to cmake/git_hash.c.in
index 23f624742..912085abb 100644
--- a/cmake/git_hash.h.in
+++ b/cmake/git_hash.c.in
@@ -1,6 +1,6 @@
 #ifndef _GIT_HASH_H
 #define _GIT_HASH_H
 
-const char git_hash[] = "@TMLQCD_SHA@";
+const char git_hash[] = "@TM_SHA@";
 
 #endif /* _GIT_HASH_H */
diff --git a/cmake/tmlqcd_config_internal.h.in b/cmake/tmlqcd_config_internal.h.in
index 5dd9c7096..2765a2b7c 100644
--- a/cmake/tmlqcd_config_internal.h.in
+++ b/cmake/tmlqcd_config_internal.h.in
@@ -3,20 +3,17 @@
  * into static const variables, following the convention used by the USQCD build
  * systems, for example. */
 
-/* We are on a CRAY */
-#cmakedefine CRAY
-
 /* lapack available */
-#cmakedefine HAVE_LAPACK 
+#cmakedefine TM_LAPACK 
 
 /* Define to 1 if you have the `lime' library (-llime). */
-#cmakedefine HAVE_LIBLIME 
+#cmakedefine TM_USE_LIME 
 
 /* Define to 1 if you have the `lemon' library (-llemon). */
-#cmakedefine HAVE_LIBLEMON 
+#cmakedefine TM_USE_LEMON 
 
 /* 1 if clock_gettime is available for use in benchmark */
-#cmakedefine HAVE_CLOCK_GETTIME 
+#cmakedefine TM_CLOCK_GETTIME 
 
 /* Compile with MPI support */
 #cmakedefine TM_USE_MPI
@@ -25,7 +22,7 @@
 #cmakedefine TM_USE_OMP
 
 /* Compile with FFTW support */
-#cmakedefine HAVE_FFTW 
+#cmakedefine TM_USE_FFTW 
 
 /* Fortran has not extra _ */
 #cmakedefine NOF77_
@@ -45,31 +42,31 @@
 #define PACKAGE_VERSION "@PROJECT_DESCRIPTION@ @PROJECT_VERSION@"
 
 /* X parallelisation */
-#cmakedefine PARALLELX 
+#cmakedefine TM_PARALLELX 
 
 /* XY parallelisation */
-#cmakedefine PARALLELXY 
+#cmakedefine TM_PARALLELXY 
 
 /* XYZ parallelisation */
-#cmakedefine PARALLELXYZ
+#cmakedefine TM_PARALLELXYZ
 
 /* One dimensional parallelisation */
-#cmakedefine PARALLELT
+#cmakedefine TM_PARALLELT
 
 /* Two dimensional parallelisation */
-#cmakedefine PARALLELXT
+#cmakedefine TM_PARALLELXT
 
 /* Three dimensional parallelisation */
-#cmakedefine PARALLELXYT
+#cmakedefine TM_PARALLELXYT
 
 /* Four dimensional parallelisation */
-#cmakedefine PARALLELXYZT
+#cmakedefine TM_PARALLELXYZT
 
 /* Fixed volume at compiletime */
-#cmakedefine FIXEDVOLUME
+#cmakedefine TM_FIXEDVOLUME
 
 /* Define to 1 if fseeko (and presumably ftello) exists and is declared. */
-#cmakedefine HAVE_FSEEKO
+#cmakedefine TM_FSEEKO
 
 /* Alignment for arrays -- necessary for SSE and automated vectorization */
 #define ALIGN_BASE @ALIGN_BASE@
@@ -88,40 +85,37 @@
 #cmakedefine YYTEXT_POINTER
 
 /* Number of bits in a file offset, on hosts where this is settable. */
-#cmakedefine _FILE_OFFSET_BITS @TMLQCD_FILE_OFFSET_BITS@
+#cmakedefine TM_FILE_OFFSET_BITS @TMLQCD_FILE_OFFSET_BITS@
 
 /* Construct an extra copy of the gauge fields */
-#cmakedefine _GAUGE_COPY
+#cmakedefine TM_USE_GAUGECOPY
 
 /* Define to 1 to make fseeko visible on some hosts (e.g. glibc 2.2). */
-#cmakedefine _LARGEFILE_SOURCE
+#cmakedefine TM_LARGEFILE_SOURCE
 
 /* Define for large files, on AIX-style hosts. */
-#cmakedefine _LARGE_FILES 
+#cmakedefine TM_LARGE_FILES 
 
 /* Use even/odd geometry in the gauge fields */
-#cmakedefine _NEW_GEOMETRY
+#cmakedefine TM_NEW_GEOMETRY
 
 /* x86 64 Bit architecture */
-#cmakedefine _x86_64
+#cmakedefine TM_x86_64
 
 /* Define to 1 if Dirac operator with halfspinor should be used */
-#cmakedefine _USE_HALFSPINOR 
+#cmakedefine TM_USE_HALFSPINOR 
 
 /* Define to 1 if shmem API should be used */
-#cmakedefine _USE_SHMEM
+#cmakedefine TM_USE_SHMEM
 
 /* Define to 1 if KOJAK instrumentalisation should be done*/
-#cmakedefine _KOJAK_INST
+#cmakedefine TM_KOJAK_INST
 
 /* Define to 1 if persistent MPI calls for halfspinor should be used */
-#cmakedefine _PERSISTENT
+#cmakedefine TM_PERSISTENT
 
 /* Define to 1 if non-blocking MPI calls for spinor and gauge should be used */
-#cmakedefine _NON_BLOCKING
-
-/* Define to 1 if you have the `quda' library (-lquda). */
-#cmakedefine HAVE_LIBQUDA
+#cmakedefine TM_NONBLOCKING
 
 /* Using QUDA GPU */
 #cmakedefine TM_USE_QUDA 
@@ -133,7 +127,7 @@
 #cmakedefine TM_QUDA_FERMIONIC_FORCES
 
 /* Using DDalphaAMG */
-#cmakedefine DDalphaAMG
+#cmakedefine TM_USE_DDalphaAMG
 
 /* Using QPHIX */
 #cmakedefine TM_USE_QPHIX 
diff --git a/cmake_includes.txt b/cmake_includes.txt
deleted file mode 100644
index b8e105cc0..000000000
--- a/cmake_includes.txt
+++ /dev/null
@@ -1,425 +0,0 @@
-LIST(APPEND IO_SRC_C io_srcio/utils_write_inverter_info.c
-io/gauge_read.c
-io/utils_write_xlf.c
-io/utils_construct_reader.c
-io/params_construct_xlfInfo.c
-io/utils_kill_with_error.c
-io/DML_crc32.c
-io/spinor_write_source_format.c
-io/deri_write_stdout.c
-io/spinor_write_propagator_format.c
-io/utils_engineering.c
-io/utils_parse_propagator_type.c
-io/io_cm.c
-io/utils_parse_ildgformat_xml.c
-io/utils_read_message.c
-io/utils_write_ildg_format.c
-io/utils_destruct_writer.c
-io/gauge_write.c
-io/utils_write_message.c
-io/params_construct_ildgFormat.c
-io/spinor_read.c
-io/utils_close_reader_record.c
-io/spinor_read_binary.c
-io/utils.c
-io/spinor_write_stdout.c
-io/spinor_write_info.c
-io/utils_write_checksum.c
-io/utils_write_header.c
-io/eospinor_read.c
-io/utils_write_first_message.c
-io/params_construct_InverterInfo.c
-io/utils_parse_checksum_xml.c
-io/utils_construct_writer.c
-io/sw_write_stdout.c
-io/spinor_write_propagator_type.c
-io/gauge_write_binary.c
-io/spinor_write.c
-io/utils_write_xlf_xml.c
-io/params_construct_propagatorFormat.c
-io/gauge_read_binary.c
-io/dml.c
-io/spinor_write_binary.c
-io/utils_destruct_reader.c
-io/utils_close_writer_record.c
-io/eospinor_write.c
-io/gauge_write_luscher_binary.c
-io/params_construct_sourceFormat.c)
-
-list(APPEND INIT_SRC_C init/init_dirac_halfspinor.c
-     init/init_geometry_indices.c
-     init/init_openmp.c
-     init/init_gauge_field.c
-     init/init_parallel.c
-     init/init_chi_spinor_field.c
-     init/init_gauge_fg.c
-     init/init_spinor_field.c
-     init/init_global_states.c
-     init/init_bispinor_field.c
-     init/init_gauge_tmp.c
-     init/init_critical_globals.c
-     init/init_omp_accumulators.c
-     init/init_jacobi_field.c
-     init/init_stout_smear_vars.c
-     init/init_moment_field.c)
-
-list(APPEND SOLVER_SRC_C
-solver/bicg_complex.c
-solver/dfl_projector.c
-solver/eigenvalues_Jacobi.c
-solver/gcr.c
-solver/gmres_precon.c
-solver/chrono_guess.c
-solver/gcr4complex.c
-solver/jdher.c
-solver/gcr4complex_body.c
-solver/gmres_dr.c
-solver/fgmres4complex_body.c
-solver/cg_her_bi.c
-solver/solver_field.c
-solver/quicksort.c
-solver/bicgstab2.c
-solver/cgs_real.c
-solver/M_plus_block_psi_body.c
-solver/little_mg_precon_body.c
-solver/cg_her_su3vect.c
-solver/little_project_eo_body.c
-solver/monomial_solve.c
-solver/cr.c
-solver/gram-schmidt.c
-solver/solver_types.c
-solver/mode_number.c
-solver/cg_her.c
-solver/jdher_bi.c
-solver/mrblk_body.c
-solver/eigcg.c
-solver/jdher_su3vect.c
-solver/poly_precon.c
-solver/Msap.c
-solver/fgmres.c
-solver/dirac_operator_eigenvectors.c
-solver/incr_eigcg.c
-solver/index_jd.c
-solver/sumr.c
-solver/cgne4complex.c
-solver/eigenvalues_bi.c
-solver/gmres.c
-solver/lu_solve.c
-solver/diagonalise_general_matrix.c
-solver/mcr.c
-solver/bicgstabell.c
-solver/rg_mixed_cg_her.c
-solver/mixed_cg_her.c
-solver/mixed_cg_mms_tm_nd.c
-solver/rg_mixed_cg_her_nd.c
-solver/spectral_proj.c
-solver/restart_X.c
-solver/generate_dfl_subspace.c
-solver/eigenvalues.c
-solver/mcr4complex.c
-solver/mr4complex.c
-solver/bicgstab_complex.c
-solver/cg_mms_tm_nd.c
-solver/mr.c
-solver/cg_her_nd.c
-solver/bicgstab_complex_bi.c
-solver/sub_low_ev.c
-solver/ortho.c
-solver/pcg_her.c
-solver/fgmres4complex.c
-solver/cg_mms_tm.c
-solver/init_guess.c)
-
-list(APPEND LINALG_SRC_C linalg/assign_mul_bra_add_mul_r.c
-     linalg/mul_r_gamma5.c
-     linalg/convert_eo_to_lexic.c
-     linalg/print_spinor.c
-     linalg/assign_add_mul_body.c
-     linalg/mul_diff_mul_r.c
-     linalg/square_norm_32.c
-     linalg/mul.c
-     linalg/mul_r.c
-     linalg/mul_gamma5.c
-     linalg/ratio.c
-     linalg/square_norm.c
-     linalg/mul_diff_mul.c
-     linalg/square_and_minmax.c
-     linalg/add.c
-     linalg/assign_add_mul_add_mul_r.c
-     linalg/comp_decomp.c
-     linalg/mul_add_mul.c
-     linalg/diff_32.c
-     linalg/assign_add_mul.c
-     linalg/addto_32.c
-     linalg/assign_mul_add_mul_add_mul_add_mul_r.c
-     linalg/assign_add_mul_r.c
-     linalg/diff.c
-     linalg/assign_mul_add_mul_r.c
-     linalg/scalar_prod_r.c
-     linalg/assign_to_32.c
-     linalg/assign_add_mul_add_mul.c
-     linalg/mul_diff_r.c
-     linalg/assign_mul_add_r_and_square.c
-     linalg/assign_mul_add_mul_r_32.c
-     linalg/assign_mul_add_mul.c
-     linalg/assign_mul_add_mul_add_mul_r.c
-     linalg/scalar_prod_r_32.c
-     linalg/assign_mul_add_r.c
-     linalg/assign_mul_add_r_32.c
-     linalg/scalar_prod_su3spinor.c
-     linalg/convert_even_to_lexic.c
-     linalg/mul_r_32.c
-     linalg/assign_add_mul_r_add_mul.c
-     linalg/convert_odd_to_lexic.c
-     linalg/diff_and_square_norm.c
-     linalg/scalar_prod_i.c
-     linalg/mul_add_mul_r.c
-     linalg/assign_diff_mul.c
-     linalg/assign_mul_bra_add_mul_ket_add_r.c
-     linalg/set_even_to_zero.c
-     linalg/assign_mul_add.c
-     linalg/square_and_prod_r.c
-     linalg/scalar_prod_body.c
-     linalg/assign_mul_bra_add_mul_ket_add.c
-     linalg/assign_add_mul_r_32.c
-     linalg/scalar_prod.c
-     linalg/mattimesvec.c
-     linalg/assign.c
-     linalg/print_spinor_similar_components.c)
-
-list(APPEND RATIONAL_SRC_C rational/zolotarev.c
-     rational/rational.c
-     rational/elliptic.c)
-
-list(APPEND OPERATOR_SRC_C operator/clover_invert.c
-     operator/hopping_body_dbl.c
-     operator/tm_operators_nd_32.c
-     operator/hopping_sse_dbl.c
-     operator/halfspinor_body.c
-     operator/Block_D_psi_body.c
-     operator/mul_one_pm_imu_sub_mul_body.c
-     operator/assign_mul_one_sw_pm_imu_site_lexic_body.c
-     operator/assign_mul_one_sw_pm_imu_inv_block_body.c
-     operator/clover_accumulate_deriv.c
-     operator/Hopping_Matrix.c
-     operator/hopping_bg_dbl.c
-     operator/tm_operators.c
-     operator/tm_times_Hopping_Matrix.c
-     operator/clovertm_operators_32.c
-     operator/hopping_sgl.c
-     operator/Dov_proj.c
-     operator/clover_deriv.c
-     operator/halfspinor_bg_dbl.c
-     operator/clover_det.c
-     operator/clover_leaf.c
-     operator/D_psi_body.c
-     operator/clovertm_operators.c
-     operator/hopping_sse_sgl.c
-     operator/halfspinor_sse_dbl.c
-     operator/Dov_psi.c
-     operator/tm_operators_nd.c
-     operator/tm_sub_Hopping_Matrix.c
-     operator/Hopping_Matrix_nocom.c
-     operator/clover_term.c
-     operator/halfspinor_bgq_dbl.c
-     operator/Hopping_Matrix_32_nocom.c
-     operator/D_psi.c
-     operator/tm_operators_32.c
-     operator/Hopping_Matrix_32.c
-     operator/halfspinor_body_32.c
-     operator/mul_one_pm_imu_inv_body.c)
-
-list(APPEND SMEARING_SRC_C smearing/hex_stout_exclude_two.c
-     smearing/hex_hex_smear.c
-     smearing/utils_print_su3.c
-     smearing/hyp_APE_project_exclude_none.c
-     smearing/hyp_hyp_staples_exclude_one.c
-     smearing/hyp_APE_project_exclude_one.c
-     smearing/hex_stout_exclude_one.c
-     smearing/hyp_hyp_staples_exclude_two.c
-     smearing/hex_stout_exclude_none.c
-     smearing/stout_stout_smear.c
-     smearing/hyp_hyp_smear.c
-     smearing/hyp_APE_project_exclude_two.c
-     smearing/utils_project_herm.c
-     smearing/utils_reunitarize.c
-     smearing/utils_generic_staples.c
-     smearing/hyp_hyp_staples_exclude_none.c
-     smearing/ape_ape_smear.c
-     smearing/uils_print_config_to_screen.c
-     smearing/utils_project_antiherm.c
-     smearing/utils_print_config_to_screen.c
-     smearing/utils_reunitarize_MILC.c)
-
-list(APPEND BUFFER_SRC_C
-     buffers/gauge_return_gauge_field.c
-     buffers/gauge_get_gauge_field.c
-     buffers/gauge_finalize_gauge_buffers.c
-     buffers/gauge_initialize_gauge_buffers.c
-     buffers/gauge.c
-     buffers/gauge_free_unused_gauge_buffers.c
-     buffers/gauge_get_gauge_field_array.c
-     buffers/utils_generic_exchange.c
-     buffers/gauge_allocate_gauge_buffers.c
-     buffers/gauge_return_gauge_field_array.c)
-
-list(APPEND MONOMIAL_SRC_C
-     monomial/detratio_monomial.c
-     monomial/sf_gauge_monomial.c
-     monomial/poly_monomial.c
-     monomial/cloverdetratio_monomial.c
-     monomial/ndrat_monomial.c
-     monomial/cloverdet_monomial.c
-     monomial/clover_trlog_monomial.c
-     monomial/cloverndpoly_monomial.c
-     monomial/monitor_forces.c
-     monomial/ndpoly_monomial.c
-     monomial/det_monomial.c
-     monomial/monomial.c
-     monomial/cloverdetratio_rwmonomial.c
-     monomial/gauge_monomial.c
-     monomial/clovernd_trlog_monomial.c
-     monomial/ratcor_monomial.c
-     monomial/nddetratio_monomial.c
-     monomial/rat_monomial.c
-     monomial/ndratcor_monomial.c
-     monomial/moment_energy.c)
-
-list(APPEND EXCHANGE_SRC_C xchange/xchange_lexicfield.c
-xchange/xchange_2fields.c
-xchange/xchange_gauge.c
-xchange/xchange_halffield.c
-xchange/xchange_jacobi.c
-xchange/little_field_gather_body.c
-xchange/little_field_gather.c
-xchange/xchange_deri.c
-xchange/xchange_field.c
-xchange/xchange_field_tslice.c)
-
-list(APPEND MEAS_SRC_C
-meas/pion_norm.c
-meas/correlators.c
-meas/polyakov_loop.c
-meas/measurements.c
-meas/oriented_plaquettes.c
-meas/gradient_flow.c
-meas/measure_clover_field_strength_observables.c)
-
-list(APPEND SF_SRC_C sf/sf_calc_action.c
-     sf/sf_get_rectangle_staples.c
-     sf/sf_get_staples.c
-     sf/sf_observables.c
-     sf/sf_utils.c
-     )
-
-list(APPEND MAIN_SRC_C
-measure_gauge_action.c
-start.c
-deriv_Sb.c
-reweighting_factor_nd.c
-ranlxs.c
-source_generation.c
-read_input.c
-invert_doublet_eo.c
-geometry_eo.c
-getopt.c
-offline_measurement.c
-tm_debug_printf.c
-chebyshev_polynomial_nd.c
-invert_eo.c
-little_D.c
-get_rectangle_staples.c
-gen_sources.c
-rnd_gauge_trafo.c
-test_lemon.c
-LapH_ev.c
-benchmark.c
-measure_rectangles.c
-check_locallity.c
-invert.c
-deriv_Sb_D_psi.c
-deriv_mg_tune.c
-mpi_init.c
-update_momenta_fg.c
-gamma.c
-matrix_utils.c
-reweighting_factor.c
-update_tm.c
-jacobi.c
-invert_overlap.c
-phmc.c
-get_staples.c
-clenshaw_coef.c
-block.c
-spinor_fft.c
-boundary.c
-little_D_body.c
-X_psi.c
-prepare_source.c
-DDalphaAMG_interface.c
-update_backward_gauge.c
-invert_clover_eo.c
-gettime.c
-hmc_tm.c
-update_momenta.c
-sighandler.c
-compare_derivative.c
-ranlxd.c
-DirectPut.c
-aligned_malloc.c
-fatal_error.c
-operator.c
-cu/cu.c
-chebyshev_polynomial.c
-qphix_test_Dslash.c
-expo.c
-overrelaxation.c
-Ptilde_nd.c
-update_gauge.c
-hopping_test.c
-integrator.c
-P_M_eta.c)
-
-if (TMLQCD_USE_QPHIX)
-list(APPEND MAIN_SRC_C qphix_interface.cpp)
-endif()
-
-if (TMLQCD_USE_QUDA)
-list(APPEND MAIN_SRC_C quda_interface.c)
-endif()
-
-list(APPEND ALL_SRC ${MAIN_SRC_C} ${SF_SRC_C} ${XCHANGE_SRC_C} ${MONOMIAL_SRC_C} ${BUFFER_SRC_C} ${SMEARING_SRC_C} ${OPERATOR_SRC_C} ${RATIONAL_SRC_C} ${LINALG_SRC_C} ${IO_SRC_C} ${INIT_SRC_C} ${SOLVER_SRC_C})
-
-include_directories(${CMAKE_CURRENT_BINARY_DIR})
-
-# cmake 4.0 uses a different syntax for the option
-flex_target(tmlqcd_input_read input_read.l input_read.c
-            $<$<VERSION_LESS:${CMAKE_MAJOR_VERSION},4>:COMPILE_FLAGS "-Ca -Ptmlqcd">
-            $<$<VERSION_GREATER_EQUAL:${CMAKE_MAJOR_VERSION},4>:OPTIONS "-Ca;-Ptmlqcd">)
-
-# create a target library with namespacing because cmake does not know name space at all
-add_library(tmlqcd::hmc ALL_SRC ${FLEX_tmlqcd_input_read_OUTPUTS})
-set_target_properties(tmlqcd::hmc PROPERTIES VERSION ${PROJECT_VERSION} SOVERSION 1)
-
-# define a library and add the dependencies
-target_link_libraries(tmlqcd::hmc
-                      $<$<BOOL:${HAVE_CLOCK_GETTIME_IN_RT}>:rt>
-                      $<$<BOOL:${TMLQCD_USE_LIME}>:tmlqcd::lime>
-                      $<$<BOOL:${TMLQCD_USE_LEMON}>:tmlqcd::lemon>
-                      $<$<BOOL:${TMLQCD_USE_QPHIX}>:tmlqcd::qphix>
-                      $<$<BOOL:${TMLQCD_USE_FFTW}>:tmlqcd::fftw3>
-                      $<$<BOOL:${TMLQCD_USE_MPI}>:MPI::MPI_C MPI::MPI_CXX>
-                      $<$<BOOL:${TMLQCD_USE_QUDA}>:quda::quda>
-                      $<$<BOOL:${TMLQCD_USE_CUDA}>:CUDA::cufft CUDA::cufftw CUDA::cublas CUDA::cudart CUDA::cuda_driver>
-                      $<$<BOOL:${TMLQCD_USE_HIP}>:hip::hipfft roc::hipblas hip::host>
-                      ${LAPACK_LIBRARIES}
-                      ${BLAS_LIBRARIES}
-                      $<$<BOOL:${TMLQCD_USE_OPENMP}>:OpenMP::OpenMP_C OpenMP::OpenMP_CXX>
-                      m)
-
-target_compile_definitions(tmlqcd::hmc
-                           $<$<BOOL:${TMLQCD_USE_HIP}>:${TMLQCD_GPU_PLATFORM_DFLAGS}>
-                           )
-
-target_include_directories(tmlqcd::hmc PUBLIC $<INSTALL_INTERFACE:include>
-                           PRIVATE "init io linalg meas monomial operator profiling rational sf smearing solver util xchange wrapper")
diff --git a/config.guess b/config.guess
deleted file mode 100644
index f7727026b..000000000
--- a/config.guess
+++ /dev/null
@@ -1,1701 +0,0 @@
-#! /bin/sh
-# Attempt to guess a canonical system name.
-#   Copyright 1992-2021 Free Software Foundation, Inc.
-
-timestamp='2021-01-01'
-
-# This file is free software; you can redistribute it and/or modify it
-# under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 3 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-# General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, see <https://www.gnu.org/licenses/>.
-#
-# As a special exception to the GNU General Public License, if you
-# distribute this file as part of a program that contains a
-# configuration script generated by Autoconf, you may include it under
-# the same distribution terms that you use for the rest of that
-# program.  This Exception is an additional permission under section 7
-# of the GNU General Public License, version 3 ("GPLv3").
-#
-# Originally written by Per Bothner; maintained since 2000 by Ben Elliston.
-#
-# You can get the latest version of this script from:
-# https://git.savannah.gnu.org/cgit/config.git/plain/config.guess
-#
-# Please send patches to <config-patches@gnu.org>.
-
-
-me=$(echo "$0" | sed -e 's,.*/,,')
-
-usage="\
-Usage: $0 [OPTION]
-
-Output the configuration name of the system \`$me' is run on.
-
-Options:
-  -h, --help         print this help, then exit
-  -t, --time-stamp   print date of last modification, then exit
-  -v, --version      print version number, then exit
-
-Report bugs and patches to <config-patches@gnu.org>."
-
-version="\
-GNU config.guess ($timestamp)
-
-Originally written by Per Bothner.
-Copyright 1992-2021 Free Software Foundation, Inc.
-
-This is free software; see the source for copying conditions.  There is NO
-warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
-
-help="
-Try \`$me --help' for more information."
-
-# Parse command line
-while test $# -gt 0 ; do
-  case $1 in
-    --time-stamp | --time* | -t )
-       echo "$timestamp" ; exit ;;
-    --version | -v )
-       echo "$version" ; exit ;;
-    --help | --h* | -h )
-       echo "$usage"; exit ;;
-    -- )     # Stop option processing
-       shift; break ;;
-    - )	# Use stdin as input.
-       break ;;
-    -* )
-       echo "$me: invalid option $1$help" >&2
-       exit 1 ;;
-    * )
-       break ;;
-  esac
-done
-
-if test $# != 0; then
-  echo "$me: too many arguments$help" >&2
-  exit 1
-fi
-
-# CC_FOR_BUILD -- compiler used by this script. Note that the use of a
-# compiler to aid in system detection is discouraged as it requires
-# temporary files to be created and, as you can see below, it is a
-# headache to deal with in a portable fashion.
-
-# Historically, `CC_FOR_BUILD' used to be named `HOST_CC'. We still
-# use `HOST_CC' if defined, but it is deprecated.
-
-# Portable tmp directory creation inspired by the Autoconf team.
-
-tmp=
-# shellcheck disable=SC2172
-trap 'test -z "$tmp" || rm -fr "$tmp"' 0 1 2 13 15
-
-set_cc_for_build() {
-    # prevent multiple calls if $tmp is already set
-    test "$tmp" && return 0
-    : "${TMPDIR=/tmp}"
-    # shellcheck disable=SC2039
-    { tmp=$( (umask 077 && mktemp -d "$TMPDIR/cgXXXXXX") 2>/dev/null) && test -n "$tmp" && test -d "$tmp" ; } ||
-	{ test -n "$RANDOM" && tmp=$TMPDIR/cg$$-$RANDOM && (umask 077 && mkdir "$tmp" 2>/dev/null) ; } ||
-	{ tmp=$TMPDIR/cg-$$ && (umask 077 && mkdir "$tmp" 2>/dev/null) && echo "Warning: creating insecure temp directory" >&2 ; } ||
-	{ echo "$me: cannot create a temporary directory in $TMPDIR" >&2 ; exit 1 ; }
-    dummy=$tmp/dummy
-    case ${CC_FOR_BUILD-},${HOST_CC-},${CC-} in
-	,,)    echo "int x;" > "$dummy.c"
-	       for driver in cc gcc c89 c99 ; do
-		   if ($driver -c -o "$dummy.o" "$dummy.c") >/dev/null 2>&1 ; then
-		       CC_FOR_BUILD="$driver"
-		       break
-		   fi
-	       done
-	       if test x"$CC_FOR_BUILD" = x ; then
-		   CC_FOR_BUILD=no_compiler_found
-	       fi
-	       ;;
-	,,*)   CC_FOR_BUILD=$CC ;;
-	,*,*)  CC_FOR_BUILD=$HOST_CC ;;
-    esac
-}
-
-# This is needed to find uname on a Pyramid OSx when run in the BSD universe.
-# (ghazi@noc.rutgers.edu 1994-08-24)
-if test -f /.attbin/uname ; then
-	PATH=$PATH:/.attbin ; export PATH
-fi
-
-UNAME_MACHINE=$( (uname -m) 2>/dev/null) || UNAME_MACHINE=unknown
-UNAME_RELEASE=$( (uname -r) 2>/dev/null) || UNAME_RELEASE=unknown
-UNAME_SYSTEM=$( (uname -s) 2>/dev/null) || UNAME_SYSTEM=unknown
-UNAME_VERSION=$( (uname -v) 2>/dev/null) || UNAME_VERSION=unknown
-
-case "$UNAME_SYSTEM" in
-Linux|GNU|GNU/*)
-	LIBC=unknown
-
-	set_cc_for_build
-	cat <<-EOF > "$dummy.c"
-	#include <features.h>
-	#if defined(__UCLIBC__)
-	LIBC=uclibc
-	#elif defined(__dietlibc__)
-	LIBC=dietlibc
-	#elif defined(__GLIBC__)
-	LIBC=gnu
-	#else
-	#include <stdarg.h>
-	/* First heuristic to detect musl libc.  */
-	#ifdef __DEFINED_va_list
-	LIBC=musl
-	#endif
-	#endif
-	EOF
-	eval "$($CC_FOR_BUILD -E "$dummy.c" 2>/dev/null | grep '^LIBC' | sed 's, ,,g')"
-
-	# Second heuristic to detect musl libc.
-	if [ "$LIBC" = unknown ] &&
-	   command -v ldd >/dev/null &&
-	   ldd --version 2>&1 | grep -q ^musl; then
-		LIBC=musl
-	fi
-
-	# If the system lacks a compiler, then just pick glibc.
-	# We could probably try harder.
-	if [ "$LIBC" = unknown ]; then
-		LIBC=gnu
-	fi
-	;;
-esac
-
-# Note: order is significant - the case branches are not exclusive.
-
-case "$UNAME_MACHINE:$UNAME_SYSTEM:$UNAME_RELEASE:$UNAME_VERSION" in
-    *:NetBSD:*:*)
-	# NetBSD (nbsd) targets should (where applicable) match one or
-	# more of the tuples: *-*-netbsdelf*, *-*-netbsdaout*,
-	# *-*-netbsdecoff* and *-*-netbsd*.  For targets that recently
-	# switched to ELF, *-*-netbsd* would select the old
-	# object file format.  This provides both forward
-	# compatibility and a consistent mechanism for selecting the
-	# object file format.
-	#
-	# Note: NetBSD doesn't particularly care about the vendor
-	# portion of the name.  We always set it to "unknown".
-	sysctl="sysctl -n hw.machine_arch"
-	UNAME_MACHINE_ARCH=$( (uname -p 2>/dev/null || \
-	    "/sbin/$sysctl" 2>/dev/null || \
-	    "/usr/sbin/$sysctl" 2>/dev/null || \
-	    echo unknown))
-	case "$UNAME_MACHINE_ARCH" in
-	    aarch64eb) machine=aarch64_be-unknown ;;
-	    armeb) machine=armeb-unknown ;;
-	    arm*) machine=arm-unknown ;;
-	    sh3el) machine=shl-unknown ;;
-	    sh3eb) machine=sh-unknown ;;
-	    sh5el) machine=sh5le-unknown ;;
-	    earmv*)
-		arch=$(echo "$UNAME_MACHINE_ARCH" | sed -e 's,^e\(armv[0-9]\).*$,\1,')
-		endian=$(echo "$UNAME_MACHINE_ARCH" | sed -ne 's,^.*\(eb\)$,\1,p')
-		machine="${arch}${endian}"-unknown
-		;;
-	    *) machine="$UNAME_MACHINE_ARCH"-unknown ;;
-	esac
-	# The Operating System including object format, if it has switched
-	# to ELF recently (or will in the future) and ABI.
-	case "$UNAME_MACHINE_ARCH" in
-	    earm*)
-		os=netbsdelf
-		;;
-	    arm*|i386|m68k|ns32k|sh3*|sparc|vax)
-		set_cc_for_build
-		if echo __ELF__ | $CC_FOR_BUILD -E - 2>/dev/null \
-			| grep -q __ELF__
-		then
-		    # Once all utilities can be ECOFF (netbsdecoff) or a.out (netbsdaout).
-		    # Return netbsd for either.  FIX?
-		    os=netbsd
-		else
-		    os=netbsdelf
-		fi
-		;;
-	    *)
-		os=netbsd
-		;;
-	esac
-	# Determine ABI tags.
-	case "$UNAME_MACHINE_ARCH" in
-	    earm*)
-		expr='s/^earmv[0-9]/-eabi/;s/eb$//'
-		abi=$(echo "$UNAME_MACHINE_ARCH" | sed -e "$expr")
-		;;
-	esac
-	# The OS release
-	# Debian GNU/NetBSD machines have a different userland, and
-	# thus, need a distinct triplet. However, they do not need
-	# kernel version information, so it can be replaced with a
-	# suitable tag, in the style of linux-gnu.
-	case "$UNAME_VERSION" in
-	    Debian*)
-		release='-gnu'
-		;;
-	    *)
-		release=$(echo "$UNAME_RELEASE" | sed -e 's/[-_].*//' | cut -d. -f1,2)
-		;;
-	esac
-	# Since CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM:
-	# contains redundant information, the shorter form:
-	# CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM is used.
-	echo "$machine-${os}${release}${abi-}"
-	exit ;;
-    *:Bitrig:*:*)
-	UNAME_MACHINE_ARCH=$(arch | sed 's/Bitrig.//')
-	echo "$UNAME_MACHINE_ARCH"-unknown-bitrig"$UNAME_RELEASE"
-	exit ;;
-    *:OpenBSD:*:*)
-	UNAME_MACHINE_ARCH=$(arch | sed 's/OpenBSD.//')
-	echo "$UNAME_MACHINE_ARCH"-unknown-openbsd"$UNAME_RELEASE"
-	exit ;;
-    *:LibertyBSD:*:*)
-	UNAME_MACHINE_ARCH=$(arch | sed 's/^.*BSD\.//')
-	echo "$UNAME_MACHINE_ARCH"-unknown-libertybsd"$UNAME_RELEASE"
-	exit ;;
-    *:MidnightBSD:*:*)
-	echo "$UNAME_MACHINE"-unknown-midnightbsd"$UNAME_RELEASE"
-	exit ;;
-    *:ekkoBSD:*:*)
-	echo "$UNAME_MACHINE"-unknown-ekkobsd"$UNAME_RELEASE"
-	exit ;;
-    *:SolidBSD:*:*)
-	echo "$UNAME_MACHINE"-unknown-solidbsd"$UNAME_RELEASE"
-	exit ;;
-    *:OS108:*:*)
-	echo "$UNAME_MACHINE"-unknown-os108_"$UNAME_RELEASE"
-	exit ;;
-    macppc:MirBSD:*:*)
-	echo powerpc-unknown-mirbsd"$UNAME_RELEASE"
-	exit ;;
-    *:MirBSD:*:*)
-	echo "$UNAME_MACHINE"-unknown-mirbsd"$UNAME_RELEASE"
-	exit ;;
-    *:Sortix:*:*)
-	echo "$UNAME_MACHINE"-unknown-sortix
-	exit ;;
-    *:Twizzler:*:*)
-	echo "$UNAME_MACHINE"-unknown-twizzler
-	exit ;;
-    *:Redox:*:*)
-	echo "$UNAME_MACHINE"-unknown-redox
-	exit ;;
-    mips:OSF1:*.*)
-	echo mips-dec-osf1
-	exit ;;
-    alpha:OSF1:*:*)
-	case $UNAME_RELEASE in
-	*4.0)
-		UNAME_RELEASE=$(/usr/sbin/sizer -v | awk '{print $3}')
-		;;
-	*5.*)
-		UNAME_RELEASE=$(/usr/sbin/sizer -v | awk '{print $4}')
-		;;
-	esac
-	# According to Compaq, /usr/sbin/psrinfo has been available on
-	# OSF/1 and Tru64 systems produced since 1995.  I hope that
-	# covers most systems running today.  This code pipes the CPU
-	# types through head -n 1, so we only detect the type of CPU 0.
-	ALPHA_CPU_TYPE=$(/usr/sbin/psrinfo -v | sed -n -e 's/^  The alpha \(.*\) processor.*$/\1/p' | head -n 1)
-	case "$ALPHA_CPU_TYPE" in
-	    "EV4 (21064)")
-		UNAME_MACHINE=alpha ;;
-	    "EV4.5 (21064)")
-		UNAME_MACHINE=alpha ;;
-	    "LCA4 (21066/21068)")
-		UNAME_MACHINE=alpha ;;
-	    "EV5 (21164)")
-		UNAME_MACHINE=alphaev5 ;;
-	    "EV5.6 (21164A)")
-		UNAME_MACHINE=alphaev56 ;;
-	    "EV5.6 (21164PC)")
-		UNAME_MACHINE=alphapca56 ;;
-	    "EV5.7 (21164PC)")
-		UNAME_MACHINE=alphapca57 ;;
-	    "EV6 (21264)")
-		UNAME_MACHINE=alphaev6 ;;
-	    "EV6.7 (21264A)")
-		UNAME_MACHINE=alphaev67 ;;
-	    "EV6.8CB (21264C)")
-		UNAME_MACHINE=alphaev68 ;;
-	    "EV6.8AL (21264B)")
-		UNAME_MACHINE=alphaev68 ;;
-	    "EV6.8CX (21264D)")
-		UNAME_MACHINE=alphaev68 ;;
-	    "EV6.9A (21264/EV69A)")
-		UNAME_MACHINE=alphaev69 ;;
-	    "EV7 (21364)")
-		UNAME_MACHINE=alphaev7 ;;
-	    "EV7.9 (21364A)")
-		UNAME_MACHINE=alphaev79 ;;
-	esac
-	# A Pn.n version is a patched version.
-	# A Vn.n version is a released version.
-	# A Tn.n version is a released field test version.
-	# A Xn.n version is an unreleased experimental baselevel.
-	# 1.2 uses "1.2" for uname -r.
-	echo "$UNAME_MACHINE"-dec-osf"$(echo "$UNAME_RELEASE" | sed -e 's/^[PVTX]//' | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz)"
-	# Reset EXIT trap before exiting to avoid spurious non-zero exit code.
-	exitcode=$?
-	trap '' 0
-	exit $exitcode ;;
-    Amiga*:UNIX_System_V:4.0:*)
-	echo m68k-unknown-sysv4
-	exit ;;
-    *:[Aa]miga[Oo][Ss]:*:*)
-	echo "$UNAME_MACHINE"-unknown-amigaos
-	exit ;;
-    *:[Mm]orph[Oo][Ss]:*:*)
-	echo "$UNAME_MACHINE"-unknown-morphos
-	exit ;;
-    *:OS/390:*:*)
-	echo i370-ibm-openedition
-	exit ;;
-    *:z/VM:*:*)
-	echo s390-ibm-zvmoe
-	exit ;;
-    *:OS400:*:*)
-	echo powerpc-ibm-os400
-	exit ;;
-    arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*)
-	echo arm-acorn-riscix"$UNAME_RELEASE"
-	exit ;;
-    arm*:riscos:*:*|arm*:RISCOS:*:*)
-	echo arm-unknown-riscos
-	exit ;;
-    SR2?01:HI-UX/MPP:*:* | SR8000:HI-UX/MPP:*:*)
-	echo hppa1.1-hitachi-hiuxmpp
-	exit ;;
-    Pyramid*:OSx*:*:* | MIS*:OSx*:*:* | MIS*:SMP_DC-OSx*:*:*)
-	# akee@wpdis03.wpafb.af.mil (Earle F. Ake) contributed MIS and NILE.
-	if test "$( (/bin/universe) 2>/dev/null)" = att ; then
-		echo pyramid-pyramid-sysv3
-	else
-		echo pyramid-pyramid-bsd
-	fi
-	exit ;;
-    NILE*:*:*:dcosx)
-	echo pyramid-pyramid-svr4
-	exit ;;
-    DRS?6000:unix:4.0:6*)
-	echo sparc-icl-nx6
-	exit ;;
-    DRS?6000:UNIX_SV:4.2*:7* | DRS?6000:isis:4.2*:7*)
-	case $(/usr/bin/uname -p) in
-	    sparc) echo sparc-icl-nx7; exit ;;
-	esac ;;
-    s390x:SunOS:*:*)
-	echo "$UNAME_MACHINE"-ibm-solaris2"$(echo "$UNAME_RELEASE" | sed -e 's/[^.]*//')"
-	exit ;;
-    sun4H:SunOS:5.*:*)
-	echo sparc-hal-solaris2"$(echo "$UNAME_RELEASE"|sed -e 's/[^.]*//')"
-	exit ;;
-    sun4*:SunOS:5.*:* | tadpole*:SunOS:5.*:*)
-	echo sparc-sun-solaris2"$(echo "$UNAME_RELEASE" | sed -e 's/[^.]*//')"
-	exit ;;
-    i86pc:AuroraUX:5.*:* | i86xen:AuroraUX:5.*:*)
-	echo i386-pc-auroraux"$UNAME_RELEASE"
-	exit ;;
-    i86pc:SunOS:5.*:* | i86xen:SunOS:5.*:*)
-	set_cc_for_build
-	SUN_ARCH=i386
-	# If there is a compiler, see if it is configured for 64-bit objects.
-	# Note that the Sun cc does not turn __LP64__ into 1 like gcc does.
-	# This test works for both compilers.
-	if test "$CC_FOR_BUILD" != no_compiler_found; then
-	    if (echo '#ifdef __amd64'; echo IS_64BIT_ARCH; echo '#endif') | \
-		(CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \
-		grep IS_64BIT_ARCH >/dev/null
-	    then
-		SUN_ARCH=x86_64
-	    fi
-	fi
-	echo "$SUN_ARCH"-pc-solaris2"$(echo "$UNAME_RELEASE"|sed -e 's/[^.]*//')"
-	exit ;;
-    sun4*:SunOS:6*:*)
-	# According to config.sub, this is the proper way to canonicalize
-	# SunOS6.  Hard to guess exactly what SunOS6 will be like, but
-	# it's likely to be more like Solaris than SunOS4.
-	echo sparc-sun-solaris3"$(echo "$UNAME_RELEASE"|sed -e 's/[^.]*//')"
-	exit ;;
-    sun4*:SunOS:*:*)
-	case "$(/usr/bin/arch -k)" in
-	    Series*|S4*)
-		UNAME_RELEASE=$(uname -v)
-		;;
-	esac
-	# Japanese Language versions have a version number like `4.1.3-JL'.
-	echo sparc-sun-sunos"$(echo "$UNAME_RELEASE"|sed -e 's/-/_/')"
-	exit ;;
-    sun3*:SunOS:*:*)
-	echo m68k-sun-sunos"$UNAME_RELEASE"
-	exit ;;
-    sun*:*:4.2BSD:*)
-	UNAME_RELEASE=$( (sed 1q /etc/motd | awk '{print substr($5,1,3)}') 2>/dev/null)
-	test "x$UNAME_RELEASE" = x && UNAME_RELEASE=3
-	case "$(/bin/arch)" in
-	    sun3)
-		echo m68k-sun-sunos"$UNAME_RELEASE"
-		;;
-	    sun4)
-		echo sparc-sun-sunos"$UNAME_RELEASE"
-		;;
-	esac
-	exit ;;
-    aushp:SunOS:*:*)
-	echo sparc-auspex-sunos"$UNAME_RELEASE"
-	exit ;;
-    # The situation for MiNT is a little confusing.  The machine name
-    # can be virtually everything (everything which is not
-    # "atarist" or "atariste" at least should have a processor
-    # > m68000).  The system name ranges from "MiNT" over "FreeMiNT"
-    # to the lowercase version "mint" (or "freemint").  Finally
-    # the system name "TOS" denotes a system which is actually not
-    # MiNT.  But MiNT is downward compatible to TOS, so this should
-    # be no problem.
-    atarist[e]:*MiNT:*:* | atarist[e]:*mint:*:* | atarist[e]:*TOS:*:*)
-	echo m68k-atari-mint"$UNAME_RELEASE"
-	exit ;;
-    atari*:*MiNT:*:* | atari*:*mint:*:* | atarist[e]:*TOS:*:*)
-	echo m68k-atari-mint"$UNAME_RELEASE"
-	exit ;;
-    *falcon*:*MiNT:*:* | *falcon*:*mint:*:* | *falcon*:*TOS:*:*)
-	echo m68k-atari-mint"$UNAME_RELEASE"
-	exit ;;
-    milan*:*MiNT:*:* | milan*:*mint:*:* | *milan*:*TOS:*:*)
-	echo m68k-milan-mint"$UNAME_RELEASE"
-	exit ;;
-    hades*:*MiNT:*:* | hades*:*mint:*:* | *hades*:*TOS:*:*)
-	echo m68k-hades-mint"$UNAME_RELEASE"
-	exit ;;
-    *:*MiNT:*:* | *:*mint:*:* | *:*TOS:*:*)
-	echo m68k-unknown-mint"$UNAME_RELEASE"
-	exit ;;
-    m68k:machten:*:*)
-	echo m68k-apple-machten"$UNAME_RELEASE"
-	exit ;;
-    powerpc:machten:*:*)
-	echo powerpc-apple-machten"$UNAME_RELEASE"
-	exit ;;
-    RISC*:Mach:*:*)
-	echo mips-dec-mach_bsd4.3
-	exit ;;
-    RISC*:ULTRIX:*:*)
-	echo mips-dec-ultrix"$UNAME_RELEASE"
-	exit ;;
-    VAX*:ULTRIX*:*:*)
-	echo vax-dec-ultrix"$UNAME_RELEASE"
-	exit ;;
-    2020:CLIX:*:* | 2430:CLIX:*:*)
-	echo clipper-intergraph-clix"$UNAME_RELEASE"
-	exit ;;
-    mips:*:*:UMIPS | mips:*:*:RISCos)
-	set_cc_for_build
-	sed 's/^	//' << EOF > "$dummy.c"
-#ifdef __cplusplus
-#include <stdio.h>  /* for printf() prototype */
-	int main (int argc, char *argv[]) {
-#else
-	int main (argc, argv) int argc; char *argv[]; {
-#endif
-	#if defined (host_mips) && defined (MIPSEB)
-	#if defined (SYSTYPE_SYSV)
-	  printf ("mips-mips-riscos%ssysv\\n", argv[1]); exit (0);
-	#endif
-	#if defined (SYSTYPE_SVR4)
-	  printf ("mips-mips-riscos%ssvr4\\n", argv[1]); exit (0);
-	#endif
-	#if defined (SYSTYPE_BSD43) || defined(SYSTYPE_BSD)
-	  printf ("mips-mips-riscos%sbsd\\n", argv[1]); exit (0);
-	#endif
-	#endif
-	  exit (-1);
-	}
-EOF
-	$CC_FOR_BUILD -o "$dummy" "$dummy.c" &&
-	  dummyarg=$(echo "$UNAME_RELEASE" | sed -n 's/\([0-9]*\).*/\1/p') &&
-	  SYSTEM_NAME=$("$dummy" "$dummyarg") &&
-	    { echo "$SYSTEM_NAME"; exit; }
-	echo mips-mips-riscos"$UNAME_RELEASE"
-	exit ;;
-    Motorola:PowerMAX_OS:*:*)
-	echo powerpc-motorola-powermax
-	exit ;;
-    Motorola:*:4.3:PL8-*)
-	echo powerpc-harris-powermax
-	exit ;;
-    Night_Hawk:*:*:PowerMAX_OS | Synergy:PowerMAX_OS:*:*)
-	echo powerpc-harris-powermax
-	exit ;;
-    Night_Hawk:Power_UNIX:*:*)
-	echo powerpc-harris-powerunix
-	exit ;;
-    m88k:CX/UX:7*:*)
-	echo m88k-harris-cxux7
-	exit ;;
-    m88k:*:4*:R4*)
-	echo m88k-motorola-sysv4
-	exit ;;
-    m88k:*:3*:R3*)
-	echo m88k-motorola-sysv3
-	exit ;;
-    AViiON:dgux:*:*)
-	# DG/UX returns AViiON for all architectures
-	UNAME_PROCESSOR=$(/usr/bin/uname -p)
-	if test "$UNAME_PROCESSOR" = mc88100 || test "$UNAME_PROCESSOR" = mc88110
-	then
-	    if test "$TARGET_BINARY_INTERFACE"x = m88kdguxelfx || \
-	       test "$TARGET_BINARY_INTERFACE"x = x
-	    then
-		echo m88k-dg-dgux"$UNAME_RELEASE"
-	    else
-		echo m88k-dg-dguxbcs"$UNAME_RELEASE"
-	    fi
-	else
-	    echo i586-dg-dgux"$UNAME_RELEASE"
-	fi
-	exit ;;
-    M88*:DolphinOS:*:*)	# DolphinOS (SVR3)
-	echo m88k-dolphin-sysv3
-	exit ;;
-    M88*:*:R3*:*)
-	# Delta 88k system running SVR3
-	echo m88k-motorola-sysv3
-	exit ;;
-    XD88*:*:*:*) # Tektronix XD88 system running UTekV (SVR3)
-	echo m88k-tektronix-sysv3
-	exit ;;
-    Tek43[0-9][0-9]:UTek:*:*) # Tektronix 4300 system running UTek (BSD)
-	echo m68k-tektronix-bsd
-	exit ;;
-    *:IRIX*:*:*)
-	echo mips-sgi-irix"$(echo "$UNAME_RELEASE"|sed -e 's/-/_/g')"
-	exit ;;
-    ????????:AIX?:[12].1:2)   # AIX 2.2.1 or AIX 2.1.1 is RT/PC AIX.
-	echo romp-ibm-aix     # uname -m gives an 8 hex-code CPU id
-	exit ;;               # Note that: echo "'$(uname -s)'" gives 'AIX '
-    i*86:AIX:*:*)
-	echo i386-ibm-aix
-	exit ;;
-    ia64:AIX:*:*)
-	if test -x /usr/bin/oslevel ; then
-		IBM_REV=$(/usr/bin/oslevel)
-	else
-		IBM_REV="$UNAME_VERSION.$UNAME_RELEASE"
-	fi
-	echo "$UNAME_MACHINE"-ibm-aix"$IBM_REV"
-	exit ;;
-    *:AIX:2:3)
-	if grep bos325 /usr/include/stdio.h >/dev/null 2>&1; then
-		set_cc_for_build
-		sed 's/^		//' << EOF > "$dummy.c"
-		#include <sys/systemcfg.h>
-
-		main()
-			{
-			if (!__power_pc())
-				exit(1);
-			puts("powerpc-ibm-aix3.2.5");
-			exit(0);
-			}
-EOF
-		if $CC_FOR_BUILD -o "$dummy" "$dummy.c" && SYSTEM_NAME=$("$dummy")
-		then
-			echo "$SYSTEM_NAME"
-		else
-			echo rs6000-ibm-aix3.2.5
-		fi
-	elif grep bos324 /usr/include/stdio.h >/dev/null 2>&1; then
-		echo rs6000-ibm-aix3.2.4
-	else
-		echo rs6000-ibm-aix3.2
-	fi
-	exit ;;
-    *:AIX:*:[4567])
-	IBM_CPU_ID=$(/usr/sbin/lsdev -C -c processor -S available | sed 1q | awk '{ print $1 }')
-	if /usr/sbin/lsattr -El "$IBM_CPU_ID" | grep ' POWER' >/dev/null 2>&1; then
-		IBM_ARCH=rs6000
-	else
-		IBM_ARCH=powerpc
-	fi
-	if test -x /usr/bin/lslpp ; then
-		IBM_REV=$(/usr/bin/lslpp -Lqc bos.rte.libc |
-			   awk -F: '{ print $3 }' | sed s/[0-9]*$/0/)
-	else
-		IBM_REV="$UNAME_VERSION.$UNAME_RELEASE"
-	fi
-	echo "$IBM_ARCH"-ibm-aix"$IBM_REV"
-	exit ;;
-    *:AIX:*:*)
-	echo rs6000-ibm-aix
-	exit ;;
-    ibmrt:4.4BSD:*|romp-ibm:4.4BSD:*)
-	echo romp-ibm-bsd4.4
-	exit ;;
-    ibmrt:*BSD:*|romp-ibm:BSD:*)            # covers RT/PC BSD and
-	echo romp-ibm-bsd"$UNAME_RELEASE"   # 4.3 with uname added to
-	exit ;;                             # report: romp-ibm BSD 4.3
-    *:BOSX:*:*)
-	echo rs6000-bull-bosx
-	exit ;;
-    DPX/2?00:B.O.S.:*:*)
-	echo m68k-bull-sysv3
-	exit ;;
-    9000/[34]??:4.3bsd:1.*:*)
-	echo m68k-hp-bsd
-	exit ;;
-    hp300:4.4BSD:*:* | 9000/[34]??:4.3bsd:2.*:*)
-	echo m68k-hp-bsd4.4
-	exit ;;
-    9000/[34678]??:HP-UX:*:*)
-	HPUX_REV=$(echo "$UNAME_RELEASE"|sed -e 's/[^.]*.[0B]*//')
-	case "$UNAME_MACHINE" in
-	    9000/31?)            HP_ARCH=m68000 ;;
-	    9000/[34]??)         HP_ARCH=m68k ;;
-	    9000/[678][0-9][0-9])
-		if test -x /usr/bin/getconf; then
-		    sc_cpu_version=$(/usr/bin/getconf SC_CPU_VERSION 2>/dev/null)
-		    sc_kernel_bits=$(/usr/bin/getconf SC_KERNEL_BITS 2>/dev/null)
-		    case "$sc_cpu_version" in
-		      523) HP_ARCH=hppa1.0 ;; # CPU_PA_RISC1_0
-		      528) HP_ARCH=hppa1.1 ;; # CPU_PA_RISC1_1
-		      532)                      # CPU_PA_RISC2_0
-			case "$sc_kernel_bits" in
-			  32) HP_ARCH=hppa2.0n ;;
-			  64) HP_ARCH=hppa2.0w ;;
-			  '') HP_ARCH=hppa2.0 ;;   # HP-UX 10.20
-			esac ;;
-		    esac
-		fi
-		if test "$HP_ARCH" = ""; then
-		    set_cc_for_build
-		    sed 's/^		//' << EOF > "$dummy.c"
-
-		#define _HPUX_SOURCE
-		#include <stdlib.h>
-		#include <unistd.h>
-
-		int main ()
-		{
-		#if defined(_SC_KERNEL_BITS)
-		    long bits = sysconf(_SC_KERNEL_BITS);
-		#endif
-		    long cpu  = sysconf (_SC_CPU_VERSION);
-
-		    switch (cpu)
-			{
-			case CPU_PA_RISC1_0: puts ("hppa1.0"); break;
-			case CPU_PA_RISC1_1: puts ("hppa1.1"); break;
-			case CPU_PA_RISC2_0:
-		#if defined(_SC_KERNEL_BITS)
-			    switch (bits)
-				{
-				case 64: puts ("hppa2.0w"); break;
-				case 32: puts ("hppa2.0n"); break;
-				default: puts ("hppa2.0"); break;
-				} break;
-		#else  /* !defined(_SC_KERNEL_BITS) */
-			    puts ("hppa2.0"); break;
-		#endif
-			default: puts ("hppa1.0"); break;
-			}
-		    exit (0);
-		}
-EOF
-		    (CCOPTS="" $CC_FOR_BUILD -o "$dummy" "$dummy.c" 2>/dev/null) && HP_ARCH=$("$dummy")
-		    test -z "$HP_ARCH" && HP_ARCH=hppa
-		fi ;;
-	esac
-	if test "$HP_ARCH" = hppa2.0w
-	then
-	    set_cc_for_build
-
-	    # hppa2.0w-hp-hpux* has a 64-bit kernel and a compiler generating
-	    # 32-bit code.  hppa64-hp-hpux* has the same kernel and a compiler
-	    # generating 64-bit code.  GNU and HP use different nomenclature:
-	    #
-	    # $ CC_FOR_BUILD=cc ./config.guess
-	    # => hppa2.0w-hp-hpux11.23
-	    # $ CC_FOR_BUILD="cc +DA2.0w" ./config.guess
-	    # => hppa64-hp-hpux11.23
-
-	    if echo __LP64__ | (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) |
-		grep -q __LP64__
-	    then
-		HP_ARCH=hppa2.0w
-	    else
-		HP_ARCH=hppa64
-	    fi
-	fi
-	echo "$HP_ARCH"-hp-hpux"$HPUX_REV"
-	exit ;;
-    ia64:HP-UX:*:*)
-	HPUX_REV=$(echo "$UNAME_RELEASE"|sed -e 's/[^.]*.[0B]*//')
-	echo ia64-hp-hpux"$HPUX_REV"
-	exit ;;
-    3050*:HI-UX:*:*)
-	set_cc_for_build
-	sed 's/^	//' << EOF > "$dummy.c"
-	#include <unistd.h>
-	int
-	main ()
-	{
-	  long cpu = sysconf (_SC_CPU_VERSION);
-	  /* The order matters, because CPU_IS_HP_MC68K erroneously returns
-	     true for CPU_PA_RISC1_0.  CPU_IS_PA_RISC returns correct
-	     results, however.  */
-	  if (CPU_IS_PA_RISC (cpu))
-	    {
-	      switch (cpu)
-		{
-		  case CPU_PA_RISC1_0: puts ("hppa1.0-hitachi-hiuxwe2"); break;
-		  case CPU_PA_RISC1_1: puts ("hppa1.1-hitachi-hiuxwe2"); break;
-		  case CPU_PA_RISC2_0: puts ("hppa2.0-hitachi-hiuxwe2"); break;
-		  default: puts ("hppa-hitachi-hiuxwe2"); break;
-		}
-	    }
-	  else if (CPU_IS_HP_MC68K (cpu))
-	    puts ("m68k-hitachi-hiuxwe2");
-	  else puts ("unknown-hitachi-hiuxwe2");
-	  exit (0);
-	}
-EOF
-	$CC_FOR_BUILD -o "$dummy" "$dummy.c" && SYSTEM_NAME=$("$dummy") &&
-		{ echo "$SYSTEM_NAME"; exit; }
-	echo unknown-hitachi-hiuxwe2
-	exit ;;
-    9000/7??:4.3bsd:*:* | 9000/8?[79]:4.3bsd:*:*)
-	echo hppa1.1-hp-bsd
-	exit ;;
-    9000/8??:4.3bsd:*:*)
-	echo hppa1.0-hp-bsd
-	exit ;;
-    *9??*:MPE/iX:*:* | *3000*:MPE/iX:*:*)
-	echo hppa1.0-hp-mpeix
-	exit ;;
-    hp7??:OSF1:*:* | hp8?[79]:OSF1:*:*)
-	echo hppa1.1-hp-osf
-	exit ;;
-    hp8??:OSF1:*:*)
-	echo hppa1.0-hp-osf
-	exit ;;
-    i*86:OSF1:*:*)
-	if test -x /usr/sbin/sysversion ; then
-	    echo "$UNAME_MACHINE"-unknown-osf1mk
-	else
-	    echo "$UNAME_MACHINE"-unknown-osf1
-	fi
-	exit ;;
-    parisc*:Lites*:*:*)
-	echo hppa1.1-hp-lites
-	exit ;;
-    C1*:ConvexOS:*:* | convex:ConvexOS:C1*:*)
-	echo c1-convex-bsd
-	exit ;;
-    C2*:ConvexOS:*:* | convex:ConvexOS:C2*:*)
-	if getsysinfo -f scalar_acc
-	then echo c32-convex-bsd
-	else echo c2-convex-bsd
-	fi
-	exit ;;
-    C34*:ConvexOS:*:* | convex:ConvexOS:C34*:*)
-	echo c34-convex-bsd
-	exit ;;
-    C38*:ConvexOS:*:* | convex:ConvexOS:C38*:*)
-	echo c38-convex-bsd
-	exit ;;
-    C4*:ConvexOS:*:* | convex:ConvexOS:C4*:*)
-	echo c4-convex-bsd
-	exit ;;
-    CRAY*Y-MP:*:*:*)
-	echo ymp-cray-unicos"$UNAME_RELEASE" | sed -e 's/\.[^.]*$/.X/'
-	exit ;;
-    CRAY*[A-Z]90:*:*:*)
-	echo "$UNAME_MACHINE"-cray-unicos"$UNAME_RELEASE" \
-	| sed -e 's/CRAY.*\([A-Z]90\)/\1/' \
-	      -e y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/ \
-	      -e 's/\.[^.]*$/.X/'
-	exit ;;
-    CRAY*TS:*:*:*)
-	echo t90-cray-unicos"$UNAME_RELEASE" | sed -e 's/\.[^.]*$/.X/'
-	exit ;;
-    CRAY*T3E:*:*:*)
-	echo alphaev5-cray-unicosmk"$UNAME_RELEASE" | sed -e 's/\.[^.]*$/.X/'
-	exit ;;
-    CRAY*SV1:*:*:*)
-	echo sv1-cray-unicos"$UNAME_RELEASE" | sed -e 's/\.[^.]*$/.X/'
-	exit ;;
-    *:UNICOS/mp:*:*)
-	echo craynv-cray-unicosmp"$UNAME_RELEASE" | sed -e 's/\.[^.]*$/.X/'
-	exit ;;
-    F30[01]:UNIX_System_V:*:* | F700:UNIX_System_V:*:*)
-	FUJITSU_PROC=$(uname -m | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz)
-	FUJITSU_SYS=$(uname -p | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz | sed -e 's/\///')
-	FUJITSU_REL=$(echo "$UNAME_RELEASE" | sed -e 's/ /_/')
-	echo "${FUJITSU_PROC}-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}"
-	exit ;;
-    5000:UNIX_System_V:4.*:*)
-	FUJITSU_SYS=$(uname -p | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz | sed -e 's/\///')
-	FUJITSU_REL=$(echo "$UNAME_RELEASE" | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz | sed -e 's/ /_/')
-	echo "sparc-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}"
-	exit ;;
-    i*86:BSD/386:*:* | i*86:BSD/OS:*:* | *:Ascend\ Embedded/OS:*:*)
-	echo "$UNAME_MACHINE"-pc-bsdi"$UNAME_RELEASE"
-	exit ;;
-    sparc*:BSD/OS:*:*)
-	echo sparc-unknown-bsdi"$UNAME_RELEASE"
-	exit ;;
-    *:BSD/OS:*:*)
-	echo "$UNAME_MACHINE"-unknown-bsdi"$UNAME_RELEASE"
-	exit ;;
-    arm:FreeBSD:*:*)
-	UNAME_PROCESSOR=$(uname -p)
-	set_cc_for_build
-	if echo __ARM_PCS_VFP | $CC_FOR_BUILD -E - 2>/dev/null \
-	    | grep -q __ARM_PCS_VFP
-	then
-	    echo "${UNAME_PROCESSOR}"-unknown-freebsd"$(echo ${UNAME_RELEASE}|sed -e 's/[-(].*//')"-gnueabi
-	else
-	    echo "${UNAME_PROCESSOR}"-unknown-freebsd"$(echo ${UNAME_RELEASE}|sed -e 's/[-(].*//')"-gnueabihf
-	fi
-	exit ;;
-    *:FreeBSD:*:*)
-	UNAME_PROCESSOR=$(/usr/bin/uname -p)
-	case "$UNAME_PROCESSOR" in
-	    amd64)
-		UNAME_PROCESSOR=x86_64 ;;
-	    i386)
-		UNAME_PROCESSOR=i586 ;;
-	esac
-	echo "$UNAME_PROCESSOR"-unknown-freebsd"$(echo "$UNAME_RELEASE"|sed -e 's/[-(].*//')"
-	exit ;;
-    i*:CYGWIN*:*)
-	echo "$UNAME_MACHINE"-pc-cygwin
-	exit ;;
-    *:MINGW64*:*)
-	echo "$UNAME_MACHINE"-pc-mingw64
-	exit ;;
-    *:MINGW*:*)
-	echo "$UNAME_MACHINE"-pc-mingw32
-	exit ;;
-    *:MSYS*:*)
-	echo "$UNAME_MACHINE"-pc-msys
-	exit ;;
-    i*:PW*:*)
-	echo "$UNAME_MACHINE"-pc-pw32
-	exit ;;
-    *:Interix*:*)
-	case "$UNAME_MACHINE" in
-	    x86)
-		echo i586-pc-interix"$UNAME_RELEASE"
-		exit ;;
-	    authenticamd | genuineintel | EM64T)
-		echo x86_64-unknown-interix"$UNAME_RELEASE"
-		exit ;;
-	    IA64)
-		echo ia64-unknown-interix"$UNAME_RELEASE"
-		exit ;;
-	esac ;;
-    i*:UWIN*:*)
-	echo "$UNAME_MACHINE"-pc-uwin
-	exit ;;
-    amd64:CYGWIN*:*:* | x86_64:CYGWIN*:*:*)
-	echo x86_64-pc-cygwin
-	exit ;;
-    prep*:SunOS:5.*:*)
-	echo powerpcle-unknown-solaris2"$(echo "$UNAME_RELEASE"|sed -e 's/[^.]*//')"
-	exit ;;
-    *:GNU:*:*)
-	# the GNU system
-	echo "$(echo "$UNAME_MACHINE"|sed -e 's,[-/].*$,,')-unknown-$LIBC$(echo "$UNAME_RELEASE"|sed -e 's,/.*$,,')"
-	exit ;;
-    *:GNU/*:*:*)
-	# other systems with GNU libc and userland
-	echo "$UNAME_MACHINE-unknown-$(echo "$UNAME_SYSTEM" | sed 's,^[^/]*/,,' | tr "[:upper:]" "[:lower:]")$(echo "$UNAME_RELEASE"|sed -e 's/[-(].*//')-$LIBC"
-	exit ;;
-    *:Minix:*:*)
-	echo "$UNAME_MACHINE"-unknown-minix
-	exit ;;
-    aarch64:Linux:*:*)
-	echo "$UNAME_MACHINE"-unknown-linux-"$LIBC"
-	exit ;;
-    aarch64_be:Linux:*:*)
-	UNAME_MACHINE=aarch64_be
-	echo "$UNAME_MACHINE"-unknown-linux-"$LIBC"
-	exit ;;
-    alpha:Linux:*:*)
-	case $(sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' /proc/cpuinfo 2>/dev/null) in
-	  EV5)   UNAME_MACHINE=alphaev5 ;;
-	  EV56)  UNAME_MACHINE=alphaev56 ;;
-	  PCA56) UNAME_MACHINE=alphapca56 ;;
-	  PCA57) UNAME_MACHINE=alphapca56 ;;
-	  EV6)   UNAME_MACHINE=alphaev6 ;;
-	  EV67)  UNAME_MACHINE=alphaev67 ;;
-	  EV68*) UNAME_MACHINE=alphaev68 ;;
-	esac
-	objdump --private-headers /bin/sh | grep -q ld.so.1
-	if test "$?" = 0 ; then LIBC=gnulibc1 ; fi
-	echo "$UNAME_MACHINE"-unknown-linux-"$LIBC"
-	exit ;;
-    arc:Linux:*:* | arceb:Linux:*:*)
-	echo "$UNAME_MACHINE"-unknown-linux-"$LIBC"
-	exit ;;
-    arm*:Linux:*:*)
-	set_cc_for_build
-	if echo __ARM_EABI__ | $CC_FOR_BUILD -E - 2>/dev/null \
-	    | grep -q __ARM_EABI__
-	then
-	    echo "$UNAME_MACHINE"-unknown-linux-"$LIBC"
-	else
-	    if echo __ARM_PCS_VFP | $CC_FOR_BUILD -E - 2>/dev/null \
-		| grep -q __ARM_PCS_VFP
-	    then
-		echo "$UNAME_MACHINE"-unknown-linux-"$LIBC"eabi
-	    else
-		echo "$UNAME_MACHINE"-unknown-linux-"$LIBC"eabihf
-	    fi
-	fi
-	exit ;;
-    avr32*:Linux:*:*)
-	echo "$UNAME_MACHINE"-unknown-linux-"$LIBC"
-	exit ;;
-    cris:Linux:*:*)
-	echo "$UNAME_MACHINE"-axis-linux-"$LIBC"
-	exit ;;
-    crisv32:Linux:*:*)
-	echo "$UNAME_MACHINE"-axis-linux-"$LIBC"
-	exit ;;
-    e2k:Linux:*:*)
-	echo "$UNAME_MACHINE"-unknown-linux-"$LIBC"
-	exit ;;
-    frv:Linux:*:*)
-	echo "$UNAME_MACHINE"-unknown-linux-"$LIBC"
-	exit ;;
-    hexagon:Linux:*:*)
-	echo "$UNAME_MACHINE"-unknown-linux-"$LIBC"
-	exit ;;
-    i*86:Linux:*:*)
-	echo "$UNAME_MACHINE"-pc-linux-"$LIBC"
-	exit ;;
-    ia64:Linux:*:*)
-	echo "$UNAME_MACHINE"-unknown-linux-"$LIBC"
-	exit ;;
-    k1om:Linux:*:*)
-	echo "$UNAME_MACHINE"-unknown-linux-"$LIBC"
-	exit ;;
-    loongarch32:Linux:*:* | loongarch64:Linux:*:* | loongarchx32:Linux:*:*)
-	echo "$UNAME_MACHINE"-unknown-linux-"$LIBC"
-	exit ;;
-    m32r*:Linux:*:*)
-	echo "$UNAME_MACHINE"-unknown-linux-"$LIBC"
-	exit ;;
-    m68*:Linux:*:*)
-	echo "$UNAME_MACHINE"-unknown-linux-"$LIBC"
-	exit ;;
-    mips:Linux:*:* | mips64:Linux:*:*)
-	set_cc_for_build
-	IS_GLIBC=0
-	test x"${LIBC}" = xgnu && IS_GLIBC=1
-	sed 's/^	//' << EOF > "$dummy.c"
-	#undef CPU
-	#undef mips
-	#undef mipsel
-	#undef mips64
-	#undef mips64el
-	#if ${IS_GLIBC} && defined(_ABI64)
-	LIBCABI=gnuabi64
-	#else
-	#if ${IS_GLIBC} && defined(_ABIN32)
-	LIBCABI=gnuabin32
-	#else
-	LIBCABI=${LIBC}
-	#endif
-	#endif
-
-	#if ${IS_GLIBC} && defined(__mips64) && defined(__mips_isa_rev) && __mips_isa_rev>=6
-	CPU=mipsisa64r6
-	#else
-	#if ${IS_GLIBC} && !defined(__mips64) && defined(__mips_isa_rev) && __mips_isa_rev>=6
-	CPU=mipsisa32r6
-	#else
-	#if defined(__mips64)
-	CPU=mips64
-	#else
-	CPU=mips
-	#endif
-	#endif
-	#endif
-
-	#if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL)
-	MIPS_ENDIAN=el
-	#else
-	#if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB)
-	MIPS_ENDIAN=
-	#else
-	MIPS_ENDIAN=
-	#endif
-	#endif
-EOF
-	eval "$($CC_FOR_BUILD -E "$dummy.c" 2>/dev/null | grep '^CPU\|^MIPS_ENDIAN\|^LIBCABI')"
-	test "x$CPU" != x && { echo "$CPU${MIPS_ENDIAN}-unknown-linux-$LIBCABI"; exit; }
-	;;
-    mips64el:Linux:*:*)
-	echo "$UNAME_MACHINE"-unknown-linux-"$LIBC"
-	exit ;;
-    openrisc*:Linux:*:*)
-	echo or1k-unknown-linux-"$LIBC"
-	exit ;;
-    or32:Linux:*:* | or1k*:Linux:*:*)
-	echo "$UNAME_MACHINE"-unknown-linux-"$LIBC"
-	exit ;;
-    padre:Linux:*:*)
-	echo sparc-unknown-linux-"$LIBC"
-	exit ;;
-    parisc64:Linux:*:* | hppa64:Linux:*:*)
-	echo hppa64-unknown-linux-"$LIBC"
-	exit ;;
-    parisc:Linux:*:* | hppa:Linux:*:*)
-	# Look for CPU level
-	case $(grep '^cpu[^a-z]*:' /proc/cpuinfo 2>/dev/null | cut -d' ' -f2) in
-	  PA7*) echo hppa1.1-unknown-linux-"$LIBC" ;;
-	  PA8*) echo hppa2.0-unknown-linux-"$LIBC" ;;
-	  *)    echo hppa-unknown-linux-"$LIBC" ;;
-	esac
-	exit ;;
-    ppc64:Linux:*:*)
-	echo powerpc64-unknown-linux-"$LIBC"
-	exit ;;
-    ppc:Linux:*:*)
-	echo powerpc-unknown-linux-"$LIBC"
-	exit ;;
-    ppc64le:Linux:*:*)
-	echo powerpc64le-unknown-linux-"$LIBC"
-	exit ;;
-    ppcle:Linux:*:*)
-	echo powerpcle-unknown-linux-"$LIBC"
-	exit ;;
-    riscv32:Linux:*:* | riscv32be:Linux:*:* | riscv64:Linux:*:* | riscv64be:Linux:*:*)
-	echo "$UNAME_MACHINE"-unknown-linux-"$LIBC"
-	exit ;;
-    s390:Linux:*:* | s390x:Linux:*:*)
-	echo "$UNAME_MACHINE"-ibm-linux-"$LIBC"
-	exit ;;
-    sh64*:Linux:*:*)
-	echo "$UNAME_MACHINE"-unknown-linux-"$LIBC"
-	exit ;;
-    sh*:Linux:*:*)
-	echo "$UNAME_MACHINE"-unknown-linux-"$LIBC"
-	exit ;;
-    sparc:Linux:*:* | sparc64:Linux:*:*)
-	echo "$UNAME_MACHINE"-unknown-linux-"$LIBC"
-	exit ;;
-    tile*:Linux:*:*)
-	echo "$UNAME_MACHINE"-unknown-linux-"$LIBC"
-	exit ;;
-    vax:Linux:*:*)
-	echo "$UNAME_MACHINE"-dec-linux-"$LIBC"
-	exit ;;
-    x86_64:Linux:*:*)
-	set_cc_for_build
-	LIBCABI=$LIBC
-	if test "$CC_FOR_BUILD" != no_compiler_found; then
-	    if (echo '#ifdef __ILP32__'; echo IS_X32; echo '#endif') | \
-		(CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \
-		grep IS_X32 >/dev/null
-	    then
-		LIBCABI="$LIBC"x32
-	    fi
-	fi
-	echo "$UNAME_MACHINE"-pc-linux-"$LIBCABI"
-	exit ;;
-    xtensa*:Linux:*:*)
-	echo "$UNAME_MACHINE"-unknown-linux-"$LIBC"
-	exit ;;
-    i*86:DYNIX/ptx:4*:*)
-	# ptx 4.0 does uname -s correctly, with DYNIX/ptx in there.
-	# earlier versions are messed up and put the nodename in both
-	# sysname and nodename.
-	echo i386-sequent-sysv4
-	exit ;;
-    i*86:UNIX_SV:4.2MP:2.*)
-	# Unixware is an offshoot of SVR4, but it has its own version
-	# number series starting with 2...
-	# I am not positive that other SVR4 systems won't match this,
-	# I just have to hope.  -- rms.
-	# Use sysv4.2uw... so that sysv4* matches it.
-	echo "$UNAME_MACHINE"-pc-sysv4.2uw"$UNAME_VERSION"
-	exit ;;
-    i*86:OS/2:*:*)
-	# If we were able to find `uname', then EMX Unix compatibility
-	# is probably installed.
-	echo "$UNAME_MACHINE"-pc-os2-emx
-	exit ;;
-    i*86:XTS-300:*:STOP)
-	echo "$UNAME_MACHINE"-unknown-stop
-	exit ;;
-    i*86:atheos:*:*)
-	echo "$UNAME_MACHINE"-unknown-atheos
-	exit ;;
-    i*86:syllable:*:*)
-	echo "$UNAME_MACHINE"-pc-syllable
-	exit ;;
-    i*86:LynxOS:2.*:* | i*86:LynxOS:3.[01]*:* | i*86:LynxOS:4.[02]*:*)
-	echo i386-unknown-lynxos"$UNAME_RELEASE"
-	exit ;;
-    i*86:*DOS:*:*)
-	echo "$UNAME_MACHINE"-pc-msdosdjgpp
-	exit ;;
-    i*86:*:4.*:*)
-	UNAME_REL=$(echo "$UNAME_RELEASE" | sed 's/\/MP$//')
-	if grep Novell /usr/include/link.h >/dev/null 2>/dev/null; then
-		echo "$UNAME_MACHINE"-univel-sysv"$UNAME_REL"
-	else
-		echo "$UNAME_MACHINE"-pc-sysv"$UNAME_REL"
-	fi
-	exit ;;
-    i*86:*:5:[678]*)
-	# UnixWare 7.x, OpenUNIX and OpenServer 6.
-	case $(/bin/uname -X | grep "^Machine") in
-	    *486*)	     UNAME_MACHINE=i486 ;;
-	    *Pentium)	     UNAME_MACHINE=i586 ;;
-	    *Pent*|*Celeron) UNAME_MACHINE=i686 ;;
-	esac
-	echo "$UNAME_MACHINE-unknown-sysv${UNAME_RELEASE}${UNAME_SYSTEM}${UNAME_VERSION}"
-	exit ;;
-    i*86:*:3.2:*)
-	if test -f /usr/options/cb.name; then
-		UNAME_REL=$(sed -n 's/.*Version //p' </usr/options/cb.name)
-		echo "$UNAME_MACHINE"-pc-isc"$UNAME_REL"
-	elif /bin/uname -X 2>/dev/null >/dev/null ; then
-		UNAME_REL=$( (/bin/uname -X|grep Release|sed -e 's/.*= //'))
-		(/bin/uname -X|grep i80486 >/dev/null) && UNAME_MACHINE=i486
-		(/bin/uname -X|grep '^Machine.*Pentium' >/dev/null) \
-			&& UNAME_MACHINE=i586
-		(/bin/uname -X|grep '^Machine.*Pent *II' >/dev/null) \
-			&& UNAME_MACHINE=i686
-		(/bin/uname -X|grep '^Machine.*Pentium Pro' >/dev/null) \
-			&& UNAME_MACHINE=i686
-		echo "$UNAME_MACHINE"-pc-sco"$UNAME_REL"
-	else
-		echo "$UNAME_MACHINE"-pc-sysv32
-	fi
-	exit ;;
-    pc:*:*:*)
-	# Left here for compatibility:
-	# uname -m prints for DJGPP always 'pc', but it prints nothing about
-	# the processor, so we play safe by assuming i586.
-	# Note: whatever this is, it MUST be the same as what config.sub
-	# prints for the "djgpp" host, or else GDB configure will decide that
-	# this is a cross-build.
-	echo i586-pc-msdosdjgpp
-	exit ;;
-    Intel:Mach:3*:*)
-	echo i386-pc-mach3
-	exit ;;
-    paragon:*:*:*)
-	echo i860-intel-osf1
-	exit ;;
-    i860:*:4.*:*) # i860-SVR4
-	if grep Stardent /usr/include/sys/uadmin.h >/dev/null 2>&1 ; then
-	  echo i860-stardent-sysv"$UNAME_RELEASE" # Stardent Vistra i860-SVR4
-	else # Add other i860-SVR4 vendors below as they are discovered.
-	  echo i860-unknown-sysv"$UNAME_RELEASE"  # Unknown i860-SVR4
-	fi
-	exit ;;
-    mini*:CTIX:SYS*5:*)
-	# "miniframe"
-	echo m68010-convergent-sysv
-	exit ;;
-    mc68k:UNIX:SYSTEM5:3.51m)
-	echo m68k-convergent-sysv
-	exit ;;
-    M680?0:D-NIX:5.3:*)
-	echo m68k-diab-dnix
-	exit ;;
-    M68*:*:R3V[5678]*:*)
-	test -r /sysV68 && { echo 'm68k-motorola-sysv'; exit; } ;;
-    3[345]??:*:4.0:3.0 | 3[34]??A:*:4.0:3.0 | 3[34]??,*:*:4.0:3.0 | 3[34]??/*:*:4.0:3.0 | 4400:*:4.0:3.0 | 4850:*:4.0:3.0 | SKA40:*:4.0:3.0 | SDS2:*:4.0:3.0 | SHG2:*:4.0:3.0 | S7501*:*:4.0:3.0)
-	OS_REL=''
-	test -r /etc/.relid \
-	&& OS_REL=.$(sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid)
-	/bin/uname -p 2>/dev/null | grep 86 >/dev/null \
-	  && { echo i486-ncr-sysv4.3"$OS_REL"; exit; }
-	/bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \
-	  && { echo i586-ncr-sysv4.3"$OS_REL"; exit; } ;;
-    3[34]??:*:4.0:* | 3[34]??,*:*:4.0:*)
-	/bin/uname -p 2>/dev/null | grep 86 >/dev/null \
-	  && { echo i486-ncr-sysv4; exit; } ;;
-    NCR*:*:4.2:* | MPRAS*:*:4.2:*)
-	OS_REL='.3'
-	test -r /etc/.relid \
-	    && OS_REL=.$(sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid)
-	/bin/uname -p 2>/dev/null | grep 86 >/dev/null \
-	    && { echo i486-ncr-sysv4.3"$OS_REL"; exit; }
-	/bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \
-	    && { echo i586-ncr-sysv4.3"$OS_REL"; exit; }
-	/bin/uname -p 2>/dev/null | /bin/grep pteron >/dev/null \
-	    && { echo i586-ncr-sysv4.3"$OS_REL"; exit; } ;;
-    m68*:LynxOS:2.*:* | m68*:LynxOS:3.0*:*)
-	echo m68k-unknown-lynxos"$UNAME_RELEASE"
-	exit ;;
-    mc68030:UNIX_System_V:4.*:*)
-	echo m68k-atari-sysv4
-	exit ;;
-    TSUNAMI:LynxOS:2.*:*)
-	echo sparc-unknown-lynxos"$UNAME_RELEASE"
-	exit ;;
-    rs6000:LynxOS:2.*:*)
-	echo rs6000-unknown-lynxos"$UNAME_RELEASE"
-	exit ;;
-    PowerPC:LynxOS:2.*:* | PowerPC:LynxOS:3.[01]*:* | PowerPC:LynxOS:4.[02]*:*)
-	echo powerpc-unknown-lynxos"$UNAME_RELEASE"
-	exit ;;
-    SM[BE]S:UNIX_SV:*:*)
-	echo mips-dde-sysv"$UNAME_RELEASE"
-	exit ;;
-    RM*:ReliantUNIX-*:*:*)
-	echo mips-sni-sysv4
-	exit ;;
-    RM*:SINIX-*:*:*)
-	echo mips-sni-sysv4
-	exit ;;
-    *:SINIX-*:*:*)
-	if uname -p 2>/dev/null >/dev/null ; then
-		UNAME_MACHINE=$( (uname -p) 2>/dev/null)
-		echo "$UNAME_MACHINE"-sni-sysv4
-	else
-		echo ns32k-sni-sysv
-	fi
-	exit ;;
-    PENTIUM:*:4.0*:*)	# Unisys `ClearPath HMP IX 4000' SVR4/MP effort
-			# says <Richard.M.Bartel@ccMail.Census.GOV>
-	echo i586-unisys-sysv4
-	exit ;;
-    *:UNIX_System_V:4*:FTX*)
-	# From Gerald Hewes <hewes@openmarket.com>.
-	# How about differentiating between stratus architectures? -djm
-	echo hppa1.1-stratus-sysv4
-	exit ;;
-    *:*:*:FTX*)
-	# From seanf@swdc.stratus.com.
-	echo i860-stratus-sysv4
-	exit ;;
-    i*86:VOS:*:*)
-	# From Paul.Green@stratus.com.
-	echo "$UNAME_MACHINE"-stratus-vos
-	exit ;;
-    *:VOS:*:*)
-	# From Paul.Green@stratus.com.
-	echo hppa1.1-stratus-vos
-	exit ;;
-    mc68*:A/UX:*:*)
-	echo m68k-apple-aux"$UNAME_RELEASE"
-	exit ;;
-    news*:NEWS-OS:6*:*)
-	echo mips-sony-newsos6
-	exit ;;
-    R[34]000:*System_V*:*:* | R4000:UNIX_SYSV:*:* | R*000:UNIX_SV:*:*)
-	if test -d /usr/nec; then
-		echo mips-nec-sysv"$UNAME_RELEASE"
-	else
-		echo mips-unknown-sysv"$UNAME_RELEASE"
-	fi
-	exit ;;
-    BeBox:BeOS:*:*)	# BeOS running on hardware made by Be, PPC only.
-	echo powerpc-be-beos
-	exit ;;
-    BeMac:BeOS:*:*)	# BeOS running on Mac or Mac clone, PPC only.
-	echo powerpc-apple-beos
-	exit ;;
-    BePC:BeOS:*:*)	# BeOS running on Intel PC compatible.
-	echo i586-pc-beos
-	exit ;;
-    BePC:Haiku:*:*)	# Haiku running on Intel PC compatible.
-	echo i586-pc-haiku
-	exit ;;
-    x86_64:Haiku:*:*)
-	echo x86_64-unknown-haiku
-	exit ;;
-    SX-4:SUPER-UX:*:*)
-	echo sx4-nec-superux"$UNAME_RELEASE"
-	exit ;;
-    SX-5:SUPER-UX:*:*)
-	echo sx5-nec-superux"$UNAME_RELEASE"
-	exit ;;
-    SX-6:SUPER-UX:*:*)
-	echo sx6-nec-superux"$UNAME_RELEASE"
-	exit ;;
-    SX-7:SUPER-UX:*:*)
-	echo sx7-nec-superux"$UNAME_RELEASE"
-	exit ;;
-    SX-8:SUPER-UX:*:*)
-	echo sx8-nec-superux"$UNAME_RELEASE"
-	exit ;;
-    SX-8R:SUPER-UX:*:*)
-	echo sx8r-nec-superux"$UNAME_RELEASE"
-	exit ;;
-    SX-ACE:SUPER-UX:*:*)
-	echo sxace-nec-superux"$UNAME_RELEASE"
-	exit ;;
-    Power*:Rhapsody:*:*)
-	echo powerpc-apple-rhapsody"$UNAME_RELEASE"
-	exit ;;
-    *:Rhapsody:*:*)
-	echo "$UNAME_MACHINE"-apple-rhapsody"$UNAME_RELEASE"
-	exit ;;
-    arm64:Darwin:*:*)
-	echo aarch64-apple-darwin"$UNAME_RELEASE"
-	exit ;;
-    *:Darwin:*:*)
-	UNAME_PROCESSOR=$(uname -p)
-	case $UNAME_PROCESSOR in
-	    unknown) UNAME_PROCESSOR=powerpc ;;
-	esac
-	if command -v xcode-select > /dev/null 2> /dev/null && \
-		! xcode-select --print-path > /dev/null 2> /dev/null ; then
-	    # Avoid executing cc if there is no toolchain installed as
-	    # cc will be a stub that puts up a graphical alert
-	    # prompting the user to install developer tools.
-	    CC_FOR_BUILD=no_compiler_found
-	else
-	    set_cc_for_build
-	fi
-	if test "$CC_FOR_BUILD" != no_compiler_found; then
-	    if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \
-		   (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \
-		   grep IS_64BIT_ARCH >/dev/null
-	    then
-		case $UNAME_PROCESSOR in
-		    i386) UNAME_PROCESSOR=x86_64 ;;
-		    powerpc) UNAME_PROCESSOR=powerpc64 ;;
-		esac
-	    fi
-	    # On 10.4-10.6 one might compile for PowerPC via gcc -arch ppc
-	    if (echo '#ifdef __POWERPC__'; echo IS_PPC; echo '#endif') | \
-		   (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \
-		   grep IS_PPC >/dev/null
-	    then
-		UNAME_PROCESSOR=powerpc
-	    fi
-	elif test "$UNAME_PROCESSOR" = i386 ; then
-	    # uname -m returns i386 or x86_64
-	    UNAME_PROCESSOR=$UNAME_MACHINE
-	fi
-	echo "$UNAME_PROCESSOR"-apple-darwin"$UNAME_RELEASE"
-	exit ;;
-    *:procnto*:*:* | *:QNX:[0123456789]*:*)
-	UNAME_PROCESSOR=$(uname -p)
-	if test "$UNAME_PROCESSOR" = x86; then
-		UNAME_PROCESSOR=i386
-		UNAME_MACHINE=pc
-	fi
-	echo "$UNAME_PROCESSOR"-"$UNAME_MACHINE"-nto-qnx"$UNAME_RELEASE"
-	exit ;;
-    *:QNX:*:4*)
-	echo i386-pc-qnx
-	exit ;;
-    NEO-*:NONSTOP_KERNEL:*:*)
-	echo neo-tandem-nsk"$UNAME_RELEASE"
-	exit ;;
-    NSE-*:NONSTOP_KERNEL:*:*)
-	echo nse-tandem-nsk"$UNAME_RELEASE"
-	exit ;;
-    NSR-*:NONSTOP_KERNEL:*:*)
-	echo nsr-tandem-nsk"$UNAME_RELEASE"
-	exit ;;
-    NSV-*:NONSTOP_KERNEL:*:*)
-	echo nsv-tandem-nsk"$UNAME_RELEASE"
-	exit ;;
-    NSX-*:NONSTOP_KERNEL:*:*)
-	echo nsx-tandem-nsk"$UNAME_RELEASE"
-	exit ;;
-    *:NonStop-UX:*:*)
-	echo mips-compaq-nonstopux
-	exit ;;
-    BS2000:POSIX*:*:*)
-	echo bs2000-siemens-sysv
-	exit ;;
-    DS/*:UNIX_System_V:*:*)
-	echo "$UNAME_MACHINE"-"$UNAME_SYSTEM"-"$UNAME_RELEASE"
-	exit ;;
-    *:Plan9:*:*)
-	# "uname -m" is not consistent, so use $cputype instead. 386
-	# is converted to i386 for consistency with other x86
-	# operating systems.
-	# shellcheck disable=SC2154
-	if test "$cputype" = 386; then
-	    UNAME_MACHINE=i386
-	else
-	    UNAME_MACHINE="$cputype"
-	fi
-	echo "$UNAME_MACHINE"-unknown-plan9
-	exit ;;
-    *:TOPS-10:*:*)
-	echo pdp10-unknown-tops10
-	exit ;;
-    *:TENEX:*:*)
-	echo pdp10-unknown-tenex
-	exit ;;
-    KS10:TOPS-20:*:* | KL10:TOPS-20:*:* | TYPE4:TOPS-20:*:*)
-	echo pdp10-dec-tops20
-	exit ;;
-    XKL-1:TOPS-20:*:* | TYPE5:TOPS-20:*:*)
-	echo pdp10-xkl-tops20
-	exit ;;
-    *:TOPS-20:*:*)
-	echo pdp10-unknown-tops20
-	exit ;;
-    *:ITS:*:*)
-	echo pdp10-unknown-its
-	exit ;;
-    SEI:*:*:SEIUX)
-	echo mips-sei-seiux"$UNAME_RELEASE"
-	exit ;;
-    *:DragonFly:*:*)
-	echo "$UNAME_MACHINE"-unknown-dragonfly"$(echo "$UNAME_RELEASE"|sed -e 's/[-(].*//')"
-	exit ;;
-    *:*VMS:*:*)
-	UNAME_MACHINE=$( (uname -p) 2>/dev/null)
-	case "$UNAME_MACHINE" in
-	    A*) echo alpha-dec-vms ; exit ;;
-	    I*) echo ia64-dec-vms ; exit ;;
-	    V*) echo vax-dec-vms ; exit ;;
-	esac ;;
-    *:XENIX:*:SysV)
-	echo i386-pc-xenix
-	exit ;;
-    i*86:skyos:*:*)
-	echo "$UNAME_MACHINE"-pc-skyos"$(echo "$UNAME_RELEASE" | sed -e 's/ .*$//')"
-	exit ;;
-    i*86:rdos:*:*)
-	echo "$UNAME_MACHINE"-pc-rdos
-	exit ;;
-    i*86:AROS:*:*)
-	echo "$UNAME_MACHINE"-pc-aros
-	exit ;;
-    x86_64:VMkernel:*:*)
-	echo "$UNAME_MACHINE"-unknown-esx
-	exit ;;
-    amd64:Isilon\ OneFS:*:*)
-	echo x86_64-unknown-onefs
-	exit ;;
-    *:Unleashed:*:*)
-	echo "$UNAME_MACHINE"-unknown-unleashed"$UNAME_RELEASE"
-	exit ;;
-esac
-
-# No uname command or uname output not recognized.
-set_cc_for_build
-cat > "$dummy.c" <<EOF
-#ifdef _SEQUENT_
-#include <sys/types.h>
-#include <sys/utsname.h>
-#endif
-#if defined(ultrix) || defined(_ultrix) || defined(__ultrix) || defined(__ultrix__)
-#if defined (vax) || defined (__vax) || defined (__vax__) || defined(mips) || defined(__mips) || defined(__mips__) || defined(MIPS) || defined(__MIPS__)
-#include <signal.h>
-#if defined(_SIZE_T_) || defined(SIGLOST)
-#include <sys/utsname.h>
-#endif
-#endif
-#endif
-main ()
-{
-#if defined (sony)
-#if defined (MIPSEB)
-  /* BFD wants "bsd" instead of "newsos".  Perhaps BFD should be changed,
-     I don't know....  */
-  printf ("mips-sony-bsd\n"); exit (0);
-#else
-#include <sys/param.h>
-  printf ("m68k-sony-newsos%s\n",
-#ifdef NEWSOS4
-  "4"
-#else
-  ""
-#endif
-  ); exit (0);
-#endif
-#endif
-
-#if defined (NeXT)
-#if !defined (__ARCHITECTURE__)
-#define __ARCHITECTURE__ "m68k"
-#endif
-  int version;
-  version=$( (hostinfo | sed -n 's/.*NeXT Mach \([0-9]*\).*/\1/p') 2>/dev/null);
-  if (version < 4)
-    printf ("%s-next-nextstep%d\n", __ARCHITECTURE__, version);
-  else
-    printf ("%s-next-openstep%d\n", __ARCHITECTURE__, version);
-  exit (0);
-#endif
-
-#if defined (MULTIMAX) || defined (n16)
-#if defined (UMAXV)
-  printf ("ns32k-encore-sysv\n"); exit (0);
-#else
-#if defined (CMU)
-  printf ("ns32k-encore-mach\n"); exit (0);
-#else
-  printf ("ns32k-encore-bsd\n"); exit (0);
-#endif
-#endif
-#endif
-
-#if defined (__386BSD__)
-  printf ("i386-pc-bsd\n"); exit (0);
-#endif
-
-#if defined (sequent)
-#if defined (i386)
-  printf ("i386-sequent-dynix\n"); exit (0);
-#endif
-#if defined (ns32000)
-  printf ("ns32k-sequent-dynix\n"); exit (0);
-#endif
-#endif
-
-#if defined (_SEQUENT_)
-  struct utsname un;
-
-  uname(&un);
-  if (strncmp(un.version, "V2", 2) == 0) {
-    printf ("i386-sequent-ptx2\n"); exit (0);
-  }
-  if (strncmp(un.version, "V1", 2) == 0) { /* XXX is V1 correct? */
-    printf ("i386-sequent-ptx1\n"); exit (0);
-  }
-  printf ("i386-sequent-ptx\n"); exit (0);
-#endif
-
-#if defined (vax)
-#if !defined (ultrix)
-#include <sys/param.h>
-#if defined (BSD)
-#if BSD == 43
-  printf ("vax-dec-bsd4.3\n"); exit (0);
-#else
-#if BSD == 199006
-  printf ("vax-dec-bsd4.3reno\n"); exit (0);
-#else
-  printf ("vax-dec-bsd\n"); exit (0);
-#endif
-#endif
-#else
-  printf ("vax-dec-bsd\n"); exit (0);
-#endif
-#else
-#if defined(_SIZE_T_) || defined(SIGLOST)
-  struct utsname un;
-  uname (&un);
-  printf ("vax-dec-ultrix%s\n", un.release); exit (0);
-#else
-  printf ("vax-dec-ultrix\n"); exit (0);
-#endif
-#endif
-#endif
-#if defined(ultrix) || defined(_ultrix) || defined(__ultrix) || defined(__ultrix__)
-#if defined(mips) || defined(__mips) || defined(__mips__) || defined(MIPS) || defined(__MIPS__)
-#if defined(_SIZE_T_) || defined(SIGLOST)
-  struct utsname *un;
-  uname (&un);
-  printf ("mips-dec-ultrix%s\n", un.release); exit (0);
-#else
-  printf ("mips-dec-ultrix\n"); exit (0);
-#endif
-#endif
-#endif
-
-#if defined (alliant) && defined (i860)
-  printf ("i860-alliant-bsd\n"); exit (0);
-#endif
-
-  exit (1);
-}
-EOF
-
-$CC_FOR_BUILD -o "$dummy" "$dummy.c" 2>/dev/null && SYSTEM_NAME=$($dummy) &&
-	{ echo "$SYSTEM_NAME"; exit; }
-
-# Apollos put the system type in the environment.
-test -d /usr/apollo && { echo "$ISP-apollo-$SYSTYPE"; exit; }
-
-echo "$0: unable to guess system type" >&2
-
-case "$UNAME_MACHINE:$UNAME_SYSTEM" in
-    mips:Linux | mips64:Linux)
-	# If we got here on MIPS GNU/Linux, output extra information.
-	cat >&2 <<EOF
-
-NOTE: MIPS GNU/Linux systems require a C compiler to fully recognize
-the system type. Please install a C compiler and try again.
-EOF
-	;;
-esac
-
-cat >&2 <<EOF
-
-This script (version $timestamp), has failed to recognize the
-operating system you are using. If your script is old, overwrite *all*
-copies of config.guess and config.sub with the latest versions from:
-
-  https://git.savannah.gnu.org/cgit/config.git/plain/config.guess
-and
-  https://git.savannah.gnu.org/cgit/config.git/plain/config.sub
-EOF
-
-year=$(echo $timestamp | sed 's,-.*,,')
-# shellcheck disable=SC2003
-if test "$(expr "$(date +%Y)" - "$year")" -lt 3 ; then
-   cat >&2 <<EOF
-
-If $0 has already been updated, send the following data and any
-information you think might be pertinent to config-patches@gnu.org to
-provide the necessary information to handle your system.
-
-config.guess timestamp = $timestamp
-
-uname -m = $( (uname -m) 2>/dev/null || echo unknown)
-uname -r = $( (uname -r) 2>/dev/null || echo unknown)
-uname -s = $( (uname -s) 2>/dev/null || echo unknown)
-uname -v = $( (uname -v) 2>/dev/null || echo unknown)
-
-/usr/bin/uname -p = $( (/usr/bin/uname -p) 2>/dev/null)
-/bin/uname -X     = $( (/bin/uname -X) 2>/dev/null)
-
-hostinfo               = $( (hostinfo) 2>/dev/null)
-/bin/universe          = $( (/bin/universe) 2>/dev/null)
-/usr/bin/arch -k       = $( (/usr/bin/arch -k) 2>/dev/null)
-/bin/arch              = $( (/bin/arch) 2>/dev/null)
-/usr/bin/oslevel       = $( (/usr/bin/oslevel) 2>/dev/null)
-/usr/convex/getsysinfo = $( (/usr/convex/getsysinfo) 2>/dev/null)
-
-UNAME_MACHINE = "$UNAME_MACHINE"
-UNAME_RELEASE = "$UNAME_RELEASE"
-UNAME_SYSTEM  = "$UNAME_SYSTEM"
-UNAME_VERSION = "$UNAME_VERSION"
-EOF
-fi
-
-exit 1
-
-# Local variables:
-# eval: (add-hook 'before-save-hook 'time-stamp)
-# time-stamp-start: "timestamp='"
-# time-stamp-format: "%:y-%02m-%02d"
-# time-stamp-end: "'"
-# End:
diff --git a/config.sub b/config.sub
deleted file mode 100644
index 0cbdae682..000000000
--- a/config.sub
+++ /dev/null
@@ -1,1855 +0,0 @@
-#! /bin/sh
-# Configuration validation subroutine script.
-#   Copyright 1992-2021 Free Software Foundation, Inc.
-
-timestamp='2021-01-01'
-
-# This file is free software; you can redistribute it and/or modify it
-# under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 3 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-# General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, see <https://www.gnu.org/licenses/>.
-#
-# As a special exception to the GNU General Public License, if you
-# distribute this file as part of a program that contains a
-# configuration script generated by Autoconf, you may include it under
-# the same distribution terms that you use for the rest of that
-# program.  This Exception is an additional permission under section 7
-# of the GNU General Public License, version 3 ("GPLv3").
-
-
-# Please send patches to <config-patches@gnu.org>.
-#
-# Configuration subroutine to validate and canonicalize a configuration type.
-# Supply the specified configuration type as an argument.
-# If it is invalid, we print an error message on stderr and exit with code 1.
-# Otherwise, we print the canonical config type on stdout and succeed.
-
-# You can get the latest version of this script from:
-# https://git.savannah.gnu.org/cgit/config.git/plain/config.sub
-
-# This file is supposed to be the same for all GNU packages
-# and recognize all the CPU types, system types and aliases
-# that are meaningful with *any* GNU software.
-# Each package is responsible for reporting which valid configurations
-# it does not support.  The user should be able to distinguish
-# a failure to support a valid configuration from a meaningless
-# configuration.
-
-# The goal of this file is to map all the various variations of a given
-# machine specification into a single specification in the form:
-#	CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM
-# or in some cases, the newer four-part form:
-#	CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM
-# It is wrong to echo any other type of specification.
-
-me=$(echo "$0" | sed -e 's,.*/,,')
-
-usage="\
-Usage: $0 [OPTION] CPU-MFR-OPSYS or ALIAS
-
-Canonicalize a configuration name.
-
-Options:
-  -h, --help         print this help, then exit
-  -t, --time-stamp   print date of last modification, then exit
-  -v, --version      print version number, then exit
-
-Report bugs and patches to <config-patches@gnu.org>."
-
-version="\
-GNU config.sub ($timestamp)
-
-Copyright 1992-2021 Free Software Foundation, Inc.
-
-This is free software; see the source for copying conditions.  There is NO
-warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
-
-help="
-Try \`$me --help' for more information."
-
-# Parse command line
-while test $# -gt 0 ; do
-  case $1 in
-    --time-stamp | --time* | -t )
-       echo "$timestamp" ; exit ;;
-    --version | -v )
-       echo "$version" ; exit ;;
-    --help | --h* | -h )
-       echo "$usage"; exit ;;
-    -- )     # Stop option processing
-       shift; break ;;
-    - )	# Use stdin as input.
-       break ;;
-    -* )
-       echo "$me: invalid option $1$help" >&2
-       exit 1 ;;
-
-    *local*)
-       # First pass through any local machine types.
-       echo "$1"
-       exit ;;
-
-    * )
-       break ;;
-  esac
-done
-
-case $# in
- 0) echo "$me: missing argument$help" >&2
-    exit 1;;
- 1) ;;
- *) echo "$me: too many arguments$help" >&2
-    exit 1;;
-esac
-
-# Split fields of configuration type
-# shellcheck disable=SC2162
-IFS="-" read field1 field2 field3 field4 <<EOF
-$1
-EOF
-
-# Separate into logical components for further validation
-case $1 in
-	*-*-*-*-*)
-		echo Invalid configuration \`"$1"\': more than four components >&2
-		exit 1
-		;;
-	*-*-*-*)
-		basic_machine=$field1-$field2
-		basic_os=$field3-$field4
-		;;
-	*-*-*)
-		# Ambiguous whether COMPANY is present, or skipped and KERNEL-OS is two
-		# parts
-		maybe_os=$field2-$field3
-		case $maybe_os in
-			nto-qnx* | linux-* | uclinux-uclibc* \
-			| uclinux-gnu* | kfreebsd*-gnu* | knetbsd*-gnu* | netbsd*-gnu* \
-			| netbsd*-eabi* | kopensolaris*-gnu* | cloudabi*-eabi* \
-			| storm-chaos* | os2-emx* | rtmk-nova*)
-				basic_machine=$field1
-				basic_os=$maybe_os
-				;;
-			android-linux)
-				basic_machine=$field1-unknown
-				basic_os=linux-android
-				;;
-			*)
-				basic_machine=$field1-$field2
-				basic_os=$field3
-				;;
-		esac
-		;;
-	*-*)
-		# A lone config we happen to match not fitting any pattern
-		case $field1-$field2 in
-			decstation-3100)
-				basic_machine=mips-dec
-				basic_os=
-				;;
-			*-*)
-				# Second component is usually, but not always the OS
-				case $field2 in
-					# Prevent following clause from handling this valid os
-					sun*os*)
-						basic_machine=$field1
-						basic_os=$field2
-						;;
-					# Manufacturers
-					dec* | mips* | sequent* | encore* | pc533* | sgi* | sony* \
-					| att* | 7300* | 3300* | delta* | motorola* | sun[234]* \
-					| unicom* | ibm* | next | hp | isi* | apollo | altos* \
-					| convergent* | ncr* | news | 32* | 3600* | 3100* \
-					| hitachi* | c[123]* | convex* | sun | crds | omron* | dg \
-					| ultra | tti* | harris | dolphin | highlevel | gould \
-					| cbm | ns | masscomp | apple | axis | knuth | cray \
-					| microblaze* | sim | cisco \
-					| oki | wec | wrs | winbond)
-						basic_machine=$field1-$field2
-						basic_os=
-						;;
-					*)
-						basic_machine=$field1
-						basic_os=$field2
-						;;
-				esac
-			;;
-		esac
-		;;
-	*)
-		# Convert single-component short-hands not valid as part of
-		# multi-component configurations.
-		case $field1 in
-			386bsd)
-				basic_machine=i386-pc
-				basic_os=bsd
-				;;
-			a29khif)
-				basic_machine=a29k-amd
-				basic_os=udi
-				;;
-			adobe68k)
-				basic_machine=m68010-adobe
-				basic_os=scout
-				;;
-			alliant)
-				basic_machine=fx80-alliant
-				basic_os=
-				;;
-			altos | altos3068)
-				basic_machine=m68k-altos
-				basic_os=
-				;;
-			am29k)
-				basic_machine=a29k-none
-				basic_os=bsd
-				;;
-			amdahl)
-				basic_machine=580-amdahl
-				basic_os=sysv
-				;;
-			amiga)
-				basic_machine=m68k-unknown
-				basic_os=
-				;;
-			amigaos | amigados)
-				basic_machine=m68k-unknown
-				basic_os=amigaos
-				;;
-			amigaunix | amix)
-				basic_machine=m68k-unknown
-				basic_os=sysv4
-				;;
-			apollo68)
-				basic_machine=m68k-apollo
-				basic_os=sysv
-				;;
-			apollo68bsd)
-				basic_machine=m68k-apollo
-				basic_os=bsd
-				;;
-			aros)
-				basic_machine=i386-pc
-				basic_os=aros
-				;;
-			aux)
-				basic_machine=m68k-apple
-				basic_os=aux
-				;;
-			balance)
-				basic_machine=ns32k-sequent
-				basic_os=dynix
-				;;
-			blackfin)
-				basic_machine=bfin-unknown
-				basic_os=linux
-				;;
-			cegcc)
-				basic_machine=arm-unknown
-				basic_os=cegcc
-				;;
-			convex-c1)
-				basic_machine=c1-convex
-				basic_os=bsd
-				;;
-			convex-c2)
-				basic_machine=c2-convex
-				basic_os=bsd
-				;;
-			convex-c32)
-				basic_machine=c32-convex
-				basic_os=bsd
-				;;
-			convex-c34)
-				basic_machine=c34-convex
-				basic_os=bsd
-				;;
-			convex-c38)
-				basic_machine=c38-convex
-				basic_os=bsd
-				;;
-			cray)
-				basic_machine=j90-cray
-				basic_os=unicos
-				;;
-			crds | unos)
-				basic_machine=m68k-crds
-				basic_os=
-				;;
-			da30)
-				basic_machine=m68k-da30
-				basic_os=
-				;;
-			decstation | pmax | pmin | dec3100 | decstatn)
-				basic_machine=mips-dec
-				basic_os=
-				;;
-			delta88)
-				basic_machine=m88k-motorola
-				basic_os=sysv3
-				;;
-			dicos)
-				basic_machine=i686-pc
-				basic_os=dicos
-				;;
-			djgpp)
-				basic_machine=i586-pc
-				basic_os=msdosdjgpp
-				;;
-			ebmon29k)
-				basic_machine=a29k-amd
-				basic_os=ebmon
-				;;
-			es1800 | OSE68k | ose68k | ose | OSE)
-				basic_machine=m68k-ericsson
-				basic_os=ose
-				;;
-			gmicro)
-				basic_machine=tron-gmicro
-				basic_os=sysv
-				;;
-			go32)
-				basic_machine=i386-pc
-				basic_os=go32
-				;;
-			h8300hms)
-				basic_machine=h8300-hitachi
-				basic_os=hms
-				;;
-			h8300xray)
-				basic_machine=h8300-hitachi
-				basic_os=xray
-				;;
-			h8500hms)
-				basic_machine=h8500-hitachi
-				basic_os=hms
-				;;
-			harris)
-				basic_machine=m88k-harris
-				basic_os=sysv3
-				;;
-			hp300 | hp300hpux)
-				basic_machine=m68k-hp
-				basic_os=hpux
-				;;
-			hp300bsd)
-				basic_machine=m68k-hp
-				basic_os=bsd
-				;;
-			hppaosf)
-				basic_machine=hppa1.1-hp
-				basic_os=osf
-				;;
-			hppro)
-				basic_machine=hppa1.1-hp
-				basic_os=proelf
-				;;
-			i386mach)
-				basic_machine=i386-mach
-				basic_os=mach
-				;;
-			isi68 | isi)
-				basic_machine=m68k-isi
-				basic_os=sysv
-				;;
-			m68knommu)
-				basic_machine=m68k-unknown
-				basic_os=linux
-				;;
-			magnum | m3230)
-				basic_machine=mips-mips
-				basic_os=sysv
-				;;
-			merlin)
-				basic_machine=ns32k-utek
-				basic_os=sysv
-				;;
-			mingw64)
-				basic_machine=x86_64-pc
-				basic_os=mingw64
-				;;
-			mingw32)
-				basic_machine=i686-pc
-				basic_os=mingw32
-				;;
-			mingw32ce)
-				basic_machine=arm-unknown
-				basic_os=mingw32ce
-				;;
-			monitor)
-				basic_machine=m68k-rom68k
-				basic_os=coff
-				;;
-			morphos)
-				basic_machine=powerpc-unknown
-				basic_os=morphos
-				;;
-			moxiebox)
-				basic_machine=moxie-unknown
-				basic_os=moxiebox
-				;;
-			msdos)
-				basic_machine=i386-pc
-				basic_os=msdos
-				;;
-			msys)
-				basic_machine=i686-pc
-				basic_os=msys
-				;;
-			mvs)
-				basic_machine=i370-ibm
-				basic_os=mvs
-				;;
-			nacl)
-				basic_machine=le32-unknown
-				basic_os=nacl
-				;;
-			ncr3000)
-				basic_machine=i486-ncr
-				basic_os=sysv4
-				;;
-			netbsd386)
-				basic_machine=i386-pc
-				basic_os=netbsd
-				;;
-			netwinder)
-				basic_machine=armv4l-rebel
-				basic_os=linux
-				;;
-			news | news700 | news800 | news900)
-				basic_machine=m68k-sony
-				basic_os=newsos
-				;;
-			news1000)
-				basic_machine=m68030-sony
-				basic_os=newsos
-				;;
-			necv70)
-				basic_machine=v70-nec
-				basic_os=sysv
-				;;
-			nh3000)
-				basic_machine=m68k-harris
-				basic_os=cxux
-				;;
-			nh[45]000)
-				basic_machine=m88k-harris
-				basic_os=cxux
-				;;
-			nindy960)
-				basic_machine=i960-intel
-				basic_os=nindy
-				;;
-			mon960)
-				basic_machine=i960-intel
-				basic_os=mon960
-				;;
-			nonstopux)
-				basic_machine=mips-compaq
-				basic_os=nonstopux
-				;;
-			os400)
-				basic_machine=powerpc-ibm
-				basic_os=os400
-				;;
-			OSE68000 | ose68000)
-				basic_machine=m68000-ericsson
-				basic_os=ose
-				;;
-			os68k)
-				basic_machine=m68k-none
-				basic_os=os68k
-				;;
-			paragon)
-				basic_machine=i860-intel
-				basic_os=osf
-				;;
-			parisc)
-				basic_machine=hppa-unknown
-				basic_os=linux
-				;;
-			psp)
-				basic_machine=mipsallegrexel-sony
-				basic_os=psp
-				;;
-			pw32)
-				basic_machine=i586-unknown
-				basic_os=pw32
-				;;
-			rdos | rdos64)
-				basic_machine=x86_64-pc
-				basic_os=rdos
-				;;
-			rdos32)
-				basic_machine=i386-pc
-				basic_os=rdos
-				;;
-			rom68k)
-				basic_machine=m68k-rom68k
-				basic_os=coff
-				;;
-			sa29200)
-				basic_machine=a29k-amd
-				basic_os=udi
-				;;
-			sei)
-				basic_machine=mips-sei
-				basic_os=seiux
-				;;
-			sequent)
-				basic_machine=i386-sequent
-				basic_os=
-				;;
-			sps7)
-				basic_machine=m68k-bull
-				basic_os=sysv2
-				;;
-			st2000)
-				basic_machine=m68k-tandem
-				basic_os=
-				;;
-			stratus)
-				basic_machine=i860-stratus
-				basic_os=sysv4
-				;;
-			sun2)
-				basic_machine=m68000-sun
-				basic_os=
-				;;
-			sun2os3)
-				basic_machine=m68000-sun
-				basic_os=sunos3
-				;;
-			sun2os4)
-				basic_machine=m68000-sun
-				basic_os=sunos4
-				;;
-			sun3)
-				basic_machine=m68k-sun
-				basic_os=
-				;;
-			sun3os3)
-				basic_machine=m68k-sun
-				basic_os=sunos3
-				;;
-			sun3os4)
-				basic_machine=m68k-sun
-				basic_os=sunos4
-				;;
-			sun4)
-				basic_machine=sparc-sun
-				basic_os=
-				;;
-			sun4os3)
-				basic_machine=sparc-sun
-				basic_os=sunos3
-				;;
-			sun4os4)
-				basic_machine=sparc-sun
-				basic_os=sunos4
-				;;
-			sun4sol2)
-				basic_machine=sparc-sun
-				basic_os=solaris2
-				;;
-			sun386 | sun386i | roadrunner)
-				basic_machine=i386-sun
-				basic_os=
-				;;
-			sv1)
-				basic_machine=sv1-cray
-				basic_os=unicos
-				;;
-			symmetry)
-				basic_machine=i386-sequent
-				basic_os=dynix
-				;;
-			t3e)
-				basic_machine=alphaev5-cray
-				basic_os=unicos
-				;;
-			t90)
-				basic_machine=t90-cray
-				basic_os=unicos
-				;;
-			toad1)
-				basic_machine=pdp10-xkl
-				basic_os=tops20
-				;;
-			tpf)
-				basic_machine=s390x-ibm
-				basic_os=tpf
-				;;
-			udi29k)
-				basic_machine=a29k-amd
-				basic_os=udi
-				;;
-			ultra3)
-				basic_machine=a29k-nyu
-				basic_os=sym1
-				;;
-			v810 | necv810)
-				basic_machine=v810-nec
-				basic_os=none
-				;;
-			vaxv)
-				basic_machine=vax-dec
-				basic_os=sysv
-				;;
-			vms)
-				basic_machine=vax-dec
-				basic_os=vms
-				;;
-			vsta)
-				basic_machine=i386-pc
-				basic_os=vsta
-				;;
-			vxworks960)
-				basic_machine=i960-wrs
-				basic_os=vxworks
-				;;
-			vxworks68)
-				basic_machine=m68k-wrs
-				basic_os=vxworks
-				;;
-			vxworks29k)
-				basic_machine=a29k-wrs
-				basic_os=vxworks
-				;;
-			xbox)
-				basic_machine=i686-pc
-				basic_os=mingw32
-				;;
-			ymp)
-				basic_machine=ymp-cray
-				basic_os=unicos
-				;;
-			*)
-				basic_machine=$1
-				basic_os=
-				;;
-		esac
-		;;
-esac
-
-# Decode 1-component or ad-hoc basic machines
-case $basic_machine in
-	# Here we handle the default manufacturer of certain CPU types.  It is in
-	# some cases the only manufacturer, in others, it is the most popular.
-	w89k)
-		cpu=hppa1.1
-		vendor=winbond
-		;;
-	op50n)
-		cpu=hppa1.1
-		vendor=oki
-		;;
-	op60c)
-		cpu=hppa1.1
-		vendor=oki
-		;;
-	ibm*)
-		cpu=i370
-		vendor=ibm
-		;;
-	orion105)
-		cpu=clipper
-		vendor=highlevel
-		;;
-	mac | mpw | mac-mpw)
-		cpu=m68k
-		vendor=apple
-		;;
-	pmac | pmac-mpw)
-		cpu=powerpc
-		vendor=apple
-		;;
-
-	# Recognize the various machine names and aliases which stand
-	# for a CPU type and a company and sometimes even an OS.
-	3b1 | 7300 | 7300-att | att-7300 | pc7300 | safari | unixpc)
-		cpu=m68000
-		vendor=att
-		;;
-	3b*)
-		cpu=we32k
-		vendor=att
-		;;
-	bluegene*)
-		cpu=powerpc
-		vendor=ibm
-		basic_os=cnk
-		;;
-	decsystem10* | dec10*)
-		cpu=pdp10
-		vendor=dec
-		basic_os=tops10
-		;;
-	decsystem20* | dec20*)
-		cpu=pdp10
-		vendor=dec
-		basic_os=tops20
-		;;
-	delta | 3300 | motorola-3300 | motorola-delta \
-	      | 3300-motorola | delta-motorola)
-		cpu=m68k
-		vendor=motorola
-		;;
-	dpx2*)
-		cpu=m68k
-		vendor=bull
-		basic_os=sysv3
-		;;
-	encore | umax | mmax)
-		cpu=ns32k
-		vendor=encore
-		;;
-	elxsi)
-		cpu=elxsi
-		vendor=elxsi
-		basic_os=${basic_os:-bsd}
-		;;
-	fx2800)
-		cpu=i860
-		vendor=alliant
-		;;
-	genix)
-		cpu=ns32k
-		vendor=ns
-		;;
-	h3050r* | hiux*)
-		cpu=hppa1.1
-		vendor=hitachi
-		basic_os=hiuxwe2
-		;;
-	hp3k9[0-9][0-9] | hp9[0-9][0-9])
-		cpu=hppa1.0
-		vendor=hp
-		;;
-	hp9k2[0-9][0-9] | hp9k31[0-9])
-		cpu=m68000
-		vendor=hp
-		;;
-	hp9k3[2-9][0-9])
-		cpu=m68k
-		vendor=hp
-		;;
-	hp9k6[0-9][0-9] | hp6[0-9][0-9])
-		cpu=hppa1.0
-		vendor=hp
-		;;
-	hp9k7[0-79][0-9] | hp7[0-79][0-9])
-		cpu=hppa1.1
-		vendor=hp
-		;;
-	hp9k78[0-9] | hp78[0-9])
-		# FIXME: really hppa2.0-hp
-		cpu=hppa1.1
-		vendor=hp
-		;;
-	hp9k8[67]1 | hp8[67]1 | hp9k80[24] | hp80[24] | hp9k8[78]9 | hp8[78]9 | hp9k893 | hp893)
-		# FIXME: really hppa2.0-hp
-		cpu=hppa1.1
-		vendor=hp
-		;;
-	hp9k8[0-9][13679] | hp8[0-9][13679])
-		cpu=hppa1.1
-		vendor=hp
-		;;
-	hp9k8[0-9][0-9] | hp8[0-9][0-9])
-		cpu=hppa1.0
-		vendor=hp
-		;;
-	i*86v32)
-		cpu=$(echo "$1" | sed -e 's/86.*/86/')
-		vendor=pc
-		basic_os=sysv32
-		;;
-	i*86v4*)
-		cpu=$(echo "$1" | sed -e 's/86.*/86/')
-		vendor=pc
-		basic_os=sysv4
-		;;
-	i*86v)
-		cpu=$(echo "$1" | sed -e 's/86.*/86/')
-		vendor=pc
-		basic_os=sysv
-		;;
-	i*86sol2)
-		cpu=$(echo "$1" | sed -e 's/86.*/86/')
-		vendor=pc
-		basic_os=solaris2
-		;;
-	j90 | j90-cray)
-		cpu=j90
-		vendor=cray
-		basic_os=${basic_os:-unicos}
-		;;
-	iris | iris4d)
-		cpu=mips
-		vendor=sgi
-		case $basic_os in
-		    irix*)
-			;;
-		    *)
-			basic_os=irix4
-			;;
-		esac
-		;;
-	miniframe)
-		cpu=m68000
-		vendor=convergent
-		;;
-	*mint | mint[0-9]* | *MiNT | *MiNT[0-9]*)
-		cpu=m68k
-		vendor=atari
-		basic_os=mint
-		;;
-	news-3600 | risc-news)
-		cpu=mips
-		vendor=sony
-		basic_os=newsos
-		;;
-	next | m*-next)
-		cpu=m68k
-		vendor=next
-		case $basic_os in
-		    openstep*)
-		        ;;
-		    nextstep*)
-			;;
-		    ns2*)
-		      basic_os=nextstep2
-			;;
-		    *)
-		      basic_os=nextstep3
-			;;
-		esac
-		;;
-	np1)
-		cpu=np1
-		vendor=gould
-		;;
-	op50n-* | op60c-*)
-		cpu=hppa1.1
-		vendor=oki
-		basic_os=proelf
-		;;
-	pa-hitachi)
-		cpu=hppa1.1
-		vendor=hitachi
-		basic_os=hiuxwe2
-		;;
-	pbd)
-		cpu=sparc
-		vendor=tti
-		;;
-	pbb)
-		cpu=m68k
-		vendor=tti
-		;;
-	pc532)
-		cpu=ns32k
-		vendor=pc532
-		;;
-	pn)
-		cpu=pn
-		vendor=gould
-		;;
-	power)
-		cpu=power
-		vendor=ibm
-		;;
-	ps2)
-		cpu=i386
-		vendor=ibm
-		;;
-	rm[46]00)
-		cpu=mips
-		vendor=siemens
-		;;
-	rtpc | rtpc-*)
-		cpu=romp
-		vendor=ibm
-		;;
-	sde)
-		cpu=mipsisa32
-		vendor=sde
-		basic_os=${basic_os:-elf}
-		;;
-	simso-wrs)
-		cpu=sparclite
-		vendor=wrs
-		basic_os=vxworks
-		;;
-	tower | tower-32)
-		cpu=m68k
-		vendor=ncr
-		;;
-	vpp*|vx|vx-*)
-		cpu=f301
-		vendor=fujitsu
-		;;
-	w65)
-		cpu=w65
-		vendor=wdc
-		;;
-	w89k-*)
-		cpu=hppa1.1
-		vendor=winbond
-		basic_os=proelf
-		;;
-	none)
-		cpu=none
-		vendor=none
-		;;
-	leon|leon[3-9])
-		cpu=sparc
-		vendor=$basic_machine
-		;;
-	leon-*|leon[3-9]-*)
-		cpu=sparc
-		vendor=$(echo "$basic_machine" | sed 's/-.*//')
-		;;
-
-	*-*)
-		# shellcheck disable=SC2162
-		IFS="-" read cpu vendor <<EOF
-$basic_machine
-EOF
-		;;
-	# We use `pc' rather than `unknown'
-	# because (1) that's what they normally are, and
-	# (2) the word "unknown" tends to confuse beginning users.
-	i*86 | x86_64)
-		cpu=$basic_machine
-		vendor=pc
-		;;
-	# These rules are duplicated from below for sake of the special case above;
-	# i.e. things that normalized to x86 arches should also default to "pc"
-	pc98)
-		cpu=i386
-		vendor=pc
-		;;
-	x64 | amd64)
-		cpu=x86_64
-		vendor=pc
-		;;
-	# Recognize the basic CPU types without company name.
-	*)
-		cpu=$basic_machine
-		vendor=unknown
-		;;
-esac
-
-unset -v basic_machine
-
-# Decode basic machines in the full and proper CPU-Company form.
-case $cpu-$vendor in
-	# Here we handle the default manufacturer of certain CPU types in canonical form. It is in
-	# some cases the only manufacturer, in others, it is the most popular.
-	craynv-unknown)
-		vendor=cray
-		basic_os=${basic_os:-unicosmp}
-		;;
-	c90-unknown | c90-cray)
-		vendor=cray
-		basic_os=${Basic_os:-unicos}
-		;;
-	fx80-unknown)
-		vendor=alliant
-		;;
-	romp-unknown)
-		vendor=ibm
-		;;
-	mmix-unknown)
-		vendor=knuth
-		;;
-	microblaze-unknown | microblazeel-unknown)
-		vendor=xilinx
-		;;
-	rs6000-unknown)
-		vendor=ibm
-		;;
-	vax-unknown)
-		vendor=dec
-		;;
-	pdp11-unknown)
-		vendor=dec
-		;;
-	we32k-unknown)
-		vendor=att
-		;;
-	cydra-unknown)
-		vendor=cydrome
-		;;
-	i370-ibm*)
-		vendor=ibm
-		;;
-	orion-unknown)
-		vendor=highlevel
-		;;
-	xps-unknown | xps100-unknown)
-		cpu=xps100
-		vendor=honeywell
-		;;
-
-	# Here we normalize CPU types with a missing or matching vendor
-	dpx20-unknown | dpx20-bull)
-		cpu=rs6000
-		vendor=bull
-		basic_os=${basic_os:-bosx}
-		;;
-
-	# Here we normalize CPU types irrespective of the vendor
-	amd64-*)
-		cpu=x86_64
-		;;
-	blackfin-*)
-		cpu=bfin
-		basic_os=linux
-		;;
-	c54x-*)
-		cpu=tic54x
-		;;
-	c55x-*)
-		cpu=tic55x
-		;;
-	c6x-*)
-		cpu=tic6x
-		;;
-	e500v[12]-*)
-		cpu=powerpc
-		basic_os=${basic_os}"spe"
-		;;
-	mips3*-*)
-		cpu=mips64
-		;;
-	ms1-*)
-		cpu=mt
-		;;
-	m68knommu-*)
-		cpu=m68k
-		basic_os=linux
-		;;
-	m9s12z-* | m68hcs12z-* | hcs12z-* | s12z-*)
-		cpu=s12z
-		;;
-	openrisc-*)
-		cpu=or32
-		;;
-	parisc-*)
-		cpu=hppa
-		basic_os=linux
-		;;
-	pentium-* | p5-* | k5-* | k6-* | nexgen-* | viac3-*)
-		cpu=i586
-		;;
-	pentiumpro-* | p6-* | 6x86-* | athlon-* | athalon_*-*)
-		cpu=i686
-		;;
-	pentiumii-* | pentium2-* | pentiumiii-* | pentium3-*)
-		cpu=i686
-		;;
-	pentium4-*)
-		cpu=i786
-		;;
-	pc98-*)
-		cpu=i386
-		;;
-	ppc-* | ppcbe-*)
-		cpu=powerpc
-		;;
-	ppcle-* | powerpclittle-*)
-		cpu=powerpcle
-		;;
-	ppc64-*)
-		cpu=powerpc64
-		;;
-	ppc64le-* | powerpc64little-*)
-		cpu=powerpc64le
-		;;
-	sb1-*)
-		cpu=mipsisa64sb1
-		;;
-	sb1el-*)
-		cpu=mipsisa64sb1el
-		;;
-	sh5e[lb]-*)
-		cpu=$(echo "$cpu" | sed 's/^\(sh.\)e\(.\)$/\1\2e/')
-		;;
-	spur-*)
-		cpu=spur
-		;;
-	strongarm-* | thumb-*)
-		cpu=arm
-		;;
-	tx39-*)
-		cpu=mipstx39
-		;;
-	tx39el-*)
-		cpu=mipstx39el
-		;;
-	x64-*)
-		cpu=x86_64
-		;;
-	xscale-* | xscalee[bl]-*)
-		cpu=$(echo "$cpu" | sed 's/^xscale/arm/')
-		;;
-	arm64-*)
-		cpu=aarch64
-		;;
-
-	# Recognize the canonical CPU Types that limit and/or modify the
-	# company names they are paired with.
-	cr16-*)
-		basic_os=${basic_os:-elf}
-		;;
-	crisv32-* | etraxfs*-*)
-		cpu=crisv32
-		vendor=axis
-		;;
-	cris-* | etrax*-*)
-		cpu=cris
-		vendor=axis
-		;;
-	crx-*)
-		basic_os=${basic_os:-elf}
-		;;
-	neo-tandem)
-		cpu=neo
-		vendor=tandem
-		;;
-	nse-tandem)
-		cpu=nse
-		vendor=tandem
-		;;
-	nsr-tandem)
-		cpu=nsr
-		vendor=tandem
-		;;
-	nsv-tandem)
-		cpu=nsv
-		vendor=tandem
-		;;
-	nsx-tandem)
-		cpu=nsx
-		vendor=tandem
-		;;
-	mipsallegrexel-sony)
-		cpu=mipsallegrexel
-		vendor=sony
-		;;
-	tile*-*)
-		basic_os=${basic_os:-linux-gnu}
-		;;
-
-	*)
-		# Recognize the canonical CPU types that are allowed with any
-		# company name.
-		case $cpu in
-			1750a | 580 \
-			| a29k \
-			| aarch64 | aarch64_be \
-			| abacus \
-			| alpha | alphaev[4-8] | alphaev56 | alphaev6[78] \
-			| alpha64 | alpha64ev[4-8] | alpha64ev56 | alpha64ev6[78] \
-			| alphapca5[67] | alpha64pca5[67] \
-			| am33_2.0 \
-			| amdgcn \
-			| arc | arceb \
-			| arm | arm[lb]e | arme[lb] | armv* \
-			| avr | avr32 \
-			| asmjs \
-			| ba \
-			| be32 | be64 \
-			| bfin | bpf | bs2000 \
-			| c[123]* | c30 | [cjt]90 | c4x \
-			| c8051 | clipper | craynv | csky | cydra \
-			| d10v | d30v | dlx | dsp16xx \
-			| e2k | elxsi | epiphany \
-			| f30[01] | f700 | fido | fr30 | frv | ft32 | fx80 \
-			| h8300 | h8500 \
-			| hppa | hppa1.[01] | hppa2.0 | hppa2.0[nw] | hppa64 \
-			| hexagon \
-			| i370 | i*86 | i860 | i960 | ia16 | ia64 \
-			| ip2k | iq2000 \
-			| k1om \
-			| le32 | le64 \
-			| lm32 \
-			| loongarch32 | loongarch64 | loongarchx32 \
-			| m32c | m32r | m32rle \
-			| m5200 | m68000 | m680[012346]0 | m68360 | m683?2 | m68k \
-			| m6811 | m68hc11 | m6812 | m68hc12 | m68hcs12x \
-			| m88110 | m88k | maxq | mb | mcore | mep | metag \
-			| microblaze | microblazeel \
-			| mips | mipsbe | mipseb | mipsel | mipsle \
-			| mips16 \
-			| mips64 | mips64eb | mips64el \
-			| mips64octeon | mips64octeonel \
-			| mips64orion | mips64orionel \
-			| mips64r5900 | mips64r5900el \
-			| mips64vr | mips64vrel \
-			| mips64vr4100 | mips64vr4100el \
-			| mips64vr4300 | mips64vr4300el \
-			| mips64vr5000 | mips64vr5000el \
-			| mips64vr5900 | mips64vr5900el \
-			| mipsisa32 | mipsisa32el \
-			| mipsisa32r2 | mipsisa32r2el \
-			| mipsisa32r6 | mipsisa32r6el \
-			| mipsisa64 | mipsisa64el \
-			| mipsisa64r2 | mipsisa64r2el \
-			| mipsisa64r6 | mipsisa64r6el \
-			| mipsisa64sb1 | mipsisa64sb1el \
-			| mipsisa64sr71k | mipsisa64sr71kel \
-			| mipsr5900 | mipsr5900el \
-			| mipstx39 | mipstx39el \
-			| mmix \
-			| mn10200 | mn10300 \
-			| moxie \
-			| mt \
-			| msp430 \
-			| nds32 | nds32le | nds32be \
-			| nfp \
-			| nios | nios2 | nios2eb | nios2el \
-			| none | np1 | ns16k | ns32k | nvptx \
-			| open8 \
-			| or1k* \
-			| or32 \
-			| orion \
-			| picochip \
-			| pdp10 | pdp11 | pj | pjl | pn | power \
-			| powerpc | powerpc64 | powerpc64le | powerpcle | powerpcspe \
-			| pru \
-			| pyramid \
-			| riscv | riscv32 | riscv32be | riscv64 | riscv64be \
-			| rl78 | romp | rs6000 | rx \
-			| s390 | s390x \
-			| score \
-			| sh | shl \
-			| sh[1234] | sh[24]a | sh[24]ae[lb] | sh[23]e | she[lb] | sh[lb]e \
-			| sh[1234]e[lb] |  sh[12345][lb]e | sh[23]ele | sh64 | sh64le \
-			| sparc | sparc64 | sparc64b | sparc64v | sparc86x | sparclet \
-			| sparclite \
-			| sparcv8 | sparcv9 | sparcv9b | sparcv9v | sv1 | sx* \
-			| spu \
-			| tahoe \
-			| thumbv7* \
-			| tic30 | tic4x | tic54x | tic55x | tic6x | tic80 \
-			| tron \
-			| ubicom32 \
-			| v70 | v850 | v850e | v850e1 | v850es | v850e2 | v850e2v3 \
-			| vax \
-			| visium \
-			| w65 \
-			| wasm32 | wasm64 \
-			| we32k \
-			| x86 | x86_64 | xc16x | xgate | xps100 \
-			| xstormy16 | xtensa* \
-			| ymp \
-			| z8k | z80)
-				;;
-
-			*)
-				echo Invalid configuration \`"$1"\': machine \`"$cpu-$vendor"\' not recognized 1>&2
-				exit 1
-				;;
-		esac
-		;;
-esac
-
-# Here we canonicalize certain aliases for manufacturers.
-case $vendor in
-	digital*)
-		vendor=dec
-		;;
-	commodore*)
-		vendor=cbm
-		;;
-	*)
-		;;
-esac
-
-# Decode manufacturer-specific aliases for certain operating systems.
-
-if test x$basic_os != x
-then
-
-# First recognize some ad-hoc caes, or perhaps split kernel-os, or else just
-# set os.
-case $basic_os in
-	gnu/linux*)
-		kernel=linux
-		os=$(echo $basic_os | sed -e 's|gnu/linux|gnu|')
-		;;
-	os2-emx)
-		kernel=os2
-		os=$(echo $basic_os | sed -e 's|os2-emx|emx|')
-		;;
-	nto-qnx*)
-		kernel=nto
-		os=$(echo $basic_os | sed -e 's|nto-qnx|qnx|')
-		;;
-	*-*)
-		# shellcheck disable=SC2162
-		IFS="-" read kernel os <<EOF
-$basic_os
-EOF
-		;;
-	# Default OS when just kernel was specified
-	nto*)
-		kernel=nto
-		os=$(echo $basic_os | sed -e 's|nto|qnx|')
-		;;
-	linux*)
-		kernel=linux
-		os=$(echo $basic_os | sed -e 's|linux|gnu|')
-		;;
-	*)
-		kernel=
-		os=$basic_os
-		;;
-esac
-
-# Now, normalize the OS (knowing we just have one component, it's not a kernel,
-# etc.)
-case $os in
-	# First match some system type aliases that might get confused
-	# with valid system types.
-	# solaris* is a basic system type, with this one exception.
-	auroraux)
-		os=auroraux
-		;;
-	bluegene*)
-		os=cnk
-		;;
-	solaris1 | solaris1.*)
-		os=$(echo $os | sed -e 's|solaris1|sunos4|')
-		;;
-	solaris)
-		os=solaris2
-		;;
-	unixware*)
-		os=sysv4.2uw
-		;;
-	# es1800 is here to avoid being matched by es* (a different OS)
-	es1800*)
-		os=ose
-		;;
-	# Some version numbers need modification
-	chorusos*)
-		os=chorusos
-		;;
-	isc)
-		os=isc2.2
-		;;
-	sco6)
-		os=sco5v6
-		;;
-	sco5)
-		os=sco3.2v5
-		;;
-	sco4)
-		os=sco3.2v4
-		;;
-	sco3.2.[4-9]*)
-		os=$(echo $os | sed -e 's/sco3.2./sco3.2v/')
-		;;
-	sco*v* | scout)
-		# Don't match below
-		;;
-	sco*)
-		os=sco3.2v2
-		;;
-	psos*)
-		os=psos
-		;;
-	qnx*)
-		os=qnx
-		;;
-	hiux*)
-		os=hiuxwe2
-		;;
-	lynx*178)
-		os=lynxos178
-		;;
-	lynx*5)
-		os=lynxos5
-		;;
-	lynxos*)
-		# don't get caught up in next wildcard
-		;;
-	lynx*)
-		os=lynxos
-		;;
-	mac[0-9]*)
-		os=$(echo "$os" | sed -e 's|mac|macos|')
-		;;
-	opened*)
-		os=openedition
-		;;
-	os400*)
-		os=os400
-		;;
-	sunos5*)
-		os=$(echo "$os" | sed -e 's|sunos5|solaris2|')
-		;;
-	sunos6*)
-		os=$(echo "$os" | sed -e 's|sunos6|solaris3|')
-		;;
-	wince*)
-		os=wince
-		;;
-	utek*)
-		os=bsd
-		;;
-	dynix*)
-		os=bsd
-		;;
-	acis*)
-		os=aos
-		;;
-	atheos*)
-		os=atheos
-		;;
-	syllable*)
-		os=syllable
-		;;
-	386bsd)
-		os=bsd
-		;;
-	ctix* | uts*)
-		os=sysv
-		;;
-	nova*)
-		os=rtmk-nova
-		;;
-	ns2)
-		os=nextstep2
-		;;
-	# Preserve the version number of sinix5.
-	sinix5.*)
-		os=$(echo $os | sed -e 's|sinix|sysv|')
-		;;
-	sinix*)
-		os=sysv4
-		;;
-	tpf*)
-		os=tpf
-		;;
-	triton*)
-		os=sysv3
-		;;
-	oss*)
-		os=sysv3
-		;;
-	svr4*)
-		os=sysv4
-		;;
-	svr3)
-		os=sysv3
-		;;
-	sysvr4)
-		os=sysv4
-		;;
-	ose*)
-		os=ose
-		;;
-	*mint | mint[0-9]* | *MiNT | MiNT[0-9]*)
-		os=mint
-		;;
-	dicos*)
-		os=dicos
-		;;
-	pikeos*)
-		# Until real need of OS specific support for
-		# particular features comes up, bare metal
-		# configurations are quite functional.
-		case $cpu in
-		    arm*)
-			os=eabi
-			;;
-		    *)
-			os=elf
-			;;
-		esac
-		;;
-	*)
-		# No normalization, but not necessarily accepted, that comes below.
-		;;
-esac
-
-else
-
-# Here we handle the default operating systems that come with various machines.
-# The value should be what the vendor currently ships out the door with their
-# machine or put another way, the most popular os provided with the machine.
-
-# Note that if you're going to try to match "-MANUFACTURER" here (say,
-# "-sun"), then you have to tell the case statement up towards the top
-# that MANUFACTURER isn't an operating system.  Otherwise, code above
-# will signal an error saying that MANUFACTURER isn't an operating
-# system, and we'll never get to this point.
-
-kernel=
-case $cpu-$vendor in
-	score-*)
-		os=elf
-		;;
-	spu-*)
-		os=elf
-		;;
-	*-acorn)
-		os=riscix1.2
-		;;
-	arm*-rebel)
-		kernel=linux
-		os=gnu
-		;;
-	arm*-semi)
-		os=aout
-		;;
-	c4x-* | tic4x-*)
-		os=coff
-		;;
-	c8051-*)
-		os=elf
-		;;
-	clipper-intergraph)
-		os=clix
-		;;
-	hexagon-*)
-		os=elf
-		;;
-	tic54x-*)
-		os=coff
-		;;
-	tic55x-*)
-		os=coff
-		;;
-	tic6x-*)
-		os=coff
-		;;
-	# This must come before the *-dec entry.
-	pdp10-*)
-		os=tops20
-		;;
-	pdp11-*)
-		os=none
-		;;
-	*-dec | vax-*)
-		os=ultrix4.2
-		;;
-	m68*-apollo)
-		os=domain
-		;;
-	i386-sun)
-		os=sunos4.0.2
-		;;
-	m68000-sun)
-		os=sunos3
-		;;
-	m68*-cisco)
-		os=aout
-		;;
-	mep-*)
-		os=elf
-		;;
-	mips*-cisco)
-		os=elf
-		;;
-	mips*-*)
-		os=elf
-		;;
-	or32-*)
-		os=coff
-		;;
-	*-tti)	# must be before sparc entry or we get the wrong os.
-		os=sysv3
-		;;
-	sparc-* | *-sun)
-		os=sunos4.1.1
-		;;
-	pru-*)
-		os=elf
-		;;
-	*-be)
-		os=beos
-		;;
-	*-ibm)
-		os=aix
-		;;
-	*-knuth)
-		os=mmixware
-		;;
-	*-wec)
-		os=proelf
-		;;
-	*-winbond)
-		os=proelf
-		;;
-	*-oki)
-		os=proelf
-		;;
-	*-hp)
-		os=hpux
-		;;
-	*-hitachi)
-		os=hiux
-		;;
-	i860-* | *-att | *-ncr | *-altos | *-motorola | *-convergent)
-		os=sysv
-		;;
-	*-cbm)
-		os=amigaos
-		;;
-	*-dg)
-		os=dgux
-		;;
-	*-dolphin)
-		os=sysv3
-		;;
-	m68k-ccur)
-		os=rtu
-		;;
-	m88k-omron*)
-		os=luna
-		;;
-	*-next)
-		os=nextstep
-		;;
-	*-sequent)
-		os=ptx
-		;;
-	*-crds)
-		os=unos
-		;;
-	*-ns)
-		os=genix
-		;;
-	i370-*)
-		os=mvs
-		;;
-	*-gould)
-		os=sysv
-		;;
-	*-highlevel)
-		os=bsd
-		;;
-	*-encore)
-		os=bsd
-		;;
-	*-sgi)
-		os=irix
-		;;
-	*-siemens)
-		os=sysv4
-		;;
-	*-masscomp)
-		os=rtu
-		;;
-	f30[01]-fujitsu | f700-fujitsu)
-		os=uxpv
-		;;
-	*-rom68k)
-		os=coff
-		;;
-	*-*bug)
-		os=coff
-		;;
-	*-apple)
-		os=macos
-		;;
-	*-atari*)
-		os=mint
-		;;
-	*-wrs)
-		os=vxworks
-		;;
-	*)
-		os=none
-		;;
-esac
-
-fi
-
-# Now, validate our (potentially fixed-up) OS.
-case $os in
-	# Sometimes we do "kernel-abi", so those need to count as OSes.
-	musl* | newlib* | uclibc*)
-		;;
-	# Likewise for "kernel-libc"
-	eabi | eabihf | gnueabi | gnueabihf)
-		;;
-	# Now accept the basic system types.
-	# The portable systems comes first.
-	# Each alternative MUST end in a * to match a version number.
-	gnu* | android* | bsd* | mach* | minix* | genix* | ultrix* | irix* \
-	     | *vms* | esix* | aix* | cnk* | sunos | sunos[34]* \
-	     | hpux* | unos* | osf* | luna* | dgux* | auroraux* | solaris* \
-	     | sym* |  plan9* | psp* | sim* | xray* | os68k* | v88r* \
-	     | hiux* | abug | nacl* | netware* | windows* \
-	     | os9* | macos* | osx* | ios* \
-	     | mpw* | magic* | mmixware* | mon960* | lnews* \
-	     | amigaos* | amigados* | msdos* | newsos* | unicos* | aof* \
-	     | aos* | aros* | cloudabi* | sortix* | twizzler* \
-	     | nindy* | vxsim* | vxworks* | ebmon* | hms* | mvs* \
-	     | clix* | riscos* | uniplus* | iris* | isc* | rtu* | xenix* \
-	     | mirbsd* | netbsd* | dicos* | openedition* | ose* \
-	     | bitrig* | openbsd* | solidbsd* | libertybsd* | os108* \
-	     | ekkobsd* | freebsd* | riscix* | lynxos* | os400* \
-	     | bosx* | nextstep* | cxux* | aout* | elf* | oabi* \
-	     | ptx* | coff* | ecoff* | winnt* | domain* | vsta* \
-	     | udi* | lites* | ieee* | go32* | aux* | hcos* \
-	     | chorusrdb* | cegcc* | glidix* \
-	     | cygwin* | msys* | pe* | moss* | proelf* | rtems* \
-	     | midipix* | mingw32* | mingw64* | mint* \
-	     | uxpv* | beos* | mpeix* | udk* | moxiebox* \
-	     | interix* | uwin* | mks* | rhapsody* | darwin* \
-	     | openstep* | oskit* | conix* | pw32* | nonstopux* \
-	     | storm-chaos* | tops10* | tenex* | tops20* | its* \
-	     | os2* | vos* | palmos* | uclinux* | nucleus* | morphos* \
-	     | scout* | superux* | sysv* | rtmk* | tpf* | windiss* \
-	     | powermax* | dnix* | nx6 | nx7 | sei* | dragonfly* \
-	     | skyos* | haiku* | rdos* | toppers* | drops* | es* \
-	     | onefs* | tirtos* | phoenix* | fuchsia* | redox* | bme* \
-	     | midnightbsd* | amdhsa* | unleashed* | emscripten* | wasi* \
-	     | nsk* | powerunix* | genode* | zvmoe* | qnx* | emx*)
-		;;
-	# This one is extra strict with allowed versions
-	sco3.2v2 | sco3.2v[4-9]* | sco5v6*)
-		# Don't forget version if it is 3.2v4 or newer.
-		;;
-	none)
-		;;
-	*)
-		echo Invalid configuration \`"$1"\': OS \`"$os"\' not recognized 1>&2
-		exit 1
-		;;
-esac
-
-# As a final step for OS-related things, validate the OS-kernel combination
-# (given a valid OS), if there is a kernel.
-case $kernel-$os in
-	linux-gnu* | linux-dietlibc* | linux-android* | linux-newlib* | linux-musl* | linux-uclibc* )
-		;;
-	uclinux-uclibc* )
-		;;
-	-dietlibc* | -newlib* | -musl* | -uclibc* )
-		# These are just libc implementations, not actual OSes, and thus
-		# require a kernel.
-		echo "Invalid configuration \`$1': libc \`$os' needs explicit kernel." 1>&2
-		exit 1
-		;;
-	kfreebsd*-gnu* | kopensolaris*-gnu*)
-		;;
-	nto-qnx*)
-		;;
-	os2-emx)
-		;;
-	*-eabi* | *-gnueabi*)
-		;;
-	-*)
-		# Blank kernel with real OS is always fine.
-		;;
-	*-*)
-		echo "Invalid configuration \`$1': Kernel \`$kernel' not known to work with OS \`$os'." 1>&2
-		exit 1
-		;;
-esac
-
-# Here we handle the case where we know the os, and the CPU type, but not the
-# manufacturer.  We pick the logical manufacturer.
-case $vendor in
-	unknown)
-		case $cpu-$os in
-			*-riscix*)
-				vendor=acorn
-				;;
-			*-sunos*)
-				vendor=sun
-				;;
-			*-cnk* | *-aix*)
-				vendor=ibm
-				;;
-			*-beos*)
-				vendor=be
-				;;
-			*-hpux*)
-				vendor=hp
-				;;
-			*-mpeix*)
-				vendor=hp
-				;;
-			*-hiux*)
-				vendor=hitachi
-				;;
-			*-unos*)
-				vendor=crds
-				;;
-			*-dgux*)
-				vendor=dg
-				;;
-			*-luna*)
-				vendor=omron
-				;;
-			*-genix*)
-				vendor=ns
-				;;
-			*-clix*)
-				vendor=intergraph
-				;;
-			*-mvs* | *-opened*)
-				vendor=ibm
-				;;
-			*-os400*)
-				vendor=ibm
-				;;
-			s390-* | s390x-*)
-				vendor=ibm
-				;;
-			*-ptx*)
-				vendor=sequent
-				;;
-			*-tpf*)
-				vendor=ibm
-				;;
-			*-vxsim* | *-vxworks* | *-windiss*)
-				vendor=wrs
-				;;
-			*-aux*)
-				vendor=apple
-				;;
-			*-hms*)
-				vendor=hitachi
-				;;
-			*-mpw* | *-macos*)
-				vendor=apple
-				;;
-			*-*mint | *-mint[0-9]* | *-*MiNT | *-MiNT[0-9]*)
-				vendor=atari
-				;;
-			*-vos*)
-				vendor=stratus
-				;;
-		esac
-		;;
-esac
-
-echo "$cpu-$vendor-${kernel:+$kernel-}$os"
-exit
-
-# Local variables:
-# eval: (add-hook 'before-save-hook 'time-stamp)
-# time-stamp-start: "timestamp='"
-# time-stamp-format: "%:y-%02m-%02d"
-# time-stamp-end: "'"
-# End:
diff --git a/configure.in b/configure.in
deleted file mode 100644
index 01f51feb4..000000000
--- a/configure.in
+++ /dev/null
@@ -1,737 +0,0 @@
-#
-# Process this file with autoconf to produce a configure script
-#
-AC_PREREQ(2.59)
-AC_INIT(tmLQCD, 6.0.2, curbach@gmx.de)
-AC_CONFIG_HEADER(include/tmlqcd_config_internal.h)
-AC_CONFIG_SRCDIR([hmc_tm.c])
-AC_CANONICAL_HOST()
-AC_PREFIX_DEFAULT($HOME)
-AC_ARG_PROGRAM
-
-if test "$host_vendor" = "cray"; then
-  ac_cv_c_bigendian=yes
-fi
-
-AC_PROG_CC
-AC_PROG_CC_C99
-dnl AC_PROG_CC_STDC
-AC_C_CONST
-AC_C_INLINE
-AC_C_RESTRICT
-AC_F77_LIBRARY_LDFLAGS
-AC_CHECK_TOOL(AR, ar, [ar])
-LIBS="$LIBS $FLIBS -lm"
-
-AC_PROG_LEX
-dnl AC_PROG_LEX sets $LEX to ":" if neither lex nor flex are found! 
-if test "$LEX" = ":"; then
-  AC_MSG_ERROR([(F)LEX is required for building read_input.c. Please install it and run configure again.])
-fi
-
-AC_PROG_MAKE_SET
-AC_PROG_RANLIB
-AC_CHECK_PROG(CCDEP, gcc, "gcc", "$CC")
-AC_CHECK_PROG(CXXDEP, g++, "g++", "$CXX")
-#(endian="", AC_DEFINE(LITTLE_ENDIAN,1,The endian of the architechture))
-
-# AC_PROG_FC([ifort gfortran])
-# AC_FC_FUNC(testfunc, )
-
-LDFLAGS="$LDFLAGS -L\${HOME}/lib -L\${top_builddir}/lib"
-CCLD=${CC}
-
-# compilation in operator is slowest so we do it first, saves time in parallel compiles
-USESUBDIRS="operator linalg solver monomial buffers cu io meas xchange init rational smearing wrapper"
-
-AC_CHECK_HEADERS([stdint.h],
-[ dnl for inttypes.h and stdint.h for uint_xxx types
-  dnl if successful check for the actual types too
-  AC_CHECK_TYPES([uint16_t, uint32_t, uint64_t],
-                 [],
-                 [AC_MSG_ERROR([stdint.h found but either uint16_t, uint32_t or uint64_t not found]) ]
-                )
-],
-[
-  dnl no inttypes.h or stdint.h found check common unsigned types
-  dnl for sizes and make appropriate decisions in the lime_fixed_types.h file
-  AC_CHECK_SIZEOF(unsigned char)
-  AC_CHECK_SIZEOF(unsigned short)
-  AC_CHECK_SIZEOF(unsigned int)
-  AC_CHECK_SIZEOF(unsigned long)
-  AC_CHECK_SIZEOF(unsigned long long)
-]
-)
-
-AC_MSG_CHECKING(where to find lime)
-AC_ARG_WITH(limedir,
-  AS_HELP_STRING([--with-limedir[=dir]], [search lime in dir [default=./lime]]),
-  lime_dir=$withval, lime_dir="./lime")
-AC_MSG_RESULT($lime_dir)
-LDFLAGS="$LDFLAGS -L${lime_dir}/lib/"
-AC_CHECK_LIB([lime], [limeReaderNextRecord],[],
-              [AC_MSG_ERROR([library liblime is missing or needed function is not available])])
-
-#LIBS="$LIBS $FLIBS -lm"
-
-AC_MSG_CHECKING(whether we want to use lemon)
-AC_ARG_WITH(lemondir,
-            AS_HELP_STRING([--with-lemondir[=dir]], [use lemon, to be found in dir]),
-             [echo $withval
-              LEMON_AVAILABLE=1
-              lemon_dir=$withval
-              LDFLAGS="$LDFLAGS -L${lemon_dir}/lib"
-              AC_CHECK_LIB([lemon],
-                           [lemonReaderNextRecord],
-                           [],
-                           [AC_MSG_ERROR([library liblemon was not found])])],
-             [echo no
-              LEMON_AVAILABLE=0])
-
-AC_MSG_CHECKING(whether we want to use MPI)
-AC_ARG_ENABLE(mpi,
-  AS_HELP_STRING([--enable-mpi], [enable use of mpi [default=yes]]),
-  enable_mpi=$enableval, enable_mpi=yes)
-if test $enable_mpi = yes; then
-  AC_MSG_RESULT(yes)
-  AC_DEFINE(TM_USE_MPI,1,Compile with MPI support)
-else
-  AC_MSG_RESULT(no)
-fi
-
-AC_MSG_CHECKING(whether we want to use DDalphaAMG)
-AC_ARG_WITH(DDalphaAMG,
-            AS_HELP_STRING([--with-DDalphaAMG[=dir]], [use DDalphaAMG, to be found in dir]),
-             [echo $withval
-              DDalphaAMG_AVAILABLE=1
-              DDalphaAMG_INTERFACE="DDalphaAMG_interface"
-              AC_DEFINE(DDalphaAMG,1,Using DDalphaAMG)
-              DDalphaAMG_dir=$withval
-              LDFLAGS="$LDFLAGS -L${DDalphaAMG_dir}/lib"
-              INCLUDES="$INCLUDES -I${DDalphaAMG_dir}/include/"
-              AC_CHECK_LIB([DDalphaAMG],
-                           [DDalphaAMG_finalize],
-                           [],
-                           [AC_MSG_ERROR([library DDalphaAMG was not found])])],
-             [echo no
-              DDalphaAMG_AVAILABLE=0
-              DDalphaAMG_INTERFACE="DDalphaAMG_interface"
-              ])
-
-AC_MSG_CHECKING(whether we want to use OpenMP)
-AC_ARG_ENABLE(omp,
-  AS_HELP_STRING([--enable-omp], [enable use of OpenMP [default=yes]]),
-  enable_omp=$enableval, enable_omp=yes)
-if test $enable_omp = yes; then
-  AC_MSG_RESULT(yes)
-  AC_DEFINE(TM_USE_OMP,1,Compile with OpenMP support)
-  AC_CHECK_HEADERS([omp.h],,[AC_MSG_ERROR([Cannot find OpenMP headers!])])
-  AC_OPENMP
-# -- AC_OPENMP provides a compiler-dependent OPENMP_CFLAGS so we can set it here
-    CFLAGS="$CFLAGS $OPENMP_CFLAGS"
-    CPPFLAGS="$CPPFLAGS $OPENMP_CFLAGS"
-    LDFLAGS="$LDFLAGS $OPENMP_CFLAGS"
-else
-  AC_MSG_RESULT(no)
-fi
-
-fftw_lib=/usr
-AC_MSG_CHECKING(whether we want to use FFTW)
-AC_ARG_ENABLE(fftw,
-  AS_HELP_STRING([--enable-fftw], [enable use of fftw [default=no]]),
-  enable_fftw=$enableval, enable_fftw=no)
-if test $enable_fftw = yes; then
-  AC_MSG_RESULT(yes)
-  AC_DEFINE(HAVE_FFTW,1,Compile with FFTW support)
-  LIBS="-lfftw3 ${LIBS}"
-elif test $enable_fftw = no; then
-  AC_MSG_RESULT(no)
-else
-  AC_MSG_RESULT(yes)
-  AC_DEFINE(HAVE_FFTW,1,Compile with FFTW support)
-  fftw_lib=${enable_fftw}
-  LDFLAGS="$LDFLAGS -L${fftw_lib}/lib64"
-  LIBS="-lfftw3 ${LIBS}"
-  INCLUDES="-I${fftw_lib}/include ${INCLUDES}"
-fi
-
-if test $enable_mpi = yes; then
-  AC_MSG_CHECKING(which parallelisation to use for MPI)
-  AC_ARG_WITH(mpidimension,
-    AS_HELP_STRING([--with-mpidimension[=n]], [use n dimensional parallelisation [default=1]]),
-    withmpidimension=$withval, withmpidimension=1)
-  if test $withmpidimension = 1; then
-    AC_MSG_RESULT(n=1 [t])
-    AC_DEFINE(PARALLELT,1,One dimensional parallelisation)
-  elif test $withmpidimension = 2; then
-    AC_MSG_RESULT(n=2 [xt])
-    AC_DEFINE(PARALLELXT,1,Two dimensional parallelisation)
-  elif test $withmpidimension = 3; then
-    AC_MSG_RESULT(n=3 [xyt])
-    AC_DEFINE(PARALLELXYT,1,Three dimensional parallelisation)
-  elif test $withmpidimension = 4; then
-    AC_MSG_RESULT(n=4 [xyzt])
-    AC_DEFINE(PARALLELXYZT,1,Four dimensional parallelisation)
-  elif test $withmpidimension = X; then
-    AC_MSG_RESULT(n=1 [x])
-    AC_DEFINE(PARALLELX,1, X parallelisation)
-  elif test $withmpidimension = XY; then
-    AC_MSG_RESULT(n=2 [xy])
-    AC_DEFINE(PARALLELXY,1, XY parallelisation)
-  elif test $withmpidimension = XYZ; then
-    AC_MSG_RESULT(n=3 [xyz])
-    AC_DEFINE(PARALLELXYZ,1, XYZ parallelisation)
-  elif test $withmpidimension = T; then
-    AC_MSG_RESULT(n=1 [t])
-    AC_DEFINE(PARALLELT,1, T parallelisation)
-  elif test $withmpidimension = XT; then
-    AC_MSG_RESULT(n=2 [xt])
-    AC_DEFINE(PARALLELXT,1, XT parallelisation)
-  elif test $withmpidimension = XYT; then
-    AC_MSG_RESULT(n=3 [xyt])
-    AC_DEFINE(PARALLELXYT,1, XYT parallelisation)
-  elif test $withmpidimension = XYZT; then
-    AC_MSG_RESULT(n=4 [xyzt])
-    AC_DEFINE(PARALLELXYZT,1, XYZT parallelisation)
-  else
-    AC_MSG_RESULT(unknown)
-    AC_MSG_ERROR([Only t, xt, xyt, xyzt, x, xy, xyz parallelisation available])
-  fi
-
-  AC_MSG_CHECKING(whether we shall use persistent MPI calls for halfspinor)
-  AC_ARG_WITH([persistentmpi],
-    AS_HELP_STRING([--with-persistentmpi], [use persistent MPI calls for halfspinor [default=no]]),
-    withpersistent=$withval, withpersistent=no)
-  if test $withpersistent = yes; then
-    AC_MSG_RESULT(yes)
-    AC_DEFINE(_PERSISTENT,1,use persistent MPI calls for halfspinor)
-  else
-    AC_MSG_RESULT(no)
-  fi
-
-  AC_MSG_CHECKING(whether we shall use non-blocking MPI calls)
-  AC_ARG_WITH([nonblockingmpi],
-    AS_HELP_STRING([--with-nonblockingmpi], [use non-blocking MPI calls for spinor and gauge [default=yes]]),
-    withnonblock=$withval, withnonblock=yes)
-  if test $withnonblock = yes; then
-    AC_MSG_RESULT(yes)
-    AC_DEFINE(_NON_BLOCKING,1,use non-blocking MPI calls for spinor ang gauge)
-  else
-    AC_MSG_RESULT(no)
-  fi
-fi
-
-AC_MSG_CHECKING([whether we want to fix volume at compiletime])
-AC_ARG_WITH([fixedvolume],
-  AS_HELP_STRING([--with-fixedvolume], [fix volume at compiletime [default=no]]),
-  with_fixvol=$withval, with_fixvol=no)
-if test $with_fixvol = yes; then
-  AC_MSG_RESULT(yes)
-  AC_DEFINE(FIXEDVOLUME,1,Fixed volume at compiletime)
-  AC_CONFIG_FILES([fixed_volume.h])
-else
-  AC_MSG_RESULT(no)
-fi
-
-AC_MSG_CHECKING([whether we want to use KOJAK instrumentalisation])
-AC_ARG_WITH([kojakinst],
-  AS_HELP_STRING([--with-kojakinst], [instrumentalise for KOJAK [default=no]]),
-  with_kojakinst=$withval, with_kojakinst=no)
-if test $with_kojakinst = yes; then
-  AC_MSG_RESULT(yes)
-  CC="kinst-pomp ${CC}"
-else
-  AC_MSG_RESULT(no)
-fi
-
-AC_MSG_CHECKING(whether we want to use lapack and blas)
-AC_ARG_WITH(lapack,
-  AS_HELP_STRING([--with-lapack], [enable use of lapack [default=yes]]),
-  with_lapack=$withval, with_lapack=yes)
-if test "$with_lapack" = yes; then
-  AC_MSG_RESULT(yes)
-  LAPACKLIB=
-  AC_DEFINE(HAVE_LAPACK,1,lapack available)
-elif test "$with_lapack" != no; then
-  AC_MSG_RESULT(yes)
-  LIBS="$withval $LIBS"
-  with_lapack=yes
-  AC_DEFINE(HAVE_LAPACK,1,lapack available)
-else
-  AC_MSG_RESULT(no)
-  AC_MSG_ERROR([lapack is needed! Will stop here.])
-fi
-
-if test $enable_mpi = yes; then
-  dnl In general one cannot run mpi programs directly
-  dnl thats why we need here cross_compiling=yes
-  dnl for non CRAY
-  if test "$host_vendor" != "cray"; then
-    cross_compiling=yes
-  fi
-fi
-
-dnl for the case of other configure scripts
-dnl AC_CONFIG_SUBDIRS( rng )
-
-dnl check for clock_gettime and set correct library flag if one is required
-dnl (this is done by AC_CHECK_LIB)
-AC_CHECK_FUNCS(clock_gettime, [], [AC_CHECK_LIB(rt, clock_gettime)])
-
-dnl in principle clock_gettime and CLOCK_MONOTONIC/CLOCK_REALTIME should be available
-dnl only when using POSIX 199309, we set this explicitly here
-dnl this should not cause problems on any relatively modern (post y2k) machine!
-if ( test "$ac_cv_lib_rt_clock_gettime" = "yes" || test "$ac_cv_func_clock_gettime" = "yes" ); then
-  AC_DEFINE(HAVE_CLOCK_GETTIME,1)
-dnl  we set this in gettime.c explicitly for the time being 
-dnl  due to endian problem on BG/Q
-dnl  CFLAGS="$CFLAGS -D_POSIX_C_SOURCE=199309L"
-  AC_MSG_NOTICE([Instructing the compiler to use POSIX 199309L])
-fi
-
-dnl Checks for lapack and defines proper name mangling scheme for
-dnl linking with f77 code
-AC_F77_FUNC(zheev)
-if test "$zheev" = "zheev"; then
-  AC_DEFINE(NOF77_,1,Fortran has no extra _)
-fi
-AC_SEARCH_LIBS([$zheev],[lapack], [], [AC_MSG_ERROR([Cannot find lapack])])
-
-dnl Checks for header files.
-AC_HEADER_STDC
-AC_CHECK_HEADERS([float.h libintl.h limits.h stdint.h stdlib.h string.h strings.h sys/time.h unistd.h endian.h])
-AC_CHECK_HEADER( getopt.h, [])
-
-dnl Checks for typedefs, structures, and compiler characteristics.
-AC_C_CONST
-AC_TYPE_OFF_T
-AC_TYPE_SIZE_T
-AC_HEADER_TIME
-
-dnl Checks for library functions.
-AC_SYS_LARGEFILE
-AC_FUNC_FSEEKO
-AC_FUNC_MALLOC
-AC_TYPE_SIGNAL
-AC_CHECK_FUNCS([gettimeofday pow sqrt])
-
-dnl We now define some replacement variables
-AC_SUBST(OPTARGS)
-AC_SUBST(SOPTARGS)
-AC_SUBST(INCLUDES)
-AC_SUBST(AUTOCONF)
-AC_SUBST(SOLVEROUT)
-AC_SUBST(CCDEP)
-AC_SUBST(CXXDEP)
-AC_SUBST(CCLD)
-AC_SUBST(DEPFLAGS)
-AC_SUBST(CXXDEPFLAGS)
-AC_SUBST(DEBUG_FLAG)
-AC_SUBST(PROFILE_FLAG)
-AC_SUBST(XCHANGELIB)
-AC_SUBST(XCHANGEDIR)
-AC_SUBST(MEASDIR)
-AC_SUBST(XLIB)
-AC_SUBST([LEMON_AVAILABLE])
-AC_SUBST(QUDA_INTERFACE)
-AC_SUBST(QPHIX_INTERFACE)
-AC_SUBST(QPHIX_PROGRAMS)
-AC_SUBST(DDalphaAMG_INTERFACE)
-
-INCLUDES="$INCLUDES -I\$(HOME)/include/ -I. -I\${abs_top_builddir}/  -I\${abs_top_builddir}/include/ -I\${abs_top_srcdir}/ -I\${abs_top_srcdir}/include/ -I${lime_dir}/include/ -I${lemon_dir}/include/"
-DEPFLAGS="$DEPFLAGS"
-
-AC_MSG_CHECKING(what alignment we want for arrays)
-AC_ARG_ENABLE(alignment,
-  [AS_HELP_STRING([--enable-alignment[=n]], [Automatically or expliclty align arrays to byte number: auto, none, 16, 32, 64 [default=auto]])],
-  withalign=$enableval, withalign=auto)
-if test "$withalign" = "none"; then
-  AC_MSG_RESULT(none)
-  withalign=1
-  AC_DEFINE(ALIGN_BASE, 0x00, [Align base])
-  AC_DEFINE(ALIGN, [])
-  AC_DEFINE(ALIGN_BASE32, 0x00, [Align base32])
-  AC_DEFINE(ALIGN32, [], [])
-elif test $withalign = 16; then
-  AC_MSG_RESULT(16 bytes)
-  AC_DEFINE(ALIGN_BASE, 0x0F, [Align base])
-  AC_DEFINE(ALIGN, [__attribute__ ((aligned (16)))])
-  AC_DEFINE(ALIGN_BASE32, 0x0F, [Align base32])
-  AC_DEFINE(ALIGN32, [__attribute__ ((aligned (16)))], [])
-elif test $withalign = 32; then
-  AC_MSG_RESULT(32 bytes)
-  AC_DEFINE(ALIGN_BASE, 0x1F, [Align base])
-  AC_DEFINE(ALIGN, [__attribute__ ((aligned (32)))])
-  AC_DEFINE(ALIGN_BASE32, 0x1F, [Align base32])
-  AC_DEFINE(ALIGN32, [__attribute__ ((aligned (32)))], [])
-elif test $withalign = 64; then
-  AC_MSG_RESULT(64 bytes)
-  AC_DEFINE(ALIGN_BASE, 0x3F, [Align base])
-  AC_DEFINE(ALIGN, [__attribute__ ((aligned (64)))])
-  AC_DEFINE(ALIGN_BASE32, 0x3F, [Align base32])
-  AC_DEFINE(ALIGN32, [__attribute__ ((aligned (64)))], [])
-elif test $withalign = auto; then
-  withautoalign=1
-  AC_MSG_RESULT(auto)
-  AC_DEFINE(ALIGN_BASE, 0x00, [Align base])
-  AC_DEFINE(ALIGN, [], [])
-  AC_DEFINE(ALIGN_BASE32, 0x00, [Align base32])
-  AC_DEFINE(ALIGN32, [], [])
-else
-  AC_MSG_RESULT(Unusable value for array alignment)
-  AC_MSG_ERROR([Allowed values are: auto, none, 16, 32, 64])
-fi
-
-dnl We here check for alignment issues with QPX instructions -- this flag has been set earlier
-if test $enable_qpx = yes; then
-  if test $withalign = auto; then
-    if test $withautoalign -lt 32; then
-      AC_MSG_RESULT(increasing array alignment to 32 bytes for use of QPX instructions on BG/Q)
-      AC_DEFINE(ALIGN_BASE, 0x1F, [Align base])
-      AC_DEFINE(ALIGN, [__attribute__ ((aligned (32)))])
-      AC_MSG_RESULT(increasing 32bit array alignment to 16 bytes for use of QPX instructions on BG/Q)
-      AC_DEFINE(ALIGN_BASE32, 0x0F, [Align base32])
-      AC_DEFINE(ALIGN32, [__attribute__ ((aligned (16)))])
-      withautoalign=32
-    fi
-  elif test $withalign -lt 32; then
-    AC_MSG_ERROR([alignment incompatible with QPX instructions (32 bytes required)])
-  fi
-fi
-
-dnl Check for alignment associated with (non-QPX) BG optimization.
-dnl This will also result in using 32 byte alignment on MareNostrum, but that should be fairly innocuous.
-if test "$host_cpu" = "powerpc" && test "$host_vendor" = "ibm" && test "$host_os" = "blrts"; then
-  if test $withalign = auto; then
-    if test $withautoalign -lt 16; then
-      AC_MSG_RESULT(increasing array alignment to 16 bytes for BG/L optimization)
-      AC_DEFINE(ALIGN_BASE, 0x0F, [Align base])
-      AC_DEFINE(ALIGN, [__attribute__ ((aligned (16)))], [Align base])
-      withautoalign=16
-    fi
-  fi
-elif test "$host_cpu" = "powerpc" && test "$host_vendor" = "ibm" && test "$host_os" = "bprts"; then
-  if test $withalign = auto; then
-    if test $withautoalign -lt 16; then
-      AC_MSG_RESULT(increasing array alignment to 16 bytes for BG/P optimization)
-      AC_DEFINE(ALIGN_BASE, 0x0F, [Align base])
-      AC_DEFINE(ALIGN, [__attribute__ ((aligned (16)))], [Align base])
-      withautoalign=16
-    fi
-  fi
-elif test "$host_cpu" = "powerpc64" && test "$host_vendor" = "unknown" && test "$host_os" = "linux-gnu"; then
-  if test $withalign = auto; then
-    if test $withautoalign -lt 32; then
-      AC_MSG_RESULT(increasing array alignment to 32 bytes for BG/Q and generic POWER optimization)
-      AC_DEFINE(ALIGN_BASE, 0x1F, [Align base])
-      AC_DEFINE(ALIGN, [__attribute__ ((aligned (32)))])
-      AC_MSG_RESULT(increasing array 32 bit alignment to 16 bytes for BG/Q and generic POWER optimization)
-      AC_DEFINE(ALIGN_BASE32, 0x0F, [Align base])
-      AC_DEFINE(ALIGN32, [__attribute__ ((aligned (16)))])
-      withautoalign=32
-    fi
-  fi
-fi
-
-AC_MSG_CHECKING(whether we want to use gprof as profiler)
-AC_ARG_WITH(gprof,
-  AS_HELP_STRING([--with-gprof], [use of gprof profiler [default=no]]),
-  enable_gprof=$withval, enable_gprof=no)
-if test $enable_gprof = yes; then
-  AC_MSG_RESULT(yes)
-    if test "$host_cpu" = "powerpc" && test "$host_vendor" = "ibm"; then
-      PROFILE_FLAG="-pg -qfullpath -g"
-    else
-      PROFILE_FLAG="-pg -g"
-    fi
-else
-  AC_MSG_RESULT(no)
-  PROFILE_FLAG=
-fi
-
-dnl Now we have to set all Flags and compiler properly
-PGCC=`$CC -V 2>&1 | grep pgcc`
-ICC=`$CC -V 2>&1 | grep -i intel`
-
-dnl first for PC's
-if test "$host_cpu" = "i686" || test "$host_cpu" = "x86_64"; then
-dnl the GNU compiler
-  if test "$GCC" = yes && test "$ICC" = ""; then
-    DEPFLAGS="-MM"
-    CFLAGS="$CFLAGS -pedantic -Wall"
-    OPTARGS='-O'
-    SOPTARGS='-O'
-
-    if test "$host_cpu" = "x86_64"; then
-      AC_DEFINE(_x86_64,1,x86 64 Bit architecture)
-    fi
-    CCDEP="$CC"
-    if test $enable_mpi = yes; then
-      CCDEP="gcc"
-    fi
-    CXXDEP="$CXX"
-    if test $enable_mpi = yes; then
-      CXXDEP="g++"
-    fi
-    DEBUG_FLAG="-g"
-dnl other compilers
-  else
-dnl check for pgcc
-    if test "$PGCC" != ""; then
-      DEPFLAGS="-M"
-      echo "We are using the Portland Group C compiler!"
-      OPTARGS="-O2"
-      SOPTARGS="-O2"
-      DEBUG_FLAG="-g"
-      PROFILE_FLAG="-p -g"
-      CCDEP="$CC"
-
-dnl check for icc
-    elif test "$ICC" != ""; then
-      echo "We are using the Intel C compiler!"
-      DEPFLAGS="-M"
-      OPTARGS="-O3"
-      SOPTARGS="-O3"
-      DEBUG_FLAG="-g"
-      PROFILE_FLAG="-p -g"
-      CCDEP="$CC"
-      CXXDEP="$CXX"
-    else
-      DEPFLAGS="-M"
-      CFLAGS="$CFLAGS -O"
-      DEBUG_FLAG="-g"
-      CCDEP="$CC"
-      CXXDEP="$CXX"
-    fi
-  fi
-# The CRAY
-elif test "$host_vendor" = "cray"; then
-  echo
-  echo "Hey, we are on a cray, you should take some time for this..."
-  echo "get yourself a coffee or so!"
-  echo
-  CFLAGS="$CFLAGS -dp"
-  AC_DEFINE(CRAY,1,We are on a CRAY)
-  OPTARGS="-O3"
-  SOPTARGS="-O3"
-  DEBUG_FLAG="-g"
-  CCDEP="$CC"
-  DEPFLAGS="-M"
-else
-  AC_CHECK_PROG(CCDEP, gcc, "gcc", "$CC")
-  if test "$CCDEP" = "gcc"; then
-    DEPFLAGS="-MM"
-  else
-    DEPFLAGS="-M"
-  fi
-  OPTARGS=
-  SOPTARGS=
-fi
-
-CXXDEPFLAGS="$DEPFLAGS --std=c++11"
-
-AC_MSG_CHECKING(whether we want to switch on optimisation)
-AC_ARG_ENABLE(optimize,
-  AS_HELP_STRING([--enable-optimize], [enable optimisation [default=yes]]),
-  enable_optimize=$enableval, enable_optimize=yes)
-if test $enable_optimize = no; then
-  AC_MSG_RESULT(no)
-  OPTARGS=
-  SOPTARGS=
-else
-  AC_MSG_RESULT(yes)
-fi
-
-AC_MSG_CHECKING(whether we want to use a copy of the gauge field)
-AC_ARG_ENABLE(gaugecopy,
-  AS_HELP_STRING([--enable-gaugecopy], [enable use of a copy of the gauge field [default=yes]]),
-  enable_gaugecopy=$enableval, enable_gaugecopy=yes)
-if test $enable_gaugecopy = yes; then
-  AC_MSG_RESULT(yes)
-  AC_DEFINE(_GAUGE_COPY,1,Construct an extra copy of the gauge fields)
-else
-  AC_MSG_RESULT(no)
-fi
-
-AC_MSG_CHECKING(whether we want to use a Dirac Op. with halfspinor exchange)
-AC_ARG_ENABLE(halfspinor,
-  AS_HELP_STRING([--enable-halfspinor], [use a Dirac Op. with halfspinor exchange [default=yes]]),
-  enable_halfspinor=$enableval, enable_halfspinor=yes)
-if test $enable_halfspinor = yes; then
-  AC_MSG_RESULT(yes)
-  AC_DEFINE(_USE_HALFSPINOR,1,Exchange only a halfspinor in the Dirac Operator)
-  if test $enable_gaugecopy = no; then
-    AC_MSG_WARN([switching on gaugecopy for Dirac operator with halfspinor!])
-    AC_DEFINE(_GAUGE_COPY,1,Construct an extra copy of the gauge fields)
-  fi
-else
-  AC_MSG_RESULT(no)
-fi
-
-AC_MSG_CHECKING(whether we want to use shmem API)
-AC_ARG_ENABLE(shmem,
-  AS_HELP_STRING([--enable-shmem],[use shmem API [default=no]]),
-  enable_shmem=$enableval, enable_shmem=no)
-if test $enable_shmem = yes; then
-  AC_MSG_RESULT(yes)
-  AC_DEFINE(_USE_SHMEM,1,Use shmem API)
-  LIBS="$LIBS -lsma"
-else
-  AC_MSG_RESULT(no)
-fi
-
-
-AC_SUBST(USESUBDIRS)
-
-AC_MSG_CHECKING(whether we want to use CUDA)
-AC_ARG_WITH(cudadir,
-            AS_HELP_STRING([--with-cudadir[=dir]], [use CUDA library (specify 'lib' directory)]),
-             [AC_MSG_RESULT($withval)
-              CUDA_AVAILABLE=1
-              cuda_dir=$withval
-              LDFLAGS="$LDFLAGS -L${cuda_dir} -lcuda"
-              AC_CHECK_LIB([cudart],
-                           [cudaMalloc],
-                           [],
-                           [AC_MSG_ERROR([Can't link a simple program against library cudart.])])],
-             [AC_MSG_RESULT(no)
-              CUDA_AVAILABLE=0])
-
-AC_MSG_CHECKING(whether we want to use HIP)
-AC_ARG_WITH(hipdir,
-            AS_HELP_STRING([--with-hipdir[=dir]], [use HIP library (specify 'lib' directory)]),
-             [AC_MSG_RESULT($withval)
-              HIP_AVAILABLE=1
-              hip_dir=$withval
-              LDFLAGS="$LDFLAGS -L${hip_dir} -lamdhip64"
-              AC_CHECK_LIB([amdhip64],
-                           [hipMalloc],
-                           [],
-                           [AC_MSG_ERROR([Can't link a simple program against library amdhip64.])])],
-             [AC_MSG_RESULT(no)
-              HIP_AVAILABLE=0])
-
-
-# QUDA library for GPUs
-AC_MSG_CHECKING(whether we want to use QUDA)
-AC_ARG_WITH(qudadir,
-            AS_HELP_STRING([--with-qudadir[=dir]], [use QUDA library (specify directory which contains 'include' and 'lib' subdirs)]),
-             [AC_MSG_RESULT($withval)
-              if test $CUDA_AVAILABLE -ne 1 && test $HIP_AVAILABLE -ne 1; then
-                AC_MSG_ERROR([Need either CUDA or HIP to link against QUDA!])
-              fi
-              QUDA_AVAILABLE=1
-              AC_DEFINE(TM_USE_QUDA,1,Using QUDA GPU)
-              quda_dir=$withval
-              LDFLAGS="$LDFLAGS -L${quda_dir}/lib"
-              INCLUDES="$INCLUDES -I${quda_dir}/include/"
-              QUDA_INTERFACE="quda_interface"
-              AC_CHECK_LIB([quda],
-                           [freeGaugeQuda],
-                           [],
-                           [AC_MSG_ERROR([Can't link a simple program against library libquda. (Did you set CXX properly?)])]
-                           )
-              #QUDA needs to be linked with C++ linker
-              CCLD=${CXX}
-             ],
-             [AC_MSG_RESULT(no)
-              QUDA_AVAILABLE=0
-              QUDA_INTERFACE=""
-              ]
-            )
-AC_SUBST([QUDA_AVAILABLE])
-
-AC_MSG_CHECKING(whether the QUDA version is experimental)
-AC_ARG_ENABLE(quda_experimental,
-  AS_HELP_STRING([--enable-quda_experimental], [enable support for experimental QUDA versions [default=no]]),
-  enable_quda_experimental=$enableval, enable_quda_experimental=no)
-if test $enable_quda_experimental = yes; then
-  AC_MSG_RESULT(yes)
-  AC_DEFINE(TM_QUDA_EXPERIMENTAL,1,Experimental QUDA version in use)
-else
-  AC_MSG_RESULT(no)
-fi
-AC_MSG_CHECKING(whether the QUDA force is enabled)
-AC_ARG_ENABLE(quda_fermionic_forces,
-  AS_HELP_STRING([--enable-quda_fermionic_forces], [enable support for fermionic forces using QUDA [default=yes]]),
-  enable_quda_fermionic_forces=$enableval, enable_quda_fermionic_forces=yes)
-if test $enable_quda_fermionic_forces = no; then
-  AC_MSG_RESULT(no)
-else
-  AC_MSG_RESULT(yes)
-  AC_DEFINE(TM_QUDA_FERMIONIC_FORCES,1, fermionic forces with QUDA are enabled)
-fi
-
-# QPhiX library for Intel Xeon and Xeon Phis
-AC_MSG_CHECKING(whether we want to use QPhiX)
-AC_ARG_WITH(qphixdir,
-            AS_HELP_STRING([--with-qphixdir[=dir]], [use QPhiX, to be found in dir]),
-             [echo yes
-              QPHIX_AVAILABLE=1
-              AC_DEFINE(TM_USE_QPHIX,1,Using QPhiX)
-              qphix_dir=$withval
-              LDFLAGS="$LDFLAGS -L${qphix_dir}/lib -lqphix_solver -lqphix_codegen"
-              INCLUDES="$INCLUDES -I${qphix_dir}/include/" 
-              QPHIX_INTERFACE="qphix_interface"
-              QPHIX_PROGRAMS=""
-              # Due to github issue #404, the qphix test_Dslash code has been disabled by BaKo
-              # for the time being
-              # it should be updated to make use of the QPhiX internal interfaces
-              # for passing full lattice spinors
-              # "qphix_test_Dslash"
-
-              # QMP: TODO AC_CHECK_LIB
-              AC_MSG_CHECKING([where to search for QMP libs])
-              AC_ARG_WITH(qmpdir,
-                          AS_HELP_STRING([--with-qmpdir[=dir]], [if using QPhiX, then set QMP lib dir]),
-                          qmp_dir=$withval
-                          LDFLAGS="$LDFLAGS -L${qmp_dir}/lib -lqmp"
-                          INCLUDES="$INCLUDES -I${qmp_dir}/include/"
-                          )
-              AC_MSG_RESULT($qmp_dir)
-
-              AC_MSG_CHECKING([Setting QPhiX SOALEN])
-              AC_ARG_ENABLE(qphix-soalen,
-                            AS_HELP_STRING([--enable-qphix-soalen], [if using QPhiX, set SOALEN [default=4]]),
-                            enable_qphix_soalen=$enableval, enable_qphix_soalen=4)
-              AC_MSG_RESULT($enable_qphix_soalen)
-              AC_DEFINE_UNQUOTED(QPHIX_SOALEN, ${enable_qphix_soalen}, Structure of Array length to use with QPhiX)
-
-              AC_PROG_CXX
-              #QPhiX needs to be linked with C++ linker
-              CCLD=${CXX}
-             ],
-             [echo no
-              QPHIX_AVAILABLE=0
-              QPHIX_INTERFACE=""])
-AC_SUBST([QPHIX_AVAILABLE])
-
-if test ! -e lib; then
-  mkdir lib
-fi
-
-dnl create the test and tests directory here
-if test ! -e test; then
-  mkdir test
-fi
-
-if test ! -e tests; then
-  mkdir tests
-fi
-
-if test ! -e tests/regressions; then
-  mkdir tests/regressions
-fi
-
-
-LIBS="-lhmc -lmonomial -loperator -lsolver -linit -lmeas -llinalg -lhmc -lxchange -lrational -lio $LIBS"
-AUTOCONF=autoconf
-
-for i in $USESUBDIRS
-do
-  make_files="$make_files $i/Makefile"
-done
-
-AC_CONFIG_FILES([Makefile $make_files])
-
-AC_OUTPUT
diff --git a/src/lib/profiling/hmc/Readme.md b/profiling/hmc/Readme.md
similarity index 100%
rename from src/lib/profiling/hmc/Readme.md
rename to profiling/hmc/Readme.md
diff --git a/src/lib/profiling/hmc/example_profile.pdf b/profiling/hmc/example_profile.pdf
similarity index 100%
rename from src/lib/profiling/hmc/example_profile.pdf
rename to profiling/hmc/example_profile.pdf
diff --git a/src/lib/profiling/hmc/profile.Rmd b/profiling/hmc/profile.Rmd
similarity index 100%
rename from src/lib/profiling/hmc/profile.Rmd
rename to profiling/hmc/profile.Rmd
diff --git a/src/lib/profiling/hmc/timing.R b/profiling/hmc/timing.R
similarity index 100%
rename from src/lib/profiling/hmc/timing.R
rename to profiling/hmc/timing.R
diff --git a/src/lib/profiling/hmc_mk2/.gitignore b/profiling/hmc_mk2/.gitignore
similarity index 100%
rename from src/lib/profiling/hmc_mk2/.gitignore
rename to profiling/hmc_mk2/.gitignore
diff --git a/src/lib/profiling/hmc_mk2/README.md b/profiling/hmc_mk2/README.md
similarity index 100%
rename from src/lib/profiling/hmc_mk2/README.md
rename to profiling/hmc_mk2/README.md
diff --git a/src/lib/profiling/hmc_mk2/logs/example_log.out b/profiling/hmc_mk2/logs/example_log.out
similarity index 99%
rename from src/lib/profiling/hmc_mk2/logs/example_log.out
rename to profiling/hmc_mk2/logs/example_log.out
index faf4874bf..22ec86ec9 100644
--- a/src/lib/profiling/hmc_mk2/logs/example_log.out
+++ b/profiling/hmc_mk2/logs/example_log.out
@@ -270,8 +270,8 @@ operator 0 parsed line 229
 This is the hmc code for twisted mass Wilson QCD
 
 Version 5.2.0, commit 51cf008a89944ecdd9345cdb62aaf0a203a7f306
-# The code is compiled with -D_GAUGE_COPY
-# The code is compiled with -D_USE_HALFSPINOR
+# The code is compiled with -DTM_GAUGE_COPY
+# The code is compiled with -DTM_USE_HALFSPINOR
 # the code is compiled for non-blocking MPI calls (spinor and gauge)
 # the code is compiled with openMP support
 # Non-Schroedinger (anti-periodic, periodic or twisted) boundary conditions are used
diff --git a/src/lib/profiling/hmc_mk2/make_profile.R b/profiling/hmc_mk2/make_profile.R
similarity index 100%
rename from src/lib/profiling/hmc_mk2/make_profile.R
rename to profiling/hmc_mk2/make_profile.R
diff --git a/src/lib/profiling/hmc_mk2/profile.Rmd b/profiling/hmc_mk2/profile.Rmd
similarity index 100%
rename from src/lib/profiling/hmc_mk2/profile.Rmd
rename to profiling/hmc_mk2/profile.Rmd
diff --git a/qphix_base_classes.hpp b/qphix_base_classes.hpp
deleted file mode 100644
index 26015e3a2..000000000
--- a/qphix_base_classes.hpp
+++ /dev/null
@@ -1,771 +0,0 @@
-// Copyright © 2017 Martin Ueding <dev@martin-ueding.de>
-// Licensed unter the [BSD-3-Clause](https://opensource.org/licenses/BSD-3-Clause).
-
-// Due to github issue #404, the helper functions to apply the full QPhiX operator
-// are currently disabled because they conflict with the new interfaces in QPhiX
-// itself. If required, these should be rewritten to use these interfaces
-// rather than the base classes in qphix_base_classes.hpp
-
-// This file should be deprecated or updated to provide any functionality
-// not covered by QPhiX itself.
-
-/**
-  \file Additions to QPhiX that are only needed for tmLQCD.
-
-  In the original QPhiX, there are only Wilson fermions and Wilson clover
-  fermions. The Dslash operators have a different call signature (the latter
-  requiring a clover term), so there is no common base class. With the addition
-  of Wilson twisted mass (Mario) and Wilson twisted clover (Peter), there are
-  now two instances of the Dslash that have the same signature. In order to
-  write a more general even-odd source preparation and solution reconstruction
-  code, a common base class for non-clover and clover is desired. In order to
-  leave the QPhiX code untouched (for now), this code lives here in tmLQCD.
-  */
-
-#pragma once
-
-#include <qphix/blas_new_c.h>
-#include <qphix/clover_dslash_def.h>
-#include <qphix/dslash_def.h>
-#include <qphix/geometry.h>
-#include <qphix/tm_clov_dslash_def.h>
-#include <qphix/tm_dslash_def.h>
-
-#include <cassert>
-
-namespace tmlqcd {
-
-namespace {
-size_t constexpr re = 0;
-size_t constexpr im = 1;
-int const n_blas_simt = 1;
-
-// The even checkerboard is given by ( (x + y + z + t ) & 1 == 0 ) -> cb0 is even
-int constexpr cb_even = 0;
-int constexpr cb_odd = 1;
-}
-
-/**
-  Complex multiplication accumulate.
-
-  Computes \f$ (r + \mathrm i i) += (a + \mathrm i b) * (c + \mathrm i d) \f$.
-  */
-template <typename FT>
-void cplx_mul_acc(FT &r_out, FT &i_out, FT const &a, FT const &b, FT const &c, FT const &d) {
-  r_out += a * c - b * d;
-  i_out += a * d + b * c;
-}
-
-/**
-  Wrapper for the clover multiplication function.
-
-  The `struct` is needed in order to allow for partial template specialization in the `Clover`
-  parameter.
-
-  \tparam Clover Type of clover block to use, must be a type from Geometry such that there exists a
-  specialization for it.
-  */
-template <typename FT, int veclen, int soalen, bool compress12, typename Clover>
-struct InnerCloverProduct {
-  /**
-  Multiplies the clover term for a single lattice size to a spinor.
-
-  This function is intended to be used in a loop over all lattice sites. It is expected from the
-  caller to have figured out all the correct indices. There are template specializations for the two
-  different types of clover term that are used in QPhiX.
-
-  \param[out] out Output spinor block. It is assumed to be zeroed properly, the function will just
-  accumulate values into that output variable. Use \ref QPhiX::zeroSpinor for that.
-  \param[in] in Input spinor block.
-  \param[in] clover Single clover block that contains the lattice site of the spinor.
-  \param[in] xi SIMD index for the arrays with length `soalen`, as in the spinors.
-  \param[in] veclen_idx SIMD index for the arrays with length `veclen`, as in the clover term.
-  */
-  static void multiply(
-      typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock &out,
-      typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock const &in,
-      Clover const &clover, int const xi, int const veclen_idx);
-};
-
-template <typename FT, int veclen, int soalen, bool compress12>
-struct InnerCloverProduct<FT, veclen, soalen, compress12,
-                          typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::CloverBlock> {
-  static void multiply(
-      typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock &spinor_out,
-      typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock const &spinor_in,
-      typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::CloverBlock const &clov_block,
-      int const xi, int const veclen_idx) {
-    // The clover term is block-diagonal in spin. Therefore we need
-    // to iterate over the two blocks of spin.
-    for (auto s_block : {0, 1}) {
-      // Extract the diagonal and triangular parts.
-      auto const &diag_in = s_block == 0 ? clov_block.diag1 : clov_block.diag2;
-      auto const &off_diag_in = s_block == 0 ? clov_block.off_diag1 : clov_block.off_diag2;
-      // Input two-spinor component.
-      for (auto two_s_in : {0, 1}) {
-        // Reconstruct four spinor index.
-        auto const four_s_in = 2 * s_block + two_s_in;
-        // Output two-spinor component.
-        for (auto two_s_out : {0, 1}) {
-          // Reconstruct four spinor index.
-          auto const four_s_out = 2 * s_block + two_s_out;
-          // Input color.
-          for (auto c_in : {0, 1, 2}) {
-            // Spin-color index (0, ..., 5).
-            auto const sc_in = 3 * two_s_in + c_in;
-            // Output color.
-            for (auto c_out : {0, 1, 2}) {
-              // Spin-color index (0, ..., 5).
-              auto const sc_out = 3 * two_s_out + c_out;
-
-              // See `qphix-codegen` file `dslash_common.cc`
-              // function
-              // `clover_term` for the index manipulations done
-              // here.
-
-              // Using separate loops over the actual indices is
-              // probably
-              // faster than the branching in the innermost loop.
-
-              if (sc_out == sc_in) {
-                cplx_mul_acc(spinor_out[c_out][four_s_out][re][xi],
-                             spinor_out[c_out][four_s_out][im][xi], diag_in[sc_in][veclen_idx],
-                             QPhiX::rep<FT,double>(0.0), spinor_in[c_in][four_s_in][re][xi],
-                             spinor_in[c_in][four_s_in][im][xi]);
-              } else if (sc_out < sc_in) {
-                auto const idx15 = sc_in * (sc_in - 1) / 2 + sc_out;
-                cplx_mul_acc(
-                    spinor_out[c_out][four_s_out][re][xi], spinor_out[c_out][four_s_out][im][xi],
-                    off_diag_in[idx15][re][veclen_idx],
-                    // aww hell, maybe one should just add negation to QPhiX::half ?
-                    QPhiX::rep<FT,double>(-QPhiX::rep<double,FT>(off_diag_in[idx15][im][veclen_idx])),
-                    spinor_in[c_in][four_s_in][re][xi], spinor_in[c_in][four_s_in][im][xi]);
-              } else {
-                auto const idx15 = sc_out * (sc_out - 1) / 2 + sc_in;
-                cplx_mul_acc(
-                    spinor_out[c_out][four_s_out][re][xi], spinor_out[c_out][four_s_out][im][xi],
-                    off_diag_in[idx15][re][veclen_idx], off_diag_in[idx15][im][veclen_idx],
-                    spinor_in[c_in][four_s_in][re][xi], spinor_in[c_in][four_s_in][im][xi]);
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-};
-
-template <typename FT, int veclen, int soalen, bool compress12>
-struct InnerCloverProduct<
-    FT, veclen, soalen, compress12,
-    typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FullCloverBlock> {
-  static void multiply(
-      typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock &spinor_out,
-      typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock const &spinor_in,
-      typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FullCloverBlock const &clov_block,
-      int const xi, int const veclen_idx) {
-    // The clover term is block-diagonal in spin. Therefore we need
-    // to iterate over the two blocks of spin.
-    for (auto s_block : {0, 1}) {
-      // handy reference to half-spinor block
-      auto const &block_in = s_block == 0 ? clov_block.block1 : clov_block.block2;
-      // Input two-spinor component.
-      for (auto two_s_in : {0, 1}) {
-        // Reconstruct four spinor index.
-        auto const four_s_in = 2 * s_block + two_s_in;
-        // Output two-spinor component.
-        for (auto two_s_out : {0, 1}) {
-          // Reconstruct four spinor index.
-          auto const four_s_out = 2 * s_block + two_s_out;
-          // Input color.
-          for (auto c_in : {0, 1, 2}) {
-            // Spin-color index (0, ..., 5).
-            auto const sc_in = 3 * two_s_in + c_in;
-            // Output color.
-            for (auto c_out : {0, 1, 2}) {
-              // Spin-color index (0, ..., 5).
-              auto const sc_out = 3 * two_s_out + c_out;
-
-              cplx_mul_acc(
-                  spinor_out[c_out][four_s_out][re][xi], spinor_out[c_out][four_s_out][im][xi],
-                  block_in[sc_out][sc_in][re][veclen_idx], block_in[sc_out][sc_in][im][veclen_idx],
-                  spinor_in[c_in][four_s_in][re][xi], spinor_in[c_in][four_s_in][im][xi]);
-            }
-          }
-        }
-      }
-    }
-  }
-};
-
-/**
-  Multiplies a checkerboarded QPhiX Clover term with a checkerboarded QPhiX spinor.
-
-  Padding is taken care of. A test case for (a copy of) this function exists in QPhiX.
-
-  If the preprocessor macro `PRINT_MAPPING` is defined, it will print out the mapping of `(x, y, z,
-  t)` coordinates to block indices. Also it will check that each block is accessed the proper number
-  of times, that is `soalen` for spinors and `veclen` for clover blocks.
-
-  \param[out] out Output spinor
-  \param[in] in Input spinor
-  \param[in] clover Clover block
-  \param[in] geom Geometry object holding the dimension of clover and spinor
-  */
-template <typename FT, int veclen, int soalen, bool compress12, typename Clover>
-void clover_product(
-    typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock *const out,
-    typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock const *const in,
-    Clover *clover, ::QPhiX::Geometry<FT, veclen, soalen, compress12> &geom) {
-  ::QPhiX::zeroSpinor<FT, veclen, soalen, compress12>(out, geom, n_blas_simt);
-
-#ifdef PRINT_MAPPING
-  std::vector<int> spin_touches(geom.getPxyz() * geom.Nt(), 0);
-  std::vector<int> clover_touches(geom.getPxyz() * geom.Nt() * soalen / veclen, 0);
-
-  std::cout << std::setw(3) << "x" << std::setw(3) << "y" << std::setw(3) << "z" << std::setw(3)
-            << "t"
-            << ":" << std::setw(5) << "spin" << std::setw(5) << "clov"
-            << "\n";
-#endif
-
-  // Iterate through all the block.
-  for (int t = 0; t < geom.Nt(); ++t) {
-    for (int z = 0; z < geom.Nz(); ++z) {
-      for (int y = 0; y < geom.Ny(); ++y) {
-        for (int x = 0; x < geom.Nxh(); ++x) {
-          // First element in the current XY plane at desired Z and T.
-          auto const xyBase = t * geom.getPxyz() + z * geom.getPxy();
-          // Index of the SoA along the X direction.
-          auto const xb = x / soalen;
-          // Index within the SoA.
-          auto const xi = x % soalen;
-          // Global spin block index.
-          auto const spin_block_idx = xb + geom.Nxh() / soalen * y + xyBase;
-          // Global clover/gauge block index.
-          auto const clov_block_idx =
-              xb + (y / geom.nGY()) * geom.Nxh() / soalen + xyBase / geom.nGY();
-          // Index of the SoA structure within the current tile.
-          // auto const tile = (geom.Nxh() / soalen * y + xyBase) % geom.nGY();
-          auto const tile = y % geom.nGY();
-          // Vector index for clover/gauge. The SoA index only runs to
-          // `soalen`, this index needs to run to `veclen`, that is across the
-          // various SoA within the tile.
-          auto const veclen_idx = soalen * tile + xi;
-
-#ifdef PRINT_MAPPING
-          ++spin_touches[spin_block_idx];
-          ++clover_touches[clov_block_idx];
-
-          std::cout << std::setw(3) << x << std::setw(3) << y << std::setw(3) << z << std::setw(3)
-                    << t << ":" << std::setw(5) << spin_block_idx << std::setw(5) << clov_block_idx
-                    << "\n";
-#endif
-
-          assert(xi + xb * soalen == x);
-
-          // References to the objects at desired block.
-          auto const &clov_block = clover[clov_block_idx];
-          auto const &spinor_in = in[spin_block_idx];
-          auto &spinor_out = out[spin_block_idx];
-
-          InnerCloverProduct<FT, veclen, soalen, compress12, Clover>::multiply(
-              spinor_out, spinor_in, clov_block, xi, veclen_idx);
-        }
-      }
-    }
-  }
-
-#ifdef PRINT_MAPPING
-  std::cout << std::flush;
-
-  // Make sure that each block got touched the correct number of times.
-  for (int i = 0; i != spin_touches.size(); ++i) {
-    if (spin_touches[i] != soalen) {
-      std::cout << "Spin missmatch: Block " << std::setw(4) << i << " accessed " << std::setw(4)
-                << spin_touches[i] << " times instead of " << soalen << "\n";
-    }
-  }
-
-  for (int i = 0; i != clover_touches.size(); ++i) {
-    if (clover_touches[i] != veclen) {
-      std::cout << "Clover missmatch: Block " << std::setw(4) << i << " accessed " << std::setw(4)
-                << clover_touches[i] << " times instead of " << veclen << "\n";
-    }
-  }
-
-  std::cout << std::flush;
-#endif
-}
-
-/**
-  Abstract base class for all single-flavor Dslash variants.
-
-  There are four Dslash operators which are implemented in QPhiX:
-
-  - Wilson
-  - Wilson clover
-  - Wilson twisted mass
-  - Wilson clover with twisted mass
-
-  Each of these has a the actual Dslash operation and a so-called “achimbdpsi” operation. These act
-  on four-spinors given a gauge field. This base class provides a uniform interface to all four
-  kinds.
-
-  This code should eventually be migrated into the QPhiX repository. Currently these classes are
-  mere delegators. In the QPhiX repository, the actual classes there should be used as concrete
-  classes.
-  */
-template <typename FT, int veclen, int soalen, bool compress12>
-class Dslash {
- public:
-  typedef ::QPhiX::Geometry<FT, veclen, soalen, compress12> Geom;
-  typedef typename Geom::FourSpinorBlock Spinor;
-  typedef typename Geom::SU3MatrixBlock SU3MatrixBlock;
-
-  explicit Dslash(Geom *geom, double const t_boundary_, double const aniso_coeff_S_,
-                  double const aniso_coeff_T_, double const mass_, bool use_tbc_[4] = nullptr,
-                  double tbc_phases_[4][2] = nullptr)
-      : geom(geom),
-        t_boundary(t_boundary_),
-        aniso_coeff_S(aniso_coeff_S_),
-        aniso_coeff_T(aniso_coeff_T_),
-        mass(mass_) {}
-
-  /**
-    Computes \f$ \psi_\mathrm o = A_\mathrm{oo} \chi_\mathrm o \f$.
-
-    The actual definition of the matrix \f$ A_\mathrm{oo} \f$ is
-    implementation dependent and can be the mass factor \f$ \alpha = 4 + m
-    \f$ for plain Wilson or something more complicated for twisted mass.
-
-    \param[out] out Output spinor \f$ \psi \f$.
-    \param[in] in Input spinor \f$ \chi \f$.
-    */
-  virtual void A_chi(Spinor *const out, Spinor const *const in, int const isign, int const cb) = 0;
-
-  /**
-    Computes \f$ \psi_\mathrm e = A_\mathrm{ee}^{-1} \chi_\mathrm e \f$.
-
-    \param[out] out Output spinor \f$ \psi \f$.
-    \param[in] in Input spinor \f$ \chi \f$.
-    */
-  virtual void A_inv_chi(Spinor *const out, Spinor const *const in, int const isign,
-                         int const cb) = 0;
-
-  /**
-    Forwarder for the `dslash`.
-
-    This will call the `dslash` function of the respective QPhiX dslash class. There is a subtle
-    difference between the Wilson and all other cases. The Wilson dslash is just the hopping matrix,
-    just the operator \f$ D \f$. For every other case (clover, twisted mass, twisted mass clover),
-    the `dslash` member function will compute \f$ A^{-1} D \f$. In the Wilson case, this \f$ A =
-    \alpha = 4 + m = 1/(2 \kappa) \f$. Since that is _not_ included in the Wilson `dslash`, you will
-    obtain different results when using WilsonDslash::dslash and WilsonTMDslash::dslash with \f$
-    \mu = 0 \f$.
-
-    \todo Make this member function `const`. For this the member function in
-    QPhiX that is called internally must be marked `const` as well.
-    */
-  virtual void dslash(Spinor *const res, const Spinor *const psi, const SU3MatrixBlock *const u,
-                      int const isign, int const cb) = 0;
-
-  /**
-    Always plain Wilson dslash.
-
-    In contrast to the \ref dslash member function which just forwards the implementation of QPhiX,
-    this will always give you the “naked” plain Wilson dslash without any factors of \f$ A^{-1} \f$
-    applied.
-    */
-  virtual void plain_dslash(Spinor *const res, const Spinor *const psi,
-                            const SU3MatrixBlock *const u, int const isign, int const cb) {
-    // XXX Perhaps rather implement this with an instance of the WilsonDslash instead?
-
-    auto tmp = QPhiX::makeFourSpinorHandle(*geom);
-    dslash(tmp.get(), psi, u, isign, cb);
-    A_chi(res, tmp.get(), isign, cb);
-  };
-
-  /**
-    Always “dressed” dslash.
-
-    This computes \f$ A^{-1} D \f$ for all variants. In the Wilson case, this will give \f$
-    \alpha^{-1} D \f$.
-    */
-  virtual void A_inv_dslash(Spinor *const res, const Spinor *const psi,
-                            const SU3MatrixBlock *const u, int const isign, int const cb) {
-    dslash(res, psi, u, isign, cb);
-  };
-
-  /**
-    Forwarder for the `achimbdpsi`.
-
-    \todo Make this member function `const`. For this the member function in QPhiX that is called
-    internally must be marked `const` as well.
-    */
-  virtual void achimbdpsi(Spinor *const res, const Spinor *const psi, const Spinor *const chi,
-                          const SU3MatrixBlock *const u, double const alpha, double const beta,
-                          int const isign, int const cb) = 0;
-
-  /**
-    Prepares the sources on the odd checkerboard.
-
-    This computes
-    \f[
-        \tilde b_o = \frac 12 D_{oe} M_{ee}^{-1} b_e + b_o \,.
-    \f]
-
-    \param[out] tilde_b_odd Prepared source
-    \param[in] b_even Source (right hand side) on the even lattice sites
-    \param]in] b_odd Source on the odd lattice sites
-    \param[in] u Gauge field on the odd lattice sites
-    */
-  virtual void prepare_source(Spinor *const tilde_b_odd, Spinor const *const b_even,
-                              Spinor const *const b_odd, SU3MatrixBlock const *const u);
-
-  /**
-    Reconstructs the solution on the even lattices sites.
-
-    This computes
-    \f[
-        x_e = M_{ee}^{-1} \left( b_e - \frac 12 D_{eo} x_o \right) \,.
-    \f]
-
-    \param[out] x_even Solution on the even lattices sites
-    \param[in] b_even Source (right hand side) on the even lattice sites
-    \param[in] x_odd Solution on the odd lattices sites
-    \param[in] u Gauge field on the even lattice sites
-    */
-  virtual void reconstruct_solution(Spinor *const x_even, Spinor const *const b_even,
-                                    Spinor const *const x_odd, SU3MatrixBlock const *const u);
-
-  Geom *getGeometry() const { return geom; }
-
- private:
-  Geom *const geom;
-
-  double const t_boundary;
-  double const aniso_coeff_S;
-  double const aniso_coeff_T;
-  double const mass;
-};
-
-template <typename FT, int veclen, int soalen, bool compress12>
-class WilsonDslash : public Dslash<FT, veclen, soalen, compress12> {
- public:
-  typedef typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock Spinor;
-  typedef typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::SU3MatrixBlock SU3MatrixBlock;
-
-  WilsonDslash(::QPhiX::Geometry<FT, veclen, soalen, compress12> *geom_, double const t_boundary_,
-               double const aniso_coeff_S_, double const aniso_coeff_T_, double const mass_,
-               bool use_tbc_[4] = nullptr, double tbc_phases_[4][2] = nullptr)
-      : Dslash<FT, veclen, soalen, compress12>(geom_, t_boundary_, aniso_coeff_S_, aniso_coeff_T_,
-                                               mass_, use_tbc_, tbc_phases_),
-        upstream_dslash(geom_, t_boundary_, aniso_coeff_S_, aniso_coeff_T_, use_tbc_, tbc_phases_),
-        mass_factor_alpha(4.0 + mass_),
-        mass_factor_beta(1.0 / (4.0 * mass_factor_alpha)) {}
-
-  void A_chi(Spinor *const out, Spinor const *const in, int const isign_ignored,
-             int const cb_ignored) override {
-    int const n_blas_simt = 1;
-    ::QPhiX::axy(mass_factor_alpha, in, out, upstream_dslash.getGeometry(), n_blas_simt);
-  }
-
-  void A_inv_chi(Spinor *const out, Spinor const *const in, int const isign_ignored,
-                 int const cb_ignored) override {
-    int const n_blas_simt = 1;
-    ::QPhiX::axy(1.0 / mass_factor_alpha, in, out, upstream_dslash.getGeometry(), n_blas_simt);
-  }
-
-  void dslash(Spinor *const res, const Spinor *const psi, const SU3MatrixBlock *const u,
-              int const isign, int const cb) override {
-    upstream_dslash.dslash(res, psi, u, isign, cb);
-  }
-
-  void plain_dslash(Spinor *const res, const Spinor *const psi, const SU3MatrixBlock *const u,
-                    int const isign, int const cb) override {
-    dslash(res, psi, u, isign, cb);
-  };
-
-  void A_inv_dslash(Spinor *const res, const Spinor *const psi, const SU3MatrixBlock *const u,
-                    int const isign, int const cb) override {
-    auto tmp = QPhiX::makeFourSpinorHandle(upstream_dslash.getGeometry());
-    dslash(tmp.get(), psi, u, isign, cb);
-    A_inv_chi(res, tmp.get(), isign, cb);
-  };
-
-  void achimbdpsi(Spinor *const res, const Spinor *const psi, const Spinor *const chi,
-                  const SU3MatrixBlock *const u, double const alpha, double const beta,
-                  int const isign, int const cb) override {
-    upstream_dslash.dslashAChiMinusBDPsi(res, psi, chi, u, alpha, beta, isign, cb);
-  }
-
- private:
-  ::QPhiX::Dslash<FT, veclen, soalen, compress12> upstream_dslash;
-
-  double const mass_factor_alpha;
-  double const mass_factor_beta;
-};
-
-template <typename FT, int veclen, int soalen, bool compress12>
-class WilsonTMDslash : public Dslash<FT, veclen, soalen, compress12> {
- public:
-  typedef typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock Spinor;
-  typedef typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::SU3MatrixBlock SU3MatrixBlock;
-
-  WilsonTMDslash(::QPhiX::Geometry<FT, veclen, soalen, compress12> *geom_, double const t_boundary_,
-                 double const aniso_coeff_S_, double const aniso_coeff_T_, double const mass_,
-                 double const twisted_mass_, bool use_tbc_[4] = nullptr,
-                 double tbc_phases_[4][2] = nullptr)
-      : Dslash<FT, veclen, soalen, compress12>(geom_, t_boundary_, aniso_coeff_S_, aniso_coeff_T_,
-                                               mass_, use_tbc_, tbc_phases_),
-        upstream_dslash(geom_, t_boundary_, aniso_coeff_S_, aniso_coeff_T_, mass_, twisted_mass_,
-                        use_tbc_, tbc_phases_),
-        mass_factor_alpha(4.0 + mass_),
-        mass_factor_beta(0.25),
-        derived_mu(twisted_mass_ / mass_factor_alpha),
-        derived_mu_inv(mass_factor_alpha /
-                       (mass_factor_alpha * mass_factor_alpha + twisted_mass_ * twisted_mass_)) {}
-
-  void A_chi(Spinor *const out, Spinor const *const in, int const isign,
-             int const cb_ignored) override {
-    helper_A_chi(out, in, -derived_mu * isign, mass_factor_alpha);
-  }
-
-  void A_inv_chi(Spinor *const out, Spinor const *const in, int const isign,
-                 int const cb_ignored) override {
-    helper_A_chi(out, in, derived_mu * isign, derived_mu_inv);
-  }
-
-  void dslash(Spinor *const res, const Spinor *const psi, const SU3MatrixBlock *const u,
-              int const isign, int const cb) override {
-    upstream_dslash.dslash(res, psi, u, isign, cb);
-  }
-
-  void achimbdpsi(Spinor *const res, const Spinor *const psi, const Spinor *const chi,
-                  const SU3MatrixBlock *const u, double const alpha, double const beta,
-                  int const isign, int const cb) override {
-    upstream_dslash.dslashAChiMinusBDPsi(res, psi, chi, u, alpha, beta, isign, cb);
-  }
-
- private:
-  void helper_A_chi(Spinor *const out, Spinor const *const in, double const factor_a,
-                    double const factor_b);
-
-  ::QPhiX::TMDslash<FT, veclen, soalen, compress12> upstream_dslash;
-
-  double const mass_factor_alpha;
-  double const mass_factor_beta;
-  double const derived_mu;
-  double const derived_mu_inv;
-};
-
-template <typename FT, int veclen, int soalen, bool compress12>
-class WilsonClovDslash : public Dslash<FT, veclen, soalen, compress12> {
- public:
-  typedef typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock Spinor;
-  typedef typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::SU3MatrixBlock SU3MatrixBlock;
-  typedef typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::CloverBlock CloverBlock;
-
-  WilsonClovDslash(::QPhiX::Geometry<FT, veclen, soalen, compress12> *geom_,
-                   double const t_boundary_, double const aniso_coeff_S_,
-                   double const aniso_coeff_T_, double const mass_,
-                   CloverBlock *const (&clover_)[2], CloverBlock *const (&inv_clover_)[2],
-                   bool use_tbc_[4] = nullptr, double tbc_phases_[4][2] = nullptr)
-      : Dslash<FT, veclen, soalen, compress12>(geom_, t_boundary_, aniso_coeff_S_, aniso_coeff_T_,
-                                               mass_, use_tbc_, tbc_phases_),
-        upstream_dslash(geom_, t_boundary_, aniso_coeff_S_, aniso_coeff_T_, use_tbc_, tbc_phases_),
-        mass_factor_alpha(4.0 + mass_),
-        mass_factor_beta(1.0 / (4.0 * mass_factor_alpha)) {
-    for (int cb : {0, 1}) {
-      clover[cb] = clover_[cb];
-      inv_clover[cb] = inv_clover_[cb];
-    }
-  }
-
-  void A_chi(Spinor *const out, Spinor const *const in, int const isign_ignored,
-             int const cb) override {
-    clover_product(out, in, clover[cb], upstream_dslash.getGeometry());
-  }
-
-  void A_inv_chi(Spinor *const out, Spinor const *const in, int const isign_ignored,
-                 int const cb) override {
-    clover_product(out, in, inv_clover[cb], upstream_dslash.getGeometry());
-  }
-
-  void dslash(Spinor *const res, const Spinor *const psi, const SU3MatrixBlock *const u,
-              int const isign, int const cb) override {
-    upstream_dslash.dslash(res, psi, u, inv_clover[cb], isign, cb);
-  }
-
-  void achimbdpsi(Spinor *const res, const Spinor *const psi, const Spinor *const chi,
-                  const SU3MatrixBlock *const u, double const alpha, double const beta,
-                  int const isign, int const cb) override {
-    upstream_dslash.dslashAChiMinusBDPsi(res, psi, chi, u, clover[cb], mass_factor_beta, isign, cb);
-  }
-
- private:
-  ::QPhiX::ClovDslash<FT, veclen, soalen, compress12> upstream_dslash;
-
-  double const mass_factor_alpha;
-  double const mass_factor_beta;
-
-  /**
-    Reference to the clover term.
-
-    This class has to provide a `dslash` and `achimbdpsi` member function with the prescribed
-    argument list which does not contain the clover term. The user of these classes should not have
-    to differentiate between non-clover and clover variants. In order to provide the function
-    signature, the clover term is a member. This means that the user has to construct a new operator
-    if the pointers to the clover field need to be changed. Seperate pointers are kept for the fields
-    on the even and odd checkerboards, hence the array dimension.
-    */
-  CloverBlock *clover[2];
-
-  /// See \ref clover.
-  CloverBlock *inv_clover[2];
-};
-
-template <typename FT, int veclen, int soalen, bool compress12>
-class WilsonClovTMDslash : public Dslash<FT, veclen, soalen, compress12> {
- public:
-  typedef typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock Spinor;
-  typedef typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::SU3MatrixBlock SU3MatrixBlock;
-  typedef
-      typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FullCloverBlock FullCloverBlock;
-  typedef
-      typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::CloverBlock CloverBlock;
-
-  WilsonClovTMDslash(::QPhiX::Geometry<FT, veclen, soalen, compress12> *geom_,
-                     double const t_boundary_, double const aniso_coeff_S_,
-                     double const aniso_coeff_T_, double const mass_, double const twisted_mass_,
-                     CloverBlock *const (&clover_)[2],
-                     FullCloverBlock *const (&inv_clover_)[2][2], bool use_tbc_[4] = nullptr,
-                     double tbc_phases_[4][2] = nullptr)
-      : Dslash<FT, veclen, soalen, compress12>(geom_, t_boundary_, aniso_coeff_S_, aniso_coeff_T_,
-                                               mass_, use_tbc_, tbc_phases_),
-        upstream_dslash(geom_, t_boundary_, aniso_coeff_S_, aniso_coeff_T_, use_tbc_, tbc_phases_),
-        mass_factor_alpha(4.0 + mass_),
-        mass_factor_beta(0.25),
-        derived_mu(twisted_mass_ / mass_factor_alpha),
-        derived_mu_inv(mass_factor_alpha /
-                       (mass_factor_alpha * mass_factor_alpha + twisted_mass_ * twisted_mass_)) {
-    for (int cb : {0, 1}) {
-      clover[cb] = clover_[cb];
-      for (int fl : {0, 1}) {
-        inv_clover[cb][fl] = inv_clover_[cb][fl];
-      }
-    }
-  }
-
-  void A_chi(Spinor *const out, Spinor const *const in, int const isign, int const cb) override {
-    clover_product(out, in, clover[cb], upstream_dslash.getGeometry());
-    // TODO: add twisted mass here
-  }
-
-  void A_inv_chi(Spinor *const out, Spinor const *const in, int const isign,
-                 int const cb) override {
-    if (isign == -1) {
-      clover_product(out, in, inv_clover[cb][1], upstream_dslash.getGeometry());
-    } else {
-      clover_product(out, in, inv_clover[cb][0], upstream_dslash.getGeometry());
-    }
-  }
-
-  void dslash(Spinor *const res, const Spinor *const psi, const SU3MatrixBlock *const u,
-              int const isign, int const cb) override {
-    upstream_dslash.dslash(res, psi, u, (const FullCloverBlock **)inv_clover[cb], isign, cb);
-  }
-
-  void achimbdpsi(Spinor *const res, const Spinor *const psi, const Spinor *const chi,
-                  const SU3MatrixBlock *const u, double const alpha, double const beta,
-                  int const isign, int const cb) override {
-    upstream_dslash.dslashAChiMinusBDPsi(res, psi, chi, u, clover[cb],
-                                         mass_factor_beta, isign, cb);
-  }
-
- private:
-  ::QPhiX::TMClovDslash<FT, veclen, soalen, compress12> upstream_dslash;
-
-  double const mass_factor_alpha;
-  double const mass_factor_beta;
-  double const derived_mu;
-  double const derived_mu_inv;
-
-  CloverBlock *clover[2];
-  /* For twisted clover, there are two fields on each checkerboard which differ in the sign
-   * of the twisted quark mass. In effect then, the inner index can be thought of as being
-   * in flavour space while the outer index is the checkerboard index. 
-   */
-  FullCloverBlock *inv_clover[2][2];
-};
-
-template <typename FT, int veclen, int soalen, bool compress12>
-void WilsonTMDslash<FT, veclen, soalen, compress12>::helper_A_chi(Spinor *const out,
-                                                                  Spinor const *const in,
-                                                                  double const factor_a,
-                                                                  double const factor_b) {
-  auto const nVecs = upstream_dslash.getGeometry().nVecs();
-  auto const Pxy = upstream_dslash.getGeometry().getPxy();
-  auto const Pxyz = upstream_dslash.getGeometry().getPxyz();
-
-  for (uint64_t t = 0; t < T; t++)
-    for (uint64_t x = 0; x < LX / 2; x++)
-      for (uint64_t y = 0; y < LY; y++)
-        for (uint64_t z = 0; z < LZ; z++) {
-          uint64_t const SIMD_vector = x / soalen;
-          uint64_t const x_internal = x % soalen;
-          uint64_t const qphix_idx = t * Pxyz + z * Pxy + y * nVecs + SIMD_vector;
-
-          for (int color = 0; color < 3; ++color) {
-            for (int spin_block = 0; spin_block < 2; ++spin_block) {
-              // Implement the $\gamma_5$ structure.
-              auto const signed_factor_a = factor_a * (spin_block == 0 ? 1.0 : -1.0);
-
-              for (int half_spin = 0; half_spin < 2; ++half_spin) {
-                auto const four_spin = 2 * spin_block + half_spin;
-                for (int v = 0; v < soalen; ++v) {
-                  auto &out_bcs = out[qphix_idx][color][four_spin];
-                  auto const &in_bcs = in[qphix_idx][color][four_spin];
-
-                  out_bcs[re][v] = factor_b * (in_bcs[re][v] + signed_factor_a * in_bcs[im][v]);
-                  out_bcs[im][v] = factor_b * (in_bcs[im][v] - signed_factor_a * in_bcs[re][v]);
-                }
-              }
-            }
-          }
-
-        }  // volume
-};
-
-template <typename FT, int veclen, int soalen, bool compress12>
-void Dslash<FT, veclen, soalen, compress12>::prepare_source(Spinor *const tilde_b_odd,
-                                                            Spinor const *const b_even,
-                                                            Spinor const *const b_odd,
-                                                            SU3MatrixBlock const *const u) {
-  auto Mee_be = QPhiX::makeFourSpinorHandle(*geom);
-  WilsonDslash<FT, veclen, soalen, compress12> plain_dslash(geom, t_boundary, aniso_coeff_S,
-                                                            aniso_coeff_T, mass);
-
-  A_inv_chi(Mee_be.get(), b_even, 1, cb_even);
-
-  plain_dslash.dslash(tilde_b_odd, Mee_be.get(), u, 1, cb_odd);
-
-  // FIXME Perhaps use a variable number of BLAS threads here (last parameter).
-  QPhiX::aypx(0.5, Mee_be.get(), tilde_b_odd, *geom, 1);
-}
-
-template <typename FT, int veclen, int soalen, bool compress12>
-void Dslash<FT, veclen, soalen, compress12>::reconstruct_solution(Spinor *const x_even,
-                                                                  Spinor const *const b_even,
-                                                                  Spinor const *const x_odd,
-                                                                  SU3MatrixBlock const *const u) {
-  auto tmp = QPhiX::makeFourSpinorHandle(*geom);
-  WilsonDslash<FT, veclen, soalen, compress12> plain_dslash(geom, t_boundary, aniso_coeff_S,
-                                                            aniso_coeff_T, mass);
-
-  plain_dslash.dslash(tmp.get(), x_odd, u, 1, cb_even);
-  QPhiX::aypx(0.5, b_even, tmp.get(), *geom, 1);
-  A_inv_chi(x_even, tmp.get(), 1, cb_even);
-}
-}
diff --git a/qphix_interface.cpp b/qphix_interface.cpp
deleted file mode 100644
index 2c61427dd..000000000
--- a/qphix_interface.cpp
+++ /dev/null
@@ -1,2192 +0,0 @@
-/***********************************************************************
- *
- * Copyright (C) 2015 Mario Schroeck
- *               2016 Peter Labus
- *               2017 Peter Labus, Martin Ueding, Bartosz Kostrzewa
- *
- * This file is part of tmLQCD.
- *
- * tmLQCD is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * tmLQCD is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
- *
- ***********************************************************************/
-
-#include "qphix_interface.h"
-#include "qphix_interface.hpp"
-#include "qphix_interface_utils.hpp"
-#include "qphix_types.h"
-#include "qphix_veclen.h"
-
-#ifdef TM_USE_MPI
-#include <mpi.h>
-#endif
-
-extern "C" {
-#ifdef HAVE_CONFIG_H
-#include "tmlqcd_config.h"
-#endif
-#include "boundary.h"
-#include "geometry_eo.h"
-#include "gettime.h"
-#include "global.h"
-#include "linalg/convert_eo_to_lexic.h"
-#include "linalg/diff.h"
-#include "linalg/square_norm.h"
-#include "misc_types.h"
-#include "operator/Hopping_Matrix.h"
-#include "operator/clover_leaf.h"
-#include "operator/clovertm_operators.h"
-#include "operator_types.h"
-#include "struct_accessors.h"
-
-// for the normalisation of the heavy doublet when running
-// RHMC
-#include "phmc.h"
-
-#include "solver/matrix_mult_typedef.h"
-#include "solver/solver.h"
-#include "solver/solver_field.h"
-#include "solver/solver_params.h"
-#include "solver/solver_types.h"
-#include "start.h"
-#include "xchange/xchange_gauge.h"
-}
-#ifdef TM_USE_OMP
-#include <omp.h>
-#endif
-#include <qphix/blas_new_c.h>
-#include <qphix/clover.h>
-#include <qphix/inv_dummy_hermtest.h>
-#include <qphix/inv_richardson_multiprec.h>
-#include <qphix/invbicgstab.h>
-#include <qphix/invcg.h>
-#include <qphix/minvcg.h>
-#include <qphix/ndtm_reuse_operator.h>
-#include <qphix/ndtm_reuse_operator_clover.h>
-#include <qphix/print_utils.h>
-#include <qphix/qphix_config.h>
-#include <qphix/twisted_mass.h>
-#include <qphix/twisted_mass_clover.h>
-#include <qphix/wilson.h>
-#include <cfloat>
-#include <cmath>
-#include <cstdlib>
-#include <cstring>
-#include <vector>
-
-using namespace tmlqcd;
-
-tm_QPhiXParams_t qphix_input;
-
-int By;
-int Bz;
-int NCores;
-int Sy;
-int Sz;
-int PadXY;
-int PadXYZ;
-int MinCt;
-int N_simt;
-bool compress12;
-QphixPrec_t qphix_precision;
-QphixPrec_t qphix_inner_precision;
-
-int subLattSize[4];
-int lattSize[4];
-int qmp_geom[4];
-int qmp_tm_map[4];
-
-// angles for boundary phases, values come from read_input
-extern double X0, X1, X2, X3;
-
-bool use_tbc[4];
-double tbc_phases[4][2];
-// we always use twisted boundary conditions, which means that we are always
-// periodic in time and any possible anti-periodicity is implemented via
-// the phase
-double constexpr t_boundary = 1.0;
-
-template <typename T>
-struct rsdTarget {
-  static const double value;
-};
-
-template <>
-const double rsdTarget<QPhiX::half>::value = 1.0e-3;
-
-template <>
-const double rsdTarget<float>::value = 1.0e-8;
-
-void _initQphix(int argc, char **argv, tm_QPhiXParams_t params, int c12, QphixPrec_t precision_,
-                QphixPrec_t inner_precision_) {
-  static bool qmp_topo_initialised = false;
-
-  // Global Lattice Size
-  lattSize[0] = LX * g_nproc_x;
-  lattSize[1] = LY * g_nproc_y;
-  lattSize[2] = LZ * g_nproc_z;
-  lattSize[3] = T * g_nproc_t;
-
-  // Local Lattice Size
-  subLattSize[0] = LX;
-  subLattSize[1] = LY;
-  subLattSize[2] = LZ;
-  subLattSize[3] = T;
-
-  // extract twisted boundary conditions
-  for (int dim = 0; dim < 4; dim++) {
-    bool dim_tbc = false;
-    double dim_phase[2] = {1.0, 0.0};
-    if (dim == 0) {
-      dim_tbc = (fabs(X1) > DBL_EPSILON);
-      dim_phase[0] = -((double *)(&phase_1))[0] / g_kappa;
-      dim_phase[1] = -((double *)(&phase_1))[1] / g_kappa;
-    } else if (dim == 1) {
-      dim_tbc = (fabs(X2) > DBL_EPSILON);
-      dim_phase[0] = -((double *)(&phase_2))[0] / g_kappa;
-      dim_phase[1] = -((double *)(&phase_2))[1] / g_kappa;
-    } else if (dim == 2) {
-      dim_tbc = (fabs(X3) > DBL_EPSILON);
-      dim_phase[0] = -((double *)(&phase_3))[0] / g_kappa;
-      dim_phase[1] = -((double *)(&phase_3))[1] / g_kappa;
-    } else if (dim == 3) {
-      dim_tbc = (fabs(X0) > DBL_EPSILON);
-      dim_phase[0] = -((double *)(&phase_0))[0] / g_kappa;
-      dim_phase[1] = -((double *)(&phase_0))[1] / g_kappa;
-    }
-    use_tbc[dim] = dim_tbc;
-    tbc_phases[dim][0] = dim_phase[0];
-    tbc_phases[dim][1] = dim_phase[1];
-  }
-
-  By = params.By;
-  Bz = params.Bz;
-  NCores = params.NCores;
-  Sy = params.Sy;
-  Sz = params.Sz;
-  PadXY = params.PadXY;
-  PadXYZ = params.PadXYZ;
-  MinCt = params.MinCt;
-  N_simt = Sy * Sz;
-  if (c12 == 8) {
-    QPhiX::masterPrintf(
-        "# INFO QphiX: 8-parameter gauge compression not supported, using two row compression "
-        "instead!\n");
-    c12 = 12;
-  }
-  compress12 = c12 == 12 ? true : false;
-  qphix_precision = precision_;
-  qphix_inner_precision = inner_precision_;
-
-#ifdef QPHIX_QMP_COMMS
-  // Declare the logical topology
-  if (!qmp_topo_initialised) {
-    // the QMP topology is the one implied by the number of processes in each
-    // dimension as required by QPHIX ( x fastest to t slowest running )
-    qmp_geom[0] = g_nproc_x;
-    qmp_geom[1] = g_nproc_y;
-    qmp_geom[2] = g_nproc_z;
-    qmp_geom[3] = g_nproc_t;
-
-    // in order for the topologies to agree between tmLQCD and QPhiX, the dimensions need to be
-    // permuted
-    // since Z is fastest in tmLQCD and X is second-slowest
-    qmp_tm_map[0] = 2;
-    qmp_tm_map[1] = 1;
-    qmp_tm_map[2] = 0;
-    qmp_tm_map[3] = 3;
-    if (QMP_declare_logical_topology_map(qmp_geom, 4, qmp_tm_map, 4) != QMP_SUCCESS) {
-      QMP_error("Failed to declare QMP Logical Topology\n");
-      abort();
-    }
-    // longish test to check if the logical coordinates are correctly mapped
-    if (g_debug_level >= 5) {
-      for (int proc = 0; proc < g_nproc; proc++) {
-        if (proc == g_proc_id) {
-          const int coordinates[4] = {g_proc_coords[1], g_proc_coords[2], g_proc_coords[3],
-                                      g_proc_coords[0]};
-          int id = QMP_get_node_number_from(coordinates);
-          int *qmp_coords = QMP_get_logical_coordinates_from(id);
-          fflush(stdout);
-          printf("QMP id: %3d x:%3d y:%3d z:%3d t:%3d\n", id, qmp_coords[0], qmp_coords[1],
-                 qmp_coords[2], qmp_coords[3]);
-          printf("MPI id: %3d x:%3d y:%3d z:%3d t:%3d\n\n", g_proc_id, g_proc_coords[1],
-                 g_proc_coords[2], g_proc_coords[3], g_proc_coords[0]);
-          free(qmp_coords);
-          fflush(stdout);
-          MPI_Barrier(MPI_COMM_WORLD);
-        } else {
-          MPI_Barrier(MPI_COMM_WORLD);
-        }
-      }
-    }
-    qmp_topo_initialised = true;
-  }
-#endif
-
-#ifdef QPHIX_QPX_SOURCE
-  if (thread_bind) {
-    QPhiX::setThreadAffinity(NCores_user, Sy_user * Sz_user);
-  }
-  QPhiX::reportAffinity();
-#endif
-}
-
-void _initQphix(int argc, char **argv, tm_QPhiXParams_t params, int c12, QphixPrec_t precision_) {
-  _initQphix(argc, argv, params, c12, precision_, precision_);
-}
-
-// Finalize the QPhiX library
-void _endQphix() {}
-
-template <typename FT, int VECLEN, int SOALEN, bool compress12>
-void reorder_clover_to_QPhiX(
-    QPhiX::Geometry<FT, VECLEN, SOALEN, compress12> &geom,
-    typename QPhiX::Geometry<FT, VECLEN, SOALEN, compress12>::CloverBlock *qphix_clover, int cb,
-    bool inverse, bool fl_offdiag = false) {
-  const double startTime = gettime();
-
-  /* the spin-colour clover term in sw_term and the corresponding inverse
-   * in sw_inv are stored in the tmLQCD gamma basis.
-   * When we translate spinors to QPhiX, we apply a transformation V to the tmLQCD
-   * spinor and then apply the same transformation to the output spinor
-   * ( we have V^dagger = V and V*V = 1 )
-   * Thus, in order to translate the clover field, we need to copy
-   *   (1+T)' = V*(1+T)*V, where T is the spin-colour clover-term
-   * This way, the clover term will be in the correct gamma basis.
-   *
-   * The tmLQCD clover term is stored in half-spinor blocks of colour matrices
-   * for which we need to work out what (1+T)'=V*(1+T)*V implies.
-   * Below, each sAB represents one 3x3 colour matrix
-   *
-   *                +s33 -s32    0    0
-   *  T' = V*T*V =  -s23 +s22    0    0
-   *                   0    0 +s11 -s10
-   *                   0    0 -s01 +s00
-   *
-   * Such that the half-spinor blocks are inverted and within these, the ordering is
-   * reversed. Note that the off-diagonal 3x3 colour blocks are hermitian conjugate to
-   * each other and this is preserved by the transformation.
-   *
-   * The QPhiX (Wilson) clover term is stored as 12 reals on the diagonal
-   * in two 6-element vectors, one for each half-spinor spin pair
-   * and two sets of off-diagonal complex components.
-   *
-   * In addition, colour matrices are transposed in QPhiX.
-   *
-   * The tmLQCD clover term is stored as:
-   *
-   *      s00 s01
-   *          s11
-   * T =          s22 s23
-   *                  s33
-   *
-   * with indexing
-   *
-   *     sw[0][0] sw[1][0]
-   *              sw[2][0]
-   *                       sw[0][1] sw[1][1]
-   *                                sw[2][1]
-   *
-   * The inverse has four su3 blocks instead and is indexed
-   *     sw_inv[0][0] sw_inv[1][0]
-   *     sw_inv[3][0] sw_inv[2][0]
-   *                               sw_inv[0][1] sw_inv[1][1]
-   *                               sw_inv[3][1] sw_inv[2][1]
-   *
-   * where blocks sw_inv[3][0] and sw_inv[3][1] are relevant only when mu > 0
-   *
-   * There is a special case for the non-degenerate twisted clover operator. The
-   * flavour-off-diagonal components of the inverse clover term do not have an imaginary part on the
-   * spin-colour diagonal. They can thus be stored as CloverBlock, which is done in the QPhiX
-   * implementation of the ND tmclover operator.
-   *
-   * As a hack, this inverse is prepared by sw_invert_epsbar and placed in to the last
-   * VOLUME/2 sites of sw_inv. Reading from there is triggered by the boolean
-   * fl_offdiag.
-   */
-
-  // rescale to get clover term (or its inverse) in the physical normalisation
-  // rather than the kappa normalisation
-  const double scale = inverse ? 2.0 * g_kappa : 1.0 / (2.0 * g_kappa);
-  su3 ***tm_clover = inverse ? sw_inv : sw;
-
-  // Number of elements in spin, color & complex
-  const int Ns = 4;
-  const int Nc = 3;
-  const int Nz = 2;
-
-  // Geometric parameters for QPhiX data layout
-  const auto ngy = geom.nGY();
-  const auto nVecs = geom.nVecs();
-  const auto Pxy = geom.getPxy();
-  const auto Pxyz = geom.getPxyz();
-
-  // packer for Wilson clover (real diagonal + complex upper-triangular)
-  /* for the index in the off_diagN arrays, we map to an index in the su3 struct
-   * keeping in mind complex conjugation
-   * The off-diagonal in QPhiX is stored as follows:
-   *
-   * 0 1 3 6 10
-   *   2 4 7 11
-   *     5 8 12
-   *       9 13
-   *         14
-   *
-   * which we are going to map to su3 in blocks
-   *
-   *     0* 1*
-   *        2*
-   *
-   * 3   4  5
-   * 6   7  8
-   * 10 11 12
-   *
-   *   9* 13*
-   *      14*
-   *
-   * where the asterisk indicates complex conjugation. As a linear array then,
-   * these mappings are:
-   *
-   */
-  const int od_su3_offsets[15] = {Nz,
-                                  2 * Nz,            //     0 1
-                                  Nc * Nz + 2 * Nz,  //       2
-
-                                  0,
-                                  Nz,
-                                  2 * Nz,  // 3  4  5
-                                  Nc * Nz,
-                                  Nc * Nz + Nz,
-                                  Nc * Nz + 2 * Nz,  // 6  7  8
-
-                                  Nz,  //     9
-
-                                  2 * Nc * Nz,
-                                  2 * Nc * Nz + Nz,
-                                  2 * Nc * Nz + 2 * Nz,  // 10 11 12
-
-                                  2 * Nz,
-                                  Nc * Nz + 2 * Nz};  // 13 14
-
-#pragma omp parallel for collapse(4)
-  for (int64_t t = 0; t < T; t++) {
-    for (int64_t z = 0; z < LZ; z++) {
-      for (int64_t y = 0; y < LY; y++) {
-        for (int64_t v = 0; v < nVecs; v++) {
-          int64_t block = (t * Pxyz + z * Pxy) / ngy + (y / ngy) * nVecs + v;
-
-          for (int64_t x_soa = 0; x_soa < SOALEN; x_soa++) {
-            int64_t xx = (y % ngy) * SOALEN + x_soa;
-            int64_t q_cb_x_coord = x_soa + v * SOALEN;
-            int64_t tm_x_coord = q_cb_x_coord * 2 + (((t + y + z) & 1) ^ cb);
-
-            // the inverse of the clover term is in even-odd ordering
-            // while the clover term itself is lexicographically ordered
-            // for the special case of the nd tmclover operator, the inverse of the flavour
-            // off-diagonal components is stored in the last VOLUME/2 elements of sw_inv
-            int64_t tm_idx =
-                (inverse ? g_lexic2eosub[g_ipt[t][tm_x_coord][y][z]] : g_ipt[t][tm_x_coord][y][z]) +
-                ((inverse && fl_offdiag) ? VOLUME / 2 : 0);
-
-            int b_idx;
-
-            //             we begin with the diagonal elements in CloverBlock
-            for (int d = 0; d < 6; d++) {
-              //               choose the block in sw which corresponds to the block in T'
-              b_idx = d < 3 ? 2 : 0;
-              //               get the right colour components
-              qphix_clover[block].diag1[d][xx] = QPhiX::rep<FT, double>(
-                  *(reinterpret_cast<double const *const>(&tm_clover[tm_idx][b_idx][1].c00) +
-                    (Nc * Nz + Nz) * (d % 3)) *
-                  scale);
-
-              qphix_clover[block].diag2[d][xx] = QPhiX::rep<FT, double>(
-                  *(reinterpret_cast<double const *const>(&tm_clover[tm_idx][b_idx][0].c00) +
-                    (Nc * Nz + Nz) * (d % 3)) *
-                  scale);
-            }
-
-            b_idx = 2;  // s33 and s11
-            for (int od : {0, 1, 2}) {
-              for (int reim : {0, 1}) {
-                qphix_clover[block].off_diag1[od][reim][xx] = QPhiX::rep<FT, double>(
-                    (reim == 1 ? -1.0 : 1.0) *
-                    *(reinterpret_cast<double const *const>(&tm_clover[tm_idx][b_idx][1].c00) +
-                      od_su3_offsets[od] + reim) *
-                    scale);
-
-                qphix_clover[block].off_diag2[od][reim][xx] = QPhiX::rep<FT, double>(
-                    (reim == 1 ? -1.0 : 1.0) *
-                    *(reinterpret_cast<double const *const>(&tm_clover[tm_idx][b_idx][0].c00) +
-                      od_su3_offsets[od] + reim) *
-                    scale);
-              }
-            }
-
-            b_idx = 1;  // s32 and s10
-            for (int od : {3, 4, 5, 6, 7, 8, 10, 11, 12}) {
-              for (int reim : {0, 1}) {
-                qphix_clover[block].off_diag1[od][reim][xx] = QPhiX::rep<FT, double>(
-                    *(reinterpret_cast<double const *const>(&tm_clover[tm_idx][b_idx][1].c00) +
-                      od_su3_offsets[od] + reim) *
-                    (-scale));
-
-                qphix_clover[block].off_diag2[od][reim][xx] = QPhiX::rep<FT, double>(
-                    *(reinterpret_cast<double const *const>(&tm_clover[tm_idx][b_idx][0].c00) +
-                      od_su3_offsets[od] + reim) *
-                    (-scale));
-              }
-            }
-
-            b_idx = 0;  // s22 and s00
-            for (int od : {9, 13, 14}) {
-              for (int reim : {0, 1}) {
-                qphix_clover[block].off_diag1[od][reim][xx] = QPhiX::rep<FT, double>(
-                    (reim == 1 ? -1.0 : 1.0) *
-                    *(reinterpret_cast<double const *const>(&tm_clover[tm_idx][b_idx][1].c00) +
-                      od_su3_offsets[od] + reim) *
-                    scale);
-
-                qphix_clover[block].off_diag2[od][reim][xx] = QPhiX::rep<FT, double>(
-                    (reim == 1 ? -1.0 : 1.0) *
-                    *(reinterpret_cast<double const *const>(&tm_clover[tm_idx][b_idx][0].c00) +
-                      od_su3_offsets[od] + reim) *
-                    scale);
-              }
-            }
-
-          }  // x_soa
-        }  // for(v)
-      }  // for(y)
-    }  // for(z)
-  }  // for(t)
-
-  const double diffTime = gettime() - startTime;
-  if (g_debug_level > 1) {
-    QPhiX::masterPrintf(
-        "# QPHIX-interface: time spent in reorder_clover_to_QPhiX (CloverBlock): %f secs\n",
-        diffTime);
-  }
-}
-
-template <typename FT, int VECLEN, int SOALEN, bool compress12>
-void reorder_clover_to_QPhiX(
-    QPhiX::Geometry<FT, VECLEN, SOALEN, compress12> &geom,
-    typename QPhiX::Geometry<FT, VECLEN, SOALEN, compress12>::FullCloverBlock *qphix_clover[2],
-    int cb, bool inverse) {
-  const double startTime = gettime();
-
-  /* the spin-colour clover term in sw_term and the corresponding inverse
-   * in sw_inv are stored in the tmLQCD gamma basis.
-   * When we translate spinors to QPhiX, we apply a transformation V to the tmLQCD
-   * spinor and then apply the same transformation to the output spinor
-   * ( we have V^dagger = V and V*V = 1 )
-   * Thus, in order to translate the clover field, we need to copy
-   *   (1+T)' = V*(1+T)*V, where T is the spin-colour clover-term
-   * This way, the clover term will be in the correct gamma basis.
-   *
-   * The tmLQCD clover term is stored in half-spinor blocks of colour matrices
-   * for which we need to work out what (1+T)'=V*(1+T)*V implies.
-   * Below, each sAB represents one 3x3 colour matrix
-   *
-   *                +s33 -s32    0    0
-   *  T' = V*T*V =  -s23 +s22    0    0
-   *                   0    0 +s11 -s10
-   *                   0    0 -s01 +s00
-   *
-   * Such that the half-spinor blocks are inverted and within these, the ordering is
-   * reversed. Note that the off-diagonal 3x3 colour blocks are hermitian conjugate to
-   * each other and this is preserved by the transformation.
-   *
-   * The QPhiX (tmclover) clover term and its inverse are stored as a pair of full
-   * 6x6 complex matrices which are multiplied with the spinor in exactly the same way
-   * as in tmLQCD.
-   *
-   * The tmLQCD clover term is stored as:
-   *
-   *      s00 s01
-   *          s11
-   * T =          s22 s23
-   *                  s33
-   *
-   * with indexing
-   *
-   *     sw[0][0] sw[1][0]
-   *              sw[2][0]
-   *                       sw[0][1] sw[1][1]
-   *                                sw[2][1]
-   *
-   * The inverse has four su3 blocks instead and is indexed
-   *     sw_inv[0][0] sw_inv[1][0]
-   *     sw_inv[3][0] sw_inv[2][0]
-   *                               sw_inv[0][1] sw_inv[1][1]
-   *                               sw_inv[3][1] sw_inv[2][1]
-   *
-   * where blocks sw_inv[3][0] and sw_inv[3][1] are relevant only when mu > 0   *
-   */
-
-  // rescale to get clover term (or its inverse) in the physical normalisation
-  // rather than the kappa normalisation
-  const double scale = inverse ? 2.0 * g_kappa : 1.0 / (2.0 * g_kappa);
-  su3 ***tm_clover = inverse ? sw_inv : sw;
-
-  // Number of elements in spin, color & complex
-  const int Ns = 4;
-  const int Nc = 3;
-  const int Nz = 2;
-
-  const double amu = g_mu / (2.0 * g_kappa);
-
-  // Geometric parameters for QPhiX data layout
-  const auto ngy = geom.nGY();
-  const auto nVecs = geom.nVecs();
-  const auto Pxy = geom.getPxy();
-  const auto Pxyz = geom.getPxyz();
-
-#pragma omp parallel for collapse(4)
-  for (int64_t t = 0; t < T; t++) {
-    for (int64_t z = 0; z < LZ; z++) {
-      for (int64_t y = 0; y < LY; y++) {
-        for (int64_t v = 0; v < nVecs; v++) {
-          int64_t block = (t * Pxyz + z * Pxy) / ngy + (y / ngy) * nVecs + v;
-
-          for (int64_t x_soa = 0; x_soa < SOALEN; x_soa++) {
-            int64_t xx = (y % ngy) * SOALEN + x_soa;
-            int64_t q_cb_x_coord = x_soa + v * SOALEN;
-            int64_t tm_x_coord = q_cb_x_coord * 2 + (((t + y + z) & 1) ^ cb);
-
-            //             the inverse of the clover term is in even-odd ordering
-            //             while the clover term itself is lexicographically ordered
-            int64_t tm_idx =
-                inverse ? g_lexic2eosub[g_ipt[t][tm_x_coord][y][z]] : g_ipt[t][tm_x_coord][y][z];
-
-            for (int fl : {0, 1}) {
-              if (inverse && fl == 1) {
-                // the inverse clover term for the second flavour is stored at an offset
-                tm_idx += VOLUME / 2;
-              }
-              for (int q_hs : {0, 1}) {
-                auto &hs_block =
-                    ((q_hs == 0) ? qphix_clover[fl][block].block1 : qphix_clover[fl][block].block2);
-                for (int q_sc1 = 0; q_sc1 < 6; q_sc1++) {
-                  for (int q_sc2 = 0; q_sc2 < 6; q_sc2++) {
-                    const int q_s1 = q_sc1 / 3;
-                    const int q_s2 = q_sc2 / 3;
-                    const int q_c1 = q_sc1 % 3;
-                    const int q_c2 = q_sc2 % 3;
-
-                    // invert in spin as required by V*T*V
-                    const int t_hs = 1 - q_hs;
-                    // the indices inside the half-spinor are also inverted
-                    // (which transposes them, of course)
-                    const int t_s1 = 1 - q_s1;
-                    const int t_s2 = 1 - q_s2;
-                    // carry out the mapping from T' to T, keeping in mind that for the inverse
-                    // there are four blocks also on the tmLQCD side, otherwise there are just three
-                    const int t_b_idx = t_s1 + t_s2 + ((inverse && t_s1 == 1 && t_s2 == 0) ? 2 : 0);
-                    for (int reim : {0, 1}) {
-                      hs_block[q_sc1][q_sc2][reim][xx] = QPhiX::rep<FT, double>(
-                          scale *
-                              // off-diagonal (odd-numbered) blocks change sign
-                              (t_b_idx & 1 ? (-1.0) : 1.0) *
-                              // if not doing the inverse and in the bottom-left block, need to
-                              // complex conjugate
-                              ((!inverse && (t_s1 == 1 && t_s2 == 0) && reim == 1) ? -1.0 : 1.0) *
-                              *(reinterpret_cast<double const *const>(
-                                    &(tm_clover[tm_idx][t_b_idx][t_hs].c00)) +
-                                // if not doing the inverse and in the bottom-left block, transpose
-                                // in colour
-                                // because we're actually reading out of the top-right block
-                                Nz * ((!inverse && (t_s1 == 1 && t_s2 == 0)) ? Nc * q_c2 + q_c1
-                                                                             : Nc * q_c1 + q_c2) +
-                                reim) +
-                          // in the QPhiX gamma basis, the twisted quark mass enters with the
-                          // opposite
-                          // sign for consistency
-                          ((!inverse && q_sc1 == q_sc2 && q_hs == 0 && reim == 1)
-                               ? -amu * (1 - 2 * fl)
-                               : 0) +
-                          ((!inverse && q_sc1 == q_sc2 && q_hs == 1 && reim == 1)
-                               ? amu * (1 - 2 * fl)
-                               : 0));
-                    }
-                  }  // q_sc2
-                }  // q_sc1
-              }  // q_hs
-            }  // fl
-
-          }  // x_soa
-        }  // for(v)
-      }  // for(y)
-    }  // for(z)
-  }  // for(t)
-
-  const double diffTime = gettime() - startTime;
-  if (g_debug_level > 1) {
-    QPhiX::masterPrintf(
-        "# QPHIX-interface: time spent in reorder_clover_to_QPhiX (FullCloverBlock): %f secs\n",
-        diffTime);
-  }
-}
-
-template <typename FT, int VECLEN, int SOALEN, bool compress12>
-void reorder_gauge_to_QPhiX(
-    QPhiX::Geometry<FT, VECLEN, SOALEN, compress12> &geom,
-    typename QPhiX::Geometry<FT, VECLEN, SOALEN, compress12>::SU3MatrixBlock *qphix_gauge_cb0,
-    typename QPhiX::Geometry<FT, VECLEN, SOALEN, compress12>::SU3MatrixBlock *qphix_gauge_cb1) {
-  const double startTime = gettime();
-
-  // Number of elements in spin, color & complex
-  // Here c1 is QPhiX's outer color, and c2 the inner one
-  const int Ns = 4;
-  const int Nc1 = compress12 ? 2 : 3;
-  const int Nc2 = 3;
-  const int Nz = 2;
-
-  // Geometric parameters for QPhiX data layout
-  const auto ngy = geom.nGY();
-  const auto nVecs = geom.nVecs();
-  const auto Pxy = geom.getPxy();
-  const auto Pxyz = geom.getPxyz();
-
-  // This is needed to translate between the different
-  // orderings of the direction index "\mu" in tmlQCD
-  // and QPhiX, respectively
-  // in qphix, the Dirac operator is applied in the order
-  //   -+x -> -+y -> -+z -> -+t
-  // while tmlqcd does
-  //   -+t -> -+x -> -+y -> -+z
-  // same as the lattice ordering
-  // The mappingn between the application dimensions is thus:
-  //  tmlqcd_dim(t(0) -> x(1) -> y(2) -> z(3)) = qphix_dim( t(3) -> x(0) -> y(1) -> z(2) )
-  const int change_dim[4] = {1, 2, 3, 0};
-
-  // Get the base pointer for the (global) tmlQCD gauge field
-  xchange_gauge(g_gauge_field);
-  const double *in = reinterpret_cast<double *>(&g_gauge_field[0][0].c00);
-
-#pragma omp parallel for collapse(4)
-  for (int64_t t = 0; t < T; t++)
-    for (int64_t z = 0; z < LZ; z++)
-      for (int64_t y = 0; y < LY; y++)
-        for (int64_t v = 0; v < nVecs; v++) {
-          int64_t block = (t * Pxyz + z * Pxy) / ngy + (y / ngy) * nVecs + v;
-
-          for (int dim = 0; dim < 4; dim++)     // dimension == QPhiX \mu
-            for (int c1 = 0; c1 < Nc1; c1++)    // QPhiX convention color 1 (runs up to 2 or 3)
-              for (int c2 = 0; c2 < Nc2; c2++)  // QPhiX convention color 2 (always runs up to 3)
-                for (int x_soa = 0; x_soa < SOALEN; x_soa++) {
-                  int64_t xx = (y % ngy) * SOALEN + x_soa;
-                  int64_t q_cb_x_coord = x_soa + v * SOALEN;
-                  int64_t tm_x_coord_cb0 = q_cb_x_coord * 2 + (((t + y + z) & 1) ^ 0);
-                  int64_t tm_x_coord_cb1 = q_cb_x_coord * 2 + (((t + y + z) & 1) ^ 1);
-
-                  int64_t tm_idx_cb0;
-                  int64_t tm_idx_cb1;
-
-                  // backward / forward
-                  for (int dir = 0; dir < 2; dir++) {
-                    if (dir == 0) {
-                      tm_idx_cb0 = g_idn[g_ipt[t][tm_x_coord_cb0][y][z]][change_dim[dim]];
-                      tm_idx_cb1 = g_idn[g_ipt[t][tm_x_coord_cb1][y][z]][change_dim[dim]];
-                    } else {
-                      tm_idx_cb0 = g_ipt[t][tm_x_coord_cb0][y][z];
-                      tm_idx_cb1 = g_ipt[t][tm_x_coord_cb1][y][z];
-                    }
-                    for (int reim = 0; reim < Nz; reim++) {
-                      // Note:
-                      // -----
-                      // 1. \mu in QPhiX runs from 0..7 for all eight neighbouring
-                      // links.
-                      //    Here, the ordering of the direction (backward/forward)
-                      //    is the same
-                      //    for tmlQCD and QPhiX, but we have to change the
-                      //    ordering of the dimensions.
-                      int q_mu = 2 * dim + dir;
-
-                      qphix_gauge_cb0[block][q_mu][c1][c2][reim][xx] =
-                          QPhiX::rep<FT, double>(su3_get_elem(
-                              &(g_gauge_field[tm_idx_cb0][change_dim[dim]]), c2, c1, reim));
-                      qphix_gauge_cb1[block][q_mu][c1][c2][reim][xx] =
-                          QPhiX::rep<FT, double>(su3_get_elem(
-                              &(g_gauge_field[tm_idx_cb1][change_dim[dim]]), c2, c1, reim));
-                    }
-                  }
-                }  // for(dim,c1,c2,x_soa)
-        }  // outer loop (t,z,y,v)
-
-  const double diffTime = gettime() - startTime;
-  if (g_debug_level > 1) {
-    QPhiX::masterPrintf("# QPHIX-interface: time spent in reorder_gauge_to_QPhiX: %f secs\n",
-                        diffTime);
-  }
-}
-
-// Reorder tmLQCD eo-spinor to a FourSpinorBlock QPhiX spinor on the given checkerboard
-template <typename FT, int VECLEN, int SOALEN, bool compress12>
-void reorder_eo_spinor_to_QPhiX(
-    QPhiX::Geometry<FT, VECLEN, SOALEN, compress12> &geom, spinor const *const tm_eo_spinor,
-    typename QPhiX::Geometry<FT, VECLEN, SOALEN, compress12>::FourSpinorBlock *qphix_spinor,
-    const int cb) {
-  const double startTime = gettime();
-
-  const int Ns = 4;
-  const int Nc = 3;
-  const int Nz = 2;
-
-  const auto nVecs = geom.nVecs();
-  const auto Pxy = geom.getPxy();
-  const auto Pxyz = geom.getPxyz();
-  const auto Nxh = geom.Nxh();
-
-  // This is needed to translate between the different
-  // gamma bases tmlQCD and QPhiX are using
-  // (note, this is a 4x4 matrix with 4 non-zero elements)
-  const int change_sign[4] = {1, -1, -1, 1};
-  const int change_spin[4] = {3, 2, 1, 0};
-
-#pragma omp parallel for collapse(4)
-  for (int64_t t = 0; t < T; t++) {
-    for (int64_t z = 0; z < LZ; z++) {
-      for (int64_t y = 0; y < LY; y++) {
-        for (int64_t v = 0; v < nVecs; v++) {
-          for (int col = 0; col < Nc; col++) {
-            for (int q_spin = 0; q_spin < Ns; q_spin++) {
-              for (int x_soa = 0; x_soa < SOALEN; x_soa++) {
-                int64_t q_ind = t * Pxyz + z * Pxy + y * nVecs + v;
-                int64_t q_cb_x_coord = v * SOALEN + x_soa;
-                // when t+y+z is odd and we're on an odd (1) checkerboard OR
-                // when t+y+z is even and we're on an even (0) checkerboard
-                // the full x coordinate is 2*x_cb
-                // otherwise, it is 2*x_cb+1
-                int64_t tm_x_coord = q_cb_x_coord * 2 + (((t + y + z) & 1) ^ cb);
-                // exchange x and z dimensions
-                int64_t tm_eo_ind = g_lexic2eosub[g_ipt[t][tm_x_coord][y][z]];
-
-                for (int reim = 0; reim < 2; reim++) {
-                  qphix_spinor[q_ind][col][q_spin][reim][x_soa] = QPhiX::rep<FT, double>(
-                      change_sign[q_spin] *
-                      spinor_get_elem(&(tm_eo_spinor[tm_eo_ind]), change_spin[q_spin], col, reim));
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-  const double diffTime = gettime() - startTime;
-  if (g_debug_level > 1) {
-    QPhiX::masterPrintf("# QPHIX-interface: time spent in reorder_eo_spinor_to_QPhiX: %f secs\n",
-                        diffTime);
-  }
-}
-
-template <typename FT, int VECLEN, int SOALEN, bool compress12>
-void reorder_eo_spinor_from_QPhiX(
-    QPhiX::Geometry<FT, VECLEN, SOALEN, compress12> &geom, spinor *tm_eo_spinor,
-    typename QPhiX::Geometry<FT, VECLEN, SOALEN, compress12>::FourSpinorBlock *qphix_spinor,
-    const int cb, double normFac = 1.0) {
-  const double startTime = gettime();
-
-  const int Ns = 4;
-  const int Nc = 3;
-  const int Nz = 2;
-
-  const auto nVecs = geom.nVecs();
-  const auto Pxy = geom.getPxy();
-  const auto Pxyz = geom.getPxyz();
-  const auto Nxh = geom.Nxh();
-
-  // This is needed to translate between the different
-  // gamma bases tmlQCD and QPhiX are using
-  // (note, this is a 4x4 matrix with 4 non-zero elements)
-  const int change_sign[4] = {1, -1, -1, 1};
-  const int change_spin[4] = {3, 2, 1, 0};
-
-#pragma omp parallel for collapse(4)
-  for (int64_t t = 0; t < T; t++) {
-    for (int64_t z = 0; z < LZ; z++) {
-      for (int64_t y = 0; y < LY; y++) {
-        for (int64_t v = 0; v < nVecs; v++) {
-          for (int col = 0; col < Nc; col++) {
-            for (int q_spin = 0; q_spin < Ns; q_spin++) {
-              for (int x_soa = 0; x_soa < SOALEN; x_soa++) {
-                int64_t q_ind = t * Pxyz + z * Pxy + y * nVecs + v;
-                int64_t q_cb_x_coord = v * SOALEN + x_soa;
-                // when t+y+z is odd and we're on an odd checkerboard (1) OR
-                // when t+y+z is even and we're on an even (0) checkerboard
-                // the full x coordinate is 2*x_cb
-                // otherwise, it is 2*x_cb+1
-                int64_t tm_x_coord = q_cb_x_coord * 2 + (((t + y + z) & 1) ^ cb);
-                // exchange x and z dimensions
-                int64_t tm_eo_ind = g_lexic2eosub[g_ipt[t][tm_x_coord][y][z]];
-
-                spinor_set_elem(
-                    &(tm_eo_spinor[tm_eo_ind]), change_spin[q_spin], col,
-                    change_sign[q_spin] * normFac *
-                        QPhiX::rep<double, FT>(qphix_spinor[q_ind][col][q_spin][0][x_soa]),
-                    change_sign[q_spin] * normFac *
-                        QPhiX::rep<double, FT>(qphix_spinor[q_ind][col][q_spin][1][x_soa]));
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-  const double diffTime = gettime() - startTime;
-  if (g_debug_level > 1) {
-    QPhiX::masterPrintf("# QPHIX-interface: time spent in reorder_eo_spinor_from_QPhiX: %f secs\n",
-                        diffTime);
-  }
-}
-
-// Reorder a full tmLQCD spinor to a cb0 and cb1 QPhiX spinor
-template <typename FT, int VECLEN, int SOALEN, bool compress12>
-void reorder_spinor_to_QPhiX(QPhiX::Geometry<FT, VECLEN, SOALEN, compress12> &geom,
-                             double const *tm_spinor, FT *qphix_spinor_cb0, FT *qphix_spinor_cb1) {
-  const double startTime = gettime();
-
-  // Number of elements in spin, color & complex
-  const int Ns = 4;
-  const int Nc = 3;
-  const int Nz = 2;
-
-  // Geometric parameters for QPhiX data layout
-  const auto nVecs = geom.nVecs();
-  const auto Pxy = geom.getPxy();
-  const auto Pxyz = geom.getPxyz();
-
-  // This is needed to translate between the different
-  // gamma bases tmlQCD and QPhiX are using
-  const int change_sign[4] = {1, -1, -1, 1};
-  const int change_spin[4] = {3, 2, 1, 0};
-
-// This will loop over the entire lattice and calculate
-// the array and internal indices for both tmlQCD & QPhiX
-#pragma omp parallel for collapse(4)
-  for (uint64_t t = 0; t < T; t++)
-    for (uint64_t x = 0; x < LX; x++)
-      for (uint64_t y = 0; y < LY; y++)
-        for (uint64_t z = 0; z < LZ; z++) {
-          // These are the QPhiX SIMD vector in checkerboarded x direction
-          // (up to LX/2) and the internal position inside the SIMD vector
-          const uint64_t SIMD_vector = (x / 2) / SOALEN;
-          const uint64_t x_internal = (x / 2) % SOALEN;
-
-          // Calculate the array index in tmlQCD & QPhiX,
-          // given a global lattice index (t,x,y,z)
-          const uint64_t qphix_idx = t * Pxyz + z * Pxy + y * nVecs + SIMD_vector;
-          const uint64_t tm_idx = g_ipt[t][x][y][z];
-
-          // Calculate base point for every spinor field element (tmlQCD) or
-          // for every SIMD vector of spinors, a.k.a FourSpinorBlock (QPhiX),
-          // which will depend on the checkerboard (cb)
-          const double *in = tm_spinor + Ns * Nc * Nz * tm_idx;
-          FT *out;
-          if ((t + x + y + z) & 1)
-            out = qphix_spinor_cb1 + SOALEN * Nz * Nc * Ns * qphix_idx;  // odd -> cb1
-          else
-            out = qphix_spinor_cb0 + SOALEN * Nz * Nc * Ns * qphix_idx;  // even -> cb0
-
-          // Copy the internal elements, performing a gamma basis transformation
-          for (int spin = 0; spin < Ns; spin++)  // QPhiX spin index
-            for (int color = 0; color < Nc; color++)
-              for (int z = 0; z < Nz; z++)  // RE or IM
-              {
-                const uint64_t qId =
-                    x_internal + z * SOALEN + spin * SOALEN * Nz + color * SOALEN * Nz * Ns;
-                const uint64_t tId = z + color * Nz + change_spin[spin] * Nz * Nc;
-
-                out[qId] = QPhiX::rep<FT, double>(change_sign[spin] * in[tId]);
-              }
-
-        }  // volume
-
-  const double diffTime = gettime() - startTime;
-  if (g_debug_level > 1) {
-    QPhiX::masterPrintf("# QPHIX-interface: time spent in reorder_spinor_to_QPhiX: %f secs\n",
-                        diffTime);
-  }
-}
-
-// Reorder a cb0 and cb1 QPhiX spinor to a full tmLQCD spinor
-template <typename FT, int VECLEN, int SOALEN, bool compress12>
-void reorder_spinor_from_QPhiX(QPhiX::Geometry<FT, VECLEN, SOALEN, compress12> &geom,
-                               double *tm_spinor, FT const *qphix_spinor_cb0,
-                               FT const *qphix_spinor_cb1, double normFac = 1.0) {
-  const double startTime = gettime();
-
-  // Number of elements in spin, color & complex
-  const int Ns = 4;
-  const int Nc = 3;
-  const int Nz = 2;
-
-  // Geometric parameters for QPhiX data layout
-  const auto nVecs = geom.nVecs();
-  const auto Pxy = geom.getPxy();
-  const auto Pxyz = geom.getPxyz();
-
-  // This is needed to translate between the different
-  // gamma bases tmlQCD and QPhiX are using
-  const int change_sign[4] = {1, -1, -1, 1};
-  const int change_spin[4] = {3, 2, 1, 0};
-
-// This will loop over the entire lattice and calculate
-// the array and internal indices for both tmlQCD & QPhiX
-#pragma omp parallel for collapse(4)
-  for (uint64_t t = 0; t < T; t++)
-    for (uint64_t x = 0; x < LX; x++)
-      for (uint64_t y = 0; y < LY; y++)
-        for (uint64_t z = 0; z < LZ; z++) {
-          // These are the QPhiX SIMD vector in checkerboarded x direction
-          // (up to LX/2) and the internal position inside the SIMD vector
-          const uint64_t SIMD_vector = (x / 2) / SOALEN;
-          const uint64_t x_internal = (x / 2) % SOALEN;
-
-          // Calculate the array index in tmlQCD & QPhiX,
-          // given a global lattice index (t,x,y,z)
-          const uint64_t qphix_idx = t * Pxyz + z * Pxy + y * nVecs + SIMD_vector;
-          const uint64_t tm_idx = g_ipt[t][x][y][z];
-
-          // Calculate base point for every spinor field element (tmlQCD) or
-          // for every SIMD vector of spinors, a.k.a FourSpinorBlock (QPhiX),
-          // which will depend on the checkerboard (cb)
-          const FT *in;
-          if ((t + x + y + z) & 1)
-            in = qphix_spinor_cb1 + SOALEN * Nz * Nc * Ns * qphix_idx;  // cb1
-          else
-            in = qphix_spinor_cb0 + SOALEN * Nz * Nc * Ns * qphix_idx;  // cb0
-          double *out = tm_spinor + Ns * Nc * Nz * tm_idx;
-
-          // Copy the internal elements, performing a gamma basis transformation
-          for (int spin = 0; spin < Ns; spin++)  // tmlQCD spin index
-            for (int color = 0; color < Nc; color++)
-              for (int z = 0; z < Nz; z++)  // RE or IM
-              {
-                const uint64_t qId = x_internal + z * SOALEN + change_spin[spin] * SOALEN * Nz +
-                                     color * SOALEN * Nz * Ns;
-                const uint64_t tId = z + color * Nz + spin * Nz * Nc;
-
-                out[tId] = QPhiX::rep<double, FT>(normFac * change_sign[spin] * in[qId]);
-              }
-
-        }  // volume
-
-  const double diffTime = gettime() - startTime;
-  if (g_debug_level > 1) {
-    QPhiX::masterPrintf("# QPHIX-interface: time spent in reorder_spinor_from_QPhiX: %f secs\n",
-                        diffTime);
-  }
-}
-
-template <typename FT, int V, int S, bool compress12, typename FT_inner, int V_inner, int S_inner,
-          bool compress12_inner>
-void pack_nd_clover(
-    QPhiX::Geometry<FT, V, S, compress12> &geom,
-    QPhiX::Geometry<FT_inner, V_inner, S_inner, compress12_inner> &geom_inner,
-    typename QPhiX::Geometry<FT, V, S, compress12>::FullCloverBlock *full_invclov[2],
-    typename QPhiX::Geometry<FT, V, S, compress12>::CloverBlock *invclov_odiag,
-    typename QPhiX::Geometry<FT, V, S, compress12>::CloverBlock *clov,
-    typename QPhiX::Geometry<FT_inner, V_inner, S_inner, compress12_inner>::FullCloverBlock
-        *full_invclov_inner[2],
-    typename QPhiX::Geometry<FT_inner, V_inner, S_inner, compress12_inner>::CloverBlock
-        *invclov_odiag_inner,
-    typename QPhiX::Geometry<FT_inner, V_inner, S_inner, compress12_inner>::CloverBlock *clov_inner,
-    const int cb, bool pack_inner) {
-  typedef typename QPhiX::Geometry<FT, V, S, compress12>::CloverBlock QClover;
-  typedef typename QPhiX::Geometry<FT, V, S, compress12>::FullCloverBlock QFullClover;
-  typedef typename QPhiX::Geometry<FT_inner, V_inner, S_inner, compress12_inner>::CloverBlock
-      QClover_inner;
-  typedef typename QPhiX::Geometry<FT_inner, V_inner, S_inner, compress12_inner>::FullCloverBlock
-      QFullClover_inner;
-
-  double start = gettime();
-  reorder_clover_to_QPhiX(geom, clov, cb, false);
-  if (pack_inner) {
-    reorder_clover_to_QPhiX(geom_inner, clov_inner, cb, false);
-  }
-
-  sw_invert_epsbar(g_epsbar);
-  reorder_clover_to_QPhiX(geom, invclov_odiag, 1 - cb, true, true);
-  if (pack_inner) {
-    reorder_clover_to_QPhiX(geom_inner, invclov_odiag_inner, 1 - cb, true, true);
-  }
-
-  // no minus sign here, the difference in the sign of gamma5
-  // is taken care of internally
-  sw_invert_mubar(g_mubar);
-  reorder_clover_to_QPhiX(geom, full_invclov, 1 - cb, true);
-  if (pack_inner) {
-    reorder_clover_to_QPhiX(geom_inner, full_invclov_inner, 1 - cb, true);
-  }
-
-  sw_invert_nd(g_mubar * g_mubar - g_epsbar * g_epsbar);
-
-  if (g_debug_level > 1) {
-    QPhiX::masterPrintf("# QPHIX-inteface: ND TMClover clover-field packing took %.4lf seconds\n",
-                        gettime() - start);
-  }
-}
-
-// Due to github issue #404, the helper functions to apply the full QPhiX operator
-// are currently disabled because they conflict with the new interfaces in QPhiX
-// itself. If required, these should be rewritten to use these interfaces
-// rather than the base classes in qphix_base_classes.hpp
-
-// Apply the full QPhiX fermion matrix to checkerboarded tm spinors
-// template <typename FT, int V, int S, bool compress>
-// void Mfull_helper(spinor *Even_out, spinor *Odd_out, const spinor *Even_in, const spinor *Odd_in,
-//                  const op_type_t op_type) {
-//  // TODO: this should use handles for gauge and spinors because these are definitely temporary
-//  // objects
-//  typedef typename QPhiX::Geometry<FT, V, S, compress>::SU3MatrixBlock QGauge;
-//  typedef typename QPhiX::Geometry<FT, V, S, compress>::FourSpinorBlock QSpinor;
-//  typedef typename QPhiX::Geometry<FT, V, S, compress>::CloverBlock QClover;
-//  typedef typename QPhiX::Geometry<FT, V, S, compress>::FullCloverBlock QFullClover;
-//
-//  if (g_debug_level > 1) tmlqcd::printQphixDiagnostics(V, S, compress, V, S, compress);
-//
-//  double coeff_s = (FT)(1);
-//  double coeff_t = (FT)(1);
-//
-//  QPhiX::Geometry<FT, V, S, compress> geom(subLattSize, By, Bz, NCores, Sy, Sz, PadXY, PadXYZ,
-//                                           MinCt);
-//
-//  // Wilson mass
-//  double mass = 1 / (2.0 * g_kappa) - 4;
-//
-//  tmlqcd::Dslash<FT, V, S, compress> *polymorphic_dslash;
-//
-//  QGauge *u_packed[2];
-//  QSpinor *qphix_in[2];
-//  QSpinor *qphix_out[2];
-//
-//  QClover *clover[2];
-//  QClover *inv_clover[2];
-//
-//  QFullClover *inv_fullclover[2][2];
-//
-//  QSpinor *tmp_spinor = (QSpinor *)geom.allocCBFourSpinor();
-//  for (int cb : {0, 1}) {
-//    u_packed[cb] = (QGauge *)geom.allocCBGauge();
-//    qphix_in[cb] = (QSpinor *)geom.allocCBFourSpinor();
-//    qphix_out[cb] = (QSpinor *)geom.allocCBFourSpinor();
-//    clover[cb] = nullptr;
-//    inv_clover[cb] = nullptr;
-//    for (int fl : {0, 1}) {
-//      inv_fullclover[cb][fl] = nullptr;
-//    }
-//  }
-//  reorder_gauge_to_QPhiX(geom, u_packed[cb_even], u_packed[cb_odd]);
-//
-//  if (op_type == WILSON) {
-//    polymorphic_dslash = new tmlqcd::WilsonDslash<FT, V, S, compress>(
-//        &geom, t_boundary, coeff_s, coeff_t, mass, use_tbc, tbc_phases);
-//  } else if (op_type == TMWILSON) {
-//    polymorphic_dslash = new tmlqcd::WilsonTMDslash<FT, V, S, compress>(
-//        &geom, t_boundary, coeff_s, coeff_t, mass, -g_mu / (2.0 * g_kappa), use_tbc, tbc_phases);
-//  } else if (op_type == CLOVER && fabs(g_mu) <= DBL_EPSILON) {
-//    for (int cb : {0, 1}) {
-//      clover[cb] = (QClover *)geom.allocCBClov();
-//      inv_clover[cb] = (QClover *)geom.allocCBClov();
-//
-//      reorder_clover_to_QPhiX(geom, clover[cb], cb, false);
-//      sw_invert(cb, 0);
-//      reorder_clover_to_QPhiX(geom, inv_clover[cb], cb, true);
-//    }
-//
-//    polymorphic_dslash = new tmlqcd::WilsonClovDslash<FT, V, S, compress>(
-//        &geom, t_boundary, coeff_s, coeff_t, mass, clover, inv_clover, use_tbc, tbc_phases);
-//
-//  } else if (op_type == CLOVER && fabs(g_mu) > DBL_EPSILON) {
-//    for (int cb : {0, 1}) {
-//      clover[cb] = (QClover *)geom.allocCBClov();
-//      for (int fl : {0, 1}) {
-//        inv_fullclover[cb][fl] = (QFullClover *)geom.allocCBFullClov();
-//      }
-//      reorder_clover_to_QPhiX(geom, clover[cb], cb, false);
-//      sw_invert(cb, g_mu);
-//      reorder_clover_to_QPhiX(geom, inv_fullclover[cb], cb, true);
-//    }
-//
-//    polymorphic_dslash = new tmlqcd::WilsonClovTMDslash<FT, V, S, compress>(
-//        &geom, t_boundary, coeff_s, coeff_t, mass, -g_mu / (2.0 * g_kappa), clover,
-//        inv_fullclover, use_tbc, tbc_phases);
-//
-//  } else {
-//    QPhiX::masterPrintf("tmlqcd::Mfull_helper; No such operator type: %d\n", op_type);
-//    abort();
-//  }
-//
-////   reorder_eo_spinor_to_QPhiX(geom, reinterpret_cast<double const *const>(Even_in),
-////                              qphix_in[cb_even], cb_even);
-////   reorder_eo_spinor_to_QPhiX(geom, reinterpret_cast<double const *const>(Odd_in),
-/// qphix_in[cb_odd], /                              cb_odd);
-//  reorder_eo_spinor_to_QPhiX(geom, Even_in,
-//                             qphix_in[cb_even], cb_even);
-//  reorder_eo_spinor_to_QPhiX(geom, Odd_in, qphix_in[cb_odd],
-//                             cb_odd);
-//  // Apply QPhiX Mfull
-//  polymorphic_dslash->plain_dslash(qphix_out[cb_odd], qphix_in[cb_even], u_packed[cb_odd],
-//                                   /* isign == non-conjugate */ 1, cb_odd);
-//  polymorphic_dslash->plain_dslash(qphix_out[cb_even], qphix_in[cb_odd], u_packed[cb_even],
-//                                   /* isign == non-conjugate */ 1, cb_even);
-//  for (int cb : {0, 1}) {
-//    polymorphic_dslash->A_chi(tmp_spinor, qphix_in[cb], 1, cb);
-//    QPhiX::aypx(-0.5, tmp_spinor, qphix_out[cb], geom, 1);
-//  }
-//
-//  reorder_eo_spinor_from_QPhiX(geom, Even_out, qphix_out[cb_even],
-//                               cb_even, 2.0 * g_kappa);
-//  reorder_eo_spinor_from_QPhiX(geom, Odd_out, qphix_out[cb_odd], cb_odd,
-//                               2.0 * g_kappa);
-//
-//  geom.free(tmp_spinor);
-//  for (int cb : {0, 1}) {
-//    geom.free(u_packed[cb]);
-//    geom.free(qphix_in[cb]);
-//    geom.free(qphix_out[cb]);
-//    geom.free(clover[cb]);
-//    geom.free(inv_clover[cb]);
-//    for (int fl : {0, 1}) {
-//      geom.free(inv_fullclover[cb][fl]);
-//    }
-//  };
-//  delete (polymorphic_dslash);
-//}
-
-// Templated even-odd preconditioned solver using QPhiX Library
-template <typename FT, int V, int S, bool compress, typename FT_inner = FT, int V_inner = V,
-          int S_inner = S, bool compress_inner = compress>
-int invert_eo_qphix_helper(std::vector<std::vector<spinor *> > &tmlqcd_odd_out,
-                           std::vector<std::vector<spinor *> > &tmlqcd_odd_in,
-                           const double target_precision, const int max_iter, const int solver_flag,
-                           solver_params_t solver_params, const int num_flavour) {
-  // TODO: it would perhaps be beneficial to keep the fields resident
-  typedef typename QPhiX::Geometry<FT, V, S, compress>::SU3MatrixBlock QGauge;
-  typedef typename QPhiX::Geometry<FT, V, S, compress>::FourSpinorBlock QSpinor;
-  typedef typename QPhiX::FourSpinorHandle<FT, V, S, compress> QSpinorHandle;
-  typedef typename QPhiX::Geometry<FT, V, S, compress>::CloverBlock QClover;
-  typedef typename QPhiX::Geometry<FT, V, S, compress>::FullCloverBlock QFullClover;
-
-  typedef typename QPhiX::Geometry<FT_inner, V_inner, S_inner, compress_inner>::SU3MatrixBlock
-      QGauge_inner;
-  typedef typename QPhiX::Geometry<FT_inner, V_inner, S_inner, compress_inner>::FourSpinorBlock
-      QSpinor_inner;
-  typedef typename QPhiX::FourSpinorHandle<FT_inner, V_inner, S_inner, compress_inner>
-      QSpinorHandle_inner;
-  typedef typename QPhiX::Geometry<FT_inner, V_inner, S_inner, compress_inner>::CloverBlock
-      QClover_inner;
-  typedef typename QPhiX::Geometry<FT_inner, V_inner, S_inner, compress_inner>::FullCloverBlock
-      QFullClover_inner;
-
-  /************************
-   *                      *
-   *    SETUP GEOMETRY    *
-   *                      *
-   ************************/
-
-  if (g_debug_level > 1) {
-    tmlqcd::printQphixDiagnostics(V, S, compress, V_inner, S_inner, compress_inner);
-  }
-
-  QPhiX::Geometry<FT, V, S, compress> geom(subLattSize, By, Bz, NCores, Sy, Sz, PadXY, PadXYZ,
-                                           MinCt);
-
-  // we always create the inner geometry, the overhead should be small...
-  QPhiX::Geometry<FT_inner, V_inner, S_inner, compress_inner> geom_inner(
-      subLattSize, By, Bz, NCores, Sy, Sz, PadXY, PadXYZ, MinCt);
-
-  // Set number of BLAS threads by hand.
-  // In case some implements the tune routines in QPhiX
-  // this may be updated...
-  QPhiX::masterPrintf("# Setting number of BLAS threads...\n");
-  const int n_blas_simt = N_simt;
-  QPhiX::masterPrintf("# ...done.\n");
-
-  // Anisotropy Coefficents
-  const double coeff_s = 1.0;
-  const double coeff_t = 1.0;
-
-  // The Wilson mass
-  const double mass = 1.0 / (2.0 * g_kappa) - 4.0;
-
-  // Set variables need for solve
-  bool verbose = g_debug_level > 2 ? true : false;
-  int niters = -1;
-  int niters2 = 0;
-  double rsd_final = -1.0;
-  uint64_t site_flops = 0;
-  uint64_t site_flops2 = 0;
-  uint64_t mv_apps = 0;
-  uint64_t mv_apps2 = 0;
-
-  double start_time;
-  double end_time;
-
-  // support for multi-shift solves via the length of the output vector,
-  // which counts the shifts on the outer index and the flavour on the inner index
-  const int num_shifts = tmlqcd_odd_out.size();
-  std::vector<double> shifts;
-  shifts.resize(num_shifts);
-  std::vector<double> RsdTargetArr;
-  RsdTargetArr.resize(num_shifts);
-  std::vector<double> RsdFinalArr;
-  RsdFinalArr.resize(num_shifts);
-
-  double rescale = 0.5 / g_kappa;
-  // the inverse of M M^dag, as required for the HMC, comes with a factor of alpha^2
-  if (solver_params.solution_type == TM_SOLUTION_M_MDAG) {
-    rescale *= rescale;
-  }
-
-  std::vector<QSpinorHandle> q_spinor_handles;
-
-  QGauge *u_packed[2] = {nullptr, nullptr};
-  QGauge_inner *u_packed_inner[2] = {nullptr, nullptr};
-  for (int cb : {0, 1}) {
-    u_packed[cb] = (QGauge *)geom.allocCBGauge();
-  }
-  // Reorder (global) input gauge field from tmLQCD to QPhiX
-  reorder_gauge_to_QPhiX(geom, u_packed[cb_even], u_packed[cb_odd]);
-
-  // for mixed solvers, we also need the gauge field in the inner precision
-  if (solver_is_mixed(solver_flag)) {
-    for (int cb : {0, 1}) {
-      u_packed_inner[cb] = (QGauge_inner *)geom_inner.allocCBGauge();
-    }
-    reorder_gauge_to_QPhiX(geom_inner, u_packed_inner[cb_even], u_packed_inner[cb_odd]);
-  }
-
-  if (num_flavour == 1) {
-    constexpr int nf = 1;
-    std::vector<QSpinor *> qphix_in;
-    qphix_in.resize(1);
-    std::vector<QSpinor *> qphix_out;
-    qphix_out.resize(num_shifts);
-    QSpinor *qphix_buffer;
-
-    QClover *qphix_clover = nullptr;
-    QClover *qphix_inv_clover = nullptr;
-
-    QClover_inner *qphix_clover_inner = nullptr;
-    QClover_inner *qphix_inv_clover_inner = nullptr;
-
-    QFullClover *qphix_inv_fullclover[2] = {nullptr, nullptr};
-
-    QFullClover_inner *qphix_inv_fullclover_inner[2] = {nullptr, nullptr};
-
-    q_spinor_handles.push_back(makeFourSpinorHandle(geom));
-    qphix_in[0] = q_spinor_handles.back().get();
-
-    for (int shift = 0; shift < num_shifts; shift++) {
-      q_spinor_handles.push_back(makeFourSpinorHandle(geom));
-      qphix_out[shift] = q_spinor_handles.back().get();
-    }
-
-    q_spinor_handles.push_back(makeFourSpinorHandle(geom));
-    qphix_buffer = q_spinor_handles.back().get();
-
-    QPhiX::EvenOddLinearOperator<FT, V, S, compress> *FermionMatrixQPhiX = nullptr;
-    QPhiX::EvenOddLinearOperator<FT_inner, V_inner, S_inner, compress_inner>
-        *InnerFermionMatrixQPhiX = nullptr;
-    if ((fabs(g_mu) > DBL_EPSILON) && g_c_sw > DBL_EPSILON) {  // TWISTED-MASS-CLOVER
-      qphix_clover = (QClover *)geom.allocCBClov();
-      for (int fl : {0, 1}) {
-        qphix_inv_fullclover[fl] = (QFullClover *)geom.allocCBFullClov();
-      }
-      reorder_clover_to_QPhiX(geom, qphix_clover, cb_odd, false);
-      reorder_clover_to_QPhiX(geom, qphix_inv_fullclover, cb_even, true);
-
-      QPhiX::masterPrintf("# Creating QPhiX Twisted Clover Fermion Matrix...\n");
-      FermionMatrixQPhiX = new QPhiX::EvenOddTMCloverOperator<FT, V, S, compress>(
-          u_packed, qphix_clover, qphix_inv_fullclover, &geom, t_boundary, coeff_s, coeff_t,
-          use_tbc, tbc_phases, -0.5 * (g_mu3 + g_mu) / g_kappa);
-      if (solver_is_mixed(solver_flag)) {
-        qphix_clover_inner = (QClover_inner *)geom_inner.allocCBClov();
-        for (int fl : {0, 1}) {
-          qphix_inv_fullclover_inner[fl] = (QFullClover_inner *)geom_inner.allocCBFullClov();
-        }
-        reorder_clover_to_QPhiX(geom_inner, qphix_clover_inner, cb_odd, false);
-        reorder_clover_to_QPhiX(geom_inner, qphix_inv_fullclover_inner, cb_even, true);
-        InnerFermionMatrixQPhiX =
-            new QPhiX::EvenOddTMCloverOperator<FT_inner, V_inner, S_inner, compress_inner>(
-                u_packed_inner, qphix_clover_inner, qphix_inv_fullclover_inner, &geom_inner,
-                t_boundary, coeff_s, coeff_t, use_tbc, tbc_phases, -0.5 * (g_mu3 + g_mu) / g_kappa);
-      }
-      QPhiX::masterPrintf("# ...done.\n");
-    } else if (fabs(g_mu) > DBL_EPSILON) {  // TWISTED-MASS
-      const double TwistedMass = -g_mu / (2.0 * g_kappa);
-      QPhiX::masterPrintf("# Creating QPhiX Twisted Mass Wilson Fermion Matrix...\n");
-      FermionMatrixQPhiX = new QPhiX::EvenOddTMWilsonOperator<FT, V, S, compress>(
-          mass, TwistedMass, u_packed, &geom, t_boundary, coeff_s, coeff_t, use_tbc, tbc_phases);
-      QPhiX::masterPrintf("# ...done.\n");
-      if (solver_is_mixed(solver_flag)) {
-        InnerFermionMatrixQPhiX =
-            new QPhiX::EvenOddTMWilsonOperator<FT_inner, V_inner, S_inner, compress_inner>(
-                mass, TwistedMass, u_packed_inner, &geom_inner, t_boundary, coeff_s, coeff_t,
-                use_tbc, tbc_phases);
-      }
-    } else if (g_c_sw > DBL_EPSILON) {  // WILSON CLOVER
-      qphix_clover = (QClover *)geom.allocCBClov();
-      qphix_inv_clover = (QClover *)geom.allocCBClov();
-
-      reorder_clover_to_QPhiX(geom, qphix_clover, cb_odd, false);
-      reorder_clover_to_QPhiX(geom, qphix_inv_clover, cb_even, true);
-
-      QPhiX::masterPrintf("# Creating QPhiX Wilson Clover Fermion Matrix...\n");
-      FermionMatrixQPhiX = new QPhiX::EvenOddCloverOperator<FT, V, S, compress>(
-          u_packed, qphix_clover, qphix_inv_clover, &geom, t_boundary, coeff_s, coeff_t, use_tbc,
-          tbc_phases, -0.5 * g_mu3 / g_kappa);
-      if (solver_is_mixed(solver_flag)) {
-        qphix_clover_inner = (QClover_inner *)geom_inner.allocCBClov();
-        qphix_inv_clover_inner = (QClover_inner *)geom_inner.allocCBClov();
-        reorder_clover_to_QPhiX(geom_inner, qphix_clover_inner, cb_odd, false);
-        reorder_clover_to_QPhiX(geom_inner, qphix_inv_clover_inner, cb_even, true);
-        InnerFermionMatrixQPhiX =
-            new QPhiX::EvenOddCloverOperator<FT_inner, V_inner, S_inner, compress_inner>(
-                u_packed_inner, qphix_clover_inner, qphix_inv_clover_inner, &geom_inner, t_boundary,
-                coeff_s, coeff_t, use_tbc, tbc_phases, -0.5 * g_mu3 / g_kappa);
-      }
-      QPhiX::masterPrintf("# ...done.\n");
-
-    } else {  // WILSON
-      QPhiX::masterPrintf("# Creating QPhiX Wilson Fermion Matrix...\n");
-      FermionMatrixQPhiX = new QPhiX::EvenOddWilsonOperator<FT, V, S, compress>(
-          mass, u_packed, &geom, t_boundary, coeff_s, coeff_t, use_tbc, tbc_phases);
-      if (solver_is_mixed(solver_flag)) {
-        InnerFermionMatrixQPhiX =
-            new QPhiX::EvenOddWilsonOperator<FT_inner, V_inner, S_inner, compress_inner>(
-                mass, u_packed_inner, &geom_inner, t_boundary, coeff_s, coeff_t, use_tbc,
-                tbc_phases);
-      }
-      QPhiX::masterPrintf("# ...done.\n");
-    }
-
-    // Create a Linear Solver Object
-    QPhiX::AbstractSolver<FT, V, S, compress> *SolverQPhiX = nullptr;
-    QPhiX::AbstractSolver<FT_inner, V_inner, S_inner, compress_inner> *InnerSolverQPhiX = nullptr;
-    QPhiX::AbstractMultiSolver<FT, V, S, compress, nf> *MultiSolverQPhiX = nullptr;
-    if (solver_flag == DUMMYHERMTEST) {
-      QPhiX::masterPrintf("# QPHIX: Creating dummy solver for hermiticity test...\n");
-      SolverQPhiX =
-          new QPhiX::InvDummyHermTest<FT, V, S, compress,
-                                      typename QPhiX::EvenOddLinearOperator<FT, V, S, compress> >(
-              *FermionMatrixQPhiX, max_iter);
-    } else if (solver_flag == CG) {
-      QPhiX::masterPrintf("# QPHIX: Creating CG solver...\n");
-      SolverQPhiX = new QPhiX::InvCG<FT, V, S, compress>(*FermionMatrixQPhiX, max_iter);
-    } else if (solver_flag == BICGSTAB) {
-      QPhiX::masterPrintf("# QPHIX: Creating BiCGStab solver...\n");
-      SolverQPhiX = new QPhiX::InvBiCGStab<FT, V, S, compress>(*FermionMatrixQPhiX, max_iter);
-    } else if (solver_flag == MIXEDCG) {
-      // TODO: probably need to adjust inner solver iterations here...
-      QPhiX::masterPrintf("# QPHIX: Creating mixed-precision CG solver...\n");
-      InnerSolverQPhiX = new QPhiX::InvCG<FT_inner, V_inner, S_inner, compress_inner>(
-          *InnerFermionMatrixQPhiX, max_iter);
-      const bool MMdag = true;
-      SolverQPhiX = new QPhiX::InvRichardsonMultiPrec<FT, V, S, compress, FT_inner, V_inner,
-                                                      S_inner, compress_inner, MMdag>(
-          *FermionMatrixQPhiX, *InnerSolverQPhiX, solver_params.mcg_delta, max_iter);
-    } else if (solver_flag == MIXEDBICGSTAB) {
-      QPhiX::masterPrintf("# QPHIX: Creating mixed-precision BICGCGSTAB solver...\n");
-      InnerSolverQPhiX = new QPhiX::InvBiCGStab<FT_inner, V_inner, S_inner, compress_inner>(
-          *InnerFermionMatrixQPhiX, max_iter);
-      const bool MMdag = false;
-      SolverQPhiX = new QPhiX::InvRichardsonMultiPrec<FT, V, S, compress, FT_inner, V_inner,
-                                                      S_inner, compress_inner, MMdag>(
-          *FermionMatrixQPhiX, *InnerSolverQPhiX, solver_params.mcg_delta, max_iter);
-    } else if (solver_flag == CGMMS) {
-      QPhiX::masterPrintf("# QPHIX: Creating multi-shift CG solver ...\n");
-      MultiSolverQPhiX =
-          new QPhiX::MInvCG<FT, V, S, compress>(*FermionMatrixQPhiX, max_iter, num_shifts);
-    } else {
-      QPhiX::masterPrintf(" Solver not yet supported by QPhiX!\n");
-      QPhiX::masterPrintf(" Aborting...\n");
-      abort();
-    }
-    QPhiX::masterPrintf("# ...done.\n");
-
-    //     reorder_eo_spinor_to_QPhiX(geom, reinterpret_cast<double const
-    //     *const>(tmlqcd_odd_in[0][0]),
-    //                                qphix_in[0], cb_odd);
-    reorder_eo_spinor_to_QPhiX(geom, tmlqcd_odd_in[0][0], qphix_in[0], cb_odd);
-    QPhiX::masterPrintf("# Calling the solver...\n");
-
-    // Set the right precision for the QPhiX solver
-    // we get target_precision externally and and is given such, that it's either
-    // already relative or absolute
-    // Most QPhiX solvers allow setting absolute or relative residual
-    // by passing an appropriate flag, but this is not true for the multi-shift solver.
-    // As a result, we follow that solver and call ALL solvers with
-    // QPhiX::RELATIVE, which gives results consistent with tmLQCD in all cases.
-    double rhs_norm2 = 1.0;
-    QPhiX::norm2Spinor(rhs_norm2, qphix_in[0], geom, n_blas_simt);
-    const double RsdTarget = sqrt(target_precision / rhs_norm2);
-
-    // Calling the solver
-    start_time = gettime();
-    if (solver_flag == DUMMYHERMTEST) {
-      random_spinor_field_eo(tmlqcd_odd_out[0][0], 0, RN_GAUSS);
-      reorder_eo_spinor_to_QPhiX(geom, tmlqcd_odd_out[0][0], qphix_buffer, cb_odd);
-      for (int isign : {-1, 1}) {
-        (*SolverQPhiX)(qphix_buffer, qphix_in[0], RsdTarget, niters, rsd_final, site_flops, mv_apps,
-                       isign, verbose, cb_odd, QPhiX::RELATIVE);
-      }
-      QPhiX::copySpinor(qphix_out[0], qphix_buffer, geom, n_blas_simt);
-    } else if (solver_flag == CG || solver_flag == MIXEDCG || solver_flag == RGMIXEDCG) {
-      // USING CG:
-      // We are solving
-      //   M M^dagger qphix_buffer = qphix_in_prepared
-      // here, that is, isign = -1 for the QPhiX CG solver.
-      (*SolverQPhiX)(qphix_buffer, qphix_in[0], RsdTarget, niters, rsd_final, site_flops, mv_apps,
-                     -1, verbose, cb_odd, QPhiX::RELATIVE);
-      // After that. if required by the solution type, multiply with M^dagger:
-      //   qphix_out[1] = M^dagger ( M^dagger^-1 M^-1 ) qphix_in_prepared
-      if (solver_params.solution_type == TM_SOLUTION_M) {
-        (*FermionMatrixQPhiX)(qphix_out[0], qphix_buffer, /* conjugate */ -1);
-        mv_apps++;
-      } else {
-        QPhiX::copySpinor(qphix_out[0], qphix_buffer, geom, n_blas_simt);
-      }
-    } else if (solver_flag == CGMMS) {
-      // TODO: handle the residuals properly
-      if (g_debug_level > 2) QPhiX::masterPrintf("# QPHIX CGMMS: shifts: \n");
-      for (int shift = 0; shift < num_shifts; shift++) {
-        RsdTargetArr[shift] = RsdTarget;
-        RsdFinalArr[shift] = -1.0;
-        shifts[shift] =
-            solver_params.shifts[shift] * solver_params.shifts[shift] / (4 * g_kappa * g_kappa);
-        if (g_debug_level > 2)
-          QPhiX::masterPrintf("# QPHIX CGMMS: shift[%d] = %.6e\n", shift, shifts[shift]);
-      }
-      if (g_debug_level > 2) QPhiX::masterPrintf("\n");
-      (*MultiSolverQPhiX)(qphix_out.data(), qphix_in[0], num_shifts, shifts.data(),
-                          RsdTargetArr.data(), niters, RsdFinalArr.data(), site_flops, mv_apps, -1,
-                          verbose);
-      rsd_final = RsdFinalArr[0];
-    } else if (solver_flag == BICGSTAB || solver_flag == MIXEDBICGSTAB) {
-      (*SolverQPhiX)(qphix_buffer, qphix_in[0], RsdTarget, niters, rsd_final, site_flops, mv_apps,
-                     1, verbose, cb_odd, QPhiX::RELATIVE);
-      // for M^dagger^-1 M^-1 solution type, need to call BiCGstab twice
-      if (solver_params.solution_type == TM_SOLUTION_M_MDAG) {
-        (*SolverQPhiX)(qphix_out[0], qphix_buffer, RsdTarget, niters2, rsd_final, site_flops,
-                       mv_apps2, -1, verbose, cb_odd, QPhiX::RELATIVE);
-      } else {
-        QPhiX::copySpinor(qphix_out[0], qphix_buffer, geom, n_blas_simt);
-      }
-    }
-    end_time = gettime();
-
-    for (int shift = 0; shift < num_shifts; shift++) {
-      reorder_eo_spinor_from_QPhiX(geom, tmlqcd_odd_out[shift][0], qphix_out[shift], cb_odd,
-                                   rescale);
-    }
-
-    QPhiX::masterPrintf("# QPHIX: ...done.\n");
-    QPhiX::masterPrintf("# QPHIX: Cleaning up\n");
-    delete (FermionMatrixQPhiX);
-    delete (InnerFermionMatrixQPhiX);
-    delete (SolverQPhiX);
-    delete (InnerSolverQPhiX);
-    delete (MultiSolverQPhiX);
-    // on KNL, it seems that munmap is problematic, so we check for nullptr
-    if (qphix_clover) geom.free(qphix_clover);
-    if (qphix_inv_clover) geom.free(qphix_inv_clover);
-    if (qphix_clover_inner) geom_inner.free(qphix_clover_inner);
-    if (qphix_inv_clover_inner) geom_inner.free(qphix_inv_clover_inner);
-    for (int fl : {0, 1}) {
-      if (qphix_inv_fullclover[fl]) geom.free(qphix_inv_fullclover[fl]);
-      if (qphix_inv_fullclover_inner[fl]) geom_inner.free(qphix_inv_fullclover_inner[fl]);
-    }
-    QPhiX::masterPrintf("# QPHIX: ...done.\n\n");
-
-  } else if (num_flavour == 2) {
-    // for explicit template arguments
-    constexpr int nf = 2;
-
-    QSpinor *qphix_in[2];
-    std::vector<QSpinor **> qphix_out;
-    qphix_out.resize(num_shifts);
-    for (int shift = 0; shift < num_shifts; shift++) {
-      qphix_out[shift] = new QSpinor *[2];
-      for (int fl : {0, 1}) {
-        q_spinor_handles.push_back(makeFourSpinorHandle(geom));
-        qphix_out[shift][fl] = q_spinor_handles.back().get();
-      }
-    }
-
-    QSpinor *qphix_buffer[2];
-    for (int fl : {0, 1}) {
-      q_spinor_handles.push_back(makeFourSpinorHandle(geom));
-      qphix_in[fl] = q_spinor_handles.back().get();
-      q_spinor_handles.push_back(makeFourSpinorHandle(geom));
-      qphix_buffer[fl] = q_spinor_handles.back().get();
-    }
-
-    QClover *qphix_clover = nullptr;
-    QClover_inner *qphix_clover_inner = nullptr;
-
-    QClover *qphix_invclov_odiag = nullptr;
-    QClover_inner *qphix_invclov_odiag_inner = nullptr;
-
-    QFullClover *qphix_inv_fullclover[2] = {nullptr, nullptr};
-    QFullClover_inner *qphix_inv_fullclover_inner[2] = {nullptr, nullptr};
-
-    QPhiX::TwoFlavEvenOddLinearOperator<FT, V, S, compress> *TwoFlavFermionMatrixQPhiX = nullptr;
-    QPhiX::TwoFlavEvenOddLinearOperator<FT_inner, V_inner, S_inner, compress_inner>
-        *InnerTwoFlavFermionMatrixQPhiX = nullptr;
-
-    if (g_c_sw > DBL_EPSILON) {  // DBCLOVER
-      qphix_clover = (QClover *)geom.allocCBClov();
-      qphix_invclov_odiag = (QClover *)geom.allocCBClov();
-      if (solver_is_mixed(solver_flag)) {
-        qphix_clover_inner = (QClover_inner *)geom_inner.allocCBClov();
-        qphix_invclov_odiag_inner = (QClover_inner *)geom_inner.allocCBClov();
-      }
-
-      for (int fl : {0, 1}) {
-        qphix_inv_fullclover[fl] = (QFullClover *)geom.allocCBFullClov();
-        if (solver_is_mixed(solver_flag)) {
-          qphix_inv_fullclover_inner[fl] = (QFullClover_inner *)geom_inner.allocCBFullClov();
-        }
-      }
-
-      pack_nd_clover(geom, geom_inner, qphix_inv_fullclover, qphix_invclov_odiag, qphix_clover,
-                     qphix_inv_fullclover_inner, qphix_invclov_odiag_inner, qphix_clover_inner,
-                     cb_odd, solver_is_mixed(solver_flag));
-
-      QPhiX::masterPrintf(
-          "# QPHIX: Creating two-flavour QPhiX Wilson Twisted Clover Fermion Matrix...\n");
-      TwoFlavFermionMatrixQPhiX = new QPhiX::EvenOddNDTMCloverReuseOperator<FT, V, S, compress>(
-          -0.5 * g_mubar / g_kappa, 0.5 * g_epsbar / g_kappa, u_packed, qphix_clover,
-          qphix_invclov_odiag, qphix_inv_fullclover, &geom, t_boundary, coeff_s, coeff_t, use_tbc,
-          tbc_phases);
-      if (solver_is_mixed(solver_flag)) {
-        InnerTwoFlavFermionMatrixQPhiX =
-            new QPhiX::EvenOddNDTMCloverReuseOperator<FT_inner, V_inner, S_inner, compress_inner>(
-                -0.5 * g_mubar / g_kappa, 0.5 * g_epsbar / g_kappa, u_packed_inner,
-                qphix_clover_inner, qphix_invclov_odiag_inner, qphix_inv_fullclover_inner,
-                &geom_inner, t_boundary, coeff_s, coeff_t, use_tbc, tbc_phases);
-      }
-    } else {  // DBTMWILSON
-      QPhiX::masterPrintf(
-          "# QPHIX: Creating two-flavour QPhiX Wilson Twisted Mass Fermion Matrix...\n");
-      TwoFlavFermionMatrixQPhiX = new QPhiX::EvenOddNDTMWilsonReuseOperator<FT, V, S, compress>(
-          mass, -0.5 * g_mubar / g_kappa, 0.5 * g_epsbar / g_kappa, u_packed, &geom, t_boundary,
-          coeff_s, coeff_t, use_tbc, tbc_phases);
-      if (solver_is_mixed(solver_flag)) {
-        InnerTwoFlavFermionMatrixQPhiX =
-            new QPhiX::EvenOddNDTMWilsonReuseOperator<FT_inner, V_inner, S_inner, compress_inner>(
-                mass, -0.5 * g_mubar / g_kappa, 0.5 * g_epsbar / g_kappa, u_packed_inner,
-                &geom_inner, t_boundary, coeff_s, coeff_t, use_tbc, tbc_phases);
-      }
-    }
-
-    //
-    QPhiX::AbstractSolver<FT, V, S, compress, nf> *TwoFlavSolverQPhiX = nullptr;
-    QPhiX::AbstractSolver<FT_inner, V_inner, S_inner, compress_inner, nf> *InnerTwoFlavSolverQPhiX =
-        nullptr;
-    QPhiX::AbstractMultiSolver<FT, V, S, compress, nf> *TwoFlavMultiSolverQPhiX = nullptr;
-    if (solver_flag == DUMMYHERMTEST) {
-      QPhiX::masterPrintf("# QPHIX: Creating dummy solver for hermiticity test...\n");
-      TwoFlavSolverQPhiX = new QPhiX::InvDummyHermTest<
-          FT, V, S, compress, typename QPhiX::TwoFlavEvenOddLinearOperator<FT, V, S, compress> >(
-          *TwoFlavFermionMatrixQPhiX, max_iter);
-    } else if (solver_flag == CG) {
-      QPhiX::masterPrintf("# QPHIX: Creating CG solver...\n");
-      TwoFlavSolverQPhiX =
-          new QPhiX::InvCG<FT, V, S, compress,
-                           typename QPhiX::TwoFlavEvenOddLinearOperator<FT, V, S, compress> >(
-              *TwoFlavFermionMatrixQPhiX, max_iter);
-    } else if (solver_flag == BICGSTAB) {
-      QPhiX::masterPrintf("# QPHIX: Creating BiCGstab solver...\n");
-      TwoFlavSolverQPhiX =
-          new QPhiX::InvBiCGStab<FT, V, S, compress,
-                                 typename QPhiX::TwoFlavEvenOddLinearOperator<FT, V, S, compress> >(
-              *TwoFlavFermionMatrixQPhiX, max_iter);
-    } else if (solver_flag == MIXEDCG) {
-      QPhiX::masterPrintf("# QPHIX: Creating mixed-precision CG solver...\n");
-      InnerTwoFlavSolverQPhiX =
-          new QPhiX::InvCG<FT_inner, V_inner, S_inner, compress_inner,
-                           typename QPhiX::TwoFlavEvenOddLinearOperator<FT_inner, V_inner, S_inner,
-                                                                        compress_inner> >(
-              *InnerTwoFlavFermionMatrixQPhiX, max_iter);
-      const bool MMdag = true;
-      TwoFlavSolverQPhiX = new QPhiX::InvRichardsonMultiPrec<
-          FT, V, S, compress, FT_inner, V_inner, S_inner, compress_inner, MMdag,
-          typename QPhiX::TwoFlavEvenOddLinearOperator<FT, V, S, compress> >(
-          *TwoFlavFermionMatrixQPhiX, *InnerTwoFlavSolverQPhiX, solver_params.mcg_delta, max_iter);
-    } else if (solver_flag == CGMMSND) {
-      QPhiX::masterPrintf("# QPHIX: Creating multi-shift CG solver...\n");
-      TwoFlavMultiSolverQPhiX =
-          new QPhiX::MInvCG<FT, V, S, compress,
-                            typename QPhiX::TwoFlavEvenOddLinearOperator<FT, V, S, compress> >(
-              *TwoFlavFermionMatrixQPhiX, max_iter, num_shifts);
-    } else {
-      QPhiX::masterPrintf(" Solver not yet supported by QPhiX!\n");
-      QPhiX::masterPrintf(" Aborting...\n");
-      abort();
-    }
-    QPhiX::masterPrintf("# QPHIX: ...done.\n");
-
-    for (int fl : {0, 1}) {
-      //       reorder_eo_spinor_to_QPhiX(geom, reinterpret_cast<double const
-      //       *const>(tmlqcd_odd_in[0][fl]),
-      //                                  qphix_in[fl], cb_odd);
-      reorder_eo_spinor_to_QPhiX(geom, tmlqcd_odd_in[0][fl], qphix_in[fl], cb_odd);
-    }
-
-    QPhiX::masterPrintf("# QPHIX: Calling the solver...\n");
-
-    // Set the right precision for the QPhiX solver
-    // we get target_precision externally and and is given such, that it's either
-    // already relative or absolute
-    // Most QPhiX solvers allow setting absolute or relative residual
-    // by passing an appropriate flag, but this is not true for the multi-shift solver.
-    // As a result, we follow that solver and call ALL solvers with
-    // QPhiX::RELATIVE, which gives results consistent with tmLQCD in all cases.
-    double rhs_norm2 = 1.0;
-    QPhiX::norm2Spinor<FT, V, S, compress, nf>(rhs_norm2, qphix_in, geom, n_blas_simt);
-    const double RsdTarget = sqrt(target_precision / rhs_norm2);
-
-    // Calling the solver
-    start_time = gettime();
-    if (solver_flag == DUMMYHERMTEST) {
-      for (int fl : {0, 1}) {
-        random_spinor_field_eo(tmlqcd_odd_out[0][fl], 0, RN_GAUSS);
-        reorder_eo_spinor_to_QPhiX(geom, tmlqcd_odd_out[0][fl], qphix_buffer[fl], cb_odd);
-      }
-      for (int isign : {-1, 1}) {
-        (*TwoFlavSolverQPhiX)(qphix_buffer, qphix_in, RsdTarget, niters, rsd_final, site_flops,
-                              mv_apps, isign, verbose, cb_odd, QPhiX::RELATIVE);
-      }
-      QPhiX::copySpinor<FT, V, S, compress, nf>(qphix_out[0], qphix_buffer, geom, n_blas_simt);
-    } else if (solver_flag == CG || solver_flag == MIXEDCG) {
-      // USING CG:
-      // We are solving
-      //   M M^dagger qphix_buffer = qphix_in_prepared
-      // here, that is, isign = -1 for the QPhiX CG solver.
-      (*TwoFlavSolverQPhiX)(qphix_buffer, qphix_in, RsdTarget, niters, rsd_final, site_flops,
-                            mv_apps, -1, verbose, cb_odd, QPhiX::RELATIVE);
-      // After that. if required by the solution type, multiply with M^dagger:
-      //   qphix_out[1] = M^dagger M^dagger^-1 M^-1 qphix_in_prepared
-      if (solver_params.solution_type == TM_SOLUTION_M) {
-        (*TwoFlavFermionMatrixQPhiX)(qphix_out[0], qphix_buffer, /* conjugate */ -1);
-        mv_apps++;
-      } else {
-        QPhiX::copySpinor<FT, V, S, compress, nf>(qphix_out[0], qphix_buffer, geom, n_blas_simt);
-      }
-    } else if (solver_flag == BICGSTAB || solver_flag == MIXEDBICGSTAB) {
-      (*TwoFlavSolverQPhiX)(qphix_buffer, qphix_in, RsdTarget, niters, rsd_final, site_flops,
-                            mv_apps, 1, verbose, cb_odd, QPhiX::RELATIVE);
-      // for M^dagger^-1 M^-1 solution type, need to call BiCGstab twice
-      if (solver_params.solution_type == TM_SOLUTION_M_MDAG) {
-        (*TwoFlavSolverQPhiX)(qphix_out[0], qphix_buffer, RsdTarget, niters2, rsd_final, site_flops,
-                              mv_apps2, -1, verbose, cb_odd, QPhiX::RELATIVE);
-      } else {
-        QPhiX::copySpinor<FT, V, S, compress, nf>(qphix_out[0], qphix_buffer, geom, n_blas_simt);
-      }
-    } else if (solver_flag == CGMMSND) {
-      // TODO: handle the residuals properly
-      if (g_debug_level > 2) QPhiX::masterPrintf("# QPHIX CGMMSND: shifts: \n");
-      // tmLQCD weights the operator with 1/maxev in the RHMC relative to the shifts
-      // we will do this externally on the inverse (in monomial_solve) and thus need to weight
-      // the shifts by maxev^2
-      const double maxev_sq = (1.0 / phmc_invmaxev) * (1.0 / phmc_invmaxev);
-      for (int shift = 0; shift < num_shifts; shift++) {
-        RsdTargetArr[shift] = RsdTarget;
-        RsdFinalArr[shift] = -1.0;
-        shifts[shift] = maxev_sq * solver_params.shifts[shift] * solver_params.shifts[shift] /
-                        (4 * g_kappa * g_kappa);
-        if (g_debug_level > 2) QPhiX::masterPrintf("# [%d] = %lf\n", shift, shifts[shift]);
-      }
-      if (g_debug_level > 2) QPhiX::masterPrintf("\n");
-      (*TwoFlavMultiSolverQPhiX)(qphix_out.data(), qphix_in, num_shifts, shifts.data(),
-                                 RsdTargetArr.data(), niters, RsdFinalArr.data(), site_flops,
-                                 mv_apps, -1, verbose);
-      rsd_final = RsdFinalArr[0];
-    }
-    end_time = gettime();
-
-    for (int shift = 0; shift < num_shifts; shift++) {
-      for (int fl : {0, 1}) {
-        reorder_eo_spinor_from_QPhiX(geom, tmlqcd_odd_out[shift][fl], qphix_out[shift][fl], cb_odd,
-                                     rescale);
-      }
-    }
-
-    delete TwoFlavFermionMatrixQPhiX;
-    delete InnerTwoFlavFermionMatrixQPhiX;
-    delete InnerTwoFlavSolverQPhiX;
-    delete TwoFlavMultiSolverQPhiX;
-    delete TwoFlavSolverQPhiX;
-    for (int shift = 0; shift < num_shifts; shift++) {
-      delete[] qphix_out[shift];
-    }
-
-    if (qphix_clover) geom.free(qphix_clover);
-    if (qphix_invclov_odiag) geom.free(qphix_invclov_odiag);
-    if (qphix_clover_inner) geom_inner.free(qphix_clover_inner);
-    if (qphix_invclov_odiag_inner) geom_inner.free(qphix_invclov_odiag_inner);
-    for (int fl : {0, 1}) {
-      if (qphix_inv_fullclover[fl]) geom.free(qphix_inv_fullclover[fl]);
-      if (qphix_inv_fullclover_inner[fl]) geom_inner.free(qphix_inv_fullclover_inner[fl]);
-    }
-
-  } else {  // if(num_flavour)
-    // complain, this number of flavours is not valid
-  }  // if(num_flavour)
-
-  for (int cb : {0, 1}) {
-    if (u_packed[cb]) geom.free(u_packed[cb]);
-    if (u_packed_inner[cb]) geom_inner.free(u_packed_inner[cb]);
-  }
-
-  // FIXME: This should be called properly somewhere else
-  _endQphix();
-
-  QPhiX::masterPrintf("# ...done.\n\n");
-
-  uint64_t num_cb_sites = lattSize[0] / 2 * lattSize[1] * lattSize[2] * lattSize[3];
-  // FIXME: this needs to be adjusted depending on the operator used
-  uint64_t op_flops_per_site = 1320;
-  uint64_t total_flops =
-      (site_flops + site_flops2 + (2 * num_flavour * op_flops_per_site) * (mv_apps + mv_apps2)) *
-      num_cb_sites;
-  QPhiX::masterPrintf("# QPHIX: Solver Time = %g sec\n", (end_time - start_time));
-  QPhiX::masterPrintf("# QPHIX: Performance in GFLOPS = %g\n\n",
-                      1.0e-9 * total_flops / (end_time - start_time));
-
-  if (solver_is_mixed(solver_flag)) {
-    // the mixed solver reports the outer iterations, we would like to get
-    // some better total
-    niters = mv_apps / 2;
-    if (solver_flag == MIXEDBICGSTAB && solver_params.solution_type == TM_SOLUTION_M_MDAG) {
-      niters2 = mv_apps2 / 2;
-    }
-  }
-  // solver did not converge in maximum number of iterations
-  // FIXME: non-convergence does not work correctly yet
-  if ((niters + niters2) > max_iter) {
-    niters = -1;
-    niters2 = 0;
-  }
-  return (niters + niters2);
-}
-
-// Due to github issue #404, the helper functions to apply the full QPhiX operator
-// are currently disabled because they conflict with the new interfaces in QPhiX
-// itself. If required, these should be rewritten to use these interfaces
-// rather than the base classes in qphix_base_classes.hpp
-
-// Template wrapper for the Dslash operator call-able from C code
-// void Mfull_qphix(spinor *Even_out, spinor *Odd_out, const spinor *Even_in, const spinor *Odd_in,
-//                 const op_type_t op_type) {
-//  tmlqcd::checkQphixInputParameters(qphix_input);
-//  // FIXME: two-row gauge compression and double precision hard-coded
-//  _initQphix(0, nullptr, qphix_input, 12, QPHIX_DOUBLE_PREC);
-//
-//  if (qphix_precision == QPHIX_DOUBLE_PREC) {
-//    if (QPHIX_SOALEN > VECLEN_DP) {
-//      QPhiX::masterPrintf("SOALEN=%d is greater than the double prec VECLEN=%d\n", QPHIX_SOALEN,
-//                          VECLEN_DP);
-//      abort();
-//    }
-//    QPhiX::masterPrintf("TESTING IN DOUBLE PRECISION \n");
-//    if (compress12) {
-//      Mfull_helper<double, VECLEN_DP, QPHIX_SOALEN, true>(Even_out, Odd_out, Even_in, Odd_in,
-//                                                          op_type);
-//    } else {
-//      Mfull_helper<double, VECLEN_DP, QPHIX_SOALEN, false>(Even_out, Odd_out, Even_in, Odd_in,
-//                                                           op_type);
-//    }
-//  } else if (qphix_precision == QPHIX_FLOAT_PREC) {
-//    if (QPHIX_SOALEN > VECLEN_SP) {
-//      QPhiX::masterPrintf("SOALEN=%d is greater than the single prec VECLEN=%d\n", QPHIX_SOALEN,
-//                          VECLEN_SP);
-//      abort();
-//    }
-//    QPhiX::masterPrintf("TESTING IN SINGLE PRECISION \n");
-//    if (compress12) {
-//      Mfull_helper<float, VECLEN_SP, QPHIX_SOALEN, true>(Even_out, Odd_out, Even_in, Odd_in,
-//                                                         op_type);
-//    } else {
-//      Mfull_helper<float, VECLEN_SP, QPHIX_SOALEN, false>(Even_out, Odd_out, Even_in, Odd_in,
-//                                                          op_type);
-//    }
-//  }
-// #if (defined(QPHIX_MIC_SOURCE) || defined(QPHIX_AVX512_SOURCE))
-//  else if (qphix_precision == QPHIX_HALF_PREC) {
-//    if (QPHIX_SOALEN > VECLEN_HP) {
-//      QPhiX::masterPrintf("SOALEN=%d is greater than the half prec VECLEN=%d\n", QPHIX_SOALEN,
-//                          VECLEN_HP);
-//      abort();
-//    }
-//    QPhiX::masterPrintf("TESTING IN HALF PRECISION \n");
-//    if (compress12) {
-//      Mfull_helper<QPhiX::half, VECLEN_HP, QPHIX_SOALEN, true>(Even_out, Odd_out, Even_in, Odd_in,
-//                                                               op_type);
-//    } else {
-//      Mfull_helper<QPhiX::half, VECLEN_HP, QPHIX_SOALEN, false>(Even_out, Odd_out, Even_in,
-//      Odd_in,
-//                                                                op_type);
-//    }
-//  }
-// #endif
-//}
-
-// we have a unified interface for n-flavour inversions, but we need to provide wrappers
-// which can be called by the tmLQCD solver drivers for one and two-flavour inversions
-int invert_eo_qphix_oneflavour(spinor *Odd_out_1f, spinor *Odd_in_1f, const int max_iter,
-                               const double precision, const int solver_flag, const int rel_prec,
-                               const solver_params_t solver_params, const SloppyPrecision sloppy,
-                               const CompressionType compression) {
-  const int num_flavour = 1;
-  const int num_shifts = 1;
-  std::vector<std::vector<spinor *> > Odd_out;
-  std::vector<std::vector<spinor *> > Odd_in;
-
-  Odd_out.resize(num_shifts);
-  Odd_out[0].resize(num_flavour);
-  Odd_in.resize(1);
-  Odd_in[0].resize(num_flavour);
-
-  Odd_in[0][0] = Odd_in_1f;
-  Odd_out[0][0] = Odd_out_1f;
-
-  return invert_eo_qphix_nflavour_mshift(Odd_out, Odd_in, precision, max_iter, solver_flag,
-                                         rel_prec, solver_params, sloppy, compression, num_flavour);
-}
-
-int invert_eo_qphix_oneflavour_mshift(spinor **Odd_out_1f, spinor *Odd_in_1f, const int max_iter,
-                                      const double precision, const int solver_flag,
-                                      const int rel_prec, const solver_params_t solver_params,
-                                      const SloppyPrecision sloppy,
-                                      const CompressionType compression) {
-  // even though the default is set to 1, guard against zeroes
-  const int num_shifts = solver_params.no_shifts == 0 ? 1 : solver_params.no_shifts;
-  const int num_flavour = 1;
-  std::vector<std::vector<spinor *> > Odd_out;
-  std::vector<std::vector<spinor *> > Odd_in;
-
-  Odd_out.resize(num_shifts);
-  Odd_in.resize(1);
-  Odd_in[0].resize(num_flavour);
-
-  Odd_in[0][0] = Odd_in_1f;
-  for (int shift = 0; shift < num_shifts; shift++) {
-    Odd_out[shift].resize(num_flavour);
-    Odd_out[shift][0] = Odd_out_1f[shift];
-  }
-
-  return invert_eo_qphix_nflavour_mshift(Odd_out, Odd_in, precision, max_iter, solver_flag,
-                                         rel_prec, solver_params, sloppy, compression, num_flavour);
-}
-
-// Template wrapper for QPhiX solvers callable from C code, return number of iterations
-int invert_eo_qphix_twoflavour(spinor *Odd_out_s, spinor *Odd_out_c, spinor *Odd_in_s,
-                               spinor *Odd_in_c, const int max_iter, const double precision,
-                               const int solver_flag, const int rel_prec,
-                               const solver_params_t solver_params, const SloppyPrecision sloppy,
-                               const CompressionType compression) {
-  const int num_flavour = 2;
-  const int num_shifts = 1;
-  std::vector<std::vector<spinor *> > Odd_out;
-  std::vector<std::vector<spinor *> > Odd_in;
-
-  Odd_out.resize(num_shifts);
-  Odd_out[0].resize(num_flavour);
-  Odd_in.resize(1);
-  Odd_in[0].resize(num_flavour);
-
-  Odd_in[0][0] = Odd_in_s;
-  Odd_in[0][1] = Odd_in_c;
-
-  Odd_out[0][0] = Odd_out_s;
-  Odd_out[0][1] = Odd_out_c;
-
-  return invert_eo_qphix_nflavour_mshift(Odd_out, Odd_in, precision, max_iter, solver_flag,
-                                         rel_prec, solver_params, sloppy, compression, num_flavour);
-}
-
-int invert_eo_qphix_twoflavour_mshift(spinor **Odd_out_s, spinor **Odd_out_c, spinor *Odd_in_s,
-                                      spinor *Odd_in_c, const int max_iter, const double precision,
-                                      const int solver_flag, const int rel_prec,
-                                      const solver_params_t solver_params,
-                                      const SloppyPrecision sloppy,
-                                      const CompressionType compression) {
-  // even though the default is set to 1, guard against zeroes
-  const int num_shifts = solver_params.no_shifts == 0 ? 1 : solver_params.no_shifts;
-  const int num_flavour = 2;
-  std::vector<std::vector<spinor *> > Odd_out;
-  std::vector<std::vector<spinor *> > Odd_in;
-
-  Odd_out.resize(num_shifts);
-  Odd_in.resize(1);
-  Odd_in[0].resize(num_flavour);
-
-  Odd_in[0][0] = Odd_in_s;
-  Odd_in[0][1] = Odd_in_c;
-
-  for (int shift = 0; shift < num_shifts; shift++) {
-    Odd_out[shift].resize(num_flavour);
-    Odd_out[shift][0] = Odd_out_s[shift];
-    Odd_out[shift][1] = Odd_out_c[shift];
-  }
-
-  return invert_eo_qphix_nflavour_mshift(Odd_out, Odd_in, precision, max_iter, solver_flag,
-                                         rel_prec, solver_params, sloppy, compression, num_flavour);
-}
-
-// Template wrapper for QPhiX solvers callable from C code, return number of iterations
-// the interface is prepared for multi-rhs solves, hence the double vector for the input
-int invert_eo_qphix_nflavour_mshift(std::vector<std::vector<spinor *> > &Odd_out,
-                                    std::vector<std::vector<spinor *> > &Odd_in,
-                                    const double precision, const int max_iter,
-                                    const int solver_flag, const int rel_prec,
-                                    solver_params_t solver_params, const SloppyPrecision sloppy,
-                                    const CompressionType compression, const int num_flavour) {
-  tmlqcd::checkQphixInputParameters(qphix_input);
-  double target_precision = precision;
-  double src_norm = 0.0;
-  for (int f = 0; f < num_flavour; ++f) {
-    src_norm += square_norm(Odd_in[0][f], VOLUME / 2, 1);
-  }
-  // we use "precision_lambda" to determine if a system can be solved in half or float
-  // precision (when a fixed-precision solver is used)
-  double precision_lambda = target_precision / src_norm;
-  if (rel_prec == 1) {
-    QPhiX::masterPrintf("# QPHIX: Using relative precision\n");
-    target_precision = precision * src_norm;
-    precision_lambda = precision;
-  }
-  QPhiX::masterPrintf("# QPHIX: precision_lambda: %g, target_precision: %g\n\n", precision_lambda,
-                      target_precision);
-
-  // mixed solvers require inner and outer precisions, which we specify explicitly here
-  if (solver_is_mixed(solver_flag)) {
-#if (defined(QPHIX_MIC_SOURCE) || defined(QPHIX_AVX512_SOURCE))
-    if (sloppy == SLOPPY_HALF) {
-      if (QPHIX_SOALEN > VECLEN_DP || QPHIX_SOALEN > VECLEN_HP) {
-        QPhiX::masterPrintf(
-            "SOALEN=%d is greater than the half prec VECLEN=%d or the double prec VECLEN=%d\n",
-            QPHIX_SOALEN, VECLEN_HP, VECLEN_DP);
-        abort();
-      }
-      QPhiX::masterPrintf("# INITIALIZING QPHIX MIXED SOLVER\n");
-      QPhiX::masterPrintf("# USING DOUBLE-HALF PRECISION\n");
-      _initQphix(0, nullptr, qphix_input, compression, QPHIX_DOUBLE_PREC, QPHIX_HALF_PREC);
-      if (compress12) {
-        return invert_eo_qphix_helper<double, VECLEN_DP, QPHIX_SOALEN, true, QPhiX::half, VECLEN_HP,
-                                      QPHIX_SOALEN, true>(
-            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
-      } else {
-        return invert_eo_qphix_helper<double, VECLEN_DP, QPHIX_SOALEN, false, QPhiX::half,
-                                      VECLEN_HP, QPHIX_SOALEN, false>(
-            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
-      }
-    } else
-#else
-    if (sloppy == SLOPPY_HALF) {
-      QPhiX::masterPrintf("QPHIX interface: half precision not supported on this architecture!\n");
-      abort();
-    } else
-#endif
-        if (sloppy == SLOPPY_SINGLE) {
-      if (QPHIX_SOALEN > VECLEN_DP || QPHIX_SOALEN > VECLEN_SP) {
-        QPhiX::masterPrintf(
-            "SOALEN=%d is greater than the single prec VECLEN=%d or the double prec VECLEN=%d\n",
-            QPHIX_SOALEN, VECLEN_SP, VECLEN_DP);
-        abort();
-      }
-      QPhiX::masterPrintf("# INITIALIZING QPHIX MIXED SOLVER\n");
-      QPhiX::masterPrintf("# USING DOUBLE-SINGLE PRECISION\n");
-      _initQphix(0, nullptr, qphix_input, compression, QPHIX_DOUBLE_PREC, QPHIX_FLOAT_PREC);
-      if (compress12) {
-        return invert_eo_qphix_helper<double, VECLEN_DP, QPHIX_SOALEN, true, float, VECLEN_SP,
-                                      QPHIX_SOALEN, true>(
-            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
-      } else {
-        return invert_eo_qphix_helper<double, VECLEN_DP, QPHIX_SOALEN, false, float, VECLEN_SP,
-                                      QPHIX_SOALEN, false>(
-            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
-      }
-    } else {  // if(sloppy)
-      if (QPHIX_SOALEN > VECLEN_DP) {
-        QPhiX::masterPrintf("SOALEN=%d is greater than the double prec VECLEN=%d\n", QPHIX_SOALEN,
-                            VECLEN_DP);
-        abort();
-      }
-      QPhiX::masterPrintf("# INITIALIZING QPHIX MIXED SOLVER\n");
-      QPhiX::masterPrintf("# USING DOUBLE-DOUBLE PRECISION\n");
-      _initQphix(0, nullptr, qphix_input, compression, QPHIX_DOUBLE_PREC, QPHIX_DOUBLE_PREC);
-      if (compress12) {
-        return invert_eo_qphix_helper<double, VECLEN_DP, QPHIX_SOALEN, true>(
-            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
-      } else {
-        return invert_eo_qphix_helper<double, VECLEN_DP, QPHIX_SOALEN, false>(
-            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
-      }
-    }  // if( sloppy )
-  } else {  // if( solver_is_mixed )
-#if (defined(QPHIX_MIC_SOURCE) || defined(QPHIX_AVX512_SOURCE))
-    if (sloppy == SLOPPY_HALF || precision_lambda >= rsdTarget<QPhiX::half>::value) {
-      if (QPHIX_SOALEN > VECLEN_HP) {
-        QPhiX::masterPrintf("SOALEN=%d is greater than the half prec VECLEN=%d\n", QPHIX_SOALEN,
-                            VECLEN_HP);
-        abort();
-      }
-      QPhiX::masterPrintf("# INITIALIZING QPHIX SOLVER\n");
-      QPhiX::masterPrintf("# USING HALF PRECISION\n");
-      _initQphix(0, nullptr, qphix_input, compression, QPHIX_HALF_PREC);
-
-      if (compress12) {
-        return invert_eo_qphix_helper<QPhiX::half, VECLEN_HP, QPHIX_SOALEN, true>(
-            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
-      } else {
-        return invert_eo_qphix_helper<QPhiX::half, VECLEN_HP, QPHIX_SOALEN, false>(
-            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
-      }
-    } else
-#else
-    if (sloppy == SLOPPY_HALF) {
-      QPhiX::masterPrintf("QPHIX interface: half precision not supported on this architecture!\n");
-      abort();
-    } else
-#endif
-        if (sloppy == SLOPPY_SINGLE || precision_lambda >= rsdTarget<float>::value) {
-      if (QPHIX_SOALEN > VECLEN_SP) {
-        QPhiX::masterPrintf("SOALEN=%d is greater than the single prec VECLEN=%d\n", QPHIX_SOALEN,
-                            VECLEN_SP);
-        abort();
-      }
-      QPhiX::masterPrintf("# INITIALIZING QPHIX SOLVER\n");
-      QPhiX::masterPrintf("# USING SINGLE PRECISION\n");
-      _initQphix(0, nullptr, qphix_input, compression, QPHIX_FLOAT_PREC);
-
-      if (compress12) {
-        return invert_eo_qphix_helper<float, VECLEN_SP, QPHIX_SOALEN, true>(
-            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
-      } else {
-        return invert_eo_qphix_helper<float, VECLEN_SP, QPHIX_SOALEN, false>(
-            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
-      }
-    } else {
-      if (QPHIX_SOALEN > VECLEN_DP) {
-        QPhiX::masterPrintf("SOALEN=%d is greater than the double prec VECLEN=%d\n", QPHIX_SOALEN,
-                            VECLEN_DP);
-        abort();
-      }
-      QPhiX::masterPrintf("# INITIALIZING QPHIX SOLVER\n");
-      QPhiX::masterPrintf("# USING DOUBLE PRECISION\n");
-      _initQphix(0, nullptr, qphix_input, compression, QPHIX_DOUBLE_PREC);
-
-      if (compress12) {
-        return invert_eo_qphix_helper<double, VECLEN_DP, QPHIX_SOALEN, true>(
-            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
-      } else {
-        return invert_eo_qphix_helper<double, VECLEN_DP, QPHIX_SOALEN, false>(
-            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
-      }
-    }  // if( sloppy || target_precision )
-  }  // if ( solver_flag == *MIXEDCG )
-  return -1;
-}
-
-void tmlqcd::checkQphixInputParameters(const tm_QPhiXParams_t &params) {
-  if (params.MinCt == 0) {
-    QPhiX::masterPrintf("QPHIX Error: MinCt cannot be 0! Minimal value: 1. Aborting.\n");
-    abort();
-  }
-  if (params.By == 0 || params.Bz == 0) {
-    QPhiX::masterPrintf("QPHIX Error: By and Bz may not be 0! Minimal value: 1. Aborting.\n");
-    abort();
-  }
-  if (params.NCores * params.Sy * params.Sz != omp_num_threads) {
-    QPhiX::masterPrintf("QPHIX Error: NCores * Sy * Sz != ompnumthreads ! Aborting.\n");
-    abort();
-  }
-}
-
-void tmlqcd::printQphixDiagnostics(int VECLEN, int SOALEN, bool compress, int VECLEN_inner,
-                                   int SOALEN_inner, bool compress_inner) {
-  QPhiX::masterPrintf("# QphiX: VECLEN=%d SOALEN=%d VECLEN_inner=%d, SOALEN_inner=%d\n", VECLEN,
-                      SOALEN, VECLEN_inner, SOALEN_inner);
-
-  QPhiX::masterPrintf("# QphiX: Declared QMP Topology (xyzt):");
-  for (int mu = 0; mu < 4; mu++) QPhiX::masterPrintf(" %d", qmp_geom[mu]);
-  QPhiX::masterPrintf("\n");
-
-  QPhiX::masterPrintf("# QphiX: Mapping of dimensions QMP -> tmLQCD (xyzt):");
-  for (int mu = 0; mu < 4; mu++) QPhiX::masterPrintf(" %d->%d", mu, qmp_tm_map[mu]);
-  QPhiX::masterPrintf("\n");
-
-  QPhiX::masterPrintf("# QphiX: Global Lattice Size (xyzt) = ");
-  for (int mu = 0; mu < 4; mu++) {
-    QPhiX::masterPrintf(" %d", lattSize[mu]);
-  }
-  QPhiX::masterPrintf("\n");
-  QPhiX::masterPrintf("# QphiX: Local Lattice Size (xyzt) = ");
-  for (int mu = 0; mu < 4; mu++) {
-    QPhiX::masterPrintf(" %d", subLattSize[mu]);
-  }
-  QPhiX::masterPrintf("\n");
-  QPhiX::masterPrintf("# QphiX: Block Sizes: By= %d Bz=%d\n", By, Bz);
-  QPhiX::masterPrintf("# QphiX: Cores = %d\n", NCores);
-  QPhiX::masterPrintf("# QphiX: SMT Grid: Sy=%d Sz=%d\n", Sy, Sz);
-  QPhiX::masterPrintf("# QphiX: Pad Factors: PadXY=%d PadXYZ=%d\n", PadXY, PadXYZ);
-  QPhiX::masterPrintf("# QphiX: Threads_per_core = %d\n", N_simt);
-  QPhiX::masterPrintf("# QphiX: MinCt = %d\n", MinCt);
-  if (compress) {
-    QPhiX::masterPrintf("# QphiX: Using two-row gauge compression (compress12)\n");
-  }
-  if (compress_inner) {
-    QPhiX::masterPrintf("# QphiX: Inner solver using two-row gauge compression (compress12)\n");
-  }
-}
-
-void testSpinorPackers(spinor *Even_out, spinor *Odd_out, const spinor *const Even_in,
-                       const spinor *const Odd_in) {
-  tmlqcd::checkQphixInputParameters(qphix_input);
-  // FIXME: two-row gauge compression and double precision hard-coded
-  _initQphix(0, nullptr, qphix_input, 12, QPHIX_DOUBLE_PREC);
-
-  QPhiX::Geometry<double, VECLEN_SP, QPHIX_SOALEN, true> geom(subLattSize, By, Bz, NCores, Sy, Sz,
-                                                              PadXY, PadXYZ, MinCt);
-
-  auto qphix_cb_even = QPhiX::makeFourSpinorHandle(geom);
-  auto qphix_cb_odd = QPhiX::makeFourSpinorHandle(geom);
-
-  spinor **tmp;
-  init_solver_field(&tmp, VOLUME / 2, 2);
-
-  //   reorder_eo_spinor_to_QPhiX(geom, reinterpret_cast<double const *const>(Even_in),
-  //                              qphix_cb_even.get(), cb_even);
-  //   reorder_eo_spinor_to_QPhiX(geom, reinterpret_cast<double const *const>(Odd_in),
-  //                              qphix_cb_odd.get(), cb_odd);
-  reorder_eo_spinor_to_QPhiX(geom, Even_in, qphix_cb_even.get(), cb_even);
-  reorder_eo_spinor_to_QPhiX(geom, Odd_in, qphix_cb_odd.get(), cb_odd);
-
-  reorder_eo_spinor_from_QPhiX(geom, Even_out, qphix_cb_even.get(), cb_even, 1.0);
-  reorder_eo_spinor_from_QPhiX(geom, Odd_out, qphix_cb_odd.get(), cb_odd, 1.0);
-
-  diff(tmp[0], Even_out, Even_in, VOLUME / 2);
-  diff(tmp[1], Odd_out, Odd_in, VOLUME / 2);
-  double l2norm = square_norm(tmp[0], VOLUME / 2, 1) + square_norm(tmp[1], VOLUME / 2, 1);
-  QPhiX::masterPrintf("QPHIX eo spinor packer back and forth difference L2 norm: %lf\n", l2norm);
-  finalize_solver(tmp, 2);
-}
diff --git a/qphix_interface.hpp b/qphix_interface.hpp
deleted file mode 100644
index b487eda66..000000000
--- a/qphix_interface.hpp
+++ /dev/null
@@ -1,51 +0,0 @@
-/***********************************************************************
- *
- * Copyright (C) 2017 Bartosz Kostrzewa
- *
- * This file is part of tmLQCD.
- *
- * tmLQCD is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * tmLQCD is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
- *
- ***********************************************************************/
-
-#pragma once
-
-#include "global.h"
-#include "qphix_types.h"
-
-#ifdef __cplusplus /* If this is a C++ compiler, use C linkage */
-extern "C" {
-#endif
-
-#include "misc_types.h"
-#include "operator_types.h"
-#include "solver/matrix_mult_typedef.h"
-#include "solver/solver_params.h"
-#include "su3.h"
-
-#ifdef __cplusplus
-}
-#endif
-
-#include <vector>
-
-int invert_eo_qphix_nflavour_mshift(std::vector< std::vector< spinor* > > &Odd_out, 
-                                    std::vector< std::vector< spinor* > > &Odd_in, 
-                                    const double precision,
-                                    const int max_iter,
-                                    const int solver_flag, 
-                                    const int rel_prec,
-                                    solver_params_t solver_params,
-                                    const SloppyPrecision sloppy, const CompressionType compression,
-                                    const int num_flavour);
\ No newline at end of file
diff --git a/qphix_interface_utils.hpp b/qphix_interface_utils.hpp
deleted file mode 100644
index 56d8afe56..000000000
--- a/qphix_interface_utils.hpp
+++ /dev/null
@@ -1,33 +0,0 @@
-/***********************************************************************
- *
- * Copyright (C) 2015 Mario Schroeck
- *               2016 Peter Labus
- *               2017 Peter Labus, Martin Ueding, Bartosz Kostrzewa
- *
- * This file is part of tmLQCD.
- *
- * tmLQCD is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * tmLQCD is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
- *
- ***********************************************************************/
-
-#pragma once
-
-#include "qphix_types.h"
-
-namespace tmlqcd {
-
-void checkQphixInputParameters(const tm_QPhiXParams_t &params);
-void printQphixDiagnostics(int VECLEN, int SOALEN, bool compress, int VECLEN_inner, int SOALEN_inner, bool compress_inner);
-
-}  // namespace tmlqcd
diff --git a/src/bin/LapH_ev.c b/src/bin/LapH_ev.c
index dd96133fb..08e810b36 100644
--- a/src/bin/LapH_ev.c
+++ b/src/bin/LapH_ev.c
@@ -63,20 +63,20 @@ int main(int argc, char *argv[]) {
   tmlqcd_mpi_init(argc, argv);
 
   if (g_proc_id == 0) {
-#ifdef _GAUGE_COPY
-    printf("# The code was compiled with -D_GAUGE_COPY\n");
+#ifdef TM_GAUGE_COPY
+    printf("# The code was compiled with -DTM_GAUGE_COPY\n");
 #endif
-#ifdef _USE_HALFSPINOR
-    printf("# The code was compiled with -D_USE_HALFSPINOR\n");
+#ifdef TM_USE_HALFSPINOR
+    printf("# The code was compiled with -DTM_USE_HALFSPINOR\n");
 #endif
-#ifdef _USE_SHMEM
-    printf("# the code was compiled with -D_USE_SHMEM\n");
-#ifdef _PERSISTENT
+#ifdef TM_USE_SHMEM
+    printf("# the code was compiled with -DTM_USE_SHMEM\n");
+#ifdef TM_PERSISTENT
     printf("# the code was compiled for persistent MPI calls (halfspinor only)\n");
 #endif
 #endif
 #ifdef TM_USE_MPI
-#ifdef _NON_BLOCKING
+#ifdef TM_NON_BLOCKING
     printf("# the code was compiled for non-blocking MPI calls (spinor and gauge)\n");
 #endif
 #endif
@@ -98,8 +98,8 @@ int main(int argc, char *argv[]) {
   exit(0);
 #endif
 #endif
-#ifdef FIXEDVOLUME
-  printf(" Error: FIXEDVOLUME not allowed");
+#ifdef TM_FIXEDVOLUME
+  printf(" Error: TM_FIXEDVOLUME not allowed");
   exit(0);
 #endif
 
diff --git a/src/bin/benchmark.c b/src/bin/benchmark.c
index 3dd70a86b..72d8c8f4d 100644
--- a/src/bin/benchmark.c
+++ b/src/bin/benchmark.c
@@ -33,7 +33,7 @@
 #include <time.h>
 #ifdef TM_USE_MPI
 #include <mpi.h>
-#ifdef HAVE_LIBLEMON
+#ifdef TM_USE_LEMON
 #include <io/gauge.h>
 #include <io/params.h>
 #endif
@@ -61,19 +61,19 @@
 #include "test/check_geometry.h"
 #include "xchange/xchange.h"
 
-#ifdef PARALLELT
+#ifdef TM_PARALLELT
 #define SLICE (LX * LY * LZ / 2)
-#elif defined PARALLELXT
+#elif defined TM_PARALLELXT
 #define SLICE ((LX * LY * LZ / 2) + (T * LY * LZ / 2))
-#elif defined PARALLELXYT
+#elif defined TM_PARALLELXYT
 #define SLICE ((LX * LY * LZ / 2) + (T * LY * LZ / 2) + (T * LX * LZ / 2))
-#elif defined PARALLELXYZT
+#elif defined TM_PARALLELXYZT
 #define SLICE ((LX * LY * LZ / 2) + (T * LY * LZ / 2) + (T * LX * LZ / 2) + (T * LX * LY / 2))
-#elif defined PARALLELX
+#elif defined TM_PARALLELX
 #define SLICE ((LY * LZ * T / 2))
-#elif defined PARALLELXY
+#elif defined TM_PARALLELXY
 #define SLICE ((LY * LZ * T / 2) + (LX * LZ * T / 2))
-#elif defined PARALLELXYZ
+#elif defined TM_PARALLELXYZ
 #define SLICE ((LY * LZ * T / 2) + (LX * LZ * T / 2) + (LX * LY * T / 2))
 #endif
 
@@ -81,7 +81,7 @@ int check_xchange();
 
 int main(int argc, char *argv[]) {
   int j, j_max, k, k_max = 1;
-#ifdef HAVE_LIBLEMON
+#ifdef TM_USE_LEMON
   paramsXlfInfo *xlfInfo;
 #endif
   int status = 0;
@@ -123,20 +123,20 @@ int main(int argc, char *argv[]) {
   tmlqcd_mpi_init(argc, argv);
 
   if (g_proc_id == 0) {
-#ifdef _GAUGE_COPY
-    printf("# The code was compiled with -D_GAUGE_COPY\n");
+#ifdef TM_GAUGE_COPY
+    printf("# The code was compiled with -DTM_GAUGE_COPY\n");
 #endif
-#ifdef _USE_HALFSPINOR
-    printf("# The code was compiled with -D_USE_HALFSPINOR\n");
+#ifdef TM_USE_HALFSPINOR
+    printf("# The code was compiled with -DTM_USE_HALFSPINOR\n");
 #endif
-#ifdef _USE_SHMEM
-    printf("# The code was compiled with -D_USE_SHMEM\n");
-#ifdef _PERSISTENT
+#ifdef TM_USE_SHMEM
+    printf("# The code was compiled with -DTM_USE_SHMEM\n");
+#ifdef TM_PERSISTENT
     printf("# The code was compiled for persistent MPI calls (halfspinor only)\n");
 #endif
 #endif
 #ifdef TM_USE_MPI
-#ifdef _NON_BLOCKING
+#ifdef TM_NON_BLOCKING
     printf("# The code was compiled for non-blocking MPI calls (spinor and gauge)\n");
 #endif
 #endif
@@ -144,7 +144,7 @@ int main(int argc, char *argv[]) {
     fflush(stdout);
   }
 
-#ifdef _GAUGE_COPY
+#ifdef TM_GAUGE_COPY
   init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 1);
 #else
   init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 0);
@@ -186,7 +186,7 @@ int main(int argc, char *argv[]) {
   /* define the boundary conditions for the fermion fields */
   boundary(g_kappa);
 
-#ifdef _USE_HALFSPINOR
+#ifdef TM_USE_HALFSPINOR
   j = init_dirac_halfspinor();
   if (j != 0) {
     fprintf(stderr, "Not enough memory for halfspinor fields! Aborting...\n");
@@ -200,7 +200,7 @@ int main(int argc, char *argv[]) {
       exit(0);
     }
   }
-#if (defined _PERSISTENT)
+#if (defined TM_PERSISTENT)
   init_xchange_halffield();
 #endif
 #endif
@@ -210,7 +210,7 @@ int main(int argc, char *argv[]) {
     fprintf(stderr, "Checking of geometry failed. Unable to proceed.\nAborting....\n");
     exit(1);
   }
-#if (defined TM_USE_MPI && !(defined _USE_SHMEM))
+#if (defined TM_USE_MPI && !(defined TM_USE_SHMEM))
   check_xchange();
 #endif
 
@@ -344,7 +344,7 @@ int main(int argc, char *argv[]) {
     sdt = sdt / ((double)(2 * SLICE));
     if (g_proc_id == 0) {
       printf("# The size of the package is %d bytes.\n", (SLICE) * 192);
-#ifdef _USE_HALFSPINOR
+#ifdef TM_USE_HALFSPINOR
       printf("# The bandwidth is %5.2f + %5.2f MB/sec\n", 192. / sdt / 1024 / 1024,
              192. / sdt / 1024. / 1024);
 #else
@@ -431,7 +431,7 @@ int main(int argc, char *argv[]) {
     }
   }
 
-#ifdef HAVE_LIBLEMON
+#ifdef TM_USE_LEMON
   if (g_proc_id == 0) {
     printf("# Performing parallel IO test ...\n");
   }
diff --git a/src/bin/deriv_mg_tune.c b/src/bin/deriv_mg_tune.c
index d3abb66ee..75595bc60 100644
--- a/src/bin/deriv_mg_tune.c
+++ b/src/bin/deriv_mg_tune.c
@@ -64,7 +64,7 @@
 #include "solver/solver.h"
 #include "test/check_geometry.h"
 #include "update_tm.h"
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
 #include "DDalphaAMG_interface.h"
 #endif
 #ifdef TM_USE_QUDA
@@ -98,7 +98,7 @@ int main(int argc, char *argv[]) {
 
   init_critical_globals(TM_PROGRAM_DERIV_MG_TUNE);
 
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst init
 #pragma pomp inst begin(main)
 #endif
@@ -136,7 +136,7 @@ int main(int argc, char *argv[]) {
 
   g_mu = g_mu1;
 
-#ifdef _GAUGE_COPY
+#ifdef TM_GAUGE_COPY
   status = init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 1);
   status += init_gauge_field_32(VOLUMEPLUSRAND + g_dbw2rand, 1);
 #else
@@ -204,7 +204,7 @@ int main(int argc, char *argv[]) {
     exit(1);
   }
 
-#ifdef _USE_HALFSPINOR
+#ifdef TM_USE_HALFSPINOR
   j = init_dirac_halfspinor();
   if (j != 0) {
     fprintf(stderr, "Not enough memory for halffield! Aborting...\n");
@@ -217,7 +217,7 @@ int main(int argc, char *argv[]) {
     exit(-1);
   }
 
-#if (defined _PERSISTENT)
+#if (defined TM_PERSISTENT)
   init_xchange_halffield();
 #endif
 #endif
@@ -367,7 +367,7 @@ int main(int argc, char *argv[]) {
 #endif
 
   return (0);
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst end(main)
 #endif
 }
diff --git a/src/bin/hmc_tm.c b/src/bin/hmc_tm.c
index 2db6f8c1b..0d95a3b3c 100644
--- a/src/bin/hmc_tm.c
+++ b/src/bin/hmc_tm.c
@@ -67,7 +67,7 @@
 #include "solver/solver.h"
 #include "test/check_geometry.h"
 #include "update_tm.h"
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
 #include "DDalphaAMG_interface.h"
 #endif
 #ifdef TM_USE_QUDA
@@ -113,7 +113,7 @@ int main(int argc, char *argv[]) {
 
   init_critical_globals(TM_PROGRAM_HMC_TM);
 
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst init
 #pragma pomp inst begin(main)
 #endif
@@ -168,7 +168,7 @@ int main(int argc, char *argv[]) {
 
   g_mu = g_mu1;
 
-#ifdef _GAUGE_COPY
+#ifdef TM_GAUGE_COPY
   status = init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 1);
   status += init_gauge_field_32(VOLUMEPLUSRAND + g_dbw2rand, 1);
 #else
@@ -257,7 +257,7 @@ int main(int argc, char *argv[]) {
     exit(1);
   }
 
-#ifdef _USE_HALFSPINOR
+#ifdef TM_USE_HALFSPINOR
   j = init_dirac_halfspinor();
   if (j != 0) {
     fprintf(stderr, "Not enough memory for halffield! Aborting...\n");
@@ -270,7 +270,7 @@ int main(int argc, char *argv[]) {
     exit(-1);
   }
 
-#if (defined _PERSISTENT)
+#if (defined TM_PERSISTENT)
   init_xchange_halffield();
 #endif
 #endif
@@ -504,7 +504,7 @@ int main(int argc, char *argv[]) {
     }
 
     /* online measurements */
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
     // When the configuration is rejected, we have to update it in the MG and redo the setup.
     int mg_update = accept ? 0 : 1;
 #endif
@@ -514,7 +514,7 @@ int main(int argc, char *argv[]) {
         if (g_proc_id == 0) {
           fprintf(stdout, "#\n# Beginning online measurement.\n");
         }
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
         if (mg_update) {
           mg_update = 0;
           MG_reset();
@@ -591,7 +591,7 @@ int main(int argc, char *argv[]) {
 #endif
 
   return (0);
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst end(main)
 #endif
 }
diff --git a/src/bin/invert.c b/src/bin/invert.c
index 007e0ea41..c3111decb 100644
--- a/src/bin/invert.c
+++ b/src/bin/invert.c
@@ -84,7 +84,7 @@
 #ifdef TM_USE_QPHIX
 #include "qphix_interface.h"
 #endif
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
 #include "DDalphaAMG_interface.h"
 #endif
 #include "expo.h"
@@ -114,7 +114,7 @@ int main(int argc, char *argv[]) {
 
   init_critical_globals(TM_PROGRAM_INVERT);
 
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst init
 #pragma pomp inst begin(main)
 #endif
@@ -165,7 +165,7 @@ int main(int argc, char *argv[]) {
   g_dbw2rand = 0;
 #endif
 
-#ifdef _GAUGE_COPY
+#ifdef TM_GAUGE_COPY
   j = init_gauge_field(VOLUMEPLUSRAND, 1);
   j += init_gauge_field_32(VOLUMEPLUSRAND, 1);
 #else
@@ -246,7 +246,7 @@ int main(int argc, char *argv[]) {
   init_measurements();
 
   /* this could be maybe moved to init_operators */
-#ifdef _USE_HALFSPINOR
+#ifdef TM_USE_HALFSPINOR
   j = init_dirac_halfspinor();
   if (j != 0) {
     fprintf(stderr, "Not enough memory for halffield! Aborting...\n");
@@ -258,7 +258,7 @@ int main(int argc, char *argv[]) {
     fprintf(stderr, "Not enough memory for 32-bit halffield! Aborting...\n");
     exit(-1);
   }
-#if (defined _PERSISTENT)
+#if (defined TM_PERSISTENT)
   if (even_odd_flag) init_xchange_halffield();
 #endif
 #endif
@@ -362,7 +362,7 @@ int main(int argc, char *argv[]) {
     g_precWS = NULL;
     if (use_preconditioning == 1) {
       /* todo load fftw wisdom */
-#if (defined HAVE_FFTW) && !(defined TM_USE_MPI)
+#if (defined TM_USE_FFTW) && !(defined TM_USE_MPI)
       loadFFTWWisdom(g_spinor_field[0], g_spinor_field[1], T, LX);
 #else
       use_preconditioning = 0;
@@ -457,7 +457,7 @@ int main(int argc, char *argv[]) {
   MPI_Finalize();
 #endif
   return (0);
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst end(main)
 #endif
 }
diff --git a/src/bin/offline_measurement.c b/src/bin/offline_measurement.c
index c2ae72f9c..72a828fb7 100644
--- a/src/bin/offline_measurement.c
+++ b/src/bin/offline_measurement.c
@@ -83,7 +83,7 @@ int main(int argc, char *argv[]) {
 
   init_critical_globals(TM_PROGRAM_OFFLINE_MEASUREMENT);
 
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst init
 #pragma pomp inst begin(main)
 #endif
@@ -127,7 +127,7 @@ int main(int argc, char *argv[]) {
   g_dbw2rand = 0;
 #endif
 
-#ifdef _GAUGE_COPY
+#ifdef TM_GAUGE_COPY
   j = init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 1);
 #else
   j = init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 0);
@@ -210,7 +210,7 @@ int main(int argc, char *argv[]) {
   init_measurements();
 
   /* this could be maybe moved to init_operators */
-#ifdef _USE_HALFSPINOR
+#ifdef TM_USE_HALFSPINOR
   j = init_dirac_halfspinor();
   if (j != 0) {
     fprintf(stderr, "Not enough memory for halffield! Aborting...\n");
@@ -223,7 +223,7 @@ int main(int argc, char *argv[]) {
       exit(-1);
     }
   }
-#if (defined _PERSISTENT)
+#if (defined TM_PERSISTENT)
   if (even_odd_flag) init_xchange_halffield();
 #endif
 #endif
@@ -307,7 +307,7 @@ int main(int argc, char *argv[]) {
 #endif
   return (0);
 
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst end(main)
 #endif
 }
diff --git a/src/bin/check_locallity.c b/src/bin/tests/check_locallity.c
similarity index 98%
rename from src/bin/check_locallity.c
rename to src/bin/tests/check_locallity.c
index 52ea21209..f03806f21 100644
--- a/src/bin/check_locallity.c
+++ b/src/bin/tests/check_locallity.c
@@ -77,13 +77,13 @@ int main(int argc, char *argv[]) {
   double *norm;
   struct stout_parameters params_smear;
 
-#ifdef _GAUGE_COPY
+#ifdef TM_GAUGE_COPY
   int kb = 0;
 #endif
 #ifdef TM_USE_MPI
   double atime = 0., etime = 0.;
 #endif
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst init
 #pragma pomp inst begin(main)
 #endif
@@ -144,7 +144,7 @@ int main(int argc, char *argv[]) {
   g_dbw2rand = 0;
 #endif
 
-#ifdef _GAUGE_COPY
+#ifdef TM_GAUGE_COPY
   j = init_gauge_field(VOLUMEPLUSRAND, 1);
 #else
   j = init_gauge_field(VOLUMEPLUSRAND, 0);
@@ -186,7 +186,7 @@ int main(int argc, char *argv[]) {
   /* define the boundary conditions for the fermion fields */
   boundary();
 
-#ifdef _USE_HALFSPINOR
+#ifdef TM_USE_HALFSPINOR
   j = init_dirac_halfspinor();
   if (j != 0) {
     fprintf(stderr, "Not enough memory for halffield! Aborting...\n");
@@ -199,7 +199,7 @@ int main(int argc, char *argv[]) {
       exit(-1);
     }
   }
-#if (defined _PERSISTENT)
+#if (defined TM_PERSISTENT)
   init_xchange_halffield();
 #endif
 #endif
@@ -312,7 +312,7 @@ int main(int argc, char *argv[]) {
   free_spinor_field();
   free_moment_field();
   return (0);
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst end(main)
 #endif
 }
diff --git a/src/bin/hopping_test.c b/src/bin/tests/hopping_test.c
similarity index 94%
rename from src/bin/hopping_test.c
rename to src/bin/tests/hopping_test.c
index 04df878e5..da60c83ba 100644
--- a/src/bin/hopping_test.c
+++ b/src/bin/tests/hopping_test.c
@@ -34,7 +34,7 @@
 #include <time.h>
 #ifdef TM_USE_MPI
 #include <mpi.h>
-#ifdef HAVE_LIBLEMON
+#ifdef TM_USE_LEMON
 #include <io/gauge.h>
 #include <io/params.h>
 #endif
@@ -59,19 +59,19 @@
 #include "test/check_geometry.h"
 #include "xchange/xchange.h"
 
-#ifdef PARALLELT
+#ifdef TM_PARALLELT
 #define SLICE (LX * LY * LZ / 2)
-#elif defined PARALLELXT
+#elif defined TM_PARALLELXT
 #define SLICE ((LX * LY * LZ / 2) + (T * LY * LZ / 2))
-#elif defined PARALLELXYT
+#elif defined TM_PARALLELXYT
 #define SLICE ((LX * LY * LZ / 2) + (T * LY * LZ / 2) + (T * LX * LZ / 2))
-#elif defined PARALLELXYZT
+#elif defined TM_PARALLELXYZT
 #define SLICE ((LX * LY * LZ / 2) + (T * LY * LZ / 2) + (T * LX * LZ / 2) + (T * LX * LY / 2))
-#elif defined PARALLELX
+#elif defined TM_PARALLELX
 #define SLICE ((LY * LZ * T / 2))
-#elif defined PARALLELXY
+#elif defined TM_PARALLELXY
 #define SLICE ((LY * LZ * T / 2) + (LX * LZ * T / 2))
-#elif defined PARALLELXYZ
+#elif defined TM_PARALLELXYZ
 #define SLICE ((LY * LZ * T / 2) + (LX * LZ * T / 2) + (LX * LY * T / 2))
 #endif
 
@@ -102,20 +102,20 @@ int main(int argc, char *argv[]) {
   tmlqcd_mpi_init(argc, argv);
 
   if (g_proc_id == 0) {
-#ifdef _GAUGE_COPY
-    printf("# The code was compiled with -D_GAUGE_COPY\n");
+#ifdef TM_GAUGE_COPY
+    printf("# The code was compiled with -DTM_GAUGE_COPY\n");
 #endif
-#ifdef _USE_HALFSPINOR
-    printf("# The code was compiled with -D_USE_HALFSPINOR\n");
+#ifdef TM_USE_HALFSPINOR
+    printf("# The code was compiled with -DTM_USE_HALFSPINOR\n");
 #endif
-#ifdef _USE_SHMEM
-    printf("# the code was compiled with -D_USE_SHMEM\n");
-#ifdef _PERSISTENT
+#ifdef TM_USE_SHMEM
+    printf("# the code was compiled with -DTM_USE_SHMEM\n");
+#ifdef TM_PERSISTENT
     printf("# the code was compiled for persistent MPI calls (halfspinor only)\n");
 #endif
 #endif
 #ifdef TM_USE_MPI
-#ifdef _NON_BLOCKING
+#ifdef TM_NON_BLOCKING
     printf("# the code was compiled for non-blocking MPI calls (spinor and gauge)\n");
 #endif
 #endif
@@ -123,7 +123,7 @@ int main(int argc, char *argv[]) {
     fflush(stdout);
   }
 
-#ifdef _GAUGE_COPY
+#ifdef TM_GAUGE_COPY
   init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 1);
 #else
   init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 0);
@@ -165,7 +165,7 @@ int main(int argc, char *argv[]) {
   /* define the boundary conditions for the fermion fields */
   boundary(g_kappa);
 
-#ifdef _USE_HALFSPINOR
+#ifdef TM_USE_HALFSPINOR
   j = init_dirac_halfspinor();
   if (j != 0) {
     fprintf(stderr, "Not enough memory for halfspinor fields! Aborting...\n");
@@ -179,7 +179,7 @@ int main(int argc, char *argv[]) {
       exit(0);
     }
   }
-#if (defined _PERSISTENT)
+#if (defined TM_PERSISTENT)
   init_xchange_halffield();
 #endif
 #endif
@@ -190,7 +190,7 @@ int main(int argc, char *argv[]) {
     exit(1);
   }
 
-#if (defined TM_USE_MPI && !(defined _USE_SHMEM))
+#if (defined TM_USE_MPI && !(defined TM_USE_SHMEM))
   check_xchange();
 #endif
 
diff --git a/src/bin/qphix_test_Dslash.c b/src/bin/tests/qphix_test_Dslash.c
similarity index 99%
rename from src/bin/qphix_test_Dslash.c
rename to src/bin/tests/qphix_test_Dslash.c
index 56250bc5a..b4218d3e6 100644
--- a/src/bin/qphix_test_Dslash.c
+++ b/src/bin/tests/qphix_test_Dslash.c
@@ -35,7 +35,7 @@
 #include <time.h>
 #ifdef TM_USE_MPI
 #include <mpi.h>
-#ifdef HAVE_LIBLEMON
+#ifdef TM_USE_LEMON
 #include <io/gauge.h>
 #include <io/params.h>
 #endif
@@ -86,7 +86,7 @@ double compare_spinors(spinor* s1, spinor* s2);
 
 int main(int argc, char* argv[]) {
   int j;
-#ifdef HAVE_LIBLEMON
+#ifdef TM_USE_LEMON
   paramsXlfInfo* xlfInfo;
 #endif
   int status = 0;
@@ -105,7 +105,7 @@ int main(int argc, char* argv[]) {
   tmlqcd_mpi_init(argc, argv);
   g_dbw2rand = 0;
 
-#ifdef _GAUGE_COPY
+#ifdef TM_GAUGE_COPY
   init_gauge_field(VOLUMEPLUSRAND, 1);
 #else
   init_gauge_field(VOLUMEPLUSRAND, 0);
@@ -135,7 +135,7 @@ int main(int argc, char* argv[]) {
   /* define the geometry */
   geometry();
 
-#ifdef _USE_HALFSPINOR
+#ifdef TM_USE_HALFSPINOR
   j = init_dirac_halfspinor();
   if (j != 0) {
     fprintf(stderr, "Not enough memory for halfspinor fields! Aborting...\n");
@@ -146,7 +146,7 @@ int main(int argc, char* argv[]) {
     fprintf(stderr, "Not enough memory for 32-Bit halfspinor fields! Aborting...\n");
     exit(0);
   }
-#if (defined _PERSISTENT)
+#if (defined TM_PERSISTENT)
   init_xchange_halffield();
 #endif
 #endif
@@ -180,7 +180,7 @@ int main(int argc, char* argv[]) {
 #endif
 
   g_update_gauge_copy = 1;
-#ifdef _GAUGE_COPY
+#ifdef TM_GAUGE_COPY
   update_backward_gauge(g_gauge_field);
 #endif
 
diff --git a/src/bin/scalar_prod_r_test.c b/src/bin/tests/scalar_prod_r_test.c
similarity index 100%
rename from src/bin/scalar_prod_r_test.c
rename to src/bin/tests/scalar_prod_r_test.c
diff --git a/src/bin/test_eigenvalues.c b/src/bin/tests/test_eigenvalues.c
similarity index 98%
rename from src/bin/test_eigenvalues.c
rename to src/bin/tests/test_eigenvalues.c
index 053944698..759d8dd2f 100644
--- a/src/bin/test_eigenvalues.c
+++ b/src/bin/tests/test_eigenvalues.c
@@ -227,7 +227,7 @@ int main(int argc, char *argv[]) {
   g_eps_sq_acc = g_eps_sq_acc1;
   g_eps_sq_force = g_eps_sq_force1;
 
-#ifdef _GAUGE_COPY
+#ifdef TM_GAUGE_COPY
   j = init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 1);
 #else
   j = init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 0);
@@ -274,11 +274,11 @@ int main(int argc, char *argv[]) {
 
     parameterfile = fopen(parameterfilename, "w");
     printf("# This is the hmc code for twisted Mass Wilson QCD\n\nVersion %s\n", Version);
-#ifdef _NEW_GEOMETRY
-    printf("# The code was compiled with -D_NEW_GEOMETRY\n");
+#ifdef TM_NEW_GEOMETRY
+    printf("# The code was compiled with -DTM_NEW_GEOMETRY\n");
 #endif
-#ifdef _GAUGE_COPY
-    printf("# The code was compiled with -D_GAUGE_COPY\n");
+#ifdef TM_GAUGE_COPY
+    printf("# The code was compiled with -DTM_GAUGE_COPY\n");
 #endif
     printf("# The lattice size is %d x %d x %d x %d\n", (int)(T * g_nproc_t), (int)(LX * g_nproc_x),
            (int)(LY), (int)(LZ));
@@ -430,7 +430,7 @@ int main(int argc, char *argv[]) {
 #ifdef TM_USE_MPI
   xchange_gauge(g_gauge_field);
 #endif
-#ifdef _GAUGE_COPY
+#ifdef TM_GAUGE_COPY
   update_backward_gauge();
 #endif
 
diff --git a/src/bin/test_lemon.c b/src/bin/tests/test_lemon.c
similarity index 99%
rename from src/bin/test_lemon.c
rename to src/bin/tests/test_lemon.c
index f2147ad3f..3cef7689c 100644
--- a/src/bin/test_lemon.c
+++ b/src/bin/tests/test_lemon.c
@@ -66,7 +66,7 @@ int main(int argc, char *argv[]) {
 
   tmlqcd_mpi_init(argc, argv);
 
-#ifdef _GAUGE_COPY
+#ifdef TM_GAUGE_COPY
   init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 1);
 #else
   init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 0);
@@ -108,7 +108,7 @@ int main(int argc, char *argv[]) {
   xlfInfo = construct_paramsXlfInfo(plaquette_energy, 0);
   write_lime_gauge_field("conf.lime", 64, xlfInfo);
 
-#ifdef HAVE_LIBLEMON
+#ifdef TM_USE_LEMON
   if (g_proc_id == 0) {
     printf("Now we do write with lemon to conf.lemon...\n");
   }
diff --git a/src/lib/CMakeLists.txt b/src/lib/CMakeLists.txt
index 4ace6c997..746b40c0d 100644
--- a/src/lib/CMakeLists.txt
+++ b/src/lib/CMakeLists.txt
@@ -369,11 +369,11 @@ list(
 
 list(APPEND TEST_SRC_C test/check_xchange.c test/check_geometry.c
      test/overlaptests.c)
-if(TMLQCD_USE_QPHIX)
+if(TM_USE_QPHIX)
   list(APPEND MAIN_SRC_C QphiX/qphix_interface.cpp)
 endif()
 
-if(TMLQCD_USE_QUDA)
+if(TM_USE_QUDA)
   list(APPEND MAIN_SRC_C quda_interface.c)
 endif()
 
@@ -392,7 +392,8 @@ list(
   ${INIT_SRC_C}
   ${SOLVER_SRC_C}
   ${TEST_SRC_C}
-  ${MEAS_SRC_C})
+  ${MEAS_SRC_C}
+  ${PROJECT_BINARY_DIR}/git_hash.c)
 
 include_directories(
   $<BUILD_INTERFACE:${CMAKE_BINARY_DIR}>
@@ -424,13 +425,10 @@ set_target_properties(hmc PROPERTIES VERSION ${PROJECT_VERSION} SOVERSION 1)
 # define a library and add the dependencies
 target_link_libraries(
   hmc
-  PUBLIC $<$<BOOL:${HAVE_CLOCK_GETTIME_IN_RT}>:rt>
-         $<$<BOOL:${TM_USE_LEMON}>:tmlqcd::clime>
-         $<$<BOOL:${TM_USE_LEMON}>:clemon::lemon>
+  PUBLIC $<$<BOOL:${TM_CLOCK_GETTIME_IN_RT}>:rt>
+         $<$<BOOL:${TM_DDalphaAMG}>:tmlqcd::DDalphaAMG>
          $<$<BOOL:${TM_USE_QPHIX}>:tmlqcd::qphix>
          $<$<BOOL:${TM_USE_FFTW}>:tmlqcd::fftw3>
-         $<$<BOOL:${TM_USE_MPI}>:MPI::MPI_C
-         MPI::MPI_CXX>
          $<$<BOOL:${TM_USE_QUDA}>:QUDA::quda>
          $<$<BOOL:${TM_USE_CUDA}>:CUDA::cufft
          CUDA::cufftw
@@ -440,9 +438,13 @@ target_link_libraries(
          $<$<BOOL:${TM_USE_HIP}>:hip::hipfft
          roc::hipblas
          hip::host>
+         tmlqcd::clime
+         $<$<BOOL:${TM_USE_LEMON}>:clemon::lemon>
          ${LAPACK_LIBRARIES}
          ${BLAS_LIBRARIES}
-         $<$<BOOL:${TM_USE_OPENMP}>:OpenMP::OpenMP_C
+         $<$<BOOL:${TM_USE_MPI}>:MPI::MPI_C
+         MPI::MPI_CXX>
+         $<$<BOOL:${TM_USE_OMP}>:OpenMP::OpenMP_C
          OpenMP::OpenMP_CXX>
          m)
 
diff --git a/src/lib/DDalphaAMG_interface.c b/src/lib/DDalphaAMG_interface.c
index 029d2f76f..80bff4fcc 100644
--- a/src/lib/DDalphaAMG_interface.c
+++ b/src/lib/DDalphaAMG_interface.c
@@ -17,13 +17,13 @@
  * You should have received a copy of the GNU General Public License
  * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
  *
- * Interface for DDalphaAMG
+ * Interface for DDalphaAMG
  *
  *******************************************************************************/
 
 #include "DDalphaAMG_interface.h"
 
-#ifndef DDalphaAMG
+#ifndef TM_USE_DDalphaAMG
 
 int mg_setup_iter;
 int mg_coarse_setup_iter;
@@ -43,47 +43,47 @@ double mg_dtau_update;
 double mg_rho_update;
 
 void MG_init(void) {
-  printf("ERROR: MG_init called but DDalphaAMG library not included.\n");
+  printf("ERROR: MG_init called but DDalphaAMG library not included.\n");
   exit(1);
 }
 
 void MG_update_gauge(double step) {
-  printf("ERROR: MG_update_gauge called but DDalphaAMG library not included.\n");
+  printf("ERROR: MG_update_gauge called but DDalphaAMG library not included.\n");
   exit(1);
 }
 
 void MG_update_mu(double mu_tmLQCD, double odd_tmLQCD) {
-  printf("ERROR: MG_update_mu called but DDalphaAMG library not included.\n");
+  printf("ERROR: MG_update_mu called but DDalphaAMG library not included.\n");
   exit(1);
 }
 
 void MG_reset(void) {
-  printf("ERROR: MG_reset called but DDalphaAMG library not included.\n");
+  printf("ERROR: MG_reset called but DDalphaAMG library not included.\n");
   exit(1);
 }
 
 void MG_finalize(void) {
-  printf("ERROR: MG_finalize called but DDalphaAMG library not included.\n");
+  printf("ERROR: MG_finalize called but DDalphaAMG library not included.\n");
   exit(1);
 }
 
 int MG_solver(spinor *const phi_new, spinor *const phi_old, const double precision,
               const int max_iter, const int rel_prec, const int N, su3 **gf, matrix_mult f) {
-  printf("ERROR: MG_solver called but DDalphaAMG library not included.\n");
+  printf("ERROR: MG_solver called but DDalphaAMG library not included.\n");
   exit(1);
 }
 
 int MG_solver_eo(spinor *const Even_new, spinor *const Odd_new, spinor *const Even,
                  spinor *const Odd, const double precision, const int max_iter, const int rel_prec,
                  const int N, su3 **gf, matrix_mult_full f_full) {
-  printf("ERROR: MG_solver_eo called but DDalphaAMG library not included.\n");
+  printf("ERROR: MG_solver_eo called but DDalphaAMG library not included.\n");
   exit(1);
 }
 
 int MG_solver_nd(spinor *const up_new, spinor *const dn_new, spinor *const up_old,
                  spinor *const dn_old, const double precision, const int max_iter,
                  const int rel_prec, const int N, su3 **gf, matrix_mult_nd f) {
-  printf("ERROR: MG_solver_nd called but DDalphaAMG library not included.\n");
+  printf("ERROR: MG_solver_nd called but DDalphaAMG library not included.\n");
   exit(1);
 }
 
@@ -207,7 +207,7 @@ static inline int MG_check(spinor *const phi_new, spinor *const phi_old, const i
           "ERROR: something bad happened... MG converged giving the wrong solution!! Trying to "
           "restart... \n");
       printf(
-          "ERROR contd: || s - f_{tmLQC} * f_{DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e > %e "
+          "ERROR contd: || s - f_{tmLQC} * f_{DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e > %e "
           "\n",
           differ[0], differ[1], differ[0] / differ[1], precision);
     }
@@ -215,7 +215,7 @@ static inline int MG_check(spinor *const phi_new, spinor *const phi_old, const i
   }
 
   if (g_debug_level > 0 && g_proc_id == 0)
-    printf("MGTEST:  || s - f_{tmLQC} * f_{DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e \n",
+    printf("MGTEST:  || s - f_{tmLQC} * f_{DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e \n",
            differ[0], differ[1], differ[0] / differ[1]);
 
   return 1;
@@ -257,7 +257,7 @@ static inline int MG_check_nd(spinor *const up_new, spinor *const dn_new, spinor
           "ERROR: something bad happened... MG converged giving the wrong solution!! Trying to "
           "restart... \n");
       printf(
-          "ERROR contd: || s - f_{tmLQC} * f_{DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e > %e "
+          "ERROR contd: || s - f_{tmLQC} * f_{DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e > %e "
           "\n",
           differ[0], differ[1], differ[0] / differ[1], precision);
     }
@@ -265,7 +265,7 @@ static inline int MG_check_nd(spinor *const up_new, spinor *const dn_new, spinor
   }
 
   if (g_debug_level > 0 && g_proc_id == 0)
-    printf("MGTEST:  || s - f_{tmLQC} * f_{DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e \n",
+    printf("MGTEST:  || s - f_{tmLQC} * f_{DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e \n",
            differ[0], differ[1], differ[0] / differ[1]);
 
   return 1;
@@ -304,7 +304,7 @@ static inline int MG_mms_check_nd(spinor **const up_new, spinor **const dn_new,
             "ERROR: something bad happened... MG converged giving the wrong solution!! Trying to "
             "restart... \n");
         printf(
-            "ERROR contd: || s - f_{tmLQC} * f_{DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e > "
+            "ERROR contd: || s - f_{tmLQC} * f_{DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e > "
             "%e \n",
             differ[0], differ[1], differ[0] / differ[1], precision[i]);
       }
@@ -313,7 +313,7 @@ static inline int MG_mms_check_nd(spinor **const up_new, spinor **const dn_new,
     }
 
     if (g_debug_level > 0 && g_proc_id == 0)
-      printf("MGTEST:  || s - f_{tmLQC} * f_{DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e \n",
+      printf("MGTEST:  || s - f_{tmLQC} * f_{DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e \n",
              differ[0], differ[1], differ[0] / differ[1]);
   }
 
@@ -343,7 +343,7 @@ static int MG_pre_solve(su3 **gf) {
   if (mg_initialized == 0) {
     MG_init();
     mg_initialized = 1;
-    if (g_proc_id == 0) printf("DDalphaAMG initialized\n");
+    if (g_proc_id == 0) printf("DDalphaAMG initialized\n");
     MPI_Barrier(MPI_COMM_WORLD);
   }
 
@@ -351,23 +351,23 @@ static int MG_pre_solve(su3 **gf) {
     DDalphaAMG_set_configuration((double *)&(gf[0][0]), &mg_status);
     mg_update_gauge = 0;
     if (mg_status.success && g_proc_id == 0)
-      printf("DDalphaAMG cnfg set, plaquette %e\n", mg_status.info);
+      printf("DDalphaAMG cnfg set, plaquette %e\n", mg_status.info);
     else if (g_proc_id == 0)
       printf("ERROR: configuration updating did not run correctly");
   }
 
   if (mg_do_setup == 1) {
     if (mg_setup_mu_set) {
-      if (g_proc_id == 0) printf("DDalphaAMG using mu=%f during setup\n", mg_setup_mu);
+      if (g_proc_id == 0) printf("DDalphaAMG using mu=%f during setup\n", mg_setup_mu);
       MG_update_mu(mg_setup_mu, 0);
     } else
       MG_update_mu(g_mu, 0);
-    if (g_proc_id == 0) printf("DDalphaAMG running setup\n");
+    if (g_proc_id == 0) printf("DDalphaAMG running setup\n");
     DDalphaAMG_setup(&mg_status);
     mg_do_setup = 0;
     mg_tau = gauge_tau;
     if (mg_status.success && g_proc_id == 0)
-      printf("DDalphaAMG setup ran, time %.2f sec (%.2f %% on coarse grid)\n", mg_status.time,
+      printf("DDalphaAMG setup ran, time %.2f sec (%.2f %% on coarse grid)\n", mg_status.time,
              100. * (mg_status.coarse_time / mg_status.time));
     else if (g_proc_id == 0)
       printf("ERROR: setup procedure did not run correctly");
@@ -375,16 +375,16 @@ static int MG_pre_solve(su3 **gf) {
 
   if (mg_update_setup > 0) {
     if (mg_setup_mu_set) {
-      if (g_proc_id == 0) printf("DDalphaAMG using mu=%f during setup\n", mg_setup_mu);
+      if (g_proc_id == 0) printf("DDalphaAMG using mu=%f during setup\n", mg_setup_mu);
       MG_update_mu(mg_setup_mu, 0);
     } else
       MG_update_mu(g_mu, 0);
-    if (g_proc_id == 0) printf("DDalphaAMG updating setup\n");
+    if (g_proc_id == 0) printf("DDalphaAMG updating setup\n");
     DDalphaAMG_update_setup(mg_update_setup, &mg_status);
     mg_update_setup = 0;
     mg_tau = gauge_tau;
     if (mg_status.success && g_proc_id == 0)
-      printf("DDalphaAMG setup ran, time %.2f sec (%.2f %% on coarse grid)\n", mg_status.time,
+      printf("DDalphaAMG setup ran, time %.2f sec (%.2f %% on coarse grid)\n", mg_status.time,
              100. * (mg_status.coarse_time / mg_status.time));
     else if (g_proc_id == 0)
       printf("ERROR: setup updating did not run correctly");
@@ -395,7 +395,7 @@ static int MG_pre_solve(su3 **gf) {
 
 static int MG_solve(spinor *const phi_new, spinor *const phi_old, const double precision,
                     const int N, matrix_mult f) {
-  // for rescaling  convention in DDalphaAMG: (4+m)*\delta_{x,y} in tmLQCD: 1*\delta_{x,y} ->
+  // for rescaling  convention in DDalphaAMG: (4+m)*\delta_{x,y} in tmLQCD: 1*\delta_{x,y} ->
   // rescale by 1/4+m
   double mg_scale = 0.5 / g_kappa;
   double *old = (double *)phi_old;
@@ -529,7 +529,7 @@ static int MG_solve(spinor *const phi_new, spinor *const phi_old, const double p
 
 static int MG_solve_nd(spinor *up_new, spinor *dn_new, spinor *const up_old, spinor *const dn_old,
                        const double precision, const int N, matrix_mult_nd f) {
-  // for rescaling  convention in DDalphaAMG: (4+m)*\delta_{x,y} in tmLQCD: 1*\delta_{x,y} ->
+  // for rescaling  convention in DDalphaAMG: (4+m)*\delta_{x,y} in tmLQCD: 1*\delta_{x,y} ->
   // rescale by 1/4+m moreover in the nd case, the tmLQCD is multiplied by phmc_invmaxev
   double mg_scale = 0.5 / g_kappa / phmc_invmaxev;
   double sqnorm;
@@ -803,7 +803,7 @@ static int MG_solve_nd(spinor *up_new, spinor *dn_new, spinor *const up_old, spi
                                          // 0 and shift
              f == Qsw_pm_ndpsi ||        // (Gamma5 Dh tau1)^2 - Schur complement squared
              f == Qsw_pm_ndpsi_shift) {  // (Gamma5 Dh tau1)^2 - Schur complement squared with shift
-    // DDalphaAMG: tau1 gamma5 Dh tau1 gamma5 Dh
+    // DDalphaAMG: tau1 gamma5 Dh tau1 gamma5 Dh
     // tmLQCD:          gamma5 Dh tau1 gamma5 Dh tau1
     if (init_guess) {
       mul_gamma5(old1, VOLUME);
@@ -900,7 +900,7 @@ static int MG_solve_nd(spinor *up_new, spinor *dn_new, spinor *const up_old, spi
 static int MG_mms_solve_nd(spinor **const up_new, spinor **const dn_new, spinor *const up_old,
                            spinor *const dn_old, const double *shifts, const int no_shifts,
                            double *precision, const int N, matrix_mult_nd f) {
-  // for rescaling  convention in DDalphaAMG: (4+m)*\delta_{x,y} in tmLQCD: 1*\delta_{x,y} ->
+  // for rescaling  convention in DDalphaAMG: (4+m)*\delta_{x,y} in tmLQCD: 1*\delta_{x,y} ->
   // rescale by 1/4+m moreover in the nd case, the tmLQCD is multiplied by phmc_invmaxev
   double mg_scale = 0.5 / g_kappa / phmc_invmaxev;
   double *old1 = (double *)up_old;
@@ -1001,7 +1001,7 @@ static int MG_mms_solve_nd(spinor **const up_new, spinor **const dn_new, spinor
                                          // 0 and shift
              f == Qsw_pm_ndpsi_shift) {  // (Gamma5 Dh tau1)^2 - Schur complement squared with shift
     mg_scale *= mg_scale;
-    // DDalphaAMG: tau1 gamma5 Dh tau1 gamma5 Dh
+    // DDalphaAMG: tau1 gamma5 Dh tau1 gamma5 Dh
     // tmLQCD:          gamma5 Dh tau1 gamma5 Dh tau1
     DDalphaAMG_solve_ms_doublet_squared_odd(new2, old2, new1, old1, mg_even_shifts, mg_odd_shifts,
                                             no_shifts, precision, &mg_status);
@@ -1110,7 +1110,7 @@ void MG_init() {
   mg_params.conf_index_fct = conf_index_fct;
   mg_params.vector_index_fct = vector_index_fct;
 
-  /* in DDalphaAMG
+  /* in DDalphaAMG
    * Printing level:
    *  -1: silent (errors or warnings)
    *   0: minimal //default
diff --git a/src/lib/DDalphaAMG_interface.h b/src/lib/DDalphaAMG_interface.h
index 96f59c31e..cc7ae1678 100644
--- a/src/lib/DDalphaAMG_interface.h
+++ b/src/lib/DDalphaAMG_interface.h
@@ -17,7 +17,7 @@
  * You should have received a copy of the GNU General Public License
  * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
  *
- * Interface for DDalphaAMG
+ * Interface for DDalphaAMG
  *
  *******************************************************************************/
 
diff --git a/src/lib/buffers/utils_generic_exchange.blocking.inc b/src/lib/buffers/utils_generic_exchange.blocking.inc
index e6e5f975c..71b44900c 100644
--- a/src/lib/buffers/utils_generic_exchange.blocking.inc
+++ b/src/lib/buffers/utils_generic_exchange.blocking.inc
@@ -26,7 +26,7 @@
 		 g_cart_grid, &status);
   }
   
-#  if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#  if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the left in x direction */
   /* recieve the data from the neighbour on the right in x direction */
   MPI_Sendrecv(buffer[0],              1, slice_X_gath_type, g_nb_x_dn, 93,
@@ -108,10 +108,10 @@
 		 1, edge_XT_cont_type, g_nb_t_dn, 98,
 		 g_cart_grid, &status);
   }
-  /* end of if defined PARALLELXT || PARALLELXYT || PARALLELXYZT*/
+  /* end of if defined TM_PARALLELXT || TM_PARALLELXYT || TM_PARALLELXYZT*/
 #  endif
 
-#  if (defined PARALLELXYT || defined PARALLELXYZT)
+#  if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the left in y direction */
   /* recieve the data from the neighbour on the right in y direction */
   MPI_Sendrecv(buffer[0],                            1, slice_Y_gath_type, g_nb_y_dn, 103,
@@ -247,9 +247,9 @@
 		 g_cart_grid, &status);
   }
 
-  /* end of if defined PARALLELXYT || PARALLELXYZT */
+  /* end of if defined TM_PARALLELXYT || TM_PARALLELXYZT */
 #  endif
-#  if defined PARALLELXYZT
+#  if defined TM_PARALLELXYZT
   /* z-Rand */
   /* send the data to the neighbour on the left in z direction */
   /* recieve the data from the neighbour on the right in z direction */
@@ -454,4 +454,4 @@
 
   }
 
-#endif /* PARALLELXYZT */
+#endif /* TM_PARALLELXYZT */
diff --git a/src/lib/buffers/utils_generic_exchange.c b/src/lib/buffers/utils_generic_exchange.c
index c1c3c844a..474c738ad 100644
--- a/src/lib/buffers/utils_generic_exchange.c
+++ b/src/lib/buffers/utils_generic_exchange.c
@@ -4,13 +4,13 @@
 void generic_exchange(void *field_in, int bytes_per_site) {}
 #else /* MPI */
 void generic_exchange(void *field_in, int bytes_per_site) {
-#if defined _NON_BLOCKING
+#if defined TM_NON_BLOCKING
   int cntr = 0;
   MPI_Request request[108];
   MPI_Status status[108];
-#else  /* _NON_BLOCKING */
+#else  /* TM_NON_BLOCKING */
   MPI_Status status;
-#endif /* _NON_BLOCKING */
+#endif /* TM_NON_BLOCKING */
   static int initialized = 0;
 
   /* We start by defining all the MPI datatypes required */
@@ -125,11 +125,11 @@ void generic_exchange(void *field_in, int bytes_per_site) {
   }
 
   /* Following are implementations using different compile time flags */
-#if defined _NON_BLOCKING
+#if defined TM_NON_BLOCKING
 #include "utils_generic_exchange.nonblocking.inc"
-#else  /* _NON_BLOCKING */
+#else  /* TM_NON_BLOCKING */
 #include "utils_generic_exchange.blocking.inc"
-#endif /* _NON_BLOCKING */
+#endif /* TM_NON_BLOCKING */
 }
 
 #endif /* MPI */
diff --git a/src/lib/buffers/utils_generic_exchange.nonblocking.inc b/src/lib/buffers/utils_generic_exchange.nonblocking.inc
index 0789a490f..71409008f 100644
--- a/src/lib/buffers/utils_generic_exchange.nonblocking.inc
+++ b/src/lib/buffers/utils_generic_exchange.nonblocking.inc
@@ -32,7 +32,7 @@
     cntr=cntr+2;
   }
   
-#    if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#    if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the left in x direction */
   /* recieve the data from the neighbour on the right in x direction */
   MPI_Isend(buffer[0],              1, slice_X_gath_type, g_nb_x_dn, 87,
@@ -71,7 +71,7 @@
 #    endif
   MPI_Waitall(cntr, request, status);
   cntr=0;
-#    if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#    if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* The edges */
 
   /* send the data to the neighbour on the left in t direction */
@@ -137,10 +137,10 @@
 	      g_cart_grid, &request[cntr+1]);
     cntr=cntr+2;
   }
-  /* end of if defined PARALLELXT || PARALLELXYT || PARALLELXYZT*/
+  /* end of if defined TM_PARALLELXT || TM_PARALLELXYT || TM_PARALLELXYZT*/
 #    endif
 
-#    if (defined PARALLELXYT || defined PARALLELXYZT)
+#    if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the left in y direction */
   /* recieve the data from the neighbour on the right in y direction */
   MPI_Isend(buffer[0],                            1, slice_Y_gath_type, g_nb_y_dn, 106,
@@ -177,7 +177,7 @@
 #    endif
   MPI_Waitall(cntr, request, status);
   cntr=0;
-#    if (defined PARALLELXYT || defined PARALLELXYZT)
+#    if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
 
   /* jetzt wirds richtig eklig ... */
 
@@ -312,9 +312,9 @@
     cntr=cntr+2;
   }
 
-  /* end of if defined PARALLELXYT || PARALLELXYZT */
+  /* end of if defined TM_PARALLELXYT || TM_PARALLELXYZT */
 #    endif
-#    if defined PARALLELXYZT
+#    if defined TM_PARALLELXYZT
   /* z-Rand */
   /* send the data to the neighbour on the left in z direction */
   /* recieve the data from the neighbour on the right in z direction */
@@ -359,7 +359,7 @@
   }
 #    endif
   MPI_Waitall(cntr, request, status);
-#    if defined PARALLELXYZT
+#    if defined TM_PARALLELXYZT
   cntr=0;
   /* edges */
 
diff --git a/src/lib/deriv_Sb.c b/src/lib/deriv_Sb.c
index 4303c80d5..7b55eb170 100644
--- a/src/lib/deriv_Sb.c
+++ b/src/lib/deriv_Sb.c
@@ -56,7 +56,7 @@
 void deriv_Sb(const int ieo, spinor* const l, spinor* const k, hamiltonian_field_t* const hf,
               const double factor) {
   tm_stopwatch_push(&g_timers, __func__, "");
-#ifdef _GAUGE_COPY
+#ifdef TM_GAUGE_COPY
   if (g_update_gauge_copy) {
     update_backward_gauge(hf->gaugefield);
   }
@@ -85,7 +85,7 @@ void deriv_Sb(const int ieo, spinor* const l, spinor* const k, hamiltonian_field
 #undef static
 #endif
 
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst begin(derivSb)
 #endif
 
@@ -114,7 +114,7 @@ void deriv_Sb(const int ieo, spinor* const l, spinor* const k, hamiltonian_field
       icy = g_lexic2eosub[iy];
 
       sp = k + icy;
-#if (defined _GAUGE_COPY && !defined _USE_HALFSPINOR)
+#if (defined TM_GAUGE_COPY && !defined TM_USE_HALFSPINOR)
       up = &g_gauge_field_copy[icx][0];
 #else
     up = &hf->gaugefield[ix][0];
@@ -136,7 +136,7 @@ void deriv_Sb(const int ieo, spinor* const l, spinor* const k, hamiltonian_field
       icy = g_lexic2eosub[iy];
 
       sm = k + icy;
-#if (defined _GAUGE_COPY && !defined _USE_HALFSPINOR)
+#if (defined TM_GAUGE_COPY && !defined TM_USE_HALFSPINOR)
       um = up + 1;
 #else
     um = &hf->gaugefield[iy][0];
@@ -159,7 +159,7 @@ void deriv_Sb(const int ieo, spinor* const l, spinor* const k, hamiltonian_field
       icy = g_lexic2eosub[iy];
 
       sp = k + icy;
-#if (defined _GAUGE_COPY && !defined _USE_HALFSPINOR)
+#if (defined TM_GAUGE_COPY && !defined TM_USE_HALFSPINOR)
       up = um + 1;
 #else
     up = &hf->gaugefield[ix][1];
@@ -181,7 +181,7 @@ void deriv_Sb(const int ieo, spinor* const l, spinor* const k, hamiltonian_field
       icy = g_lexic2eosub[iy];
 
       sm = k + icy;
-#if (defined _GAUGE_COPY && !defined _USE_HALFSPINOR)
+#if (defined TM_GAUGE_COPY && !defined TM_USE_HALFSPINOR)
       um = up + 1;
 #else
     um = &hf->gaugefield[iy][1];
@@ -203,7 +203,7 @@ void deriv_Sb(const int ieo, spinor* const l, spinor* const k, hamiltonian_field
       icy = g_lexic2eosub[iy];
 
       sp = k + icy;
-#if (defined _GAUGE_COPY && !defined _USE_HALFSPINOR)
+#if (defined TM_GAUGE_COPY && !defined TM_USE_HALFSPINOR)
       up = um + 1;
 #else
     up = &hf->gaugefield[ix][2];
@@ -225,7 +225,7 @@ void deriv_Sb(const int ieo, spinor* const l, spinor* const k, hamiltonian_field
       icy = g_lexic2eosub[iy];
 
       sm = k + icy;
-#if (defined _GAUGE_COPY && !defined _USE_HALFSPINOR)
+#if (defined TM_GAUGE_COPY && !defined TM_USE_HALFSPINOR)
       um = up + 1;
 #else
     um = &hf->gaugefield[iy][2];
@@ -247,7 +247,7 @@ void deriv_Sb(const int ieo, spinor* const l, spinor* const k, hamiltonian_field
       icy = g_lexic2eosub[iy];
 
       sp = k + icy;
-#if (defined _GAUGE_COPY && !defined _USE_HALFSPINOR)
+#if (defined TM_GAUGE_COPY && !defined TM_USE_HALFSPINOR)
       up = um + 1;
 #else
     up = &hf->gaugefield[ix][3];
@@ -269,7 +269,7 @@ void deriv_Sb(const int ieo, spinor* const l, spinor* const k, hamiltonian_field
       icy = g_lexic2eosub[iy];
 
       sm = k + icy;
-#if (defined _GAUGE_COPY && !defined _USE_HALFSPINOR)
+#if (defined TM_GAUGE_COPY && !defined TM_USE_HALFSPINOR)
       um = up + 1;
 #else
     um = &hf->gaugefield[iy][3];
@@ -292,7 +292,7 @@ void deriv_Sb(const int ieo, spinor* const l, spinor* const k, hamiltonian_field
   } /* OpenMP closing brace */
 #endif
   tm_stopwatch_pop(&g_timers, 0, 1, "");
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst end(derivSb)
 #endif
 }
diff --git a/src/lib/deriv_Sb_D_psi.c b/src/lib/deriv_Sb_D_psi.c
index 6ba15d490..61da4b9d2 100644
--- a/src/lib/deriv_Sb_D_psi.c
+++ b/src/lib/deriv_Sb_D_psi.c
@@ -63,7 +63,7 @@ void deriv_Sb_D_psi(spinor* const l, spinor* const k, hamiltonian_field_t* const
 #undef static
 #endif
 
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst begin(derivSb)
 #endif
 
@@ -225,7 +225,7 @@ void deriv_Sb_D_psi(spinor* const l, spinor* const k, hamiltonian_field_t* const
 
       /****************** end of loop ************************/
     }
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst end(derivSb)
 #endif
 
diff --git a/fixed_volume.h.in b/src/lib/fixed_volume.h.in
similarity index 100%
rename from fixed_volume.h.in
rename to src/lib/fixed_volume.h.in
diff --git a/src/lib/geometry_eo.c b/src/lib/geometry_eo.c
index 8622131e9..ceb348e1a 100644
--- a/src/lib/geometry_eo.c
+++ b/src/lib/geometry_eo.c
@@ -45,7 +45,7 @@
 
 void Hopping_Matrix_Indices(void);
 
-#if ((defined PARALLELX) || (defined PARALLELXY) || (defined PARALLELXYZ))
+#if ((defined TM_PARALLELX) || (defined TM_PARALLELXY) || (defined TM_PARALLELXYZ))
 
 /* This is the version of the function Index  introduced for Aurora-like parallelizations (mainly
  * xyz)  */
@@ -72,7 +72,7 @@ int Index(const int x0, const int x1, const int x2, const int x3) {
     ix = VOLUME + T * LY * LZ + y0 * LY * LZ + y2 * LZ + y3;
   }
 
-#if (defined PARALLELXY || defined PARALLELXYZ)
+#if (defined TM_PARALLELXY || defined TM_PARALLELXYZ)
   /* y-Rand */
   if (x2 == LY) {
     ix = VOLUME + 2 * T * LY * LZ + y0 * LX * LZ + y1 * LZ + y3;
@@ -97,9 +97,9 @@ int Index(const int x0, const int x1, const int x2, const int x3) {
       ix = VOLUME + RAND + 3 * T * LZ + y0 * LZ + y3;
     }
   }
-#endif /* endif of PARALLELXY  || PARALLELXYZ */
+#endif /* endif of TM_PARALLELXY  || TM_PARALLELXYZ */
 
-#if defined PARALLELXYZ
+#if defined TM_PARALLELXYZ
   /* z-Rand */
   if (x3 == LZ) {
     ix = VOLUME + 2 * T * LY * LZ + 2 * T * LX * LZ + y0 * LX * LY + y1 * LY + y2;
@@ -142,7 +142,7 @@ int Index(const int x0, const int x1, const int x2, const int x3) {
     }
   }
 
-#endif /* endif of PARALLELXYZ */
+#endif /* endif of TM_PARALLELXYZ */
 
   /* The DBW2 stuff --> second boundary slice */
   /* This we put a the very end.              */
@@ -150,44 +150,44 @@ int Index(const int x0, const int x1, const int x2, const int x3) {
   /* x2-rand+ */
   if (x1 == LX + 1) {
     ix = VOLUMEPLUSRAND + y0 * LY * LZ + y2 * LZ + y3;
-#if (defined PARALLELXY || defined PARALLELXYZ)
+#if (defined TM_PARALLELXY || defined TM_PARALLELXYZ)
     /* x2y */
     if (x2 == LY) {
       ix = VOLUMEPLUSRAND + RAND + y0 * LZ + y3;
     } else if (x2 == -1) {
       ix = VOLUMEPLUSRAND + RAND + 1 * T * LZ + y0 * LZ + y3;
     }
-#endif /* endif of PARALLELXY || PARALLELXYZ  */
-#if defined PARALLELXYZ
+#endif /* endif of TM_PARALLELXY || TM_PARALLELXYZ  */
+#if defined TM_PARALLELXYZ
     /* x2z */
     else if (x3 == LZ) {
       ix = VOLUMEPLUSRAND + RAND + 8 * T * LZ + 4 * T * LY + y0 * LY + y2;
     } else if (x3 == -1) {
       ix = VOLUMEPLUSRAND + RAND + 8 * T * LZ + 5 * T * LY + y0 * LY + y2;
     }
-#endif /* endif of PARALLELXYZ  */
+#endif /* endif of TM_PARALLELXYZ  */
   }
   /* x2-rand- */
   if (x1 == -2) {
     ix = VOLUMEPLUSRAND + T * LY * LZ + y0 * LY * LZ + y2 * LZ + y3;
-#if (defined PARALLELXY || defined PARALLELXYZ)
+#if (defined TM_PARALLELXY || defined TM_PARALLELXYZ)
     /* x2y */
     if (x2 == LY) {
       ix = VOLUMEPLUSRAND + RAND + 2 * T * LZ + y0 * LZ + y3;
     } else if (x2 == -1) {
       ix = VOLUMEPLUSRAND + RAND + 3 * T * LZ + y0 * LZ + y3;
     }
-#endif /* endif of PARALLELXY || PARALLELXYZ  */
-#if defined PARALLELXYZ
+#endif /* endif of TM_PARALLELXY || TM_PARALLELXYZ  */
+#if defined TM_PARALLELXYZ
     /* x2z */
     else if (x3 == LZ) {
       ix = VOLUMEPLUSRAND + RAND + 8 * T * LZ + 6 * T * LY + y0 * LY + y2;
     } else if (x3 == -1) {
       ix = VOLUMEPLUSRAND + RAND + 8 * T * LZ + 7 * T * LY + y0 * LY + y2;
     }
-#endif /* endif of  PARALLELXYZ  */
+#endif /* endif of  TM_PARALLELXYZ  */
   }
-#if (defined PARALLELXY || defined PARALLELXYZ)
+#if (defined TM_PARALLELXY || defined TM_PARALLELXYZ)
   /* y2-rand+ */
   if (x2 == LY + 1) {
     ix = VOLUMEPLUSRAND + 2 * T * LY * LZ + y0 * LX * LZ + y1 * LZ + y3;
@@ -197,14 +197,14 @@ int Index(const int x0, const int x1, const int x2, const int x3) {
     } else if (x1 == -1) {
       ix = VOLUMEPLUSRAND + RAND + 6 * T * LZ + y0 * LZ + y3;
     }
-#if defined PARALLELXYZ
+#if defined TM_PARALLELXYZ
     /* y2z */
     else if (x3 == LZ) {
       ix = VOLUMEPLUSRAND + RAND + 8 * T * LZ + 8 * T * LY + 4 * T * LX + y0 * LX + y1;
     } else if (x3 == -1) {
       ix = VOLUMEPLUSRAND + RAND + 8 * T * LZ + 8 * T * LY + 5 * T * LX + y0 * LX + y1;
     }
-#endif /* endif of PARALLELXYZ  */
+#endif /* endif of TM_PARALLELXYZ  */
   }
   /* y2-rand- */
   if (x2 == -2) {
@@ -215,17 +215,17 @@ int Index(const int x0, const int x1, const int x2, const int x3) {
     } else if (x1 == -1) {
       ix = VOLUMEPLUSRAND + RAND + 7 * T * LZ + y0 * LZ + y3;
     }
-#if defined PARALLELXYZ
+#if defined TM_PARALLELXYZ
     /* y2z */
     else if (x3 == LZ) {
       ix = VOLUMEPLUSRAND + RAND + 8 * T * LZ + 8 * T * LY + 6 * T * LX + y0 * LX + y1;
     } else if (x3 == -1) {
       ix = VOLUMEPLUSRAND + RAND + 8 * T * LZ + 8 * T * LY + 7 * T * LX + y0 * LX + y1;
     }
-#endif /* endif of PARALLELXYZ  */
+#endif /* endif of TM_PARALLELXYZ  */
   }
-#endif /* endif of PARALLELXY || PARALLELXYZ  */
-#if defined PARALLELXYZ
+#endif /* endif of TM_PARALLELXY || TM_PARALLELXYZ  */
+#if defined TM_PARALLELXYZ
   /* z2-rand+ */
   if (x3 == LZ + 1) {
     ix = VOLUMEPLUSRAND + 2 * T * LY * LZ + 2 * T * LX * LZ + y0 * LX * LY + y1 * LY + y2;
@@ -259,12 +259,12 @@ int Index(const int x0, const int x1, const int x2, const int x3) {
       ix = VOLUMEPLUSRAND + RAND + 8 * T * LZ + 8 * T * LY + 3 * T * LX + y0 * LX + y1;
     }
   }
-#endif /* endif of PARALLELXYZ  */
+#endif /* endif of TM_PARALLELXYZ  */
 
   return (ix);
 }
 
-#else /* original version of Index(): used for no parallelization  or PARALLEL*T */
+#else /* original version of Index(): used for no parallelization  or TM_PARALLEL*T */
 
 int Index(const int x0, const int x1, const int x2, const int x3) {
   int y0, y1, y2, y3, ix;
@@ -274,7 +274,7 @@ int Index(const int x0, const int x1, const int x2, const int x3) {
   y3 = (x3 + LZ) % LZ;
   ix = ((y0 * LX + y1) * LY + y2) * LZ + y3;
 
-#if ((defined PARALLELT) || (defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT))
+#if ((defined TM_PARALLELT) || (defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || (defined TM_PARALLELXYZT))
   if (x0 == T) {
     ix = VOLUME + y3 + LZ * y2 + LZ * LY * y1;
   }
@@ -283,7 +283,7 @@ int Index(const int x0, const int x1, const int x2, const int x3) {
     ix = VOLUME + LX * LY * LZ + y3 + LZ * y2 + LZ * LY * y1;
   }
 #endif
-#if ((defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT))
+#if ((defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || (defined TM_PARALLELXYZT))
   if (x1 == LX) {
     ix = VOLUME + 2 * LX * LY * LZ + y0 * LY * LZ + y2 * LZ + y3;
   }
@@ -309,9 +309,9 @@ int Index(const int x0, const int x1, const int x2, const int x3) {
     }
   }
 
-#endif /* endif of PARALLELXT || PARALLELXYT || PARALLELXYZT */
+#endif /* endif of TM_PARALLELXT || TM_PARALLELXYT || TM_PARALLELXYZT */
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* y-Rand */
   if (x2 == LY) {
     ix = VOLUME + 2 * LX * LY * LZ + 2 * T * LY * LZ + y0 * LX * LZ + y1 * LZ + y3;
@@ -358,8 +358,8 @@ int Index(const int x0, const int x1, const int x2, const int x3) {
     }
   }
 
-#endif /* endif of PARALLELXYT  || PARALLELXYZT */
-#if defined PARALLELXYZT
+#endif /* endif of TM_PARALLELXYT  || TM_PARALLELXYZT */
+#if defined TM_PARALLELXYZT
   /* z-Rand */
   if (x3 == LZ) {
     ix =
@@ -429,30 +429,30 @@ int Index(const int x0, const int x1, const int x2, const int x3) {
     }
   }
 
-#endif /* endif of PARALLELXYZT */
+#endif /* endif of TM_PARALLELXYZT */
 
   /* The DBW2 stuff --> second boundary slice */
   /* This we put a the very end.              */
-#if ((defined PARALLELT) || (defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT))
+#if ((defined TM_PARALLELT) || (defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || (defined TM_PARALLELXYZT))
   if (x0 == T + 1) {
     ix = VOLUMEPLUSRAND + y3 + LZ * y2 + LZ * LY * y1;
-#if ((defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT))
+#if ((defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || (defined TM_PARALLELXYZT))
     /* t2x */
     if (x1 == LX) {
       ix = VOLUMEPLUSRAND + RAND + y2 * LZ + y3;
     } else if (x1 == -1) {
       ix = VOLUMEPLUSRAND + RAND + 1 * LY * LZ + y2 * LZ + y3;
     }
-#endif /* endif of PARALLELXT || PARALLELXYT || PARALLELXYZT  */
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#endif /* endif of TM_PARALLELXT || TM_PARALLELXYT || TM_PARALLELXYZT  */
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
     /* t2y */
     else if (x2 == LY) {
       ix = VOLUMEPLUSRAND + RAND + 8 * LY * LZ + 8 * T * LZ + y1 * LZ + y3;
     } else if (x2 == -1) {
       ix = VOLUMEPLUSRAND + RAND + 8 * LY * LZ + 8 * T * LZ + 2 * LX * LZ + y1 * LZ + y3;
     }
-#endif /* endif of PARALLELXYT || PARALLELXYZT  */
-#if defined PARALLELXYZT
+#endif /* endif of TM_PARALLELXYT || TM_PARALLELXYZT  */
+#if defined TM_PARALLELXYZT
     /* t2z */
     else if (x3 == LZ) {
       ix = VOLUMEPLUSRAND + RAND + 8 * LY * LZ + 8 * T * LZ + 8 * LX * LZ + y1 * LY + y2;
@@ -460,28 +460,28 @@ int Index(const int x0, const int x1, const int x2, const int x3) {
       ix = VOLUMEPLUSRAND + RAND + 8 * LY * LZ + 8 * T * LZ + 8 * LX * LZ + 2 * LX * LY + y1 * LY +
            y2;
     }
-#endif /* endif of PARALLELXYZT  */
+#endif /* endif of TM_PARALLELXYZT  */
   }
   /* the slice at time -2 is put behind the one at time T+1 */
   else if (x0 == -2) {
     ix = VOLUMEPLUSRAND + LX * LY * LZ + y3 + LZ * y2 + LZ * LY * y1;
-#if ((defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT))
+#if ((defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || (defined TM_PARALLELXYZT))
     /* t2x */
     if (x1 == LX) {
       ix = VOLUMEPLUSRAND + RAND + 2 * LY * LZ + y2 * LZ + y3;
     } else if (x1 == -1) {
       ix = VOLUMEPLUSRAND + RAND + 3 * LY * LZ + y2 * LZ + y3;
     }
-#endif /* endif of PARALLELXT || PARALLELXYT || PARALLELXYZT  */
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#endif /* endif of TM_PARALLELXT || TM_PARALLELXYT || TM_PARALLELXYZT  */
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
     /* t2y */
     else if (x2 == LY) {
       ix = VOLUMEPLUSRAND + RAND + 8 * LY * LZ + 8 * T * LZ + LX * LZ + y1 * LZ + y3;
     } else if (x2 == -1) {
       ix = VOLUMEPLUSRAND + RAND + 8 * LY * LZ + 8 * T * LZ + 3 * LX * LZ + y1 * LZ + y3;
     }
-#endif /* endif of PARALLELXYT || PARALLELXYZT  */
-#if defined PARALLELXYZT
+#endif /* endif of TM_PARALLELXYT || TM_PARALLELXYZT  */
+#if defined TM_PARALLELXYZT
     /* t2z */
     else if (x3 == LZ) {
       ix = VOLUMEPLUSRAND + RAND + 8 * LY * LZ + 8 * T * LZ + 8 * LX * LZ + LX * LY + y1 * LY + y2;
@@ -489,10 +489,10 @@ int Index(const int x0, const int x1, const int x2, const int x3) {
       ix = VOLUMEPLUSRAND + RAND + 8 * LY * LZ + 8 * T * LZ + 8 * LX * LZ + 3 * LX * LY + y1 * LY +
            y2;
     }
-#endif /* endif of PARALLELXYZT  */
+#endif /* endif of TM_PARALLELXYZT  */
   }
-#endif /* endif of PARALLELT || PARALLELXT || PARALLELXYT || PARALLELXYZT  */
-#if ((defined PARALLELXT) || (defined PARALLELXYT) || defined PARALLELXYZT)
+#endif /* endif of TM_PARALLELT || TM_PARALLELXT || TM_PARALLELXYT || TM_PARALLELXYZT  */
+#if ((defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || defined TM_PARALLELXYZT)
   if (x1 == LX + 1) {
     ix = VOLUMEPLUSRAND + 2 * LX * LY * LZ + y0 * LY * LZ + y2 * LZ + y3;
     /* x2t */
@@ -501,15 +501,15 @@ int Index(const int x0, const int x1, const int x2, const int x3) {
     } else if (x0 == -1) {
       ix = VOLUMEPLUSRAND + RAND + 6 * LY * LZ + y2 * LZ + y3;
     }
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
     /* x2y */
     else if (x2 == LY) {
       ix = VOLUMEPLUSRAND + RAND + 8 * LY * LZ + y0 * LZ + y3;
     } else if (x2 == -1) {
       ix = VOLUMEPLUSRAND + RAND + 8 * LY * LZ + 1 * T * LZ + y0 * LZ + y3;
     }
-#endif /* endif of PARALLELXYT || PARALLELXYZT  */
-#if defined PARALLELXYZT
+#endif /* endif of TM_PARALLELXYT || TM_PARALLELXYZT  */
+#if defined TM_PARALLELXYZT
     /* x2z */
     else if (x3 == LZ) {
       ix = VOLUMEPLUSRAND + RAND + 8 * LY * LZ + 8 * T * LZ + 8 * LX * LZ + 8 * LX * LY +
@@ -518,7 +518,7 @@ int Index(const int x0, const int x1, const int x2, const int x3) {
       ix = VOLUMEPLUSRAND + RAND + 8 * LY * LZ + 8 * T * LZ + 8 * LX * LZ + 8 * LX * LY +
            5 * T * LY + y0 * LY + y2;
     }
-#endif /* endif of PARALLELXYZT  */
+#endif /* endif of TM_PARALLELXYZT  */
   }
   if (x1 == -2) {
     ix = VOLUMEPLUSRAND + 2 * LX * LY * LZ + T * LY * LZ + y0 * LY * LZ + y2 * LZ + y3;
@@ -528,15 +528,15 @@ int Index(const int x0, const int x1, const int x2, const int x3) {
     } else if (x0 == -1) {
       ix = VOLUMEPLUSRAND + RAND + 7 * LY * LZ + y2 * LZ + y3;
     }
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
     /* x2y */
     else if (x2 == LY) {
       ix = VOLUMEPLUSRAND + RAND + 8 * LY * LZ + 2 * T * LZ + y0 * LZ + y3;
     } else if (x2 == -1) {
       ix = VOLUMEPLUSRAND + RAND + 8 * LY * LZ + 3 * T * LZ + y0 * LZ + y3;
     }
-#endif /* endif of PARALLELXYT || PARALLELXYZT  */
-#if defined PARALLELXYZT
+#endif /* endif of TM_PARALLELXYT || TM_PARALLELXYZT  */
+#if defined TM_PARALLELXYZT
     /* x2z */
     else if (x3 == LZ) {
       ix = VOLUMEPLUSRAND + RAND + 8 * LY * LZ + 8 * T * LZ + 8 * LX * LZ + 8 * LX * LY +
@@ -545,10 +545,10 @@ int Index(const int x0, const int x1, const int x2, const int x3) {
       ix = VOLUMEPLUSRAND + RAND + 8 * LY * LZ + 8 * T * LZ + 8 * LX * LZ + 8 * LX * LY +
            7 * T * LY + y0 * LY + y2;
     }
-#endif /* endif of  PARALLELXYZT  */
+#endif /* endif of  TM_PARALLELXYZT  */
   }
-#endif /* endif of PARALLELXT || PARALLELXYT || PARALLELXYZT  */
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#endif /* endif of TM_PARALLELXT || TM_PARALLELXYT || TM_PARALLELXYZT  */
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   if (x2 == LY + 1) {
     ix = VOLUMEPLUSRAND + 2 * LX * LY * LZ + 2 * T * LY * LZ + y0 * LX * LZ + y1 * LZ + y3;
     /* y2x */
@@ -563,7 +563,7 @@ int Index(const int x0, const int x1, const int x2, const int x3) {
     } else if (x0 == -1) {
       ix = VOLUMEPLUSRAND + RAND + 8 * LY * LZ + 8 * T * LZ + 5 * LX * LZ + y1 * LZ + y3;
     }
-#if defined PARALLELXYZT
+#if defined TM_PARALLELXYZT
     /* y2z */
     else if (x3 == LZ) {
       ix = VOLUMEPLUSRAND + RAND + 8 * LY * LZ + 8 * T * LZ + 8 * LX * LZ + 8 * LX * LY +
@@ -572,7 +572,7 @@ int Index(const int x0, const int x1, const int x2, const int x3) {
       ix = VOLUMEPLUSRAND + RAND + 8 * LY * LZ + 8 * T * LZ + 8 * LX * LZ + 8 * LX * LY +
            8 * T * LY + 5 * T * LX + y0 * LX + y1;
     }
-#endif /* endif of PARALLELXYZT  */
+#endif /* endif of TM_PARALLELXYZT  */
   }
   if (x2 == -2) {
     ix = VOLUMEPLUSRAND + 2 * LX * LY * LZ + 2 * T * LY * LZ + T * LX * LZ + y0 * LX * LZ +
@@ -589,7 +589,7 @@ int Index(const int x0, const int x1, const int x2, const int x3) {
     } else if (x0 == -1) {
       ix = VOLUMEPLUSRAND + RAND + 8 * LY * LZ + 8 * T * LZ + 7 * LX * LZ + y1 * LZ + y3;
     }
-#if defined PARALLELXYZT
+#if defined TM_PARALLELXYZT
     /* y2z */
     else if (x3 == LZ) {
       ix = VOLUMEPLUSRAND + RAND + 8 * LY * LZ + 8 * T * LZ + 8 * LX * LZ + 8 * LX * LY +
@@ -598,10 +598,10 @@ int Index(const int x0, const int x1, const int x2, const int x3) {
       ix = VOLUMEPLUSRAND + RAND + 8 * LY * LZ + 8 * T * LZ + 8 * LX * LZ + 8 * LX * LY +
            8 * T * LY + 7 * T * LX + y0 * LX + y1;
     }
-#endif /* endif of PARALLELXYZT  */
+#endif /* endif of TM_PARALLELXYZT  */
   }
-#endif /* endif of PARALLELXYT || PARALLELXYZT  */
-#if defined PARALLELXYZT
+#endif /* endif of TM_PARALLELXYT || TM_PARALLELXYZT  */
+#if defined TM_PARALLELXYZT
   /* z2-Rand */
   if (x3 == LZ + 1) {
     if ((x0 < T) && (x0 > -1) && (x1 < LX) && (x1 > -1) && (x2 > -1) && (x2 < LY)) {
@@ -663,14 +663,14 @@ int Index(const int x0, const int x1, const int x2, const int x3) {
            8 * T * LY + 3 * T * LX + y0 * LX + y1;
     }
   }
-#endif /* endif of PARALLELXYZT  */
+#endif /* endif of TM_PARALLELXYZT  */
   /*   if(ix == 372) { */
   /*     printf("## %d %d %d %d ix = %d, %d %d %d %d\n", x0, x1, x2, x3, ix, T, LX, LY, LZ); */
   /*   } */
   return (ix);
 }
 
-#endif /* PARALLEL???  */
+#endif /* TM_PARALLEL???  */
 
 void geometry() {
   int x0, x1, x2, x3, ix;
@@ -685,17 +685,17 @@ void geometry() {
 
   xeven = malloc(VOLUMEPLUSRAND * sizeof(int));
 
-#if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELT || defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   startvaluet = 1;
 #endif
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELX || \
-     defined PARALLELXY || defined PARALLELXYZ)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT || defined TM_PARALLELX || \
+     defined TM_PARALLELXY || defined TM_PARALLELXYZ)
   startvaluex = 1;
 #endif
-#if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT || defined TM_PARALLELXY || defined TM_PARALLELXYZ)
   startvaluey = 1;
 #endif
-#if (defined PARALLELXYZT || defined PARALLELXYZ)
+#if (defined TM_PARALLELXYZT || defined TM_PARALLELXYZ)
   startvaluez = 1;
 #endif
 
@@ -795,7 +795,7 @@ void geometry() {
     }
   }
 
-#if (defined PARALLELXYZT || defined PARALLELXYZ)
+#if (defined TM_PARALLELXYZT || defined TM_PARALLELXYZ)
   ix = 0;
   for (x0 = 0; x0 < T; x0++) {
     for (x1 = 0; x1 < LX; x1++) {
@@ -852,7 +852,7 @@ void geometry() {
   }
 
 
-#endif /* PARALLELXYZ || PARALLELXYZT*/
+#endif /* TM_PARALLELXYZ || TM_PARALLELXYZT*/
 
   /* The rectangular gauge action part */
   /* Everything is stored behind VOLUMEPLUSRAND-1 !*/
@@ -861,7 +861,7 @@ void geometry() {
       printf("# Initialising rectangular gauge action stuff\n");
       fflush(stdout);
     }
-#if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELT || defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
     for (x1 = -startvaluex; x1 < (LX + startvaluex); x1++) {
       for (x2 = -startvaluey; x2 < (LY + startvaluey); x2++) {
         for (x3 = -startvaluez; x3 < (LZ + startvaluez); x3++) {
@@ -910,8 +910,8 @@ void geometry() {
       }
     }
 #endif
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELX || \
-     defined PARALLELXY || defined PARALLELXYZ)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT || defined TM_PARALLELX || \
+     defined TM_PARALLELXY || defined TM_PARALLELXYZ)
     for (x0 = -startvaluet; x0 < (T + startvaluet); x0++) {
       for (x2 = -startvaluey; x2 < (LY + startvaluey); x2++) {
         for (x3 = -startvaluez; x3 < (LZ + startvaluez); x3++) {
@@ -959,7 +959,7 @@ void geometry() {
       }
     }
 #endif
-#if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT || defined TM_PARALLELXY || defined TM_PARALLELXYZ)
     for (x0 = -startvaluet; x0 < (T + startvaluet); x0++) {
       for (x1 = -startvaluex; x1 < (LX + startvaluex); x1++) {
         for (x3 = -startvaluez; x3 < (LZ + startvaluez); x3++) {
@@ -1007,7 +1007,7 @@ void geometry() {
       }
     }
 #endif
-#if (defined PARALLELXYZT || defined PARALLELXYZ)
+#if (defined TM_PARALLELXYZT || defined TM_PARALLELXYZ)
     for (x0 = -startvaluet; x0 < (T + startvaluet); x0++) {
       for (x1 = -startvaluex; x1 < (LX + startvaluex); x1++) {
         for (x2 = -startvaluey; x2 < (LY + startvaluey); x2++) {
diff --git a/src/lib/get_rectangle_staples.c b/src/lib/get_rectangle_staples.c
index eb2a7db9f..eab6b9d9e 100644
--- a/src/lib/get_rectangle_staples.c
+++ b/src/lib/get_rectangle_staples.c
@@ -34,7 +34,7 @@ void get_rectangle_staples_general(su3 *const v, const int x, const int mu,
                                    const su3 *const *const gf) {
   su3 ALIGN tmp1, tmp2;
   const su3 *a, *b, *c, *d, *e;
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst begin(rectstaples)
 #endif
   _su3_zero((*v));
@@ -178,7 +178,7 @@ void get_rectangle_staples_general(su3 *const v, const int x, const int mu,
       _su3_times_su3_acc((*v), tmp2, tmp1);
     }
   }
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst end(rectstaples)
 #endif
 }
diff --git a/src/lib/get_staples.c b/src/lib/get_staples.c
index e80648382..b33010f2c 100644
--- a/src/lib/get_staples.c
+++ b/src/lib/get_staples.c
@@ -35,7 +35,7 @@ void get_staples(su3* const staple, const int x, const int mu, const su3** in_ga
   su3 ALIGN st;
   const su3 *w1, *w2, *w3;
 
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst begin(staples)
 #endif
 
@@ -61,7 +61,7 @@ void get_staples(su3* const staple, const int x, const int mu, const su3** in_ga
       _su3d_times_su3_acc(*staple, *w1, st);
     }
   }
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst end(staples)
 #endif
 }
@@ -72,7 +72,7 @@ void get_spacelike_staples(su3* const staple, const int x, const int mu,
   su3 ALIGN st;
   const su3 *w1, *w2, *w3;
 
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst begin(staples)
 #endif
 
@@ -98,7 +98,7 @@ void get_spacelike_staples(su3* const staple, const int x, const int mu,
       _su3d_times_su3_acc(*staple, *w1, st);
     }
   }
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst end(staples)
 #endif
 }
@@ -109,7 +109,7 @@ void get_timelike_staples(su3* const staple, const int x, const int mu,
   su3 ALIGN st;
   const su3 *w1, *w2, *w3;
 
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst begin(staples)
 #endif
 
@@ -134,7 +134,7 @@ void get_timelike_staples(su3* const staple, const int x, const int mu,
     /* v = v + w1^d * st */
     _su3d_times_su3_acc(*staple, *w1, st);
   }
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst end(staples)
 #endif
 }
diff --git a/src/lib/gettime.c b/src/lib/gettime.c
index 68c123ae4..adae6dcb3 100644
--- a/src/lib/gettime.c
+++ b/src/lib/gettime.c
@@ -21,7 +21,7 @@
 #ifdef HAVE_CONFIG_H
 #include <tmlqcd_config.h>
 #endif
-#ifdef HAVE_CLOCK_GETTIME
+#ifdef TM_CLOCK_GETTIME
 #ifndef _POSIX_C_SOURCE
 #define _POSIX_C_SOURCE 199309L
 #endif
@@ -45,7 +45,7 @@ double gettime(void) {
 
   t = MPI_Wtime();
 
-#elif (defined HAVE_CLOCK_GETTIME)
+#elif (defined TM_CLOCK_GETTIME)
 
   struct timespec ts;
 
diff --git a/src/lib/git_hash.h b/src/lib/git_hash.h
new file mode 100644
index 000000000..a3a22b48d
--- /dev/null
+++ b/src/lib/git_hash.h
@@ -0,0 +1,6 @@
+#ifndef TM_GIT_HASH_H
+#define TM_GIT_HASH_H
+/* Declares the git_hash character array (presumably the git revision of the build; confirm against its definition). */
+extern const char git_hash[]; /* declaration only: the array itself is defined elsewhere */
+
+#endif /* TM_GIT_HASH_H */
diff --git a/src/lib/global.h b/src/lib/global.h
index 1fc644d3e..b0d3b1ac2 100644
--- a/src/lib/global.h
+++ b/src/lib/global.h
@@ -38,7 +38,7 @@
 #ifdef TM_USE_MPI
 #include <mpi.h>
 #endif
-#ifdef FIXEDVOLUME
+#ifdef TM_FIXEDVOLUME
 #include "fixed_volume.h"
 #endif
 #include "su3.h"
@@ -79,7 +79,7 @@ EXTERN tm_mpi_thread_level_t g_mpi_thread_level;
 EXTERN tm_timers_t g_timers;
 
 EXTERN int T_global;
-#ifndef FIXEDVOLUME
+#ifndef TM_FIXEDVOLUME
 EXTERN int T, L, LX, LY, LZ, VOLUME;
 EXTERN int N_PROC_T, N_PROC_X, N_PROC_Y, N_PROC_Z;
 EXTERN int RAND, EDGES, VOLUMEPLUSRAND;
@@ -130,7 +130,7 @@ EXTERN int g_running_phmc;
 
 EXTERN su3 **g_gauge_field;
 EXTERN su3_32 **g_gauge_field_32;
-#ifdef _USE_HALFSPINOR
+#ifdef TM_USE_HALFSPINOR
 EXTERN su3 ***g_gauge_field_copy;
 EXTERN su3_32 ***g_gauge_field_copy_32;
 #else
diff --git a/src/lib/init/init_dirac_halfspinor.c b/src/lib/init/init_dirac_halfspinor.c
index f5939d9cc..891a703e2 100644
--- a/src/lib/init/init_dirac_halfspinor.c
+++ b/src/lib/init/init_dirac_halfspinor.c
@@ -94,7 +94,7 @@ int init_dirac_halfspinor() {
         NBPointer[ieo][8 * i + 2 * mu + 1] =
             &HalfSpinor[8 * g_lexic2eosub[g_iup[j][mu]] + 2 * mu + 1];
       }
-#if ((defined PARALLELT) || (defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT))
+#if ((defined TM_PARALLELT) || (defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || (defined TM_PARALLELXYZT))
       if (t == 0) {
         k = (g_lexic2eosub[g_idn[j][0]] - VOLUME / 2);
         NBPointer[ieo][8 * i] = &sendBuffer[k];
@@ -104,8 +104,8 @@ int init_dirac_halfspinor() {
         NBPointer[ieo][8 * i + 1] = &sendBuffer[k];
       }
 #endif
-#if ((defined PARALLELX) || (defined PARALLELXY) || (defined PARALLELXYZ) || \
-     (defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT))
+#if ((defined TM_PARALLELX) || (defined TM_PARALLELXY) || (defined TM_PARALLELXYZ) || \
+     (defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || (defined TM_PARALLELXYZT))
       if (x == 0) {
         k = (g_lexic2eosub[g_idn[j][1]] - VOLUME / 2);
         NBPointer[ieo][8 * i + 2] = &sendBuffer[k];
@@ -115,8 +115,8 @@ int init_dirac_halfspinor() {
         NBPointer[ieo][8 * i + 3] = &sendBuffer[k];
       }
 #endif
-#if ((defined PARALLELXY) || (defined PARALLELXYZ) || (defined PARALLELXYT) || \
-     (defined PARALLELXYZT))
+#if ((defined TM_PARALLELXY) || (defined TM_PARALLELXYZ) || (defined TM_PARALLELXYT) || \
+     (defined TM_PARALLELXYZT))
       if (y == 0) {
         k = (g_lexic2eosub[g_idn[j][2]] - VOLUME / 2);
         NBPointer[ieo][8 * i + 4] = &sendBuffer[k];
@@ -126,7 +126,7 @@ int init_dirac_halfspinor() {
         NBPointer[ieo][8 * i + 5] = &sendBuffer[k];
       }
 #endif
-#if ((defined PARALLELXYZ) || (defined PARALLELXYZT))
+#if ((defined TM_PARALLELXYZ) || (defined TM_PARALLELXYZT))
       if (z == 0) {
         k = (g_lexic2eosub[g_idn[j][3]] - VOLUME / 2);
         NBPointer[ieo][8 * i + 6] = &sendBuffer[k];
@@ -154,7 +154,7 @@ int init_dirac_halfspinor() {
       for (int mu = 0; mu < 8; mu++) {
         NBPointer[ieo][8 * i + mu] = &HalfSpinor[8 * i + mu];
       }
-#if ((defined PARALLELT) || (defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT))
+#if ((defined TM_PARALLELT) || (defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || (defined TM_PARALLELXYZT))
       if (t == T - 1) {
         NBPointer[ieo][8 * i] = &recvBuffer[(g_lexic2eosub[g_iup[j][0]] - VOLUME / 2)];
       }
@@ -162,8 +162,8 @@ int init_dirac_halfspinor() {
         NBPointer[ieo][8 * i + 1] = &recvBuffer[(g_lexic2eosub[g_idn[j][0]] - VOLUME / 2)];
       }
 #endif
-#if ((defined PARALLELX) || (defined PARALLELXY) || (defined PARALLELXYZ) || \
-     (defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT))
+#if ((defined TM_PARALLELX) || (defined TM_PARALLELXY) || (defined TM_PARALLELXYZ) || \
+     (defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || (defined TM_PARALLELXYZT))
       if (x == LX - 1) {
         NBPointer[ieo][8 * i + 2] = &recvBuffer[(g_lexic2eosub[g_iup[j][1]] - VOLUME / 2)];
       }
@@ -171,8 +171,8 @@ int init_dirac_halfspinor() {
         NBPointer[ieo][8 * i + 3] = &recvBuffer[(g_lexic2eosub[g_idn[j][1]] - VOLUME / 2)];
       }
 #endif
-#if ((defined PARALLELXY) || (defined PARALLELXYZ) || (defined PARALLELXYT) || \
-     (defined PARALLELXYZT))
+#if ((defined TM_PARALLELXY) || (defined TM_PARALLELXYZ) || (defined TM_PARALLELXYT) || \
+     (defined TM_PARALLELXYZT))
       if (y == LY - 1) {
         NBPointer[ieo][8 * i + 4] = &recvBuffer[(g_lexic2eosub[g_iup[j][2]] - VOLUME / 2)];
       }
@@ -180,7 +180,7 @@ int init_dirac_halfspinor() {
         NBPointer[ieo][8 * i + 5] = &recvBuffer[(g_lexic2eosub[g_idn[j][2]] - VOLUME / 2)];
       }
 #endif
-#if ((defined PARALLELXYZ) || (defined PARALLELXYZT))
+#if ((defined TM_PARALLELXYZ) || (defined TM_PARALLELXYZT))
       if (z == LZ - 1) {
         NBPointer[ieo][8 * i + 6] = &recvBuffer[(g_lexic2eosub[g_iup[j][3]] - VOLUME / 2)];
       }
@@ -240,7 +240,7 @@ int init_dirac_halfspinor32() {
         NBPointer32[ieo][8 * i + 2 * mu + 1] =
             &HalfSpinor32[8 * g_lexic2eosub[g_iup[j][mu]] + 2 * mu + 1];
       }
-#if ((defined PARALLELT) || (defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT))
+#if ((defined TM_PARALLELT) || (defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || (defined TM_PARALLELXYZT))
       if (t == 0) {
         k = (g_lexic2eosub[g_idn[j][0]] - VOLUME / 2);
         NBPointer32[ieo][8 * i] = &sendBuffer32[k];
@@ -250,8 +250,8 @@ int init_dirac_halfspinor32() {
         NBPointer32[ieo][8 * i + 1] = &sendBuffer32[k];
       }
 #endif
-#if ((defined PARALLELX) || (defined PARALLELXY) || (defined PARALLELXYZ) || \
-     (defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT))
+#if ((defined TM_PARALLELX) || (defined TM_PARALLELXY) || (defined TM_PARALLELXYZ) || \
+     (defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || (defined TM_PARALLELXYZT))
       if (x == 0) {
         k = (g_lexic2eosub[g_idn[j][1]] - VOLUME / 2);
         NBPointer32[ieo][8 * i + 2] = &sendBuffer32[k];
@@ -261,8 +261,8 @@ int init_dirac_halfspinor32() {
         NBPointer32[ieo][8 * i + 3] = &sendBuffer32[k];
       }
 #endif
-#if ((defined PARALLELXY) || (defined PARALLELXYZ) || (defined PARALLELXYT) || \
-     (defined PARALLELXYZT))
+#if ((defined TM_PARALLELXY) || (defined TM_PARALLELXYZ) || (defined TM_PARALLELXYT) || \
+     (defined TM_PARALLELXYZT))
       if (y == 0) {
         k = (g_lexic2eosub[g_idn[j][2]] - VOLUME / 2);
         NBPointer32[ieo][8 * i + 4] = &sendBuffer32[k];
@@ -272,7 +272,7 @@ int init_dirac_halfspinor32() {
         NBPointer32[ieo][8 * i + 5] = &sendBuffer32[k];
       }
 #endif
-#if ((defined PARALLELXYZ) || (defined PARALLELXYZT))
+#if ((defined TM_PARALLELXYZ) || (defined TM_PARALLELXYZT))
       if (z == 0) {
         k = (g_lexic2eosub[g_idn[j][3]] - VOLUME / 2);
         NBPointer32[ieo][8 * i + 6] = &sendBuffer32[k];
@@ -300,7 +300,7 @@ int init_dirac_halfspinor32() {
       for (mu = 0; mu < 8; mu++) {
         NBPointer32[ieo][8 * i + mu] = &HalfSpinor32[8 * i + mu];
       }
-#if ((defined PARALLELT) || (defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT))
+#if ((defined TM_PARALLELT) || (defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || (defined TM_PARALLELXYZT))
       if (t == T - 1) {
         NBPointer32[ieo][8 * i] = &recvBuffer32[(g_lexic2eosub[g_iup[j][0]] - VOLUME / 2)];
       }
@@ -308,8 +308,8 @@ int init_dirac_halfspinor32() {
         NBPointer32[ieo][8 * i + 1] = &recvBuffer32[(g_lexic2eosub[g_idn[j][0]] - VOLUME / 2)];
       }
 #endif
-#if ((defined PARALLELX) || (defined PARALLELXY) || (defined PARALLELXYZ) || \
-     (defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT))
+#if ((defined TM_PARALLELX) || (defined TM_PARALLELXY) || (defined TM_PARALLELXYZ) || \
+     (defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || (defined TM_PARALLELXYZT))
       if (x == LX - 1) {
         NBPointer32[ieo][8 * i + 2] = &recvBuffer32[(g_lexic2eosub[g_iup[j][1]] - VOLUME / 2)];
       }
@@ -317,8 +317,8 @@ int init_dirac_halfspinor32() {
         NBPointer32[ieo][8 * i + 3] = &recvBuffer32[(g_lexic2eosub[g_idn[j][1]] - VOLUME / 2)];
       }
 #endif
-#if ((defined PARALLELXY) || (defined PARALLELXYZ) || (defined PARALLELXYT) || \
-     (defined PARALLELXYZT))
+#if ((defined TM_PARALLELXY) || (defined TM_PARALLELXYZ) || (defined TM_PARALLELXYT) || \
+     (defined TM_PARALLELXYZT))
       if (y == LY - 1) {
         NBPointer32[ieo][8 * i + 4] = &recvBuffer32[(g_lexic2eosub[g_iup[j][2]] - VOLUME / 2)];
       }
@@ -326,7 +326,7 @@ int init_dirac_halfspinor32() {
         NBPointer32[ieo][8 * i + 5] = &recvBuffer32[(g_lexic2eosub[g_idn[j][2]] - VOLUME / 2)];
       }
 #endif
-#if ((defined PARALLELXYZ) || (defined PARALLELXYZT))
+#if ((defined TM_PARALLELXYZ) || (defined TM_PARALLELXYZT))
       if (z == LZ - 1) {
         NBPointer32[ieo][8 * i + 6] = &recvBuffer32[(g_lexic2eosub[g_iup[j][3]] - VOLUME / 2)];
       }
diff --git a/src/lib/init/init_gauge_field.c b/src/lib/init/init_gauge_field.c
index e30e040bf..1ad4463a8 100644
--- a/src/lib/init/init_gauge_field.c
+++ b/src/lib/init/init_gauge_field.c
@@ -54,7 +54,7 @@ int init_gauge_field(const int V, const int back) {
     g_gauge_field[i] = g_gauge_field[i - 1] + 4;
   }
 
-#if defined _USE_HALFSPINOR
+#if defined TM_USE_HALFSPINOR
   if (back == 1) {
     /*
       g_gauge_field_copy[ieo][PM][sites/2][mu]
@@ -134,7 +134,7 @@ int init_gauge_field_32(const int V, const int back) {
     g_gauge_field_32[i] = g_gauge_field_32[i - 1] + 4;
   }
 
-#if defined _USE_HALFSPINOR
+#if defined TM_USE_HALFSPINOR
   if (back == 1) {
     /*
       g_gauge_field_copy[ieo][PM][sites/2][mu]
@@ -167,7 +167,7 @@ int init_gauge_field_32(const int V, const int back) {
       g_gauge_field_copy_32[1][i] = g_gauge_field_copy_32[1][i - 1] + 4;
     }
   }
-#else /* than _USE_HALFSPINOR  */
+#else /* than TM_USE_HALFSPINOR  */
   if (back == 1) {
     if ((void*)(g_gauge_field_copy_32 = (su3_32**)calloc((VOLUME + RAND), sizeof(su3_32*))) ==
         NULL) {
@@ -217,7 +217,7 @@ void convert_32_gauge_field(su3_32** gf32, su3** gf, int V) {
       gf32[i][mu].c22 = (_Complex float)gf[i][mu].c22;
     }
   }
-#if defined _USE_HALFSPINOR
+#if defined TM_USE_HALFSPINOR
 
 #endif
 }
diff --git a/src/lib/init/init_geometry_indices.c b/src/lib/init/init_geometry_indices.c
index ef54c45de..6b75fc83a 100644
--- a/src/lib/init/init_geometry_indices.c
+++ b/src/lib/init/init_geometry_indices.c
@@ -58,7 +58,7 @@ int init_geometry_indices(const int V) {
   g_eo2lexic = (int *)calloc(V, sizeof(int));
   if ((void *)g_eo2lexic == NULL) return (11);
 
-#if (defined PARALLELXYZT || defined PARALLELXYZ)
+#if (defined TM_PARALLELXYZT || defined TM_PARALLELXYZ)
   g_field_z_ipt_even = (int *)calloc(T * LX * LY, sizeof(int));
   if ((void *)g_field_z_ipt_even == NULL) return (12);
   g_field_z_ipt_odd = (int *)calloc(T * LX * LY, sizeof(int));
@@ -136,7 +136,7 @@ void free_geometry_indices() {
   free(g_eo2lexic);
   free(g_lexic2eosub);
   free(g_lexic2eo);
-#if (defined PARALLELXYZT || defined PARALLELXYZ)
+#if (defined TM_PARALLELXYZT || defined TM_PARALLELXYZ)
   free(g_field_z_ipt_odd);
   free(g_field_z_ipt_even);
 #endif
diff --git a/src/lib/init/init_parallel.h b/src/lib/init/init_parallel.h
index f88ebe1b4..553da6765 100644
--- a/src/lib/init/init_parallel.h
+++ b/src/lib/init/init_parallel.h
@@ -19,8 +19,8 @@
  *
  *******************************************************************************/
 
-#ifndef _INIT_PARALLEL_H
-#define _INIT_PARALLEL_H
+#ifndef TM_INIT_PARALLEL_H
+#define TM_INIT_PARALLEL_H
 
 void init_parallel_and_read_input(int argc, char *argv[], const char input_filename[]);
 
diff --git a/src/lib/init/init_spinor_field.c b/src/lib/init/init_spinor_field.c
index c70945634..6fea95cd8 100644
--- a/src/lib/init/init_spinor_field.c
+++ b/src/lib/init/init_spinor_field.c
@@ -23,7 +23,7 @@
 #include <errno.h>
 #include <stdio.h>
 #include <stdlib.h>
-#ifdef _USE_SHMEM
+#ifdef TM_USE_SHMEM
 #include <mpp/shmem.h>
 #endif
 #include "global.h"
@@ -37,7 +37,7 @@ spinor *sp_tbuff = NULL;
 int init_spinor_field(const int V, const int nr) {
   int i = 0;
 
-#if (defined _USE_SHMEM && !(defined _USE_HALFSPINOR))
+#if (defined TM_USE_SHMEM && !(defined TM_USE_HALFSPINOR))
   if ((void *)(sp = (spinor *)shmalloc((nr * V + 1) * sizeof(spinor))) == NULL) {
     printf("malloc errno : %d\n", errno);
     errno = 0;
@@ -65,7 +65,7 @@ int init_spinor_field(const int V, const int nr) {
 }
 
 void free_spinor_field() {
-#if (defined _USE_SHMEM && !(defined _USE_HALFSPINOR))
+#if (defined TM_USE_SHMEM && !(defined TM_USE_HALFSPINOR))
   shfree(sp);
   shfree(sp_csg);
 #else
@@ -78,7 +78,7 @@ spinor32 *sp32 = NULL;
 int init_spinor_field_32(const int V, const int nr) {
   int i = 0;
 
-#if (defined _USE_SHMEM && !(defined _USE_HALFSPINOR))
+#if (defined TM_USE_SHMEM && !(defined TM_USE_HALFSPINOR))
   if ((void *)(sp32 = (spinor32 *)shmalloc((nr * V + 1) * sizeof(spinor32))) == NULL) {
     printf("malloc errno : %d\n", errno);
     errno = 0;
@@ -106,7 +106,7 @@ int init_spinor_field_32(const int V, const int nr) {
 }
 
 void free_spinor_field_32() {
-#if (defined _USE_SHMEM && !(defined _USE_HALFSPINOR))
+#if (defined TM_USE_SHMEM && !(defined TM_USE_HALFSPINOR))
   shfree(sp32);
 #else
   free(sp32);
@@ -119,7 +119,7 @@ void free_spinor_field_32() {
 int allocate_spinor_field_array(spinor ***spinors, spinor **sp, const int V, const int nr) {
   int i = 0;
 
-#if (defined _USE_SHMEM && !(defined _USE_HALFSPINOR))
+#if (defined TM_USE_SHMEM && !(defined TM_USE_HALFSPINOR))
   if ((void *)((*sp) = (spinor *)shmalloc((nr * V + 1) * sizeof(spinor))) == NULL) {
     printf("malloc errno : %d\n", errno);
     errno = 0;
@@ -147,7 +147,7 @@ int allocate_spinor_field_array(spinor ***spinors, spinor **sp, const int V, con
 }
 
 void free_spinor_field_array(spinor **sp) {
-#if (defined _USE_SHMEM && !(defined _USE_HALFSPINOR))
+#if (defined TM_USE_SHMEM && !(defined TM_USE_HALFSPINOR))
   shfree(*sp);
 #else
   free(*sp);
@@ -165,7 +165,7 @@ int init_csg_field(const int V) {
 
   /* if all histories are zero, we do not need initialisation */
   if (sum != 0) {
-#if (defined _USE_SHMEM && !(defined _USE_HALFSPINOR))
+#if (defined TM_USE_SHMEM && !(defined TM_USE_HALFSPINOR))
     sp_csg = (spinor *)shmalloc((sum * V + 1) * sizeof(spinor));
 #else
     sp_csg = (spinor *)calloc(sum * V + 1, sizeof(spinor));
diff --git a/src/lib/invert_clover_eo.c b/src/lib/invert_clover_eo.c
index e3b6cad31..63e512819 100644
--- a/src/lib/invert_clover_eo.c
+++ b/src/lib/invert_clover_eo.c
@@ -53,7 +53,7 @@
 #ifdef TM_USE_QUDA
 #include "quda_interface.h"
 #endif
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
 #include "DDalphaAMG_interface.h"
 #endif
 #ifdef TM_USE_QPHIX
@@ -81,7 +81,7 @@ int invert_clover_eo(spinor* const Even_new, spinor* const Odd_new, spinor* cons
     }
 #endif
 
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
     if (solver_flag == MG) {
       return MG_solver_eo(Even_new, Odd_new, Even, Odd, precision, max_iter, rel_prec, VOLUME / 2,
                           gf[0], &Msw_full);
@@ -197,7 +197,7 @@ int invert_clover_eo(spinor* const Even_new, spinor* const Odd_new, spinor* cons
                     rel_prec, VOLUME, Qsq);
       Qm(g_spinor_field[DUM_DERI + 1], g_spinor_field[DUM_DERI]);
     }
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
     else if (solver_flag == MG) {
       return MG_solver_eo(Even_new, Odd_new, Even, Odd, precision, max_iter, rel_prec, VOLUME / 2,
                           gf[0], &Msw_full);
diff --git a/src/lib/invert_doublet_eo.c b/src/lib/invert_doublet_eo.c
index 5be48415e..8d5a7dd82 100644
--- a/src/lib/invert_doublet_eo.c
+++ b/src/lib/invert_doublet_eo.c
@@ -50,7 +50,7 @@
 #ifdef TM_USE_QUDA
 #include "quda_interface.h"
 #endif
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
 #include "DDalphaAMG_interface.h"
 #endif
 #ifdef TM_USE_QPHIX
@@ -75,7 +75,7 @@ int invert_doublet_eo(spinor* const Even_new_s, spinor* const Odd_new_s, spinor*
   }
 #endif
 
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
   if (solver_flag == MG) {
     return MG_solver_nd_eo(Even_new_s, Odd_new_s, Even_new_c, Odd_new_c, Even_s, Odd_s, Even_c,
                            Odd_c, precision, max_iter, rel_prec, VOLUME / 2, g_gauge_field,
@@ -162,7 +162,7 @@ int invert_cloverdoublet_eo(spinor* const Even_new_s, spinor* const Odd_new_s,
   }
 #endif
 
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
   if (solver_flag == MG) {
     return MG_solver_nd_eo(Even_new_s, Odd_new_s, Even_new_c, Odd_new_c, Even_s, Odd_s, Even_c,
                            Odd_c, precision, max_iter, rel_prec, VOLUME / 2, g_gauge_field,
diff --git a/src/lib/invert_eo.c b/src/lib/invert_eo.c
index 997cab021..3b7625d48 100644
--- a/src/lib/invert_eo.c
+++ b/src/lib/invert_eo.c
@@ -61,7 +61,7 @@
 #ifdef TM_USE_QPHIX
 #include "qphix_interface.h"
 #endif
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
 #include "DDalphaAMG_interface.h"
 #endif
 
@@ -84,7 +84,7 @@ int invert_eo(spinor *const Even_new, spinor *const Odd_new, spinor *const Even,
   }
 #endif
 
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
   if (solver_flag == MG)
     return MG_solver_eo(Even_new, Odd_new, Even, Odd, precision, max_iter, rel_prec, VOLUME / 2,
                         g_gauge_field, &M_full);
diff --git a/src/lib/io/gauge_read.c b/src/lib/io/gauge_read.c
index b7be10928..de53d9c28 100644
--- a/src/lib/io/gauge_read.c
+++ b/src/lib/io/gauge_read.c
@@ -19,7 +19,7 @@
  ***********************************************************************/
 
 #include "gauge.ih"
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
 #include "DDalphaAMG_interface.h"
 #endif
 
@@ -209,7 +209,7 @@ int read_gauge_field(char *filename, su3 **const gf) {
     // reading a new gauge configuration moves the gauge_id a long way
     // to guarantee that the change is propagated
     update_tm_gauge_id(&g_gauge_state, TM_GAUGE_PROPAGATE_THRESHOLD);
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
     MG_reset();
 #endif
   }
diff --git a/src/lib/io/gauge_read_binary.c b/src/lib/io/gauge_read_binary.c
index b61284cab..473e4d9c7 100644
--- a/src/lib/io/gauge_read_binary.c
+++ b/src/lib/io/gauge_read_binary.c
@@ -22,7 +22,7 @@
 /* FIXME I will first fix this function by using referral.
          Probably should be done better in the future. AD. */
 
-#ifdef HAVE_LIBLEMON
+#ifdef TM_USE_LEMON
 int read_binary_gauge_data(LemonReader* lemonreader, DML_Checksum* checksum,
                            paramsIldgFormat* input, su3** const gf) {
   int t, x, y, z, status = 0;
@@ -144,7 +144,7 @@ int read_binary_gauge_data(LemonReader* lemonreader, DML_Checksum* checksum,
   free(filebuffer);
   return (0);
 }
-#else /* HAVE_LIBLEMON */
+#else /* TM_USE_LEMON */
 int read_binary_gauge_data(LimeReader *limereader, DML_Checksum *checksum, paramsIldgFormat *input,
                            su3 **const gf) {
   int t, x, y, z, status = 0;
@@ -273,4 +273,4 @@ int read_binary_gauge_data(LimeReader *limereader, DML_Checksum *checksum, param
 #endif
   return (0);
 }
-#endif /* HAVE_LIBLEMON */
+#endif /* TM_USE_LEMON */
diff --git a/src/lib/io/gauge_write_binary.c b/src/lib/io/gauge_write_binary.c
index 668b53a17..ad3c7882e 100644
--- a/src/lib/io/gauge_write_binary.c
+++ b/src/lib/io/gauge_write_binary.c
@@ -22,7 +22,7 @@
 /* FIXME I will first fix this function by using referral.
          Probably should be done better in the future. AD. */
 
-#ifdef HAVE_LIBLEMON
+#ifdef TM_USE_LEMON
 int write_binary_gauge_data(LemonWriter* lemonwriter, const int prec, DML_Checksum* checksum) {
   int x, xG, y, yG, z, zG, t, tG, status = 0;
   su3 tmp3[4];
@@ -133,7 +133,7 @@ int write_binary_gauge_data(LemonWriter* lemonwriter, const int prec, DML_Checks
   return 0;
 }
 
-#else /* HAVE_LIBLEMON */
+#else /* TM_USE_LEMON */
 
 int write_binary_gauge_data(LimeWriter* limewriter, const int prec, DML_Checksum* checksum) {
   int x, X, y, Y, z, Z, tt, t0, tag = 0, id = 0, status = 0;
@@ -281,4 +281,4 @@ int write_binary_gauge_data(LimeWriter* limewriter, const int prec, DML_Checksum
 
   return (0);
 }
-#endif /* HAVE_LIBLEMON */
+#endif /* TM_USE_LEMON */
diff --git a/src/lib/io/selector.h b/src/lib/io/selector.h
index 806178bff..236be8d32 100644
--- a/src/lib/io/selector.h
+++ b/src/lib/io/selector.h
@@ -21,11 +21,11 @@
 #define _IO_SELECTOR_H
 
 #include <lime.h>
-#ifdef HAVE_LIBLEMON
+#ifdef TM_USE_LEMON
 #include <lemon.h>
-#endif /* HAVE_LIBLEMON */
+#endif /* TM_USE_LEMON */
 
-#ifdef HAVE_LIBLEMON
+#ifdef TM_USE_LEMON
 #define LIME_FILE MPI_File
 #define WRITER LemonWriter
 #define READER LemonReader
@@ -42,7 +42,7 @@
 #define WriterCloseRecord lemonWriterCloseRecord
 #define DestroyReader lemonDestroyReader
 #define DestroyHeader lemonDestroyHeader
-#else /* HAVE_LIBLEMON */
+#else /* TM_USE_LEMON */
 #define LIME_FILE FILE
 #define WRITER LimeWriter
 #define READER LimeReader
diff --git a/src/lib/io/spinor_read_binary.c b/src/lib/io/spinor_read_binary.c
index 6d459fd2c..81607a700 100644
--- a/src/lib/io/spinor_read_binary.c
+++ b/src/lib/io/spinor_read_binary.c
@@ -19,7 +19,7 @@
 
 #include "spinor.ih"
 
-#ifdef HAVE_LIBLEMON
+#ifdef TM_USE_LEMON
 int read_binary_spinor_data(spinor *const s, spinor *const r, LemonReader *lemonreader,
                             DML_Checksum *checksum) {
   int t, x, y, z, i = 0, status = 0;
@@ -126,7 +126,7 @@ int read_binary_spinor_data(spinor *const s, spinor *const r, LemonReader *lemon
   free(filebuffer);
   return 0;
 }
-#else /* HAVE_LIBLEMON */
+#else /* TM_USE_LEMON */
 int read_binary_spinor_data(spinor *const s, spinor *const r, LimeReader *limereader,
                             DML_Checksum *checksum) {
   int t, x, y, z, i = 0, status = 0;
@@ -212,9 +212,9 @@ int read_binary_spinor_data(spinor *const s, spinor *const r, LimeReader *limere
 #endif
   return (0);
 }
-#endif /* HAVE_LIBLEMON */
+#endif /* TM_USE_LEMON */
 
-#ifdef HAVE_LIBLEMON
+#ifdef TM_USE_LEMON
 int read_binary_spinor_data_l(spinor *const s, LemonReader *lemonreader, DML_Checksum *checksum) {
   int t, x, y, z, i = 0, status = 0;
   int latticeSize[] = {T_global, g_nproc_x * LX, g_nproc_y * LY, g_nproc_z * LZ};
@@ -314,7 +314,7 @@ int read_binary_spinor_data_l(spinor *const s, LemonReader *lemonreader, DML_Che
   free(filebuffer);
   return 0;
 }
-#else /* HAVE_LIBLEMON */
+#else /* TM_USE_LEMON */
 int read_binary_spinor_data_l(spinor *const s, LimeReader *limereader, DML_Checksum *checksum) {
   int t, x, y, z, i = 0, status = 0;
   n_uint64_t bytes;
@@ -390,4 +390,4 @@ int read_binary_spinor_data_l(spinor *const s, LimeReader *limereader, DML_Check
 #endif
   return (0);
 }
-#endif /* HAVE_LIBLEMON */
+#endif /* TM_USE_LEMON */
diff --git a/src/lib/io/spinor_write_binary.c b/src/lib/io/spinor_write_binary.c
index a2bc0cd68..560b5ce65 100644
--- a/src/lib/io/spinor_write_binary.c
+++ b/src/lib/io/spinor_write_binary.c
@@ -19,7 +19,7 @@
 
 #include "spinor.ih"
 
-#ifdef HAVE_LIBLEMON
+#ifdef TM_USE_LEMON
 int write_binary_spinor_data(spinor *const s, spinor *const r, LemonWriter *lemonwriter,
                              DML_Checksum *checksum, int const prec) {
   int x, y, z, t, i = 0, xG, yG, zG, tG, status = 0;
@@ -124,7 +124,7 @@ int write_binary_spinor_data(spinor *const s, spinor *const r, LemonWriter *lemo
   return 0;
 }
 
-#else /* HAVE_LIBLEMON */
+#else /* TM_USE_LEMON */
 int write_binary_spinor_data(spinor *const s, spinor *const r, LimeWriter *limewriter,
                              DML_Checksum *checksum, const int prec) {
   int x, X, y, Y, z, Z, t, t0, tag = 0, id = 0, i = 0, status = 0;
@@ -272,9 +272,9 @@ int write_binary_spinor_data(spinor *const s, spinor *const r, LimeWriter *limew
   }
   return (0);
 }
-#endif /* HAVE_LIBLEMON */
+#endif /* TM_USE_LEMON */
 
-#ifdef HAVE_LIBLEMON
+#ifdef TM_USE_LEMON
 int write_binary_spinor_data_l(spinor *const s, LemonWriter *lemonwriter, DML_Checksum *checksum,
                                int const prec) {
   int x, y, z, t, i = 0, xG, yG, zG, tG, status = 0;
@@ -374,7 +374,7 @@ int write_binary_spinor_data_l(spinor *const s, LemonWriter *lemonwriter, DML_Ch
   return 0;
 }
 
-#else /* HAVE_LIBLEMON */
+#else /* TM_USE_LEMON */
 int write_binary_spinor_data_l(spinor *const s, LimeWriter *limewriter, DML_Checksum *checksum,
                                const int prec) {
   int x, X, y, Y, z, Z, t, t0, tag = 0, id = 0, i = 0, status = 0;
@@ -514,4 +514,4 @@ int write_binary_spinor_data_l(spinor *const s, LimeWriter *limewriter, DML_Chec
   }
   return (0);
 }
-#endif /* HAVE_LIBLEMON */
+#endif /* TM_USE_LEMON */
diff --git a/src/lib/io/spinor_write_propagator_type.c b/src/lib/io/spinor_write_propagator_type.c
index 67356b8f1..77eb17728 100644
--- a/src/lib/io/spinor_write_propagator_type.c
+++ b/src/lib/io/spinor_write_propagator_type.c
@@ -4,9 +4,9 @@ void write_propagator_type(WRITER *writer, const int type) {
   uint64_t bytes;
   char *message;
 
-#ifndef HAVE_LIBLEMON
+#ifndef TM_USE_LEMON
   if (g_cart_id == 0) {
-#endif /* ! HAVE_LIBLEMON */
+#endif /* ! TM_USE_LEMON */
 
     message = (char *)malloc(128);
 
@@ -34,7 +34,7 @@ void write_propagator_type(WRITER *writer, const int type) {
 
     close_writer_record(writer);
     free(message);
-#ifndef HAVE_LIBLEMON
+#ifndef TM_USE_LEMON
   }
-#endif /* ! HAVE_LIBLEMON */
+#endif /* ! TM_USE_LEMON */
 }
diff --git a/src/lib/io/spinor_write_source_format.c b/src/lib/io/spinor_write_source_format.c
index a501ae5d3..e6cf0e782 100644
--- a/src/lib/io/spinor_write_source_format.c
+++ b/src/lib/io/spinor_write_source_format.c
@@ -22,9 +22,9 @@
 void write_source_format(WRITER *writer, paramsSourceFormat const *format) {
   uint64_t bytes;
   char *buf = NULL;
-#ifndef HAVE_LIBLEMON
+#ifndef TM_USE_LEMON
   if (g_cart_id == 0) {
-#endif /* ! HAVE_LIBLEMON */
+#endif /* ! TM_USE_LEMON */
     buf = (char *)malloc(512);
     sprintf(buf,
             "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
@@ -49,7 +49,7 @@ void write_source_format(WRITER *writer, paramsSourceFormat const *format) {
     close_writer_record(writer);
 
     free(buf);
-#ifndef HAVE_LIBLEMON
+#ifndef TM_USE_LEMON
   }
-#endif /* ! HAVE_LIBLEMON */
+#endif /* ! TM_USE_LEMON */
 }
diff --git a/src/lib/io/utils_construct_reader.c b/src/lib/io/utils_construct_reader.c
index 2714455b2..832ede73d 100644
--- a/src/lib/io/utils_construct_reader.c
+++ b/src/lib/io/utils_construct_reader.c
@@ -7,22 +7,22 @@ void construct_reader(READER **reader, char *filename) {
   int status = 0;
 
   if (g_debug_level > 0 && g_cart_id == 0) {
-#ifdef HAVE_LIBLEMON
+#ifdef TM_USE_LEMON
     printf("# Constructing LEMON reader for file %s ...\n", filename);
 #else
     printf("# Constructing LIME reader for file %s ...\n", filename);
 #endif
   }
 
-#ifdef HAVE_LIBLEMON
+#ifdef TM_USE_LEMON
   fh = (MPI_File *)malloc(sizeof(MPI_File));
   status = MPI_File_open(g_cart_grid, filename, MPI_MODE_RDONLY, MPI_INFO_NULL, fh);
   status = (status == MPI_SUCCESS) ? 0 : 1;
-#else  /* HAVE_LIBLEMON */
+#else  /* TM_USE_LEMON */
   fh = fopen(filename, "r");
   status = (fh == NULL) ? 1 : 0;
   fflush(stderr);
-#endif /* HAVE_LIBLEMON */
+#endif /* TM_USE_LEMON */
 
   if (status) {
     kill_with_error(fh, g_cart_id,
@@ -30,11 +30,11 @@ void construct_reader(READER **reader, char *filename) {
                     "rights.\nUnable to continue.\n");
   }
 
-#ifdef HAVE_LIBLEMON
+#ifdef TM_USE_LEMON
   *reader = lemonCreateReader(fh, g_cart_grid);
-#else  /* HAVE_LIBLEMON */
+#else  /* TM_USE_LEMON */
   *reader = limeCreateReader(fh);
-#endif /* HAVE_LIBLEMON */
+#endif /* TM_USE_LEMON */
 
   if (*reader == (READER *)NULL) {
     kill_with_error(fh, g_cart_id, "\nCould not create reader, unable to continue.\n");
diff --git a/src/lib/io/utils_construct_writer.c b/src/lib/io/utils_construct_writer.c
index 4f13900fe..f2fe58bb7 100644
--- a/src/lib/io/utils_construct_writer.c
+++ b/src/lib/io/utils_construct_writer.c
@@ -4,14 +4,14 @@ void construct_writer(WRITER **writer, char *filename, const int append) {
   LIME_FILE *fh = NULL;
   int status = 0;
   if (g_debug_level > 0 && g_cart_id == 0) {
-#ifdef HAVE_LIBLEMON
+#ifdef TM_USE_LEMON
     printf("# Constructing LEMON writer for file %s for append = %d\n", filename, append);
 #else
     printf("# Constructing LIME writer for file %s for append = %d\n", filename, append);
 #endif
   }
 
-#ifdef HAVE_LIBLEMON
+#ifdef TM_USE_LEMON
   fh = (MPI_File *)malloc(sizeof(MPI_File));
   if (append) {
     status = MPI_File_open(g_cart_grid, filename,
@@ -24,7 +24,7 @@ void construct_writer(WRITER **writer, char *filename, const int append) {
   status = (status == MPI_SUCCESS) ? 0 : 1;
   *writer = lemonCreateWriter(fh, g_cart_grid);
   status = status || (writer == NULL);
-#else  /* HAVE_LIBLEMON */
+#else  /* TM_USE_LEMON */
   if (g_cart_id == 0) {
     if (append) {
       fh = fopen(filename, "a");
@@ -35,7 +35,7 @@ void construct_writer(WRITER **writer, char *filename, const int append) {
     *writer = limeCreateWriter(fh);
     status = status || (writer == NULL);
   }
-#endif /* HAVE_LIBLEMON */
+#endif /* TM_USE_LEMON */
 
   if (status) kill_with_error(fh, g_cart_id, "Failed to create writer. Aborting...\n");
 }
diff --git a/src/lib/io/utils_destruct_reader.c b/src/lib/io/utils_destruct_reader.c
index 4ee23d595..2ed391c47 100644
--- a/src/lib/io/utils_destruct_reader.c
+++ b/src/lib/io/utils_destruct_reader.c
@@ -5,10 +5,10 @@ void destruct_reader(READER *reader) {
 
   fh = reader->fp;
   DestroyReader(reader);
-#ifdef HAVE_LIBLEMON
+#ifdef TM_USE_LEMON
   MPI_File_close(fh);
   free(fh); /* NB This assumes construct_writer was used to malloc memory! */
-#else       /* HAVE_LIBLEMON */
+#else       /* TM_USE_LEMON */
   fclose(fh);
-#endif      /* HAVE_LIBLEMON */
+#endif      /* TM_USE_LEMON */
 }
diff --git a/src/lib/io/utils_destruct_writer.c b/src/lib/io/utils_destruct_writer.c
index 840c06b4e..1f6216167 100644
--- a/src/lib/io/utils_destruct_writer.c
+++ b/src/lib/io/utils_destruct_writer.c
@@ -3,16 +3,16 @@
 void destruct_writer(WRITER *writer) {
   LIME_FILE *fh = NULL;
 
-#ifdef HAVE_LIBLEMON
+#ifdef TM_USE_LEMON
   fh = writer->fp;
   lemonDestroyWriter(writer);
   MPI_File_close(fh);
   free(fh); /* NB This assumes construct_writer was used to malloc memory! */
-#else       /* HAVE_LIBLEMON */
+#else       /* TM_USE_LEMON */
   if (g_cart_id == 0) {
     fh = writer->fp;
     limeDestroyWriter(writer);
     fclose(fh);
   }
-#endif      /* HAVE_LIBLEMON */
+#endif      /* TM_USE_LEMON */
 }
diff --git a/src/lib/io/utils_kill_with_error.c b/src/lib/io/utils_kill_with_error.c
index bd697220d..322536bd7 100644
--- a/src/lib/io/utils_kill_with_error.c
+++ b/src/lib/io/utils_kill_with_error.c
@@ -7,11 +7,11 @@ void kill_with_error(LIME_FILE *fh, int const rank, char const *error) {
   }
 
   if (fh != NULL)
-#ifdef HAVE_LIBLEMON
+#ifdef TM_USE_LEMON
     MPI_File_close(fh);
 #else
     fclose(fh);
-#endif /* HAVE_LIBLEMON */
+#endif /* TM_USE_LEMON */
 
 #ifdef TM_USE_MPI
   MPI_Abort(MPI_COMM_WORLD, 1);
diff --git a/src/lib/io/utils_write_first_message.c b/src/lib/io/utils_write_first_message.c
index 983b92b0a..287d67c37 100644
--- a/src/lib/io/utils_write_first_message.c
+++ b/src/lib/io/utils_write_first_message.c
@@ -30,28 +30,28 @@ int write_first_messages(FILE* parameterfile, char const* const executable,
            TMLQCD_PACKAGE_VERSION, git_hash);
   printf("%s", message);
   fprintf(parameterfile, "%s", message);
-#ifdef _GAUGE_COPY
-  printf("# The code is compiled with -D_GAUGE_COPY\n");
-  fprintf(parameterfile, "# The code is compiled with -D_GAUGE_COPY\n");
+#ifdef TM_GAUGE_COPY
+  printf("# The code is compiled with -DTM_GAUGE_COPY\n");
+  fprintf(parameterfile, "# The code is compiled with -DTM_GAUGE_COPY\n");
 #endif
-#ifdef _USE_HALFSPINOR
-  printf("# The code is compiled with -D_USE_HALFSPINOR\n");
-  fprintf(parameterfile, "# The code is compiled with -D_USE_HALFSPINOR\n");
+#ifdef TM_USE_HALFSPINOR
+  printf("# The code is compiled with -DTM_USE_HALFSPINOR\n");
+  fprintf(parameterfile, "# The code is compiled with -DTM_USE_HALFSPINOR\n");
 #endif
-#ifdef _USE_SHMEM
-  printf("# the code is compiled with -D_USE_SHMEM\n");
-  fprintf(parameterfile, "# the code is compiled with -D_USE_SHMEM\n");
-#ifdef _PERSISTENT
+#ifdef TM_USE_SHMEM
+  printf("# the code is compiled with -DTM_USE_SHMEM\n");
+  fprintf(parameterfile, "# the code is compiled with -DTM_USE_SHMEM\n");
+#ifdef TM_PERSISTENT
   printf("# the code is compiled for persistent MPI calls (halfspinor only)\n");
   fprintf(parameterfile, "# the code is compiled for persistent MPI calls (halfspinor only)\n");
 #endif
 #endif
 #ifdef TM_USE_MPI
-#ifdef _NON_BLOCKING
+#ifdef TM_NON_BLOCKING
   printf("# the code is compiled for non-blocking MPI calls (spinor and gauge)\n");
   fprintf(parameterfile, "# the code is compiled for non-blocking MPI calls (spinor and gauge)\n");
 #endif
-#ifdef HAVE_LIBLEMON
+#ifdef TM_USE_LEMON
   printf("# the code is compiled with MPI IO / Lemon\n");
   fprintf(parameterfile, "# the code is compiled with MPI IO / Lemon\n");
 #endif
diff --git a/src/lib/io/utils_write_header.c b/src/lib/io/utils_write_header.c
index 7f5f85c83..be8ae4ade 100644
--- a/src/lib/io/utils_write_header.c
+++ b/src/lib/io/utils_write_header.c
@@ -23,9 +23,9 @@ void write_header(WRITER *writer, int MB, int ME, char const *type, uint64_t byt
   int status;
   RECORD_HEADER *header;
 
-#ifndef HAVE_LIBLEMON
+#ifndef TM_USE_LEMON
   if (g_cart_id == 0) {
-#endif /* ! HAVE_LIBLEMON */
+#endif /* ! TM_USE_LEMON */
     /* Nasty (but probably harmless) hack to get rid of const qualifier - the original c-lime was
      * sloppy here. */
     header = CreateHeader(MB, ME, (char *)type, bytes);
@@ -35,8 +35,8 @@ void write_header(WRITER *writer, int MB, int ME, char const *type, uint64_t byt
     if (status != LIME_SUCCESS) {
       kill_with_error(writer->fp, g_cart_id, "Header writing error. Aborting\n");
     }
-#ifndef HAVE_LIBLEMON
+#ifndef TM_USE_LEMON
   }
-#endif /* ! HAVE_LIBLEMON */
+#endif /* ! TM_USE_LEMON */
   return;
 }
diff --git a/src/lib/io/utils_write_message.c b/src/lib/io/utils_write_message.c
index b71cdbbce..d346c9a9c 100644
--- a/src/lib/io/utils_write_message.c
+++ b/src/lib/io/utils_write_message.c
@@ -23,9 +23,9 @@ int write_message(WRITER *writer, char const *buffer, uint64_t bytes) {
   int status;
   n_uint64_t bytesWritten = bytes;
 
-#ifndef HAVE_LIBLEMON
+#ifndef TM_USE_LEMON
   if (g_cart_id == 0) {
-#endif /* ! HAVE_LIBLEMON */
+#endif /* ! TM_USE_LEMON */
     if (buffer == (char *)NULL) return (0);
 
 #ifdef TM_USE_MPI
@@ -35,8 +35,8 @@ int write_message(WRITER *writer, char const *buffer, uint64_t bytes) {
 #endif
     if (status != LIME_SUCCESS || bytes != bytesWritten)
       kill_with_error(writer->fp, g_cart_id, "I/O error on writing message. Aborting...\n");
-#ifndef HAVE_LIBLEMON
+#ifndef TM_USE_LEMON
   }
-#endif /* ! HAVE_LIBLEMON */
+#endif /* ! TM_USE_LEMON */
   return (0);
 }
diff --git a/src/lib/linalg/blas.h b/src/lib/linalg/blas.h
index a972e5029..110afb01f 100644
--- a/src/lib/linalg/blas.h
+++ b/src/lib/linalg/blas.h
@@ -23,8 +23,8 @@
 #include <complex.h>
 #include "linalg/fortran.h"
 
-#if defined CRAY || defined HITACHI
-/* On the CRAY is all different, of course... */
+#if defined TM_CRAY || defined HITACHI
+/* On the CRAY everything is different, of course... */
 #include "fortran.h"
 #define zgemm ZGEMM
 #define zgemv ZGEMV
diff --git a/src/lib/linalg/lapack.h b/src/lib/linalg/lapack.h
index 1c7f4ce7a..a651b07ae 100644
--- a/src/lib/linalg/lapack.h
+++ b/src/lib/linalg/lapack.h
@@ -23,7 +23,7 @@
 #include <complex.h>
 #include "linalg/fortran.h"
 
-#if defined CRAY || defined HITACHI
+#if defined TM_CRAY || defined HITACHI
 #define zgels CGELS
 #define zgesv CGESV
 #define zgeevx CGEEVX
diff --git a/src/lib/little_D.c b/src/lib/little_D.c
index 370e7583a..2bee49824 100644
--- a/src/lib/little_D.c
+++ b/src/lib/little_D.c
@@ -276,11 +276,11 @@ extern int waitcount;
 void init_little_field_exchange(_Complex double *w) {
 #ifdef TM_USE_MPI
   int i = 0;
-#if (defined PARALLELT || defined PARALLELX)
+#if (defined TM_PARALLELT || defined TM_PARALLELX)
   int no_dirs = 2;
-#elif (defined PARALLELXT || defined PARALLELXY || defined PARALLELXYZ)
+#elif (defined TM_PARALLELXT || defined TM_PARALLELXY || defined TM_PARALLELXYZ)
   int no_dirs = 4;
-#elif (defined PARALLELXYT || defined PARALLELXYZT)
+#elif (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   int no_dirs = 6;
 #endif
   if (waitcount != 0) {
@@ -304,7 +304,7 @@ void init_little_field_exchange(_Complex double *w) {
               g_nb_list[i], i + 1, g_cart_grid, &lrequests[2 * i + 3]);
     waitcount += 4;
   }
-#ifdef PARALLELXYZT
+#ifdef TM_PARALLELXYZT
   /* send to the right, receive from the left */
   i = 6;
   MPI_Isend((void *)(w + g_N_s), g_N_s, MPI_DOUBLE_COMPLEX, g_nb_list[i], i, g_cart_grid,
diff --git a/src/lib/meas/polyakov_loop.c b/src/lib/meas/polyakov_loop.c
index 9108bcb99..25deea402 100644
--- a/src/lib/meas/polyakov_loop.c
+++ b/src/lib/meas/polyakov_loop.c
@@ -446,7 +446,7 @@ int polyakov_loop_dir(const int nstore /* in  */, const int dir /* in  */) {
 
   /* (1) collect contributions from different time/z slices to nodes with rank=0
      in spatial volume/space-time slices */
-#ifndef PARALLELXYZT
+#ifndef TM_PARALLELXYZT
   if (dir == 0) {
 #endif
     tmp_ray = (su3 *)calloc(VOL3, sizeof(su3)); /* */
@@ -456,7 +456,7 @@ int polyakov_loop_dir(const int nstore /* in  */, const int dir /* in  */) {
     }
 
     MPI_Reduce(tmp_loc, tmp_ray, VOL3, mpi_su3, mpi_reduce_su3_ray, 0, ray);
-#ifndef PARALLELXYZT
+#ifndef TM_PARALLELXYZT
   }
 #endif
 
@@ -475,7 +475,7 @@ int polyakov_loop_dir(const int nstore /* in  */, const int dir /* in  */) {
     ks = 0.0;
 
 #ifdef TM_USE_MPI
-#ifdef PARALLELXYZT
+#ifdef TM_PARALLELXYZT
     u = tmp_ray;
 #else
     if (dir == 0) {
@@ -502,11 +502,11 @@ int polyakov_loop_dir(const int nstore /* in  */, const int dir /* in  */) {
 #ifdef TM_USE_MPI
     MPI_Reduce(&pl_tmp, &pl, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, 0, slice);
   }
-#ifndef PARALLELXYZT
+#ifndef TM_PARALLELXYZT
   if (dir == 0) {
 #endif
     free(tmp_ray);
-#ifndef PARALLELXYZT
+#ifndef TM_PARALLELXYZT
   }
 #endif
 
diff --git a/src/lib/measure_gauge_action.c b/src/lib/measure_gauge_action.c
index 6a558a51b..1f7cb6ad5 100644
--- a/src/lib/measure_gauge_action.c
+++ b/src/lib/measure_gauge_action.c
@@ -26,9 +26,7 @@
  *     Returns the value of the action
  ************************************************************************/
 
-#ifdef HAVE_CONFIG_H
 #include <tmlqcd_config.h>
-#endif
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
diff --git a/src/lib/mpi_init.c b/src/lib/mpi_init.c
index 2bbbde315..cc09fd4cd 100644
--- a/src/lib/mpi_init.c
+++ b/src/lib/mpi_init.c
@@ -25,7 +25,7 @@
 #ifdef TM_USE_MPI
 #include <mpi.h>
 #endif
-#ifdef _USE_SHMEM
+#ifdef TM_USE_SHMEM
 #include <mpp/shmem.h>
 #endif
 #include "global.h"
@@ -134,7 +134,7 @@ MPI_Datatype halffield_y_slice_gath;
 
 MPI_Datatype halffield_z_slice_cont;
 
-#if (defined PARALLELXYZT || defined PARALLELXYZ)
+#if (defined TM_PARALLELXYZT || defined TM_PARALLELXYZ)
 MPI_Datatype field_z_slice_even_dn;
 MPI_Datatype field_z_slice_even_up;
 MPI_Datatype field_z_slice_odd_dn;
@@ -188,60 +188,60 @@ void tmlqcd_mpi_init(int argc, char *argv[]) {
   }
 
 #ifdef TM_USE_MPI
-#ifdef _USE_SHMEM
+#ifdef TM_USE_SHMEM
   /* we need that the PE number in MPI_COMM_WORL  */
   /* exactly correspond to the one in g_cart_grid */
   reorder = 0;
 #endif
 
-#ifndef FIXEDVOLUME
+#ifndef TM_FIXEDVOLUME
   N_PROC_T = 0; /* the other N_PROC_? are read from input, if not constraint below */
                 /* N_PROC_T will be set by MPI_Dims_create, if not constraint below */
 #endif
 
-#if defined PARALLELT
+#if defined TM_PARALLELT
   ndims = 1;
-#ifndef FIXEDVOLUME
+#ifndef TM_FIXEDVOLUME
   N_PROC_X = 1;
   N_PROC_Y = 1;
   N_PROC_Z = 1;
 #endif
 #endif
-#if defined PARALLELX
+#if defined TM_PARALLELX
   ndims = 1;
-#ifndef FIXEDVOLUME
+#ifndef TM_FIXEDVOLUME
   N_PROC_T = 1;
   N_PROC_Y = 1;
   N_PROC_Z = 1;
 #endif
 #endif
-#if defined PARALLELXT
+#if defined TM_PARALLELXT
   ndims = 2;
-#ifndef FIXEDVOLUME
+#ifndef TM_FIXEDVOLUME
   N_PROC_Y = 1;
   N_PROC_Z = 1;
 #endif
 #endif
-#if defined PARALLELXY
+#if defined TM_PARALLELXY
   ndims = 2;
-#ifndef FIXEDVOLUME
+#ifndef TM_FIXEDVOLUME
   N_PROC_T = 1;
   N_PROC_Z = 1;
 #endif
 #endif
-#if defined PARALLELXYT
+#if defined TM_PARALLELXYT
   ndims = 3;
-#ifndef FIXEDVOLUME
+#ifndef TM_FIXEDVOLUME
   N_PROC_Z = 1;
 #endif
 #endif
-#if defined PARALLELXYZ
+#if defined TM_PARALLELXYZ
   ndims = 3;
-#ifndef FIXEDVOLUME
+#ifndef TM_FIXEDVOLUME
   N_PROC_T = 1;
 #endif
 #endif
-#if defined PARALLELXYZT
+#if defined TM_PARALLELXYZT
   ndims = 4;
 #endif
   dims[0] = N_PROC_T;
@@ -278,7 +278,7 @@ void tmlqcd_mpi_init(int argc, char *argv[]) {
     exit(-1);
   }
 
-#ifndef FIXEDVOLUME
+#ifndef TM_FIXEDVOLUME
   N_PROC_T = g_nproc_t;
   N_PROC_X = g_nproc_x;
   N_PROC_Y = g_nproc_y;
@@ -289,42 +289,42 @@ void tmlqcd_mpi_init(int argc, char *argv[]) {
   LZ = LZ / g_nproc_z;
   VOLUME = (T * LX * LY * LZ);
   SPACEVOLUME = VOLUME / T;
-#ifdef PARALLELT
+#ifdef TM_PARALLELT
   RAND = (2 * LX * LY * LZ);
   EDGES = 0;
-#elif defined PARALLELX
+#elif defined TM_PARALLELX
   RAND = (2 * T * LY * LZ);
   EDGES = 0;
-#elif defined PARALLELXT
+#elif defined TM_PARALLELXT
   RAND = 2 * LZ * (LY * LX + T * LY);
   EDGES = 4 * LZ * LY;
-#elif defined PARALLELXY
+#elif defined TM_PARALLELXY
   RAND = 2 * LZ * T * (LX + LY);
   EDGES = 4 * LZ * T;
-#elif defined PARALLELXYT
+#elif defined TM_PARALLELXYT
   RAND = 2 * LZ * (LY * LX + T * LY + T * LX);
   EDGES = 4 * LZ * (LY + T + LX);
-#elif defined PARALLELXYZ
+#elif defined TM_PARALLELXYZ
   RAND = 2 * T * (LY * LZ + LX * LZ + LX * LY);
   EDGES = 4 * T * (LX + LY + LZ);
-#elif defined PARALLELXYZT
+#elif defined TM_PARALLELXYZT
   RAND = 2 * LZ * LY * LX + 2 * LZ * T * LY + 2 * LZ * T * LX + 2 * T * LX * LY;
   EDGES = 4 * LZ * LY + 4 * LZ * T + 4 * LZ * LX + 4 * LY * T + 4 * LY * LX + 4 * T * LX;
-#else  /* ifdef PARALLELT */
+#else  /* ifdef TM_PARALLELT */
   RAND = 0;
   EDGES = 0;
-#endif /* ifdef PARALLELT */
+#endif /* ifdef TM_PARALLELT */
   /* Note that VOLUMEPLUSRAND is not always equal to VOLUME+RAND */
   /* VOLUMEPLUSRAND rather includes the edges */
   VOLUMEPLUSRAND = VOLUME + RAND + EDGES;
   SPACERAND = RAND / T;
-#endif /* ifndef FIXEDVOLUME */
+#endif /* ifndef TM_FIXEDVOLUME */
   g_dbw2rand = (RAND + 2 * EDGES);
 
-#if (defined PARALLELXYZT || defined PARALLELXYZ)
+#if (defined TM_PARALLELXYZT || defined TM_PARALLELXYZ)
   field_buffer_z = (spinor *)malloc(T * LX * LY / 2 * sizeof(spinor));
   field_buffer_z2 = (spinor *)malloc(T * LX * LY / 2 * sizeof(spinor));
-#ifdef _NON_BLOCKING
+#ifdef TM_NON_BLOCKING
   field_buffer_z3 = (spinor *)malloc(T * LX * LY / 2 * sizeof(spinor));
   field_buffer_z4 = (spinor *)malloc(T * LX * LY / 2 * sizeof(spinor));
 #endif
@@ -347,23 +347,23 @@ void tmlqcd_mpi_init(int argc, char *argv[]) {
   for (i = 0; i < 8; i++) {
     g_nb_list[i] = g_cart_id;
   }
-#if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELT || defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   MPI_Cart_shift(g_cart_grid, 0, 1, &g_nb_t_dn, &g_nb_t_up);
   g_nb_list[0] = g_nb_t_up;
   g_nb_list[1] = g_nb_t_dn;
 #endif
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELX || \
-     defined PARALLELXY || defined PARALLELXYZ)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT || defined TM_PARALLELX || \
+     defined TM_PARALLELXY || defined TM_PARALLELXYZ)
   MPI_Cart_shift(g_cart_grid, 1, 1, &g_nb_x_dn, &g_nb_x_up);
   g_nb_list[2] = g_nb_x_up;
   g_nb_list[3] = g_nb_x_dn;
 #endif
-#if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT || defined TM_PARALLELXY || defined TM_PARALLELXYZ)
   MPI_Cart_shift(g_cart_grid, 2, 1, &g_nb_y_dn, &g_nb_y_up);
   g_nb_list[4] = g_nb_y_up;
   g_nb_list[5] = g_nb_y_dn;
 #endif
-#if (defined PARALLELXYZT || defined PARALLELXYZ)
+#if (defined TM_PARALLELXYZT || defined TM_PARALLELXYZ)
   MPI_Cart_shift(g_cart_grid, 3, 1, &g_nb_z_dn, &g_nb_z_up);
   g_nb_list[6] = g_nb_z_up;
   g_nb_list[7] = g_nb_z_dn;
@@ -669,7 +669,7 @@ void tmlqcd_mpi_init(int argc, char *argv[]) {
   g_mpi_ST_rank = 0;
   g_stdio_proc = 0;
 
-#ifndef FIXEDVOLUME
+#ifndef TM_FIXEDVOLUME
   T = T_global;
   VOLUME = (T * LX * LY * LZ);
   SPACEVOLUME = VOLUME / T;
@@ -687,7 +687,7 @@ void tmlqcd_mpi_init(int argc, char *argv[]) {
 
   /* Here we perform some checks in order not to */
   /* run into trouble later                      */
-#if (defined PARALLELXYZT || defined PARALLELXYZ)
+#if (defined TM_PARALLELXYZT || defined TM_PARALLELXYZ)
   if ((T * LX * LY) % 2 != 0 && even_odd_flag == 1) {
     fprintf(stderr, "T*LX*LY must be even!\nAborting prgram...\n");
 #ifdef TM_USE_MPI
diff --git a/src/lib/mpi_init.h b/src/lib/mpi_init.h
index dce6dfad7..d9476e662 100644
--- a/src/lib/mpi_init.h
+++ b/src/lib/mpi_init.h
@@ -108,8 +108,8 @@ extern MPI_Datatype halffield_y_slice_gath;
 extern MPI_Datatype halffield_z_slice_cont;
 
 
-#if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || \
-     defined PARALLELXYZ)
+#if (defined TM_PARALLELT || defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT || \
+     defined TM_PARALLELXYZ)
 extern MPI_Datatype field_z_slice_even_dn;
 extern MPI_Datatype field_z_slice_even_up;
 extern MPI_Datatype field_z_slice_odd_dn;
diff --git a/src/lib/operator.c b/src/lib/operator.c
index 6b6a94df2..e15a97701 100644
--- a/src/lib/operator.c
+++ b/src/lib/operator.c
@@ -63,7 +63,7 @@
 #ifdef TM_USE_QUDA
 #include "quda_interface.h"
 #endif
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
 #include "DDalphaAMG_interface.h"
 #endif
 
diff --git a/src/lib/operator/D_psi_body.c b/src/lib/operator/D_psi_body.c
index 06bde0fc5..b5acd1158 100644
--- a/src/lib/operator/D_psi_body.c
+++ b/src/lib/operator/D_psi_body.c
@@ -283,7 +283,7 @@ void _PSWITCH(D_psi)(_PTSWITCH(spinor) *const P, _PTSWITCH(spinor) *const Q) {
   _C_TYPE ALIGN32 phase_2l = (_C_TYPE)phase_2;
   _C_TYPE ALIGN32 phase_3l = (_C_TYPE)phase_3;
 
-#ifdef _GAUGE_COPY
+#ifdef TM_GAUGE_COPY
   if (_PSWITCH(g_update_gauge_copy)) {
     _PSWITCH(update_backward_gauge)(_PSWITCH(g_gauge_field));
   }
diff --git a/src/lib/operator/Hopping_Matrix.c b/src/lib/operator/Hopping_Matrix.c
index a8da9e810..8b106e10a 100644
--- a/src/lib/operator/Hopping_Matrix.c
+++ b/src/lib/operator/Hopping_Matrix.c
@@ -38,11 +38,11 @@
  *
  *  Structure of top level precompiler directives
  *
- * - defining _USE_HALFSPINOR implies that we also use
+ * - defining TM_USE_HALFSPINOR implies that we also use
  *   a "gauge copy"
  *
- * - such that we are checking for the _USE_GAUGECOPY feature seperatly in the
+ * - such that we are checking for the TM_GAUGE_COPY feature separately in the
- *   ELSE branch of the "if defined _USE_HALFSPINOR" statement
+ *   ELSE branch of the "if defined TM_USE_HALFSPINOR" statement
  *
  ****************************************************************/
 
@@ -64,11 +64,11 @@
 #include "operator/Hopping_Matrix.h"
 #include "update_backward_gauge.h"
 
-#if defined _USE_HALFSPINOR
+#if defined TM_USE_HALFSPINOR
 #include "operator/halfspinor_hopping.h"
 
 void Hopping_Matrix(const int ieo, spinor* const l, spinor* const k) {
-#ifdef _GAUGE_COPY
+#ifdef TM_GAUGE_COPY
   if (g_update_gauge_copy) {
     update_backward_gauge(g_gauge_field);
   }
@@ -88,10 +88,10 @@ void Hopping_Matrix(const int ieo, spinor* const l, spinor* const k) {
   return;
 }
 
-#else /* thats _USE_HALFSPINOR */
+#else /* thats TM_USE_HALFSPINOR */
 void Hopping_Matrix(const int ieo, spinor* const l, spinor* const k) {
 
-#ifdef _GAUGE_COPY
+#ifdef TM_GAUGE_COPY
   if (g_update_gauge_copy) {
     update_backward_gauge(g_gauge_field);
   }
@@ -114,4 +114,4 @@ void Hopping_Matrix(const int ieo, spinor* const l, spinor* const k) {
   return;
 }
 
-#endif /* thats _USE_HALFSPINOR */
+#endif /* thats TM_USE_HALFSPINOR */
diff --git a/src/lib/operator/Hopping_Matrix_32.c b/src/lib/operator/Hopping_Matrix_32.c
index d1fbe78c7..1198d52bb 100644
--- a/src/lib/operator/Hopping_Matrix_32.c
+++ b/src/lib/operator/Hopping_Matrix_32.c
@@ -39,11 +39,11 @@
  *
  *  Structure of top level precompiler directives
  *
- * - defining _USE_HALFSPINOR implies that we also use
+ * - defining TM_USE_HALFSPINOR implies that we also use
  *   a "gauge copy"
  *
- * - such that we are checking for the _USE_GAUGECOPY feature seperatly in the
+ * - such that we are checking for the TM_GAUGE_COPY feature separately in the
- *   ELSE branch of the "if defined _USE_HALFSPINOR" statement
+ *   ELSE branch of the "if defined TM_USE_HALFSPINOR" statement
  *
  ****************************************************************/
 
@@ -66,13 +66,13 @@
 #include "update_backward_gauge.h"
 #include "operator/Hopping_Matrix_32.h"
 
-#if defined _USE_HALFSPINOR
+#if defined TM_USE_HALFSPINOR
 #include "operator/halfspinor_hopping_32.h"
 #endif
 
 void Hopping_Matrix_32_orphaned(const int ieo, spinor32* const l, spinor32* const k) {
-#if defined _USE_HALFSPINOR
-#ifdef _GAUGE_COPY
+#if defined TM_USE_HALFSPINOR
+#ifdef TM_GAUGE_COPY
   if (g_update_gauge_copy_32) {
     update_backward_gauge_32_orphaned(g_gauge_field_32);
   }
diff --git a/src/lib/operator/Hopping_Matrix_nocom.c b/src/lib/operator/Hopping_Matrix_nocom.c
index dce8ad591..c7814bbb0 100644
--- a/src/lib/operator/Hopping_Matrix_nocom.c
+++ b/src/lib/operator/Hopping_Matrix_nocom.c
@@ -48,8 +48,8 @@
 
 #define Hopping_Matrix Hopping_Matrix_nocom
 #define _NO_COMM 1
-#ifdef _KOJAK_INST
-#undef _KOJAK_INST
+#ifdef TM_KOJAK_INST
+#undef TM_KOJAK_INST
 #endif
 
 #include "Hopping_Matrix.c"
diff --git a/src/lib/operator/halfspinor_body.c b/src/lib/operator/halfspinor_body.c
index 542292b1d..a2c54c7e4 100644
--- a/src/lib/operator/halfspinor_body.c
+++ b/src/lib/operator/halfspinor_body.c
@@ -30,7 +30,7 @@ halfspinor* restrict* phi ALIGN;
 halfspinor32* restrict* phi32 ALIGN;
 _declare_hregs();
 
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst begin(hoppingmatrix)
 #endif
 
@@ -320,6 +320,6 @@ if (g_sloppy_precision == 1 && g_sloppy_precision_flag == 1) {
 #endif
   }
 }
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst end(hoppingmatrix)
 #endif
diff --git a/src/lib/operator/hopping_body_dbl.c b/src/lib/operator/hopping_body_dbl.c
index 02edbaec7..5ae88044e 100644
--- a/src/lib/operator/hopping_body_dbl.c
+++ b/src/lib/operator/hopping_body_dbl.c
@@ -45,7 +45,7 @@ if (ieo == 0) {
 #ifndef TM_USE_OMP
 hi = &g_hi[16 * ioff];
 
-#if ((defined _GAUGE_COPY))
+#if ((defined TM_GAUGE_COPY))
 up = &g_gauge_field_copy[ioff][0];
 #else
 up = &g_gauge_field[(*hi)][0];
@@ -62,7 +62,7 @@ hi++;
 for (int icx = ioff; icx < (VOLUME / 2 + ioff); icx++) {
 #ifdef TM_USE_OMP
   hi = &g_hi[16 * icx];
-#if ((defined _GAUGE_COPY))
+#if ((defined TM_GAUGE_COPY))
   up = &g_gauge_field_copy[icx][0];
 #else
   up = &g_gauge_field[(*hi)][0];
@@ -76,7 +76,7 @@ for (int icx = ioff; icx < (VOLUME / 2 + ioff); icx++) {
   pn = p + (icx - ioff);
 #endif
   /*********************** direction +t ************************/
-#if (!defined _GAUGE_COPY)
+#if (!defined TM_GAUGE_COPY)
   um = &g_gauge_field[(*hi)][0];
 #else
   um = up + 1;
@@ -88,7 +88,7 @@ for (int icx = ioff; icx < (VOLUME / 2 + ioff); icx++) {
   _hop_t_p();
 
   /*********************** direction -t ************************/
-#if ((defined _GAUGE_COPY))
+#if ((defined TM_GAUGE_COPY))
   up = um + 1;
 #else
   up += 1;
@@ -99,7 +99,7 @@ for (int icx = ioff; icx < (VOLUME / 2 + ioff); icx++) {
   _hop_t_m();
 
   /*********************** direction +1 ************************/
-#ifndef _GAUGE_COPY
+#ifndef TM_GAUGE_COPY
   um = &g_gauge_field[(*hi)][1];
 #else
   um = up + 1;
@@ -111,7 +111,7 @@ for (int icx = ioff; icx < (VOLUME / 2 + ioff); icx++) {
   _hop_x_p();
 
   /*********************** direction -1 ************************/
-#if ((defined _GAUGE_COPY))
+#if ((defined TM_GAUGE_COPY))
   up = um + 1;
 #else
   up += 1;
@@ -122,7 +122,7 @@ for (int icx = ioff; icx < (VOLUME / 2 + ioff); icx++) {
   _hop_x_m();
 
   /*********************** direction +2 ************************/
-#ifndef _GAUGE_COPY
+#ifndef TM_GAUGE_COPY
   um = &g_gauge_field[(*hi)][2];
 #else
   um = up + 1;
@@ -134,7 +134,7 @@ for (int icx = ioff; icx < (VOLUME / 2 + ioff); icx++) {
   _hop_y_p();
 
   /*********************** direction -2 ************************/
-#if ((defined _GAUGE_COPY))
+#if ((defined TM_GAUGE_COPY))
   up = um + 1;
 #else
   up += 1;
@@ -145,7 +145,7 @@ for (int icx = ioff; icx < (VOLUME / 2 + ioff); icx++) {
   _hop_y_m();
 
   /*********************** direction +3 ************************/
-#ifndef _GAUGE_COPY
+#ifndef TM_GAUGE_COPY
   um = &g_gauge_field[(*hi)][3];
 #else
   um = up + 1;
@@ -158,7 +158,7 @@ for (int icx = ioff; icx < (VOLUME / 2 + ioff); icx++) {
 
   /*********************** direction -3 ************************/
 #ifndef TM_USE_OMP
-#if ((defined _GAUGE_COPY))
+#if ((defined TM_GAUGE_COPY))
   up = um + 1;
 #else
   up = &g_gauge_field[(*hi)][0];
diff --git a/src/lib/operator/hopping_sgl.c b/src/lib/operator/hopping_sgl.c
index 5067ab13d..062507158 100644
--- a/src/lib/operator/hopping_sgl.c
+++ b/src/lib/operator/hopping_sgl.c
@@ -37,7 +37,7 @@ void Hopping_Matrix(int ieo, spinor32* const l, spinor32* const k) {
   spinor32* restrict r, * restrict sp, * restrict sm;
   spinor32 temp;
 
-#ifdef _GAUGE_COPY
+#ifdef TM_GAUGE_COPY
   if (g_update_gauge_copy) {
     update_backward_gauge();
   }
@@ -72,7 +72,7 @@ void Hopping_Matrix(int ieo, spinor32* const l, spinor32* const k) {
     icy = g_lexic2eosub[iy];
 
     sp = k + icy;
-#if ((defined _GAUGE_COPY))
+#if ((defined TM_GAUGE_COPY))
     up = &g_gauge_field_copy[icx][0];
 #else
     up = &g_gauge_field[ix][0];
@@ -100,7 +100,7 @@ void Hopping_Matrix(int ieo, spinor32* const l, spinor32* const k) {
     icy = g_lexic2eosub[iy];
 
     sm = k + icy;
-#if ((defined _GAUGE_COPY))
+#if ((defined TM_GAUGE_COPY))
     um = up + 1;
 #else
     um = &g_gauge_field[iy][0];
@@ -129,7 +129,7 @@ void Hopping_Matrix(int ieo, spinor32* const l, spinor32* const k) {
 
     sp = k + icy;
 
-#if ((defined _GAUGE_COPY))
+#if ((defined TM_GAUGE_COPY))
     up = um + 1;
 #else
     up += 1;
@@ -157,7 +157,7 @@ void Hopping_Matrix(int ieo, spinor32* const l, spinor32* const k) {
     icy = g_lexic2eosub[iy];
 
     sm = k + icy;
-#ifndef _GAUGE_COPY
+#ifndef TM_GAUGE_COPY
     um = &g_gauge_field[iy][1];
 #else
     um = up + 1;
@@ -185,7 +185,7 @@ void Hopping_Matrix(int ieo, spinor32* const l, spinor32* const k) {
     icy = g_lexic2eosub[iy];
 
     sp = k + icy;
-#if ((defined _GAUGE_COPY))
+#if ((defined TM_GAUGE_COPY))
     up = um + 1;
 #else
     up += 1;
@@ -212,7 +212,7 @@ void Hopping_Matrix(int ieo, spinor32* const l, spinor32* const k) {
     icy = g_lexic2eosub[iy];
 
     sm = k + icy;
-#ifndef _GAUGE_COPY
+#ifndef TM_GAUGE_COPY
     um = &g_gauge_field[iy][2];
 #else
     um = up + 1;
@@ -240,7 +240,7 @@ void Hopping_Matrix(int ieo, spinor32* const l, spinor32* const k) {
     icy = g_lexic2eosub[iy];
 
     sp = k + icy;
-#if ((defined _GAUGE_COPY))
+#if ((defined TM_GAUGE_COPY))
     up = um + 1;
 #else
     up += 1;
@@ -267,7 +267,7 @@ void Hopping_Matrix(int ieo, spinor32* const l, spinor32* const k) {
     icy = g_lexic2eosub[iy];
 
     sm = k + icy;
-#ifndef _GAUGE_COPY
+#ifndef TM_GAUGE_COPY
     um = &g_gauge_field[iy][3];
 #else
     um = up + 1;
diff --git a/src/lib/operator/tm_sub_Hopping_Matrix.c b/src/lib/operator/tm_sub_Hopping_Matrix.c
index fd2aef9db..857404088 100644
--- a/src/lib/operator/tm_sub_Hopping_Matrix.c
+++ b/src/lib/operator/tm_sub_Hopping_Matrix.c
@@ -51,12 +51,12 @@
 // where cfactor = a + i b
 //
 
-#if (defined _USE_HALFSPINOR)
+#if (defined TM_USE_HALFSPINOR)
 #include "operator/halfspinor_hopping.h"
 
 void tm_sub_Hopping_Matrix(const int ieo, spinor* const l, spinor* const p, spinor* const k,
                            complex double const cfactor) {
-#ifdef _GAUGE_COPY
+#ifdef TM_GAUGE_COPY
   if (g_update_gauge_copy) {
     update_backward_gauge(g_gauge_field);
   }
@@ -78,10 +78,10 @@ void tm_sub_Hopping_Matrix(const int ieo, spinor* const l, spinor* const p, spin
   return;
 }
 
-#elif (!defined _NO_COMM && !defined _USE_HALFSPINOR)
+#elif (!defined _NO_COMM && !defined TM_USE_HALFSPINOR)
 void tm_sub_Hopping_Matrix(const int ieo, spinor* const l, spinor* p, spinor* const k,
                            complex double const cfactor) {
-#ifdef _GAUGE_COPY
+#ifdef TM_GAUGE_COPY
   if (g_update_gauge_copy) {
     update_backward_gauge(g_gauge_field);
   }
diff --git a/src/lib/operator/tm_times_Hopping_Matrix.c b/src/lib/operator/tm_times_Hopping_Matrix.c
index 3b336d2a9..6d1abddba 100644
--- a/src/lib/operator/tm_times_Hopping_Matrix.c
+++ b/src/lib/operator/tm_times_Hopping_Matrix.c
@@ -51,12 +51,12 @@
 // where cfactor = a + i b
 //
 
-#if (defined _USE_HALFSPINOR && !defined _NO_COMM)
+#if (defined TM_USE_HALFSPINOR && !defined _NO_COMM)
 #include "operator/halfspinor_hopping.h"
 
 void tm_times_Hopping_Matrix(const int ieo, spinor* const l, spinor* const k,
                              complex double const cfactor) {
-#ifdef _GAUGE_COPY
+#ifdef TM_GAUGE_COPY
   if (g_update_gauge_copy) {
     update_backward_gauge(g_gauge_field);
   }
@@ -78,10 +78,10 @@ void tm_times_Hopping_Matrix(const int ieo, spinor* const l, spinor* const k,
   return;
 }
 
-#elif (!defined _NO_COMM && !defined _USE_HALFSPINOR)
+#elif (!defined _NO_COMM && !defined TM_USE_HALFSPINOR)
 void tm_times_Hopping_Matrix(const int ieo, spinor* const l, spinor* const k,
                              double complex const cfactor) {
-#ifdef _GAUGE_COPY
+#ifdef TM_GAUGE_COPY
   if (g_update_gauge_copy) {
     update_backward_gauge(g_gauge_field);
   }
@@ -103,4 +103,4 @@ void tm_times_Hopping_Matrix(const int ieo, spinor* const l, spinor* const k,
 #endif
   return;
 }
-#endif  //_USE_HALFSPINOR && !defined _NO_COMM
+#endif  //TM_USE_HALFSPINOR && !defined _NO_COMM
diff --git a/src/lib/overrelaxation.c b/src/lib/overrelaxation.c
index 91d95fa30..2a1329bba 100644
--- a/src/lib/overrelaxation.c
+++ b/src/lib/overrelaxation.c
@@ -153,7 +153,7 @@ void flip_subgroup(int ix, int mu, su3 vv, int i) {
   *z = w;
 }
 
-#if defined PARALLEL1
+#if defined TM_PARALLEL1
 void overrel_sweep() {
   int x0, x1, x2, x3;
   int mu, ix;
diff --git a/src/lib/parallel_io.h b/src/lib/parallel_io.h
index 50e03fd59..98df3fb8c 100644
--- a/src/lib/parallel_io.h
+++ b/src/lib/parallel_io.h
@@ -17,8 +17,8 @@
  * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
  ***********************************************************************/
 
-#ifndef _PARALLEL_IO_H
-#define _PARALLEL_IO_H
+#ifndef _TM_PARALLEL_IO_H
+#define _TM_PARALLEL_IO_H
 
 #include <lemon.h>
 #include "dml.h"
diff --git a/src/lib/read_input.l b/src/lib/read_input.l
index 6af756c7e..59f002748 100644
--- a/src/lib/read_input.l
+++ b/src/lib/read_input.l
@@ -849,8 +849,8 @@ static inline double fltlist_next_token(int * const list_end){
 }
 
 <INITMULTIGRID>AMG{SPC}* {
-#ifdef DDalphaAMG
- if(myverbose) printf("Initialising DDalphaAMG line %d\n", line_of_file); 
+#ifdef TM_USE_DDalphaAMG
+ if(myverbose) printf("Initialising DDalphaAMG line %d\n", line_of_file);
  BEGIN(MULTIGRID);
 #else
  printf("ERROR line %d: DDalphaAMG library not included\n", line_of_file);
@@ -951,7 +951,7 @@ static inline double fltlist_next_token(int * const list_end){
     mg_no_shifts=0;
     if(myverbose) printf("  MG_MMS_Mass set to %.16f line %d operator %d\n", mg_mms_mass, line_of_file, current_operator);
   }
-  EndDDalphaAMG{SPC}* {
+  EndDDalphaAMG{SPC}* {
   if(myverbose) printf("DDalphaAMG parsed in line %d\n\n", line_of_file);
   BEGIN(0);
   }
@@ -1385,7 +1385,7 @@ static inline double fltlist_next_token(int * const list_end){
     BEGIN(name_caller);
   }
   DDalphaAMG {
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
     optr->solver = MG;
     if(myverbose) printf("  Solver set to DDalphaAMG line %d operator %d\n", line_of_file, current_operator);
     BEGIN(name_caller);
@@ -1490,7 +1490,7 @@ static inline double fltlist_next_token(int * const list_end){
     BEGIN(name_caller);
   }
   DDalphaAMG {
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
     optr->solver = MG;
     if(myverbose) printf("  Solver set to DDalphaAMG line %d operator %d\n", line_of_file, current_operator);
     BEGIN(name_caller);
@@ -1543,7 +1543,7 @@ static inline double fltlist_next_token(int * const list_end){
     BEGIN(name_caller);
   }
   DDalphaAMG {
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
     optr->solver = MG;
     if(myverbose) printf("  Solver set to DDalphaAMG line %d operator %d\n", line_of_file, current_operator);
     BEGIN(name_caller);
@@ -2834,7 +2834,7 @@ static inline double fltlist_next_token(int * const list_end){
     BEGIN(name_caller);
   }
   DDalphaAMG {
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
     if(myverbose) printf("  Solver set to DDalphaAMG line %d operator %d\n", line_of_file, current_operator);
     mnl->solver = MG;
     BEGIN(solver_caller);
@@ -2877,7 +2877,7 @@ static inline double fltlist_next_token(int * const list_end){
     BEGIN(solver_caller);
   }
   DDalphaAMG {
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
     if(myverbose) printf("  HB Solver set to DDalphaAMG line %d operator %d\n", line_of_file, current_operator);
     mnl->HB_solver = MG;
     BEGIN(solver_caller);
@@ -2902,7 +2902,7 @@ static inline double fltlist_next_token(int * const list_end){
     BEGIN(solver_caller);
   }
   DDalphaAMG {
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
     if(myverbose) printf("  Solver set to DDalphaAMG line %d operator %d\n", line_of_file, current_operator);
     mnl->solver = MG;
     BEGIN(solver_caller);
@@ -2935,7 +2935,7 @@ static inline double fltlist_next_token(int * const list_end){
     BEGIN(solver_caller);
   }
   DDalphaAMG {
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
     if(myverbose) printf("  Solver set to DDalphaAMG line %d operator %d\n", line_of_file, current_operator);
     mnl->solver = MG;
     BEGIN(solver_caller);
@@ -3229,49 +3229,49 @@ static inline double fltlist_next_token(int * const list_end){
 }
 
 <TT>{DIGIT}+                  {
-#ifndef FIXEDVOLUME
+#ifndef TM_FIXEDVOLUME
   T_global = atoi(yytext);
   if(myverbose!=0) printf("T =%s\n", yytext);
 #endif
 }
 <LL>{DIGIT}+                  {
-#ifndef FIXEDVOLUME
+#ifndef TM_FIXEDVOLUME
   L = atoi(yytext);
   if(myverbose!=0) printf("L =%s\n", yytext);
 #endif
 }
 <LLX>{DIGIT}+                  {
-#ifndef FIXEDVOLUME
+#ifndef TM_FIXEDVOLUME
   LX = atoi(yytext);
   if(myverbose!=0) printf("LX =%s\n", yytext);
 #endif
 }
 <LLY>{DIGIT}+                  {
-#ifndef FIXEDVOLUME
+#ifndef TM_FIXEDVOLUME
   LY = atoi(yytext);
   if(myverbose!=0) printf("LY =%s\n", yytext);
 #endif
 }
 <LLZ>{DIGIT}+                  {
-#ifndef FIXEDVOLUME
+#ifndef TM_FIXEDVOLUME
   LZ = atoi(yytext);
   if(myverbose!=0) printf("LZ =%s\n", yytext);
 #endif
 }
 <NPROCX>{DIGIT}+              {
-#ifndef FIXEDVOLUME
+#ifndef TM_FIXEDVOLUME
   N_PROC_X = atoi(yytext);
   if(myverbose!=0) printf("Nr of processors in x direction = %s\n", yytext);
 #endif
 }
 <NPROCY>{DIGIT}+              {
-#ifndef FIXEDVOLUME
+#ifndef TM_FIXEDVOLUME
   N_PROC_Y = atoi(yytext);
   if(myverbose!=0) printf("Nr of processors in y direction = %s\n", yytext);
 #endif
 }
 <NPROCZ>{DIGIT}+              {
-#ifndef FIXEDVOLUME
+#ifndef TM_FIXEDVOLUME
   N_PROC_Z = atoi(yytext);
   if(myverbose!=0) printf("Nr of processors in z direction = %s\n", yytext);
 #endif
@@ -3776,7 +3776,7 @@ int read_input(const char * conf_file){
    * Setting default values!
    ********************************************/
   reread = 0;
-#ifndef FIXEDVOLUME
+#ifndef TM_FIXEDVOLUME
   T_global = _default_T_global;
   L = _default_L;
   LX = _default_LX;
@@ -3994,7 +3994,7 @@ int read_input(const char * conf_file){
   yyout = fopen("/dev/null", "w");
 
   parse_config();  
-#ifndef FIXEDVOLUME
+#ifndef TM_FIXEDVOLUME
   if(LX == 0) {
     LX = L;
   }
@@ -4029,7 +4029,7 @@ int read_input(const char * conf_file){
  */
 
 int reread_input(const char * conf_file){
-#ifndef FIXEDVOLUME
+#ifndef TM_FIXEDVOLUME
   int tt=T, ll=L, lx = LX, ly = LY, lz = LZ, 
       np=N_PROC_X, npy = N_PROC_Y;
 #endif
@@ -4054,7 +4054,7 @@ int reread_input(const char * conf_file){
 
   parse_config();  
 
-#ifndef FIXEDVOLUME
+#ifndef TM_FIXEDVOLUME
   T = tt;
   L = ll;
   LX = lx;
diff --git a/src/lib/solver/cg_her.c b/src/lib/solver/cg_her.c
index bf6981c4b..b556acb25 100644
--- a/src/lib/solver/cg_her.c
+++ b/src/lib/solver/cg_her.c
@@ -102,7 +102,7 @@ int cg_her(spinor* const P, spinor* const Q, const int max_iter, double eps_sq,
     if (((err <= eps_sq) && (rel_prec == 0)) || ((err <= eps_sq * squarenorm) && (rel_prec == 1))) {
       break;
     }
-#ifdef _USE_HALFSPINOR
+#ifdef TM_USE_HALFSPINOR
     if (((err * err <= eps_sq) && (rel_prec == 0)) ||
         ((err * err <= eps_sq * squarenorm) && (rel_prec == 1))) {
       g_sloppy_precision = 1;
diff --git a/src/lib/solver/cg_her_nd.c b/src/lib/solver/cg_her_nd.c
index 03a85a713..746c21718 100644
--- a/src/lib/solver/cg_her_nd.c
+++ b/src/lib/solver/cg_her_nd.c
@@ -133,7 +133,7 @@ int cg_her_nd(spinor* const P_up, spinor* P_dn, spinor* const Q_up, spinor* cons
     if (((err <= eps_sq) && (rel_prec == 0)) || ((err <= eps_sq * squarenorm) && (rel_prec == 1))) {
       break;
     }
-#ifdef _USE_HALFSPINOR
+#ifdef TM_USE_HALFSPINOR
     if (((err * err <= eps_sq) && (rel_prec == 0)) ||
         ((err * err <= eps_sq * squarenorm) && (rel_prec == 1))) {
       g_sloppy_precision = 1;
diff --git a/src/lib/solver/cr.c b/src/lib/solver/cr.c
index 58022ac28..f6a1bd348 100644
--- a/src/lib/solver/cr.c
+++ b/src/lib/solver/cr.c
@@ -106,7 +106,7 @@ int cr(spinor* const P, spinor* const Q, const int m, const int max_restarts, co
       break;
     }
 
-#ifdef _USE_HALFSPINOR
+#ifdef TM_USE_HALFSPINOR
     if (((err * err <= eps_sq) && (rel_prec == 0)) ||
         ((err * err <= eps_sq * norm_sq) && (rel_prec == 1))) {
       if (g_sloppy_precision_flag == 1) {
diff --git a/src/lib/solver/diagonalise_general_matrix.c b/src/lib/solver/diagonalise_general_matrix.c
index 0667da9aa..9fb989da6 100644
--- a/src/lib/solver/diagonalise_general_matrix.c
+++ b/src/lib/solver/diagonalise_general_matrix.c
@@ -70,7 +70,7 @@ void diagonalise_general_matrix(int n, _Complex double *A, int lda, _Complex dou
 
   /* Query call to get the optimal lwork */
   lwork = -1;
-#ifdef HAVE_LAPACK
+#ifdef TM_LAPACK
   _FT(zgeevx)("N", "N", "V", "N", &n, A, &lda, evalues, vl, &n, vr, &n, &ilo, &ihi, scale, &abnrm,
               rcone, rconv, &dummy, &lwork, rwork, &info, 1, 1, 1, 1);
   lwork = (int)(creal(dummy));
diff --git a/src/lib/solver/dirac_operator_eigenvectors.c b/src/lib/solver/dirac_operator_eigenvectors.c
index 42e85d198..845d5aedc 100644
--- a/src/lib/solver/dirac_operator_eigenvectors.c
+++ b/src/lib/solver/dirac_operator_eigenvectors.c
@@ -28,7 +28,7 @@
 #ifdef FFTW
 #include <fftw3.h>
 #endif
-#ifdef _USE_SHMEM
+#ifdef TM_USE_SHMEM
 #include <mpp/shmem.h>
 #endif
 #include <stdlib.h>
@@ -330,7 +330,7 @@ _Complex double calcDDaggerDovEvalue(const int *praw, double kappa, double rho,
 }
 
 void spinor_fft(spinor *spinor_in, spinor *spinor_out, int tt, int ll, unsigned int forward) {
-#ifdef HAVE_FFTW
+#ifdef TM_USE_FFTW
   fftw_plan plan = spinor_fftw_plan(spinor_in, spinor_out, tt, ll, forward, FFTW_WISDOM_ONLY);
   fftw_execute(plan);
 #else
@@ -555,7 +555,7 @@ void spinorPrecWS_Free(spinorPrecWS *ws) {
  */
 
 void eigenvector_Dtm(spinor *spin, double mu, int epsilon, int k, int color, int rawp[4]) {
-#ifdef HAVE_FFTW
+#ifdef TM_USE_FFTW
   fftw_plan p1bw;
 #endif
   int i = 0;
@@ -630,7 +630,7 @@ void eigenvector_Dtm(spinor *spin, double mu, int epsilon, int k, int color, int
 
   _spinor_muleq_real(*phi, 1.0 / sqrt((double)(VOLUME)));
 
-#ifdef HAVE_FFTW
+#ifdef TM_USE_FFTW
   p1bw = spinor_fftw_plan(spin, spin, T, L, 0, FFTW_WISDOM_ONLY);
   fftw_execute(p1bw);
 #endif
@@ -638,7 +638,7 @@ void eigenvector_Dtm(spinor *spin, double mu, int epsilon, int k, int color, int
   /* spinor mulp half phase */
 }
 
-#ifdef HAVE_FFTW
+#ifdef TM_USE_FFTW
 fftw_plan spinor_fftw_plan(const spinor *spinor_in, spinor *spinor_out, int T, int ll,
                            unsigned int forward, int fftw_flags) {
   /*    int index_s = gsi(get_index(it, ix, iy, iz, tt, ll)); */
@@ -760,13 +760,13 @@ void spinorPrecondition(spinor *spinor_out, const spinor *spinor_in, spinorPrecW
   spinor phi_plus;
   double OOVOL = 1. / (double)(VOLUME);
 
-#ifdef HAVE_FFTW
+#ifdef TM_USE_FFTW
   fftw_plan plan_fw;
   fftw_plan plan_bw;
 #endif
 
   if (autofft == 1) {
-#ifdef HAVE_FFTW
+#ifdef TM_USE_FFTW
     /*     spinor_mulp_half_phase(spinor_out,spinor_in,ws->c_table, ws->s_table,1,1.); */
     plan_fw = spinor_fftw_plan(spinor_in, spinor_out, tt, ll, 1 /* = true */, FFTW_WISDOM_ONLY);
     fftw_execute(plan_fw);
@@ -889,7 +889,7 @@ void spinorPrecondition(spinor *spinor_out, const spinor *spinor_in, spinorPrecW
   }
 
   if (autofft == 1) {
-#ifdef HAVE_FFTW
+#ifdef TM_USE_FFTW
     plan_bw = spinor_fftw_plan(spinor_out, spinor_out, tt, LX, 0, FFTW_WISDOM_ONLY);
     fftw_execute(plan_bw);
 #endif
@@ -1292,7 +1292,7 @@ void spinor_mulp_half_phase(spinor *spinor_out, const spinor *spinor_in, double
  * loading and storing of fftw wisdoms
  */
 
-#ifdef HAVE_FFTW
+#ifdef TM_USE_FFTW
 void loadFFTWWisdom(spinor *spinor_in, spinor *spinor_out, int tt, int ll) {
   /*   ostringstream filename_fftw_wisdom; */
   /*   filename_fftw_wisdom << "fftw_wisdom_" << setw(2) << setfill('0') << T << "x"<< setw(2) <<
@@ -2050,7 +2050,7 @@ void calculateDiagFalloffElements(const int op_id) {
   if (g_precWS == NULL) {
     /* we are going to need fft*/
 
-#ifdef HAVE_FFTW
+#ifdef TM_USE_FFTW
     loadFFTWWisdom(g_spinor_field[0], g_spinor_field[1], T, LX);
 #endif
   }
diff --git a/src/lib/solver/dirac_operator_eigenvectors.h b/src/lib/solver/dirac_operator_eigenvectors.h
index 1ebe2ce71..b10a86312 100644
--- a/src/lib/solver/dirac_operator_eigenvectors.h
+++ b/src/lib/solver/dirac_operator_eigenvectors.h
@@ -24,7 +24,7 @@
 #ifdef HAVE_CONFIG_H
 #include "tmlqcd_config.h"
 #endif
-#ifdef HAVE_FFTW
+#ifdef TM_USE_FFTW
 #include <fftw3.h>
 #endif
 
@@ -68,7 +68,7 @@ extern tm_operator PRECWSOPERATORSELECT[14];
 /* */
 extern double g_prec_sequence_d_dagger_d[3];
 
-#ifdef HAVE_FFTW
+#ifdef TM_USE_FFTW
 fftw_plan spinor_fftw_plan(const spinor *spinor_in, spinor *spinor_out, int tt, int ll,
                            unsigned int forward, int fftw_flags);
 #endif
@@ -170,7 +170,7 @@ void spinor_mulp_half_phase(spinor *spinor_out, const spinor *spinor_in, double
  * read and write fftw wisdoms
  * this is supposed to speed up things
  */
-#ifdef HAVE_FFTW
+#ifdef TM_USE_FFTW
 void writeFFTWWisdom(int tt, int ll);
 void loadFFTWWisdom(spinor *spinor_in, spinor *spinor_out, int tt, int ll);
 #endif
diff --git a/src/lib/solver/eigenvalues.c b/src/lib/solver/eigenvalues.c
index 1725387d0..4d8d08887 100644
--- a/src/lib/solver/eigenvalues.c
+++ b/src/lib/solver/eigenvalues.c
@@ -68,7 +68,7 @@ double eigenvalues(int *nr_of_eigenvalues, const int max_iterations, const doubl
                    const int even_odd_flag) {
   double returnvalue;
   _Complex double norm2;
-#ifdef HAVE_LAPACK
+#ifdef TM_LAPACK
   static int allocated = 0;
   char filename[200];
   FILE *ofs;
diff --git a/src/lib/solver/fgmres.c b/src/lib/solver/fgmres.c
index 60d10fa72..154428124 100644
--- a/src/lib/solver/fgmres.c
+++ b/src/lib/solver/fgmres.c
@@ -85,7 +85,7 @@ int fgmres(spinor *const P, spinor *const Q, const int m, const int max_restarts
   atime = gettime();
   cumiter_lgcr = 0;
   if (N == VOLUME) {
-    init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf); /* #ifdef HAVE_LAPACK */
+    init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf); /* #ifdef TM_LAPACK */
   } else {
     init_solver_field(&solver_field, VOLUMEPLUSRAND / 2, nr_sf);
   }
diff --git a/src/lib/solver/fgmres4complex_body.c b/src/lib/solver/fgmres4complex_body.c
index b11528c58..1f6fa9c89 100644
--- a/src/lib/solver/fgmres4complex_body.c
+++ b/src/lib/solver/fgmres4complex_body.c
@@ -57,7 +57,7 @@ int _PSWITCH(fgmres4complex)(_Complex _F_TYPE *const P, _Complex _F_TYPE *const
   int fltcntr = 0;
   double alphasave = 0;
 
-  _PSWITCH(init_lsolver_field)(&solver_field, /*why not N?*/ lda, nr_sf); /* #ifdef HAVE_LAPACK */
+  _PSWITCH(init_lsolver_field)(&solver_field, /*why not N?*/ lda, nr_sf); /* #ifdef TM_LAPACK */
 
   eps = sqrt(eps_sq);
   _PSWITCH(init_lgmres)(m, lda);
diff --git a/src/lib/solver/gmres_dr.c b/src/lib/solver/gmres_dr.c
index 4b9f429e0..781b32d86 100644
--- a/src/lib/solver/gmres_dr.c
+++ b/src/lib/solver/gmres_dr.c
@@ -54,7 +54,7 @@
 #include "solver/solver_field.h"
 #include "su3.h"
 
-#ifndef HAVE_LAPACK
+#ifndef TM_LAPACK
 /* In case there is no lapack use normal gmres */
 int gmres_dr(spinor* const P, spinor* const Q, const int m, const int nr_ev, const int max_restarts,
              const double eps_sq, const int rel_prec, const int N, matrix_mult f) {
diff --git a/src/lib/solver/gram-schmidt.c b/src/lib/solver/gram-schmidt.c
index 1e8da1d24..ffd5d6b29 100644
--- a/src/lib/solver/gram-schmidt.c
+++ b/src/lib/solver/gram-schmidt.c
@@ -26,7 +26,7 @@
 #include "linalg/blas.h"
 #include "linalg_eo.h"
 #include "su3spinor.h"
-#ifdef CRAY
+#ifdef TM_CRAY
 #include <fortran.h>
 #endif
 #include "gram-schmidt.h"
@@ -62,7 +62,7 @@ void IteratedClassicalGS(_Complex double v[], double *vnrm, int n, int m, _Compl
       work1[j] = scalar_prod((spinor *)(A + j * lda), (spinor *)v,
                              n * sizeof(_Complex double) / sizeof(spinor), 1);
     }
-#ifdef HAVE_LAPACK
+#ifdef TM_LAPACK
     _FT(zgemv)(fupl_n, &n, &m, &CMONE, A, &lda, work1, &ONE, &CONE, v, &ONE, 1);
 #endif
     (*vnrm) = sqrt(square_norm((spinor *)v, n * sizeof(_Complex double) / sizeof(spinor), 1));
@@ -90,7 +90,7 @@ void ModifiedGS(_Complex double v[], int n, int m, _Complex double A[], int lda)
     s = scalar_prod((spinor *)(A + i * lda), (spinor *)v,
                     n * sizeof(_Complex double) / sizeof(spinor), 1);
     s = -s;
-#ifdef HAVE_LAPACK
+#ifdef TM_LAPACK
     _FT(zaxpy)(&n, &s, A + i * lda, &ONE, v, &ONE);
 #endif
   }
diff --git a/src/lib/solver/mcr.c b/src/lib/solver/mcr.c
index 707181cc2..184fa567f 100644
--- a/src/lib/solver/mcr.c
+++ b/src/lib/solver/mcr.c
@@ -127,7 +127,7 @@ int mcr(spinor* const P, spinor* const Q, const int m, const int max_restarts, c
         break;
       }
 
-#ifdef _USE_HALFSPINOR
+#ifdef TM_USE_HALFSPINOR
       if (((err * err <= eps_sq) && (rel_prec == 0)) ||
           ((err * err <= eps_sq * norm_sq) && (rel_prec == 1))) {
         if (g_sloppy_precision_flag == 1) {
diff --git a/src/lib/solver/monomial_solve.c b/src/lib/solver/monomial_solve.c
index 94873079f..0e73e9b0d 100644
--- a/src/lib/solver/monomial_solve.c
+++ b/src/lib/solver/monomial_solve.c
@@ -77,7 +77,7 @@
 #include "solver/solver_params.h"
 #include "solver/solver_types.h"
 
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
 #include "DDalphaAMG_interface.h"
 #endif
 #ifdef TM_USE_QPHIX
@@ -184,7 +184,7 @@ int solve_degenerate(spinor* const P, spinor* const Q, solver_params_t solver_pa
   } else if (solver_type == BICGSTAB) {
     iteration_count = bicgstab_complex(P, Q, max_iter, eps_sq, rel_prec, N, f);
   }
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
   else if (solver_type == MG)
     iteration_count = MG_solver(P, Q, eps_sq, max_iter, rel_prec, N, g_gauge_field, f);
 #endif
@@ -283,7 +283,7 @@ int solve_mms_tm(spinor** const P, spinor* const Q, solver_params_t* solver_para
       if (solver_params->type == CGMMS) {
     iteration_count = cg_mms_tm(P, Q, solver_params);
   }
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
   else if (solver_params->type == MG) {
     // if the mg_mms_mass is larger than the smallest shift we use MG
     if (mg_no_shifts > 0 || mg_mms_mass >= solver_params->shifts[0]) {
@@ -507,7 +507,7 @@ int solve_mms_nd(spinor** const Pup, spinor** const Pdn, spinor* const Qup, spin
     } else if (solver_params->type == CGMMSND) {
       iteration_count = cg_mms_tm_nd(Pup, Pdn, Qup, Qdn, solver_params);
     }
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
     else if (solver_params->type == MG) {
       // if the mg_mms_mass is larger than the smallest shift we use MG
       if (mg_no_shifts > 0 || mg_mms_mass >= solver_params->shifts[0]) {
@@ -691,7 +691,7 @@ int solve_mms_nd_plus(spinor** const Pup, spinor** const Pdn, spinor* const Qup,
 
   int iteration_count = 0;
 
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
   // With MG we can solve directly the unsquared operator
   if (solver_params->type == MG) {
     matrix_mult_nd f = Qtm_tau1_ndpsi_add_Ishift;
diff --git a/src/lib/solver/solver_field.c b/src/lib/solver/solver_field.c
index 1cfd06515..5644a4cae 100644
--- a/src/lib/solver/solver_field.c
+++ b/src/lib/solver/solver_field.c
@@ -37,7 +37,7 @@ int init_solver_field(spinor*** const solver_field, const int V, const int nr) {
   }
 
   /* allocate the full chunk of memory to solver_field[nr] */
-#if (defined _USE_SHMEM && !(defined _USE_HALFSPINOR))
+#if (defined TM_USE_SHMEM && !(defined TM_USE_HALFSPINOR))
   if ((void*)((*solver_field)[nr] = (spinor*)shmalloc((nr * V + 1) * sizeof(spinor))) == NULL) {
     fprintf(stderr, "malloc errno in init_solver_field: %d\n", errno);
     errno = 0;
@@ -74,7 +74,7 @@ int init_solver_field_32(spinor32*** const solver_field, const int V, const int
   }
 
   /* allocate the full chunk of memory to solver_field[nr] */
-#if (defined _USE_SHMEM && !(defined _USE_HALFSPINOR))
+#if (defined TM_USE_SHMEM && !(defined TM_USE_HALFSPINOR))
   if ((void*)((*solver_field)[nr] = (spinor32*)shmalloc((nr * V + 1) * sizeof(spinor32))) == NULL) {
     fprintf(stderr, "malloc errno in init_solver_field: %d\n", errno);
     errno = 0;
@@ -143,7 +143,7 @@ int init_lsolver_field(_Complex double*** const solver_field, const int V, const
   }
 
   /* allocate the full chunk of memory to solver_field[nr] */
-#if (defined _USE_SHMEM && !(defined _USE_HALFSPINOR))
+#if (defined TM_USE_SHMEM && !(defined TM_USE_HALFSPINOR))
   if ((void*)((*solver_field)[nr] =
                   (_Complex double*)shmalloc((nr * V + 1) * sizeof(_Complex double))) == NULL) {
     fprintf(stderr, "malloc errno in init_solver_field: %d\n", errno);
@@ -184,7 +184,7 @@ int init_lsolver_field_32(_Complex float*** const solver_field, const int V, con
   }
 
   /* allocate the full chunk of memory to solver_field[nr] */
-#if (defined _USE_SHMEM && !(defined _USE_HALFSPINOR))
+#if (defined TM_USE_SHMEM && !(defined TM_USE_HALFSPINOR))
   if ((void*)((*solver_field)[nr] =
                   (_Complex float*)shmalloc((nr * V + 1) * sizeof(_Complex float))) == NULL) {
     fprintf(stderr, "malloc errno in init_solver_field: %d\n", errno);
diff --git a/src/lib/spinor_fft.c b/src/lib/spinor_fft.c
index fb101d269..54ece4bda 100644
--- a/src/lib/spinor_fft.c
+++ b/src/lib/spinor_fft.c
@@ -22,7 +22,7 @@
 #include "mpi_init.h"
 #include "spinor_fft.h"
 
-#ifdef HAVE_FFTW
+#ifdef TM_USE_FFTW
 #include <fftw3.h>
 #endif
 
@@ -35,7 +35,7 @@ void check_mpi_comm_membership(MPI_Comm commself, MPI_Comm commcheck, const char
                                const char *name_b, FILE *logFile);
 #endif
 
-#ifdef HAVE_FFTW
+#ifdef TM_USE_FFTW
 fftw_plan spinor_fftw_plan2d(spinor *spinor_in, spinor *spinor_out, int dim0, int dim1, int howmany,
                              unsigned int forward, int fftw_flags);
 #endif
@@ -50,7 +50,7 @@ void spinor_fft_transpose_xp_t(spinor *fieldout, spinor *fieldin, int dim0, int
 void spinor_fft_reduce_2d(spinor *localSpinorField, int *collectionRank, spinor ***field_collection,
                           spinor **membuff) {
   /* this implementation is intended for four dimensional parallelisation */
-#if (defined PARALLELXYZT && defined TM_USE_MPI && defined HAVE_FFTW)
+#if (defined TM_PARALLELXYZT && defined TM_USE_MPI && defined TM_USE_FFTW)
 
   int sendRecvCoord[4];
   int i;
@@ -195,7 +195,7 @@ void spinor_fft_reduce_2d(spinor *localSpinorField, int *collectionRank, spinor
 void spinor_fft_redist_2d(spinor *localSpinorField, int collectionRank, spinor **field_collection,
                           spinor *membuff) {
   /* this implementation is intended for four dimensional parallelisation */
-#if (defined PARALLELXYZT && defined TM_USE_MPI && defined HAVE_FFTW)
+#if (defined TM_PARALLELXYZT && defined TM_USE_MPI && defined TM_USE_FFTW)
 
   int sendRecvCoord[4];
   int dims[] = {g_nproc_t, g_nproc_x, g_nproc_y, g_nproc_z};
@@ -326,7 +326,7 @@ void spinor_fft_redist_2d(spinor *localSpinorField, int collectionRank, spinor *
 #endif
 }
 
-#ifdef HAVE_FFTW
+#ifdef TM_USE_FFTW
 fftw_plan spinor_fftw_plan2d(spinor *spinor_in, spinor *spinor_out, int dim0, int dim1,
                              int howmany_wospin, unsigned int forward, int fftw_flags) {
   /*    int index_s = gsi(get_index(it, ix, iy, iz, T, L)); */
diff --git a/src/lib/test/Makefile b/src/lib/test/Makefile
deleted file mode 100644
index 8efc8b569..000000000
--- a/src/lib/test/Makefile
+++ /dev/null
@@ -1,88 +0,0 @@
-TARGETS = scalar_prod_r_test
-
-USESF = yes
-
-OS = -os3
-
-# gcc shouldn't see this options, that's why we don't use CGLAGS here
-NLCCFLAGS = -D_STD_C99_COMPLEX_CHECKED -D_STD_C99_COMPLEX -Dapenext
-INCLUDES = -I../
-# workaround to let nlcc not see the non-standard complex.h
-NLCCINCLUDES = -I${NROOT}/include/nlibc/ ${INCLUDES}
-
-NLCCOPTS = -gp ${NLCCFLAGS} ${NLCCINCLUDES}
-ifdef USESF
-  MPPOPTS = -sf -v
-  SHAKEROPTS = -n -z 
-else 
-  MPPOPTS = -v
-  SHAKEROPTS = +a -z 
-endif
-SOFANOPTS = --rr
-
-# needed due to a bug in nlcc
-NLCCOS = -OS3
-
-NLCC = nlcc-0.5.2
-MPP = mpp
-SOFAN = sofan
-SHAKER = shaker
-M4 = m4
-CCDEP = gcc
-DEPFLAGS = -MM -MQ $*.sasm ${CFLAGS} ${INCLUDES}
-
-DEPFILES = $(addsuffix .d, ${TARGETS})
-MEMFILES = $(addsuffix .mem, ${TARGETS}) $(addsuffix -sofan.mem, ${TARGETS}) \
-	   $(addsuffix .no, ${TARGETS}) $(addsuffix -sofan.no, ${TARGETS})
-ASMFILES = $(addsuffix .sasm, ${TARGETS}) $(addsuffix .masm, ${TARGETS}) $(addsuffix -sofan.masm, ${TARGETS})
-NCDFILES = $(addsuffix .ncd, ${TARGETS}) $(addsuffix -sofan.ncd, ${TARGETS})
-SFOUTFILES = $(addsuffix .svn-out, ${TARGETS}) $(addsuffix .svn-out%, ${TARGETS}) \
-             $(addsuffix .sf_log, ${TARGETS}) $(addsuffix .sf_log%, ${TARGETS}) \
-             $(addsuffix .sf_log0, ${TARGETS}) $(addsuffix .sf_log0%, ${TARGETS}) \
-             $(addsuffix .err-sf, ${TARGETS}) $(addsuffix .svn-out, ${TARGETS}) \
-             $(addsuffix .dmo, ${TARGETS}) \
-	     $(addsuffix -sofan.svn-out, ${TARGETS}) $(addsuffix -sofan.svn-out%, ${TARGETS}) \
-             $(addsuffix -sofan.sf_log, ${TARGETS}) $(addsuffix -sofan.sf_log%, ${TARGETS}) \
-             $(addsuffix -sofan.sf_log0, ${TARGETS}) $(addsuffix -sofan.sf_log0%, ${TARGETS}) \
-             $(addsuffix -sofan.err-sf, ${TARGETS}) $(addsuffix -sofan.svn-out, ${TARGETS}) \
-             $(addsuffix -sofan.dmo, ${TARGETS})
-GCCBINARIES = $(addsuffix .gccbin, ${TARGETS})
-
-all: $(addsuffix -sofan.mem, ${TARGETS})
-allgcc:  $(addsuffix .gccbin, ${TARGETS})
-
--include $(DEPFILES)
-
-%.mem: %.masm
-	${SHAKER} ${SHAKEROPTS} $<
-
-%.masm: %.sasm
-	${MPP} ${OS} ${MPPOPTS} $<
-
-%-sofan.masm: %.masm
-	${SOFAN} ${SOFANOPTS} $< $@
-
-%.sasm: %.c Makefile
-	${NLCC} ${NLCCOPTS} ${NLCCOS} -S $<
-
-%.ncd: %.mem
-	dispminit $< > $@
-
-%-sofan.perf: %-sofan.ncd
-	nperf -asm=$*.sasm -c -l -a $< > $@ || (rm -f $@; exit 1)
-
-# beware, this is not very general
-%.gccbin: %.c
-	gcc -I../ $< -o $@
-
-$(DEPFILES): %.d: %.c Makefile
-	$(CCDEP) ${DEPFLAGS} ${INCLUDES} $< > $@
-
-clean:
-	rm -f ${ASMFILES} ${MEMFILES} ${NCDFILES} ${GCCBINARIES}
-
-distclean: clean
-	rm -f ${DEPFILES} ${SFOUTFILES}
-
-.SECONDARY:
-.DELETE_ON_ERROR:
diff --git a/src/lib/test/check_geometry.c b/src/lib/test/check_geometry.c
index 74589a739..b9f14eb4d 100644
--- a/src/lib/test/check_geometry.c
+++ b/src/lib/test/check_geometry.c
@@ -90,7 +90,7 @@ int check_geometry() {
           ix = g_ipt[x0][x1][x2][x3];
 
           iy0 = g_iup[ix][0];
-#if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELT || defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
           if (x0 != T - 1) {
             iz0 = g_ipt[(x0 + 1) % T][x1][x2][x3];
           } else {
@@ -107,7 +107,7 @@ int check_geometry() {
 #endif
 
           iy1 = g_iup[ix][1];
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
           if (x1 != LX - 1) {
             iz1 = g_ipt[x0][(x1 + 1) % LX][x2][x3];
           } else {
@@ -125,7 +125,7 @@ int check_geometry() {
 #endif
 
           iy2 = g_iup[ix][2];
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
           if (x2 != LY - 1) {
             iz2 = g_ipt[x0][x1][(x2 + 1) % LY][x3];
           } else {
@@ -145,7 +145,7 @@ int check_geometry() {
 #endif
 
           iy3 = g_iup[ix][3];
-#if defined PARALLELXYZT
+#if defined TM_PARALLELXYZT
           if (x3 != LZ - 1) {
             iz3 = g_ipt[x0][x1][x2][(x3 + 1) % LZ];
           } else {
@@ -176,7 +176,7 @@ int check_geometry() {
           }
 
           iy0 = g_idn[ix][0];
-#if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELT || defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
           if (x0 != 0) {
             iz0 = g_ipt[(x0 + T - 1) % T][x1][x2][x3];
           } else {
@@ -194,7 +194,7 @@ int check_geometry() {
 #endif
 
           iy1 = g_idn[ix][1];
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
           if (x1 != 0) {
             iz1 = g_ipt[x0][(x1 + LX - 1) % LX][x2][x3];
           } else {
@@ -212,7 +212,7 @@ int check_geometry() {
           iz1 = g_ipt[x0][(x1 + LX - 1) % LX][x2][x3];
 #endif
           iy2 = g_idn[ix][2];
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
           if (x2 != 0) {
             iz2 = g_ipt[x0][x1][(x2 + LY - 1) % LY][x3];
           } else {
@@ -231,7 +231,7 @@ int check_geometry() {
 #endif
 
           iy3 = g_idn[ix][3];
-#if defined PARALLELXYZT
+#if defined TM_PARALLELXYZT
           if (x3 != 0) {
             iz3 = g_ipt[x0][x1][x2][(x3 + LZ - 1) % LZ];
           } else {
@@ -262,8 +262,8 @@ int check_geometry() {
           }
 
           /* The edges */
-          /* In case of PARALLELT there is actually no edge to take care of */
-#if ((defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT))
+          /* In case of TM_PARALLELT there is actually no edge to take care of */
+#if ((defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || (defined TM_PARALLELXYZT))
           if (x0 == 0) {
             iy0 = g_idn[g_idn[ix][1]][0];
             if (x1 != 0) {
@@ -318,7 +318,7 @@ int check_geometry() {
 
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
           if (x0 == 0) {
             iy0 = g_idn[g_idn[ix][2]][0];
             if (x2 != 0) {
@@ -421,7 +421,7 @@ int check_geometry() {
             }
           }
 #endif
-#if defined PARALLELXYZT
+#if defined TM_PARALLELXYZT
           if (x0 == 0) {
             iy0 = g_idn[g_idn[ix][3]][0];
             if (x3 != 0) {
@@ -700,7 +700,7 @@ int check_geometry() {
       }
     }
 
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
     for (x0 = 0; x0 < T + 2; x0++) {
       for (x2 = 0; x2 < LY; x2++) {
         for (x3 = 0; x3 < LZ; x3++) {
@@ -827,7 +827,7 @@ int check_geometry() {
     }
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
 
     for (x0 = 0; x0 < T + 2; x0++) {
       for (x1 = 0; x1 < LX + 2; x1++) {
@@ -1027,7 +1027,7 @@ int check_geometry() {
       }
     }
 #endif
-#ifdef PARALLELXYZT
+#ifdef TM_PARALLELXYZT
     for (x0 = 0; x0 < T + 2; x0++) {
       for (x1 = 0; x1 < LX + 2; x1++) {
         for (x2 = 0; x2 < LY + 2; x2++) {
diff --git a/src/lib/test/check_overlap.c b/src/lib/test/check_overlap.c
index 43742a21b..56763cff4 100644
--- a/src/lib/test/check_overlap.c
+++ b/src/lib/test/check_overlap.c
@@ -105,12 +105,12 @@ int main(int argc, char *argv[]) {
   char *gaugecksum = NULL;
   double plaquette_energy;
 
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst init
 #pragma pomp inst begin(main)
 #endif
 
-#ifdef HAVE_LIBLEMON
+#ifdef TM_USE_LEMON
   MPI_File fh;
   LemonWriter *lemonWriter;
   paramsXlfInfo *xlfInfo;
@@ -188,7 +188,7 @@ int main(int argc, char *argv[]) {
   g_dbw2rand = 0;
 #endif
 
-#ifdef _GAUGE_COPY
+#ifdef TM_GAUGE_COPY
   j = init_gauge_field(VOLUMEPLUSRAND, 1);
 #else
   j = init_gauge_field(VOLUMEPLUSRAND, 0);
@@ -273,7 +273,7 @@ int main(int argc, char *argv[]) {
 
   phmc_invmaxev = 1.;
 
-#ifdef _USE_HALFSPINOR
+#ifdef TM_USE_HALFSPINOR
   j = init_dirac_halfspinor();
   if (j != 0) {
     fprintf(stderr, "Not enough memory for halffield! Aborting...\n");
@@ -286,7 +286,7 @@ int main(int argc, char *argv[]) {
       exit(-1);
     }
   }
-#if (defined _PERSISTENT)
+#if (defined TM_PERSISTENT)
   if (even_odd_flag) {
     init_xchange_halffield();
   }
@@ -299,9 +299,9 @@ int main(int argc, char *argv[]) {
       printf("Reading Gauge field from file %s\n", conf_filename);
       fflush(stdout);
     }
-#ifdef HAVE_LIBLEMON
+#ifdef TM_USE_LEMON
     read_lemon_gauge_field_parallel(conf_filename, &gaugecksum, &xlfmessage, &gaugelfn);
-#else  /* HAVE_LIBLEMON */
+#else  /* TM_USE_LEMON */
     if (xlfmessage != (char *)NULL) free(xlfmessage);
     if (gaugelfn != (char *)NULL) free(gaugelfn);
     if (gaugecksum != (char *)NULL) free(gaugecksum);
@@ -310,7 +310,7 @@ int main(int argc, char *argv[]) {
     gaugelfn = read_message(conf_filename, "ildg-data-lfn");
     gaugecksum = read_message(conf_filename, "scidac-checksum");
     printf("%s \n", gaugecksum);
-#endif /* HAVE_LIBLEMON */
+#endif /* TM_USE_LEMON */
     if (g_proc_id == 0) {
       printf("done!\n");
       fflush(stdout);
@@ -389,7 +389,7 @@ int main(int argc, char *argv[]) {
     free_chi_dn_spinor_field();
   }
   return (0);
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst end(main)
 #endif
 }
diff --git a/src/lib/test/check_xchange.c b/src/lib/test/check_xchange.c
index db5d97cb3..a20f86df4 100644
--- a/src/lib/test/check_xchange.c
+++ b/src/lib/test/check_xchange.c
@@ -63,7 +63,7 @@ int check_xchange() {
       }
     }
 
-#if ((defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT))
+#if ((defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || (defined TM_PARALLELXYZT))
     for (x0 = 0; x0 < T; x0++) {
       for (x2 = 0; x2 < LY; x2++) {
         for (x3 = 0; x3 < LZ; x3++) {
@@ -74,7 +74,7 @@ int check_xchange() {
     }
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
     for (x0 = 0; x0 < T; x0++) {
       for (x1 = 0; x1 < LX; x1++) {
         for (x3 = 0; x3 < LZ; x3++) {
@@ -113,7 +113,7 @@ int check_xchange() {
       }
     }
 
-#if ((defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT))
+#if ((defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || (defined TM_PARALLELXYZT))
     x = (double*)&g_spinor_field[0][(VOLUME + 2 * LX * LY * LZ) / 2];
     for (i = 0; i < T * LY * LZ / 2 * 24; i++, x++) {
       if ((int)(*x) != g_nb_x_up) {
@@ -139,7 +139,7 @@ int check_xchange() {
     }
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
     x = (double*)&g_spinor_field[0][(VOLUME + 2 * LX * LY * LZ) / 2 + 2 * T * LY * LZ / 2];
     for (i = 0; i < T * LX * LZ / 2 * 24; i++, x++) {
       if ((int)(*x) != g_nb_y_up) {
@@ -166,7 +166,7 @@ int check_xchange() {
     }
 #endif
 
-#if (defined PARALLELXYZT)
+#if (defined TM_PARALLELXYZT)
     set_spinor_field(0, -1.);
 
     for (x0 = 0; x0 < T; x0++) {
@@ -270,7 +270,7 @@ int check_xchange() {
       }
     }
 
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
     /* Set the x boundary */
     for (x0 = 0; x0 < T; x0++) {
       for (x2 = 0; x2 < LY; x2++) {
@@ -284,7 +284,7 @@ int check_xchange() {
     }
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
     /* Set the y boundary */
     for (x0 = 0; x0 < T; x0++) {
       for (x1 = 0; x1 < LX; x1++) {
@@ -298,7 +298,7 @@ int check_xchange() {
     }
 #endif
 
-#if (defined PARALLELXYZT)
+#if (defined TM_PARALLELXYZT)
     /* Set the z boundary */
     for (x0 = 0; x0 < T; x0++) {
       for (x1 = 0; x1 < LX; x1++) {
@@ -340,7 +340,7 @@ int check_xchange() {
       }
     }
 
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
     x = (double*)&g_gauge_field[(T + 2) * LX * LY * LZ][0];
     for (i = 0; i < T * LY * LZ * 72; i++, x++) {
       if ((int)(*x) != g_nb_x_up) {
@@ -368,7 +368,7 @@ int check_xchange() {
     }
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
     x = (double*)&g_gauge_field[(T + 2) * LX * LY * LZ + 2 * T * LZ * LY][0];
     for (i = 0; i < T * LX * LZ * 72; i++, x++) {
       if ((int)(*x) != g_nb_y_up) {
@@ -396,7 +396,7 @@ int check_xchange() {
     }
 #endif
 
-#if (defined PARALLELXYZT)
+#if (defined TM_PARALLELXYZT)
     x = (double*)g_gauge_field[VOLUME + 2 * LX * LY * LZ + 2 * T * LZ * LY + 2 * T * LX * LZ];
     for (i = 0; i < T * LX * LY * 72; i++, x++) {
       if ((int)(*x) != g_nb_z_up) {
@@ -504,7 +504,7 @@ int check_xchange() {
     MPI_Barrier(MPI_COMM_WORLD);
 
     /* The edges */
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
     fprintf(stdout, "# Rank: %d, (c0, c1, c2, c3) = (%d, %d, %d, %d)\n", g_proc_id,
             g_proc_coords[0], g_proc_coords[1], g_proc_coords[2], g_proc_coords[3]);
     fflush(stdout);
@@ -577,7 +577,7 @@ int check_xchange() {
     }
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
     di[1] = (g_proc_coords[1] - 1) % g_nproc_x;
     di[2] = (g_proc_coords[2] - 1) % g_nproc_y;
     di[0] = g_proc_coords[0];
@@ -712,7 +712,7 @@ int check_xchange() {
       }
     }
 #endif
-#ifdef PARALLELXYZT
+#ifdef TM_PARALLELXYZT
     di[1] = (g_proc_coords[1] - 1) % g_nproc_x;
     di[3] = (g_proc_coords[3] - 1) % g_nproc_z;
     di[0] = g_proc_coords[0];
@@ -1001,7 +1001,7 @@ int check_xchange() {
         }
       }
 
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
       x = (double*)&g_gauge_field[VOLUMEPLUSRAND + 2 * LX * LY * LZ][0];
       for (i = 0; i < T * LY * LZ * 72; i++, x++) {
         if ((int)(*x) != g_nb_x_up) {
@@ -1029,7 +1029,7 @@ int check_xchange() {
       }
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
       x = (double*)&g_gauge_field[VOLUMEPLUSRAND + 2 * LX * LY * LZ + 2 * T * LZ * LY][0];
       for (i = 0; i < T * LX * LZ * 72; i++, x++) {
         if ((int)(*x) != g_nb_y_up) {
@@ -1058,7 +1058,7 @@ int check_xchange() {
       }
 #endif
 
-#if (defined PARALLELXYZT)
+#if (defined TM_PARALLELXYZT)
       x = (double*)&g_gauge_field[VOLUMEPLUSRAND + 2 * LX * LY * LZ + 2 * T * LZ * LY +
                                   2 * T * LX * LZ][0];
       for (i = 0; i < T * LX * LY * 72; i++, x++) {
@@ -1088,7 +1088,7 @@ int check_xchange() {
       }
 #endif
 
-#if defined PARALLELXYZT
+#if defined TM_PARALLELXYZT
 
       set_gauge_field(-1.);
 
@@ -1279,7 +1279,7 @@ int check_xchange() {
           }
         }
       }
-#if defined PARALLELXYZT
+#if defined TM_PARALLELXYZT
       /* Set the tz boundary */
       for (x1 = 0; x1 < LX; x1++) {
         for (x2 = 0; x2 < LY; x2++) {
@@ -1332,7 +1332,7 @@ int check_xchange() {
       xchange_gauge(g_gauge_field);
       MPI_Barrier(MPI_COMM_WORLD);
 
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
       di[0] = (g_proc_coords[0] - 1) % g_nproc_t;
       di[1] = (g_proc_coords[1] - 1) % g_nproc_x;
       di[2] = g_proc_coords[2];
@@ -1453,7 +1453,7 @@ int check_xchange() {
       }
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
 
       di[1] = (g_proc_coords[1] - 1) % g_nproc_x;
       di[2] = (g_proc_coords[2] - 1) % g_nproc_y;
@@ -1693,7 +1693,7 @@ int check_xchange() {
         }
       }
 #endif
-#if defined PARALLELXYZT
+#if defined TM_PARALLELXYZT
 
       di[0] = (g_proc_coords[0] - 1) % g_nproc_t;
       di[3] = (g_proc_coords[3] - 1) % g_nproc_z;
@@ -2123,7 +2123,7 @@ int check_xchange() {
         }
       }
     }
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
     for (x0 = 0; x0 < T; x0++) {
       for (x2 = 0; x2 < LY; x2++) {
         for (x3 = 0; x3 < LZ; x3++) {
@@ -2145,7 +2145,7 @@ int check_xchange() {
       }
     }
 #endif
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
     for (x0 = 0; x0 < T; x0++) {
       for (x1 = 0; x1 < LX; x1++) {
         for (x3 = 0; x3 < LZ; x3++) {
@@ -2167,7 +2167,7 @@ int check_xchange() {
       }
     }
 #endif
-#if defined PARALLELXYZT
+#if defined TM_PARALLELXYZT
     for (x0 = 0; x0 < T; x0++) {
       for (x1 = 0; x1 < LX; x1++) {
         for (x2 = 0; x2 < LY; x2++) {
@@ -2194,7 +2194,7 @@ int check_xchange() {
     xchange_deri(df0);
     MPI_Barrier(MPI_COMM_WORLD);
 
-#if defined PARALLELT
+#if defined TM_PARALLELT
     for (x1 = 0; x1 < LX; x1++) {
       for (x2 = 0; x2 < LY; x2++) {
         for (x3 = 0; x3 < LZ; x3++) {
@@ -2228,7 +2228,7 @@ int check_xchange() {
       }
     }
 #endif
-#if defined PARALLELXT
+#if defined TM_PARALLELXT
     for (x1 = 1; x1 < LX - 1; x1++) {
       for (x2 = 0; x2 < LY; x2++) {
         for (x3 = 0; x3 < LZ; x3++) {
@@ -2351,7 +2351,7 @@ int check_xchange() {
       }
     }
 #endif
-#if defined PARALLELXYT
+#if defined TM_PARALLELXYT
     for (x1 = 1; x1 < LX - 1; x1++) {
       for (x2 = 1; x2 < LY - 1; x2++) {
         for (x3 = 0; x3 < LZ; x3++) {
@@ -2748,7 +2748,7 @@ int check_xchange() {
 
 #endif
 
-#if defined PARALLELXYZT
+#if defined TM_PARALLELXYZT
     for (x1 = 1; x1 < LX - 1; x1++) {
       for (x2 = 1; x2 < LY - 1; x2++) {
         for (x3 = 1; x3 < LZ - 1; x3++) {
@@ -3026,7 +3026,7 @@ int check_xchange() {
       }
     }
 
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
 
     // xt edge
     for (x2 = 0; x2 < LY; x2++) {
@@ -3063,7 +3063,7 @@ int check_xchange() {
     }
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
 
     // ty edge
     for (x1 = 0; x1 < LX; x1++) {
@@ -3139,7 +3139,7 @@ int check_xchange() {
     xchange_deri(df0);
     MPI_Barrier(MPI_COMM_WORLD);
 
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
 
     di[0] = (g_proc_coords[0] - 1) % g_nproc_t;
     di[1] = (g_proc_coords[1] - 1) % g_nproc_x;
@@ -3156,7 +3156,7 @@ int check_xchange() {
     di[1] = (g_proc_coords[1] + 1) % g_nproc_x;
     MPI_Cart_rank(g_cart_grid, di, &pp);
 
-#ifdef PARALLELXT
+#ifdef TM_PARALLELXT
     for (x2 = 0; x2 < LY; x2++) {
       for (x3 = 0; x3 < LZ; x3++) {
 #else
@@ -3224,7 +3224,7 @@ int check_xchange() {
 
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
 
     // xy-edge
     di[1] = (g_proc_coords[1] - 1) % g_nproc_x;
diff --git a/src/lib/test/measure_rectangles.debug.c b/src/lib/test/measure_rectangles.debug.c
index 75a71d2b2..422f681b2 100644
--- a/src/lib/test/measure_rectangles.debug.c
+++ b/src/lib/test/measure_rectangles.debug.c
@@ -61,10 +61,10 @@ double measure_rectangles() {
   char filename[100];
 
   sprintf(filename, "debug_mr.s");
-#ifdef PARALLELT
+#ifdef TM_PARALLELT
   sprintf(filename, "debug_mr.pt.%d", g_proc_id);
 #endif
-#ifdef PARALLELXT
+#ifdef TM_PARALLELXT
   sprintf(filename, "debug_mr.pxt.%d", g_proc_id);
 #endif
   debugfile = fopen(filename, "w");
diff --git a/src/lib/update_backward_gauge.c b/src/lib/update_backward_gauge.c
index a041e577c..b28ab6acf 100644
--- a/src/lib/update_backward_gauge.c
+++ b/src/lib/update_backward_gauge.c
@@ -25,7 +25,7 @@
 #include "su3.h"
 #include "update_backward_gauge.h"
 
-#if defined _USE_HALFSPINOR
+#if defined TM_USE_HALFSPINOR
 void update_backward_gauge(su3** const gf) {
 #ifdef TM_USE_OMP
 #pragma omp parallel
diff --git a/src/lib/update_gauge.c b/src/lib/update_gauge.c
index dde4cbf31..af4730e01 100644
--- a/src/lib/update_gauge.c
+++ b/src/lib/update_gauge.c
@@ -39,7 +39,7 @@
 #include "su3spinor.h"
 #include "update_gauge.h"
 #include "xchange/xchange.h"
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
 #include "DDalphaAMG_interface.h"
 #endif
 /*******************************************************
@@ -51,7 +51,7 @@
 void update_gauge(const double step, hamiltonian_field_t *const hf) {
   tm_stopwatch_push(&g_timers, __func__, "");
   update_tm_gauge_id(&g_gauge_state, step);
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
   MG_update_gauge(step);
 #endif
 
@@ -65,7 +65,7 @@ void update_gauge(const double step, hamiltonian_field_t *const hf) {
     su3 *z;
     static su3adj deriv;
     su3adj *xm;
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst begin(updategauge)
 #endif
 
@@ -115,7 +115,7 @@ void update_gauge(const double step, hamiltonian_field_t *const hf) {
 
   tm_stopwatch_pop(&g_timers, 0, 1, "");
   return;
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst end(updategauge)
 #endif
 }
diff --git a/src/lib/update_momenta_fg.c b/src/lib/update_momenta_fg.c
index 0aab582cd..cf1e9e4fb 100644
--- a/src/lib/update_momenta_fg.c
+++ b/src/lib/update_momenta_fg.c
@@ -44,7 +44,7 @@
 #include "su3adj.h"
 #include "su3spinor.h"
 #include "xchange/xchange.h"
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
 #include "DDalphaAMG_interface.h"
 #endif
 
@@ -123,7 +123,7 @@ void fg_update_momenta_reset_gaugefield(const double step, hamiltonian_field_t *
  *******************************************************/
 void update_momenta_fg(int *mnllist, double step, const int no, hamiltonian_field_t *const hf,
                        double step0) {
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
   MG_update_gauge(0.0);
 #endif
   if (g_exposu3_no_c == 0) init_exposu3();
@@ -156,7 +156,7 @@ void update_momenta_fg(int *mnllist, double step, const int no, hamiltonian_fiel
   /* for parallelization */
   xchange_gauge(hf->gaugefield);
 #endif
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
   MG_update_gauge(0.0);
 #endif
 
@@ -201,7 +201,7 @@ void update_momenta_fg(int *mnllist, double step, const int no, hamiltonian_fiel
   /* for parallelization */
   xchange_gauge(hf->gaugefield);
 #endif
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
   MG_update_gauge(0.0);
 #endif
 
diff --git a/src/lib/update_tm.c b/src/lib/update_tm.c
index 72a6194e7..3f1cdc5d5 100644
--- a/src/lib/update_tm.c
+++ b/src/lib/update_tm.c
@@ -64,7 +64,7 @@
 #include "su3.h"
 #include "update_tm.h"
 #include "xchange/xchange.h"
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
 #include "DDalphaAMG_interface.h"
 #endif
 
@@ -120,7 +120,7 @@ int update_tm(double *plaquette_energy, double *rectangle_energy, char *filename
     }
   }
 
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
   MG_reset();
 #endif
 
@@ -211,7 +211,7 @@ int update_tm(double *plaquette_energy, double *rectangle_energy, char *filename
       free(xlfInfo);
     }
 
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
     MG_reset();
 #endif
 
@@ -354,7 +354,7 @@ int update_tm(double *plaquette_energy, double *rectangle_energy, char *filename
     // will result in the updated gauge field to be propagated
     update_tm_gauge_id(&g_gauge_state, TM_GAUGE_PROPAGATE_THRESHOLD);
     update_tm_gauge_id(&g_gauge_state_32, TM_GAUGE_PROPAGATE_THRESHOLD);
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
     MG_reset();
 #endif
   }
diff --git a/src/lib/util/io.c b/src/lib/util/io.c
index 6df42d288..4f6267c78 100644
--- a/src/lib/util/io.c
+++ b/src/lib/util/io.c
@@ -36,7 +36,7 @@
  *
  */
 
-#define _FILE_OFFSET_BITS 64
+#define _FILE_OFFSET_BITS 64
 
 #include "io.h"
 #include <stdio.h>
diff --git a/src/lib/util/laguer/Makefile b/src/lib/util/laguer/Makefile
deleted file mode 100644
index f9bce70e3..000000000
--- a/src/lib/util/laguer/Makefile
+++ /dev/null
@@ -1,9 +0,0 @@
-CXX=g++
-CXXFLAGS=-g -O2
-CLNDIR=${HOME}/daten/workdir/cln/
-
-chebyRoot: chebyRoot.C Makefile chebyRoot.H
-	${CXX} $< -g -o $@ -I${CLNDIR}/include/ -L${CLNDIR}/lib -lcln -lm
-
-clean:
-	rm -f *.o chebyRoot *.dat *.log *~
diff --git a/src/lib/util/oox/Makefile b/src/lib/util/oox/Makefile
deleted file mode 100644
index 88de5bdd5..000000000
--- a/src/lib/util/oox/Makefile
+++ /dev/null
@@ -1,46 +0,0 @@
-CC=gcc
-CXX=g++
-CFLAGS=-O2 -fexpensive-optimizations -fomit-frame-pointer # -mfpmath=sse -msse2 
-LIBS=-lm
-OBJECTS_OOX=oox.o
-INCLUDE=-I./
-
-
-# variables for oox_ga executable
-# if you want to compile with ga lib support
-# please adjust the GALIBPATH variable
-# to the toplevel dir of galib
-# it is assumed that you compiled the library
-# such that a libga.a file is present in the 
-# ./ga subdir of galib
-GALIBPATH=/usr1/scratch/annube/galib247
-LIBS_GA=${LIBS} -L${GALIBPATH}/ga -lga
-CFLAGS_GA=${CFLAGS} -DWITHGALIB
-INCLUDE_GA=${INCLUDE} -I${GALIBPATH}
-OBJECTS_OOX_GA=oox_ga.o oox_gawrapper.o
-
-
-all: oox oox_ga
-
-oox: ${OBJECTS_OOX} Makefile
-	${CXX} ${OBJECTS_OOX} -o $@ ${CFLAGS} ${LIBS}
-
-oox_ga: ${OBJECTS_OOX_GA} Makefile
-	${CXX} ${OBJECTS_OOX_GA} -o $@ ${CFLAGS_GA} ${LIBS_GA}
-
-oox_gawrapper.o: oox_gawrapper.cxx
-	${CXX} ${CFLAGS_GA} -o $@ -c $< ${INCLUDE_GA}
-
-oox_ga.o: oox.c
-	${CC} ${CFLAGS_GA} -o $@ -c $< ${INCLUDE_GA}
-
-clean:
-	rm oox oox_ga *.o
-
-.SUFFIXES:
-
-%.o: %.c
-	${CC} ${CFLAGS}	-o $@ -c $< ${INCLUDE}
-
-%.o: %.cxx
-	${CXX} ${CFLAGS} -o $@ -c $< ${INCLUDE}
diff --git a/src/lib/wrapper/lib_wrapper.c b/src/lib/wrapper/lib_wrapper.c
index 6c95a27d5..9f083adc5 100644
--- a/src/lib/wrapper/lib_wrapper.c
+++ b/src/lib/wrapper/lib_wrapper.c
@@ -121,7 +121,7 @@ int tmLQCD_invert_init(int argc, char* argv[], const int _verbose, const int ext
   for (int j = 0; j < no_operators; j++)
     if (!operator_list[j].even_odd_flag) even_odd_flag = 0;
 
-#ifdef _GAUGE_COPY
+#ifdef TM_GAUGE_COPY
   int j = init_gauge_field(VOLUMEPLUSRAND, 1);
   j += init_gauge_field_32(VOLUMEPLUSRAND, 1);
 #else
@@ -161,7 +161,7 @@ int tmLQCD_invert_init(int argc, char* argv[], const int _verbose, const int ext
   // initialise the operators
   init_operators();
 
-#ifdef _USE_HALFSPINOR
+#ifdef TM_USE_HALFSPINOR
   j = init_dirac_halfspinor();
   if (j != 0) {
     fprintf(stderr, "tmLQCD_init_invert: Not enough memory for halffield! Aborting...\n");
@@ -172,7 +172,7 @@ int tmLQCD_invert_init(int argc, char* argv[], const int _verbose, const int ext
     fprintf(stderr, "tmLQCD_init_invert: Not enough memory for 32-bit halffield! Aborting...\n");
     return (-1);
   }
-#if (defined _PERSISTENT)
+#if (defined TM_PERSISTENT)
   if (even_odd_flag) init_xchange_halffield();
 #endif
 #endif
diff --git a/src/lib/xchange/xchange_2fields.c b/src/lib/xchange/xchange_2fields.c
index c5dfa86a8..c311bf908 100644
--- a/src/lib/xchange/xchange_2fields.c
+++ b/src/lib/xchange/xchange_2fields.c
@@ -41,18 +41,18 @@
 #include "su3.h"
 #include "xchange_2fields.h"
 
-#if (defined _NON_BLOCKING)
+#if (defined TM_NON_BLOCKING)
 
 /* this version uses non-blocking MPI calls */
 void xchange_2fields(spinor* const l, spinor* const k, const int ieo) {
   MPI_Request requests[32];
   MPI_Status status[32];
   int reqcount = 0;
-#if defined PARALLELXYZT
+#if defined TM_PARALLELXYZT
   int ix = 0;
 #endif
 
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst begin(xchange2fields)
 #endif
 
@@ -88,7 +88,7 @@ void xchange_2fields(spinor* const l, spinor* const k, const int ieo) {
             g_cart_grid, &requests[reqcount + 1]);
   reqcount = reqcount + 2;
 
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the left in x direction */
   /* recieve the data from the neighbour on the right in x direction */
   MPI_Isend((void*)l, 1, field_x_slice_gath, g_nb_x_dn, 91, g_cart_grid, &requests[reqcount]);
@@ -120,7 +120,7 @@ void xchange_2fields(spinor* const l, spinor* const k, const int ieo) {
   reqcount = reqcount + 2;
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the left in y direction */
   /* recieve the data from the neighbour on the right in y direction */
   MPI_Isend((void*)l, 1, field_y_slice_gath, g_nb_y_dn, 101, g_cart_grid, &requests[reqcount]);
@@ -153,7 +153,7 @@ void xchange_2fields(spinor* const l, spinor* const k, const int ieo) {
 
 #endif
 
-#if (defined PARALLELXYZT)
+#if (defined TM_PARALLELXYZT)
   /* fill buffer ! */
   /* This is now depending on whether the field is */
   /* even or odd */
@@ -237,8 +237,8 @@ void xchange_2fields(spinor* const l, spinor* const k, const int ieo) {
   MPI_Waitall(reqcount, requests, status);
 #endif
   return;
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst end(xchange2fields)
 #endif
 }
-#endif /*  _NON_BLOCKING */
+#endif /*  TM_NON_BLOCKING */
diff --git a/src/lib/xchange/xchange_2fields.h b/src/lib/xchange/xchange_2fields.h
index 35dc7f6c5..6a83085f0 100644
--- a/src/lib/xchange/xchange_2fields.h
+++ b/src/lib/xchange/xchange_2fields.h
@@ -31,7 +31,7 @@
 #define EVEN 1
 #define ODD 0
 
-#ifdef _NON_BLOCKING
+#ifdef TM_NON_BLOCKING
 void xchange_2fields(spinor* const k, spinor* const l, const int ieo);
 #else
 #define xchange_2fields(k, l, ieo) \
diff --git a/src/lib/xchange/xchange_deri.c b/src/lib/xchange/xchange_deri.c
index a260ed8b6..7defa1e7c 100644
--- a/src/lib/xchange/xchange_deri.c
+++ b/src/lib/xchange/xchange_deri.c
@@ -55,7 +55,7 @@ void xchange_deri(su3adj** const df) {
 #ifdef TM_USE_MPI
   int ix, iy, t, y, z, x;
   MPI_Status status;
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* The edges need to come first */
 
   /* send the data to the neighbour on the left in t direction */
@@ -96,9 +96,9 @@ void xchange_deri(su3adj** const df) {
     }
   }
 
-#endif /* (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT) */
+#endif /* (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT) */
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* edges */
 
   /* send the data to the neighbour on the left in x direction */
@@ -178,9 +178,9 @@ void xchange_deri(su3adj** const df) {
     }
   }
 
-#endif /* (defined PARALLELXYT || defined PARALLELXYZT) */
+#endif /* (defined TM_PARALLELXYT || defined TM_PARALLELXYZT) */
 
-#ifdef PARALLELXYZT
+#ifdef TM_PARALLELXYZT
 
   /* send the data to the neighbour on the left in x direction */
   /* recieve the data from the neighbour on the right in x direction */
@@ -305,7 +305,7 @@ void xchange_deri(su3adj** const df) {
     }
   }
 
-#endif /* PARALLELXYZT */
+#endif /* TM_PARALLELXYZT */
 
   // now the normal boundaries
 
@@ -341,7 +341,7 @@ void xchange_deri(su3adj** const df) {
     }
   }
 
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the left in x direction */
   /* recieve the data from the neighbour on the right in x direction */
   MPI_Sendrecv((void*)df[(T + 2) * LX * LY * LZ + T * LY * LZ], 1, deri_x_slice_cont, g_nb_x_dn, 42,
@@ -372,9 +372,9 @@ void xchange_deri(su3adj** const df) {
     }
   }
 
-#endif /* (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT) */
+#endif /* (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT) */
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
 
   /* send the data to the neighbour on the left in y direction */
   /* recieve the data from the neighbour on the right in y direction */
@@ -406,9 +406,9 @@ void xchange_deri(su3adj** const df) {
     }
   }
 
-#endif /* (defined PARALLELXYT || defined PARALLELXYZT) */
+#endif /* (defined TM_PARALLELXYT || defined TM_PARALLELXYZT) */
 
-#ifdef PARALLELXYZT
+#ifdef TM_PARALLELXYZT
   /* send the data to the neighbour on the left in y direction */
   /* recieve the data from the neighbour on the right in y direction */
   MPI_Sendrecv(
@@ -441,7 +441,7 @@ void xchange_deri(su3adj** const df) {
     }
   }
 
-#endif /* PARALLELXYZT */
+#endif /* TM_PARALLELXYZT */
 #endif /* MPI */
   return;
 }
diff --git a/src/lib/xchange/xchange_field.c b/src/lib/xchange/xchange_field.c
index 576574789..417aa8981 100644
--- a/src/lib/xchange/xchange_field.c
+++ b/src/lib/xchange/xchange_field.c
@@ -35,7 +35,7 @@
 #ifdef TM_USE_MPI
 #include <mpi.h>
 #endif
-#ifdef _USE_SHMEM
+#ifdef TM_USE_SHMEM
 #include <mpp/shmem.h>
 #endif
 
@@ -44,30 +44,30 @@
 #include "su3.h"
 #include "xchange_field.h"
 
-#if (defined PARALLELXYZT)
+#if (defined TM_PARALLELXYZT)
 #pragma disjoint(*field_buffer_z2, *field_buffer_z)
 #endif
 
 /* this version uses non-blocking MPI calls */
-#if (defined _NON_BLOCKING)
+#if (defined TM_NON_BLOCKING)
 
 void xchange_field(spinor* const l, const int ieo) {
 #ifdef TM_USE_MPI
   MPI_Request requests[16];
   MPI_Status status[16];
 #endif
-#ifdef PARALLELT
+#ifdef TM_PARALLELT
   int reqcount = 4;
-#elif defined PARALLELXT
+#elif defined TM_PARALLELXT
   int reqcount = 8;
-#elif defined PARALLELXYT
+#elif defined TM_PARALLELXYT
   int reqcount = 12;
-#elif defined PARALLELXYZT
+#elif defined TM_PARALLELXYZT
   int ix = 0;
   int reqcount = 16;
 #endif
 
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst begin(xchangefield)
 #endif
 
@@ -84,7 +84,7 @@ void xchange_field(spinor* const l, const int ieo) {
     MPI_Isend((void*)l, 1, field_time_slice_cont, g_nb_t_dn, 81, g_cart_grid, &requests[0]);
     MPI_Irecv((void*)(l + T * LX * LY * LZ / 2), 1, field_time_slice_cont, g_nb_t_up, 81,
               g_cart_grid, &requests[1]);
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
     /* send the data to the neighbour on the left in x direction */
     /* recieve the data from the neighbour on the right in x direction */
     MPI_Isend((void*)l, 1, field_x_slice_gath, g_nb_x_dn, 91, g_cart_grid, &requests[4]);
@@ -92,7 +92,7 @@ void xchange_field(spinor* const l, const int ieo) {
               g_cart_grid, &requests[5]);
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
     /* send the data to the neighbour on the left in y direction */
     /* recieve the data from the neighbour on the right in y direction */
     MPI_Isend((void*)l, 1, field_y_slice_gath, g_nb_y_dn, 101, g_cart_grid, &requests[8]);
@@ -100,7 +100,7 @@ void xchange_field(spinor* const l, const int ieo) {
               g_nb_y_up, 101, g_cart_grid, &requests[9]);
 #endif
 
-#if (defined PARALLELXYZT)
+#if (defined TM_PARALLELXYZT)
     /* fill buffer ! */
     /* This is now depending on whether the field is */
     /* even or odd */
@@ -129,7 +129,7 @@ void xchange_field(spinor* const l, const int ieo) {
     MPI_Irecv((void*)(l + (T + 1) * LX * LY * LZ / 2), 1, field_time_slice_cont, g_nb_t_dn, 82,
               g_cart_grid, &requests[3]);
 
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
     /* send the data to the neighbour on the right in x direction */
     /* recieve the data from the neighbour on the left in x direction */
     MPI_Isend((void*)(l + (LX - 1) * LY * LZ / 2), 1, field_x_slice_gath, g_nb_x_up, 92,
@@ -138,7 +138,7 @@ void xchange_field(spinor* const l, const int ieo) {
               g_nb_x_dn, 92, g_cart_grid, &requests[7]);
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
     /* send the data to the neighbour on the right in y direction */
     /* recieve the data from the neighbour on the left in y direction */
     MPI_Isend((void*)(l + (LY - 1) * LZ / 2), 1, field_y_slice_gath, g_nb_y_up, 102, g_cart_grid,
@@ -147,7 +147,7 @@ void xchange_field(spinor* const l, const int ieo) {
               field_y_slice_cont, g_nb_y_dn, 102, g_cart_grid, &requests[11]);
 #endif
 
-#if defined PARALLELXYZT
+#if defined TM_PARALLELXYZT
     if (ieo == 1) {
       for (ix = T * LX * LY / 2; ix < T * LX * LY; ix++) {
         field_buffer_z2[ix - T * LX * LY / 2] = l[g_field_z_ipt_even[ix]];
@@ -174,7 +174,7 @@ void xchange_field(spinor* const l, const int ieo) {
               g_cart_grid, &requests[0]);
     MPI_Irecv((void*)(l + (T + 1) * LX * LY * LZ / 2), 1, field_time_slice_cont, g_nb_t_dn, 82,
               g_cart_grid, &requests[1]);
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
     /* send the data to the neighbour on the right in x direction */
     /* recieve the data from the neighbour on the left in x direction */
     MPI_Isend((void*)(l + (LX - 1) * LY * LZ / 2), 1, field_x_slice_gath, g_nb_x_up, 92,
@@ -183,7 +183,7 @@ void xchange_field(spinor* const l, const int ieo) {
               g_nb_x_dn, 92, g_cart_grid, &requests[5]);
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
     /* send the data to the neighbour on the right in y direction */
     /* recieve the data from the neighbour on the left in y direction */
     MPI_Isend((void*)(l + (LY - 1) * LZ / 2), 1, field_y_slice_gath, g_nb_y_up, 102, g_cart_grid,
@@ -192,7 +192,7 @@ void xchange_field(spinor* const l, const int ieo) {
               field_y_slice_cont, g_nb_y_dn, 102, g_cart_grid, &requests[9]);
 #endif
 
-#if (defined PARALLELXYZT)
+#if (defined TM_PARALLELXYZT)
     /* fill buffer ! */
     /* This is now depending on whether the field is */
     /* even or odd */
@@ -218,7 +218,7 @@ void xchange_field(spinor* const l, const int ieo) {
     MPI_Isend((void*)l, 1, field_time_slice_cont, g_nb_t_dn, 81, g_cart_grid, &requests[2]);
     MPI_Irecv((void*)(l + T * LX * LY * LZ / 2), 1, field_time_slice_cont, g_nb_t_up, 81,
               g_cart_grid, &requests[3]);
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
     /* send the data to the neighbour on the left in x direction */
     /* recieve the data from the neighbour on the right in x direction */
     MPI_Isend((void*)l, 1, field_x_slice_gath, g_nb_x_dn, 91, g_cart_grid, &requests[6]);
@@ -226,7 +226,7 @@ void xchange_field(spinor* const l, const int ieo) {
               g_cart_grid, &requests[7]);
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
     /* send the data to the neighbour on the left in y direction */
     /* recieve the data from the neighbour on the right in y direction */
     MPI_Isend((void*)l, 1, field_y_slice_gath, g_nb_y_dn, 101, g_cart_grid, &requests[10]);
@@ -234,7 +234,7 @@ void xchange_field(spinor* const l, const int ieo) {
               g_nb_y_up, 101, g_cart_grid, &requests[11]);
 #endif
 
-#if defined PARALLELXYZT
+#if defined TM_PARALLELXYZT
     if (ieo == 1) {
       for (ix = T * LX * LY / 2; ix < T * LX * LY; ix++) {
         field_buffer_z2[ix - T * LX * LY / 2] = l[g_field_z_ipt_even[ix]];
@@ -259,12 +259,12 @@ void xchange_field(spinor* const l, const int ieo) {
 #endif
 
   return;
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst end(xchangefield)
 #endif
 }
 
-#elif (defined _USE_SHMEM) /* _NON_BLOCKING */
+#elif (defined TM_USE_SHMEM) /* TM_NON_BLOCKING */
 
 /* Here comes the version with shared memory */
 /* exchanges the field  l */
@@ -273,7 +273,7 @@ void xchange_field(spinor* const l, const int ieo) {
 #ifdef TM_USE_MPI
   int i, ix, mu, x0, x1, x2, x3, k;
 
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst begin(xchangefield)
 #endif
 
@@ -283,7 +283,7 @@ void xchange_field(spinor* const l, const int ieo) {
   shmem_double_put((double*)(l + (T + 1) * LX * LY * LZ / 2),
                    (double*)(l + (T - 1) * LX * LY * LZ / 2), (LX * LY * LZ * 12), g_nb_t_up);
 
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   k = (T + 2) * LX * LY * LZ / 2;
   for (x0 = 0; x0 < T; x0++) {
     shmem_double_put((double*)(l + k), (double*)(l + g_lexic2eo[g_ipt[x0][0][0][0]]), 12 * LZ * LY,
@@ -298,7 +298,7 @@ void xchange_field(spinor* const l, const int ieo) {
   }
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   k = ((T + 2) * LX * LY * LZ + 2 * T * LY * LZ) / 2;
   for (x0 = 0; x0 < T; x0++) {
     for (x1 = 0; x1 < LX; x1++) {
@@ -317,7 +317,7 @@ void xchange_field(spinor* const l, const int ieo) {
   }
 #endif
 
-#if (defined PARALLELXYZT)
+#if (defined TM_PARALLELXYZT)
   x0 = (VOLUME / 2 + LX * LY * LZ + T * LY * LZ + T * LX * LZ);
   if (ieo == 1) {
     for (k = 0; k < T * LX * LY / 2; k++) {
@@ -347,21 +347,21 @@ void xchange_field(spinor* const l, const int ieo) {
   shmem_barrier_all();
 #endif  // MPI
   return;
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst end(xchangefield)
 #endif
 }
 
 /* Here comes the naive version */
 /* Using MPI_Sendrecv */
-#else /* _NON_BLOCKING _USE_SHMEM */
+#else /* TM_NON_BLOCKING TM_USE_SHMEM */
 /* exchanges the field  l */
 void xchange_field(spinor* const l, const int ieo) {
 
-#ifdef PARALLELXYZT
+#ifdef TM_PARALLELXYZT
   int x0 = 0, x1 = 0, x2 = 0, ix = 0;
 #endif
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst begin(xchangefield)
 #endif
 
@@ -379,7 +379,7 @@ void xchange_field(spinor* const l, const int ieo) {
                (void*)(l + (T + 1) * LX * LY * LZ / 2), 1, field_time_slice_cont, g_nb_t_dn, 82,
                g_cart_grid, &status);
 
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the left in x direction */
   /* recieve the data from the neighbour on the right in x direction */
   MPI_Sendrecv((void*)l, 1, field_x_slice_gath, g_nb_x_dn, 91,
@@ -394,7 +394,7 @@ void xchange_field(spinor* const l, const int ieo) {
 
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the left in y direction */
   /* recieve the data from the neighbour on the right in y direction */
   MPI_Sendrecv((void*)l, 1, field_y_slice_gath, g_nb_y_dn, 101,
@@ -409,7 +409,7 @@ void xchange_field(spinor* const l, const int ieo) {
 
 #endif
 
-#if (defined PARALLELXYZT)
+#if (defined TM_PARALLELXYZT)
   /* fill buffer ! */
   /* This is now depending on whether the field is */
   /* even or odd */
@@ -448,9 +448,9 @@ void xchange_field(spinor* const l, const int ieo) {
 #endif
 #endif  // MPI
   return;
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst end(xchangefield)
 #endif
 }
 
-#endif /* _NON_BLOCKING */
+#endif /* TM_NON_BLOCKING */
diff --git a/src/lib/xchange/xchange_gauge.c b/src/lib/xchange/xchange_gauge.c
index 3465d970f..6177a3dbb 100644
--- a/src/lib/xchange/xchange_gauge.c
+++ b/src/lib/xchange/xchange_gauge.c
@@ -38,7 +38,7 @@
 #include "su3adj.h"
 #include "xchange_gauge.h"
 
-#if defined _NON_BLOCKING
+#if defined TM_NON_BLOCKING
 void xchange_gauge(su3** const gf) {
   int cntr = 0;
 #ifdef TM_USE_MPI
@@ -80,7 +80,7 @@ void xchange_gauge(su3** const gf) {
     cntr = cntr + 2;
   }
 
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the left in x direction */
   /* recieve the data from the neighbour on the right in x direction */
   MPI_Isend(gf[0], 1, gauge_x_slice_gath, g_nb_x_dn, 87, g_cart_grid, &request[cntr]);
@@ -117,7 +117,7 @@ void xchange_gauge(su3** const gf) {
 #endif
   MPI_Waitall(cntr, request, status);
   cntr = 0;
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* The edges */
 
   /* send the data to the neighbour on the left in t direction */
@@ -175,10 +175,10 @@ void xchange_gauge(su3** const gf) {
               g_cart_grid, &request[cntr + 1]);
     cntr = cntr + 2;
   }
-  /* end of if defined PARALLELXT || PARALLELXYT || PARALLELXYZT*/
+  /* end of if defined TM_PARALLELXT || TM_PARALLELXYT || TM_PARALLELXYZT*/
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the left in y direction */
   /* recieve the data from the neighbour on the right in y direction */
   MPI_Isend(gf[0], 1, gauge_y_slice_gath, g_nb_y_dn, 106, g_cart_grid, &request[cntr]);
@@ -212,7 +212,7 @@ void xchange_gauge(su3** const gf) {
 #endif
   MPI_Waitall(cntr, request, status);
   cntr = 0;
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
 
   /* jetzt wirds richtig eklig ... */
 
@@ -326,9 +326,9 @@ void xchange_gauge(su3** const gf) {
     cntr = cntr + 2;
   }
 
-  /* end of if defined PARALLELXYT || PARALLELXYZT */
+  /* end of if defined TM_PARALLELXYT || TM_PARALLELXYZT */
 #endif
-#if defined PARALLELXYZT
+#if defined TM_PARALLELXYZT
   /* z-Rand */
   /* send the data to the neighbour on the left in z direction */
   /* recieve the data from the neighbour on the right in z direction */
@@ -361,7 +361,7 @@ void xchange_gauge(su3** const gf) {
   }
 #endif
   MPI_Waitall(cntr, request, status);
-#if defined PARALLELXYZT
+#if defined TM_PARALLELXYZT
   cntr = 0;
   /* edges */
 
@@ -538,13 +538,13 @@ void xchange_gauge(su3** const gf) {
   }
   MPI_Waitall(cntr, request, status);
 
-  /* end of if defined PARALLELXYZT */
+  /* end of if defined TM_PARALLELXYZT */
 #endif
 #endif
   return;
 }
 
-#else /* _NON_BLOCKING */
+#else /* TM_NON_BLOCKING */
 void xchange_gauge(su3** const gf) {
 
 #ifdef TM_USE_MPI
@@ -576,7 +576,7 @@ void xchange_gauge(su3** const gf) {
                  g_cart_grid, &status);
   }
 
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the left in x direction */
   /* recieve the data from the neighbour on the right in x direction */
   MPI_Sendrecv(gf[0], 1, gauge_x_slice_gath, g_nb_x_dn, 93, gf[(T + 2) * LX * LY * LZ], 1,
@@ -648,10 +648,10 @@ void xchange_gauge(su3** const gf) {
                  g_nb_t_up, 98, gf[VOLUMEPLUSRAND + RAND + 6 * LY * LZ], 1, gauge_xt_edge_cont,
                  g_nb_t_dn, 98, g_cart_grid, &status);
   }
-  /* end of if defined PARALLELXT || PARALLELXYT || PARALLELXYZT*/
+  /* end of if defined TM_PARALLELXT || TM_PARALLELXYT || TM_PARALLELXYZT*/
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the left in y direction */
   /* recieve the data from the neighbour on the right in y direction */
   MPI_Sendrecv(gf[0], 1, gauge_y_slice_gath, g_nb_y_dn, 103,
@@ -770,9 +770,9 @@ void xchange_gauge(su3** const gf) {
                  gauge_ty_edge_cont, g_nb_y_dn, 298, g_cart_grid, &status);
   }
 
-  /* end of if defined PARALLELXYT || PARALLELXYZT */
+  /* end of if defined TM_PARALLELXYT || TM_PARALLELXYZT */
 #endif
-#if defined PARALLELXYZT
+#if defined TM_PARALLELXYZT
   /* z-Rand */
   /* send the data to the neighbour on the left in z direction */
   /* recieve the data from the neighbour on the right in z direction */
@@ -954,11 +954,11 @@ void xchange_gauge(su3** const gf) {
                  1, gauge_zy_edge_cont, g_nb_y_dn, 510, g_cart_grid, &status);
   }
 
-  /* end of if defined PARALLELXYZT */
+  /* end of if defined TM_PARALLELXYZT */
 #endif
 #endif
   return;
 }
 
 
-#endif /* _NON_BLOCKING */
+#endif /* TM_NON_BLOCKING */
diff --git a/src/lib/xchange/xchange_halffield.c b/src/lib/xchange/xchange_halffield.c
index d1eae8a04..3948aa1ca 100644
--- a/src/lib/xchange/xchange_halffield.c
+++ b/src/lib/xchange/xchange_halffield.c
@@ -41,9 +41,9 @@
 #include "su3.h"
 #include "xchange_halffield.h"
 
-#if (defined _USE_HALFSPINOR)
+#if (defined TM_USE_HALFSPINOR)
 
-#if (defined _PERSISTENT)
+#if (defined TM_PERSISTENT)
 
 MPI_Request prequests[16];
 
@@ -51,13 +51,13 @@ MPI_Request prequests[16];
 void init_xchange_halffield() {
 #ifdef TM_USE_MPI
 
-#ifdef PARALLELT
+#ifdef TM_PARALLELT
   int reqcount = 4;
-#elif defined PARALLELXT
+#elif defined TM_PARALLELXT
   int reqcount = 8;
-#elif defined PARALLELXYT
+#elif defined TM_PARALLELXYT
   int reqcount = 12;
-#elif defined PARALLELXYZT
+#elif defined TM_PARALLELXYZT
   int x0 = 0, x1 = 0, x2 = 0, ix = 0;
   int reqcount = 16;
 #endif
@@ -78,7 +78,7 @@ void init_xchange_halffield() {
   MPI_Recv_init((void*)(recvBuffer), LX * LY * LZ * 12 / 2, MPI_DOUBLE, g_nb_t_up, 82, g_cart_grid,
                 &prequests[3]);
 
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
 
   /* send the data to the neighbour on the right in x direction */
   /* recieve the data from the neighbour on the left in x direction */
@@ -97,7 +97,7 @@ void init_xchange_halffield() {
                 g_cart_grid, &prequests[7]);
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the right in y direction */
   /* recieve the data from the neighbour on the left in y direction */
   MPI_Send_init((void*)(sendBuffer + LX * LY * LZ + T * LY * LZ), T * LX * LZ * 12 / 2, MPI_DOUBLE,
@@ -115,7 +115,7 @@ void init_xchange_halffield() {
                 g_nb_y_up, 102, g_cart_grid, &prequests[11]);
 #endif
 
-#if (defined PARALLELXYZT)
+#if (defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the right in z direction */
   /* recieve the data from the neighbour on the left in z direction */
   MPI_Send_init((void*)(sendBuffer + LX * LY * LZ + T * LY * LZ + T * LX * LZ),
@@ -141,13 +141,13 @@ void xchange_halffield() {
 #ifdef TM_USE_MPI
 
   MPI_Status status[16];
-#ifdef PARALLELT
+#ifdef TM_PARALLELT
   int reqcount = 4;
-#elif defined PARALLELXT
+#elif defined TM_PARALLELXT
   int reqcount = 8;
-#elif defined PARALLELXYT
+#elif defined TM_PARALLELXYT
   int reqcount = 12;
-#elif defined PARALLELXYZT
+#elif defined TM_PARALLELXYZT
   int x0 = 0, x1 = 0, x2 = 0, ix = 0;
   int reqcount = 16;
 #endif
@@ -158,7 +158,7 @@ void xchange_halffield() {
   return;
 }
 
-#else /* def (_USE_SHMEM || _PERSISTENT) */
+#else /* def (TM_USE_SHMEM || TM_PERSISTENT) */
 /* 4. */
 void xchange_halffield() {
 
@@ -166,17 +166,17 @@ void xchange_halffield() {
 
   MPI_Request requests[16];
   MPI_Status status[16];
-#ifdef PARALLELT
+#ifdef TM_PARALLELT
   int reqcount = 4;
-#elif defined PARALLELXT
+#elif defined TM_PARALLELXT
   int reqcount = 8;
-#elif defined PARALLELXYT
+#elif defined TM_PARALLELXYT
   int reqcount = 12;
-#elif defined PARALLELXYZT
+#elif defined TM_PARALLELXYZT
   int reqcount = 16;
 #endif
 
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst begin(xchangehalf)
 #endif
   /* send the data to the neighbour on the right in t direction */
@@ -193,7 +193,7 @@ void xchange_halffield() {
   MPI_Irecv((void*)(recvBuffer), LX * LY * LZ * 12 / 2, MPI_DOUBLE, g_nb_t_up, 82, g_cart_grid,
             &requests[3]);
 
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
 
   /* send the data to the neighbour on the right in x direction */
   /* recieve the data from the neighbour on the left in x direction */
@@ -210,7 +210,7 @@ void xchange_halffield() {
             g_cart_grid, &requests[7]);
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the right in y direction */
   /* recieve the data from the neighbour on the left in y direction */
   MPI_Isend((void*)(sendBuffer + LX * LY * LZ + T * LY * LZ), T * LX * LZ * 12 / 2, MPI_DOUBLE,
@@ -226,7 +226,7 @@ void xchange_halffield() {
             g_nb_y_up, 102, g_cart_grid, &requests[11]);
 #endif
 
-#if (defined PARALLELXYZT)
+#if (defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the right in z direction */
   /* recieve the data from the neighbour on the left in z direction */
   MPI_Isend((void*)(sendBuffer + LX * LY * LZ + T * LY * LZ + T * LX * LZ), T * LX * LY * 12 / 2,
@@ -246,27 +246,27 @@ void xchange_halffield() {
 #endif /* MPI */
   return;
 
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst end(xchangehalf)
 #endif
 }
 
-#endif /* def (_USE_SHMEM || _PERSISTENT) */
+#endif /* def (TM_USE_SHMEM || TM_PERSISTENT) */
 void xchange_halffield32() {
 #ifdef TM_USE_MPI
 
   MPI_Request requests[16];
   MPI_Status status[16];
-#ifdef PARALLELT
+#ifdef TM_PARALLELT
   int reqcount = 4;
-#elif defined PARALLELXT
+#elif defined TM_PARALLELXT
   int reqcount = 8;
-#elif defined PARALLELXYT
+#elif defined TM_PARALLELXYT
   int reqcount = 12;
-#elif defined PARALLELXYZT
+#elif defined TM_PARALLELXYZT
   int reqcount = 16;
 #endif
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst begin(xchangehalf32)
 #endif
 
@@ -284,7 +284,7 @@ void xchange_halffield32() {
   MPI_Irecv((void*)(recvBuffer32), LX * LY * LZ * 12 / 2, MPI_FLOAT, g_nb_t_up, 82, g_cart_grid,
             &requests[3]);
 
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
 
   /* send the data to the neighbour on the right in x direction */
   /* recieve the data from the neighbour on the left in x direction */
@@ -301,7 +301,7 @@ void xchange_halffield32() {
             g_cart_grid, &requests[7]);
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the right in y direction */
   /* recieve the data from the neighbour on the left in y direction */
   MPI_Isend((void*)(sendBuffer32 + LX * LY * LZ + T * LY * LZ), T * LX * LZ * 12 / 2, MPI_FLOAT,
@@ -317,7 +317,7 @@ void xchange_halffield32() {
             g_nb_y_up, 102, g_cart_grid, &requests[11]);
 #endif
 
-#if (defined PARALLELXYZT)
+#if (defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the right in z direction */
   /* recieve the data from the neighbour on the left in z direction */
   MPI_Isend((void*)(sendBuffer32 + LX * LY * LZ + T * LY * LZ + T * LX * LZ), T * LX * LY * 12 / 2,
@@ -336,8 +336,8 @@ void xchange_halffield32() {
   MPI_Waitall(reqcount, requests, status);
 #endif /* MPI */
   return;
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst end(xchangehalf32)
 #endif
 }
-#endif /* defined _USE_HALFSPINOR */
+#endif /* defined TM_USE_HALFSPINOR */
diff --git a/src/lib/xchange/xchange_lexicfield.c b/src/lib/xchange/xchange_lexicfield.c
index 9def17fc6..56cc4315c 100644
--- a/src/lib/xchange/xchange_lexicfield.c
+++ b/src/lib/xchange/xchange_lexicfield.c
@@ -43,7 +43,7 @@
 #include "xchange_lexicfield.h"
 
 /* this version uses non-blocking MPI calls */
-#if (defined _NON_BLOCKING)
+#if (defined TM_NON_BLOCKING)
 
 /* this is the version independent of the content of the function Index (only available with
  * non-blocking)) */
@@ -51,16 +51,16 @@
 void xchange_lexicfield(spinor* const l) {
   MPI_Request requests[16];
   MPI_Status status[16];
-#ifdef PARALLELT
+#ifdef TM_PARALLELT
   int reqcount = 4;
-#elif defined PARALLELXT
+#elif defined TM_PARALLELXT
   int reqcount = 8;
-#elif defined PARALLELXYT
+#elif defined TM_PARALLELXYT
   int reqcount = 12;
-#elif defined PARALLELXYZT
+#elif defined TM_PARALLELXYZT
   int reqcount = 16;
 #endif
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst begin(xchange_lexicfield)
 #endif
 
@@ -71,7 +71,7 @@ void xchange_lexicfield(spinor* const l) {
   MPI_Isend((void*)l, 1, lfield_time_slice_cont, g_nb_t_dn, 5081, g_cart_grid, &requests[0]);
   MPI_Irecv((void*)(l + VOLUME), 1, lfield_time_slice_cont, g_nb_t_up, 5081, g_cart_grid,
             &requests[1]);
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the left in x direction */
   /* recieve the data from the neighbour on the right in x direction */
   MPI_Isend((void*)l, 1, lfield_x_slice_gath, g_nb_x_dn, 5091, g_cart_grid, &requests[4]);
@@ -80,7 +80,7 @@ void xchange_lexicfield(spinor* const l) {
 
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the left in y direction */
   /* recieve the data from the neighbour on the right in y direction */
   MPI_Isend((void*)l, 1, lfield_y_slice_gath, g_nb_y_dn, 5101, g_cart_grid, &requests[8]);
@@ -88,7 +88,7 @@ void xchange_lexicfield(spinor* const l) {
             5101, g_cart_grid, &requests[9]);
 #endif
 
-#if (defined PARALLELXYZT)
+#if (defined TM_PARALLELXYZT)
 
   /* send the data to the neighbour on the left in z direction */
   /* recieve the data from the neighbour on the right in z direction */
@@ -103,7 +103,7 @@ void xchange_lexicfield(spinor* const l) {
   MPI_Irecv((void*)(l + (T + 1) * LX * LY * LZ), 1, lfield_time_slice_cont, g_nb_t_dn, 5082,
             g_cart_grid, &requests[3]);
 
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the right in x direction */
   /* recieve the data from the neighbour on the left in x direction */
   MPI_Isend((void*)(l + (LX - 1) * LY * LZ), 1, lfield_x_slice_gath, g_nb_x_up, 5092, g_cart_grid,
@@ -112,7 +112,7 @@ void xchange_lexicfield(spinor* const l) {
             5092, g_cart_grid, &requests[7]);
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the right in y direction */
   /* recieve the data from the neighbour on the left in y direction */
   MPI_Isend((void*)(l + (LY - 1) * LZ), 1, lfield_y_slice_gath, g_nb_y_up, 5102, g_cart_grid,
@@ -121,7 +121,7 @@ void xchange_lexicfield(spinor* const l) {
             g_nb_y_dn, 5102, g_cart_grid, &requests[11]);
 #endif
 
-#if defined PARALLELXYZT
+#if defined TM_PARALLELXYZT
 
   /* send the data to the neighbour on the right in y direction */
   /* recieve the data from the neighbour on the left in y direction */
@@ -135,21 +135,21 @@ void xchange_lexicfield(spinor* const l) {
 
 #endif
   return;
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst end(xchange_lexicfield)
 #endif
 }
 
 /* Here comes the naive version */
 /* Using MPI_Sendrecv */
-#else /* _NON_BLOCKING */
+#else /* TM_NON_BLOCKING */
 /* exchanges the field  l */
 void xchange_lexicfield(spinor* const l) {
 
-#ifdef PARALLELXYZT
+#ifdef TM_PARALLELXYZT
   int x0 = 0, x1 = 0, x2 = 0, ix = 0;
 #endif
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst begin(xchange_lexicfield)
 #endif
 
@@ -167,7 +167,7 @@ void xchange_lexicfield(spinor* const l) {
                (void*)(l + (T + 1) * LX * LY * LZ), 1, lfield_time_slice_cont, g_nb_t_dn, 5082,
                g_cart_grid, &status);
 
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the left in x direction */
   /* recieve the data from the neighbour on the right in x direction */
   MPI_Sendrecv((void*)l, 1, lfield_x_slice_gath, g_nb_x_dn, 5091,
@@ -182,7 +182,7 @@ void xchange_lexicfield(spinor* const l) {
 
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the left in y direction */
   /* recieve the data from the neighbour on the right in y direction */
   MPI_Sendrecv((void*)l, 1, lfield_y_slice_gath, g_nb_y_dn, 5101,
@@ -197,7 +197,7 @@ void xchange_lexicfield(spinor* const l) {
 
 #endif
 
-#if (defined PARALLELXYZT)
+#if (defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the left in z direction */
   /* recieve the data from the neighbour on the right in z direction */
   MPI_Sendrecv((void*)l, 1, lfield_z_slice_gath, g_nb_z_dn, 5503,
@@ -214,7 +214,7 @@ void xchange_lexicfield(spinor* const l) {
 #endif
 #endif
   return;
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst end(xchange_lexicfield)
 #endif
 }
@@ -226,20 +226,20 @@ void xchange_lexicfield(spinor* const l) {
  ***********************************************************************/
 
 /* this version uses non-blocking MPI calls */
-#if (defined _NON_BLOCKING)
+#if (defined TM_NON_BLOCKING)
 void xchange_lexicfield32(spinor32* const l) {
   MPI_Request requests[16];
   MPI_Status status[16];
-#ifdef PARALLELT
+#ifdef TM_PARALLELT
   int reqcount = 4;
-#elif defined PARALLELXT
+#elif defined TM_PARALLELXT
   int reqcount = 8;
-#elif defined PARALLELXYT
+#elif defined TM_PARALLELXYT
   int reqcount = 12;
-#elif defined PARALLELXYZT
+#elif defined TM_PARALLELXYZT
   int reqcount = 16;
 #endif
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst begin(xchange_lexicfield32)
 #endif
 
@@ -250,7 +250,7 @@ void xchange_lexicfield32(spinor32* const l) {
   MPI_Isend((void*)l, 1, lfield_time_slice_cont32, g_nb_t_dn, 5081, g_cart_grid, &requests[0]);
   MPI_Irecv((void*)(l + VOLUME), 1, lfield_time_slice_cont32, g_nb_t_up, 5081, g_cart_grid,
             &requests[1]);
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the left in x direction */
   /* recieve the data from the neighbour on the right in x direction */
   MPI_Isend((void*)l, 1, lfield_x_slice_gath32, g_nb_x_dn, 5091, g_cart_grid, &requests[4]);
@@ -259,7 +259,7 @@ void xchange_lexicfield32(spinor32* const l) {
 
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the left in y direction */
   /* recieve the data from the neighbour on the right in y direction */
   MPI_Isend((void*)l, 1, lfield_y_slice_gath32, g_nb_y_dn, 5101, g_cart_grid, &requests[8]);
@@ -267,7 +267,7 @@ void xchange_lexicfield32(spinor32* const l) {
             5101, g_cart_grid, &requests[9]);
 #endif
 
-#if (defined PARALLELXYZT)
+#if (defined TM_PARALLELXYZT)
 
   /* send the data to the neighbour on the left in z direction */
   /* recieve the data from the neighbour on the right in z direction */
@@ -282,7 +282,7 @@ void xchange_lexicfield32(spinor32* const l) {
   MPI_Irecv((void*)(l + (T + 1) * LX * LY * LZ), 1, lfield_time_slice_cont32, g_nb_t_dn, 5082,
             g_cart_grid, &requests[3]);
 
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the right in x direction */
   /* recieve the data from the neighbour on the left in x direction */
   MPI_Isend((void*)(l + (LX - 1) * LY * LZ), 1, lfield_x_slice_gath32, g_nb_x_up, 5092, g_cart_grid,
@@ -291,7 +291,7 @@ void xchange_lexicfield32(spinor32* const l) {
             g_nb_x_dn, 5092, g_cart_grid, &requests[7]);
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the right in y direction */
   /* recieve the data from the neighbour on the left in y direction */
   MPI_Isend((void*)(l + (LY - 1) * LZ), 1, lfield_y_slice_gath32, g_nb_y_up, 5102, g_cart_grid,
@@ -300,7 +300,7 @@ void xchange_lexicfield32(spinor32* const l) {
             lfield_y_slice_cont32, g_nb_y_dn, 5102, g_cart_grid, &requests[11]);
 #endif
 
-#if defined PARALLELXYZT
+#if defined TM_PARALLELXYZT
 
   /* send the data to the neighbour on the right in y direction */
   /* recieve the data from the neighbour on the left in y direction */
@@ -314,21 +314,21 @@ void xchange_lexicfield32(spinor32* const l) {
 
 #endif
   return;
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst end(xchange_lexicfield32)
 #endif
 }
 
 /* Here comes the naive version */
 /* Using MPI_Sendrecv */
-#else /* _NON_BLOCKING */
+#else /* TM_NON_BLOCKING */
 /* exchanges the field  l */
 void xchange_lexicfield32(spinor32* const l) {
 
-#ifdef PARALLELXYZT
+#ifdef TM_PARALLELXYZT
   int x0 = 0, x1 = 0, x2 = 0, ix = 0;
 #endif
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst begin(xchange_lexicfield32)
 #endif
 
@@ -347,7 +347,7 @@ void xchange_lexicfield32(spinor32* const l) {
                (void*)(l + (T + 1) * LX * LY * LZ), 1, lfield_time_slice_cont32, g_nb_t_dn, 5082,
                g_cart_grid, &status);
 
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the left in x direction */
   /* recieve the data from the neighbour on the right in x direction */
   MPI_Sendrecv((void*)l, 1, lfield_x_slice_gath32, g_nb_x_dn, 5091,
@@ -362,7 +362,7 @@ void xchange_lexicfield32(spinor32* const l) {
 
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the left in y direction */
   /* recieve the data from the neighbour on the right in y direction */
   MPI_Sendrecv((void*)l, 1, lfield_y_slice_gath32, g_nb_y_dn, 5101,
@@ -377,7 +377,7 @@ void xchange_lexicfield32(spinor32* const l) {
 
 #endif
 
-#if (defined PARALLELXYZT)
+#if (defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the left in z direction */
   /* recieve the data from the neighbour on the right in z direction */
   MPI_Sendrecv((void*)l, 1, lfield_z_slice_gath32, g_nb_z_dn, 5503,
@@ -394,7 +394,7 @@ void xchange_lexicfield32(spinor32* const l) {
 #endif
 #endif
   return;
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst end(xchange_lexicfield32)
 #endif
 }

From 6d1e3d7cc22d292dc91dd46c55e9f6f78dec4191 Mon Sep 17 00:00:00 2001
From: Mathieu Taillefumier <mathieu.taillefumier@free.fr>
Date: Tue, 10 Feb 2026 17:12:26 +0100
Subject: [PATCH 10/80] [cmake] More work

- Add fftw
- Add option to compile the tests (OFF by default)
---
 CMakeLists.txt                    |  3 ++-
 cmake/tmlqcd_config_internal.h.in |  3 ---
 src/bin/CMakeLists.txt            | 25 +++++++++++++++++++++++++
 src/lib/CMakeLists.txt            |  7 +++----
 4 files changed, 30 insertions(+), 8 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 39adba1c5..2cacfcc39 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.24)
+cmake_minimum_required(VERSION 3.30)
 
 project(
   tmlqcd
@@ -103,6 +103,7 @@ option(TM_USE_SHMEM "Use shmem API" OFF)
 option(TM_USE_QUDA "Enable QUDA support" OFF)
 option(TM_USE_GPROF "Enable gprof profiler" OFF)
 option(TM_ENABLE_WARNINGS "Enable all warnings" ON)
+option(TM_ENABLE_TESTS "Enable tests" OFF)
 
 # MPI dependent options
 cmake_dependent_option(
diff --git a/cmake/tmlqcd_config_internal.h.in b/cmake/tmlqcd_config_internal.h.in
index 2765a2b7c..89bc753df 100644
--- a/cmake/tmlqcd_config_internal.h.in
+++ b/cmake/tmlqcd_config_internal.h.in
@@ -93,9 +93,6 @@
 /* Define to 1 to make fseeko visible on some hosts (e.g. glibc 2.2). */
 #cmakedefine TM_LARGEFILE_SOURCE
 
-/* Define for large files, on AIX-style hosts. */
-#cmakedefine TM_LARGE_FILES 
-
 /* Use even/odd geometry in the gauge fields */
 #cmakedefine TM_NEW_GEOMETRY
 
diff --git a/src/bin/CMakeLists.txt b/src/bin/CMakeLists.txt
index 29c9c1d8a..2f135ddae 100644
--- a/src/bin/CMakeLists.txt
+++ b/src/bin/CMakeLists.txt
@@ -17,3 +17,28 @@ foreach(_prog ${tmlqcd_prog})
                POSITION_INDEPENDENT_CODE ON
                LINKER_LANGUAGE "CXX")
 endforeach()
+
+if(TM_ENABLE_TESTS)
+  # Test program base names: no ".c" suffix, it is appended in the loop below.
+  list(
+    APPEND
+    tmlqcd_test_prog
+    "check_locallity;hopping_test;scalar_prod_r_test;test_eigenvalues")
+  if(TM_USE_LEMON)
+    list(APPEND tmlqcd_test_prog test_lemon)
+  endif()
+  if(TM_USE_QPHIX)
+    list(APPEND tmlqcd_test_prog qphix_test_Dslash)
+  endif()
+
+  foreach(_prog ${tmlqcd_test_prog})
+    add_executable(${_prog} "${CMAKE_SOURCE_DIR}/src/bin/tests/${_prog}.c")
+
+    target_link_libraries(${_prog} PUBLIC hmc)
+    set_target_properties(
+      ${_prog}
+      PROPERTIES IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
+                 POSITION_INDEPENDENT_CODE ON
+                 LINKER_LANGUAGE "CXX")
+  endforeach()
+endif()
diff --git a/src/lib/CMakeLists.txt b/src/lib/CMakeLists.txt
index 746b40c0d..ea2f7e41d 100644
--- a/src/lib/CMakeLists.txt
+++ b/src/lib/CMakeLists.txt
@@ -332,7 +332,7 @@ list(
   get_rectangle_staples.c
   rnd_gauge_trafo.c
   measure_rectangles.c
-  #invert.c
+  # invert.c
   deriv_Sb_D_psi.c
   mpi_init.c
   update_momenta_fg.c
@@ -414,7 +414,7 @@ endif()
 # create a target library with namespacing because cmake does not know name
 # space at all
 
-if (BUILD_SHARED_LIBS)
+if(BUILD_SHARED_LIBS)
   add_library(hmc SHARED "${ALL_SRC};${FLEX_tmlqcd_input_read_OUTPUTS}")
 else()
   add_library(hmc STATIC "${ALL_SRC};${FLEX_tmlqcd_input_read_OUTPUTS}")
@@ -449,8 +449,7 @@ target_link_libraries(
          m)
 
 target_compile_definitions(
-  hmc PUBLIC HAVE_CONFIG_H
-             $<$<BOOL:${TM_USE_HIP}>:${TM_GPU_PLATFORM_DFLAGS}>)
+  hmc PUBLIC HAVE_CONFIG_H $<$<BOOL:${TM_USE_HIP}>:${TM_GPU_PLATFORM_DFLAGS}>)
 
 target_include_directories(
   hmc

From 978fb302007696905f308cbae811608e0ea05abe Mon Sep 17 00:00:00 2001
From: Mathieu Taillefumier <mathieu.taillefumier@free.fr>
Date: Thu, 12 Feb 2026 11:48:58 +0100
Subject: [PATCH 11/80] Removed FindLemon.cmake

---
 cmake/FindLemon.cmake | 25 -------------------------
 1 file changed, 25 deletions(-)
 delete mode 100644 cmake/FindLemon.cmake

diff --git a/cmake/FindLemon.cmake b/cmake/FindLemon.cmake
deleted file mode 100644
index cdeca5e42..000000000
--- a/cmake/FindLemon.cmake
+++ /dev/null
@@ -1,25 +0,0 @@
-include(FindPackageHandleStandardArgs)
-
-find_library(
-  TMLQCD_LEMON_LIBRARIES
-  NAMES lemon
-  PATH_SUFFIXES "lib" "lib64")
-
-find_path(
-  TMLQCD_LEMON_INCLUDE_DIRS
-  NAMES lemon.h
-  PATH_SUFFIXES "include" "include/${_pacakge_name}" "${_package_name}")
-
-find_package_handle_standard_args(Lemon DEFAULT_MSG TMLQCD_LEMON_LIBRARIES
-                                  TMLQCD_LEMON_INCLUDE_DIRS)
-
-if(NOT TARGET tmlqcd::lemon)
-  add_library(tmlqcd::lemon INTERFACE IMPORTED)
-  set_target_properties(tmlqcd::lemon PROPERTIES INTERFACE_LINK_LIBRARIES
-                                                 "${TMLQCD_LEMON_LIBRARIES}")
-  set_target_properties(tmlqcd::lemon PROPERTIES INTERFACE_INCLUDE_DIRECTORIES
-                                                 "${TMLQCD_LEMON_INCLUDE_DIRS}")
-endif()
-
-set(TMLQCD_LEMON_FOUND ON)
-mark_as_advanced(TMLQCD_LEMON_LIBRARIES TMLQCD_LEMON_INCLUDE_DIRS)

From f94841132c99d3e02506b8c7826ea08e18abf612 Mon Sep 17 00:00:00 2001
From: Mathieu Taillefumier <mathieu.taillefumier@free.fr>
Date: Mon, 16 Feb 2026 10:05:47 +0100
Subject: [PATCH 12/80] [cmake] More work

---
 .../repo/packages/lemonio/package.py          |  22 ++-
 .github/workflows/basic-build.yaml            |  38 ++--
 .github/workflows/ddalphaamg-build.yaml       |  47 ++---
 .github/workflows/qphix-build.yaml            |  54 ++----
 CMakeLists.txt                                |  81 ++++----
 cmake/FindDDAlphaAMG.cmake                    |  29 ---
 cmake/FindDDalphaAMG.cmake                    |  28 +++
 cmake/tmlqcd_config_internal.h.in             |  12 +-
 profiling/hmc_mk2/logs/example_log.out        |   2 +-
 src/bin/LapH_ev.c                             | 180 ------------------
 src/bin/benchmark.c                           |   6 +-
 src/bin/deriv_mg_tune.c                       |   2 +-
 src/bin/hmc_tm.c                              |   2 +-
 src/bin/invert.c                              |   2 +-
 src/bin/offline_measurement.c                 |   2 +-
 src/bin/tests/check_locallity.c               |   6 +-
 src/bin/tests/hopping_test.c                  |   6 +-
 src/bin/tests/qphix_test_Dslash.c             |   4 +-
 src/bin/tests/test_eigenvalues.c              |   8 +-
 src/bin/tests/test_lemon.c                    |   2 +-
 src/lib/CMakeLists.txt                        |  15 +-
 src/lib/DDalphaAMG_interface.c                |  32 ++--
 src/lib/buffers/utils_generic_exchange.c      |   2 +-
 src/lib/deriv_Sb.c                            |  18 +-
 src/lib/geometry_eo.c                         |  27 +--
 src/lib/global.h                              |   1 -
 src/lib/init/init.h                           |   2 +-
 src/lib/init/init_dirac_halfspinor.c          |  18 +-
 src/lib/init/init_geometry_indices.c          |   1 -
 src/lib/io/utils_write_first_message.c        |   6 +-
 src/lib/linalg/assign.c                       |   1 -
 src/lib/linalg/assign_add_mul_r_32.c          |   2 +-
 src/lib/linalg/scalar_prod_r.c                |   1 -
 src/lib/matrix_utils.c                        |   5 +-
 src/lib/measure_gauge_action.c                |   2 +-
 src/lib/misc_types.h                          |   2 +-
 src/lib/mpi_init.c                            |  11 +-
 src/lib/mpi_init.h                            |   5 +-
 src/lib/operator/D_psi_body.c                 |   2 +-
 src/lib/operator/Hopping_Matrix.c             |   4 +-
 src/lib/operator/Hopping_Matrix_32.c          |   4 +-
 src/lib/operator/halfspinor_body.c            |   4 +-
 src/lib/operator/hopping_body_dbl.c           |  20 +-
 src/lib/operator/hopping_sgl.c                |  18 +-
 src/lib/operator/tm_sub_Hopping_Matrix.c      |   4 +-
 src/lib/operator/tm_times_Hopping_Matrix.c    |   6 +-
 src/lib/read_input.l                          |   2 +-
 src/lib/smearing/utils_reunitarize_MILC.c     |   4 +-
 src/lib/solver/gram-schmidt.c                 |   1 -
 src/lib/test/check_geometry.c                 |   7 +-
 src/lib/test/check_overlap.c                  |   2 +-
 src/lib/wrapper/lib_wrapper.c                 |   6 +-
 src/lib/xchange/xchange_gauge.c               |   1 -
 53 files changed, 271 insertions(+), 498 deletions(-)
 delete mode 100644 cmake/FindDDAlphaAMG.cmake
 create mode 100644 cmake/FindDDalphaAMG.cmake
 delete mode 100644 src/bin/LapH_ev.c

diff --git a/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/lemonio/package.py b/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/lemonio/package.py
index d70cac492..7508b4b79 100755
--- a/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/lemonio/package.py
+++ b/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/lemonio/package.py
@@ -2,12 +2,13 @@
 #
 # SPDX-License-Identifier: (Apache-2.0 OR MIT)
 
-from spack_repo.builtin.build_systems.autotools import AutotoolsPackage
+from spack_repo.builtin.build_systems import cmake
+from spack_repo.builtin.build_systems.cmake import CMakePackage, generator
 
 
 from spack.package import *
 
-class Lemonio(AutotoolsPackage):
+class Lemonio(AutotoolsPackage, CMakePackage):
     """LEMON: Lightweight Parallel I/O library for Lattice QCD."""
 
     homepage = "https://github.com/etmc/lemon"
@@ -16,13 +17,18 @@ class Lemonio(AutotoolsPackage):
 
     version('master', branch='master')
 
-    depends_on("autoconf", type="build", when="@master build_system=autotools")
-    depends_on("automake", type="build", when="@master build_system=autotools")
-    depends_on("libtool", type="build", when="@master build_system=autotools")
+    depends_on("libtool", type="build", when="@master build_system=cmake")
+    depends_on("cmake", type="build", when="master build_system=cmake")
 
     depends_on('mpi')
 
-    def configure_args(self):
-        args = []
-        args.append('CC={0}'.format(self.spec['mpi'].mpicc))
+    generator("ninja")
+
+class CMakeBuilder(cmake.CMakeBuilder):
+    def cmake_args(self):
+        spec = self.spec
+        args = [
+            self.define_from_variant("DBUILD_SHARED_LIBS" "shared"),
+        ]
         return args
+
diff --git a/.github/workflows/basic-build.yaml b/.github/workflows/basic-build.yaml
index afe18e145..d46b67830 100644
--- a/.github/workflows/basic-build.yaml
+++ b/.github/workflows/basic-build.yaml
@@ -35,16 +35,16 @@ jobs:
           repository: usqcd-software/c-lime
           path: lime
 
-      - name: autogen_lime
+      - name: create_builddir_lime
         working-directory: ${{github.workspace}}/lime
-        run: ./autogen.sh && mkdir build
+        run: mkdir build
 
       - name: build_lime
         working-directory: ${{github.workspace}}/lime/build
         run: |
           CC=gcc \
             CFLAGS="-march=haswell -mtune=haswell -O2" \
-            ../configure --prefix=$(pwd)/install_dir
+            cmake -DCMAKE_INSTALL_PREFIX=$(pwd)/install_dir .. >> config.log
           make -j
           make install
 
@@ -61,10 +61,9 @@ jobs:
           repository: etmc/lemon
           path: lemon
 
-      - name: autogen_lemon
+      - name: create_builddir_lemon
         working-directory: ${{github.workspace}}/lemon
         run: |
-          autoreconf -i -f
           mkdir build
 
       - name: build_lemon
@@ -72,9 +71,9 @@ jobs:
         run: |
           CC=mpicc \
             CFLAGS="-march=haswell -mtune=haswell -O2" \
-            ../configure --prefix=$(pwd)/install_dir
+            cmake -DCMAKE_INSTALL_PREFIX=$(pwd)/install_dir ..
           make -j
-          make install
+          make install > config.log
       
       - name: Archive lemon config.log
         if: ${{ always() }}
@@ -92,28 +91,19 @@ jobs:
         shell: bash
         run: mkdir ${{github.workspace}}/main/build
 
-      - name: autogen_tmlqcd
-        working-directory: ${{github.workspace}}/main
-        run: autoconf
-
       - name: configure_and_build
         shell: bash
         working-directory: ${{github.workspace}}/main/build
         run: |
-          CC=mpicc CXX=mpicxx \
-            LDFLAGS="-fopenmp" \
             CFLAGS="-O2 -mtune=haswell -march=haswell -mavx2 -mfma -DOMPI_SKIP_MPICXX -fopenmp" \
             CXXFLAGS="-O2 -mtune=haswell -march=haswell -mavx2 -mfma -DOMPI_SKIP_MPICXX -fopenmp" \
-            ../configure \
-            --enable-mpi \
-            --with-mpidimension=4 \
-            --enable-omp \
-            --disable-sse2 \
-            --disable-sse3 \
-            --with-limedir=${{github.workspace}}/lime/build/install_dir \
-            --with-lemondir=${{github.workspace}}/lemon/build/install_dir \
-            --with-lapack="-lblas -llapack" || cat config.log
-          make -j
+            cmake -DCMAKE_PREFIX_PATH="${{github.workspace}}/lime/build/install_dir;${{github.workspace}}/lemon/build/install_dir" \
+            -DTM_USE_MPI=ON \
+            -DTM_USE_OMP=ON \
+            -DTM_USE_LEMON=ON \
+            .. > config.log
+            cat config.log
+            make -j
 
       - name: Archive tmLQCD config.log
         if: ${{ always() }}
@@ -125,7 +115,7 @@ jobs:
       - name: nf2_rgmixedcg_hmc_tmcloverdetratio
         working-directory: ${{github.workspace}}/main/build
         run: |
-          mpirun -np 2 ./hmc_tm \
+          mpirun -np 2 src/bin/hmc_tm \
             -f ../doc/sample-input/sample-hmc-rgmixedcg-tmcloverdetratio.input
       
       - name: Archive nf2_rgmixedcg_hmc_tmcloverdetratio output
diff --git a/.github/workflows/ddalphaamg-build.yaml b/.github/workflows/ddalphaamg-build.yaml
index f50ffcae9..509fb28b6 100644
--- a/.github/workflows/ddalphaamg-build.yaml
+++ b/.github/workflows/ddalphaamg-build.yaml
@@ -40,19 +40,16 @@ jobs:
           repository: usqcd-software/c-lime
           path: lime
 
-      - name: autogen_lime
+      - name: create_builddir_lime
         working-directory: ${{github.workspace}}/lime
-        run: ./autogen.sh
-      
-      - name: create_lime_builddir
-        run: mkdir ${{github.workspace}}/lime/build
+        run: mkdir build
 
       - name: build_lime
         working-directory: ${{github.workspace}}/lime/build
         run: |
           CC=gcc \
             CFLAGS="-march=haswell -mtune=haswell -O2" \
-            ../configure --prefix=$(pwd)/install_dir
+            cmake -DCMAKE_INSTALL_PREFIX=$(pwd)/install_dir .. >> config.log
           make -j
           make install
 
@@ -69,23 +66,20 @@ jobs:
           repository: etmc/lemon
           path: lemon
 
-      - name: create_lemon_builddir
-        run: mkdir ${{github.workspace}}/lemon/build
-
-      - name: autogen_lemon
+      - name: create_builddir_lemon
         working-directory: ${{github.workspace}}/lemon
-        run: autoreconf -i -f
+        run: |
+          mkdir build
 
       - name: build_lemon
         working-directory: ${{github.workspace}}/lemon/build
         run: |
           CC=mpicc \
             CFLAGS="-march=haswell -mtune=haswell -O2" \
-            ../configure \
-            --prefix=$(pwd)/install_dir
+            cmake -DCMAKE_INSTALL_PREFIX=$(pwd)/install_dir ..
           make -j
-          make install
-
+          make install > config.log
+      
       - name: Archive lemon config.log
         if: ${{ always() }}
         uses: actions/upload-artifact@v4
@@ -111,10 +105,6 @@ jobs:
         shell: bash
         run: mkdir ${{github.workspace}}/main/build
 
-      - name: autogen_tmlqcd
-        working-directory: ${{github.workspace}}/main
-        run: autoconf
-
       - name: configure_and_build
         shell: bash
         working-directory: ${{github.workspace}}/main/build
@@ -123,22 +113,19 @@ jobs:
             LDFLAGS="-fopenmp" \
             CFLAGS="-O2 -mtune=haswell -march=haswell -mavx2 -mfma -DOMPI_SKIP_MPICXX -fopenmp" \
             CXXFLAGS="-O2 -mtune=haswell -march=haswell -mavx2 -mfma -DOMPI_SKIP_MPICXX -fopenmp" \
-            ../configure \
-            --enable-mpi \
-            --with-mpidimension=4 \
-            --enable-omp \
-            --disable-sse2 \
-            --disable-sse3 \
-            --with-limedir=${{github.workspace}}/lime/build/install_dir \
-            --with-lemondir=${{github.workspace}}/lemon/build/install_dir \
-            --with-DDalphaAMG=${{github.workspace}}/ddalphaamg \
-            --with-lapack="-lblas -llapack" || cat config.log
+            cmake -DCMAKE_PREFIX_PATH="${{github.workspace}}/lime/build/install_dir;${{github.workspace}}/lemon/build/install_dir;${{github.workspace}}/ddalphaamg" \
+            -DTM_USE_MPI=ON \
+            -DTM_USE_OMP=ON \
+            -DTM_USE_LEMON=ON \
+            -DTM_USE_DDalphaAMG=ON \
+            .. > config.log
+            cat config.log
           make -j
 
       - name: nf2_ddalphaamg_hmc_tmcloverdetratio
         working-directory: ${{github.workspace}}/main/build
         run: |
-          mpirun -np 2 ./hmc_tm \
+          mpirun -np 2 src/bin/hmc_tm \
             -f ../doc/sample-input/sample-hmc-ddalphaamg-tmcloverdetratio.input
 
       - name: Archive nf2_ddalphaamg_hmc_tmcloverdetratio output
diff --git a/.github/workflows/qphix-build.yaml b/.github/workflows/qphix-build.yaml
index 1b39cdf34..eef1b5055 100644
--- a/.github/workflows/qphix-build.yaml
+++ b/.github/workflows/qphix-build.yaml
@@ -35,16 +35,16 @@ jobs:
           repository: usqcd-software/c-lime
           path: lime
 
-      - name: autogen_lime
+      - name: create_builddir_lime
         working-directory: ${{github.workspace}}/lime
-        run: ./autogen.sh && mkdir build
+        run: mkdir build
 
       - name: build_lime
         working-directory: ${{github.workspace}}/lime/build
         run: |
           CC=gcc \
             CFLAGS="-march=haswell -mtune=haswell -O2" \
-            ../configure --prefix=$(pwd)/install_dir
+            cmake -DCMAKE_INSTALL_PREFIX=$(pwd)/install_dir .. >> config.log
           make -j
           make install
 
@@ -61,10 +61,9 @@ jobs:
           repository: etmc/lemon
           path: lemon
 
-      - name: autogen_lemon
+      - name: create_builddir_lemon
         working-directory: ${{github.workspace}}/lemon
         run: |
-          autoreconf -i -f
           mkdir build
 
       - name: build_lemon
@@ -72,11 +71,10 @@ jobs:
         run: |
           CC=mpicc \
             CFLAGS="-march=haswell -mtune=haswell -O2" \
-            ../configure \
-            --prefix=$(pwd)/install_dir
+            cmake -DCMAKE_INSTALL_PREFIX=$(pwd)/install_dir ..
           make -j
-          make install
-
+          make install > config.log
+      
       - name: Archive lemon config.log
         if: ${{ always() }}
         uses: actions/upload-artifact@v4
@@ -84,7 +82,6 @@ jobs:
           name: lemon_config_output
           path: ${{github.workspace}}/lemon/build/config.log 
 
-
       - name: get_qmp
         uses: actions/checkout@v4
         with:
@@ -151,9 +148,10 @@ jobs:
             -DCMAKE_C_COMPILER=mpicc \
             -DCMAKE_C_FLAGS="-std=c99 -O2 -mavx2 -mfma -mtune=haswell -march=haswell -fopenmp" \
             -DCMAKE_INSTALL_PREFIX=$(pwd)/install_dir \
-            ..
+            .. >> config.log
           VERBOSE=1 make -j $(( ${nb_cores} + 3 ))
-          make install
+          make install >> config.log
+          cat config.log
 
       - name: get_tmlqcd
         uses: actions/checkout@v4
@@ -164,31 +162,21 @@ jobs:
         shell: bash
         run: mkdir ${{github.workspace}}/main/build
 
-      - name: autogen_tmlqcd
-        working-directory: ${{github.workspace}}/main
-        run: autoconf
-
       - name: configure_and_build
         shell: bash
         working-directory: ${{github.workspace}}/main/build
         run: |
           CC=mpicc CXX=mpicxx \
-            LDFLAGS="-fopenmp" \
-            CFLAGS="-O2 -mtune=haswell -march=haswell -mavx2 -mfma -DOMPI_SKIP_MPICXX -fopenmp" \
-            CXXFLAGS="-O2 -mtune=haswell -march=haswell -mavx2 -mfma -DOMPI_SKIP_MPICXX -fopenmp" \
-            ../configure \
-            --enable-mpi \
-            --with-mpidimension=4 \
-            --enable-omp \
-            --disable-sse2 \
-            --disable-sse3 \
-            --with-limedir=${{github.workspace}}/lime/build/install_dir \
-            --with-lemondir=${{github.workspace}}/lemon/build/install_dir \
-            --with-lapack="-lblas -llapack" \
-            --with-qmpdir=${{github.workspace}}/qmp/build/install_dir \
-            --with-qphixdir=${{github.workspace}}/qphix/build/install_dir \
-            --enable-qphix-soalen=4 || cat config.log
-          make -j
+          cmake -DCMAKE_PREFIX_PATH="${{github.workspace}}/lime/build/install_dir;${{github.workspace}}/lemon/build/install_dir;${{github.workspace}}/qmp/build/install_dir;${{github.workspace}}/qphix/build/install_dir" \
+           -DTM_USE_MPI=ON \
+           -DTM_USE_OMP=ON \
+           -DTM_USE_LEMON=ON \
+           -DTM_USE_QPHIX=ON \
+           -DCMAKE_CXX_FLAGS="-O2 -mtune=haswell -march=haswell -mavx2 -mfma -DOMPI_SKIP_MPICXX -fopenmp" \
+           -DCMAKE_C_FLAGS="-O2 -mtune=haswell -march=haswell -mavx2 -mfma -DOMPI_SKIP_MPICXX -fopenmp" \
+           -DQPHIX_DIR="${{github.workspace}}/qphix/build/install_dir" \
+            ..
+          make -j > config.log
 
       - name: Archive tmLQCD config.log
         if: ${{ always() }}
@@ -200,7 +188,7 @@ jobs:
       - name: nf2_qphix_hmc_tmcloverdetratio
         working-directory: ${{github.workspace}}/main/build
         run: |
-          mpirun -np 2 ./hmc_tm \
+          mpirun -np 2 src/bin/hmc_tm \
             -f ../doc/sample-input/sample-hmc-qphix-tmcloverdetratio.input
 
       - name: Archive nf2_qphix_hmc_tmcloverdetratio output
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2cacfcc39..a375ad14b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -23,9 +23,6 @@ endif()
 # =================================================================================================
 # PROJECT AND VERSION
 include(CMakeDependentOption)
-include(CheckSymbolExists)
-include(CheckLibraryExists)
-include(CheckFunctionExists)
 include(GNUInstallDirs)
 
 cmake_policy(SET CMP0048 NEW)
@@ -56,16 +53,18 @@ endif()
 
 find_package(PkgConfig)
 
-# ##############################################################################
-# Define the paths for static libraries and executables
-# ##############################################################################
-set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY
-    ${cp2k_BINARY_DIR}/lib
-    CACHE PATH "Single output directory for building all libraries.")
-
 # Search for rocm in common locations
-foreach(__var ROCM_ROOT CRAY_ROCM_ROOT ORNL_ROCM_ROOT CRAY_ROCM_PREFIX
-              ROCM_PREFIX CRAY_ROCM_DIR)
+foreach(
+  __var
+  ROCM_ROOT
+  HIP_ROOT
+  HIP_PATH
+  CRAY_ROCM_ROOT
+  ORNL_ROCM_ROOT
+  CRAY_ROCM_PREFIX
+  ROCM_PREFIX
+  CRAY_ROCM_DIR
+  ROCM_PATH)
   if($ENV{${__var}})
     list(APPEND CMAKE_PREFIX_PATH $ENV{__var})
     set(ROCM_PATH
@@ -75,7 +74,7 @@ foreach(__var ROCM_ROOT CRAY_ROCM_ROOT ORNL_ROCM_ROOT CRAY_ROCM_PREFIX
 endforeach()
 
 option(CMAKE_POSITION_INDEPENDENT_CODE "Enable position independent code" ON)
-option(BUILD_SHARED_LIBS "Enable shared library" ON)
+option(BUILD_SHARED_LIBS "Enable shared library" OFF)
 option(TM_USE_FFTW "Enable fftw support" OFF)
 option(TM_USE_MPI "Enable MPI support" OFF)
 option(TM_USE_CUDA "Enable QUDA support" OFF)
@@ -93,15 +92,12 @@ set(TM_ENABLE_ALIGNMENT
 set_property(CACHE TM_ENABLE_ALIGNMENT PROPERTY STRINGS "auto" "none" "16" "32"
                                                 "64")
 
-option(TM_BGL_DRAM "use BGL dram window (BGL only!)" ON)
 option(TM_USE_OPTIMIZATION "enable optimisation" ON)
-option(TM_USE_GAUGECOPY "Enable use of a copy of the gauge field" ON)
+option(TM_USE_GAUGE_COPY "Enable use of a copy of the gauge field" ON)
 option(TM_USE_HALFSPINOR "Use a Dirac Op. with halfspinor exchange" ON)
-option(TM_USE_TSPLITPAR "Enable timeslice-splitted communications" ON)
 option(TM_USE_QPHIX "enable QPhiX" OFF)
 option(TM_USE_SHMEM "Use shmem API" OFF)
 option(TM_USE_QUDA "Enable QUDA support" OFF)
-option(TM_USE_GPROF "Enable gprof profiler" OFF)
 option(TM_ENABLE_WARNINGS "Enable all warnings" ON)
 option(TM_ENABLE_TESTS "Enable tests" OFF)
 
@@ -203,7 +199,7 @@ if(TM_USE_HDF5)
 endif()
 
 if(TM_USE_LEMON)
-  find_package(Clemon REQUIRED)
+  find_package(lemon REQUIRED)
 endif()
 
 find_package(CLime REQUIRED)
@@ -231,6 +227,8 @@ endif()
 
 if(TM_USE_CUDA OR QUDA_TARGET_CUDA)
   enable_language(CUDA)
+
+  # placeholder for nvhpc for future use
   if(TM_USE_NVHPC)
     find_package(NVHPC REQUIRED COMPONENTS CUDA MATH HOSTUTILS NCCL)
   else()
@@ -238,11 +236,11 @@ if(TM_USE_CUDA OR QUDA_TARGET_CUDA)
   endif()
 endif()
 
+# We may want to use hip-cuda for development or debugging purposes especially
+# if AMD GPU access is not possible. So allow it
+
 if(TM_USE_HIP OR QUDA_TARGET_HIP)
   enable_language(hip)
-
-  # we may want to use hip-cuda for development or debugging purposes especially
-  # if AMD GPU access is not possible. So allow it
   if(TM_USE_CUDA_HIP)
     find_package(CUDA)
   endif()
@@ -254,14 +252,15 @@ if(TM_USE_HIP OR QUDA_TARGET_HIP)
   endif()
 endif()
 
-if(TM_USE_QPIHX)
-  find_package(QPhiX REQUIRED)
+if(TM_USE_QPHIX)
+  find_package(QPhiX REQUIRED CONFIG)
+  message("${QPhiX_LIBRARIES}")
   if(NOT TARGET tmlqcd::qphix)
     add_library(tmlqcd::qphix INTERFACE IMPORTED)
     set_target_properties(tmlqcd::qphix PROPERTIES INTERFACE_LINK_LIBRARIES
-                                                   "${QPHIX_LIBRARIES}")
+                                                   "${QPhiX_LIBRARIES}")
     set_target_properties(tmlqcd::qphix PROPERTIES INTERFACE_INCLUDE_DIRECTORIES
-                                                   "${QPHIX_INCLUDE_DIRS}")
+                                                   "${QPhiX_INCLUDE_DIRS}")
   endif()
 endif()
 
@@ -274,17 +273,7 @@ if(TM_USE_FFTW)
 endif()
 
 if(TM_USE_DDalphaAMG)
-  find_package(DDAlphaAMG REQUIRED)
-endif()
-
-# gprofiler
-
-if(TM_USE_GPROF)
-  set(PROFILE_FLAGS "-pg;-g")
-  if(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "powerpc|powerpc64")
-    list(APPEND PROFILE_FLAGS "-qfullpath")
-  endif()
-  add_compile_options($<BOOL:$<COMPILE_LANGUAGE:C>:$PROFILE_FLAGS>)
+  find_package(DDalphaAMG REQUIRED)
 endif()
 
 if(TM_ENABLE_WARNINGS)
@@ -292,31 +281,26 @@ if(TM_ENABLE_WARNINGS)
                       $<$<COMPILE_LANG_AND_ID:CXX,GNU>:-Wall>)
 endif()
 
-# check for the presence of clock_gettime in libc or librt
-check_symbol_exists(clock_gettime "time.h" TM_CLOCK_GETTIME)
-check_library_exists(rt clock_gettime "" TM_CLOCK_GETTIME_IN_RT)
-check_function_exists(fseeko TM_FSEEKO)
-
 # set the parallelization
 
 if(TM_USE_MPI)
-  if(TM_MPI_DIMENSION EQUAL "1")
+  if(TM_MPI_DIMENSION STREQUAL "1")
     # T parallelisation
     set(TM_PARALLELT ON)
-  elseif(TM_MPI_DIMENSION EQUAL "2")
+  elseif(TM_MPI_DIMENSION STREQUAL "2")
     # XT parallelisation
     set(TM_PARALLELXT ON)
-  elseif(TM_MPI_DIMENSION EQUAL "3")
+  elseif(TM_MPI_DIMENSION STREQUAL "3")
     set(TM_PARALLELXYT ON)
     # XYZ parallelisation
-  elseif(TM_MPI_DIMENSION EQUAL "4")
+  elseif(TM_MPI_DIMENSION STREQUAL "4")
     # timeslice-splitted communications
     set(TM_PARALLELXYZT ON)
-  elseif(TM_MPI_DIMENSION EQUAL "X")
+  elseif(TM_MPI_DIMENSION STREQUAL "X")
     set(TM_PARALLELX ON)
-  elseif(TM_MPI_DIMENSION EQUAL "XY")
+  elseif(TM_MPI_DIMENSION STREQUAL "XY")
     set(TM_PARALLELXY ON)
-  elseif(TM_MPI_DIMENSION EQUAL "XYZ")
+  elseif(TM_MPI_DIMENSION STREQUAL "XYZ")
     set(TM_PARALLELXYZ ON)
   else()
     set(TM_PARALLELXYZT ON)
@@ -346,7 +330,6 @@ if(DEFINED GIT_EXE AND EXISTS "${PROJECT_SOURCE_DIR}/.git")
     ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
   message(STATUS "git hash ${TM_SHA}")
 else()
-  # set(TM_GIT_BRANCH "release v${SIRIUS_VERSION}")
   set(TM_SHA
       "https://github.com/etmc/tmLQCD/releases/tag/rel-${TMLQCD_VERSION_MAJOR}-${TMLQCD_VERSION_MINOR}"
   )
diff --git a/cmake/FindDDAlphaAMG.cmake b/cmake/FindDDAlphaAMG.cmake
deleted file mode 100644
index f42c943cc..000000000
--- a/cmake/FindDDAlphaAMG.cmake
+++ /dev/null
@@ -1,29 +0,0 @@
-include(FindPackageHandleStandardArgs)
-
-find_library(
-  TM_DDALPHAAMG_LIBRARIES
-  NAMES DDalphaAMG DDalphaAMG_devel
-  PATH_SUFFIXES "lib" "lib64")
-
-find_path(
-  TM_DDALPHAAMG_INCLUDE_DIRS
-  NAMES DDalphaAMG.h
-  PATH_SUFFIXES "include" "include/${_pacakge_name}" "${_package_name}")
-
-find_package_handle_standard_args(
-  DDAlphaAMG DEFAULT_MSG TMLQCD_DDALPHAAMG_LIBRARIES
-  TMLQCD_DDALPHAAMG_INCLUDE_DIRS)
-
-if(NOT TARGET tmlqcd::DDalphaAMG)
-  add_library(tmlqcd::DDalphaAMG INTERFACE IMPORTED)
-  set_target_properties(
-    tmlqcd::DDalphaAMG PROPERTIES INTERFACE_LINK_LIBRARIES
-                                  "${TMLQCD_DDALPHAAMG_LIBRARIES}")
-  set_target_properties(
-    tmlqcd::DDalphaAMG PROPERTIES INTERFACE_INCLUDE_DIRECTORIES
-                                  "${TMLQCD_DDALPHAAMG_INCLUDE_DIRS}")
-endif()
-
-set(TMLQCD_DDALPHAAMG_FOUND ON)
-mark_as_advanced(TMLQCD_DDALPHAAMG_FOUND TMLQCD_DDALPHAAMG_LIBRARIES
-                 TMLQCD_DDALPHAAMG_INCLUDE_DIRS)
diff --git a/cmake/FindDDalphaAMG.cmake b/cmake/FindDDalphaAMG.cmake
new file mode 100644
index 000000000..5f0d2450f
--- /dev/null
+++ b/cmake/FindDDalphaAMG.cmake
@@ -0,0 +1,28 @@
+include(FindPackageHandleStandardArgs)
+
+find_library(
+  TM_DDALPHAAMG_LIBRARIES
+  NAMES DDalphaAMG DDalphaAMG_devel
+  PATH_SUFFIXES "lib" "lib64")
+
+find_path(
+  TM_DDALPHAAMG_INCLUDE_DIRS
+  NAMES DDalphaAMG.h
+  PATH_SUFFIXES "include")
+
+find_package_handle_standard_args(
+  DDalphaAMG DEFAULT_MSG TM_DDALPHAAMG_LIBRARIES TM_DDALPHAAMG_INCLUDE_DIRS)
+
+if(TM_DDALPHAAMG_LIBRARIES
+   AND TM_DDALPHAAMG_INCLUDE_DIRS
+   AND NOT TARGET tmlqcd::DDalphaAMG)
+  message("INCLUDE: ${TM_DDALPHAAMG_INCLUDE_DIRS}")
+  add_library(tmlqcd::DDalphaAMG INTERFACE IMPORTED)
+  set_property(TARGET tmlqcd::DDalphaAMG PROPERTY INTERFACE_LINK_LIBRARIES
+                                                  "${TM_DDALPHAAMG_LIBRARIES}")
+  set_property(
+    TARGET tmlqcd::DDalphaAMG PROPERTY INTERFACE_INCLUDE_DIRECTORIES
+                                       "${TM_DDALPHAAMG_INCLUDE_DIRS}")
+endif()
+
+mark_as_advanced(TM_DDALPHAAMG_LIBRARIES TM_DDALPHAAMG_INCLUDE_DIRS)
diff --git a/cmake/tmlqcd_config_internal.h.in b/cmake/tmlqcd_config_internal.h.in
index 89bc753df..fb8d7d818 100644
--- a/cmake/tmlqcd_config_internal.h.in
+++ b/cmake/tmlqcd_config_internal.h.in
@@ -12,9 +12,6 @@
 /* Define to 1 if you have the `lemon' library (-llemon). */
 #cmakedefine TM_USE_LEMON 
 
-/* 1 if clock_gettime is available for use in benchmark */
-#cmakedefine TM_CLOCK_GETTIME 
-
 /* Compile with MPI support */
 #cmakedefine TM_USE_MPI
 
@@ -31,9 +28,9 @@
 #define PACKAGE_BUGREPORT "@PACKAGE_BUGREPORT@"
 
 /* Define to the full name of this package. */
-#define PACKAGE_NAME "@PROJECT_DESCRIPTION@"
+#define PACKAGE_NAME "@tmlqcd_DESCRIPTION@"
 /* Define to the full name and version of this package. */
-#define PACKAGE_STRING "@PROJECT_VERSION@"
+#define PACKAGE_STRING "@tmlqcd_VERSION@"
 
 /* Define to the one symbol short name of this package. */
 #define PACKAGE_TARNAME "@PACKAGE_TARNAME@"
@@ -65,9 +62,6 @@
 /* Fixed volume at compiletime */
 #cmakedefine TM_FIXEDVOLUME
 
-/* Define to 1 if fseeko (and presumably ftello) exists and is declared. */
-#cmakedefine TM_FSEEKO
-
 /* Alignment for arrays -- necessary for SSE and automated vectorization */
 #define ALIGN_BASE @ALIGN_BASE@
 
@@ -88,7 +82,7 @@
 #cmakedefine TM_FILE_OFFSET_BITS @TMLQCD_FILE_OFFSET_BITS@
 
 /* Construct an extra copy of the gauge fields */
-#cmakedefine TM_USE_GAUGECOPY
+#cmakedefine TM_USE_GAUGE_COPY
 
 /* Define to 1 to make fseeko visible on some hosts (e.g. glibc 2.2). */
 #cmakedefine TM_LARGEFILE_SOURCE
diff --git a/profiling/hmc_mk2/logs/example_log.out b/profiling/hmc_mk2/logs/example_log.out
index 22ec86ec9..642963b16 100644
--- a/profiling/hmc_mk2/logs/example_log.out
+++ b/profiling/hmc_mk2/logs/example_log.out
@@ -270,7 +270,7 @@ operator 0 parsed line 229
 This is the hmc code for twisted mass Wilson QCD
 
 Version 5.2.0, commit 51cf008a89944ecdd9345cdb62aaf0a203a7f306
-# The code is compiled with -DTM_GAUGE_COPY
+# The code is compiled with -DTM_USE_GAUGE_COPY
 # The code is compiled with -DTM_USE_HALFSPINOR
 # the code is compiled for non-blocking MPI calls (spinor and gauge)
 # the code is compiled with openMP support
diff --git a/src/bin/LapH_ev.c b/src/bin/LapH_ev.c
deleted file mode 100644
index 08e810b36..000000000
--- a/src/bin/LapH_ev.c
+++ /dev/null
@@ -1,180 +0,0 @@
-/***********************************************************************
- * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
- *
- * This file is part of tmLQCD.
- *
- * tmLQCD is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * tmLQCD is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
- ***********************************************************************/
-/*
- *  Program for computing the eigensystem of the Laplacian operator
- * Authors Luigi Scorzato, Marco Cristoforetti
- *
- *
- *******************************************************************************/
-
-#ifdef HAVE_CONFIG_H
-#include "tmlqcd_config.h"
-#else
-#error "no tmlqcd_config.h"
-#endif
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <time.h>
-#ifdef TM_USE_MPI
-#include <mpi.h>
-#endif
-#include <io/gauge.h>
-#include <io/params.h>
-#include "geometry_eo.h"
-#include "global.h"
-#include "init/init.h"
-#include "mpi_init.h"
-#include "ranlxd.h"
-#include "read_input.h"
-#include "solver/eigenvalues_Jacobi.h"
-#include "start.h"
-#include "su3.h"
-#include "xchange/xchange.h"
-
-int main(int argc, char *argv[]) {
-  int tslice, j, k;
-  char conf_filename[50];
-
-#ifdef TM_USE_MPI
-  MPI_Init(&argc, &argv);
-#endif
-
-  /* Read the input file */
-  read_input("LapH.input");
-
-  tmlqcd_mpi_init(argc, argv);
-
-  if (g_proc_id == 0) {
-#ifdef TM_GAUGE_COPY
-    printf("# The code was compiled with -DTM_GAUGE_COPY\n");
-#endif
-#ifdef TM_USE_HALFSPINOR
-    printf("# The code was compiled with -DTM_USE_HALFSPINOR\n");
-#endif
-#ifdef TM_USE_SHMEM
-    printf("# the code was compiled with -DTM_USE_SHMEM\n");
-#ifdef TM_PERSISTENT
-    printf("# the code was compiled for persistent MPI calls (halfspinor only)\n");
-#endif
-#endif
-#ifdef TM_USE_MPI
-#ifdef TM_NON_BLOCKING
-    printf("# the code was compiled for non-blocking MPI calls (spinor and gauge)\n");
-#endif
-#endif
-    printf("\n");
-    fflush(stdout);
-  }
-
-#ifndef WITHLAPH
-  printf(" Error: WITHLAPH not defined");
-  exit(0);
-#endif
-#ifdef TM_USE_MPI
-#ifndef _INDEX_INDEP_GEOM
-  printf(" Error: _INDEX_INDEP_GEOM not defined");
-  exit(0);
-#endif
-#ifndef _USE_TSPLITPAR
-  printf(" Error: _USE_TSPLITPAR not defined");
-  exit(0);
-#endif
-#endif
-#ifdef TM_FIXEDVOLUME
-  printf(" Error: TM_FIXEDVOLUME not allowed");
-  exit(0);
-#endif
-
-  init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 0);
-  init_geometry_indices(VOLUMEPLUSRAND + g_dbw2rand);
-
-  if (g_proc_id == 0) {
-    fprintf(stdout, "The number of processes is %d \n", g_nproc);
-    printf("# The lattice size is %d x %d x %d x %d\n", (int)(T * g_nproc_t), (int)(LX * g_nproc_x),
-           (int)(LY * g_nproc_y), (int)(g_nproc_z * LZ));
-    printf("# The local lattice size is %d x %d x %d x %d\n", (int)(T), (int)(LX), (int)(LY),
-           (int)LZ);
-    printf("# Computing LapH eigensystem \n");
-
-    fflush(stdout);
-  }
-
-  /* define the geometry */
-  geometry();
-
-  start_ranlux(1, 123456);
-
-  /* Read Gauge field */
-  sprintf(conf_filename, "%s.%.4d", gauge_input_filename, nstore);
-  if (g_cart_id == 0) {
-    printf("#\n# Trying to read gauge field from file %s in %s precision.\n", conf_filename,
-           (gauge_precision_read_flag == 32 ? "single" : "double"));
-    fflush(stdout);
-  }
-  if ((j = read_gauge_field(conf_filename, g_gauge_field)) != 0) {
-    fprintf(stderr, "Error %d while reading gauge field from %s\n Aborting...\n", j, conf_filename);
-    exit(-2);
-  }
-
-  if (g_cart_id == 0) {
-    printf("# Finished reading gauge field.\n");
-    fflush(stdout);
-  }
-
-#ifdef TM_USE_MPI
-  /*For parallelization: exchange the gaugefield */
-  xchange_gauge(g_gauge_field);
-#endif
-
-  /* Init Jacobi field */
-  init_jacobi_field(SPACEVOLUME + SPACERAND, 3);
-
-#ifdef TM_USE_MPI
-  {
-    /* for debugging in parallel set i_gdb = 0 */
-    volatile int i_gdb = 8;
-    char hostname[256];
-    gethostname(hostname, sizeof(hostname));
-    printf("PID %d on %s ready for attach\n", getpid(), hostname);
-    fflush(stdout);
-    if (g_cart_id == 0) {
-      while (0 == i_gdb) {
-        sleep(5);
-      }
-    }
-  }
-
-  MPI_Barrier(MPI_COMM_WORLD);
-#endif
-
-  for (k = 0; k < 3; k++) random_jacobi_field(g_jacobi_field[k], SPACEVOLUME);
-
-  /* Compute LapH Eigensystem */
-
-  for (tslice = 0; tslice < T; tslice++) {
-    eigenvalues_Jacobi(&no_eigenvalues, 5000, eigenvalue_precision, 0, tslice, nstore);
-  }
-
-#ifdef TM_USE_MPI
-  MPI_Finalize();
-#endif
-  return (0);
-}
diff --git a/src/bin/benchmark.c b/src/bin/benchmark.c
index 72d8c8f4d..b2f4ee68c 100644
--- a/src/bin/benchmark.c
+++ b/src/bin/benchmark.c
@@ -123,8 +123,8 @@ int main(int argc, char *argv[]) {
   tmlqcd_mpi_init(argc, argv);
 
   if (g_proc_id == 0) {
-#ifdef TM_GAUGE_COPY
-    printf("# The code was compiled with -DTM_GAUGE_COPY\n");
+#ifdef TM_USE_GAUGE_COPY
+    printf("# The code was compiled with -DTM_USE_GAUGE_COPY\n");
 #endif
 #ifdef TM_USE_HALFSPINOR
     printf("# The code was compiled with -DTM_USE_HALFSPINOR\n");
@@ -144,7 +144,7 @@ int main(int argc, char *argv[]) {
     fflush(stdout);
   }
 
-#ifdef TM_GAUGE_COPY
+#ifdef TM_USE_GAUGE_COPY
   init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 1);
 #else
   init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 0);
diff --git a/src/bin/deriv_mg_tune.c b/src/bin/deriv_mg_tune.c
index 75595bc60..f65b22c48 100644
--- a/src/bin/deriv_mg_tune.c
+++ b/src/bin/deriv_mg_tune.c
@@ -136,7 +136,7 @@ int main(int argc, char *argv[]) {
 
   g_mu = g_mu1;
 
-#ifdef TM_GAUGE_COPY
+#ifdef TM_USE_GAUGE_COPY
   status = init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 1);
   status += init_gauge_field_32(VOLUMEPLUSRAND + g_dbw2rand, 1);
 #else
diff --git a/src/bin/hmc_tm.c b/src/bin/hmc_tm.c
index 0d95a3b3c..399362d0b 100644
--- a/src/bin/hmc_tm.c
+++ b/src/bin/hmc_tm.c
@@ -168,7 +168,7 @@ int main(int argc, char *argv[]) {
 
   g_mu = g_mu1;
 
-#ifdef TM_GAUGE_COPY
+#ifdef TM_USE_GAUGE_COPY
   status = init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 1);
   status += init_gauge_field_32(VOLUMEPLUSRAND + g_dbw2rand, 1);
 #else
diff --git a/src/bin/invert.c b/src/bin/invert.c
index c3111decb..bb6f15c10 100644
--- a/src/bin/invert.c
+++ b/src/bin/invert.c
@@ -165,7 +165,7 @@ int main(int argc, char *argv[]) {
   g_dbw2rand = 0;
 #endif
 
-#ifdef TM_GAUGE_COPY
+#ifdef TM_USE_GAUGE_COPY
   j = init_gauge_field(VOLUMEPLUSRAND, 1);
   j += init_gauge_field_32(VOLUMEPLUSRAND, 1);
 #else
diff --git a/src/bin/offline_measurement.c b/src/bin/offline_measurement.c
index 72a828fb7..c1422858f 100644
--- a/src/bin/offline_measurement.c
+++ b/src/bin/offline_measurement.c
@@ -127,7 +127,7 @@ int main(int argc, char *argv[]) {
   g_dbw2rand = 0;
 #endif
 
-#ifdef TM_GAUGE_COPY
+#ifdef TM_USE_GAUGE_COPY
   j = init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 1);
 #else
   j = init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 0);
diff --git a/src/bin/tests/check_locallity.c b/src/bin/tests/check_locallity.c
index f03806f21..01d12826b 100644
--- a/src/bin/tests/check_locallity.c
+++ b/src/bin/tests/check_locallity.c
@@ -18,13 +18,13 @@
  ***********************************************************************/
 
 #include <lime.h>
-#include <tmlqcd_config.h>
 #include <math.h>
 #include <signal.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <time.h>
+#include <tmlqcd_config.h>
 #ifdef TM_USE_MPI
 #include <mpi.h>
 #endif
@@ -77,7 +77,7 @@ int main(int argc, char *argv[]) {
   double *norm;
   struct stout_parameters params_smear;
 
-#ifdef TM_GAUGE_COPY
+#ifdef TM_USE_GAUGE_COPY
   int kb = 0;
 #endif
 #ifdef TM_USE_MPI
@@ -144,7 +144,7 @@ int main(int argc, char *argv[]) {
   g_dbw2rand = 0;
 #endif
 
-#ifdef TM_GAUGE_COPY
+#ifdef TM_USE_GAUGE_COPY
   j = init_gauge_field(VOLUMEPLUSRAND, 1);
 #else
   j = init_gauge_field(VOLUMEPLUSRAND, 0);
diff --git a/src/bin/tests/hopping_test.c b/src/bin/tests/hopping_test.c
index da60c83ba..0e5ff03e7 100644
--- a/src/bin/tests/hopping_test.c
+++ b/src/bin/tests/hopping_test.c
@@ -102,8 +102,8 @@ int main(int argc, char *argv[]) {
   tmlqcd_mpi_init(argc, argv);
 
   if (g_proc_id == 0) {
-#ifdef TM_GAUGE_COPY
-    printf("# The code was compiled with -DTM_GAUGE_COPY\n");
+#ifdef TM_USE_GAUGE_COPY
+    printf("# The code was compiled with -DTM_USE_GAUGE_COPY\n");
 #endif
 #ifdef TM_USE_HALFSPINOR
     printf("# The code was compiled with -DTM_USE_HALFSPINOR\n");
@@ -123,7 +123,7 @@ int main(int argc, char *argv[]) {
     fflush(stdout);
   }
 
-#ifdef TM_GAUGE_COPY
+#ifdef TM_USE_GAUGE_COPY
   init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 1);
 #else
   init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 0);
diff --git a/src/bin/tests/qphix_test_Dslash.c b/src/bin/tests/qphix_test_Dslash.c
index b4218d3e6..41e2602a4 100644
--- a/src/bin/tests/qphix_test_Dslash.c
+++ b/src/bin/tests/qphix_test_Dslash.c
@@ -105,7 +105,7 @@ int main(int argc, char* argv[]) {
   tmlqcd_mpi_init(argc, argv);
   g_dbw2rand = 0;
 
-#ifdef TM_GAUGE_COPY
+#ifdef TM_USE_GAUGE_COPY
   init_gauge_field(VOLUMEPLUSRAND, 1);
 #else
   init_gauge_field(VOLUMEPLUSRAND, 0);
@@ -180,7 +180,7 @@ int main(int argc, char* argv[]) {
 #endif
 
   g_update_gauge_copy = 1;
-#ifdef TM_GAUGE_COPY
+#ifdef TM_USE_GAUGE_COPY
   update_backward_gauge(g_gauge_field);
 #endif
 
diff --git a/src/bin/tests/test_eigenvalues.c b/src/bin/tests/test_eigenvalues.c
index 759d8dd2f..c52d29cf8 100644
--- a/src/bin/tests/test_eigenvalues.c
+++ b/src/bin/tests/test_eigenvalues.c
@@ -227,7 +227,7 @@ int main(int argc, char *argv[]) {
   g_eps_sq_acc = g_eps_sq_acc1;
   g_eps_sq_force = g_eps_sq_force1;
 
-#ifdef TM_GAUGE_COPY
+#ifdef TM_USE_GAUGE_COPY
   j = init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 1);
 #else
   j = init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 0);
@@ -277,8 +277,8 @@ int main(int argc, char *argv[]) {
 #ifdef TM_NEW_GEOMETRY
     printf("# The code was compiled with -DTM_NEW_GEOMETRY\n");
 #endif
-#ifdef TM_GAUGE_COPY
-    printf("# The code was compiled with -DTM_GAUGE_COPY\n");
+#ifdef TM_USE_GAUGE_COPY
+    printf("# The code was compiled with -DTM_USE_GAUGE_COPY\n");
 #endif
     printf("# The lattice size is %d x %d x %d x %d\n", (int)(T * g_nproc_t), (int)(LX * g_nproc_x),
            (int)(LY), (int)(LZ));
@@ -430,7 +430,7 @@ int main(int argc, char *argv[]) {
 #ifdef TM_USE_MPI
   xchange_gauge(g_gauge_field);
 #endif
-#ifdef TM_GAUGE_COPY
+#ifdef TM_USE_GAUGE_COPY
   update_backward_gauge();
 #endif
 
diff --git a/src/bin/tests/test_lemon.c b/src/bin/tests/test_lemon.c
index 3cef7689c..9ef46be7b 100644
--- a/src/bin/tests/test_lemon.c
+++ b/src/bin/tests/test_lemon.c
@@ -66,7 +66,7 @@ int main(int argc, char *argv[]) {
 
   tmlqcd_mpi_init(argc, argv);
 
-#ifdef TM_GAUGE_COPY
+#ifdef TM_USE_GAUGE_COPY
   init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 1);
 #else
   init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 0);
diff --git a/src/lib/CMakeLists.txt b/src/lib/CMakeLists.txt
index ea2f7e41d..ebed35308 100644
--- a/src/lib/CMakeLists.txt
+++ b/src/lib/CMakeLists.txt
@@ -370,7 +370,7 @@ list(
 list(APPEND TEST_SRC_C test/check_xchange.c test/check_geometry.c
      test/overlaptests.c)
 if(TM_USE_QPHIX)
-  list(APPEND MAIN_SRC_C QphiX/qphix_interface.cpp)
+  list(APPEND MAIN_SRC_C qphix/qphix_interface.cpp)
 endif()
 
 if(TM_USE_QUDA)
@@ -404,11 +404,11 @@ include_directories(
 
 # cmake 4.0 uses a different syntax for the option
 if(CMAKE_MAJOR_VERSION LESS 4)
-  flex_target(tmlqcd_input_read read_input.l read_input.c
-              COMPILE_FLAGS "-Ca -Ptmlqcd")
+  flex_target(tmlqcd_input_read read_input.l ${CMAKE_BINARY_DIR}/read_input.c
+              COMPILE_FLAGS "-Ca -Ptmlqcd -i")
 else()
-  flex_target(tmlqcd_input_read read_input.l read_input.c OPTIONS
-              "-Ca -Ptmlqcd")
+  flex_target(tmlqcd_input_read read_input.l ${CMAKE_BINARY_DIR}/read_input.c OPTIONS
+              -Ca -Ptmlqcd -i) # OPTIONS takes a list: one argument per flex flag, not one string
 endif()
 
 # create a target library with namespacing because cmake does not know name
@@ -425,8 +425,7 @@ set_target_properties(hmc PROPERTIES VERSION ${PROJECT_VERSION} SOVERSION 1)
 # define a library and add the dependencies
 target_link_libraries(
   hmc
-  PUBLIC $<$<BOOL:${TM_CLOCK_GETTIME_IN_RT}>:rt>
-         $<$<BOOL:${TM_DDalphaAMG}>:tmlqcd::DDalphaAMG>
+  PUBLIC $<$<BOOL:${TM_USE_DDalphaAMG}>:tmlqcd::DDalphaAMG>
          $<$<BOOL:${TM_USE_QPHIX}>:tmlqcd::qphix>
          $<$<BOOL:${TM_USE_FFTW}>:tmlqcd::fftw3>
          $<$<BOOL:${TM_USE_QUDA}>:QUDA::quda>
@@ -439,7 +438,7 @@ target_link_libraries(
          roc::hipblas
          hip::host>
          tmlqcd::clime
-         $<$<BOOL:${TM_USE_LEMON}>:clemon::lemon>
+         $<$<BOOL:${TM_USE_LEMON}>:lemon::lemon>
          ${LAPACK_LIBRARIES}
          ${BLAS_LIBRARIES}
          $<$<BOOL:${TM_USE_MPI}>:MPI::MPI_C
diff --git a/src/lib/DDalphaAMG_interface.c b/src/lib/DDalphaAMG_interface.c
index 80bff4fcc..bf2da4bef 100644
--- a/src/lib/DDalphaAMG_interface.c
+++ b/src/lib/DDalphaAMG_interface.c
@@ -207,7 +207,8 @@ static inline int MG_check(spinor *const phi_new, spinor *const phi_old, const i
           "ERROR: something bad happened... MG converged giving the wrong solution!! Trying to "
           "restart... \n");
       printf(
-          "ERROR contd: || s - f_{tmLQC} * f_{TM_USE_DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e > %e "
+          "ERROR contd: || s - f_{tmLQC} * f_{TM_USE_DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = "
+          "%e > %e "
           "\n",
           differ[0], differ[1], differ[0] / differ[1], precision);
     }
@@ -215,8 +216,9 @@ static inline int MG_check(spinor *const phi_new, spinor *const phi_old, const i
   }
 
   if (g_debug_level > 0 && g_proc_id == 0)
-    printf("MGTEST:  || s - f_{tmLQC} * f_{TM_USE_DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e \n",
-           differ[0], differ[1], differ[0] / differ[1]);
+    printf(
+        "MGTEST:  || s - f_{tmLQC} * f_{TM_USE_DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e \n",
+        differ[0], differ[1], differ[0] / differ[1]);
 
   return 1;
 }
@@ -257,7 +259,8 @@ static inline int MG_check_nd(spinor *const up_new, spinor *const dn_new, spinor
           "ERROR: something bad happened... MG converged giving the wrong solution!! Trying to "
           "restart... \n");
       printf(
-          "ERROR contd: || s - f_{tmLQC} * f_{TM_USE_DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e > %e "
+          "ERROR contd: || s - f_{tmLQC} * f_{TM_USE_DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = "
+          "%e > %e "
           "\n",
           differ[0], differ[1], differ[0] / differ[1], precision);
     }
@@ -265,8 +268,9 @@ static inline int MG_check_nd(spinor *const up_new, spinor *const dn_new, spinor
   }
 
   if (g_debug_level > 0 && g_proc_id == 0)
-    printf("MGTEST:  || s - f_{tmLQC} * f_{TM_USE_DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e \n",
-           differ[0], differ[1], differ[0] / differ[1]);
+    printf(
+        "MGTEST:  || s - f_{tmLQC} * f_{TM_USE_DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e \n",
+        differ[0], differ[1], differ[0] / differ[1]);
 
   return 1;
 }
@@ -304,7 +308,8 @@ static inline int MG_mms_check_nd(spinor **const up_new, spinor **const dn_new,
             "ERROR: something bad happened... MG converged giving the wrong solution!! Trying to "
             "restart... \n");
         printf(
-            "ERROR contd: || s - f_{tmLQC} * f_{TM_USE_DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e > "
+            "ERROR contd: || s - f_{tmLQC} * f_{TM_USE_DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = "
+            "%e > "
             "%e \n",
             differ[0], differ[1], differ[0] / differ[1], precision[i]);
       }
@@ -313,8 +318,9 @@ static inline int MG_mms_check_nd(spinor **const up_new, spinor **const dn_new,
     }
 
     if (g_debug_level > 0 && g_proc_id == 0)
-      printf("MGTEST:  || s - f_{tmLQC} * f_{TM_USE_DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e \n",
-             differ[0], differ[1], differ[0] / differ[1]);
+      printf(
+          "MGTEST:  || s - f_{tmLQC} * f_{TM_USE_DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e \n",
+          differ[0], differ[1], differ[0] / differ[1]);
   }
 
   finalize_solver(check_vect, 2);
@@ -367,8 +373,8 @@ static int MG_pre_solve(su3 **gf) {
     mg_do_setup = 0;
     mg_tau = gauge_tau;
     if (mg_status.success && g_proc_id == 0)
-      printf("TM_USE_DDalphaAMG setup ran, time %.2f sec (%.2f %% on coarse grid)\n", mg_status.time,
-             100. * (mg_status.coarse_time / mg_status.time));
+      printf("TM_USE_DDalphaAMG setup ran, time %.2f sec (%.2f %% on coarse grid)\n",
+             mg_status.time, 100. * (mg_status.coarse_time / mg_status.time));
     else if (g_proc_id == 0)
       printf("ERROR: setup procedure did not run correctly");
   }
@@ -384,8 +390,8 @@ static int MG_pre_solve(su3 **gf) {
     mg_update_setup = 0;
     mg_tau = gauge_tau;
     if (mg_status.success && g_proc_id == 0)
-      printf("TM_USE_DDalphaAMG setup ran, time %.2f sec (%.2f %% on coarse grid)\n", mg_status.time,
-             100. * (mg_status.coarse_time / mg_status.time));
+      printf("TM_USE_DDalphaAMG setup ran, time %.2f sec (%.2f %% on coarse grid)\n",
+             mg_status.time, 100. * (mg_status.coarse_time / mg_status.time));
     else if (g_proc_id == 0)
       printf("ERROR: setup updating did not run correctly");
   }
diff --git a/src/lib/buffers/utils_generic_exchange.c b/src/lib/buffers/utils_generic_exchange.c
index 474c738ad..d1a68a351 100644
--- a/src/lib/buffers/utils_generic_exchange.c
+++ b/src/lib/buffers/utils_generic_exchange.c
@@ -127,7 +127,7 @@ void generic_exchange(void *field_in, int bytes_per_site) {
   /* Following are implementations using different compile time flags */
 #if defined TM_NON_BLOCKING
 #include "utils_generic_exchange.nonblocking.inc"
-#else  /* TM_NON_BLOCKING */
+#else /* TM_NON_BLOCKING */
 #include "utils_generic_exchange.blocking.inc"
 #endif /* TM_NON_BLOCKING */
 }
diff --git a/src/lib/deriv_Sb.c b/src/lib/deriv_Sb.c
index 7b55eb170..1427c4af0 100644
--- a/src/lib/deriv_Sb.c
+++ b/src/lib/deriv_Sb.c
@@ -56,7 +56,7 @@
 void deriv_Sb(const int ieo, spinor* const l, spinor* const k, hamiltonian_field_t* const hf,
               const double factor) {
   tm_stopwatch_push(&g_timers, __func__, "");
-#ifdef TM_GAUGE_COPY
+#ifdef TM_USE_GAUGE_COPY
   if (g_update_gauge_copy) {
     update_backward_gauge(hf->gaugefield);
   }
@@ -114,7 +114,7 @@ void deriv_Sb(const int ieo, spinor* const l, spinor* const k, hamiltonian_field
       icy = g_lexic2eosub[iy];
 
       sp = k + icy;
-#if (defined TM_GAUGE_COPY && !defined TM_USE_HALFSPINOR)
+#if (defined TM_USE_GAUGE_COPY && !defined TM_USE_HALFSPINOR)
       up = &g_gauge_field_copy[icx][0];
 #else
     up = &hf->gaugefield[ix][0];
@@ -136,7 +136,7 @@ void deriv_Sb(const int ieo, spinor* const l, spinor* const k, hamiltonian_field
       icy = g_lexic2eosub[iy];
 
       sm = k + icy;
-#if (defined TM_GAUGE_COPY && !defined TM_USE_HALFSPINOR)
+#if (defined TM_USE_GAUGE_COPY && !defined TM_USE_HALFSPINOR)
       um = up + 1;
 #else
     um = &hf->gaugefield[iy][0];
@@ -159,7 +159,7 @@ void deriv_Sb(const int ieo, spinor* const l, spinor* const k, hamiltonian_field
       icy = g_lexic2eosub[iy];
 
       sp = k + icy;
-#if (defined TM_GAUGE_COPY && !defined TM_USE_HALFSPINOR)
+#if (defined TM_USE_GAUGE_COPY && !defined TM_USE_HALFSPINOR)
       up = um + 1;
 #else
     up = &hf->gaugefield[ix][1];
@@ -181,7 +181,7 @@ void deriv_Sb(const int ieo, spinor* const l, spinor* const k, hamiltonian_field
       icy = g_lexic2eosub[iy];
 
       sm = k + icy;
-#if (defined TM_GAUGE_COPY && !defined TM_USE_HALFSPINOR)
+#if (defined TM_USE_GAUGE_COPY && !defined TM_USE_HALFSPINOR)
       um = up + 1;
 #else
     um = &hf->gaugefield[iy][1];
@@ -203,7 +203,7 @@ void deriv_Sb(const int ieo, spinor* const l, spinor* const k, hamiltonian_field
       icy = g_lexic2eosub[iy];
 
       sp = k + icy;
-#if (defined TM_GAUGE_COPY && !defined TM_USE_HALFSPINOR)
+#if (defined TM_USE_GAUGE_COPY && !defined TM_USE_HALFSPINOR)
       up = um + 1;
 #else
     up = &hf->gaugefield[ix][2];
@@ -225,7 +225,7 @@ void deriv_Sb(const int ieo, spinor* const l, spinor* const k, hamiltonian_field
       icy = g_lexic2eosub[iy];
 
       sm = k + icy;
-#if (defined TM_GAUGE_COPY && !defined TM_USE_HALFSPINOR)
+#if (defined TM_USE_GAUGE_COPY && !defined TM_USE_HALFSPINOR)
       um = up + 1;
 #else
     um = &hf->gaugefield[iy][2];
@@ -247,7 +247,7 @@ void deriv_Sb(const int ieo, spinor* const l, spinor* const k, hamiltonian_field
       icy = g_lexic2eosub[iy];
 
       sp = k + icy;
-#if (defined TM_GAUGE_COPY && !defined TM_USE_HALFSPINOR)
+#if (defined TM_USE_GAUGE_COPY && !defined TM_USE_HALFSPINOR)
       up = um + 1;
 #else
     up = &hf->gaugefield[ix][3];
@@ -269,7 +269,7 @@ void deriv_Sb(const int ieo, spinor* const l, spinor* const k, hamiltonian_field
       icy = g_lexic2eosub[iy];
 
       sm = k + icy;
-#if (defined TM_GAUGE_COPY && !defined TM_USE_HALFSPINOR)
+#if (defined TM_USE_GAUGE_COPY && !defined TM_USE_HALFSPINOR)
       um = up + 1;
 #else
     um = &hf->gaugefield[iy][3];
diff --git a/src/lib/geometry_eo.c b/src/lib/geometry_eo.c
index ceb348e1a..f89189357 100644
--- a/src/lib/geometry_eo.c
+++ b/src/lib/geometry_eo.c
@@ -274,7 +274,8 @@ int Index(const int x0, const int x1, const int x2, const int x3) {
   y3 = (x3 + LZ) % LZ;
   ix = ((y0 * LX + y1) * LY + y2) * LZ + y3;
 
-#if ((defined TM_PARALLELT) || (defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || (defined TM_PARALLELXYZT))
+#if ((defined TM_PARALLELT) || (defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || \
+     (defined TM_PARALLELXYZT))
   if (x0 == T) {
     ix = VOLUME + y3 + LZ * y2 + LZ * LY * y1;
   }
@@ -433,7 +434,8 @@ int Index(const int x0, const int x1, const int x2, const int x3) {
 
   /* The DBW2 stuff --> second boundary slice */
   /* This we put a the very end.              */
-#if ((defined TM_PARALLELT) || (defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || (defined TM_PARALLELXYZT))
+#if ((defined TM_PARALLELT) || (defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || \
+     (defined TM_PARALLELXYZT))
   if (x0 == T + 1) {
     ix = VOLUMEPLUSRAND + y3 + LZ * y2 + LZ * LY * y1;
 #if ((defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || (defined TM_PARALLELXYZT))
@@ -685,14 +687,16 @@ void geometry() {
 
   xeven = malloc(VOLUMEPLUSRAND * sizeof(int));
 
-#if (defined TM_PARALLELT || defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
+#if (defined TM_PARALLELT || defined TM_PARALLELXT || defined TM_PARALLELXYT || \
+     defined TM_PARALLELXYZT)
   startvaluet = 1;
 #endif
-#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT || defined TM_PARALLELX || \
-     defined TM_PARALLELXY || defined TM_PARALLELXYZ)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT || \
+     defined TM_PARALLELX || defined TM_PARALLELXY || defined TM_PARALLELXYZ)
   startvaluex = 1;
 #endif
-#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT || defined TM_PARALLELXY || defined TM_PARALLELXYZ)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT || defined TM_PARALLELXY || \
+     defined TM_PARALLELXYZ)
   startvaluey = 1;
 #endif
 #if (defined TM_PARALLELXYZT || defined TM_PARALLELXYZ)
@@ -851,7 +855,6 @@ void geometry() {
     }
   }
 
-
 #endif /* TM_PARALLELXYZ || TM_PARALLELXYZT*/
 
   /* The rectangular gauge action part */
@@ -861,7 +864,8 @@ void geometry() {
       printf("# Initialising rectangular gauge action stuff\n");
       fflush(stdout);
     }
-#if (defined TM_PARALLELT || defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
+#if (defined TM_PARALLELT || defined TM_PARALLELXT || defined TM_PARALLELXYT || \
+     defined TM_PARALLELXYZT)
     for (x1 = -startvaluex; x1 < (LX + startvaluex); x1++) {
       for (x2 = -startvaluey; x2 < (LY + startvaluey); x2++) {
         for (x3 = -startvaluez; x3 < (LZ + startvaluez); x3++) {
@@ -910,8 +914,8 @@ void geometry() {
       }
     }
 #endif
-#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT || defined TM_PARALLELX || \
-     defined TM_PARALLELXY || defined TM_PARALLELXYZ)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT || \
+     defined TM_PARALLELX || defined TM_PARALLELXY || defined TM_PARALLELXYZ)
     for (x0 = -startvaluet; x0 < (T + startvaluet); x0++) {
       for (x2 = -startvaluey; x2 < (LY + startvaluey); x2++) {
         for (x3 = -startvaluez; x3 < (LZ + startvaluez); x3++) {
@@ -959,7 +963,8 @@ void geometry() {
       }
     }
 #endif
-#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT || defined TM_PARALLELXY || defined TM_PARALLELXYZ)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT || defined TM_PARALLELXY || \
+     defined TM_PARALLELXYZ)
     for (x0 = -startvaluet; x0 < (T + startvaluet); x0++) {
       for (x1 = -startvaluex; x1 < (LX + startvaluex); x1++) {
         for (x3 = -startvaluez; x3 < (LZ + startvaluez); x3++) {
diff --git a/src/lib/global.h b/src/lib/global.h
index b0d3b1ac2..31d6dc0d4 100644
--- a/src/lib/global.h
+++ b/src/lib/global.h
@@ -121,7 +121,6 @@ EXTERN int *g_field_z_disp_even_up;
 EXTERN int *g_field_z_disp_odd_dn;
 EXTERN int *g_field_z_disp_odd_up;
 
-
 /* IF PHMC  */
 EXTERN spinor **g_chi_up_spinor_field;
 EXTERN spinor **g_chi_dn_spinor_field;
diff --git a/src/lib/init/init.h b/src/lib/init/init.h
index 0fe9ae51b..127622a8b 100644
--- a/src/lib/init/init.h
+++ b/src/lib/init/init.h
@@ -33,8 +33,8 @@
 #include "init/init_gauge_tmp.h"
 #include "init/init_geometry_indices.h"
 #include "init/init_global_states.h"
-#include "init/init_parallel.h"
 #include "init/init_moment_field.h"
+#include "init/init_parallel.h"
 #include "init/init_spinor_field.h"
 #include "init/init_stout_smear_vars.h"
 #ifdef TM_USE_OMP
diff --git a/src/lib/init/init_dirac_halfspinor.c b/src/lib/init/init_dirac_halfspinor.c
index 891a703e2..6b4fba174 100644
--- a/src/lib/init/init_dirac_halfspinor.c
+++ b/src/lib/init/init_dirac_halfspinor.c
@@ -69,15 +69,13 @@ int init_dirac_halfspinor() {
     errno = 0;
     return (1);
   }
-  sendBuffer =
-      (halfspinor *)(((unsigned long int)(sendBuffer_) + ALIGN_BASE + 1) & ~ALIGN_BASE);
+  sendBuffer = (halfspinor *)(((unsigned long int)(sendBuffer_) + ALIGN_BASE + 1) & ~ALIGN_BASE);
   if ((void *)(recvBuffer_ = (halfspinor *)calloc(RAND / 2 + 8, sizeof(halfspinor))) == NULL) {
     printf("malloc errno : %d\n", errno);
     errno = 0;
     return (1);
   }
-  recvBuffer =
-      (halfspinor *)(((unsigned long int)(recvBuffer_) + ALIGN_BASE + 1) & ~ALIGN_BASE);
+  recvBuffer = (halfspinor *)(((unsigned long int)(recvBuffer_) + ALIGN_BASE + 1) & ~ALIGN_BASE);
 #endif
 
   for (int ieo = 0; ieo < 2; ieo++) {
@@ -94,7 +92,8 @@ int init_dirac_halfspinor() {
         NBPointer[ieo][8 * i + 2 * mu + 1] =
             &HalfSpinor[8 * g_lexic2eosub[g_iup[j][mu]] + 2 * mu + 1];
       }
-#if ((defined TM_PARALLELT) || (defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || (defined TM_PARALLELXYZT))
+#if ((defined TM_PARALLELT) || (defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || \
+     (defined TM_PARALLELXYZT))
       if (t == 0) {
         k = (g_lexic2eosub[g_idn[j][0]] - VOLUME / 2);
         NBPointer[ieo][8 * i] = &sendBuffer[k];
@@ -154,7 +153,8 @@ int init_dirac_halfspinor() {
       for (int mu = 0; mu < 8; mu++) {
         NBPointer[ieo][8 * i + mu] = &HalfSpinor[8 * i + mu];
       }
-#if ((defined TM_PARALLELT) || (defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || (defined TM_PARALLELXYZT))
+#if ((defined TM_PARALLELT) || (defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || \
+     (defined TM_PARALLELXYZT))
       if (t == T - 1) {
         NBPointer[ieo][8 * i] = &recvBuffer[(g_lexic2eosub[g_iup[j][0]] - VOLUME / 2)];
       }
@@ -240,7 +240,8 @@ int init_dirac_halfspinor32() {
         NBPointer32[ieo][8 * i + 2 * mu + 1] =
             &HalfSpinor32[8 * g_lexic2eosub[g_iup[j][mu]] + 2 * mu + 1];
       }
-#if ((defined TM_PARALLELT) || (defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || (defined TM_PARALLELXYZT))
+#if ((defined TM_PARALLELT) || (defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || \
+     (defined TM_PARALLELXYZT))
       if (t == 0) {
         k = (g_lexic2eosub[g_idn[j][0]] - VOLUME / 2);
         NBPointer32[ieo][8 * i] = &sendBuffer32[k];
@@ -300,7 +301,8 @@ int init_dirac_halfspinor32() {
       for (mu = 0; mu < 8; mu++) {
         NBPointer32[ieo][8 * i + mu] = &HalfSpinor32[8 * i + mu];
       }
-#if ((defined TM_PARALLELT) || (defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || (defined TM_PARALLELXYZT))
+#if ((defined TM_PARALLELT) || (defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || \
+     (defined TM_PARALLELXYZT))
       if (t == T - 1) {
         NBPointer32[ieo][8 * i] = &recvBuffer32[(g_lexic2eosub[g_iup[j][0]] - VOLUME / 2)];
       }
diff --git a/src/lib/init/init_geometry_indices.c b/src/lib/init/init_geometry_indices.c
index 6b75fc83a..edd568d93 100644
--- a/src/lib/init/init_geometry_indices.c
+++ b/src/lib/init/init_geometry_indices.c
@@ -74,7 +74,6 @@ int init_geometry_indices(const int V) {
   if ((void *)g_field_z_disp_odd_up == NULL) return (17);
 #endif
 
-
   g_coord = (int **)calloc(VOLUME, sizeof(int *));
   if ((void *)g_coord == NULL) return (19);
   for (i = 0; i < VOLUME; i++) {
diff --git a/src/lib/io/utils_write_first_message.c b/src/lib/io/utils_write_first_message.c
index 287d67c37..4233789cc 100644
--- a/src/lib/io/utils_write_first_message.c
+++ b/src/lib/io/utils_write_first_message.c
@@ -30,9 +30,9 @@ int write_first_messages(FILE* parameterfile, char const* const executable,
            TMLQCD_PACKAGE_VERSION, git_hash);
   printf("%s", message);
   fprintf(parameterfile, "%s", message);
-#ifdef TM_GAUGE_COPY
-  printf("# The code is compiled with -DTM_GAUGE_COPY\n");
-  fprintf(parameterfile, "# The code is compiled with -DTM_GAUGE_COPY\n");
+#ifdef TM_USE_GAUGE_COPY
+  printf("# The code is compiled with -DTM_USE_GAUGE_COPY\n");
+  fprintf(parameterfile, "# The code is compiled with -DTM_USE_GAUGE_COPY\n");
 #endif
 #ifdef TM_USE_HALFSPINOR
   printf("# The code is compiled with -DTM_USE_HALFSPINOR\n");
diff --git a/src/lib/linalg/assign.c b/src/lib/linalg/assign.c
index fd04de1e4..19fcda44b 100644
--- a/src/lib/linalg/assign.c
+++ b/src/lib/linalg/assign.c
@@ -47,4 +47,3 @@ void assign_32(spinor32 *const R, spinor32 *const S, const int N) {
   memcpy(R, S, N * sizeof(spinor32));
   return;
 }
-
diff --git a/src/lib/linalg/assign_add_mul_r_32.c b/src/lib/linalg/assign_add_mul_r_32.c
index 9f6b1a72f..5ab9366ac 100644
--- a/src/lib/linalg/assign_add_mul_r_32.c
+++ b/src/lib/linalg/assign_add_mul_r_32.c
@@ -35,7 +35,7 @@
 #include "su3.h"
 
 void assign_add_mul_r_32_orphaned(spinor32 *const R, spinor32 *const S, const float c,
-                                         const int N) {
+                                  const int N) {
 #ifdef TM_USE_OMP
 #pragma omp parallel for
 #endif
diff --git a/src/lib/linalg/scalar_prod_r.c b/src/lib/linalg/scalar_prod_r.c
index f4fd9293b..c5288aa34 100644
--- a/src/lib/linalg/scalar_prod_r.c
+++ b/src/lib/linalg/scalar_prod_r.c
@@ -97,4 +97,3 @@ double scalar_prod_r(const spinor *const S, const spinor *const R, const int N,
 #endif
   return res;
 }
-
diff --git a/src/lib/matrix_utils.c b/src/lib/matrix_utils.c
index d5c4198ea..63c98657b 100644
--- a/src/lib/matrix_utils.c
+++ b/src/lib/matrix_utils.c
@@ -30,9 +30,8 @@
 #ifndef TM_USE_OMP
 static
 #endif
-    void
-    exponent_from_coefficients(su3 *out, _Complex double f0, _Complex double f1, _Complex double f2,
-                               su3 const *in) {
+    void exponent_from_coefficients(su3 *out, _Complex double f0, _Complex double f1,
+                                    _Complex double f2, su3 const *in) {
   su3 ALIGN tmp;
   _complex_times_su3(tmp, f2, *in);
   _su3_add_equals_complex_identity(tmp, f1);
diff --git a/src/lib/measure_gauge_action.c b/src/lib/measure_gauge_action.c
index 1f7cb6ad5..ecbe7a888 100644
--- a/src/lib/measure_gauge_action.c
+++ b/src/lib/measure_gauge_action.c
@@ -26,10 +26,10 @@
  *     Returns the value of the action
  ************************************************************************/
 
-#include <tmlqcd_config.h>
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <tmlqcd_config.h>
 #ifdef TM_USE_OMP
 #include <omp.h>
 #endif
diff --git a/src/lib/misc_types.h b/src/lib/misc_types.h
index fee62159f..412719dce 100644
--- a/src/lib/misc_types.h
+++ b/src/lib/misc_types.h
@@ -101,7 +101,7 @@ typedef enum tm_mpi_thread_level_t {
   TM_MPI_THREAD_SINGLE = QMP_THREAD_SINGLE,
   TM_MPI_THREAD_MULTIPLE = QMP_THREAD_MULTIPLE
 } tm_mpi_thread_level_t;
-#elif defined(TM_USE_MPI) 
+#elif defined(TM_USE_MPI)
 typedef enum tm_mpi_thread_level_t {
   TM_MPI_THREAD_SINGLE = MPI_THREAD_SERIALIZED,
   TM_MPI_THREAD_MULTIPLE = MPI_THREAD_MULTIPLE
diff --git a/src/lib/mpi_init.c b/src/lib/mpi_init.c
index cc09fd4cd..f245f0556 100644
--- a/src/lib/mpi_init.c
+++ b/src/lib/mpi_init.c
@@ -347,18 +347,20 @@ void tmlqcd_mpi_init(int argc, char *argv[]) {
   for (i = 0; i < 8; i++) {
     g_nb_list[i] = g_cart_id;
   }
-#if (defined TM_PARALLELT || defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
+#if (defined TM_PARALLELT || defined TM_PARALLELXT || defined TM_PARALLELXYT || \
+     defined TM_PARALLELXYZT)
   MPI_Cart_shift(g_cart_grid, 0, 1, &g_nb_t_dn, &g_nb_t_up);
   g_nb_list[0] = g_nb_t_up;
   g_nb_list[1] = g_nb_t_dn;
 #endif
-#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT || defined TM_PARALLELX || \
-     defined TM_PARALLELXY || defined TM_PARALLELXYZ)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT || \
+     defined TM_PARALLELX || defined TM_PARALLELXY || defined TM_PARALLELXYZ)
   MPI_Cart_shift(g_cart_grid, 1, 1, &g_nb_x_dn, &g_nb_x_up);
   g_nb_list[2] = g_nb_x_up;
   g_nb_list[3] = g_nb_x_dn;
 #endif
-#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT || defined TM_PARALLELXY || defined TM_PARALLELXYZ)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT || defined TM_PARALLELXY || \
+     defined TM_PARALLELXYZ)
   MPI_Cart_shift(g_cart_grid, 2, 1, &g_nb_y_dn, &g_nb_y_up);
   g_nb_list[4] = g_nb_y_up;
   g_nb_list[5] = g_nb_y_dn;
@@ -552,7 +554,6 @@ void tmlqcd_mpi_init(int argc, char *argv[]) {
   MPI_Type_commit(&lfield_z_slice_cont32);
   MPI_Type_commit(&lfield_z_slice_gath32);
 
-
   /* The internal z_ and zt_ slices are constructed in geometry() with MPI_Type_indexed() */
 
   /* Now the derivative fields */
diff --git a/src/lib/mpi_init.h b/src/lib/mpi_init.h
index d9476e662..c6e816946 100644
--- a/src/lib/mpi_init.h
+++ b/src/lib/mpi_init.h
@@ -107,9 +107,8 @@ extern MPI_Datatype halffield_y_slice_cont;
 extern MPI_Datatype halffield_y_slice_gath;
 extern MPI_Datatype halffield_z_slice_cont;
 
-
-#if (defined TM_PARALLELT || defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT || \
-     defined TM_PARALLELXYZ)
+#if (defined TM_PARALLELT || defined TM_PARALLELXT || defined TM_PARALLELXYT || \
+     defined TM_PARALLELXYZT || defined TM_PARALLELXYZ)
 extern MPI_Datatype field_z_slice_even_dn;
 extern MPI_Datatype field_z_slice_even_up;
 extern MPI_Datatype field_z_slice_odd_dn;
diff --git a/src/lib/operator/D_psi_body.c b/src/lib/operator/D_psi_body.c
index b5acd1158..f73822776 100644
--- a/src/lib/operator/D_psi_body.c
+++ b/src/lib/operator/D_psi_body.c
@@ -283,7 +283,7 @@ void _PSWITCH(D_psi)(_PTSWITCH(spinor) *const P, _PTSWITCH(spinor) *const Q) {
   _C_TYPE ALIGN32 phase_2l = (_C_TYPE)phase_2;
   _C_TYPE ALIGN32 phase_3l = (_C_TYPE)phase_3;
 
-#ifdef TM_GAUGE_COPY
+#ifdef TM_USE_GAUGE_COPY
   if (_PSWITCH(g_update_gauge_copy)) {
     _PSWITCH(update_backward_gauge)(_PSWITCH(g_gauge_field));
   }
diff --git a/src/lib/operator/Hopping_Matrix.c b/src/lib/operator/Hopping_Matrix.c
index 8b106e10a..759809a8e 100644
--- a/src/lib/operator/Hopping_Matrix.c
+++ b/src/lib/operator/Hopping_Matrix.c
@@ -68,7 +68,7 @@
 #include "operator/halfspinor_hopping.h"
 
 void Hopping_Matrix(const int ieo, spinor* const l, spinor* const k) {
-#ifdef TM_GAUGE_COPY
+#ifdef TM_USE_GAUGE_COPY
   if (g_update_gauge_copy) {
     update_backward_gauge(g_gauge_field);
   }
@@ -91,7 +91,7 @@ void Hopping_Matrix(const int ieo, spinor* const l, spinor* const k) {
 #else /* thats TM_USE_HALFSPINOR */
 void Hopping_Matrix(const int ieo, spinor* const l, spinor* const k) {
 
-#ifdef TM_GAUGE_COPY
+#ifdef TM_USE_GAUGE_COPY
   if (g_update_gauge_copy) {
     update_backward_gauge(g_gauge_field);
   }
diff --git a/src/lib/operator/Hopping_Matrix_32.c b/src/lib/operator/Hopping_Matrix_32.c
index 1198d52bb..0991811b7 100644
--- a/src/lib/operator/Hopping_Matrix_32.c
+++ b/src/lib/operator/Hopping_Matrix_32.c
@@ -63,8 +63,8 @@
 #endif
 #include "boundary.h"
 #include "init/init_dirac_halfspinor.h"
-#include "update_backward_gauge.h"
 #include "operator/Hopping_Matrix_32.h"
+#include "update_backward_gauge.h"
 
 #if defined TM_USE_HALFSPINOR
 #include "operator/halfspinor_hopping_32.h"
@@ -72,7 +72,7 @@
 
 void Hopping_Matrix_32_orphaned(const int ieo, spinor32* const l, spinor32* const k) {
 #if defined TM_USE_HALFSPINOR
-#ifdef TM_GAUGE_COPY
+#ifdef TM_USE_GAUGE_COPY
   if (g_update_gauge_copy_32) {
     update_backward_gauge_32_orphaned(g_gauge_field_32);
   }
diff --git a/src/lib/operator/halfspinor_body.c b/src/lib/operator/halfspinor_body.c
index a2c54c7e4..3be906764 100644
--- a/src/lib/operator/halfspinor_body.c
+++ b/src/lib/operator/halfspinor_body.c
@@ -103,7 +103,7 @@ if (g_sloppy_precision == 1 && g_sloppy_precision_flag == 1) {
 #endif
 
 #if (defined TM_USE_MPI && !defined _NO_COMM)
-  xchange_halffield32();
+    xchange_halffield32();
 #endif
 
 #ifdef TM_USE_OMP
@@ -237,7 +237,7 @@ if (g_sloppy_precision == 1 && g_sloppy_precision_flag == 1) {
 #endif
 
 #if (defined TM_USE_MPI && !defined _NO_COMM)
-  xchange_halffield();
+    xchange_halffield();
 #endif
 
 #ifdef TM_USE_OMP
diff --git a/src/lib/operator/hopping_body_dbl.c b/src/lib/operator/hopping_body_dbl.c
index 5ae88044e..e34469f4e 100644
--- a/src/lib/operator/hopping_body_dbl.c
+++ b/src/lib/operator/hopping_body_dbl.c
@@ -45,7 +45,7 @@ if (ieo == 0) {
 #ifndef TM_USE_OMP
 hi = &g_hi[16 * ioff];
 
-#if ((defined TM_GAUGE_COPY))
+#if ((defined TM_USE_GAUGE_COPY))
 up = &g_gauge_field_copy[ioff][0];
 #else
 up = &g_gauge_field[(*hi)][0];
@@ -62,7 +62,7 @@ hi++;
 for (int icx = ioff; icx < (VOLUME / 2 + ioff); icx++) {
 #ifdef TM_USE_OMP
   hi = &g_hi[16 * icx];
-#if ((defined TM_GAUGE_COPY))
+#if ((defined TM_USE_GAUGE_COPY))
   up = &g_gauge_field_copy[icx][0];
 #else
   up = &g_gauge_field[(*hi)][0];
@@ -76,7 +76,7 @@ for (int icx = ioff; icx < (VOLUME / 2 + ioff); icx++) {
   pn = p + (icx - ioff);
 #endif
   /*********************** direction +t ************************/
-#if (!defined TM_GAUGE_COPY)
+#if (!defined TM_USE_GAUGE_COPY)
   um = &g_gauge_field[(*hi)][0];
 #else
   um = up + 1;
@@ -88,7 +88,7 @@ for (int icx = ioff; icx < (VOLUME / 2 + ioff); icx++) {
   _hop_t_p();
 
   /*********************** direction -t ************************/
-#if ((defined TM_GAUGE_COPY))
+#if ((defined TM_USE_GAUGE_COPY))
   up = um + 1;
 #else
   up += 1;
@@ -99,7 +99,7 @@ for (int icx = ioff; icx < (VOLUME / 2 + ioff); icx++) {
   _hop_t_m();
 
   /*********************** direction +1 ************************/
-#ifndef TM_GAUGE_COPY
+#ifndef TM_USE_GAUGE_COPY
   um = &g_gauge_field[(*hi)][1];
 #else
   um = up + 1;
@@ -111,7 +111,7 @@ for (int icx = ioff; icx < (VOLUME / 2 + ioff); icx++) {
   _hop_x_p();
 
   /*********************** direction -1 ************************/
-#if ((defined TM_GAUGE_COPY))
+#if ((defined TM_USE_GAUGE_COPY))
   up = um + 1;
 #else
   up += 1;
@@ -122,7 +122,7 @@ for (int icx = ioff; icx < (VOLUME / 2 + ioff); icx++) {
   _hop_x_m();
 
   /*********************** direction +2 ************************/
-#ifndef TM_GAUGE_COPY
+#ifndef TM_USE_GAUGE_COPY
   um = &g_gauge_field[(*hi)][2];
 #else
   um = up + 1;
@@ -134,7 +134,7 @@ for (int icx = ioff; icx < (VOLUME / 2 + ioff); icx++) {
   _hop_y_p();
 
   /*********************** direction -2 ************************/
-#if ((defined TM_GAUGE_COPY))
+#if ((defined TM_USE_GAUGE_COPY))
   up = um + 1;
 #else
   up += 1;
@@ -145,7 +145,7 @@ for (int icx = ioff; icx < (VOLUME / 2 + ioff); icx++) {
   _hop_y_m();
 
   /*********************** direction +3 ************************/
-#ifndef TM_GAUGE_COPY
+#ifndef TM_USE_GAUGE_COPY
   um = &g_gauge_field[(*hi)][3];
 #else
   um = up + 1;
@@ -158,7 +158,7 @@ for (int icx = ioff; icx < (VOLUME / 2 + ioff); icx++) {
 
   /*********************** direction -3 ************************/
 #ifndef TM_USE_OMP
-#if ((defined TM_GAUGE_COPY))
+#if ((defined TM_USE_GAUGE_COPY))
   up = um + 1;
 #else
   up = &g_gauge_field[(*hi)][0];
diff --git a/src/lib/operator/hopping_sgl.c b/src/lib/operator/hopping_sgl.c
index 062507158..487bfc47f 100644
--- a/src/lib/operator/hopping_sgl.c
+++ b/src/lib/operator/hopping_sgl.c
@@ -37,7 +37,7 @@ void Hopping_Matrix(int ieo, spinor32* const l, spinor32* const k) {
   spinor32* restrict r, * restrict sp, * restrict sm;
   spinor32 temp;
 
-#ifdef TM_GAUGE_COPY
+#ifdef TM_USE_GAUGE_COPY
   if (g_update_gauge_copy) {
     update_backward_gauge();
   }
@@ -72,7 +72,7 @@ void Hopping_Matrix(int ieo, spinor32* const l, spinor32* const k) {
     icy = g_lexic2eosub[iy];
 
     sp = k + icy;
-#if ((defined TM_GAUGE_COPY))
+#if ((defined TM_USE_GAUGE_COPY))
     up = &g_gauge_field_copy[icx][0];
 #else
     up = &g_gauge_field[ix][0];
@@ -100,7 +100,7 @@ void Hopping_Matrix(int ieo, spinor32* const l, spinor32* const k) {
     icy = g_lexic2eosub[iy];
 
     sm = k + icy;
-#if ((defined TM_GAUGE_COPY))
+#if ((defined TM_USE_GAUGE_COPY))
     um = up + 1;
 #else
     um = &g_gauge_field[iy][0];
@@ -129,7 +129,7 @@ void Hopping_Matrix(int ieo, spinor32* const l, spinor32* const k) {
 
     sp = k + icy;
 
-#if ((defined TM_GAUGE_COPY))
+#if ((defined TM_USE_GAUGE_COPY))
     up = um + 1;
 #else
     up += 1;
@@ -157,7 +157,7 @@ void Hopping_Matrix(int ieo, spinor32* const l, spinor32* const k) {
     icy = g_lexic2eosub[iy];
 
     sm = k + icy;
-#ifndef TM_GAUGE_COPY
+#ifndef TM_USE_GAUGE_COPY
     um = &g_gauge_field[iy][1];
 #else
     um = up + 1;
@@ -185,7 +185,7 @@ void Hopping_Matrix(int ieo, spinor32* const l, spinor32* const k) {
     icy = g_lexic2eosub[iy];
 
     sp = k + icy;
-#if ((defined TM_GAUGE_COPY))
+#if ((defined TM_USE_GAUGE_COPY))
     up = um + 1;
 #else
     up += 1;
@@ -212,7 +212,7 @@ void Hopping_Matrix(int ieo, spinor32* const l, spinor32* const k) {
     icy = g_lexic2eosub[iy];
 
     sm = k + icy;
-#ifndef TM_GAUGE_COPY
+#ifndef TM_USE_GAUGE_COPY
     um = &g_gauge_field[iy][2];
 #else
     um = up + 1;
@@ -240,7 +240,7 @@ void Hopping_Matrix(int ieo, spinor32* const l, spinor32* const k) {
     icy = g_lexic2eosub[iy];
 
     sp = k + icy;
-#if ((defined TM_GAUGE_COPY))
+#if ((defined TM_USE_GAUGE_COPY))
     up = um + 1;
 #else
     up += 1;
@@ -267,7 +267,7 @@ void Hopping_Matrix(int ieo, spinor32* const l, spinor32* const k) {
     icy = g_lexic2eosub[iy];
 
     sm = k + icy;
-#ifndef TM_GAUGE_COPY
+#ifndef TM_USE_GAUGE_COPY
     um = &g_gauge_field[iy][3];
 #else
     um = up + 1;
diff --git a/src/lib/operator/tm_sub_Hopping_Matrix.c b/src/lib/operator/tm_sub_Hopping_Matrix.c
index 857404088..7edf2c954 100644
--- a/src/lib/operator/tm_sub_Hopping_Matrix.c
+++ b/src/lib/operator/tm_sub_Hopping_Matrix.c
@@ -56,7 +56,7 @@
 
 void tm_sub_Hopping_Matrix(const int ieo, spinor* const l, spinor* const p, spinor* const k,
                            complex double const cfactor) {
-#ifdef TM_GAUGE_COPY
+#ifdef TM_USE_GAUGE_COPY
   if (g_update_gauge_copy) {
     update_backward_gauge(g_gauge_field);
   }
@@ -81,7 +81,7 @@ void tm_sub_Hopping_Matrix(const int ieo, spinor* const l, spinor* const p, spin
 #elif (!defined _NO_COMM && !defined TM_USE_HALFSPINOR)
 void tm_sub_Hopping_Matrix(const int ieo, spinor* const l, spinor* p, spinor* const k,
                            complex double const cfactor) {
-#ifdef TM_GAUGE_COPY
+#ifdef TM_USE_GAUGE_COPY
   if (g_update_gauge_copy) {
     update_backward_gauge(g_gauge_field);
   }
diff --git a/src/lib/operator/tm_times_Hopping_Matrix.c b/src/lib/operator/tm_times_Hopping_Matrix.c
index 6d1abddba..9b09c090f 100644
--- a/src/lib/operator/tm_times_Hopping_Matrix.c
+++ b/src/lib/operator/tm_times_Hopping_Matrix.c
@@ -56,7 +56,7 @@
 
 void tm_times_Hopping_Matrix(const int ieo, spinor* const l, spinor* const k,
                              complex double const cfactor) {
-#ifdef TM_GAUGE_COPY
+#ifdef TM_USE_GAUGE_COPY
   if (g_update_gauge_copy) {
     update_backward_gauge(g_gauge_field);
   }
@@ -81,7 +81,7 @@ void tm_times_Hopping_Matrix(const int ieo, spinor* const l, spinor* const k,
 #elif (!defined _NO_COMM && !defined TM_USE_HALFSPINOR)
 void tm_times_Hopping_Matrix(const int ieo, spinor* const l, spinor* const k,
                              double complex const cfactor) {
-#ifdef TM_GAUGE_COPY
+#ifdef TM_USE_GAUGE_COPY
   if (g_update_gauge_copy) {
     update_backward_gauge(g_gauge_field);
   }
@@ -103,4 +103,4 @@ void tm_times_Hopping_Matrix(const int ieo, spinor* const l, spinor* const k,
 #endif
   return;
 }
-#endif  //TM_USE_HALFSPINOR && !defined _NO_COMM
+#endif  // TM_USE_HALFSPINOR && !defined _NO_COMM
diff --git a/src/lib/read_input.l b/src/lib/read_input.l
index 59f002748..5eb542f87 100644
--- a/src/lib/read_input.l
+++ b/src/lib/read_input.l
@@ -951,7 +951,7 @@ static inline double fltlist_next_token(int * const list_end){
     mg_no_shifts=0;
     if(myverbose) printf("  MG_MMS_Mass set to %.16f line %d operator %d\n", mg_mms_mass, line_of_file, current_operator);
   }
-  End_DDalphaAMG{SPC}* {
+  EndDDalphaAMG{SPC}* {
   if(myverbose) printf("DDalphaAMG parsed in line %d\n\n", line_of_file);
   BEGIN(0);
   }
diff --git a/src/lib/smearing/utils_reunitarize_MILC.c b/src/lib/smearing/utils_reunitarize_MILC.c
index b5efa2936..fec177a42 100644
--- a/src/lib/smearing/utils_reunitarize_MILC.c
+++ b/src/lib/smearing/utils_reunitarize_MILC.c
@@ -1,5 +1,5 @@
-#include "utils.ih"
 #include <complex.h>
+#include "utils.ih"
 
 /* No reunitarization code seems to be available, so I've adapted (stolen) this routine from the
  * MILC code (who stole it elsewhere, I think ;]) -- AD. */
@@ -36,7 +36,7 @@ void reunitarize(su3 *omega) {
   bj2 = omega->c02;
 
   omega->c20 = bj1 * omega->c12;
-  omega->c20 -= bj2 *omega->c11;
+  omega->c20 -= bj2 * omega->c11;
 
   omega->c21 = bj2 * omega->c10;
   omega->c21 -= bj0 * omega->c12;
diff --git a/src/lib/solver/gram-schmidt.c b/src/lib/solver/gram-schmidt.c
index ffd5d6b29..4c2ee4310 100644
--- a/src/lib/solver/gram-schmidt.c
+++ b/src/lib/solver/gram-schmidt.c
@@ -75,7 +75,6 @@ void IteratedClassicalGS(_Complex double v[], double *vnrm, int n, int m, _Compl
   }
 }
 
-
 /*
  *  ModifiedGramSchmidt
  *
diff --git a/src/lib/test/check_geometry.c b/src/lib/test/check_geometry.c
index b9f14eb4d..20f7acc96 100644
--- a/src/lib/test/check_geometry.c
+++ b/src/lib/test/check_geometry.c
@@ -90,7 +90,8 @@ int check_geometry() {
           ix = g_ipt[x0][x1][x2][x3];
 
           iy0 = g_iup[ix][0];
-#if (defined TM_PARALLELT || defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
+#if (defined TM_PARALLELT || defined TM_PARALLELXT || defined TM_PARALLELXYT || \
+     defined TM_PARALLELXYZT)
           if (x0 != T - 1) {
             iz0 = g_ipt[(x0 + 1) % T][x1][x2][x3];
           } else {
@@ -176,7 +177,8 @@ int check_geometry() {
           }
 
           iy0 = g_idn[ix][0];
-#if (defined TM_PARALLELT || defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
+#if (defined TM_PARALLELT || defined TM_PARALLELXT || defined TM_PARALLELXYT || \
+     defined TM_PARALLELXYZT)
           if (x0 != 0) {
             iz0 = g_ipt[(x0 + T - 1) % T][x1][x2][x3];
           } else {
@@ -1554,4 +1556,3 @@ int check_geometry() {
 
   return (0);
 }
-
diff --git a/src/lib/test/check_overlap.c b/src/lib/test/check_overlap.c
index 56763cff4..b032f8cdd 100644
--- a/src/lib/test/check_overlap.c
+++ b/src/lib/test/check_overlap.c
@@ -188,7 +188,7 @@ int main(int argc, char *argv[]) {
   g_dbw2rand = 0;
 #endif
 
-#ifdef TM_GAUGE_COPY
+#ifdef TM_USE_GAUGE_COPY
   j = init_gauge_field(VOLUMEPLUSRAND, 1);
 #else
   j = init_gauge_field(VOLUMEPLUSRAND, 0);
diff --git a/src/lib/wrapper/lib_wrapper.c b/src/lib/wrapper/lib_wrapper.c
index 9f083adc5..19d36ddc6 100644
--- a/src/lib/wrapper/lib_wrapper.c
+++ b/src/lib/wrapper/lib_wrapper.c
@@ -60,11 +60,11 @@
 #include "misc_types.h"
 #include "mpi_init.h"
 #include "operator.h"
+#include "operator/clover_leaf.h"
+#include "qphix_interface.h"
 #include "read_input.h"
 #include "sighandler.h"
 #include "start.h"
-#include "operator/clover_leaf.h"
-#include "qphix_interface.h"
 
 #define CONF_FILENAME_LENGTH 500
 
@@ -121,7 +121,7 @@ int tmLQCD_invert_init(int argc, char* argv[], const int _verbose, const int ext
   for (int j = 0; j < no_operators; j++)
     if (!operator_list[j].even_odd_flag) even_odd_flag = 0;
 
-#ifdef TM_GAUGE_COPY
+#ifdef TM_USE_GAUGE_COPY
   int j = init_gauge_field(VOLUMEPLUSRAND, 1);
   j += init_gauge_field_32(VOLUMEPLUSRAND, 1);
 #else
diff --git a/src/lib/xchange/xchange_gauge.c b/src/lib/xchange/xchange_gauge.c
index 6177a3dbb..254702822 100644
--- a/src/lib/xchange/xchange_gauge.c
+++ b/src/lib/xchange/xchange_gauge.c
@@ -960,5 +960,4 @@ void xchange_gauge(su3** const gf) {
   return;
 }
 
-
 #endif /* TM_NON_BLOCKING */

From d945c8086132ec73b7ae808b991664ae9668bcd8 Mon Sep 17 00:00:00 2001
From: Mathieu Taillefumier <mathieu.taillefumier@free.fr>
Date: Tue, 17 Feb 2026 18:26:48 +0100
Subject: [PATCH 13/80] Improvements

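- The TM_QUDA_FERMIONIC_FORCES and TM_QUDA_EXPERIMENTAL code paths are now always enabled; the corresponding CMake options and config macros are removed
- Removed the KOJAK instrumentation (TM_KOJAK_INST pragmas)
- Added a custom CMake find module for QPhiX (cmake/FindQphix.cmake) because the upstream one is broken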
---
 .github/workflows/qphix-build.yaml      |  5 ++-
 CMakeLists.txt                          | 18 +--------
 cmake/FindQphix.cmake                   | 39 ++++++++++++++++++
 cmake/tmlqcd_config_internal.h.in       | 17 ++++----
 src/bin/deriv_mg_tune.c                 |  8 ----
 src/bin/hmc_tm.c                        |  8 ----
 src/bin/invert.c                        |  8 ----
 src/bin/offline_measurement.c           |  9 -----
 src/lib/deriv_Sb.c                      |  7 ----
 src/lib/deriv_Sb_D_psi.c                |  7 ----
 src/lib/get_rectangle_staples.c         |  6 ---
 src/lib/get_staples.c                   | 21 ----------
 src/lib/operator/Hopping_Matrix_nocom.c |  3 --
 src/lib/operator/halfspinor_body.c      |  7 ----
 src/lib/quda_interface.c                | 20 ----------
 src/lib/test/check_overlap.c            |  8 ----
 src/lib/update_gauge.c                  | 53 +++++++------------------
 src/lib/xchange/xchange_2fields.c       |  7 ----
 src/lib/xchange/xchange_halffield.c     | 13 ------
 src/lib/xchange/xchange_lexicfield.c    | 28 -------------
 20 files changed, 66 insertions(+), 226 deletions(-)
 create mode 100644 cmake/FindQphix.cmake

diff --git a/.github/workflows/qphix-build.yaml b/.github/workflows/qphix-build.yaml
index eef1b5055..ec4ec5394 100644
--- a/.github/workflows/qphix-build.yaml
+++ b/.github/workflows/qphix-build.yaml
@@ -172,9 +172,10 @@ jobs:
            -DTM_USE_OMP=ON \
            -DTM_USE_LEMON=ON \
            -DTM_USE_QPHIX=ON \
-           -DCMAKE_CXXFLAGS="-O2 -mtune=haswell -march=haswell -mavx2 -mfma -DOMPI_SKIP_MPICXX -fopenmp" \
-           -DCMAKE_CFLAGS="-O2 -mtune=haswell -march=haswell -mavx2 -mfma -DOMPI_SKIP_MPICXX -fopenmp" \
+           -DCMAKE_CXX_FLAGS="-O2 -mtune=haswell -march=haswell -mavx2 -mfma -DOMPI_SKIP_MPICXX -fopenmp" \
+           -DCMAKE_C_FLAGS="-O2 -mtune=haswell -march=haswell -mavx2 -mfma -DOMPI_SKIP_MPICXX -fopenmp" \
            -DQPHIX_DIR="${{github.workspace}}/qphix/build/install_dir" \
+           -DQMP_DIR="${{github.workspace}}/qmp/build/install_dir" \
             ..
           make -j > config.log
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a375ad14b..803feeef7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -100,7 +100,7 @@ option(TM_USE_SHMEM "Use shmem API" OFF)
 option(TM_USE_QUDA "Enable QUDA support" OFF)
 option(TM_ENABLE_WARNINGS "Enable all warnings" ON)
 option(TM_ENABLE_TESTS "Enable tests" OFF)
-
+set(TM_QPHIX_SOALEN "4" CACHE STRING "QPhiX specific parameter")
 # MPI dependent options
 cmake_dependent_option(
   TM_PERSISTENT_MPI "Use persistent MPI calls for halfspinor [default=no]" OFF
@@ -123,12 +123,6 @@ cmake_dependent_option(TM_USE_LEMON "Use the lemon io library" OFF "TM_USE_MPI"
                        ON)
 
 # GPU dependent options
-cmake_dependent_option(TM_USE_QUDA_EXPERIMENTAL "Enable QUDA support" ON
-                       "TM_USE_QUDA" OFF)
-cmake_dependent_option(
-  TM_QUDA_FERMIONIC_FORCES "Enable support for fermionic forces using QUDA" ON
-  "TM_USE_QUDA" OFF)
-
 cmake_dependent_option(TM_USE_NVHPC "Enable Nvidia HPC toolkit" OFF
                        "TM_USE_CUDA" OFF)
 
@@ -253,15 +247,7 @@ if(TM_USE_HIP OR QUDA_TARGET_HIP)
 endif()
 
 if(TM_USE_QPHIX)
-  find_package(QPhiX REQUIRED CONFIG)
-  message("${QPhiX_LIBRARIES}")
-  if(NOT TARGET tmlqcd::qphix)
-    add_library(tmlqcd::qphix INTERFACE IMPORTED)
-    set_target_properties(tmlqcd::qphix PROPERTIES INTERFACE_LINK_LIBRARIES
-                                                   "${QPhiX_LIBRARIES}")
-    set_target_properties(tmlqcd::qphix PROPERTIES INTERFACE_INCLUDE_DIRECTORIES
-                                                   "${QPhiX_INCLUDE_DIRS}")
-  endif()
+  find_package(Qphix REQUIRED)
 endif()
 
 # check for fftw3 (rely on pkgconfig).
diff --git a/cmake/FindQphix.cmake b/cmake/FindQphix.cmake
new file mode 100644
index 000000000..15ab2d47a
--- /dev/null
+++ b/cmake/FindQphix.cmake
@@ -0,0 +1,87 @@
+# Find module for QPhiX and the QMP library it is linked against.
+#
+# Optional inputs:
+#   QPHIX_DIR - install prefix of QPhiX, used as a search hint
+#   QMP_DIR   - install prefix of QMP, used as a search hint
+#
+# Results:
+#   Qphix_FOUND   - set by find_package_handle_standard_args()
+#   tmlqcd::qphix - imported interface target with libraries and include paths
+
+include(FindPackageHandleStandardArgs)
+
+# Libraries: also look below the user-supplied prefixes, not only in the
+# default CMake search locations.
+find_library(
+  TM_QMP_LIBS
+  NAMES qmp
+  HINTS "${QMP_DIR}"
+  PATH_SUFFIXES "lib" "lib64")
+find_library(
+  TM_QPHIX_LIBS_CODEGEN
+  NAMES "qphix_codegen"
+  HINTS "${QPHIX_DIR}"
+  PATH_SUFFIXES "lib" "lib64")
+find_library(
+  TM_QPHIX_LIBS_SOLVER
+  NAMES "qphix_solver"
+  HINTS "${QPHIX_DIR}"
+  PATH_SUFFIXES "lib" "lib64")
+
+find_path(
+  TM_QMP_INCLUDE_DIRS
+  NAMES qmp.h
+  HINTS "${QMP_DIR}"
+  PATH_SUFFIXES "include")
+
+# QPHIX_DIR is passed as an install prefix in CI and hints are searched as
+# given, so try include/<subdir> in addition to the original bare suffixes.
+find_path(
+  TM_QPHIX_INCLUDE_DIRS
+  NAMES qphix_config.h
+  HINTS "${QPHIX_DIR}"
+  PATH_SUFFIXES "qphix" "include/qphix")
+find_path(
+  TM_QPHIX_CODEGEN_INCLUDE_DIRS
+  NAMES qpx_utils.h
+  HINTS "${QPHIX_DIR}"
+  PATH_SUFFIXES "qphix_codegen" "include/qphix_codegen")
+
+message(STATUS "QMP_DIR: ${QMP_DIR}")
+message(
+  STATUS
+    "QPhiX search results: ${TM_QMP_INCLUDE_DIRS} ${TM_QPHIX_INCLUDE_DIRS} ${TM_QMP_LIBS} ${TM_QPHIX_LIBS_CODEGEN} ${TM_QPHIX_LIBS_SOLVER}"
+)
+
+find_package_handle_standard_args(
+  Qphix
+  DEFAULT_MSG
+  TM_QPHIX_LIBS_CODEGEN
+  TM_QPHIX_LIBS_SOLVER
+  TM_QPHIX_INCLUDE_DIRS
+  TM_QMP_LIBS
+  TM_QMP_INCLUDE_DIRS
+  TM_QPHIX_CODEGEN_INCLUDE_DIRS)
+
+# Only create the target when every component was found; otherwise a
+# non-REQUIRED find_package() could export *-NOTFOUND entries.
+if(Qphix_FOUND AND NOT TARGET tmlqcd::qphix)
+  add_library(tmlqcd::qphix INTERFACE IMPORTED)
+  set_target_properties(
+    tmlqcd::qphix
+    PROPERTIES
+      INTERFACE_LINK_LIBRARIES
+      "${TM_QPHIX_LIBS_CODEGEN};${TM_QPHIX_LIBS_SOLVER};${TM_QMP_LIBS}"
+      INTERFACE_INCLUDE_DIRECTORIES
+      "${TM_QMP_INCLUDE_DIRS};${TM_QPHIX_INCLUDE_DIRS}/..;${TM_QPHIX_INCLUDE_DIRS};${TM_QPHIX_CODEGEN_INCLUDE_DIRS}"
+  )
+endif()
+
+# Hide every cache entry created above from the default cache view.
+mark_as_advanced(
+  TM_QPHIX_LIBS_CODEGEN
+  TM_QPHIX_LIBS_SOLVER
+  TM_QPHIX_INCLUDE_DIRS
+  TM_QPHIX_CODEGEN_INCLUDE_DIRS
+  TM_QMP_LIBS
+  TM_QMP_INCLUDE_DIRS)
diff --git a/cmake/tmlqcd_config_internal.h.in b/cmake/tmlqcd_config_internal.h.in
index fb8d7d818..145df156a 100644
--- a/cmake/tmlqcd_config_internal.h.in
+++ b/cmake/tmlqcd_config_internal.h.in
@@ -66,7 +66,7 @@
 #define ALIGN_BASE @ALIGN_BASE@
 
 /* Alignment compiler hint macro */
-#cmakedefine ALIGN @ALIGN@
+#define ALIGN @ALIGN@
 
 /* Alignment for 32bit arrays -- necessary for SSE and automated vectorization */
 #define ALIGN_BASE32 @ALIGN_BASE32@
@@ -76,10 +76,10 @@
 
 /* Define to 1 if `lex' declares `yytext' as a `char *' by default, not a
    `char[]'. */
-#cmakedefine YYTEXT_POINTER
+//#cmakedefine YYTEXT_POINTER
 
 /* Number of bits in a file offset, on hosts where this is settable. */
-#cmakedefine TM_FILE_OFFSET_BITS @TMLQCD_FILE_OFFSET_BITS@
+#define TM_FILE_OFFSET_BITS @TMLQCD_FILE_OFFSET_BITS@
 
 /* Construct an extra copy of the gauge fields */
 #cmakedefine TM_USE_GAUGE_COPY
@@ -111,17 +111,14 @@
 /* Using QUDA GPU */
 #cmakedefine TM_USE_QUDA 
 
-/* Using experimental QUDA version */
-#cmakedefine TM_QUDA_EXPERIMENTAL
-
-/* Using QUDA fermionic forces */
-#cmakedefine TM_QUDA_FERMIONIC_FORCES
-
 /* Using DDalphaAMG */
 #cmakedefine TM_USE_DDalphaAMG
 
 /* Using QPHIX */
 #cmakedefine TM_USE_QPHIX 
 
+#ifdef TM_USE_QPHIX
 /* Structure of Array length to use with QPhiX */
-#cmakedefine QPHIX_SOALEN @TMLQCD_QPHIX_SOALEN@
+#define QPHIX_SOALEN @TM_QPHIX_SOALEN@
+#endif
+
diff --git a/src/bin/deriv_mg_tune.c b/src/bin/deriv_mg_tune.c
index f65b22c48..7c45524de 100644
--- a/src/bin/deriv_mg_tune.c
+++ b/src/bin/deriv_mg_tune.c
@@ -98,11 +98,6 @@ int main(int argc, char *argv[]) {
 
   init_critical_globals(TM_PROGRAM_DERIV_MG_TUNE);
 
-#ifdef TM_KOJAK_INST
-#pragma pomp inst init
-#pragma pomp inst begin(main)
-#endif
-
   verbose = 1;
   g_use_clover_flag = 0;
 
@@ -367,9 +362,6 @@ int main(int argc, char *argv[]) {
 #endif
 
   return (0);
-#ifdef TM_KOJAK_INST
-#pragma pomp inst end(main)
-#endif
 }
 
 static void usage(const tm_ExitCode_t exit_code) {
diff --git a/src/bin/hmc_tm.c b/src/bin/hmc_tm.c
index 399362d0b..b68a5250f 100644
--- a/src/bin/hmc_tm.c
+++ b/src/bin/hmc_tm.c
@@ -113,11 +113,6 @@ int main(int argc, char *argv[]) {
 
   init_critical_globals(TM_PROGRAM_HMC_TM);
 
-#ifdef TM_KOJAK_INST
-#pragma pomp inst init
-#pragma pomp inst begin(main)
-#endif
-
   strcpy(gauge_filename, "conf.save");
   strcpy(nstore_filename, "nstore_counter");
   strcpy(tmp_filename, ".conf.tmp");
@@ -591,9 +586,6 @@ int main(int argc, char *argv[]) {
 #endif
 
   return (0);
-#ifdef TM_KOJAK_INST
-#pragma pomp inst end(main)
-#endif
 }
 
 static void usage(const tm_ExitCode_t exit_code) {
diff --git a/src/bin/invert.c b/src/bin/invert.c
index bb6f15c10..b5040ba88 100644
--- a/src/bin/invert.c
+++ b/src/bin/invert.c
@@ -114,11 +114,6 @@ int main(int argc, char *argv[]) {
 
   init_critical_globals(TM_PROGRAM_INVERT);
 
-#ifdef TM_KOJAK_INST
-#pragma pomp inst init
-#pragma pomp inst begin(main)
-#endif
-
   DUM_DERI = 8;
   DUM_MATRIX = DUM_DERI + 5;
   NO_OF_SPINORFIELDS = DUM_MATRIX + 4;
@@ -457,9 +452,6 @@ int main(int argc, char *argv[]) {
   MPI_Finalize();
 #endif
   return (0);
-#ifdef TM_KOJAK_INST
-#pragma pomp inst end(main)
-#endif
 }
 
 static void usage(tm_ExitCode_t exit_code) {
diff --git a/src/bin/offline_measurement.c b/src/bin/offline_measurement.c
index c1422858f..b6cbc13fa 100644
--- a/src/bin/offline_measurement.c
+++ b/src/bin/offline_measurement.c
@@ -83,11 +83,6 @@ int main(int argc, char *argv[]) {
 
   init_critical_globals(TM_PROGRAM_OFFLINE_MEASUREMENT);
 
-#ifdef TM_KOJAK_INST
-#pragma pomp inst init
-#pragma pomp inst begin(main)
-#endif
-
   DUM_DERI = 8;
   DUM_MATRIX = DUM_DERI + 5;
   NO_OF_SPINORFIELDS = DUM_MATRIX + 3;
@@ -306,10 +301,6 @@ int main(int argc, char *argv[]) {
   MPI_Finalize();
 #endif
   return (0);
-
-#ifdef TM_KOJAK_INST
-#pragma pomp inst end(main)
-#endif
 }
 
 static void usage(const tm_ExitCode_t exit_code) {
diff --git a/src/lib/deriv_Sb.c b/src/lib/deriv_Sb.c
index 1427c4af0..c48c8db38 100644
--- a/src/lib/deriv_Sb.c
+++ b/src/lib/deriv_Sb.c
@@ -83,10 +83,6 @@ void deriv_Sb(const int ieo, spinor* const l, spinor* const k, hamiltonian_field
 
 #ifdef TM_USE_OMP
 #undef static
-#endif
-
-#ifdef TM_KOJAK_INST
-#pragma pomp inst begin(derivSb)
 #endif
 
     if (ieo == 0) {
@@ -292,7 +288,4 @@ void deriv_Sb(const int ieo, spinor* const l, spinor* const k, hamiltonian_field
   } /* OpenMP closing brace */
 #endif
   tm_stopwatch_pop(&g_timers, 0, 1, "");
-#ifdef TM_KOJAK_INST
-#pragma pomp inst end(derivSb)
-#endif
 }
diff --git a/src/lib/deriv_Sb_D_psi.c b/src/lib/deriv_Sb_D_psi.c
index 61da4b9d2..3f3319efc 100644
--- a/src/lib/deriv_Sb_D_psi.c
+++ b/src/lib/deriv_Sb_D_psi.c
@@ -61,10 +61,6 @@ void deriv_Sb_D_psi(spinor* const l, spinor* const k, hamiltonian_field_t* const
 
 #ifdef TM_USE_OMP
 #undef static
-#endif
-
-#ifdef TM_KOJAK_INST
-#pragma pomp inst begin(derivSb)
 #endif
 
     /************** loop over all lattice sites ****************/
@@ -225,9 +221,6 @@ void deriv_Sb_D_psi(spinor* const l, spinor* const k, hamiltonian_field_t* const
 
       /****************** end of loop ************************/
     }
-#ifdef TM_KOJAK_INST
-#pragma pomp inst end(derivSb)
-#endif
 
 #ifdef TM_USE_OMP
   } /*OpenMP closing brace */
diff --git a/src/lib/get_rectangle_staples.c b/src/lib/get_rectangle_staples.c
index eab6b9d9e..c8f69596b 100644
--- a/src/lib/get_rectangle_staples.c
+++ b/src/lib/get_rectangle_staples.c
@@ -34,9 +34,6 @@ void get_rectangle_staples_general(su3 *const v, const int x, const int mu,
                                    const su3 *const *const gf) {
   su3 ALIGN tmp1, tmp2;
   const su3 *a, *b, *c, *d, *e;
-#ifdef TM_KOJAK_INST
-#pragma pomp inst begin(rectstaples)
-#endif
   _su3_zero((*v));
   for (int nu = 0; nu < 4; nu++) {
     if (mu != nu) {
@@ -178,7 +175,4 @@ void get_rectangle_staples_general(su3 *const v, const int x, const int mu,
       _su3_times_su3_acc((*v), tmp2, tmp1);
     }
   }
-#ifdef TM_KOJAK_INST
-#pragma pomp inst end(rectstaples)
-#endif
 }
diff --git a/src/lib/get_staples.c b/src/lib/get_staples.c
index b33010f2c..ae7f19d09 100644
--- a/src/lib/get_staples.c
+++ b/src/lib/get_staples.c
@@ -35,10 +35,6 @@ void get_staples(su3* const staple, const int x, const int mu, const su3** in_ga
   su3 ALIGN st;
   const su3 *w1, *w2, *w3;
 
-#ifdef TM_KOJAK_INST
-#pragma pomp inst begin(staples)
-#endif
-
   _su3_zero(*staple);
   for (int k = 0; k < 4; k++) {
     if (k != mu) {
@@ -61,9 +57,6 @@ void get_staples(su3* const staple, const int x, const int mu, const su3** in_ga
       _su3d_times_su3_acc(*staple, *w1, st);
     }
   }
-#ifdef TM_KOJAK_INST
-#pragma pomp inst end(staples)
-#endif
 }
 
 void get_spacelike_staples(su3* const staple, const int x, const int mu,
@@ -72,10 +65,6 @@ void get_spacelike_staples(su3* const staple, const int x, const int mu,
   su3 ALIGN st;
   const su3 *w1, *w2, *w3;
 
-#ifdef TM_KOJAK_INST
-#pragma pomp inst begin(staples)
-#endif
-
   _su3_zero(*staple);
   for (int k = 1; k < 4; k++) {
     if (k != mu) {
@@ -98,9 +87,6 @@ void get_spacelike_staples(su3* const staple, const int x, const int mu,
       _su3d_times_su3_acc(*staple, *w1, st);
     }
   }
-#ifdef TM_KOJAK_INST
-#pragma pomp inst end(staples)
-#endif
 }
 
 void get_timelike_staples(su3* const staple, const int x, const int mu,
@@ -109,10 +95,6 @@ void get_timelike_staples(su3* const staple, const int x, const int mu,
   su3 ALIGN st;
   const su3 *w1, *w2, *w3;
 
-#ifdef TM_KOJAK_INST
-#pragma pomp inst begin(staples)
-#endif
-
   _su3_zero(*staple);
   int k = 0;
   if (k != mu) {
@@ -134,7 +116,4 @@ void get_timelike_staples(su3* const staple, const int x, const int mu,
     /* v = v + w1^d * st */
     _su3d_times_su3_acc(*staple, *w1, st);
   }
-#ifdef TM_KOJAK_INST
-#pragma pomp inst end(staples)
-#endif
 }
diff --git a/src/lib/operator/Hopping_Matrix_nocom.c b/src/lib/operator/Hopping_Matrix_nocom.c
index c7814bbb0..00c34c38b 100644
--- a/src/lib/operator/Hopping_Matrix_nocom.c
+++ b/src/lib/operator/Hopping_Matrix_nocom.c
@@ -48,8 +48,5 @@
 
 #define Hopping_Matrix Hopping_Matrix_nocom
 #define _NO_COMM 1
-#ifdef TM_KOJAK_INST
-#undef TM_KOJAK_INST
-#endif
 
 #include "Hopping_Matrix.c"
diff --git a/src/lib/operator/halfspinor_body.c b/src/lib/operator/halfspinor_body.c
index 3be906764..8286c89f3 100644
--- a/src/lib/operator/halfspinor_body.c
+++ b/src/lib/operator/halfspinor_body.c
@@ -30,10 +30,6 @@ halfspinor* restrict* phi ALIGN;
 halfspinor32* restrict* phi32 ALIGN;
 _declare_hregs();
 
-#ifdef TM_KOJAK_INST
-#pragma pomp inst begin(hoppingmatrix)
-#endif
-
 #ifndef TM_USE_OMP
 s = k;
 _prefetch_spinor(s);
@@ -320,6 +316,3 @@ if (g_sloppy_precision == 1 && g_sloppy_precision_flag == 1) {
 #endif
   }
 }
-#ifdef TM_KOJAK_INST
-#pragma pomp inst end(hoppingmatrix)
-#endif
diff --git a/src/lib/quda_interface.c b/src/lib/quda_interface.c
index b9a4e62a7..e527dbba7 100644
--- a/src/lib/quda_interface.c
+++ b/src/lib/quda_interface.c
@@ -2059,9 +2059,7 @@ void _setQudaMultigridParam(QudaMultigridParam *mg_param) {
 
     // this is needed after QUDA commit
     // https://github.com/lattice/quda/commit/7903288629f0fcc474989fec5a1393ecc17a4b42
-#ifdef TM_QUDA_EXPERIMENTAL
     mg_param->n_vec_batch[level] = 1;
-#endif
 
     // set the MG EigSolver parameters, almost equivalent to
     // setEigParam from QUDA's multigrid_invert_test, except
@@ -3031,7 +3029,6 @@ void quda_mg_tune_params(void *spinorOut, void *spinorIn, const int max_iter) {
   free(tunable_params);
 }
 
-#ifdef TM_QUDA_FERMIONIC_FORCES
 void compute_cloverdet_derivative_quda(monomial *const mnl, hamiltonian_field_t *const hf,
                                        spinor *const X_o, spinor *const phi, int detratio) {
   tm_stopwatch_push(&g_timers, __func__, "");
@@ -3131,23 +3128,6 @@ void compute_ndcloverrat_derivative_quda(monomial *const mnl, hamiltonian_field_
 
   tm_stopwatch_pop(&g_timers, 0, 1, "TM_QUDA");
 }
-#else
-void compute_cloverdet_derivative_quda(monomial *const mnl, hamiltonian_field_t *const hf,
-                                       spinor *const X_o, spinor *const phi, int detratio) {
-  tm_debug_printf(0, 0,
-                  "Error:   UseExternalLibrary = quda requires that tmLQCD is compiled with "
-                  "--enable-quda_fermionic=yes\n");
-  exit(1);
-}
-void compute_ndcloverrat_derivative_quda(monomial *const mnl, hamiltonian_field_t *const hf,
-                                         spinor **const Qup, spinor **const Qdn,
-                                         solver_params_t *solver_params, int detratio) {
-  tm_debug_printf(0, 0,
-                  "Error:   UseExternalLibrary = quda requires that tmLQCD is compiled with "
-                  "--enable-quda_fermionic=yes\n");
-  exit(1);
-}
-#endif
 
 void compute_WFlow_quda(const double eps, const double tmax, const int traj, FILE *outfile) {
   tm_stopwatch_push(&g_timers, __func__, "");
diff --git a/src/lib/test/check_overlap.c b/src/lib/test/check_overlap.c
index b032f8cdd..d34e2ae5b 100644
--- a/src/lib/test/check_overlap.c
+++ b/src/lib/test/check_overlap.c
@@ -105,11 +105,6 @@ int main(int argc, char *argv[]) {
   char *gaugecksum = NULL;
   double plaquette_energy;
 
-#ifdef TM_KOJAK_INST
-#pragma pomp inst init
-#pragma pomp inst begin(main)
-#endif
-
 #ifdef TM_USE_LEMON
   MPI_File fh;
   LemonWriter *lemonWriter;
@@ -389,7 +384,4 @@ int main(int argc, char *argv[]) {
     free_chi_dn_spinor_field();
   }
   return (0);
-#ifdef TM_KOJAK_INST
-#pragma pomp inst end(main)
-#endif
 }
diff --git a/src/lib/update_gauge.c b/src/lib/update_gauge.c
index af4730e01..7a7dd34a1 100644
--- a/src/lib/update_gauge.c
+++ b/src/lib/update_gauge.c
@@ -56,43 +56,23 @@ void update_gauge(const double step, hamiltonian_field_t *const hf) {
 #endif
 
 #ifdef TM_USE_OMP
-#define static
-#pragma omp parallel
-  {
+#pragma omp parallel for
 #endif
-    int i, mu;
-    static su3 v, w;
-    su3 *z;
-    static su3adj deriv;
-    su3adj *xm;
-#ifdef TM_KOJAK_INST
-#pragma pomp inst begin(updategauge)
-#endif
-
-#ifdef TM_USE_OMP
-#undef static
-#endif
-
-#ifdef TM_USE_OMP
-#pragma omp for
-#endif
-    for (i = 0; i < VOLUME; i++) {
-      for (mu = 0; mu < 4; mu++) {
-        /* moment[i][mu] = h_{i,mu}^{alpha} */
-        xm = &hf->momenta[i][mu];
-        z = &hf->gaugefield[i][mu];
-        _su3adj_assign_const_times_su3adj(deriv, step, *xm);
-        exposu3(&w, &deriv);
-        restoresu3(&v, &w);
-        _su3_times_su3(w, v, *z);
-        restoresu3(&v, &w);
-        _su3_assign(*z, v);
-      }
+  for (int i = 0; i < VOLUME; i++) {
+    for (int mu = 0; mu < 4; mu++) {
+      /* moment[i][mu] = h_{i,mu}^{alpha} */
+      su3 v, w;
+      su3adj *xm = &hf->momenta[i][mu];
+      su3 *z = &hf->gaugefield[i][mu];
+      su3adj deriv;
+      _su3adj_assign_const_times_su3adj(deriv, step, *xm);
+      exposu3(&w, &deriv);
+      restoresu3(&v, &w);
+      _su3_times_su3(w, v, *z);
+      restoresu3(&v, &w);
+      _su3_assign(*z, v);
     }
-
-#ifdef TM_USE_OMP
-  } /* OpenMP parallel closing brace */
-#endif
+  }
 
 #ifdef TM_USE_MPI
   /* for parallelization */
@@ -115,7 +95,4 @@ void update_gauge(const double step, hamiltonian_field_t *const hf) {
 
   tm_stopwatch_pop(&g_timers, 0, 1, "");
   return;
-#ifdef TM_KOJAK_INST
-#pragma pomp inst end(updategauge)
-#endif
 }
diff --git a/src/lib/xchange/xchange_2fields.c b/src/lib/xchange/xchange_2fields.c
index c311bf908..46496a0ba 100644
--- a/src/lib/xchange/xchange_2fields.c
+++ b/src/lib/xchange/xchange_2fields.c
@@ -52,10 +52,6 @@ void xchange_2fields(spinor* const l, spinor* const k, const int ieo) {
   int ix = 0;
 #endif
 
-#ifdef TM_KOJAK_INST
-#pragma pomp inst begin(xchange2fields)
-#endif
-
 #ifdef TM_USE_MPI
 
   /* send the data to the neighbour on the left */
@@ -237,8 +233,5 @@ void xchange_2fields(spinor* const l, spinor* const k, const int ieo) {
   MPI_Waitall(reqcount, requests, status);
 #endif
   return;
-#ifdef TM_KOJAK_INST
-#pragma pomp inst end(xchange2fields)
-#endif
 }
 #endif /*  TM_NON_BLOCKING */
diff --git a/src/lib/xchange/xchange_halffield.c b/src/lib/xchange/xchange_halffield.c
index 3948aa1ca..0dd1effca 100644
--- a/src/lib/xchange/xchange_halffield.c
+++ b/src/lib/xchange/xchange_halffield.c
@@ -176,9 +176,6 @@ void xchange_halffield() {
   int reqcount = 16;
 #endif
 
-#ifdef TM_KOJAK_INST
-#pragma pomp inst begin(xchangehalf)
-#endif
   /* send the data to the neighbour on the right in t direction */
   /* recieve the data from the neighbour on the left in t direction */
   MPI_Isend((void*)(sendBuffer), LX * LY * LZ * 12 / 2, MPI_DOUBLE, g_nb_t_up, 81, g_cart_grid,
@@ -245,10 +242,6 @@ void xchange_halffield() {
   MPI_Waitall(reqcount, requests, status);
 #endif /* MPI */
   return;
-
-#ifdef TM_KOJAK_INST
-#pragma pomp inst end(xchangehalf)
-#endif
 }
 
 #endif /* def (TM_USE_SHMEM || TM_PERSISTENT) */
@@ -265,9 +258,6 @@ void xchange_halffield32() {
   int reqcount = 12;
 #elif defined TM_PARALLELXYZT
   int reqcount = 16;
-#endif
-#ifdef TM_KOJAK_INST
-#pragma pomp inst begin(xchangehalf32)
 #endif
 
   /* send the data to the neighbour on the right in t direction */
@@ -336,8 +326,5 @@ void xchange_halffield32() {
   MPI_Waitall(reqcount, requests, status);
 #endif /* MPI */
   return;
-#ifdef TM_KOJAK_INST
-#pragma pomp inst end(xchangehalf32)
-#endif
 }
 #endif /* defined TM_USE_HALFSPINOR */
diff --git a/src/lib/xchange/xchange_lexicfield.c b/src/lib/xchange/xchange_lexicfield.c
index 56cc4315c..282ca8dfa 100644
--- a/src/lib/xchange/xchange_lexicfield.c
+++ b/src/lib/xchange/xchange_lexicfield.c
@@ -60,12 +60,8 @@ void xchange_lexicfield(spinor* const l) {
 #elif defined TM_PARALLELXYZT
   int reqcount = 16;
 #endif
-#ifdef TM_KOJAK_INST
-#pragma pomp inst begin(xchange_lexicfield)
-#endif
 
 #ifdef TM_USE_MPI
-
   /* send the data to the neighbour on the left */
   /* recieve the data from the neighbour on the right */
   MPI_Isend((void*)l, 1, lfield_time_slice_cont, g_nb_t_dn, 5081, g_cart_grid, &requests[0]);
@@ -135,9 +131,6 @@ void xchange_lexicfield(spinor* const l) {
 
 #endif
   return;
-#ifdef TM_KOJAK_INST
-#pragma pomp inst end(xchange_lexicfield)
-#endif
 }
 
 /* Here comes the naive version */
@@ -149,12 +142,8 @@ void xchange_lexicfield(spinor* const l) {
 #ifdef TM_PARALLELXYZT
   int x0 = 0, x1 = 0, x2 = 0, ix = 0;
 #endif
-#ifdef TM_KOJAK_INST
-#pragma pomp inst begin(xchange_lexicfield)
-#endif
 
 #ifdef TM_USE_MPI
-
   MPI_Status status;
   /* send the data to the neighbour on the left */
   /* recieve the data from the neighbour on the right */
@@ -214,9 +203,6 @@ void xchange_lexicfield(spinor* const l) {
 #endif
 #endif
   return;
-#ifdef TM_KOJAK_INST
-#pragma pomp inst end(xchange_lexicfield)
-#endif
 }
 
 #endif
@@ -239,12 +225,8 @@ void xchange_lexicfield32(spinor32* const l) {
 #elif defined TM_PARALLELXYZT
   int reqcount = 16;
 #endif
-#ifdef TM_KOJAK_INST
-#pragma pomp inst begin(xchange_lexicfield32)
-#endif
 
 #ifdef TM_USE_MPI
-
   /* send the data to the neighbour on the left */
   /* recieve the data from the neighbour on the right */
   MPI_Isend((void*)l, 1, lfield_time_slice_cont32, g_nb_t_dn, 5081, g_cart_grid, &requests[0]);
@@ -314,9 +296,6 @@ void xchange_lexicfield32(spinor32* const l) {
 
 #endif
   return;
-#ifdef TM_KOJAK_INST
-#pragma pomp inst end(xchange_lexicfield32)
-#endif
 }
 
 /* Here comes the naive version */
@@ -328,12 +307,8 @@ void xchange_lexicfield32(spinor32* const l) {
 #ifdef TM_PARALLELXYZT
   int x0 = 0, x1 = 0, x2 = 0, ix = 0;
 #endif
-#ifdef TM_KOJAK_INST
-#pragma pomp inst begin(xchange_lexicfield32)
-#endif
 
 #ifdef TM_USE_MPI
-
   MPI_Status status;
   /* send the data to the neighbour on the left */
   /* recieve the data from the neighbour on the right */
@@ -394,9 +369,6 @@ void xchange_lexicfield32(spinor32* const l) {
 #endif
 #endif
   return;
-#ifdef TM_KOJAK_INST
-#pragma pomp inst end(xchange_lexicfield32)
-#endif
 }
 
 #endif

From 6491f117c549516463d58ebc12d31655b67675b1 Mon Sep 17 00:00:00 2001
From: Mathieu Taillefumier <mathieu.taillefumier@free.fr>
Date: Thu, 19 Feb 2026 14:39:07 +0100
Subject: [PATCH 14/80] Added basic documentation

---
 .ci/include/cscs/01-test-templates.yml        |  37 ++--
 .../repo/packages/lemonio/package.py          |   2 +-
 .../repo/packages/tmlqcd/package.py           | 113 ++++++++++
 CMakeLists.txt                                |  19 +-
 README.md                                     | 131 ++++++++++++
 cmake/FindCLime.cmake                         |  19 +-
 cmake/tmlqcd_config_internal.h.in             |   2 +-
 doc/install.tex                               | 201 ++++++++++--------
 install-sh                                    |   0
 quda_gauge_paths.inc                          | 158 --------------
 10 files changed, 397 insertions(+), 285 deletions(-)
 create mode 100644 .ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/tmlqcd/package.py
 create mode 100644 README.md
 delete mode 100644 install-sh
 delete mode 100644 quda_gauge_paths.inc

diff --git a/.ci/include/cscs/01-test-templates.yml b/.ci/include/cscs/01-test-templates.yml
index 9a4a8da45..9b3a1c414 100644
--- a/.ci/include/cscs/01-test-templates.yml
+++ b/.ci/include/cscs/01-test-templates.yml
@@ -8,30 +8,29 @@ include:
   image: ${UENV_NAME}/${UENV_VERSION}:${UENV_TAG}
   variables:
     WITH_UENV_VIEW: "default"
-    CFLAGS: "-O3 -fopenmp -mtune=neoverse-v2 -mcpu=neoverse-v2"
-    CXXFLAGS: "-O3 -fopenmp -mtune=neoverse-v2 -mcpu=neoverse-v2"
-    LDFLAGS: "-fopenmp"
+#    CFLAGS: "-O3 -fopenmp -mtune=neoverse-v2 -mcpu=neoverse-v2"
+#    CXXFLAGS: "-O3 -fopenmp -mtune=neoverse-v2 -mcpu=neoverse-v2"
+#    LDFLAGS: "-fopenmp"
   before_script:
     - |
       if test "${SLURM_PROCID}" -eq "0"; then
         export CC="$(which mpicc)"
         export CXX="$(which mpicxx)"
-        mkdir -p install_dir
-        autoconf
-        ./configure \
-          --enable-quda_experimental \
-          --enable-mpi \
-          --enable-omp \
-          --with-mpidimension=4 \
-          --disable-sse2 \
-          --disable-sse3 \
-          --enable-alignment=32 \
-          --with-qudadir="/user-environment/env/default" \
-          --with-limedir="/user-environment/env/default" \
-          --with-lemondir="/user-environment/env/default" \
-          --with-lapack="-lopenblas -L/user-environment/env/default/lib" \
-          --with-cudadir="/user-environment/env/default/lib64" \
-          --prefix="$(pwd)/install_dir"
+        # Configure an out-of-source CMake build against the uenv default view.
+        mkdir -p build_dir && cd build_dir
+        cmake -DCMAKE_PREFIX_PATH="/user-environment/env/default" \
+              -DTM_USE_MPI=ON \
+              -DTM_USE_CUDA=ON \
+              -DCMAKE_C_FLAGS="-O3 -mtune=neoverse-v2 -mcpu=neoverse-v2" \
+              -DCMAKE_CXX_FLAGS="-O3 -mtune=neoverse-v2 -mcpu=neoverse-v2" \
+              -DCMAKE_CUDA_ARCHITECTURES=90a \
+              -DTM_USE_OMP=ON \
+              -DTM_USE_QUDA=ON \
+              -DTM_USE_LEMON=ON \
+              -DTM_ENABLE_ALIGNMENT=32 \
+              -DTM_USE_GAUGE_COPY=ON \
+              -DTM_USE_HALFSPINOR=ON \
+              -DCMAKE_INSTALL_PREFIX=../install_dir ..
         make
         make install
         touch preparation-done-${CI_JOB_ID}
diff --git a/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/lemonio/package.py b/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/lemonio/package.py
index 7508b4b79..4d7340a03 100755
--- a/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/lemonio/package.py
+++ b/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/lemonio/package.py
@@ -28,7 +28,7 @@ class CMakeBuilder(cmake.CMakeBuilder):
     def cmake_args(self):
         spec = self.spec
         args = [
-            self.define_from_variant("DBUILD_SHARED_LIBS" "shared"),
+            self.define_from_variant("BUILD_SHARED_LIBS", "shared"),  # helper adds -D itself
         ]
         return args
 
diff --git a/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/tmlqcd/package.py b/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/tmlqcd/package.py
new file mode 100644
index 000000000..13fb3238e
--- /dev/null
+++ b/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/tmlqcd/package.py
@@ -0,0 +1,113 @@
+# Copyright Spack Project Developers. See COPYRIGHT file for details.
+#
+# SPDX-License-Identifier: (Apache-2.0 OR MIT)
+
+from spack_repo.builtin.build_systems import cmake
+from spack_repo.builtin.build_systems.cuda import CudaPackage
+from spack_repo.builtin.build_systems.rocm import ROCmPackage
+
+from spack.package import *
+
+class Tmlqcd(cmake.CMakePackage, CudaPackage, ROCmPackage):
+    """Spack recipe building tmLQCD with CMake, optionally with CUDA or ROCm."""
+
+    homepage = "https://www.itkp.uni-bonn.de/~urbach/software.html"
+    url = "https://github.com/etmc/tmLQCD/archive/refs/tags/rel-5-1-6.tar.gz"
+    git = "https://github.com/etmc/tmLQCD.git"
+    license("GPL-3.0-or-later")
+
+    maintainers("mtaillefumier")
+    version("master", branch="master")
+
+    variant("lemon", default=False, description="Enable the lemon backend")
+    variant("mpi", default=True, description="Enable mpi support")
+    variant("DDalphaAMG", default=False, description="Enable DDalphaAMG support")
+    variant("openmp", default=True, description="Enable OpenMP")
+    variant("fftw", default=True, description="Enable FFTW interface")
+    variant(
+        "persistent_mpi",
+        default=True,
+        description="Enable persistent mpi calls for spinor and gauge fields",
+        when="+mpi",
+    )
+    variant(
+        "nonblocking_mpi",
+        default=True,
+        description="Enable non-blocking mpi calls for spinor and gauge fields",
+        when="+mpi",
+    )
+    variant("fixedvolume", default=True, description="Enable fixed volume at compile time")
+    variant(
+        "alignment",
+        default="auto",
+        values=("none", "auto", "16", "32", "64"),
+        description="Automatically or explicitly align arrays",
+    )
+    variant("gauge_copy", default=True, description="Enable gauge field copy")
+    variant("half_spinor", default=True, description="Use a Dirac operator with half-spinor")
+    variant("shared", default=False, description="Enable shared library")
+    variant("shmem", default=False, description="Use shmem API")
+    variant("quda", default=True, description="Enable the QUDA library", when="+cuda")
+    variant("quda", default=True, description="Enable the QUDA library", when="+rocm")
+    variant(
+        "QPhiX", default=False, description="Enable the QPhiX library for Intel Xeon and Xeon Phis"
+    )
+    variant(
+        "mpi_dimensions",
+        default="4",
+        values=("1", "2", "3", "4", "x", "xy", "xyz"),
+        description="number of dimensions the mpi processes are distributed. the default is parallelization over all four dimensions txyz",
+        when="+mpi",
+    )
+
+    cmake.generator("ninja")
+
+    # language dependencies
+    depends_on("c", type="build")
+    depends_on("cxx", type="build")
+    depends_on("fortran", type="build")
+
+    # a GPU backend is only usable once a concrete target architecture is chosen
+    conflicts("cuda_arch=none", when="+cuda", msg="+cuda requires an explicit cuda_arch")
+    conflicts("amdgpu_target=none", when="+rocm", msg="+rocm requires an explicit amdgpu_target")
+
+    # hard dependencies
+    depends_on("c-lime")
+    depends_on("blas")
+    depends_on("lapack")
+    depends_on("pkgconfig", type="build")
+
+    # optional dependencies, pulled in by the matching variants
+    depends_on("mpi", when="+mpi")
+    depends_on("lemon-io", when="+lemon")
+
+    with when("+quda"):
+        depends_on(
+            "quda+twisted_mass+twisted_clover+clover+ndeg_twisted_clover+ndeg_twisted_mass+wilson+qdp+staggered+usqcd+multigrid"
+        )
+
+        depends_on("quda+mpi", when="+mpi")
+        depends_on("quda+cuda", when="+cuda")
+        depends_on("quda+rocm", when="+rocm")
+        depends_on("quda+nvshmem", when="+shmem")
+
+    depends_on("fftw-api@3", when="+fftw")
+
+class CMakeBuilder(cmake.CMakeBuilder):
+    def cmake_args(self):
+        """Translate the package variants into tmLQCD CMake cache definitions."""
+        args = [
+            self.define_from_variant("BUILD_SHARED_LIBS", "shared"),
+            self.define_from_variant("TM_USE_LEMON", "lemon"),
+            self.define_from_variant("TM_USE_MPI", "mpi"),
+            self.define_from_variant("TM_USE_QUDA", "quda"),
+            self.define_from_variant("TM_USE_CUDA", "cuda"),
+            self.define_from_variant("TM_USE_HIP", "rocm"),
+            self.define_from_variant("TM_USE_FFTW", "fftw"),
+            self.define_from_variant("TM_FIXEDVOLUME", "fixedvolume"),
+            self.define_from_variant("TM_USE_OMP", "openmp"),
+            self.define_from_variant("TM_USE_SHMEM", "shmem"),
+            self.define_from_variant("TM_USE_GAUGE_COPY", "gauge_copy"),
+            self.define_from_variant("TM_USE_HALFSPINOR", "half_spinor"),
+        ]
+        return args
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 803feeef7..d363e407c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -80,8 +80,8 @@ option(TM_USE_MPI "Enable MPI support" OFF)
 option(TM_USE_CUDA "Enable QUDA support" OFF)
 option(TM_USE_HIP "Enable HIP support" OFF)
 option(TM_USE_DDalphaAMG "Enable DDalphaAMG support" OFF)
-option(TM_USE_OMP "Enable openMP" ON)
-option(TM_FIXEDVOLUME "fix volume at compile time" OFF)
+option(TM_USE_OMP "Enable OpenMP" ON)
+option(TM_FIXEDVOLUME "Fix volume at compile time" OFF)
 set(TM_ENABLE_ALIGNMENT
     "auto"
     CACHE
@@ -95,7 +95,7 @@ set_property(CACHE TM_ENABLE_ALIGNMENT PROPERTY STRINGS "auto" "none" "16" "32"
 option(TM_USE_OPTIMIZATION "enable optimisation" ON)
 option(TM_USE_GAUGE_COPY "Enable use of a copy of the gauge field" ON)
 option(TM_USE_HALFSPINOR "Use a Dirac Op. with halfspinor exchange" ON)
-option(TM_USE_QPHIX "enable QPhiX" OFF)
+option(TM_USE_QPHIX "Enable QPhiX" OFF)
 option(TM_USE_SHMEM "Use shmem API" OFF)
 option(TM_USE_QUDA "Enable QUDA support" OFF)
 option(TM_ENABLE_WARNINGS "Enable all warnings" ON)
@@ -106,7 +106,7 @@ cmake_dependent_option(
   TM_PERSISTENT_MPI "Use persistent MPI calls for halfspinor [default=no]" OFF
   "TM_USE_MPI" OFF)
 cmake_dependent_option(
-  TM_NONBLOCKING_MPI "Use non-blocking MPI calls for spinor and gaug" ON
+  TM_NONBLOCKING_MPI "Use non-blocking MPI calls for spinor and gauge" ON
   "TM_USE_MPI" OFF)
 
 # need to do it properly. Just a place holder
@@ -132,8 +132,6 @@ find_package(BLAS REQUIRED)
 find_package(LAPACK REQUIRED)
 set(TM_LAPACK ON)
 find_package(FLEX REQUIRED)
-# do we need bison ?
-find_package(BISON REQUIRED)
 
 set(PACKAGE_NAME ${PROJECT_DESCRIPTION})
 set(PACKAGE_VERSION ${PROJECT_VERSION})
@@ -201,12 +199,6 @@ set(TM_USE_LIME ON)
 
 if(TM_USE_QUDA)
   find_package(QUDA REQUIRED config)
-  if(TM_USE_QUDA_EXPERIMENTAL)
-    set(TM_QUDA_EXPERIMENTAL ON)
-  endif()
-  if(TM_QUDA_FERMIONIC_FORCES)
-    set(TM_QUDA_FERMIONIC_FORCES ON)
-  endif()
 endif()
 
 if(TM_USE_SHMEM)
@@ -293,6 +285,9 @@ if(TM_USE_MPI)
   endif()
 endif()
 
+if (TM_USE_HALFSPINOR AND NOT TM_USE_GAUGE_COPY)
+  message(FATAL_ERROR "The TM_USE_GAUGE_COPY option should also be set to ON when TM_USE_HALFSPINOR is ON")
+endif()
 # keep the autotool config.h header.
 configure_file("${PROJECT_SOURCE_DIR}/cmake/tmlqcd_config_internal.h.in"
                "${PROJECT_BINARY_DIR}/tmlqcd_config_internal.h" @ONLY)
diff --git a/README.md b/README.md
new file mode 100644
index 000000000..84f1e2172
--- /dev/null
+++ b/README.md
@@ -0,0 +1,131 @@
+The software ships with a CMake environment, which will configure and build the
+programmes. It is recommended to configure and build the executables in a
+separate build directory. This also allows to have several builds with different
+options from the same source code directory.
+
+## Prerequisites
+
+In order to compile the programmes the `LAPACK` library (fortran version) needs to be installed. CMake will search for the
+library in all default directories. Also the latest version (tested is version
+1.2.3) of `C-LIME` must be available, which is used as
+a packaging scheme to read and write gauge configurations and propagators to
+files.
+
+## Configuring the hmc package
+<a id="sec-config"></a>
+
+The build system uses CMake to configure and build the hmc package. The
+following list gives all options (OFF by default unless specified):
+- `CMAKE_POSITION_INDEPENDENT_CODE`: Build a position independent
+  code. **ON** by default.
+- `BUILD_SHARED_LIBS`: Build the shared version of the hmc library.
+- `TM_USE_FFTW`: Enable fftw support. 
+- `TM_USE_CUDA`: Enable CUDA support.
+- `TM_USE_HIP`: Enable HIP support (AMD or NVidia GPUs)
+- `TM_USE_DDalphaAMG`: Enable DDalphaAMG support.
+- `TM_USE_LEMON`: Use the lemon io library.
+- `TM_USE_OMP`: Enable OpenMP (**ON** by default)
+- `TM_FIXEDVOLUME`: Fix volume at compile time.
+- `TM_ENABLE_ALIGNMENT`: Automatically or explicitly align arrays to a given
+  number of bytes. Allowed values: auto, none, 16, 32, 64.
+- `TM_USE_GAUGE_COPY`: Enable use of a copy of the gauge field (**ON**
+  by default). See the section labelled `sec:dirac` in the LaTeX documentation (`doc/`) for details on this option. It will
+  increase the memory requirement of the code.
+- `TM_USE_HALFSPINOR`: Use a Dirac Op. with halfspinor exchange (**ON**
+  by default); requires `TM_USE_GAUGE_COPY=ON`. See the section labelled `sec:dirac` in the LaTeX documentation (`doc/`) for details.
+- `TM_USE_QUDA`: Enable QUDA support.
+- `TM_USE_SHMEM`: Use shmem API.
+- `TM_ENABLE_WARNINGS`: Enable all warnings (**ON** by default).
+- `TM_ENABLE_TESTS`: Enable tests.
+- `TM_USE_QPHIX`: Enable QPhiX.
+  - `TM_QPHIX_SOALEN`: QPhiX specific parameter (default is 4)
+  - **QPHIX_DIR**: Directory where QPhiX is installed.
+    The QPhiX current CMake build system does not export all information (
+    include and lib directories) that are needed to compile hmc.
+  - **QMP_DIR**: Directory where QMP is installed (
+    QPhiX dependency).
+    The QPhiX current CMake build system does not export all information about the
+    include and lib directories nor its dependencies (QMP in that case).
+- `TM_USE_MPI`: Enable MPI support.
+  - `TM_PERSISTENT_MPI`: Use persistent MPI calls for halfspinor.
+  - `TM_NONBLOCKING_MPI`: Use non-blocking MPI calls for spinor and
+    gauge.
+  - `TM_MPI_DIMENSION`: Use $n$ dimensional parallelisation ($XYZT$)
+    [default=4]. The number of parallel directions can be specified. $1, 2, 3$ and $4$
+    dimensional parallelisation is supported.
+  - `TM_USE_LEMON`: Use the lemon io library.
+
+The following minimal list of commands will configure and build the hmc package with
+minimal dependencies
+
+```bash
+mkdir build
+cd build
+cmake -DCMAKE_INSTALL_PREFIX=/my_path -DCMAKE_PREFIX_PATH=/my_c_line_path ..
+make -j
+make install
+```
+
+These instructions assume that the `c-lime` package is installed in `/my_c_line_path`. The `CMAKE_PREFIX_PATH` variable is a
+semicolon-separated list containing the installation prefixes of all
+dependencies.
+
+Adding `-DTM_USE_MPI=ON` will enable MPI support with parallelization
+over spatial and temporal dimensions. The command line is then
+
+```bash
+cmake -DCMAKE_INSTALL_PREFIX=/my_path -DCMAKE_PREFIX_PATH=/my_c_line_path -DTM_USE_MPI=ON ..
+```
+
+We can combine it with the lemon-io library (installed in `/my_lemon_path`)
+
+```bash
+cmake -DCMAKE_INSTALL_PREFIX=/my_path \
+      -DCMAKE_PREFIX_PATH="/my_c_line_path;/my_lemon_path" \
+      -DTM_USE_MPI=ON \
+      -DTM_USE_LEMON=ON ..
+```
+
+`QUDA` support (installed in `/my_quda_path`) can be added with
+
+```bash
+cmake -DCMAKE_INSTALL_PREFIX=/my_path \
+      -DCMAKE_PREFIX_PATH="/my_c_line_path;/my_lemon_path;/my_quda_path" \
+      -DTM_USE_MPI=ON \
+      -DTM_USE_LEMON=ON \
+      -DTM_USE_QUDA=ON \
+      -DTM_USE_CUDA=ON \
+      -DCMAKE_CUDA_ARCHITECTURES=90 ..
+```
+
+Note that the command assumes that QUDA is compiled with `CUDA` support. AMD GPUs
+are also supported after replacing `-DTM_USE_CUDA=ON` with
+`-DTM_USE_HIP=ON` and compiling `QUDA` with `HIP` support. The ROCM architecture is defined by the variable
+`CMAKE_HIP_ARCHITECTURES=gfxxxx`.
+
+`QPhiX` and/or `DDalphaAMG` support can be added with
+
+```bash
+cmake -DCMAKE_INSTALL_PREFIX=/my_path \
+      -DCMAKE_PREFIX_PATH="/my_c_line_path;/my_lemon_path;/my_quda_path;/my_path_ddalphaamg" \
+      -DTM_USE_MPI=ON \
+      -DTM_USE_LEMON=ON \
+      -DTM_USE_QUDA=ON \
+      -DTM_USE_CUDA=ON \
+      -DCMAKE_CUDA_ARCHITECTURES=90 \
+      -DTM_USE_QPHIX=ON \
+      -DQPHIX_DIR=/my_qphix_dir \
+      -DTM_USE_DDalphaAMG=ON \
+      -DQMP_DIR=/my_qmp_dir \
+      -DTM_USE_OMP=ON ..
+```
+
+`QPhiX` cmake config support is incomplete and requires both the QPhiX
+and QMP installation directories to work properly.
+
+`CMake` has several relevant specific options that control the build. Compiler
+options are defined by the variables `CMAKE_C_FLAGS` and `CMAKE_CXX_FLAGS`. CUDA and HIP compilation options are controlled by their
+equivalent `CMAKE_{CUDA/HIP}_FLAGS`.
+
+Adding for instance `-GNinja` to the `CMake` command line will use
+ninja instead of make.
diff --git a/cmake/FindCLime.cmake b/cmake/FindCLime.cmake
index 0c3eabe48..c9d94ea95 100644
--- a/cmake/FindCLime.cmake
+++ b/cmake/FindCLime.cmake
@@ -1,27 +1,26 @@
 include(FindPackageHandleStandardArgs)
 
 find_library(
-  TMLQCD_CLIME_LIBRARIES
+  TM_CLIME_LIBRARIES
   NAMES lime
   PATH_SUFFIXES "lib" "lib64")
 
 find_path(
-  TMLQCD_CLIME_INCLUDE_DIRS
+  TM_CLIME_INCLUDE_DIRS
   NAMES lime.h
   PATH_SUFFIXES "include" "include/${_pacakge_name}" "${_package_name}")
 
-message("${TMLQCD_CLIME_INCLUDE_DIRS}")
-find_package_handle_standard_args(CLime DEFAULT_MSG TMLQCD_CLIME_LIBRARIES
-                                  TMLQCD_CLIME_INCLUDE_DIRS)
+find_package_handle_standard_args(CLime DEFAULT_MSG TM_CLIME_LIBRARIES
+                                  TM_CLIME_INCLUDE_DIRS)
 
 if(NOT TARGET tmlqcd::clime)
   add_library(tmlqcd::clime INTERFACE IMPORTED)
   set_target_properties(tmlqcd::clime PROPERTIES INTERFACE_LINK_LIBRARIES
-                                                 "${TMLQCD_CLIME_LIBRARIES}")
+                                                 "${TM_CLIME_LIBRARIES}")
   set_target_properties(tmlqcd::clime PROPERTIES INTERFACE_INCLUDE_DIRECTORIES
-                                                 "${TMLQCD_CLIME_INCLUDE_DIRS}")
+                                                 "${TM_CLIME_INCLUDE_DIRS}")
 endif()
 
-set(TMLQCD_CLIME_FOUND ON)
-mark_as_advanced(TMLQCD_CLIME_FOUND TMLQCD_CLIME_LIBRARIES
-                 TMLQCD_CLIME_INCLUDE_DIRS)
+set(TM_CLIME_FOUND ON)
+mark_as_advanced(TM_CLIME_FOUND TM_CLIME_LIBRARIES
+                 TM_CLIME_INCLUDE_DIRS)
diff --git a/cmake/tmlqcd_config_internal.h.in b/cmake/tmlqcd_config_internal.h.in
index 145df156a..7c11d0446 100644
--- a/cmake/tmlqcd_config_internal.h.in
+++ b/cmake/tmlqcd_config_internal.h.in
@@ -79,7 +79,7 @@
 //#cmakedefine YYTEXT_POINTER
 
 /* Number of bits in a file offset, on hosts where this is settable. */
-#define TM_FILE_OFFSET_BITS @TMLQCD_FILE_OFFSET_BITS@
+#define TM_FILE_OFFSET_BITS @TM_FILE_OFFSET_BITS@
 
 /* Construct an extra copy of the gauge fields */
 #cmakedefine TM_USE_GAUGE_COPY
diff --git a/doc/install.tex b/doc/install.tex
index e4d86c2da..9d5e6f887 100644
--- a/doc/install.tex
+++ b/doc/install.tex
@@ -1,103 +1,136 @@
-The software ships with a GNU autoconf environment and a configure
-script, which will generate GNU Makefiles to build the programmes. It
-is supported and recommended to configure and build the executables in
-a separate build directory. This also allows to have several builds with
-different options from the same source code directory. 
+The software ships with a CMake environment, which will configure and build the
+programmes. It is recommended to configure and build the executables in a
+separate build directory. This also allows to have several builds with different
+options from the same source code directory.
 
 \subsection{Prerequisites}
 
-In order to compile the programmes the {\ttfamily
-  LAPACK}~\cite{lapack:web} library (fortran version) needs to be
-installed. In addition it must be known which linker options are
-needed to link against {\ttfamily LAPACK}, e.g. {\ttfamily
-  -Lpath-to-lapack -llapack  -lblas}. Also a the latest
-version (tested is version 1.2.3) of {\ttfamily
-  C-LIME}~\cite{lime:web} must be available, which is used as a
-packaging scheme to read and write gauge configurations and
-propagators to files.
+In order to compile the programmes the {\ttfamily LAPACK}~\cite{lapack:web}
+library (fortran version) needs to be installed. CMake will search for the
+library in all default directories. Also the latest version (tested is version
+1.2.3) of {\ttfamily C-LIME}~\cite{lime:web} must be available, which is used as
+a packaging scheme to read and write gauge configurations and propagators to
+files.
 
 \subsection{Configuring the hmc package}
 \label{sec:config}
 
-In order to get a simple configuration of the hmc package it is enough
-to just type 
-\begin{verbatim}
-path-to-src-code/configure   --with-lime=<path-to-lime> \
-     --with-lapack=<linker-flags> CC=<mycc> \
-     F77=<myf77> CFLAGS=<c-compiler flags>
-\end{verbatim}
-in the build directory. If 
-{\ttfamily CC, F77} and {\ttfamily CFLGAS} are not specified,
-{\ttfamily configure} will guess them.
-
-The code was successfully compiled and run at least on the following
-platforms: i686 and compatible, x64 and compatible, IBM Regatta
-systems, IBM Blue Gene/L, IBM Blue Gene/P, SGI Altix and SGI PC
-clusters, powerpc clusters.
-
-The configure script accepts certain options to influence the building
-procedure. One can get an overview over all supported options with
-{\ttfamily configure --help}. There are {\ttfamily enable|disable}
-options switching on and off optional features and {\ttfamily
-  with|without} switches usually related to optional packages. In the
-following we describe the most important of them (check {\ttfamily
-  configure --help} for the defaults and more options):
-
+The build system uses CMake to configure and build the hmc package. The
+following list gives all options (OFF by default unless specified):
 \begin{itemize}
-\item {\ttfamily --enable-mpi}:\\
-  This option switches on the support for MPI. On certain platforms it
-  automatically chooses the correct parallel compiler or searches for
-  a command {\ttfamily mpicc} in the search path.
-
-\item {\ttfamily --enable-gaugecopy}:\\
-  See section \ref{sec:dirac} for details on this option. It will
+\item {\ttfamily CMAKE\_POSITION\_INDEPENDENT\_CODE}: Build a position independent
+  code. ON by default.
+\item {\ttfamily BUILD\_SHARED\_LIBS}: Build the shared version of the hmc library.
+\item {\ttfamily TM\_USE\_FFTW}: Enable fftw support. 
+\item {\ttfamily TM\_USE\_CUDA}: Enable CUDA support.
+\item {\ttfamily TM\_USE\_HIP}: Enable HIP support (AMD or NVidia GPUs)
+\item {\ttfamily TM\_USE\_DDalphaAMG}: Enable DDalphaAMG support.
+\item {\ttfamily TM\_USE\_LEMON}: Use the lemon io library.
+\item {\ttfamily TM\_USE\_OMP}: Enable OpenMP ({\bf ON} by default)
+\item {\ttfamily TM\_FIXEDVOLUME}: Fix volume at compile time.
+\item {\ttfamily TM\_ENABLE\_ALIGNMENT}: Automatically or explicitly align arrays to a given
+  number of bytes. Allowed values: auto, none, 16, 32, 64.
+\item {\ttfamily TM\_USE\_GAUGE\_COPY}: Enable use of a copy of the gauge field (ON
+  by default). See section \ref{sec:dirac} for details on this option. It will
   increase the memory requirement of the code.
+\item {\ttfamily TM\_USE\_HALFSPINOR}: Use a Dirac Op. with halfspinor exchange (ON
+  by default); requires {\ttfamily TM\_USE\_GAUGE\_COPY=ON}. See sub-section \ref{sec:dirac} for details.
+\item {\ttfamily TM\_USE\_QUDA}: Enable QUDA support.
+\item {\ttfamily TM\_USE\_SHMEM}: Use shmem API.
+\item {\ttfamily TM\_ENABLE\_WARNINGS}: Enable all warnings (ON by default).
+\item {\ttfamily TM\_ENABLE\_TESTS}: Enable tests.
+\item {\ttfamily TM\_USE\_QPHIX}: Enable QPhiX.
+  \begin{itemize}
+  \item {\ttfamily TM\_QPHIX\_SOALEN}: QPhiX specific parameter (default is 4)
+  \item \textcolor{red}{{\ttfamily QPHIX\_DIR}}: Directory where QPhiX is installed.
+    The QPhiX current CMake build system does not export all information (
+    include and lib directories) that are needed to compile hmc.
+  \item \textcolor{red}{\ttfamily QMP\_DIR}: Directory where QMP is installed (
+    QPhiX dependency).
+    The QPhiX current CMake build system does not export all information about the
+    include and lib directories nor its dependencies (QMP in that case).
+  \end{itemize}
+\item {\ttfamily TM\_USE\_MPI}: Enable MPI support.
+  \begin{itemize}
+  \item {\ttfamily TM\_PERSISTENT\_MPI}: Use persistent MPI calls for halfspinor.
+  \item {\ttfamily TM\_NONBLOCKING\_MPI}: Use non-blocking MPI calls for spinor and
+    gauge.
+  \item {\ttfamily TM\_MPI\_DIMENSION}: Use $n$ dimensional parallelisation ($XYZT$)
+    [default=4]. The number of parallel directions can be specified. $1, 2, 3$ and $4$
+    dimensional parallelisation is supported.
+  \item {\ttfamily TM\_USE\_LEMON}: Use the lemon io library.
+  \end{itemize}
+\end{itemize}
 
-\item {\ttfamily --enable-halfspinor}:\\
-  If this option is enabled the Dirac operator using half spinor
-  fields is used. See sub-section \ref{sec:dirac} for details. If this
-  feature is switched on, also the gauge copy feature is switched
-  on automatically. 
-
-%\item {\ttfamily --enable-shmem}:\\
-%  Use shared memory API instead of MPI for the communication of spinor
-%  fields. This is currently only usable on the Munich Altix machine.
-
-\item {\ttfamily --with-mpidimension=n}:\\
-  This option has only effect if the preceding one is switched
-  on. The number of parallel directions can be specified. 1,2,3 and 4
-  dimensional parallelisation is supported.
-
-\item {\ttfamily --with-lapack="<linker flags>"}:\\
-  the code requires lapack to be linked. All linker flags necessary
-  to do so must be specified here. Note, that {\ttfamily LIBS="..."}
-  works similar.
+The following minimal list of commands will configure and build the hmc package with
+minimal dependencies
+\begin{verbatim}
+mkdir build
+cd build
+cmake -DCMAKE_INSTALL_PREFIX=/my_path -DCMAKE_PREFIX_PATH=/my_c_line_path ..
+make -j
+make install
+\end{verbatim}
 
-\item {\ttfamily --with-limedir=<dir>}:\\
-  Tells configure where to find the lime package, which is required for
-  the build of the HMC. It is used for the ILDG file format.
- 
-\end{itemize}
+These instructions assume that the {\ttfamily c-lime} package is installed in {\ttfamily
+  /my\_c\_line\_path}. The {\ttfamily CMAKE\_PREFIX\_PATH} variable is a
+semicolon-separated list containing the installation prefixes of all
+dependencies.
 
-The configure script will guess at the very beginning on which
-platform the build is done. In case this fails or a cross compilation
-must be performed please use the option {\ttfamily --host=HOST}. For
-instance in order to compile for the BG/P one needs to specify
-{\ttfamily --host=ppc-ibm-bprts --build=ppc64-ibm-linux}. 
+Adding {\ttfamily -DTM\_USE\_MPI=ON} will enable MPI support with parallelization
+over spatial and temporal dimensions. The command line is then
+\begin{verbatim}
+cmake -DCMAKE_INSTALL_PREFIX=/my_path -DCMAKE_PREFIX_PATH=/my_c_line_path -DTM_USE_MPI=ON ..
+\end{verbatim}
+We can combine it with the lemon-io library (installed in {\ttfamily /my\_lemon\_path})
+\begin{verbatim}
+cmake -DCMAKE_INSTALL_PREFIX=/my_path \
+      -DCMAKE_PREFIX_PATH="/my_c_line_path;/my_lemon_path" \
+      -DTM_USE_MPI=ON \
+      -DTM_USE_LEMON=ON ..
+\end{verbatim}
 
-For certain architectures like the Blue Gene systems there are
-{\ttfamily README.arch} files in the top source directory with
-example configure calls.
+{\ttfamily QUDA} support (installed in {\ttfamily /my\_quda\_path}) can be added with
+\begin{verbatim}
+cmake -DCMAKE_INSTALL_PREFIX=/my_path \
+      -DCMAKE_PREFIX_PATH="/my_c_line_path;/my_lemon_path;/my_quda_path" \
+      -DTM_USE_MPI=ON \
+      -DTM_USE_LEMON=ON \
+      -DTM_USE_QUDA=ON \
+      -DTM_USE_CUDA=ON \
+      -DCMAKE_CUDA_ARCHITECTURES=90 ..
+\end{verbatim}
+Note that the command assumes that QUDA is compiled with CUDA support. AMD GPUs
+are also supported after replacing {\ttfamily -DTM\_USE\_CUDA=ON} with
+{\ttfamily -DTM\_USE\_HIP=ON} and compiling {\ttfamily QUDA} with {\ttfamily
+  HIP} support. The {\ttfamily ROCM} architecture is defined by the variable
+{\ttfamily CMAKE\_HIP\_ARCHITECTURES=gfxxxx}.
 
-\subsection{Building and Installing}
+{\ttfamily QPhiX} and/or {\ttfamily DDalphaAMG} support can be added with
+\begin{verbatim}
+cmake -DCMAKE_INSTALL_PREFIX=/my_path \
+      -DCMAKE_PREFIX_PATH="/my_c_line_path;/my_lemon_path;/my_quda_path;/my_path_ddalphaamg" \
+      -DTM_USE_MPI=ON \
+      -DTM_USE_LEMON=ON \
+      -DTM_USE_QUDA=ON \
+      -DTM_USE_CUDA=ON \
+      -DCMAKE_CUDA_ARCHITECTURES=90 \
+      -DTM_USE_QPHIX=ON \
+      -DQPHIX_DIR=/my_qphix_dir \
+      -DTM_USE_DDalphaAMG=ON \
+      -DQMP_DIR=/my_qmp_dir \
+      -DTM_USE_OMP=ON ..
+\end{verbatim}
+{\ttfamily QPhiX} cmake config support is incomplete and requires both the {\ttfamily QPhiX}
+and {\ttfamily QMP} installation directories to work properly.
 
-After successfully configuring the package the code can be build by
-simply typing {\ttfamily make} in the build directory. This will
-compile the standard executables. Typing {\ttfamily make install} will
-copy these executables into the install directory. The default install
-directory is {\ttfamily \$HOME/bin}, which can be influenced e.g. with
-the {\ttfamily --prefix} option to {\ttfamily configure}. 
+CMake has several relevant specific options that control the build. Compiler
+options are defined by the variables {\ttfamily CMAKE\_C\_FLAGS} and {\ttfamily
+  CMAKE\_CXX\_FLAGS}. CUDA and HIP compilation options are controlled by their
+equivalent {\ttfamily CMAKE\_\{CUDA/HIP\}\_FLAGS}.
 
+Adding for instance {\ttfamily -GNinja} to the {\ttfamily CMake} command line will use
+{\ttfamily ninja} instead of {\ttfamily make}.
 
 %%% Local Variables: 
 %%% mode: latex
diff --git a/install-sh b/install-sh
deleted file mode 100644
index e69de29bb..000000000
diff --git a/quda_gauge_paths.inc b/quda_gauge_paths.inc
deleted file mode 100644
index d2c898e6c..000000000
--- a/quda_gauge_paths.inc
+++ /dev/null
@@ -1,158 +0,0 @@
-/***********************************************************************
- *
- * Copyright (C) 2021 Bartosz Kostrzewa, Ferenc Pittler, Simone Bacchio
- *
- * This file is part of tmLQCD.
- *
- * tmLQCD is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * tmLQCD is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
- *
- *
- ***********************************************************************/
-
-const int plaq_rect_length[24] = {
-    3, 3, 3, 3, 3, 3,
-    5, 5, 5, 5, 5, 5,
-    5, 5, 5, 5, 5, 5,
-    5, 5, 5, 5, 5, 5,
-  };
-
-const int plaq_rect_path[4][24][5] = {
-    { {1, 7, 6 },
-      {6, 7, 1 },
-      {2, 7, 5 },
-      {5, 7, 2 },
-      {3, 7, 4 },
-      {4, 7, 3 }, 
-      {1, 1, 7, 6, 6 },
-      {6, 6, 7, 1, 1 },
-      {2, 2, 7, 5, 5 },
-      {5, 5, 7, 2, 2 },
-      {3, 3, 7, 4, 4 },
-      {4, 4, 7, 3, 3 },
-      {0, 1, 7, 7, 6 },
-      {6, 7, 7, 1, 0 },
-      {0, 2, 7, 7, 5 },
-      {5, 7, 7, 2, 0 },
-      {0, 3, 7, 7, 4 },
-      {4, 7, 7, 3, 0 },
-      {0, 4, 7, 7, 3 },
-      {3, 7, 7, 4, 0 },
-      {0, 5, 7, 7, 2 },
-      {2, 7, 7, 5, 0 },
-      {0, 6, 7, 7, 1 },
-      {1, 7, 7, 6, 0 } },
-    { { 2, 6, 5 },
-      { 5, 6, 2 },
-      { 3, 6, 4 },
-      { 4, 6, 3 },
-      { 0, 6, 7 },
-      { 7, 6, 0 },
-      { 1, 2, 6, 6, 5 },
-      { 2, 6, 6, 5, 1 },
-      { 5, 6, 6, 2, 1 },
-      { 1, 5, 6, 6, 2 },
-      { 1, 3, 6, 6, 4 },
-      { 3, 6, 6, 4, 1 },
-      { 4, 6, 6, 3, 1 },
-      { 1, 4, 6, 6, 3 },
-      { 1, 0, 6, 6, 7 },
-      { 0, 6, 6, 7, 1 },
-      { 7, 6, 6, 0, 1 },
-      { 1, 7, 6, 6, 0 },
-      { 5, 5, 6, 2, 2 },
-      { 2, 2, 6, 5, 5 },
-      { 4, 4, 6, 3, 3 },
-      { 3, 3, 6, 4, 4 },
-      { 7, 7, 6, 0, 0 },
-      { 0, 0, 6, 7, 7 } },
-    { {3, 5, 4},
-      {4, 5, 3},
-      {0, 5, 7},
-      {7, 5, 0},
-      {1, 5, 6},
-      {6, 5, 1},
-      {2, 3, 5, 5, 4},
-      {3, 5, 5, 4, 2}, 
-      {4, 5, 5, 3, 2}, 
-      {2, 4, 5, 5, 3}, 
-      {2, 0, 5, 5, 7}, 
-      {0, 5, 5, 7, 2}, 
-      {7, 5, 5, 0, 2}, 
-      {2, 7, 5, 5, 0},
-      {2, 1, 5, 5, 6}, 
-      {1, 5, 5, 6, 2}, 
-      {6, 5, 5, 1, 2}, 
-      {2, 6, 5, 5, 1}, 
-      {4, 4, 5, 3, 3}, 
-      {3, 3, 5, 4, 4}, 
-      {7, 7, 5, 0, 0},
-      {0, 0, 5, 7, 7}, 
-      {6, 6, 5, 1, 1}, 
-      {1, 1, 5, 6, 6} }, 
-    { { 0, 4, 7 },
-      { 7, 4, 0 },
-      { 1, 4, 6 },
-      { 6, 4, 1 },
-      { 2, 4, 5 },
-      { 5, 4, 2 },
-      { 3, 0, 4, 4, 7 },
-      { 0, 4, 4, 7, 3 },
-      { 7, 4, 4, 0, 3 },
-      { 3, 7, 4, 4, 0 },
-      { 3, 1, 4, 4, 6 },
-      { 1, 4, 4, 6, 3 },
-      { 6, 4, 4, 1, 3 },
-      { 3, 6, 4, 4, 1 },
-      { 3, 2, 4, 4, 5 },
-      { 2, 4, 4, 5, 3 },
-      { 5, 4, 4, 2, 3 },
-      { 3, 5, 4, 4, 2 },
-      { 7, 7, 4, 0, 0 },
-      { 0, 0, 4, 7, 7 },
-      { 6, 6, 4, 1, 1 },
-      { 1, 1, 4, 6, 6 },
-      { 5, 5, 4, 2, 2 },
-      { 2, 2, 4, 5, 5 } } 
-  };
-
-const int plaq_length[] = {
-    3, 3, 3, 3, 3, 3 };
-
-const int plaq_path[4][6][3] = {
-    { { 1, 7, 6 },
-      { 6, 7, 1 },
-      { 2, 7, 5 },
-      { 5, 7, 2 },
-      { 3, 7, 4 },
-      { 4, 7, 3 } },
-    { { 2, 6, 5 },
-      { 5, 6, 2 },
-      { 3, 6, 4 },
-      { 4, 6, 3 },
-      { 0, 6, 7 },
-      { 7, 6, 0 } },
-    { { 3, 5, 4},
-      { 4, 5, 3},
-      { 0, 5, 7},
-      { 7, 5, 0},
-      { 1, 5, 6},
-      { 6, 5, 1} },
-    { { 0, 4, 7 },
-      { 7, 4, 0 },
-      { 1, 4, 6 },
-      { 6, 4, 1 },
-      { 2, 4, 5 },
-      { 5, 4, 2 } } 
-  };
-

From 2ec6afee353cfb8a7100dcaff3f5c192d45f5c0d Mon Sep 17 00:00:00 2001
From: Mathieu Taillefumier <mathieu.taillefumier@free.fr>
Date: Tue, 24 Feb 2026 09:58:58 +0100
Subject: [PATCH 15/80] Add alignment detection at configuration time

---
 CMakeLists.txt                     |  10 +-
 cmake/DetectSimdAndAlignment.cmake | 288 +++++++++++++++++++++++++++++
 2 files changed, 295 insertions(+), 3 deletions(-)
 create mode 100644 cmake/DetectSimdAndAlignment.cmake

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d363e407c..82880ef60 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -146,21 +146,25 @@ set(ALIGN32 " ")
 
 message("${TM_ENABLE_ALIGNMENT}")
 if(${TM_ENABLE_ALIGNMENT} STREQUAL "auto")
+  include(cmake/DetectSimdAndAlignment.cmake)
+  message(STATUS "SIMD: ${SIMD_LEVEL} (${SIMD_ARCH_FAMILY}), align=${SIMD_ALIGNMENT}")
+endif()
+if (${TM_ENABLE_ALIGNMENT} STREQUAL "none")
   set(ALIGN_BASE "0x00")
   set(ALIGN " ")
   set(ALIGN_BASE32 "0x00")
   set(ALIGN32 " ")
-elseif(TM_ENABLE_ALIGNMENT EQUAL 16)
+elseif(("${TM_ENABLE_ALIGNMENT}" STREQUAL "16") OR ("${TM_ENABLE_ALIGNMENT}" STREQUAL "auto" AND "${SIMD_ALIGNMENT}" EQUAL 16))
   set(ALIGN_BASE "0x0F")
   set(ALIGN "__attribute__ ((aligned (16)))")
   set(ALIGN_BASE32 "0x0F")
   set(ALIGN32 "__attribute__ ((aligned (16)))")
-elseif(TM_ENABLE_ALIGNMENT EQUAL 32)
+elseif(("${TM_ENABLE_ALIGNMENT}" STREQUAL "32") OR ("${TM_ENABLE_ALIGNMENT}" STREQUAL "auto" AND "${SIMD_ALIGNMENT}" EQUAL 32))
   set(ALIGN_BASE "0x2F")
   set(ALIGN "__attribute__ ((aligned (32)))")
   set(ALIGN_BASE32 "0x2F")
   set(ALIGN32 "__attribute__ ((aligned (32)))")
-elseif(TM_ENABLE_ALIGNMENT EQUAL 64)
+elseif(("${TM_ENABLE_ALIGNMENT}" STREQUAL "64") OR ("${TM_ENABLE_ALIGNMENT}" STREQUAL "auto" AND "${SIMD_ALIGNMENT}" EQUAL 64))
   set(ALIGN_BASE "0x3F")
   set(ALIGN "__attribute__ ((aligned (64)))")
   set(ALIGN_BASE32 "0x3F")
diff --git a/cmake/DetectSimdAndAlignment.cmake b/cmake/DetectSimdAndAlignment.cmake
new file mode 100644
index 000000000..707b9b65b
--- /dev/null
+++ b/cmake/DetectSimdAndAlignment.cmake
@@ -0,0 +1,288 @@
+# DetectSimdAndAlignment.cmake
+#
+# Detect SIMD architecture family, SIMD level and a reasonable alignment value.
+#
+# Exposed cache variables:
+#   SIMD_ARCH_FAMILY : x86 / ARM / PPC / NVIDIA / UNKNOWN
+#   SIMD_LEVEL       : AVX512 / AVX2 / SSE2 / SVE / NEON_* / POWER10 / VSX_* /
+#                      ALTIVEC / CUDA_SM89 / SCALAR
+#   SIMD_ALIGNMENT   : integer, in bytes (16, 32, 64, ...)
+#   SIMD_HAS_FLOAT   : ON/OFF, single-precision SIMD support
+#   SIMD_HAS_DOUBLE  : ON/OFF, double-precision SIMD support
+#
+# Usage:
+#   include(cmake/DetectSimdAndAlignment.cmake)
+#   message(STATUS "SIMD: ${SIMD_ARCH_FAMILY} ${SIMD_LEVEL}, alignment=${SIMD_ALIGNMENT}")
+#
+#   # Example: propagate as defines
+#   target_compile_definitions(my_target PRIVATE
+#       SIMD_ALIGNMENT=${SIMD_ALIGNMENT}
+#       SIMD_LEVEL_${SIMD_LEVEL}
+#   )
+# DetectSimdAndAlignment.cmake - COMPLETE: x86 + ARM NEON + NVIDIA + PowerPC
+
+
+include_guard(GLOBAL) #
+
+include(CheckCXXSourceCompiles)
+include(CheckCXXSourceRuns) # For runtime CPU detection fallback
+
+# ------------------------------
+# 1. Detect architecture family
+# ------------------------------
+if(NOT DEFINED SIMD_ARCH_FAMILY)
+    string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" _simd_proc)
+
+    if(_simd_proc MATCHES "x86_64|amd64|i[3-6]86")
+        set(_detected_arch "x86")
+    elseif(_simd_proc MATCHES "armv[0-9]+|aarch64|arm64")
+        set(_detected_arch "ARM")
+    elseif(_simd_proc MATCHES "ppc64(le|el)?|powerpc|ppc")
+        set(_detected_arch "PPC")
+    elseif(_simd_proc MATCHES "nvcl|sm_89|sm_90")
+        set(_detected_arch "NVIDIA")
+    else()
+        set(_detected_arch "UNKNOWN")
+    endif()
+
+    set(SIMD_ARCH_FAMILY "${_detected_arch}" CACHE STRING "SIMD architecture family")
+endif()
+
+# Defaults
+set(SIMD_LEVEL "SCALAR" CACHE STRING "Detected SIMD level")
+set(SIMD_ALIGNMENT 16 CACHE STRING "Alignment in bytes")
+set(SIMD_HAS_FLOAT ON CACHE BOOL "Float SIMD support")
+set(SIMD_HAS_DOUBLE ON CACHE BOOL "Double SIMD support")
+
+# Save/restore flags helper
+set(_SIMD_SAVED_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS}")
+macro(_simd_restore_flags)
+    if(DEFINED _SIMD_SAVED_REQUIRED_FLAGS)
+        set(CMAKE_REQUIRED_FLAGS "${_SIMD_SAVED_REQUIRED_FLAGS}")
+    endif()
+endmacro()
+
+# ------------------------------------------------
+# 2. x86: SSE2 → AVX2 → AVX512
+# ------------------------------------------------
+if(SIMD_ARCH_FAMILY STREQUAL "x86")
+    # AVX512 double (64-byte)
+    set(CMAKE_REQUIRED_FLAGS "${_SIMD_SAVED_REQUIRED_FLAGS} -mavx512f -mavx512dq")
+    check_cxx_source_compiles("
+        #include <immintrin.h>
+        int main() { __m512d v = _mm512_set1_pd(1.0); (void)v; return 0; }
+    " _HAVE_AVX512_DOUBLE)
+
+    if(_HAVE_AVX512_DOUBLE)
+        set(SIMD_LEVEL "AVX512" CACHE STRING "" FORCE)
+        set(SIMD_ALIGNMENT 64 CACHE STRING "" FORCE)
+        _simd_restore_flags()
+        return()
+    endif()
+
+    # AVX2 double (32-byte)
+    set(CMAKE_REQUIRED_FLAGS "${_SIMD_SAVED_REQUIRED_FLAGS} -mavx2")
+    check_cxx_source_compiles("
+        #include <immintrin.h>
+        int main() { __m256d v = _mm256_set1_pd(1.0); (void)v; return 0; }
+    " _HAVE_AVX2_DOUBLE)
+
+    if(_HAVE_AVX2_DOUBLE)
+        set(SIMD_LEVEL "AVX2" CACHE STRING "" FORCE)
+        set(SIMD_ALIGNMENT 32 CACHE STRING "" FORCE)
+        _simd_restore_flags()
+        return()
+    endif()
+
+    # SSE2 double minimum (16-byte)
+    set(CMAKE_REQUIRED_FLAGS "${_SIMD_SAVED_REQUIRED_FLAGS} -msse2")
+    check_cxx_source_compiles("
+        #include <emmintrin.h>
+        int main() { __m128d v = _mm_set1_pd(1.0); (void)v; return 0; }
+    " _HAVE_SSE2_DOUBLE)
+
+    if(_HAVE_SSE2_DOUBLE)
+        set(SIMD_LEVEL "SSE2" CACHE STRING "" FORCE)
+        set(SIMD_ALIGNMENT 16 CACHE STRING "" FORCE)
+        _simd_restore_flags()
+        return()
+    endif()
+
+# --------------------------------------
+# 3. ARM NEON - ALL FAMILIES
+# --------------------------------------
+elseif(SIMD_ARCH_FAMILY STREQUAL "ARM")
+    string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" _arm_proc)
+
+    # AArch64 + SVE
+    if(_arm_proc MATCHES "aarch64|arm64")
+        set(CMAKE_REQUIRED_FLAGS "${_SIMD_SAVED_REQUIRED_FLAGS} -march=armv8-a+sve")
+        check_cxx_source_compiles("
+            #include <arm_sve.h>
+            int main() { svfloat32_t v = svdup_f32(1.0f); (void)v; return 0; }
+        " _HAVE_SVE)
+
+        if(_HAVE_SVE)
+            set(SIMD_LEVEL "SVE" CACHE STRING "" FORCE)
+            set(SIMD_ALIGNMENT 16 CACHE STRING "" FORCE)
+            _simd_restore_flags()
+            return()
+        endif()
+        set(CMAKE_REQUIRED_FLAGS "${_SIMD_SAVED_REQUIRED_FLAGS}")
+        # AArch64 NEON (double safe), probed without the SVE -march flag
+        check_cxx_source_compiles("
+            #include <arm_neon.h>
+            int main() {
+                float64x2_t vd = vdupq_n_f64(1.0);
+                float32x4_t vf = vdupq_n_f32(1.0f);
+                (void)vd; (void)vf; return 0;
+            }" _HAVE_NEON_AARCH64)
+
+        if(_HAVE_NEON_AARCH64)
+            set(SIMD_LEVEL "NEON_AARCH64" CACHE STRING "" FORCE)
+            set(SIMD_ALIGNMENT 16 CACHE STRING "" FORCE)
+            _simd_restore_flags()
+            return()
+        endif()
+
+    # ARMv8 32-bit
+    elseif(_arm_proc MATCHES "armv8")
+        set(CMAKE_REQUIRED_FLAGS "${_SIMD_SAVED_REQUIRED_FLAGS} -march=armv8-a+simd")
+        check_cxx_source_compiles("
+            #include <arm_neon.h>
+            int main() { float32x4_t v = vdupq_n_f32(1.0f); (void)v; return 0; }
+        " _HAVE_ARMv8_NEON)
+
+        if(_HAVE_ARMv8_NEON)
+            set(SIMD_LEVEL "NEON_ARMv8" CACHE STRING "" FORCE)
+            set(SIMD_ALIGNMENT 16 CACHE STRING "" FORCE)
+            set(SIMD_HAS_DOUBLE OFF CACHE BOOL "" FORCE)
+            _simd_restore_flags()
+            return()
+        endif()
+
+    # ARMv7 NEON
+    elseif(_arm_proc MATCHES "armv7")
+        set(CMAKE_REQUIRED_FLAGS "${_SIMD_SAVED_REQUIRED_FLAGS} -mfpu=neon -march=armv7-a")
+        check_cxx_source_compiles("
+            #include <arm_neon.h>
+            int main() { float32x4_t v = vdupq_n_f32(1.0f); (void)v; return 0; }
+        " _HAVE_ARMv7_NEON)
+
+        if(_HAVE_ARMv7_NEON)
+            set(SIMD_LEVEL "NEON_ARMv7" CACHE STRING "" FORCE)
+            set(SIMD_ALIGNMENT 16 CACHE STRING "" FORCE)
+            set(SIMD_HAS_DOUBLE OFF CACHE BOOL "" FORCE)
+            _simd_restore_flags()
+            return()
+        endif()
+    endif()
+
+# --------------------------------------
+# 4. POWERPC - COMPLETE COVERAGE (NEW!)
+# --------------------------------------
+elseif(SIMD_ARCH_FAMILY STREQUAL "PPC")
+
+    string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" _ppc_proc)
+
+    # === Power10 (VSX registers are 128-bit; probes -mcpu=power10 support)
+    # Note: Power10 needs -mcpu=power10 or -mtune=power10
+    set(CMAKE_REQUIRED_FLAGS "${_SIMD_SAVED_REQUIRED_FLAGS} -mcpu=power10")
+    check_cxx_source_compiles("
+        #include <altivec.h>
+        int main() {
+            vector double vd = {1.0,1.0};  // 128-bit VSX double
+            vector float vf = {1.0f,1.0f,1.0f,1.0f};  // 128-bit VSX float
+            (void)vd; (void)vf; return 0;
+        }" _HAVE_POWER10)
+
+    if(_HAVE_POWER10)
+        set(SIMD_LEVEL "POWER10" CACHE STRING "" FORCE)
+        set(SIMD_ALIGNMENT 16 CACHE STRING "" FORCE)  # 128-bit = 16 bytes
+        set(SIMD_HAS_FLOAT ON CACHE BOOL "" FORCE)
+        set(SIMD_HAS_DOUBLE ON CACHE BOOL "" FORCE)
+        _simd_restore_flags()
+        return()
+    endif()
+
+    # === Power9 VSX (128-bit registers; probes -mcpu=power9 support)
+    set(CMAKE_REQUIRED_FLAGS "${_SIMD_SAVED_REQUIRED_FLAGS} -mcpu=power9 -mvsx")
+    check_cxx_source_compiles("
+        #include <altivec.h>
+        int main() {
+            vector double vd = {1.0,1.0};  // 128-bit VSX double
+            vector float vf = {1.0f,1.0f,1.0f,1.0f};  // 128-bit VSX float
+            (void)vd; (void)vf; return 0;
+        }" _HAVE_VSX_POWER9)
+
+    if(_HAVE_VSX_POWER9)
+        set(SIMD_LEVEL "VSX_POWER9" CACHE STRING "" FORCE)
+        set(SIMD_ALIGNMENT 16 CACHE STRING "" FORCE)  # 128-bit = 16 bytes
+        set(SIMD_HAS_FLOAT ON CACHE BOOL "" FORCE)
+        set(SIMD_HAS_DOUBLE ON CACHE BOOL "" FORCE)
+        _simd_restore_flags()
+        return()
+    endif()
+
+    # === Power7+ VSX (128-bit double, POWER7+)
+    set(CMAKE_REQUIRED_FLAGS "${_SIMD_SAVED_REQUIRED_FLAGS} -mcpu=power7 -mvsx")
+    check_cxx_source_compiles("
+        #include <altivec.h>
+        int main() {
+            vector double vd = {1.0,1.0};  // VSX 128-bit double
+            (void)vd; return 0;
+        }" _HAVE_VSX_POWER7)
+
+    if(_HAVE_VSX_POWER7)
+        set(SIMD_LEVEL "VSX_POWER7" CACHE STRING "" FORCE)
+        set(SIMD_ALIGNMENT 16 CACHE STRING "" FORCE)
+        set(SIMD_HAS_FLOAT ON CACHE BOOL "" FORCE)
+        set(SIMD_HAS_DOUBLE ON CACHE BOOL "" FORCE)
+        _simd_restore_flags()
+        return()
+    endif()
+
+    # === Classic AltiVec/VMX (PowerPC baseline, 128-bit)
+    set(CMAKE_REQUIRED_FLAGS "${_SIMD_SAVED_REQUIRED_FLAGS} -maltivec -mabi=altivec")
+    check_cxx_source_compiles("
+        #include <altivec.h>
+        int main() {
+            vector float vf = (vector float){1.0f,1.0f,1.0f,1.0f};
+            (void)vf; return 0;
+        }" _HAVE_ALTIVEC)
+
+    if(_HAVE_ALTIVEC)
+        set(SIMD_LEVEL "ALTIVEC" CACHE STRING "" FORCE)
+        set(SIMD_ALIGNMENT 16 CACHE STRING "" FORCE)
+        set(SIMD_HAS_FLOAT ON CACHE BOOL "" FORCE)
+        set(SIMD_HAS_DOUBLE OFF CACHE BOOL "" FORCE)  # AltiVec: float primary
+        _simd_restore_flags()
+        return()
+    endif()
+
+# --------------------------------------
+# 5. NVIDIA (probes sm_89; note that GH200/Hopper itself is sm_90)
+# --------------------------------------
+elseif(SIMD_ARCH_FAMILY STREQUAL "NVIDIA")
+    set(CMAKE_REQUIRED_FLAGS "${_SIMD_SAVED_REQUIRED_FLAGS} --gpu-arch=sm_89")
+    check_cxx_source_compiles("
+        #include <cuda_runtime.h>
+        int main() { double d = 1.0; (void)d; return 0; }
+    " _HAVE_CUDA_SM89)
+
+    if(_HAVE_CUDA_SM89)
+        set(SIMD_LEVEL "CUDA_SM89" CACHE STRING "" FORCE)
+        set(SIMD_ALIGNMENT 16 CACHE STRING "" FORCE)
+        _simd_restore_flags()
+        return()
+    endif()
+
+# --------------------------------------
+# 6. Fallback
+# --------------------------------------
+else()
+    _simd_restore_flags()
+    return()
+endif()
+
+_simd_restore_flags()

From 174add7c8183809707ff7ea3537ed70b3be0543a Mon Sep 17 00:00:00 2001
From: Mathieu Taillefumier <mathieu.taillefumier@free.fr>
Date: Tue, 24 Feb 2026 17:02:41 +0100
Subject: [PATCH 16/80] Build DDalphaAMG automatically when DDalphaAMG is
 enabled

---
 .github/workflows/ddalphaamg-build.yaml |   4 +-
 CMakeLists.txt                          |  90 +++++++++-
 DDalphaAMG/CMakeLists.txt               | 216 ++++++++++++++++++++++++
 cmake/DDalphaAMG-Config.cmake.in        |  52 ++++++
 cmake/tmlQCD-config.cmake.in            |  80 +++++++++
 src/lib/CMakeLists.txt                  |  27 ++-
 6 files changed, 455 insertions(+), 14 deletions(-)
 create mode 100644 DDalphaAMG/CMakeLists.txt
 create mode 100644 cmake/DDalphaAMG-Config.cmake.in
 create mode 100644 cmake/tmlQCD-config.cmake.in

diff --git a/.github/workflows/ddalphaamg-build.yaml b/.github/workflows/ddalphaamg-build.yaml
index 509fb28b6..0d475e219 100644
--- a/.github/workflows/ddalphaamg-build.yaml
+++ b/.github/workflows/ddalphaamg-build.yaml
@@ -111,8 +111,8 @@ jobs:
         run: |
           CC=mpicc CXX=mpicxx \
             LDFLAGS="-fopenmp" \
-            CFLAGS="-O2 -mtune=haswell -march=haswell -mavx2 -mfma -DOMPI_SKIP_MPICXX -fopenmp" \
-            CXXFLAGS="-O2 -mtune=haswell -march=haswell -mavx2 -mfma -DOMPI_SKIP_MPICXX -fopenmp" \
+            CFLAGS="-O3 -ffast-math -mtune=haswell -march=haswell -mavx2 -mfma -DOMPI_SKIP_MPICXX -fopenmp" \
+            CXXFLAGS="-O3 -ffast-math -mtune=haswell -march=haswell -mavx2 -mfma -DOMPI_SKIP_MPICXX -fopenmp" \
             cmake -DCMAKE_PREFIX_PATH="${{github.workspace}}/lime/build/install_dir;${{github.workspace}}/lemon/build/install_dir;${{github.workspace}}/ddalphaamg" \
             -DTM_USE_MPI=ON \
             -DTM_USE_OMP=ON \
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 82880ef60..75bac17b9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -24,6 +24,8 @@ endif()
 # PROJECT AND VERSION
 include(CMakeDependentOption)
 include(GNUInstallDirs)
+include(FetchContent)
+include(CMakePackageConfigHelpers)
 
 cmake_policy(SET CMP0048 NEW)
 
@@ -100,7 +102,9 @@ option(TM_USE_SHMEM "Use shmem API" OFF)
 option(TM_USE_QUDA "Enable QUDA support" OFF)
 option(TM_ENABLE_WARNINGS "Enable all warnings" ON)
 option(TM_ENABLE_TESTS "Enable tests" OFF)
-set(TM_QPHIX_SOALEN "4" CACHE STRING "QPhiX specific parameter")
+set(TM_QPHIX_SOALEN
+    "4"
+    CACHE STRING "QPhiX specific parameter")
 # MPI dependent options
 cmake_dependent_option(
   TM_PERSISTENT_MPI "Use persistent MPI calls for halfspinor [default=no]" OFF
@@ -126,6 +130,44 @@ cmake_dependent_option(TM_USE_LEMON "Use the lemon io library" OFF "TM_USE_MPI"
 cmake_dependent_option(TM_USE_NVHPC "Enable Nvidia HPC toolkit" OFF
                        "TM_USE_CUDA" OFF)
 
+# DDalphaAMG specific options (only available when TM_USE_DDalphaAMG is ON)
+
+cmake_dependent_option(
+  DDalphaAMG_ENABLE_PARAMOUNT_OUTPUT "Enable paramount output support" ON
+  "TM_USE_DDalphaAMG" OFF)
+
+cmake_dependent_option(DDalphaAMG_ENABLE_FGMRES_RESTEST "Enable GMRES test" OFF
+                       "TM_USE_DDalphaAMG" OFF)
+
+cmake_dependent_option(
+  DDalphaAMG_ENABLE_PROFILING "Enable profiling support" OFF
+  "TM_USE_DDalphaAMG" OFF)
+
+cmake_dependent_option(DDalphaAMG_ENABLE_TRACK_RES "Enable track res support"
+                       ON "TM_USE_DDalphaAMG" OFF)
+
+cmake_dependent_option(
+  DDalphaAMG_ENABLE_SINGLE_ALLREDUCE_ARNOLDI
+  "Enable single allreduce Arnoldi support" OFF "TM_USE_DDalphaAMG" OFF)
+
+cmake_dependent_option(
+  DDalphaAMG_ENABLE_COARSE_RES "Enable coarse res support" OFF
+  "TM_USE_DDalphaAMG" OFF)
+
+cmake_dependent_option(
+  DDalphaAMG_ENABLE_SCHWARZ_RES "Enable Schwarz res support" OFF
+  "TM_USE_DDalphaAMG" OFF)
+
+cmake_dependent_option(DDalphaAMG_ENABLE_OMP "Enable OpenMP support" ON
+                       "TM_USE_DDalphaAMG" OFF)
+
+cmake_dependent_option(
+  DDalphaAMG_ENABLE_TESTVECTOR_ANALYSIS "Enable vector analysis support" OFF
+  "TM_USE_DDalphaAMG" OFF)
+
+cmake_dependent_option(DDalphaAMG_ENABLE_HDF5 "Enable HDF5 support" OFF
+                       "TM_USE_DDalphaAMG" OFF)
+
 # search for blas and lapack
 find_package(BLAS REQUIRED)
 #
@@ -144,12 +186,16 @@ set(ALIGN_BASE "0")
 set(ALIGN_BASE32 "0")
 set(ALIGN32 " ")
 
-message("${TM_ENABLE_ALIGNMENT}")
+# DO NOT MERGE the two if statements as otherwise the automatic alignment will
+# not be taken into account
+
 if(${TM_ENABLE_ALIGNMENT} STREQUAL "auto")
   include(cmake/DetectSimdAndAlignment.cmake)
-  message(STATUS "SIMD: ${SIMD_LEVEL} (${SIMD_ARCH_FAMILY}), align=${SIMD_ALIGNMENT}")
+  message(
+    STATUS "SIMD: ${SIMD_LEVEL} (${SIMD_ARCH_FAMILY}), align=${SIMD_ALIGNMENT}")
 endif()
-if (${TM_ENABLE_ALIGNMENT} STREQUAL "none")
+
+if(${TM_ENABLE_ALIGNMENT} STREQUAL "none")
   set(ALIGN_BASE "0x00")
   set(ALIGN " ")
   set(ALIGN_BASE32 "0x00")
@@ -195,7 +241,11 @@ if(TM_USE_HDF5)
 endif()
 
 if(TM_USE_LEMON)
-  find_package(lemon REQUIRED)
+  FetchContent_Declare(lemon
+    GIT_REPOSITORY https://github.com/etmc/lemon
+    GIT_TAG        187de3435d604251e078eb083016131f035d6a51
+    FIND_PACKAGE_ARGS NAMES lemon)
+  FetchContent_MakeAvailable(lemon)
 endif()
 
 find_package(CLime REQUIRED)
@@ -255,7 +305,7 @@ if(TM_USE_FFTW)
 endif()
 
 if(TM_USE_DDalphaAMG)
-  find_package(DDalphaAMG REQUIRED)
+  add_subdirectory(DDalphaAMG)
 endif()
 
 if(TM_ENABLE_WARNINGS)
@@ -289,8 +339,11 @@ if(TM_USE_MPI)
   endif()
 endif()
 
-if (TM_USE_HALFSPINOR AND NOT TM_USE_GAUGE_COPY)
-  message(FATAL_ERROR "The TM_USE_GAUGE_COPY option should also be set to ON when TM_USE_HALFSPINOR is ON")
+if(TM_USE_HALFSPINOR AND NOT TM_USE_GAUGE_COPY)
+  message(
+    FATAL_ERROR
+      "The TM_USE_GAUGE_COPY option should also be set to ON when TM_USE_HALFSPINOR is ON"
+  )
 endif()
 # keep the autotool config.h header.
 configure_file("${PROJECT_SOURCE_DIR}/cmake/tmlqcd_config_internal.h.in"
@@ -323,3 +376,24 @@ endif()
 configure_file(cmake/git_hash.c.in git_hash.c @ONLY)
 add_subdirectory(src/lib)
 add_subdirectory(src/bin)
+
+write_basic_package_version_file(
+  "${PROJECT_BINARY_DIR}/tmlQCDConfigVersion.cmake"
+  VERSION "${PROJECT_VERSION}"
+  COMPATIBILITY SameMajorVersion)
+
+configure_file("${PROJECT_SOURCE_DIR}/cmake/tmlQCD-config.cmake.in"
+  "${PROJECT_BINARY_DIR}/tmlQCD-config.cmake" @ONLY)
+
+install(FILES "${PROJECT_BINARY_DIR}/tmlQCD-config.cmake"
+  "${PROJECT_BINARY_DIR}/tmlQCDConfigVersion.cmake"
+  DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/tmlQCD")
+
+install(FILES "${PROJECT_BINARY_DIR}/libtmlQCD.pc"
+  DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
+
+install(
+  DIRECTORY "${PROJECT_SOURCE_DIR}/cmake"
+  DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/tmlQCD"
+  FILES_MATCHING
+  PATTERN "*.cmake")
diff --git a/DDalphaAMG/CMakeLists.txt b/DDalphaAMG/CMakeLists.txt
new file mode 100644
index 000000000..22c0c35d1
--- /dev/null
+++ b/DDalphaAMG/CMakeLists.txt
@@ -0,0 +1,216 @@
+# Several custom directories are used here to circumvent the deletion of the
+# CMakeLists.txt contained in the DDalphaAMG directory. CMake will clone the
+# source code and build it with the default options used in the CI/CD. More
+# options are available in the main CMakeLists.txt.
+
+include(GNUInstallDirs)
+
+set(DDalphaAMG_SRC_DIR ${CMAKE_SOURCE_DIR}/DDalphaAMG/deps)
+
+FetchContent_Declare(
+  DDalphaAMG
+  GIT_REPOSITORY https://github.com/etmc/DDalphaAMG.git
+  SOURCE_DIR ${DDalphaAMG_SRC_DIR})
+
+FetchContent_MakeAvailable(DDalphaAMG)
+
+list(
+  APPEND
+  DDalphaAMG_SRC_GENERIC
+  interpolation_generic.c
+  gathering_generic.c
+  sse_interpolation_generic.c
+  coarse_oddeven_generic.c
+  operator_generic.c
+  oddeven_generic.c
+  linalg_generic.c
+  init_generic.c
+  vcycle_generic.c
+  dirac_generic.c
+  coarse_operator_generic.c
+  coarsening_generic.c
+  schwarz_generic.c
+  ghost_generic.c
+  vectorization_dirac_generic.c
+  linsolve_generic.c
+  sse_coarse_operator_generic.c
+  data_generic.c
+  setup_generic.c
+  sse_linalg_generic.c)
+
+list(
+  APPEND
+  DDalphaAMG_HEADER_GENERIC
+  interpolation_generic.h
+  gathering_generic.h
+  sse_interpolation_generic.h
+  coarse_oddeven_generic.h
+  operator_generic.h
+  oddeven_generic.h
+  linalg_generic.h
+  init_generic.h
+  vcycle_generic.h
+  dirac_generic.h
+  coarse_operator_generic.h
+  coarsening_generic.h
+  schwarz_generic.h
+  ghost_generic.h
+  vectorization_dirac_generic.h
+  linsolve_generic.h
+  sse_coarse_operator_generic.h
+  data_generic.h
+  setup_generic.h
+  sse_linalg_generic.h
+  main_pre_def_generic.h
+  main_post_def_generic.h)
+
+list(
+  APPEND
+  DDalphaAMG_SRC_GENERAL
+  ${DDalphaAMG_SRC_DIR}/src/preconditioner.c
+  ${DDalphaAMG_SRC_DIR}/src/threading.c
+  ${DDalphaAMG_SRC_DIR}/src/main.c
+  ${DDalphaAMG_SRC_DIR}/src/sse_dirac.c
+  ${DDalphaAMG_SRC_DIR}/src/var_table.c
+  ${DDalphaAMG_SRC_DIR}/src/data_layout.c
+  ${DDalphaAMG_SRC_DIR}/src/linsolve.c
+  ${DDalphaAMG_SRC_DIR}/src/ghost.c
+  ${DDalphaAMG_SRC_DIR}/src/top_level.c
+  ${DDalphaAMG_SRC_DIR}/src/dirac.c
+  ${DDalphaAMG_SRC_DIR}/src/linalg.c
+  ${DDalphaAMG_SRC_DIR}/src/init.c
+  ${DDalphaAMG_SRC_DIR}/src/DDalphaAMG_interface.c
+  ${DDalphaAMG_SRC_DIR}/src/lime_io.c
+  ${DDalphaAMG_SRC_DIR}/src/sse_linalg.c
+  ${DDalphaAMG_SRC_DIR}/src/solver_analysis.c
+  ${DDalphaAMG_SRC_DIR}/src/io.c)
+
+foreach(f IN LISTS DDalphaAMG_SRC_GENERIC)
+  string(REPLACE "_generic" "_float" f_float "${f}")
+
+  add_custom_command(
+    OUTPUT "${CMAKE_BINARY_DIR}/DDalphaAMG/${f_float}.sed-done"
+    COMMAND
+    sed -f "${DDalphaAMG_SRC_DIR}/float.sed" "${DDalphaAMG_SRC_DIR}/src/${f}"
+    > "${CMAKE_BINARY_DIR}/DDalphaAMG/${f_float}"
+    COMMAND ${CMAKE_COMMAND} -E touch
+    "${CMAKE_BINARY_DIR}/DDalphaAMG/${f_float}.sed-done"
+    DEPENDS "${DDalphaAMG_SRC_DIR}/src/${f}" "${DDalphaAMG_SRC_DIR}/float.sed"
+    VERBATIM)
+  list(APPEND SED_MARKERS "${CMAKE_BINARY_DIR}/DDalphaAMG/${f_float}.sed-done")
+  list(APPEND DDalphaAMG_SRC_SINGLE_DOUBLE ${f_float})
+
+  string(REPLACE "_generic" "_double" f_double "${f}")
+  add_custom_command(
+    OUTPUT "${CMAKE_BINARY_DIR}/DDalphaAMG/${f_double}.sed-done"
+    COMMAND
+    sed -f "${DDalphaAMG_SRC_DIR}/double.sed" "${DDalphaAMG_SRC_DIR}/src/${f}"
+    > "${CMAKE_BINARY_DIR}/DDalphaAMG/${f_double}"
+    COMMAND ${CMAKE_COMMAND} -E touch
+    "${CMAKE_BINARY_DIR}/DDalphaAMG/${f_double}.sed-done"
+    DEPENDS "${DDalphaAMG_SRC_DIR}/src/${f}" "${DDalphaAMG_SRC_DIR}/double.sed"
+    VERBATIM)
+  list(APPEND SED_MARKERS "${CMAKE_BINARY_DIR}/DDalphaAMG/${f_double}.sed-done")
+  list(APPEND DDalphaAMG_SRC_SINGLE_DOUBLE ${f_double})
+endforeach()
+
+# now parse the header
+foreach(f IN LISTS DDalphaAMG_HEADER_GENERIC)
+  string(REPLACE "_generic" "_float" f_float "${f}")
+  add_custom_command(
+    OUTPUT "${CMAKE_BINARY_DIR}/DDalphaAMG/${f_float}.sed-done"
+    COMMAND
+    sed -f "${DDalphaAMG_SRC_DIR}/float.sed" "${DDalphaAMG_SRC_DIR}/src/${f}"
+    > "${CMAKE_BINARY_DIR}/DDalphaAMG/${f_float}"
+    COMMAND ${CMAKE_COMMAND} -E touch
+    "${CMAKE_BINARY_DIR}/DDalphaAMG/${f_float}.sed-done"
+    DEPENDS "${DDalphaAMG_SRC_DIR}/src/${f}" "${DDalphaAMG_SRC_DIR}/float.sed"
+    VERBATIM)
+  list(APPEND SED_MARKERS "${CMAKE_BINARY_DIR}/DDalphaAMG/${f_float}.sed-done")
+  list(APPEND DDalphaAMG_HEADER_SINGLE_DOUBLE ${f_float})
+
+  string(REPLACE "_generic" "_double" f_double "${f}")
+  add_custom_command(
+    OUTPUT "${CMAKE_BINARY_DIR}/DDalphaAMG/${f_double}.sed-done"
+    COMMAND
+    sed -f "${DDalphaAMG_SRC_DIR}/double.sed" "${DDalphaAMG_SRC_DIR}/src/${f}"
+    > "${CMAKE_BINARY_DIR}/DDalphaAMG/${f_double}"
+    COMMAND ${CMAKE_COMMAND} -E touch
+    "${CMAKE_BINARY_DIR}/DDalphaAMG/${f_double}.sed-done"
+    DEPENDS "${DDalphaAMG_SRC_DIR}/src/${f}" "${DDalphaAMG_SRC_DIR}/double.sed"
+    VERBATIM)
+
+  list(APPEND SED_MARKERS "${CMAKE_BINARY_DIR}/DDalphaAMG/${f_double}.sed-done")
+  list(APPEND DDalphaAMG_HEADER_SINGLE_DOUBLE ${f_double})
+endforeach()
+
+foreach(outfile IN LISTS DDalphaAMG_SRC_SINGLE_DOUBLE
+    DDalphaAMG_HEADER_SINGLE_DOUBLE)
+  set_source_files_properties("${CMAKE_BINARY_DIR}/DDalphaAMG/${outfile}"
+    PROPERTIES GENERATED TRUE)
+endforeach()
+
+add_custom_target(run_sed ALL DEPENDS ${SED_MARKERS})
+
+add_library(DDalphaAMG ${DDalphaAMG_SRC_GENERAL}
+  ${DDalphaAMG_SRC_SINGLE_DOUBLE})
+
+target_compile_options(DDalphaAMG
+  PRIVATE "$<$<COMPILE_LANG_AND_ID:C,GNU>:-O3;-ffast-math;-mavx2;-mfma>")
+
+add_dependencies(DDalphaAMG run_sed)
+
+target_link_libraries(
+  DDalphaAMG
+  PUBLIC MPI::MPI_C $<$<BOOL:${DDalphaAMG_ENABLE_HDF5}>:hdf5::hdf5>
+  $<$<BOOL:${DDalphaAMG_ENABLE_OMP}>:OpenMP::OpenMP_C> tmlqcd::clime)
+
+target_include_directories(
+  DDalphaAMG
+  PUBLIC $<INSTALL_INTERFACE:include>
+  $<BUILD_INTERFACE:${DDalphaAMG_SRC_DIR}/src>
+  $<BUILD_INTERFACE:${DDalphaAMG_SRC_DIR}/include>
+  $<BUILD_INTERFACE:${CMAKE_BINARY_DIR}/DDalphaAMG>)
+
+target_compile_definitions(
+  DDalphaAMG
+  PUBLIC
+  $<$<BOOL:${DDalphaAMG_ENABLE_PARAMOUNT_OUTPUT}>:PARAMOUTPUT>
+  $<$<BOOL:${DDalphaAMG_ENABLE_FGMRES_RESTEST}>:FGMRES_RESTEST>
+  $<$<BOOL:${DDalphaAMG_ENABLE_PROFILING}>:PROFILING>
+  $<$<BOOL:${DDalphaAMG_ENABLE_SINGLE_ALLREDUCE_ARNOLDI}>:SINGLE_ALLREDUCE_ARNOLDI>
+  $<$<BOOL:${DDalphaAMG_ENABLE_COARSE_RES}>:COARSE_RES>
+  $<$<BOOL:${DDalphaAMG_ENABLE_SCHWARZ_RES}>:SCHWARZ_RES>
+  $<$<BOOL:${DDalphaAMG_ENABLE_OMP}>:OPENMP>
+  $<$<BOOL:${DDalphaAMG_ENABLE_TRACK_RES}>:TRACK_RES>
+  $<$<BOOL:${DDalphaAMG_ENABLE_TESTVECTOR_ANALYSIS}>:TESTVECTOR_ANALYSIS>
+  $<$<BOOL:${DDalphaAMG_ENABLE_HDF5}>:HAVE_HDF5>
+  $<$<CONFIG:Debug>:DEBUG>
+  SSE)
+
+install(FILES "${DDalphaAMG_SRC_DIR}/src/DDalphaAMG.h" # assumes header is in src/ of the fetched tree - TODO confirm
+  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME}")
+
+write_basic_package_version_file(
+  "${PROJECT_BINARY_DIR}/DDalphaAMG-ConfigVersion.cmake"
+  VERSION "0.0.0"
+  COMPATIBILITY SameMajorVersion)
+
+install(TARGETS DDalphaAMG
+  EXPORT DDalphaAMG_targets
+  LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}")
+
+install(EXPORT DDalphaAMG_targets
+  FILE DDalphaAMG-Targets.cmake
+  NAMESPACE DDalphaAMG::
+  DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/tmlQCD") # same dir as DDalphaAMG-Config.cmake, which includes this file
+
+configure_file("${PROJECT_SOURCE_DIR}/cmake/DDalphaAMG-Config.cmake.in"
+  "${PROJECT_BINARY_DIR}/DDalphaAMG-Config.cmake" @ONLY)
+
+install(FILES "${PROJECT_BINARY_DIR}/DDalphaAMG-Config.cmake"
+  "${PROJECT_BINARY_DIR}/DDalphaAMG-ConfigVersion.cmake"
+  DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/tmlQCD")
+
+
+# add_library(tmlqcd::DDalphaAMG alias DDalphaAMG)
diff --git a/cmake/DDalphaAMG-Config.cmake.in b/cmake/DDalphaAMG-Config.cmake.in
new file mode 100644
index 000000000..539b644a0
--- /dev/null
+++ b/cmake/DDalphaAMG-Config.cmake.in
@@ -0,0 +1,52 @@
+cmake_minimum_required(VERSION 3.23)
+
+if (NOT TARGET DDalphaAMG::DDalphaAMG)
+  # find_dependency() lives in this module and must be loaded before use.
+  include(CMakeFindDependencyMacro)
+
+  find_dependency(MPI REQUIRED)
+
+  if (@DDalphaAMG_ENABLE_PARAMOUNT_OUTPUT@)
+    set(DDalphaAMG_ENABLE_PARAMOUNT_OUTPUT @DDalphaAMG_ENABLE_PARAMOUNT_OUTPUT@)
+  endif()
+
+  if (@DDalphaAMG_ENABLE_FGMRES_RESTEST@)
+    set(DDalphaAMG_ENABLE_FGMRES_RESTEST @DDalphaAMG_ENABLE_FGMRES_RESTEST@)
+  endif()
+
+  if (@DDalphaAMG_ENABLE_PROFILING@)
+    set(DDalphaAMG_ENABLE_PROFILING @DDalphaAMG_ENABLE_PROFILING@)
+  endif()
+
+  if (@DDalphaAMG_ENABLE_TRACK_RES@)
+    set(DDalphaAMG_ENABLE_TRACK_RES @DDalphaAMG_ENABLE_TRACK_RES@)
+  endif()
+
+  if (@DDalphaAMG_ENABLE_TESTVECTOR_ANALYSIS@)
+    set(DDalphaAMG_ENABLE_TESTVECTOR_ANALYSIS @DDalphaAMG_ENABLE_TESTVECTOR_ANALYSIS@)
+  endif()
+
+  if (@DDalphaAMG_ENABLE_SCHWARZ_RES@)
+    set(DDalphaAMG_ENABLE_SCHWARZ_RES @DDalphaAMG_ENABLE_SCHWARZ_RES@)
+  endif()
+
+  if (@DDalphaAMG_ENABLE_COARSE_RES@)
+    set(DDalphaAMG_ENABLE_COARSE_RES @DDalphaAMG_ENABLE_COARSE_RES@)
+  endif()
+
+  if (@DDalphaAMG_ENABLE_SINGLE_ALLREDUCE_ARNOLDI@)
+    set(DDalphaAMG_ENABLE_SINGLE_ALLREDUCE_ARNOLDI @DDalphaAMG_ENABLE_SINGLE_ALLREDUCE_ARNOLDI@)
+  endif()
+
+  if (@DDalphaAMG_ENABLE_OMP@)
+    set(DDalphaAMG_ENABLE_OMP @DDalphaAMG_ENABLE_OMP@)
+    find_dependency(OpenMP REQUIRED)
+  endif()
+
+  if (@DDalphaAMG_ENABLE_HDF5@)
+    set(DDalphaAMG_ENABLE_HDF5 @DDalphaAMG_ENABLE_HDF5@)
+    find_dependency(HDF5)
+  endif()
+
+  include("${CMAKE_CURRENT_LIST_DIR}/DDalphaAMG-Targets.cmake")
+endif()
diff --git a/cmake/tmlQCD-config.cmake.in b/cmake/tmlQCD-config.cmake.in
new file mode 100644
index 000000000..5d7778801
--- /dev/null
+++ b/cmake/tmlQCD-config.cmake.in
@@ -0,0 +1,87 @@
+cmake_minimum_required(VERSION 3.23)
+
+# find_dependency() is provided by this module, not by CMake itself.
+include(CMakeFindDependencyMacro)
+
+# The export set is installed with NAMESPACE tmlQCD:: and contains `hmc`,
+# so tmlQCD::hmc is the target that tells us the package is already loaded.
+if (NOT TARGET tmlQCD::hmc)
+
+  # store C compiler id. Used in MKL package.
+  set(TM_C_COMPILER_ID @CMAKE_C_COMPILER_ID@)
+  if(NOT CMAKE_C_COMPILER_ID)
+    set(CMAKE_C_COMPILER_ID ${TM_C_COMPILER_ID})
+  endif()
+
+  # pass REQUIRED or QUIET depending on top Config call
+  if(tmlQCD_FIND_REQUIRED)
+    set(mode REQUIRED)
+  else()
+    set(mode QUIET)
+  endif()
+
+  if (@TM_USE_MPI@)
+    set(TM_USE_MPI @TM_USE_MPI@)
+    find_dependency(MPI ${mode} COMPONENTS C)
+  endif()
+
+  if (@TM_USE_OMP@)
+    set(TM_USE_OMP @TM_USE_OMP@)
+    find_dependency(OpenMP ${mode})
+  endif()
+
+  if (@TM_USE_LEMON@)
+    set(TM_USE_LEMON @TM_USE_LEMON@)
+    find_dependency(Lemon ${mode})
+  endif()
+
+  find_dependency(BLAS ${mode})
+  find_dependency(LAPACK ${mode})
+
+  find_package(CLime REQUIRED)
+  set(TM_USE_LIME ON)
+
+  if(@TM_USE_QUDA@)
+    set(TM_USE_QUDA @TM_USE_QUDA@)
+    find_package(QUDA REQUIRED CONFIG)
+  endif()
+
+  if(@TM_USE_HIP@)
+    set(TM_USE_HIP @TM_USE_HIP@)
+    enable_language(HIP)
+  endif()
+
+  if(@TM_USE_QPHIX@)
+    set(TM_USE_QPHIX @TM_USE_QPHIX@)
+    set(QPHIX_DIR @QPHIX_DIR@)
+    set(QMP_DIR @QMP_DIR@)
+    find_package(Qphix REQUIRED)
+  endif()
+
+  if(@TM_USE_FFTW@)
+    set(TM_USE_FFTW @TM_USE_FFTW@)
+    # pkg_search_module() is defined by the PkgConfig module
+    find_dependency(PkgConfig)
+    pkg_search_module(tmlqcd_fftw3 IMPORTED_TARGET GLOBAL fftw3)
+    if(tmlqcd_fftw3_FOUND AND NOT TARGET tmlqcd::fftw3)
+      add_library(tmlqcd::fftw3 ALIAS PkgConfig::tmlqcd_fftw3)
+    endif()
+  endif()
+
+  if(@TM_USE_DDalphaAMG@)
+    set(TM_USE_DDalphaAMG @TM_USE_DDalphaAMG@)
+  endif()
+
+  set(TM_MPI_DIMENSION @TM_USE_DIMENSION@)
+
+  if(@TM_USE_HALFSPINOR@)
+    set(TM_USE_HALFSPINOR @TM_USE_HALFSPINOR@)
+  endif()
+
+  if (@TM_USE_GAUGE_COPY@)
+    set(TM_USE_GAUGE_COPY @TM_USE_GAUGE_COPY@)
+  endif()
+
+  # Include SIRIUS target
+  include("${CMAKE_CURRENT_LIST_DIR}/tmlQCDTargets.cmake")
+endif()
diff --git a/src/lib/CMakeLists.txt b/src/lib/CMakeLists.txt
index ebed35308..19d24d40c 100644
--- a/src/lib/CMakeLists.txt
+++ b/src/lib/CMakeLists.txt
@@ -399,6 +399,7 @@ include_directories(
   $<BUILD_INTERFACE:${CMAKE_BINARY_DIR}>
   $<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}/src/lib/include>
   $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
+  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
   $<$<BOOL:${TMLQCD_USE_LEMON}>:${TMLQCD_LEMON_INCLUDE_DIRS}>
   ${TMLQCD_CLIME_INCLUDE_DIRS})
 
@@ -407,8 +408,8 @@ if(CMAKE_MAJOR_VERSION LESS 4)
   flex_target(tmlqcd_input_read read_input.l ${CMAKE_BINARY_DIR}/read_input.c
               COMPILE_FLAGS "-Ca -Ptmlqcd -i")
 else()
-  flex_target(tmlqcd_input_read read_input.l ${CMAKE_BINARY_DIR}/read_input.c OPTIONS
-              "-Ca -Ptmlqcd -i")
+  flex_target(tmlqcd_input_read read_input.l ${CMAKE_BINARY_DIR}/read_input.c
+              OPTIONS "-Ca -Ptmlqcd -i")
 endif()
 
 # create a target library with namespacing because cmake does not know name
@@ -425,7 +426,7 @@ set_target_properties(hmc PROPERTIES VERSION ${PROJECT_VERSION} SOVERSION 1)
 # define a library and add the dependencies
 target_link_libraries(
   hmc
-  PUBLIC $<$<BOOL:${TM_USE_DDalphaAMG}>:tmlqcd::DDalphaAMG>
+  PUBLIC $<$<BOOL:${TM_USE_DDalphaAMG}>:DDalphaAMG>
          $<$<BOOL:${TM_USE_QPHIX}>:tmlqcd::qphix>
          $<$<BOOL:${TM_USE_FFTW}>:tmlqcd::fftw3>
          $<$<BOOL:${TM_USE_QUDA}>:QUDA::quda>
@@ -452,6 +453,30 @@ target_compile_definitions(
 
 target_include_directories(
   hmc
-  PUBLIC $<INSTALL_INTERFACE:include>
+  PUBLIC $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME}>
          $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
          $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>)
+
+# Install the library and record it in the tmlqcd_targets export set.
+install(TARGETS hmc
+  EXPORT tmlqcd_targets
+  LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}")
+
+# Generate tmlQCDTargets.cmake; it is include()d by tmlQCD-config.cmake.
+install(EXPORT tmlqcd_targets
+  FILE tmlQCDTargets.cmake
+  NAMESPACE tmlQCD::
+  DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}")
+
+# The trailing slash copies the *contents* of include/ so that the headers
+# land directly in ${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME}, which is the
+# INSTALL_INTERFACE include directory declared for hmc above.
+install(DIRECTORY ${CMAKE_SOURCE_DIR}/src/lib/include/
+  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME}"
+  FILES_MATCHING
+  PATTERN "*.h")
+
+# Configuration header taken from the build tree.
+install(FILES "${CMAKE_BINARY_DIR}/tmlqcd_config_internal.h"
+  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME}"
+)

From d3e02423854b22aa29d5212caa7a7d50d4c61c0d Mon Sep 17 00:00:00 2001
From: Mathieu Taillefumier <mathieu.taillefumier@free.fr>
Date: Wed, 11 Mar 2026 02:57:51 +0100
Subject: [PATCH 17/80] Remove comment

---
 cmake/tmlQCD-config.cmake.in | 1 -
 1 file changed, 1 deletion(-)

diff --git a/cmake/tmlQCD-config.cmake.in b/cmake/tmlQCD-config.cmake.in
index 5d7778801..91aaa837a 100644
--- a/cmake/tmlQCD-config.cmake.in
+++ b/cmake/tmlQCD-config.cmake.in
@@ -75,6 +75,5 @@ if (NOT TARGET tmlqcd::tmlqcd)
     set(TM_USE_GAUGE_COPY @TM_USE_GAUGE_COPY@)
   endif()
 
-  # Include SIRIUS target
   include("${CMAKE_CURRENT_LIST_DIR}/tmlQCDTargets.cmake")
 endif()

From c0afb1e15aed53c5b28e862797fdd79920dc0ce1 Mon Sep 17 00:00:00 2001
From: Mathieu Taillefumier <mathieu.taillefumier@free.fr>
Date: Wed, 11 Mar 2026 03:32:18 +0100
Subject: [PATCH 18/80] Remove explicit reference to lemon include directory as
 Lemon uses target to pass these settings

---
 src/lib/CMakeLists.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/lib/CMakeLists.txt b/src/lib/CMakeLists.txt
index 19d24d40c..ad0723e51 100644
--- a/src/lib/CMakeLists.txt
+++ b/src/lib/CMakeLists.txt
@@ -400,7 +400,6 @@ include_directories(
   $<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}/src/lib/include>
   $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
   $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
-  $<$<BOOL:${TMLQCD_USE_LEMON}>:${TMLQCD_LEMON_INCLUDE_DIRS}>
   ${TMLQCD_CLIME_INCLUDE_DIRS})
 
 # cmake 4.0 uses a different syntax for the option

From dc63083e54ac01dc4b23fdd78e983ecc2e5e728e Mon Sep 17 00:00:00 2001
From: Mathieu Taillefumier <mathieu.taillefumier@free.fr>
Date: Wed, 11 Mar 2026 10:26:28 +0100
Subject: [PATCH 19/80] Rename file_body.c to file_body.inc

---
 src/lib/CMakeLists.txt                        |  28 ++---
 src/lib/linalg/assign_add_mul.c               |   4 +-
 ...add_mul_body.c => assign_add_mul_body.inc} |   0
 src/lib/linalg/scalar_prod.c                  |   4 +-
 ...calar_prod_body.c => scalar_prod_body.inc} |   0
 src/lib/little_D.c                            |   4 +-
 .../{little_D_body.c => little_D_body.inc}    |   0
 ...lock_D_psi_body.c => Block_D_psi_body.inc} |   0
 src/lib/operator/D_psi.c                      |   8 +-
 .../operator/{D_psi_body.c => D_psi_body.inc} |   0
 src/lib/operator/Hopping_Matrix.c             |   4 +-
 src/lib/operator/Hopping_Matrix_32.c          |   2 +-
 ...sign_mul_one_sw_pm_imu_inv_block_body.inc} |   0
 ...ign_mul_one_sw_pm_imu_site_lexic_body.inc} |   0
 src/lib/operator/clovertm_operators.c         |   8 +-
 ...{halfspinor_body.c => halfspinor_body.inc} |   0
 ...pinor_body_32.c => halfspinor_body_32.inc} |   2 +-
 ...opping_body_dbl.c => hopping_body_dbl.inc} |   0
 ...inv_body.c => mul_one_pm_imu_inv_body.inc} |   0
 ...body.c => mul_one_pm_imu_sub_mul_body.inc} |   0
 src/lib/operator/tm_operators.c               |   8 +-
 src/lib/operator/tm_sub_Hopping_Matrix.c      |   2 +-
 src/lib/operator/tm_times_Hopping_Matrix.c    |   2 +-
 ...k_psi_body.c => M_plus_block_psi_body.inc} |   0
 src/lib/solver/Makefile.in                    | 106 ------------------
 src/lib/solver/Msap.c                         |   4 +-
 src/lib/solver/dfl_projector.c                |   8 +-
 src/lib/solver/fgmres4complex.c               |   4 +-
 ...complex_body.c => fgmres4complex_body.inc} |   0
 src/lib/solver/gcr4complex.c                  |   4 +-
 ...cr4complex_body.c => gcr4complex_body.inc} |   0
 ...recon_body.c => little_mg_precon_body.inc} |   0
 ...t_eo_body.c => little_project_eo_body.inc} |   0
 src/lib/solver/mr.c                           |   4 +-
 .../solver/{mrblk_body.c => mrblk_body.inc}   |   0
 src/lib/xchange/little_field_gather.c         |   4 +-
 ...er_body.c => little_field_gather_body.inc} |   0
 src/lib/xchange/xchange_field.c               |   4 +-
 38 files changed, 54 insertions(+), 160 deletions(-)
 rename src/lib/linalg/{assign_add_mul_body.c => assign_add_mul_body.inc} (100%)
 rename src/lib/linalg/{scalar_prod_body.c => scalar_prod_body.inc} (100%)
 rename src/lib/{little_D_body.c => little_D_body.inc} (100%)
 rename src/lib/operator/{Block_D_psi_body.c => Block_D_psi_body.inc} (100%)
 rename src/lib/operator/{D_psi_body.c => D_psi_body.inc} (100%)
 rename src/lib/operator/{assign_mul_one_sw_pm_imu_inv_block_body.c => assign_mul_one_sw_pm_imu_inv_block_body.inc} (100%)
 rename src/lib/operator/{assign_mul_one_sw_pm_imu_site_lexic_body.c => assign_mul_one_sw_pm_imu_site_lexic_body.inc} (100%)
 rename src/lib/operator/{halfspinor_body.c => halfspinor_body.inc} (100%)
 rename src/lib/operator/{halfspinor_body_32.c => halfspinor_body_32.inc} (98%)
 rename src/lib/operator/{hopping_body_dbl.c => hopping_body_dbl.inc} (100%)
 rename src/lib/operator/{mul_one_pm_imu_inv_body.c => mul_one_pm_imu_inv_body.inc} (100%)
 rename src/lib/operator/{mul_one_pm_imu_sub_mul_body.c => mul_one_pm_imu_sub_mul_body.inc} (100%)
 rename src/lib/solver/{M_plus_block_psi_body.c => M_plus_block_psi_body.inc} (100%)
 delete mode 100644 src/lib/solver/Makefile.in
 rename src/lib/solver/{fgmres4complex_body.c => fgmres4complex_body.inc} (100%)
 rename src/lib/solver/{gcr4complex_body.c => gcr4complex_body.inc} (100%)
 rename src/lib/solver/{little_mg_precon_body.c => little_mg_precon_body.inc} (100%)
 rename src/lib/solver/{little_project_eo_body.c => little_project_eo_body.inc} (100%)
 rename src/lib/solver/{mrblk_body.c => mrblk_body.inc} (100%)
 rename src/lib/xchange/{little_field_gather_body.c => little_field_gather_body.inc} (100%)

diff --git a/src/lib/CMakeLists.txt b/src/lib/CMakeLists.txt
index ad0723e51..eeeab9f70 100644
--- a/src/lib/CMakeLists.txt
+++ b/src/lib/CMakeLists.txt
@@ -78,23 +78,23 @@ list(
   solver/chrono_guess.c
   solver/gcr4complex.c
   solver/jdher.c
-  # solver/gcr4complex_body.c
+  # solver/gcr4complex_body.inc
   solver/gmres_dr.c
-  # solver/fgmres4complex_body.c
+  # solver/fgmres4complex_body.inc
   solver/cg_her_bi.c
   solver/solver_field.c
   solver/quicksort.c
   solver/bicgstab2.c
   solver/cgs_real.c
-  # solver/M_plus_block_psi_body.c solver/little_mg_precon_body.c
-  # solver/little_project_eo_body.c
+  # solver/M_plus_block_psi_body.inc solver/little_mg_precon_body.inc
+  # solver/little_project_eo_body.inc
   solver/monomial_solve.c
   solver/cr.c
   solver/gram-schmidt.c
   solver/solver_types.c
   solver/cg_her.c
   solver/jdher_bi.c
-  # solver/mrblk_body.c
+  # solver/mrblk_body.inc
   solver/eigcg.c
   solver/poly_precon.c
   solver/Msap.c
@@ -138,7 +138,7 @@ list(
   linalg/mul_r_gamma5.c
   linalg/convert_eo_to_lexic.c
   linalg/print_spinor.c
-  # linalg/assign_add_mul_body.c
+  # linalg/assign_add_mul_body.inc
   linalg/mul_diff_mul_r.c
   linalg/square_norm_32.c
   linalg/mul.c
@@ -183,7 +183,7 @@ list(
   linalg/set_even_to_zero.c
   linalg/assign_mul_add.c
   linalg/square_and_prod_r.c
-  # linalg/scalar_prod_body.c
+  # linalg/scalar_prod_body.inc
   linalg/assign_mul_bra_add_mul_ket_add.c
   linalg/assign_add_mul_r_32.c
   linalg/scalar_prod.c
@@ -200,10 +200,10 @@ list(
   operator/clover_invert.c
   # operator/hopping_body_dbl.c
   operator/tm_operators_nd_32.c
-  # operator/halfspinor_body.c operator/Block_D_psi_body.c
-  # operator/mul_one_pm_imu_sub_mul_body.c
-  # operator/assign_mul_one_sw_pm_imu_site_lexic_body.c
-  # operator/assign_mul_one_sw_pm_imu_inv_block_body.c
+  # operator/halfspinor_body.inc operator/Block_D_psi_body.inc
+  # operator/mul_one_pm_imu_sub_mul_body.inc
+  # operator/assign_mul_one_sw_pm_imu_site_lexic_body.inc
+  # operator/assign_mul_one_sw_pm_imu_inv_block_body.inc
   operator/clover_accumulate_deriv.c
   operator/Hopping_Matrix.c
   operator/tm_operators.c
@@ -214,7 +214,7 @@ list(
   operator/clover_deriv.c
   operator/clover_det.c
   operator/clover_leaf.c
-  # operator/D_psi_body.c
+  # operator/D_psi_body.inc
   operator/clovertm_operators.c
   operator/Dov_psi.c
   operator/tm_operators_nd.c
@@ -225,7 +225,7 @@ list(
   operator/D_psi.c
   operator/tm_operators_32.c
   operator/Hopping_Matrix_32.c)
-# operator/halfspinor_body_32.c operator/mul_one_pm_imu_inv_body.c)
+# operator/halfspinor_body_32.c operator/mul_one_pm_imu_inv_body.inc)
 
 list(
   APPEND
@@ -295,7 +295,7 @@ list(
   xchange/xchange_2fields.c
   xchange/xchange_gauge.c
   xchange/xchange_halffield.c
-  # xchange/xchange_jacobi.c xchange/little_field_gather_body.c
+  # xchange/xchange_jacobi.c xchange/little_field_gather_body.inc
   xchange/little_field_gather.c
   xchange/xchange_deri.c
   xchange/xchange_field.c)
diff --git a/src/lib/linalg/assign_add_mul.c b/src/lib/linalg/assign_add_mul.c
index cdc9f4931..eae85f685 100644
--- a/src/lib/linalg/assign_add_mul.c
+++ b/src/lib/linalg/assign_add_mul.c
@@ -41,7 +41,7 @@
 #define _PSWITCH(s) s
 #define _PTSWITCH(s) s
 
-#include "assign_add_mul_body.c"
+#include "assign_add_mul_body.inc"
 
 #undef _C_TYPE
 #undef _PSWITCH
@@ -51,7 +51,7 @@
 #define _PSWITCH(s) s##_32
 #define _PTSWITCH(s) s##32
 
-#include "assign_add_mul_body.c"
+#include "assign_add_mul_body.inc"
 
 #undef _C_TYPE
 #undef _PSWITCH
diff --git a/src/lib/linalg/assign_add_mul_body.c b/src/lib/linalg/assign_add_mul_body.inc
similarity index 100%
rename from src/lib/linalg/assign_add_mul_body.c
rename to src/lib/linalg/assign_add_mul_body.inc
diff --git a/src/lib/linalg/scalar_prod.c b/src/lib/linalg/scalar_prod.c
index 365b966c1..2e2a631c7 100644
--- a/src/lib/linalg/scalar_prod.c
+++ b/src/lib/linalg/scalar_prod.c
@@ -36,7 +36,7 @@
 #define _PSWITCH(s) s
 #define _PTSWITCH(s) s
 
-#include "scalar_prod_body.c"
+#include "scalar_prod_body.inc"
 
 #undef _C_TYPE
 #undef _PSWITCH
@@ -46,7 +46,7 @@
 #define _PSWITCH(s) s##_32
 #define _PTSWITCH(s) s##32
 
-#include "scalar_prod_body.c"
+#include "scalar_prod_body.inc"
 
 #undef _C_TYPE
 #undef _PSWITCH
diff --git a/src/lib/linalg/scalar_prod_body.c b/src/lib/linalg/scalar_prod_body.inc
similarity index 100%
rename from src/lib/linalg/scalar_prod_body.c
rename to src/lib/linalg/scalar_prod_body.inc
diff --git a/src/lib/little_D.c b/src/lib/little_D.c
index 2bee49824..e923f462e 100644
--- a/src/lib/little_D.c
+++ b/src/lib/little_D.c
@@ -243,7 +243,7 @@ void apply_little_D_spinor(spinor *r, spinor *s) {
 #endif
 #define _C_TYPE _Complex double
 
-#include "little_D_body.c"
+#include "little_D_body.inc"
 
 #undef _C_TYPE
 #undef _PSWITCH
@@ -259,7 +259,7 @@ void apply_little_D_spinor(spinor *r, spinor *s) {
 #endif
 #define _C_TYPE _Complex float
 
-#include "little_D_body.c"
+#include "little_D_body.inc"
 
 #undef _C_TYPE
 #undef _PSWITCH
diff --git a/src/lib/little_D_body.c b/src/lib/little_D_body.inc
similarity index 100%
rename from src/lib/little_D_body.c
rename to src/lib/little_D_body.inc
diff --git a/src/lib/operator/Block_D_psi_body.c b/src/lib/operator/Block_D_psi_body.inc
similarity index 100%
rename from src/lib/operator/Block_D_psi_body.c
rename to src/lib/operator/Block_D_psi_body.inc
diff --git a/src/lib/operator/D_psi.c b/src/lib/operator/D_psi.c
index 750fe67e8..ab93cb8fd 100644
--- a/src/lib/operator/D_psi.c
+++ b/src/lib/operator/D_psi.c
@@ -54,7 +54,7 @@
 #define _PSWITCH(s) s##_32
 #define _PTSWITCH(s) s##32
 
-#include "D_psi_body.c"
+#include "D_psi_body.inc"
 
 #undef _C_TYPE
 #undef _F_TYPE
@@ -66,7 +66,7 @@
 #define _PSWITCH(s) s
 #define _PTSWITCH(s) s
 
-#include "D_psi_body.c"
+#include "D_psi_body.inc"
 
 #undef _C_TYPE
 #undef _F_TYPE
@@ -92,7 +92,7 @@ void D_psi_prec(spinor *const P, spinor *const Q) {
 #define _PSWITCH(s) s##_32
 #define _PTSWITCH(s) s##32
 
-#include "Block_D_psi_body.c"
+#include "Block_D_psi_body.inc"
 
 #undef _F_TYPE
 #undef _C_TYPE
@@ -104,7 +104,7 @@ void D_psi_prec(spinor *const P, spinor *const Q) {
 #define _PSWITCH(s) s
 #define _PTSWITCH(s) s
 
-#include "Block_D_psi_body.c"
+#include "Block_D_psi_body.inc"
 
 #undef _F_TYPE
 #undef _C_TYPE
diff --git a/src/lib/operator/D_psi_body.c b/src/lib/operator/D_psi_body.inc
similarity index 100%
rename from src/lib/operator/D_psi_body.c
rename to src/lib/operator/D_psi_body.inc
diff --git a/src/lib/operator/Hopping_Matrix.c b/src/lib/operator/Hopping_Matrix.c
index 759809a8e..41b85aa21 100644
--- a/src/lib/operator/Hopping_Matrix.c
+++ b/src/lib/operator/Hopping_Matrix.c
@@ -80,7 +80,7 @@ void Hopping_Matrix(const int ieo, spinor* const l, spinor* const k) {
     su3* restrict u0 ALIGN;
 #endif
 
-#include "operator/halfspinor_body.c"
+#include "operator/halfspinor_body.inc"
 
 #ifdef TM_USE_OMP
   } /* OpenMP closing brace */
@@ -106,7 +106,7 @@ void Hopping_Matrix(const int ieo, spinor* const l, spinor* const k) {
   {
 #endif
 
-#include "operator/hopping_body_dbl.c"
+#include "operator/hopping_body_dbl.inc"
 
 #ifdef TM_USE_OMP
   } /* OpenMP closing brace */
diff --git a/src/lib/operator/Hopping_Matrix_32.c b/src/lib/operator/Hopping_Matrix_32.c
index 0991811b7..06fd33efb 100644
--- a/src/lib/operator/Hopping_Matrix_32.c
+++ b/src/lib/operator/Hopping_Matrix_32.c
@@ -82,7 +82,7 @@ void Hopping_Matrix_32_orphaned(const int ieo, spinor32* const l, spinor32* cons
   su3_32* restrict u0 ALIGN32;
 #endif
 
-#include "operator/halfspinor_body_32.c"
+#include "operator/halfspinor_body_32.inc"
 #else
   printf("Error: Single precision Matrix only implemented with HALFSPINOR\n");
   exit(200);
diff --git a/src/lib/operator/assign_mul_one_sw_pm_imu_inv_block_body.c b/src/lib/operator/assign_mul_one_sw_pm_imu_inv_block_body.inc
similarity index 100%
rename from src/lib/operator/assign_mul_one_sw_pm_imu_inv_block_body.c
rename to src/lib/operator/assign_mul_one_sw_pm_imu_inv_block_body.inc
diff --git a/src/lib/operator/assign_mul_one_sw_pm_imu_site_lexic_body.c b/src/lib/operator/assign_mul_one_sw_pm_imu_site_lexic_body.inc
similarity index 100%
rename from src/lib/operator/assign_mul_one_sw_pm_imu_site_lexic_body.c
rename to src/lib/operator/assign_mul_one_sw_pm_imu_site_lexic_body.inc
diff --git a/src/lib/operator/clovertm_operators.c b/src/lib/operator/clovertm_operators.c
index fe328ac00..b6221b78b 100644
--- a/src/lib/operator/clovertm_operators.c
+++ b/src/lib/operator/clovertm_operators.c
@@ -64,7 +64,7 @@ su3 ***sw_inv;
 #define _PTSWITCH(s) s
 #define _PSWITCH(s) s
 
-#include "assign_mul_one_sw_pm_imu_site_lexic_body.c"
+#include "assign_mul_one_sw_pm_imu_site_lexic_body.inc"
 
 #undef _F_TYPE
 #undef _PSWITCH
@@ -74,7 +74,7 @@ su3 ***sw_inv;
 #define _PTSWITCH(s) s##32
 #define _PSWITCH(s) s##_32
 
-#include "assign_mul_one_sw_pm_imu_site_lexic_body.c"
+#include "assign_mul_one_sw_pm_imu_site_lexic_body.inc"
 
 #undef _F_TYPE
 #undef _PSWITCH
@@ -1039,7 +1039,7 @@ void assign_mul_one_sw_pm_imu_eps(const int ieo, spinor *const k_s, spinor *cons
 #define _PSWITCH(s) s
 #define _PTSWITCH(s) s
 
-#include "assign_mul_one_sw_pm_imu_inv_block_body.c"
+#include "assign_mul_one_sw_pm_imu_inv_block_body.inc"
 
 #undef _F_TYPE
 #undef _PSWITCH
@@ -1049,7 +1049,7 @@ void assign_mul_one_sw_pm_imu_eps(const int ieo, spinor *const k_s, spinor *cons
 #define _PSWITCH(s) s##_32
 #define _PTSWITCH(s) s##32
 
-#include "assign_mul_one_sw_pm_imu_inv_block_body.c"
+#include "assign_mul_one_sw_pm_imu_inv_block_body.inc"
 
 #undef _F_TYPE
 #undef _PSWITCH
diff --git a/src/lib/operator/halfspinor_body.c b/src/lib/operator/halfspinor_body.inc
similarity index 100%
rename from src/lib/operator/halfspinor_body.c
rename to src/lib/operator/halfspinor_body.inc
diff --git a/src/lib/operator/halfspinor_body_32.c b/src/lib/operator/halfspinor_body_32.inc
similarity index 98%
rename from src/lib/operator/halfspinor_body_32.c
rename to src/lib/operator/halfspinor_body_32.inc
index a8022382c..c1f9a25a1 100644
--- a/src/lib/operator/halfspinor_body_32.c
+++ b/src/lib/operator/halfspinor_body_32.inc
@@ -1,6 +1,6 @@
 /**********************************************************************
  * single precision version Copyright (C) 2013 Florian Burger
- * based on halfspinor_body.c by Carsten Urbach
+ * based on halfspinor_body.inc by Carsten Urbach
  *
  * This file is based on an implementation of the Dirac operator
  * written by Martin Luescher, modified by Martin Hasenbusch in 2002
diff --git a/src/lib/operator/hopping_body_dbl.c b/src/lib/operator/hopping_body_dbl.inc
similarity index 100%
rename from src/lib/operator/hopping_body_dbl.c
rename to src/lib/operator/hopping_body_dbl.inc
diff --git a/src/lib/operator/mul_one_pm_imu_inv_body.c b/src/lib/operator/mul_one_pm_imu_inv_body.inc
similarity index 100%
rename from src/lib/operator/mul_one_pm_imu_inv_body.c
rename to src/lib/operator/mul_one_pm_imu_inv_body.inc
diff --git a/src/lib/operator/mul_one_pm_imu_sub_mul_body.c b/src/lib/operator/mul_one_pm_imu_sub_mul_body.inc
similarity index 100%
rename from src/lib/operator/mul_one_pm_imu_sub_mul_body.c
rename to src/lib/operator/mul_one_pm_imu_sub_mul_body.inc
diff --git a/src/lib/operator/tm_operators.c b/src/lib/operator/tm_operators.c
index 934db2068..0d40b6097 100644
--- a/src/lib/operator/tm_operators.c
+++ b/src/lib/operator/tm_operators.c
@@ -514,7 +514,7 @@ void tm_sub_H_eo_gamma5(spinor *const l, spinor *const p, spinor *const k, const
 #define _PSWITCH(s) s
 #define _PTSWITCH(s) s
 
-#include "mul_one_pm_imu_inv_body.c"
+#include "mul_one_pm_imu_inv_body.inc"
 
 #undef _F_TYPE
 #undef _C_TYPE
@@ -526,7 +526,7 @@ void tm_sub_H_eo_gamma5(spinor *const l, spinor *const p, spinor *const k, const
 #define _PSWITCH(s) s##_32
 #define _PTSWITCH(s) s##32
 
-#include "mul_one_pm_imu_inv_body.c"
+#include "mul_one_pm_imu_inv_body.inc"
 
 #undef _F_TYPE
 #undef _C_TYPE
@@ -768,7 +768,7 @@ void mul_one_pm_imu_sub_mul_gamma5(spinor *const l, spinor *const k, spinor *con
 #define _PSWITCH(s) s
 #define _PTSWITCH(s) s
 
-#include "mul_one_pm_imu_sub_mul_body.c"
+#include "mul_one_pm_imu_sub_mul_body.inc"
 
 #undef _C_TYPE
 #undef _F_TYPE
@@ -780,7 +780,7 @@ void mul_one_pm_imu_sub_mul_gamma5(spinor *const l, spinor *const k, spinor *con
 #define _PSWITCH(s) s##_32
 #define _PTSWITCH(s) s##32
 
-#include "mul_one_pm_imu_sub_mul_body.c"
+#include "mul_one_pm_imu_sub_mul_body.inc"
 
 #undef _C_TYPE
 #undef _F_TYPE
diff --git a/src/lib/operator/tm_sub_Hopping_Matrix.c b/src/lib/operator/tm_sub_Hopping_Matrix.c
index 7edf2c954..a163d26e5 100644
--- a/src/lib/operator/tm_sub_Hopping_Matrix.c
+++ b/src/lib/operator/tm_sub_Hopping_Matrix.c
@@ -70,7 +70,7 @@ void tm_sub_Hopping_Matrix(const int ieo, spinor* const l, spinor* const p, spin
 
 #define _TM_SUB_HOP
     spinor* pn;
-#include "operator/halfspinor_body.c"
+#include "operator/halfspinor_body.inc"
 #undef _TM_SUB_HOP
 #ifdef TM_USE_OMP
   } /* OpenMP closing brace */
diff --git a/src/lib/operator/tm_times_Hopping_Matrix.c b/src/lib/operator/tm_times_Hopping_Matrix.c
index 9b09c090f..eaeb92a93 100644
--- a/src/lib/operator/tm_times_Hopping_Matrix.c
+++ b/src/lib/operator/tm_times_Hopping_Matrix.c
@@ -69,7 +69,7 @@ void tm_times_Hopping_Matrix(const int ieo, spinor* const l, spinor* const k,
 #endif
 
 #define _MUL_G5_CMPLX
-#include "operator/halfspinor_body.c"
+#include "operator/halfspinor_body.inc"
 #undef _MUL_G5_CMPLX
 
 #ifdef TM_USE_OMP
diff --git a/src/lib/solver/M_plus_block_psi_body.c b/src/lib/solver/M_plus_block_psi_body.inc
similarity index 100%
rename from src/lib/solver/M_plus_block_psi_body.c
rename to src/lib/solver/M_plus_block_psi_body.inc
diff --git a/src/lib/solver/Makefile.in b/src/lib/solver/Makefile.in
deleted file mode 100644
index 584428871..000000000
--- a/src/lib/solver/Makefile.in
+++ /dev/null
@@ -1,106 +0,0 @@
-
-srcdir = @srcdir@
-top_builddir =  @top_builddir@
-abs_top_builddir = @abs_top_builddir@
-top_srcdir = @top_srcdir@
-abs_top_srcdir = @abs_top_srcdir@
-subdir = solver
-builddir = @builddir@
-
-CFLAGS = @CFLAGS@ @SOLVEROUT@
-DEPFLAGS = @DEPFLAGS@
-LDFLAGS = @LDFLAGS@
-DEFS = @DEFS@
-OPTARGS = @OPTARGS@
-
-AR = @AR@
-RANLIB = @RANLIB@
-CC = @CC@
-CCDEP = @CCDEP@
-CCLD = $(CC)
-LINK = $(CCLD) $(CFLAGS) $(LDFLAGS) ${OPTARGS} -o $@
-LEX = @LEX@
-AUTOCONF = @AUTOCONF@
-DEFS = @DEFS@
-
-INCLUDES = @INCLUDES@
-LDADD =
-#COMPILE = ${CC} ${DEFS} $(INCLUDES) ${CFLAGS}
-COMPILE = ${CC} ${DEFS} ${INCLUDES} ${CFLAGS} ${OPTARGS}
-
-LIBRARIES = libsolver
-libsolver_TARGETS = bicgstab_complex gmres incr_eigcg eigcg restart_X ortho \
-	            cgs_real cg_her mr chrono_guess \
-	            bicgstabell bicgstab2 eigenvalues fgmres \
-	            gcr gcr4complex diagonalise_general_matrix \
-	            cgne4complex mr4complex fgmres4complex \
-	            quicksort gmres_dr lu_solve jdher Msap \
-                    jdher_bi gram-schmidt eigenvalues_bi \
-                    bicgstab_complex_bi cg_her_bi pcg_her \
-                    sub_low_ev cg_her_nd poly_precon \
-                    generate_dfl_subspace dfl_projector \
-                    cg_mms_tm cg_mms_tm_nd mixed_cg_mms_tm_nd \
-                    solver_field sumr mixed_cg_her index_jd \
-		    rg_mixed_cg_her rg_mixed_cg_her_nd \
-                    dirac_operator_eigenvectors \
-		    mcr cr mcr4complex bicg_complex monomial_solve \
-		    solver_types init_guess
-
-libsolver_OBJECTS = $(addsuffix .o, ${libsolver_TARGETS})
-
-# default rule
-
-all: Makefile dep libsolver.a
-
-# rules for debugging
-debug all-debug: CFLAGS := $(CFLAGS) @DEBUG_FLAG@
-debug all-debug: all
-
-# rules for profiling information
-profile all-profile: CFLAGS := $(filter-out -fomit-frame-pointer,${CFLAGS}) @PROFILE_FLAG@
-profile all-profile: all
-
-
-#include dep rules
-
--include $(addsuffix .d,${libsolver_TARGETS})
-
-include ${top_srcdir}/Makefile.global
-
-# rule to compile objects
-
-%.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/include/tmlqcd_config_internal.h
-	$(COMPILE) -c $<
-
-
-# rule to make liblinalg
-
-libsolver.a: ${libsolver_OBJECTS} Makefile
-	@rm -f libsolver.a
-	@${AR} cru libsolver.a $(libsolver_OBJECTS)
-	@$(RANLIB) libsolver.a
-	@cp libsolver.a ${top_builddir}/lib/libsolver.a
-
-# rule to generate .d files
-
-$(addsuffix .d,$(libsolver_TARGETS)): %.d: ${srcdir}/%.c Makefile
-	@$(CCDEP) ${DEFS} ${DEPFLAGS} ${INCLUDES} $< > $@
-
-# rule to make dependencies
-
-dep: ${addsuffix .d, ${libsolver_TARGETS}}
-
-# rules to clean
-
-compile-clean: Makefile
-	rm -f ${$(addsuffix _OBJECTS, ${LIBRARIES})} *.d
-
-clean: compile-clean 
-	rm -f $(addsuffix .a, ${LIBRARIES})
-	rm -f ../lib/libsolver.a
-
-distclean: clean
-	rm -f Makefile
-
-
-.PHONY: all dep clean compile-clean distclean debug all-debug profile all-profile
diff --git a/src/lib/solver/Msap.c b/src/lib/solver/Msap.c
index 79e99489f..0a228eebe 100644
--- a/src/lib/solver/Msap.c
+++ b/src/lib/solver/Msap.c
@@ -48,7 +48,7 @@ void dummy_Di(spinor* const P, spinor* const Q, const int i) {
 #define _PTSWITCH(s) s
 #define _PSWITCH(s) s
 
-#include "M_plus_block_psi_body.c"
+#include "M_plus_block_psi_body.inc"
 
 #undef _PTSWITCH
 #undef _PSWITCH
@@ -58,7 +58,7 @@ void dummy_Di(spinor* const P, spinor* const Q, const int i) {
 // this is ugly!
 #define DUM_MATRIX 0
 
-#include "M_plus_block_psi_body.c"
+#include "M_plus_block_psi_body.inc"
 
 #undef _PTSWITCH
 #undef _PSWITCH
diff --git a/src/lib/solver/dfl_projector.c b/src/lib/solver/dfl_projector.c
index b840aabfa..5f0ce2026 100644
--- a/src/lib/solver/dfl_projector.c
+++ b/src/lib/solver/dfl_projector.c
@@ -453,7 +453,7 @@ void little_project(_Complex double *const out, _Complex double *const in, const
 #define _MPI_C_TYPE MPI_DOUBLE_COMPLEX
 #define _F_TYPE double
 
-#include "little_project_eo_body.c"
+#include "little_project_eo_body.inc"
 
 #undef _PSWITCH
 #undef _F_TYPE
@@ -465,7 +465,7 @@ void little_project(_Complex double *const out, _Complex double *const in, const
 #define _MPI_C_TYPE MPI_COMPLEX
 #define _F_TYPE float
 
-#include "little_project_eo_body.c"
+#include "little_project_eo_body.inc"
 
 #undef _PSWITCH
 #undef _F_TYPE
@@ -552,7 +552,7 @@ void little_P_L_D(_Complex double *const out, _Complex double *const in) {
 #define _PSWITCH(s) s
 #define _F_TYPE double
 
-#include "little_mg_precon_body.c"
+#include "little_mg_precon_body.inc"
 
 #undef _PSWITCH
 #undef _F_TYPE
@@ -560,7 +560,7 @@ void little_P_L_D(_Complex double *const out, _Complex double *const in) {
 #define _PSWITCH(s) s##_32
 #define _F_TYPE float
 
-#include "little_mg_precon_body.c"
+#include "little_mg_precon_body.inc"
 
 #undef _PSWITCH
 #undef _F_TYPE
diff --git a/src/lib/solver/fgmres4complex.c b/src/lib/solver/fgmres4complex.c
index 5d77f3ab0..83b8a72c1 100644
--- a/src/lib/solver/fgmres4complex.c
+++ b/src/lib/solver/fgmres4complex.c
@@ -53,7 +53,7 @@
 #define _PSWITCH(s) s
 #define _F_TYPE double
 
-#include "fgmres4complex_body.c"
+#include "fgmres4complex_body.inc"
 
 #undef _PSWITCH
 #undef _F_TYPE
@@ -61,7 +61,7 @@
 #define _PSWITCH(s) s##_32
 #define _F_TYPE float
 
-#include "fgmres4complex_body.c"
+#include "fgmres4complex_body.inc"
 
 #undef _PSWITCH
 #undef _F_TYPE
diff --git a/src/lib/solver/fgmres4complex_body.c b/src/lib/solver/fgmres4complex_body.inc
similarity index 100%
rename from src/lib/solver/fgmres4complex_body.c
rename to src/lib/solver/fgmres4complex_body.inc
diff --git a/src/lib/solver/gcr4complex.c b/src/lib/solver/gcr4complex.c
index e6019f404..4d394cd7f 100644
--- a/src/lib/solver/gcr4complex.c
+++ b/src/lib/solver/gcr4complex.c
@@ -41,7 +41,7 @@
 #define _C_TYPE _Complex double
 #define _F_TYPE double
 
-#include "gcr4complex_body.c"
+#include "gcr4complex_body.inc"
 
 #undef _PSWITCH
 #undef _PTSWITCH
@@ -53,7 +53,7 @@
 #define _C_TYPE _Complex float
 #define _F_TYPE float
 
-#include "gcr4complex_body.c"
+#include "gcr4complex_body.inc"
 
 #undef _PSWITCH
 #undef _PTSWITCH
diff --git a/src/lib/solver/gcr4complex_body.c b/src/lib/solver/gcr4complex_body.inc
similarity index 100%
rename from src/lib/solver/gcr4complex_body.c
rename to src/lib/solver/gcr4complex_body.inc
diff --git a/src/lib/solver/little_mg_precon_body.c b/src/lib/solver/little_mg_precon_body.inc
similarity index 100%
rename from src/lib/solver/little_mg_precon_body.c
rename to src/lib/solver/little_mg_precon_body.inc
diff --git a/src/lib/solver/little_project_eo_body.c b/src/lib/solver/little_project_eo_body.inc
similarity index 100%
rename from src/lib/solver/little_project_eo_body.c
rename to src/lib/solver/little_project_eo_body.inc
diff --git a/src/lib/solver/mr.c b/src/lib/solver/mr.c
index db6a60eb1..fed20f40b 100644
--- a/src/lib/solver/mr.c
+++ b/src/lib/solver/mr.c
@@ -111,7 +111,7 @@ int mr(spinor* const P, spinor* const Q, const int max_iter, const double eps_sq
 #define _PSWITCH(s) s
 #define _PTSWITCH(s) s
 
-#include "mrblk_body.c"
+#include "mrblk_body.inc"
 
 #undef _F_TYPE
 #undef _C_TYPE
@@ -123,7 +123,7 @@ int mr(spinor* const P, spinor* const Q, const int max_iter, const double eps_sq
 #define _PSWITCH(s) s##_32
 #define _PTSWITCH(s) s##32
 
-#include "mrblk_body.c"
+#include "mrblk_body.inc"
 
 #undef _F_TYPE
 #undef _C_TYPE
diff --git a/src/lib/solver/mrblk_body.c b/src/lib/solver/mrblk_body.inc
similarity index 100%
rename from src/lib/solver/mrblk_body.c
rename to src/lib/solver/mrblk_body.inc
diff --git a/src/lib/xchange/little_field_gather.c b/src/lib/xchange/little_field_gather.c
index 2821ddcd0..ae1a53abc 100644
--- a/src/lib/xchange/little_field_gather.c
+++ b/src/lib/xchange/little_field_gather.c
@@ -47,7 +47,7 @@ int waitcount = 0;
 #define _C_TYPE _Complex double
 #define _MPI_C_TYPE MPI_DOUBLE_COMPLEX
 
-#include "little_field_gather_body.c"
+#include "little_field_gather_body.inc"
 
 #undef _PSWITCH
 #undef _PTSWITCH
@@ -59,7 +59,7 @@ int waitcount = 0;
 #define _C_TYPE _Complex float
 #define _MPI_C_TYPE MPI_COMPLEX
 
-#include "little_field_gather_body.c"
+#include "little_field_gather_body.inc"
 
 #undef _PSWITCH
 #undef _PTSWITCH
diff --git a/src/lib/xchange/little_field_gather_body.c b/src/lib/xchange/little_field_gather_body.inc
similarity index 100%
rename from src/lib/xchange/little_field_gather_body.c
rename to src/lib/xchange/little_field_gather_body.inc
diff --git a/src/lib/xchange/xchange_field.c b/src/lib/xchange/xchange_field.c
index 417aa8981..9cdfdb7c9 100644
--- a/src/lib/xchange/xchange_field.c
+++ b/src/lib/xchange/xchange_field.c
@@ -105,11 +105,11 @@ void xchange_field(spinor* const l, const int ieo) {
     /* This is now depending on whether the field is */
     /* even or odd */
     if (ieo == 1) {
-      for (ix = 0; ix < T * LX * LY / 2; ix++) {
+      for (int ix = 0; ix < T * LX * LY / 2; ix++) {
         field_buffer_z[ix] = l[g_field_z_ipt_even[ix]];
       }
     } else {
-      for (ix = 0; ix < T * LX * LY / 2; ix++) {
+      for (int ix = 0; ix < T * LX * LY / 2; ix++) {
         field_buffer_z[ix] = l[g_field_z_ipt_odd[ix]];
       }
     }

From 31cc894d5cd90c1612f345cac2259b3f13eeeb97 Mon Sep 17 00:00:00 2001
From: Mathieu Taillefumier <mathieu.taillefumier@free.fr>
Date: Wed, 11 Mar 2026 15:01:30 +0100
Subject: [PATCH 20/80] More cleanup

---
 src/lib/CMakeLists.txt        | 38 +++++++----------------------
 src/lib/solver/gmres_precon.c | 46 ++++++++++++++++-------------------
 2 files changed, 30 insertions(+), 54 deletions(-)

diff --git a/src/lib/CMakeLists.txt b/src/lib/CMakeLists.txt
index eeeab9f70..b73401e4c 100644
--- a/src/lib/CMakeLists.txt
+++ b/src/lib/CMakeLists.txt
@@ -78,23 +78,18 @@ list(
   solver/chrono_guess.c
   solver/gcr4complex.c
   solver/jdher.c
-  # solver/gcr4complex_body.inc
   solver/gmres_dr.c
-  # solver/fgmres4complex_body.inc
   solver/cg_her_bi.c
   solver/solver_field.c
   solver/quicksort.c
   solver/bicgstab2.c
   solver/cgs_real.c
-  # solver/M_plus_block_psi_body.inc solver/little_mg_precon_body.inc
-  # solver/little_project_eo_body.inc
   solver/monomial_solve.c
   solver/cr.c
   solver/gram-schmidt.c
   solver/solver_types.c
   solver/cg_her.c
   solver/jdher_bi.c
-  # solver/mrblk_body.inc
   solver/eigcg.c
   solver/poly_precon.c
   solver/Msap.c
@@ -138,7 +133,6 @@ list(
   linalg/mul_r_gamma5.c
   linalg/convert_eo_to_lexic.c
   linalg/print_spinor.c
-  # linalg/assign_add_mul_body.inc
   linalg/mul_diff_mul_r.c
   linalg/square_norm_32.c
   linalg/mul.c
@@ -183,7 +177,6 @@ list(
   linalg/set_even_to_zero.c
   linalg/assign_mul_add.c
   linalg/square_and_prod_r.c
-  # linalg/scalar_prod_body.inc
   linalg/assign_mul_bra_add_mul_ket_add.c
   linalg/assign_add_mul_r_32.c
   linalg/scalar_prod.c
@@ -198,23 +191,16 @@ list(
   APPEND
   OPERATOR_SRC_C
   operator/clover_invert.c
-  # operator/hopping_body_dbl.c
   operator/tm_operators_nd_32.c
-  # operator/halfspinor_body.inc operator/Block_D_psi_body.inc
-  # operator/mul_one_pm_imu_sub_mul_body.inc
-  # operator/assign_mul_one_sw_pm_imu_site_lexic_body.inc
-  # operator/assign_mul_one_sw_pm_imu_inv_block_body.inc
   operator/clover_accumulate_deriv.c
   operator/Hopping_Matrix.c
   operator/tm_operators.c
   operator/tm_times_Hopping_Matrix.c
   operator/clovertm_operators_32.c
-  # operator/hopping_sgl.c
   operator/Dov_proj.c
   operator/clover_deriv.c
   operator/clover_det.c
   operator/clover_leaf.c
-  # operator/D_psi_body.inc
   operator/clovertm_operators.c
   operator/Dov_psi.c
   operator/tm_operators_nd.c
@@ -225,7 +211,6 @@ list(
   operator/D_psi.c
   operator/tm_operators_32.c
   operator/Hopping_Matrix_32.c)
-# operator/halfspinor_body_32.c operator/mul_one_pm_imu_inv_body.inc)
 
 list(
   APPEND
@@ -249,7 +234,6 @@ list(
   smearing/ape_ape_smear.c
   smearing/uils_print_config_to_screen.c
   smearing/utils_project_antiherm.c)
-# smearing/utils_print_config_to_screen.c smearing/utils_reunitarize_MILC.c)
 
 list(
   APPEND
@@ -295,11 +279,9 @@ list(
   xchange/xchange_2fields.c
   xchange/xchange_gauge.c
   xchange/xchange_halffield.c
-  # xchange/xchange_jacobi.c xchange/little_field_gather_body.inc
   xchange/little_field_gather.c
   xchange/xchange_deri.c
   xchange/xchange_field.c)
-# xchange/xchange_field_tslice.c)
 
 list(
   APPEND
@@ -315,7 +297,6 @@ list(
 list(
   APPEND
   MAIN_SRC_C
-  # cu/cu.c
   measure_gauge_action.c
   start.c
   deriv_Sb.c
@@ -332,7 +313,6 @@ list(
   get_rectangle_staples.c
   rnd_gauge_trafo.c
   measure_rectangles.c
-  # invert.c
   deriv_Sb_D_psi.c
   mpi_init.c
   update_momenta_fg.c
@@ -359,12 +339,10 @@ list(
   aligned_malloc.c
   fatal_error.c
   operator.c
-  # cu/cu.c chebyshev_polynomial.c qphix_test_Dslash.c
   expo.c
   overrelaxation.c
   Ptilde_nd.c
   update_gauge.c
-  # hopping_test.c
   integrator.c)
 
 list(APPEND TEST_SRC_C test/check_xchange.c test/check_geometry.c
@@ -456,20 +434,22 @@ target_include_directories(
          $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
          $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>)
 
-install(TARGETS hmc
-EXPORT tmlqcd_targets
-LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}")
+install(
+  TARGETS hmc
+  EXPORT tmlqcd_targets
+  LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}")
 
-install(EXPORT tmlqcd_targets
+install(
+  EXPORT tmlqcd_targets
   FILE tmlQCDTargets.cmake
   NAMESPACE tmlQCD::
   DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}")
 
-install(DIRECTORY ${CMAKE_SOURCE_DIR}/src/lib/include
+install(
+  DIRECTORY ${CMAKE_SOURCE_DIR}/src/lib/include
   DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME}"
   FILES_MATCHING
   PATTERN "*.h")
 
 install(FILES "${CMAKE_BINARY_DIR}/tmlqcd_config_internal.h"
-  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME}"
-)
+        DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME}")
diff --git a/src/lib/solver/gmres_precon.c b/src/lib/solver/gmres_precon.c
index 9497b8b14..d92e26466 100644
--- a/src/lib/solver/gmres_precon.c
+++ b/src/lib/solver/gmres_precon.c
@@ -307,33 +307,29 @@ complex scalar_prod_nocom(spinor *const S, spinor *const R, const int N) {
 }
 
 double square_norm_nocom(spinor *const P, const int N) {
-  int ix;
-  static double ks, kc, ds, tr, ts, tt;
-  spinor *s;
-
-  ks = 0.0;
-  kc = 0.0;
+  double ks = 0.0;
+  double kc = 0.0;
 
   /* Change due to even-odd preconditioning : VOLUME   to VOLUME/2 */
-  for (ix = 0; ix < N; ix++) {
-    s = P + ix;
-
-    ds = (*s).s0.c0.re * (*s).s0.c0.re + (*s).s0.c0.im * (*s).s0.c0.im +
-         (*s).s0.c1.re * (*s).s0.c1.re + (*s).s0.c1.im * (*s).s0.c1.im +
-         (*s).s0.c2.re * (*s).s0.c2.re + (*s).s0.c2.im * (*s).s0.c2.im +
-         (*s).s1.c0.re * (*s).s1.c0.re + (*s).s1.c0.im * (*s).s1.c0.im +
-         (*s).s1.c1.re * (*s).s1.c1.re + (*s).s1.c1.im * (*s).s1.c1.im +
-         (*s).s1.c2.re * (*s).s1.c2.re + (*s).s1.c2.im * (*s).s1.c2.im +
-         (*s).s2.c0.re * (*s).s2.c0.re + (*s).s2.c0.im * (*s).s2.c0.im +
-         (*s).s2.c1.re * (*s).s2.c1.re + (*s).s2.c1.im * (*s).s2.c1.im +
-         (*s).s2.c2.re * (*s).s2.c2.re + (*s).s2.c2.im * (*s).s2.c2.im +
-         (*s).s3.c0.re * (*s).s3.c0.re + (*s).s3.c0.im * (*s).s3.c0.im +
-         (*s).s3.c1.re * (*s).s3.c1.re + (*s).s3.c1.im * (*s).s3.c1.im +
-         (*s).s3.c2.re * (*s).s3.c2.re + (*s).s3.c2.im * (*s).s3.c2.im;
-
-    tr = ds + kc;
-    ts = tr + ks;
-    tt = ts - ks;
+  for (int ix = 0; ix < N; ix++) {
+    spinor *s = P + ix;
+
+    double ds = (*s).s0.c0.re * (*s).s0.c0.re + (*s).s0.c0.im * (*s).s0.c0.im +
+                (*s).s0.c1.re * (*s).s0.c1.re + (*s).s0.c1.im * (*s).s0.c1.im +
+                (*s).s0.c2.re * (*s).s0.c2.re + (*s).s0.c2.im * (*s).s0.c2.im +
+                (*s).s1.c0.re * (*s).s1.c0.re + (*s).s1.c0.im * (*s).s1.c0.im +
+                (*s).s1.c1.re * (*s).s1.c1.re + (*s).s1.c1.im * (*s).s1.c1.im +
+                (*s).s1.c2.re * (*s).s1.c2.re + (*s).s1.c2.im * (*s).s1.c2.im +
+                (*s).s2.c0.re * (*s).s2.c0.re + (*s).s2.c0.im * (*s).s2.c0.im +
+                (*s).s2.c1.re * (*s).s2.c1.re + (*s).s2.c1.im * (*s).s2.c1.im +
+                (*s).s2.c2.re * (*s).s2.c2.re + (*s).s2.c2.im * (*s).s2.c2.im +
+                (*s).s3.c0.re * (*s).s3.c0.re + (*s).s3.c0.im * (*s).s3.c0.im +
+                (*s).s3.c1.re * (*s).s3.c1.re + (*s).s3.c1.im * (*s).s3.c1.im +
+                (*s).s3.c2.re * (*s).s3.c2.re + (*s).s3.c2.im * (*s).s3.c2.im;
+
+    double tr = ds + kc;
+    double ts = tr + ks;
+    double tt = ts - ks;
     ks = ts;
     kc = tr - tt;
   }

From c48d6cf395c07659da9460e2a85c573f94d250ca Mon Sep 17 00:00:00 2001
From: Mathieu Taillefumier <mathieu.taillefumier@free.fr>
Date: Fri, 13 Mar 2026 07:24:07 +0100
Subject: [PATCH 21/80] More cleanup

---
 src/bin/deriv_mg_tune.c              |   8 ++-
 src/bin/hmc_tm.c                     |  14 ++--
 src/bin/offline_measurement.c        |  53 +++++++-------
 src/lib/CMakeLists.txt               |   5 +-
 src/lib/DDalphaAMG_interface.c       |   4 +-
 src/lib/init/init_gauge_field.c      |   1 -
 src/lib/init/init_stout_smear_vars.c |  29 ++++----
 src/lib/quda_interface.c             |   6 +-
 src/lib/solver/gmres_precon.c        | 101 +++++++++++++++++++++------
 src/lib/solver/jdher.c               |   2 +-
 src/lib/xchange/xchange_field.c      |  52 ++++----------
 src/lib/xchange/xchange_halffield.c  |  20 +++---
 src/lib/xchange/xchange_lexicfield.c |   9 ---
 13 files changed, 160 insertions(+), 144 deletions(-)

diff --git a/src/bin/deriv_mg_tune.c b/src/bin/deriv_mg_tune.c
index 7c45524de..0fd6ec529 100644
--- a/src/bin/deriv_mg_tune.c
+++ b/src/bin/deriv_mg_tune.c
@@ -387,11 +387,11 @@ static void process_args(int argc, char *argv[], char **input_filename, char **f
   while ((c = getopt(argc, argv, "h?vVf:o:m:")) != -1) {
     switch (c) {
       case 'f':
-        *input_filename = calloc(200, sizeof(char));
+        *input_filename = calloc(256, sizeof(char));
         strncpy(*input_filename, optarg, 200);
         break;
       case 'o':
-        *filename = calloc(200, sizeof(char));
+        *filename = calloc(256, sizeof(char));
         strncpy(*filename, optarg, 200);
         break;
       case 'v':
@@ -425,12 +425,14 @@ static void process_args(int argc, char *argv[], char **input_filename, char **f
 
 static void set_default_filenames(char **input_filename, char **filename) {
   if (*input_filename == NULL) {
-    *input_filename = calloc(13, sizeof(char));
+    *input_filename = calloc(16, sizeof(char));
     strcpy(*input_filename, "hmc.input");
+    (*input_filename)[9] = '\0';
   }
 
   if (*filename == NULL) {
     *filename = calloc(7, sizeof(char));
     strcpy(*filename, "output");
+    (*filename)[6] = '\0';
   }
 }
diff --git a/src/bin/hmc_tm.c b/src/bin/hmc_tm.c
index b68a5250f..017ab9cb3 100644
--- a/src/bin/hmc_tm.c
+++ b/src/bin/hmc_tm.c
@@ -85,11 +85,11 @@ static void set_default_filenames(char **input_filename, char **filename);
 int main(int argc, char *argv[]) {
   FILE *parameterfile = NULL, *countfile = NULL;
   char *filename = NULL;
-  char datafilename[206];
-  char parameterfilename[206];
-  char gauge_filename[50];
-  char nstore_filename[50];
-  char tmp_filename[50];
+  char datafilename[256];
+  char parameterfilename[256];
+  char gauge_filename[64];
+  char nstore_filename[64];
+  char tmp_filename[64];
   char *input_filename = NULL;
   int status = 0, accept = 0;
   int j, ix, mu, trajectory_counter = 0;
@@ -611,11 +611,11 @@ static void process_args(int argc, char *argv[], char **input_filename, char **f
   while ((c = getopt(argc, argv, "h?vVf:o:m:")) != -1) {
     switch (c) {
       case 'f':
-        *input_filename = calloc(200, sizeof(char));
+        *input_filename = calloc(256, sizeof(char));
         strncpy(*input_filename, optarg, 200);
         break;
       case 'o':
-        *filename = calloc(200, sizeof(char));
+        *filename = calloc(256, sizeof(char));
         strncpy(*filename, optarg, 200);
         break;
       case 'v':
diff --git a/src/bin/offline_measurement.c b/src/bin/offline_measurement.c
index b6cbc13fa..667dce3f9 100644
--- a/src/bin/offline_measurement.c
+++ b/src/bin/offline_measurement.c
@@ -73,9 +73,9 @@ static void set_default_filenames(char **input_filename, char **filename);
 
 int main(int argc, char *argv[]) {
   FILE *parameterfile = NULL;
-  int j, i;
-  char datafilename[206];
-  char parameterfilename[206];
+  int err;
+  char datafilename[256];
+  char parameterfilename[256];
   char conf_filename[CONF_FILENAME_LENGTH];
   char *input_filename = NULL;
   char *filename = NULL;
@@ -115,7 +115,7 @@ int main(int argc, char *argv[]) {
   /* we need to make sure that we don't have even_odd_flag = 1 */
   /* if any of the operators doesn't use it                    */
   /* in this way even/odd can still be used by other operators */
-  for (j = 0; j < no_operators; j++)
+  for (int j = 0; j < no_operators; j++)
     if (!operator_list[j].even_odd_flag) even_odd_flag = 0;
 
 #ifndef TM_USE_MPI
@@ -123,43 +123,42 @@ int main(int argc, char *argv[]) {
 #endif
 
 #ifdef TM_USE_GAUGE_COPY
-  j = init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 1);
+  err = init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 1);
 #else
-  j = init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 0);
+  err = init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 0);
 #endif
-  if (j != 0) {
+  if (err != 0) {
     fprintf(stderr, "Not enough memory for gauge_fields! Aborting...\n");
     exit(-1);
   }
-  j = init_geometry_indices(VOLUMEPLUSRAND + g_dbw2rand);
-  if (j != 0) {
+  if (init_geometry_indices(VOLUMEPLUSRAND + g_dbw2rand) != 0) {
     fprintf(stderr, "Not enough memory for geometry indices! Aborting...\n");
     exit(-1);
   }
   if (no_monomials > 0) {
     if (even_odd_flag) {
-      j = init_monomials(VOLUMEPLUSRAND / 2, even_odd_flag);
+      err = init_monomials(VOLUMEPLUSRAND / 2, even_odd_flag);
     } else {
-      j = init_monomials(VOLUMEPLUSRAND, even_odd_flag);
+      err = init_monomials(VOLUMEPLUSRAND, even_odd_flag);
     }
-    if (j != 0) {
+    if (err != 0) {
       fprintf(stderr, "Not enough memory for monomial pseudo fermion fields! Aborting...\n");
       exit(-1);
     }
   }
   if (even_odd_flag) {
-    j = init_spinor_field(VOLUMEPLUSRAND / 2, NO_OF_SPINORFIELDS);
+    err = init_spinor_field(VOLUMEPLUSRAND / 2, NO_OF_SPINORFIELDS);
   } else {
-    j = init_spinor_field(VOLUMEPLUSRAND, NO_OF_SPINORFIELDS);
+    err = init_spinor_field(VOLUMEPLUSRAND, NO_OF_SPINORFIELDS);
   }
-  if (j != 0) {
+  if (err != 0) {
     fprintf(stderr, "Not enough memory for spinor fields! Aborting...\n");
     exit(-1);
   }
 
   if (g_running_phmc) {
-    j = init_chi_spinor_field(VOLUMEPLUSRAND / 2, 20);
-    if (j != 0) {
+    err = init_chi_spinor_field(VOLUMEPLUSRAND / 2, 20);
+    if (err != 0) {
       fprintf(stderr, "Not enough memory for PHMC Chi fields! Aborting...\n");
       exit(-1);
     }
@@ -206,14 +205,12 @@ int main(int argc, char *argv[]) {
 
   /* this could be maybe moved to init_operators */
 #ifdef TM_USE_HALFSPINOR
-  j = init_dirac_halfspinor();
-  if (j != 0) {
+  if (init_dirac_halfspinor() != 0) {
     fprintf(stderr, "Not enough memory for halffield! Aborting...\n");
     exit(-1);
   }
   if (g_sloppy_precision_flag == 1) {
-    j = init_dirac_halfspinor32();
-    if (j != 0) {
+    if (init_dirac_halfspinor32() != 0) {
       fprintf(stderr, "Not enough memory for 32-bit halffield! Aborting...\n");
       exit(-1);
     }
@@ -223,12 +220,12 @@ int main(int argc, char *argv[]) {
 #endif
 #endif
 
-  for (j = 0; j < Nmeas; j++) {
+  for (int j = 0; j < Nmeas; j++) {
     int n_written =
         snprintf(conf_filename, CONF_FILENAME_LENGTH, "%s.%.4d", gauge_input_filename, nstore);
     if (n_written < 0 || n_written > CONF_FILENAME_LENGTH) {
-      char error_message[500];
-      snprintf(error_message, 500,
+      char error_message[512];
+      snprintf(error_message, 512,
                "Encoding error or gauge configuration filename "
                "longer than %d characters! See offline_measurement.c CONF_FILENAME_LENGTH\n",
                CONF_FILENAME_LENGTH);
@@ -239,8 +236,8 @@ int main(int argc, char *argv[]) {
              (gauge_precision_read_flag == 32 ? "single" : "double"));
       fflush(stdout);
     }
-    if ((i = read_gauge_field(conf_filename, g_gauge_field)) != 0) {
-      fprintf(stderr, "Error %d while reading gauge field from %s\n Aborting...\n", i,
+    if ((err = read_gauge_field(conf_filename, g_gauge_field)) != 0) {
+      fprintf(stderr, "Error %d while reading gauge field from %s\n Aborting...\n", err,
               conf_filename);
       exit(-2);
     }
@@ -321,11 +318,11 @@ static void usage(const tm_ExitCode_t exit_code) {
 }
 
 static void process_args(int argc, char *argv[], char **input_filename, char **filename) {
-  int c;
+  int c = 0;
   while ((c = getopt(argc, argv, "h?vVf:o:")) != -1) {
     switch (c) {
       case 'f':
-        *input_filename = calloc(200, sizeof(char));
+        *input_filename = calloc(256, sizeof(char));
         strncpy(*input_filename, optarg, 200);
         break;
       case 'v':
diff --git a/src/lib/CMakeLists.txt b/src/lib/CMakeLists.txt
index b73401e4c..700dabdf4 100644
--- a/src/lib/CMakeLists.txt
+++ b/src/lib/CMakeLists.txt
@@ -65,7 +65,7 @@ list(
   init/init_gauge_tmp.c
   init/init_critical_globals.c
   init/init_omp_accumulators.c
-  # init/init_stout_smear_vars.c
+  init/init_stout_smear_vars.c
   init/init_moment_field.c)
 
 list(
@@ -74,7 +74,8 @@ list(
   solver/bicg_complex.c
   solver/dfl_projector.c
   solver/gcr.c
-  # solver/gmres_precon.c
+  # this file is not used anywhere at the moment
+  #  solver/gmres_precon.c
   solver/chrono_guess.c
   solver/gcr4complex.c
   solver/jdher.c
diff --git a/src/lib/DDalphaAMG_interface.c b/src/lib/DDalphaAMG_interface.c
index bf2da4bef..5c49ddc66 100644
--- a/src/lib/DDalphaAMG_interface.c
+++ b/src/lib/DDalphaAMG_interface.c
@@ -542,8 +542,8 @@ static int MG_solve_nd(spinor *up_new, spinor *dn_new, spinor *const up_old, spi
   int init_guess = 0;
   spinor *old1 = up_old;
   spinor *old2 = dn_old;
-  spinor *new1 = up_new, *new1tmp;
-  spinor *new2 = dn_new, *new2tmp;
+  spinor *new1 = up_new, *new1tmp = NULL;
+  spinor *new2 = dn_new, *new2tmp = NULL;
   spinor **solver_field = NULL, **oe_solver_field = NULL;
   int no_solver_field = 0;
 
diff --git a/src/lib/init/init_gauge_field.c b/src/lib/init/init_gauge_field.c
index 1ad4463a8..a83e66dfd 100644
--- a/src/lib/init/init_gauge_field.c
+++ b/src/lib/init/init_gauge_field.c
@@ -34,7 +34,6 @@ su3* gauge_field_copy = NULL;
 su3_32* gauge_field_copy_32 = NULL;
 
 int init_gauge_field(const int V, const int back) {
-  int i = 0;
   g_gauge_field_copy = NULL;
 
   if (g_exposu3_no_c == 0) init_exposu3();
diff --git a/src/lib/init/init_stout_smear_vars.c b/src/lib/init/init_stout_smear_vars.c
index e1c64f75a..92b5889a5 100644
--- a/src/lib/init/init_stout_smear_vars.c
+++ b/src/lib/init/init_stout_smear_vars.c
@@ -27,7 +27,6 @@
 #include "expo.h"
 #include "global.h"
 #include "init_stout_smear_vars.h"
-#include "sse.h"
 #include "su3.h"
 
 su3* gauge_field_saved;
@@ -83,11 +82,11 @@ int init_stout_smear_vars(const int V, const int stout_no_iter) {
   printf("Running init_stout_smear_vars\n");
   const int dim = 4;
 
-  int i, k, x, mu;
+  /* int i, k, x, mu; */
 
-  i = 0;
-  k = 0;
-  mu = 0;
+  /* i = 0; */
+  /* k = 0; */
+  /* mu = 0; */
 
   if (g_exposu3_no_c == 0) init_exposu3();
 
@@ -107,7 +106,7 @@ int init_stout_smear_vars(const int V, const int stout_no_iter) {
 
   g_gauge_field_smeared[0] = gauge_field_smeared;
 
-  for (x = 1; x < V; x++) {
+  for (int x = 1; x < V; x++) {
     g_gauge_field_smeared[x] = g_gauge_field_smeared[x - 1] + 4;
   }
 
@@ -126,7 +125,7 @@ int init_stout_smear_vars(const int V, const int stout_no_iter) {
 
   g_gauge_field_saved[0] = gauge_field_saved;
 
-  for (x = 1; x < V; x++) {
+  for (int x = 1; x < V; x++) {
     g_gauge_field_saved[x] = g_gauge_field_saved[x - 1] + 4;
   }
 
@@ -145,7 +144,7 @@ int init_stout_smear_vars(const int V, const int stout_no_iter) {
 
   g_C_smearing[0] = C_smearing;
 
-  for (x = 1; x < V; x++) {
+  for (int x = 1; x < V; x++) {
     g_C_smearing[x] = g_C_smearing[x - 1] + 4;
   }
 
@@ -164,7 +163,7 @@ int init_stout_smear_vars(const int V, const int stout_no_iter) {
 
   g_Q_smearing[0] = Q_smearing;
 
-  for (x = 1; x < V; x++) {
+  for (int x = 1; x < V; x++) {
     g_Q_smearing[x] = g_Q_smearing[x - 1] + 4;
   }
 
@@ -183,7 +182,7 @@ int init_stout_smear_vars(const int V, const int stout_no_iter) {
 
   g_Q_squared_smearing[0] = Q_squared_smearing;
 
-  for (x = 1; x < V; x++) {
+  for (int x = 1; x < V; x++) {
     g_Q_squared_smearing[x] = g_Q_squared_smearing[x - 1] + 4;
   }
 
@@ -211,7 +210,7 @@ int init_stout_smear_vars(const int V, const int stout_no_iter) {
   g_B1_smearing[0] = B1_smearing;
   g_B2_smearing[0] = B2_smearing;
 
-  for (x = 1; x < V; x++) {
+  for (int x = 1; x < V; x++) {
     g_B1_smearing[x] = g_B1_smearing[x - 1] + 4;
     g_B2_smearing[x] = g_B2_smearing[x - 1] + 4;
   }
@@ -231,7 +230,7 @@ int init_stout_smear_vars(const int V, const int stout_no_iter) {
 
   g_Gamma_smearing[0] = Gamma_smearing;
 
-  for (x = 1; x < V; x++) {
+  for (int x = 1; x < V; x++) {
     g_Gamma_smearing[x] = g_Gamma_smearing[x - 1] + 4;
   }
 
@@ -250,7 +249,7 @@ int init_stout_smear_vars(const int V, const int stout_no_iter) {
 
   g_Lambda_smearing[0] = Lambda_smearing;
 
-  for (x = 1; x < V; x++) {
+  for (int x = 1; x < V; x++) {
     g_Lambda_smearing[x] = g_Lambda_smearing[x - 1] + 4;
   }
 
@@ -366,7 +365,7 @@ int init_stout_smear_vars(const int V, const int stout_no_iter) {
 
   g_stout_force_field[0] = stout_force_field;
 
-  for (x = 1; x < V; x++) {
+  for (int x = 1; x < V; x++) {
     g_stout_force_field[x] = g_stout_force_field[x - 1] + 4;
   }
 
@@ -386,7 +385,7 @@ int init_stout_smear_vars(const int V, const int stout_no_iter) {
 
   g_previous_stout_force_field[0] = previous_stout_force_field;
 
-  for (x = 1; x < V; x++) {
+  for (int x = 1; x < V; x++) {
     g_previous_stout_force_field[x] = g_previous_stout_force_field[x - 1] + 4;
   }
 
diff --git a/src/lib/quda_interface.c b/src/lib/quda_interface.c
index e527dbba7..674b84677 100644
--- a/src/lib/quda_interface.c
+++ b/src/lib/quda_interface.c
@@ -2511,7 +2511,7 @@ void compute_gauge_derivative_quda(monomial *const mnl, hamiltonian_field_t *con
 
   const int rect = mnl->use_rectangles;
 
-  const int *path_length = rect ? plaq_rect_length : plaq_length;
+  const int *path_length = ((rect) ? (plaq_rect_length) : (plaq_length));
 
   const int num_paths = rect ? 24 : 6;
   const int max_length = rect ? 5 : 3;
@@ -2546,7 +2546,7 @@ void compute_gauge_derivative_quda(monomial *const mnl, hamiltonian_field_t *con
   reset_quda_gauge_state(&quda_gauge_state);
 
   tm_stopwatch_push(&g_timers, "computeGaugeForceQuda", "");
-  computeGaugeForceQuda((void *)mom_quda, (void *)gauge_quda, path_buf, path_length, loop_coeff,
+  computeGaugeForceQuda((void *)mom_quda, (void *)gauge_quda, path_buf, (int *)path_length, loop_coeff,
                         num_paths, max_length, 1.0, &f_gauge_param);
   tm_stopwatch_pop(&g_timers, 0, 1, "TM_QUDA");
 
@@ -2907,7 +2907,7 @@ void quda_mg_tune_params(void *spinorOut, void *spinorIn, const int max_iter) {
   int cur_tuning_lvl = mg_n_level - 1;
   int cur_lvl_tuning_steps = get_lvl_tuning_steps(&quda_mg_tuning_plan, cur_tuning_lvl);
   int steps_done_in_cur_dir = 0;
-  int i = 0;
+  int i = 0; 
   tm_QudaMGTuningDirection_t cur_tuning_dir = TM_MG_TUNE_MU_FACTOR;
 
   // when tuning over multiple configurations, we tune on the first config based
diff --git a/src/lib/solver/gmres_precon.c b/src/lib/solver/gmres_precon.c
index d92e26466..de84eff0d 100644
--- a/src/lib/solver/gmres_precon.c
+++ b/src/lib/solver/gmres_precon.c
@@ -44,6 +44,7 @@
 #ifdef HAVE_CONFIG_H
 #include <tmlqcd_config.h>
 #endif
+#include <complex.h>
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -75,7 +76,6 @@ static double *s;
 
 int gmres_precon(spinor *const P, spinor *const Q, const int m, const int max_restarts,
                  const double eps_sq, const int rel_prec, const int N, matrix_mult f) {
-  int restart, i, j, k;
   double beta, eps, norm;
   complex tmp1, tmp2;
   spinor **solver_field = NULL;
@@ -93,22 +93,22 @@ int gmres_precon(spinor *const P, spinor *const Q, const int m, const int max_re
 
   /*   assign(solver_field[1], P, N); */
   zero_spinor_field(solver_field[1], N);
-  for (restart = 0; restart < max_restarts; restart++) {
+  for (int restart = 0; restart < max_restarts; restart++) {
     /* r_0=Q-AP  (b=Q, x+0=P) */
     f(solver_field[1], solver_field[1]);
     diff(solver_field[1], Q, solver_field[1], N);
     /* v_0=r_0/||r_0|| */
-    alpha[0].re = sqrt(square_norm(solver_field[1], N, 1));
+    alpha[0] = sqrt(square_norm(solver_field[1], N, 1)) + 0.0I;
 
     /*     if(alpha[0].re == 0.){  */
     /*        assign(P, solver_field[1], N);  */
     /*        return(restart*m);  */
     /*     }  */
 
-    if (alpha[0].re != 0.) {
-      mul_r(V[0], 1. / alpha[0].re, solver_field[1], N);
+    if (creal(alpha[0]) != 0.) {
+      mul_r(V[0], 1. / creal(alpha[0]), solver_field[1], N);
 
-      for (j = 0; j < m; j++) {
+      for (int j = 0; j < m; j++) {
         /* solver_field[1]=A*v_j */
 
         f(solver_field[1], V[j]);
@@ -116,13 +116,13 @@ int gmres_precon(spinor *const P, spinor *const Q, const int m, const int max_re
         /* Set h_ij and omega_j */
         /* solver_field[0] <- omega_j */
         assign(solver_field[0], solver_field[1], N);
-        for (i = 0; i <= j; i++) {
+        for (int i = 0; i <= j; i++) {
           H[i][j] = scalar_prod(V[i], solver_field[0], N, 1);
           assign_diff_mul(solver_field[0], V[i], H[i][j], N);
         }
 
         _complex_set(H[j + 1][j], sqrt(square_norm(solver_field[0], N, 1)), 0.);
-        for (i = 0; i < j; i++) {
+        for (int i = 0; i < j; i++) {
           tmp1 = H[i][j];
           tmp2 = H[i + 1][j];
           _mult_real(H[i][j], tmp2, s[i]);
@@ -133,7 +133,7 @@ int gmres_precon(spinor *const P, spinor *const Q, const int m, const int max_re
 
         /* Set beta, s, c, alpha[j],[j+1] */
         beta = sqrt(_complex_square_norm(H[j][j]) + _complex_square_norm(H[j + 1][j]));
-        s[j] = H[j + 1][j].re / beta;
+        s[j] = creal(H[j + 1][j]) / beta;
         _mult_real(c[j], H[j][j], 1. / beta);
         _complex_set(H[j][j], beta, 0.);
         _mult_real(alpha[j + 1], alpha[j], s[j]);
@@ -146,19 +146,19 @@ int gmres_precon(spinor *const P, spinor *const Q, const int m, const int max_re
                  alpha[j + 1].re * alpha[j + 1].re);
           fflush(stdout);
         }
-        if (((alpha[j + 1].re <= eps) && (rel_prec == 0)) ||
-            ((alpha[j + 1].re <= eps * norm) && (rel_prec == 1))) {
-          _mult_real(alpha[j], alpha[j], 1. / H[j][j].re);
+        if (((creal(alpha[j + 1]) <= eps) && (rel_prec == 0)) ||
+            ((creal(alpha[j + 1]) <= eps * norm) && (rel_prec == 1))) {
+          _mult_real(alpha[j], alpha[j], 1. / creal(H[j][j]));
           assign_add_mul(solver_field[1], V[j], alpha[j], N);
-          for (i = j - 1; i >= 0; i--) {
-            for (k = i + 1; k <= j; k++) {
+          for (int i = j - 1; i >= 0; i--) {
+            for (int k = i + 1; k <= j; k++) {
               _mult_assign_complex(tmp1, H[i][k], alpha[k]);
               _diff_complex(alpha[i], tmp1);
             }
-            _mult_real(alpha[i], alpha[i], 1. / H[i][i].re);
+            _mult_real(alpha[i], alpha[i], 1. / creal(H[i][i]));
             assign_add_mul(solver_field[1], V[i], alpha[i], N);
           }
-          for (i = 0; i < m; i++) {
+          for (int i = 0; i < m; i++) {
             alpha[i].im = 0.;
           }
           assign(P, solver_field[1], N);
@@ -168,24 +168,24 @@ int gmres_precon(spinor *const P, spinor *const Q, const int m, const int max_re
         /* if not */
         else {
           if (j != m - 1) {
-            mul_r(V[(j + 1)], 1. / H[j + 1][j].re, solver_field[0], N);
+            mul_r(V[(j + 1)], 1. / creal(H[j + 1][j]), solver_field[0], N);
           }
         }
       }
 
       j = m - 1;
       /* prepare for restart */
-      _mult_real(alpha[j], alpha[j], 1. / H[j][j].re);
+      _mult_real(alpha[j], alpha[j], 1. / creal(H[j][j]));
       assign_add_mul(solver_field[1], V[j], alpha[j], N);
-      for (i = j - 1; i >= 0; i--) {
-        for (k = i + 1; k <= j; k++) {
+      for (int i = j - 1; i >= 0; i--) {
+        for (int k = i + 1; k <= j; k++) {
           _mult_assign_complex(tmp1, H[i][k], alpha[k]);
           _diff_complex(alpha[i], tmp1);
         }
-        _mult_real(alpha[i], alpha[i], 1. / H[i][i].re);
+        _mult_real(alpha[i], alpha[i], 1. / creal(H[i][i]));
         assign_add_mul(solver_field[1], V[i], alpha[i], N);
       }
-      for (i = 0; i < m; i++) {
+      for (int i = 0; i < m; i++) {
         alpha[i].im = 0.;
       }
     }
@@ -234,6 +234,7 @@ static void init_pgmres(const int _M, const int _V) {
 }
 
 complex scalar_prod_nocom(spinor *const S, spinor *const R, const int N) {
+#ifdef __STDC_NO_COMPLEX__
   int ix;
   static double ks, kc, ds, tr, ts, tt;
   spinor *s, *r;
@@ -304,12 +305,43 @@ complex scalar_prod_nocom(spinor *const S, spinor *const R, const int N) {
 
   c.im = kc;
   return (c);
+#else
+  double complex ks, kc;
+
+  ks = 0.0 + 0.0I;
+  kc = 0.0 + 0.0I;
+  for (int ix = 0; ix < N; ix++) {
+    spinor *s = (spinor *)S + ix;
+    spinor *r = (spinor *)R + ix;
+    double complex ds = conj((*s).s0.c0) * (*r).s0.c0 + /* site term: conj(S) * R */
+      conj((*s).s0.c1) * (*r).s0.c1 +
+      conj((*s).s0.c2) * (*r).s0.c2 +
+      conj((*s).s1.c0) * (*r).s1.c0 +
+      conj((*s).s1.c1) * (*r).s1.c1 +
+      conj((*s).s1.c2) * (*r).s1.c2 +
+      conj((*s).s2.c0) * (*r).s2.c0 +
+      conj((*s).s2.c1) * (*r).s2.c1 +
+      conj((*s).s2.c2) * (*r).s2.c2 +
+      conj((*s).s3.c0) * (*r).s3.c0 +
+      conj((*s).s3.c1) * (*r).s3.c1 +
+      conj((*s).s3.c2) * (*r).s3.c2;
+
+    double complex tr = ds + kc;
+    double complex ts = tr + ks;
+    double complex tt = ts - ks;
+    ks = ts;
+    kc = tr - tt;
+  }
+  return  ks + kc;
+#endif
 }
 
+
 double square_norm_nocom(spinor *const P, const int N) {
   double ks = 0.0;
   double kc = 0.0;
 
+#ifdef __STDC_NO_COMPLEX__
   /* Change due to even-odd preconditioning : VOLUME   to VOLUME/2 */
   for (int ix = 0; ix < N; ix++) {
     spinor *s = P + ix;
@@ -334,5 +366,30 @@ double square_norm_nocom(spinor *const P, const int N) {
     kc = tr - tt;
   }
   kc = ks + kc;
+#else
+  /* Change due to even-odd preconditioning : VOLUME   to VOLUME/2 */
+  for (int ix = 0; ix < N; ix++) {
+    spinor *s = P + ix;
+    double complex ds =  (*s).s0.c0 * conj((*s).s0.c0) +
+      (*s).s0.c1 * conj((*s).s0.c1) +
+      (*s).s0.c2 * conj((*s).s0.c2) +
+      (*s).s1.c0 * conj((*s).s1.c0) +
+      (*s).s1.c1 * conj((*s).s1.c1) +
+      (*s).s1.c2 * conj((*s).s1.c2) +
+      (*s).s2.c0 * conj((*s).s2.c0) +
+      (*s).s2.c1 * conj((*s).s2.c1) +
+      (*s).s2.c2 * conj((*s).s2.c2) +
+      (*s).s3.c0 * conj((*s).s3.c0) +
+      (*s).s3.c1 * conj((*s).s3.c1) +
+      (*s).s3.c2 * conj((*s).s3.c2);
+    
+    double tr = creal(ds) + kc;
+    double ts = tr + ks;
+    double tt = ts - ks;
+    ks = ts;
+    kc = tr - tt;
+  }
+  kc = ks + kc; /* fold the compensation term into the running sum, as the other branch does */
+#endif
   return kc;
 }
diff --git a/src/lib/solver/jdher.c b/src/lib/solver/jdher.c
index c3d874781..bbe25d11a 100644
--- a/src/lib/solver/jdher.c
+++ b/src/lib/solver/jdher.c
@@ -127,7 +127,7 @@ void jdher(int n, int lda, double tau, double tol, int kmax, int jmax, int jmin,
    * initialize with NULL, so we can free even unallocated ptrs */
   double *s = NULL, *resnrm = NULL, *resnrm_old = NULL, *dtemp = NULL, *rwork = NULL;
 
-  _Complex double *V_ = NULL, *V, *Vtmp = NULL, *U = NULL, *M = NULL, *Z = NULL, *Res_ = NULL, *Res,
+  _Complex double *V = NULL, *Vtmp = NULL, *U = NULL, *M = NULL, *Z = NULL, *Res,
                   *eigwork = NULL, *temp1_ = NULL, *temp1;
 
   int *idx1 = NULL, *idx2 = NULL, *convind = NULL, *keepind = NULL, *solvestep = NULL,
diff --git a/src/lib/xchange/xchange_field.c b/src/lib/xchange/xchange_field.c
index 9cdfdb7c9..ee463ec17 100644
--- a/src/lib/xchange/xchange_field.c
+++ b/src/lib/xchange/xchange_field.c
@@ -44,10 +44,6 @@
 #include "su3.h"
 #include "xchange_field.h"
 
-#if (defined TM_PARALLELXYZT)
-#pragma disjoint(*field_buffer_z2, *field_buffer_z)
-#endif
-
 /* this version uses non-blocking MPI calls */
 #if (defined TM_NON_BLOCKING)
 
@@ -67,10 +63,6 @@ void xchange_field(spinor* const l, const int ieo) {
   int reqcount = 16;
 #endif
 
-#ifdef TM_KOJAK_INST
-#pragma pomp inst begin(xchangefield)
-#endif
-
 #ifdef TM_USE_MPI
 
   /* In 4 dimensions there are two processors sharing the   */
@@ -259,9 +251,6 @@ void xchange_field(spinor* const l, const int ieo) {
 #endif
 
   return;
-#ifdef TM_KOJAK_INST
-#pragma pomp inst end(xchangefield)
-#endif
 }
 
 #elif (defined TM_USE_SHMEM) /* TM_NON_BLOCKING */
@@ -271,11 +260,7 @@ void xchange_field(spinor* const l, const int ieo) {
 void xchange_field(spinor* const l, const int ieo) {
 
 #ifdef TM_USE_MPI
-  int i, ix, mu, x0, x1, x2, x3, k;
-
-#ifdef TM_KOJAK_INST
-#pragma pomp inst begin(xchangefield)
-#endif
+  int k;
 
   shmem_barrier_all();
 
@@ -285,13 +270,13 @@ void xchange_field(spinor* const l, const int ieo) {
 
 #if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   k = (T + 2) * LX * LY * LZ / 2;
-  for (x0 = 0; x0 < T; x0++) {
+  for (int x0 = 0; x0 < T; x0++) {
     shmem_double_put((double*)(l + k), (double*)(l + g_lexic2eo[g_ipt[x0][0][0][0]]), 12 * LZ * LY,
                      g_nb_x_dn);
     k += LZ * LY;
   }
   k = ((T + 2) * LX * LY * LZ + T * LY * LZ) / 2;
-  for (x0 = 0; x0 < T; x0++) {
+  for (int x0 = 0; x0 < T; x0++) {
     shmem_double_put((double*)(l + k), (double*)(l + g_lexic2eo[g_ipt[x0][LX - 1][0][0]]),
                      12 * LZ * LY, g_nb_x_up);
     k += LZ * LY;
@@ -300,16 +285,16 @@ void xchange_field(spinor* const l, const int ieo) {
 
 #if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   k = ((T + 2) * LX * LY * LZ + 2 * T * LY * LZ) / 2;
-  for (x0 = 0; x0 < T; x0++) {
-    for (x1 = 0; x1 < LX; x1++) {
+  for (int x0 = 0; x0 < T; x0++) {
+    for (int x1 = 0; x1 < LX; x1++) {
       shmem_double_put((double*)(l + k), (double*)(l + g_lexic2eo[g_ipt[x0][x1][0][0]]), 12 * LZ,
                        g_nb_y_dn);
       k += LZ;
     }
   }
   k = ((T + 2) * LX * LY * LZ + 2 * T * LY * LZ + T * LX * LZ) / 2;
-  for (x0 = 0; x0 < T; x0++) {
-    for (x1 = 0; x1 < LX; x1++) {
+  for (int x0 = 0; x0 < T; x0++) {
+    for (int x1 = 0; x1 < LX; x1++) {
       shmem_double_put((double*)(l + k), (double*)(l + g_lexic2eo[g_ipt[x0][x1][LY - 1][0]]),
                        12 * LZ, g_nb_y_up);
       k += LZ;
@@ -318,7 +303,7 @@ void xchange_field(spinor* const l, const int ieo) {
 #endif
 
 #if (defined TM_PARALLELXYZT)
-  x0 = (VOLUME / 2 + LX * LY * LZ + T * LY * LZ + T * LX * LZ);
+  int x0 = (VOLUME / 2 + LX * LY * LZ + T * LY * LZ + T * LX * LZ);
   if (ieo == 1) {
     for (k = 0; k < T * LX * LY / 2; k++) {
       shmem_double_put((double*)(l + x0), (double*)(l + g_field_z_ipt_even[k]), 24, g_nb_z_dn);
@@ -347,9 +332,6 @@ void xchange_field(spinor* const l, const int ieo) {
   shmem_barrier_all();
 #endif  // MPI
   return;
-#ifdef TM_KOJAK_INST
-#pragma pomp inst end(xchangefield)
-#endif
 }
 
 /* Here comes the naive version */
@@ -358,13 +340,6 @@ void xchange_field(spinor* const l, const int ieo) {
 /* exchanges the field  l */
 void xchange_field(spinor* const l, const int ieo) {
 
-#ifdef TM_PARALLELXYZT
-  int x0 = 0, x1 = 0, x2 = 0, ix = 0;
-#endif
-#ifdef TM_KOJAK_INST
-#pragma pomp inst begin(xchangefield)
-#endif
-
 #ifdef TM_USE_MPI
 
   MPI_Status status;
@@ -414,11 +389,11 @@ void xchange_field(spinor* const l, const int ieo) {
   /* This is now depending on whether the field is */
   /* even or odd */
   if (ieo == 1) {
-    for (ix = 0; ix < T * LX * LY / 2; ix++) {
+    for (int ix = 0; ix < T * LX * LY / 2; ix++) {
       field_buffer_z[ix] = l[g_field_z_ipt_even[ix]];
     }
   } else {
-    for (ix = 0; ix < T * LX * LY / 2; ix++) {
+    for (int ix = 0; ix < T * LX * LY / 2; ix++) {
       field_buffer_z[ix] = l[g_field_z_ipt_odd[ix]];
     }
   }
@@ -429,11 +404,11 @@ void xchange_field(spinor* const l, const int ieo) {
                12 * T * LX * LY, MPI_DOUBLE, g_nb_z_up, 503, g_cart_grid, &status);
 
   if (ieo == 1) {
-    for (ix = T * LX * LY / 2; ix < T * LX * LY; ix++) {
+    for (int ix = T * LX * LY / 2; ix < T * LX * LY; ix++) {
       field_buffer_z[ix - T * LX * LY / 2] = l[g_field_z_ipt_even[ix]];
     }
   } else {
-    for (ix = T * LX * LY / 2; ix < T * LX * LY; ix++) {
+    for (int ix = T * LX * LY / 2; ix < T * LX * LY; ix++) {
       field_buffer_z[ix - T * LX * LY / 2] = l[g_field_z_ipt_odd[ix]];
     }
   }
@@ -448,9 +423,6 @@ void xchange_field(spinor* const l, const int ieo) {
 #endif
 #endif  // MPI
   return;
-#ifdef TM_KOJAK_INST
-#pragma pomp inst end(xchangefield)
-#endif
 }
 
 #endif /* TM_NON_BLOCKING */
diff --git a/src/lib/xchange/xchange_halffield.c b/src/lib/xchange/xchange_halffield.c
index 0dd1effca..d3d19794a 100644
--- a/src/lib/xchange/xchange_halffield.c
+++ b/src/lib/xchange/xchange_halffield.c
@@ -51,16 +51,15 @@ MPI_Request prequests[16];
 void init_xchange_halffield() {
 #ifdef TM_USE_MPI
 
-#ifdef TM_PARALLELT
-  int reqcount = 4;
-#elif defined TM_PARALLELXT
-  int reqcount = 8;
-#elif defined TM_PARALLELXYT
-  int reqcount = 12;
-#elif defined TM_PARALLELXYZT
-  int x0 = 0, x1 = 0, x2 = 0, ix = 0;
-  int reqcount = 16;
-#endif
+/* #ifdef TM_PARALLELT */
+/*   int reqcount = 4; */
+/* #elif defined TM_PARALLELXT */
+/*   int reqcount = 8; */
+/* #elif defined TM_PARALLELXYT */
+/*   int reqcount = 12; */
+/* #elif defined TM_PARALLELXYZT */
+/*   int reqcount = 16; */
+/* #endif */
 
   /* send the data to the neighbour on the right in t direction */
   /* recieve the data from the neighbour on the left in t direction */
@@ -148,7 +147,6 @@ void xchange_halffield() {
 #elif defined TM_PARALLELXYT
   int reqcount = 12;
 #elif defined TM_PARALLELXYZT
-  int x0 = 0, x1 = 0, x2 = 0, ix = 0;
   int reqcount = 16;
 #endif
   MPI_Startall(reqcount, prequests);
diff --git a/src/lib/xchange/xchange_lexicfield.c b/src/lib/xchange/xchange_lexicfield.c
index 282ca8dfa..857bb4b98 100644
--- a/src/lib/xchange/xchange_lexicfield.c
+++ b/src/lib/xchange/xchange_lexicfield.c
@@ -139,10 +139,6 @@ void xchange_lexicfield(spinor* const l) {
 /* exchanges the field  l */
 void xchange_lexicfield(spinor* const l) {
 
-#ifdef TM_PARALLELXYZT
-  int x0 = 0, x1 = 0, x2 = 0, ix = 0;
-#endif
-
 #ifdef TM_USE_MPI
   MPI_Status status;
   /* send the data to the neighbour on the left */
@@ -303,11 +299,6 @@ void xchange_lexicfield32(spinor32* const l) {
 #else /* TM_NON_BLOCKING */
 /* exchanges the field  l */
 void xchange_lexicfield32(spinor32* const l) {
-
-#ifdef TM_PARALLELXYZT
-  int x0 = 0, x1 = 0, x2 = 0, ix = 0;
-#endif
-
 #ifdef TM_USE_MPI
   MPI_Status status;
   /* send the data to the neighbour on the left */

From 959f3588fb2fc08c85886bfa1430fcd90f5155bd Mon Sep 17 00:00:00 2001
From: Mathieu Taillefumier <mathieu.taillefumier@free.fr>
Date: Mon, 16 Mar 2026 13:58:23 +0100
Subject: [PATCH 22/80] Fix TM_USE_GAUGE_COPY typo in the CI/CD CMake command line

---
 .ci/include/cscs/01-test-templates.yml                          | 2 +-
 .../tmlqcd/daint-gh200/repo/packages/lemonio/package.py         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.ci/include/cscs/01-test-templates.yml b/.ci/include/cscs/01-test-templates.yml
index 9b3a1c414..e993f8aaa 100644
--- a/.ci/include/cscs/01-test-templates.yml
+++ b/.ci/include/cscs/01-test-templates.yml
@@ -28,7 +28,7 @@ include:
               -DTM_USE_QUDA=ON \
               -DTM_USE_LEMON=ON \
               -DTM_ENABLE_ALIGNMENT=32 \
-              -DTM_USE_GAUGE_COPY-ON \
+              -DTM_USE_GAUGE_COPY=ON \
               -DTM_USE_HALFSPINOR=ON \
               -DCMAKE_INSTALL_PREFIX=../install_dir ..
         make
diff --git a/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/lemonio/package.py b/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/lemonio/package.py
index 4d7340a03..aa8ac2013 100755
--- a/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/lemonio/package.py
+++ b/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/lemonio/package.py
@@ -18,7 +18,7 @@ class Lemonio(AutotoolsPackage, CMakePackage):
     version('master', branch='master')
 
     depends_on("libtool", type="build", when="@master build_system=cmake")
-    depends_on("cmake", type="build", when="master build_system=cmake")
+    depends_on("cmake@4", type="build", when="master build_system=cmake")
 
     depends_on('mpi')
 

From e0b3e0292bf047dcda504b0445882d4889afda2b Mon Sep 17 00:00:00 2001
From: Taillefumier Mathieu <29380261+mtaillefumier@users.noreply.github.com>
Date: Thu, 19 Mar 2026 15:10:48 +0100
Subject: [PATCH 23/80] Add HIP language support and dependencies

Enable HIP language support and find hipblas and hipfft packages.
---
 CMakeLists.txt | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 75bac17b9..b20566b15 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -280,7 +280,9 @@ endif()
 # if AMD GPU access is not possible. So allow it
 
 if(TM_USE_HIP OR QUDA_TARGET_HIP)
-  enable_language(hip)
+  enable_language(HIP)
+  find_package(hipblas)
+  find_package(hipfft)
   if(TM_USE_CUDA_HIP)
     find_package(CUDA)
   endif()

From 5f5d51c0bf4f1d8091578604330328bc17257471 Mon Sep 17 00:00:00 2001
From: Mathieu Taillefumier <mathieu.taillefumier@free.fr>
Date: Thu, 19 Mar 2026 15:22:14 +0100
Subject: [PATCH 24/80] Fix include directories

---
 CMakeLists.txt                   | 3 ++-
 DDalphaAMG/CMakeLists.txt        | 2 +-
 cmake/DDalphaAMG-Config.cmake.in | 4 +---
 src/lib/CMakeLists.txt           | 2 +-
 4 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b20566b15..fc633d805 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -252,7 +252,8 @@ find_package(CLime REQUIRED)
 set(TM_USE_LIME ON)
 
 if(TM_USE_QUDA)
-  find_package(QUDA REQUIRED config)
+	enable_language(CUDA)
+	find_package(QUDA REQUIRED config)
 endif()
 
 if(TM_USE_SHMEM)
diff --git a/DDalphaAMG/CMakeLists.txt b/DDalphaAMG/CMakeLists.txt
index 22c0c35d1..2f428a177 100644
--- a/DDalphaAMG/CMakeLists.txt
+++ b/DDalphaAMG/CMakeLists.txt
@@ -167,7 +167,7 @@ target_link_libraries(
 
 target_include_directories(
   DDalphaAMG
-  PUBLIC $<INSTALL_INTERFACE:include>
+  PUBLIC $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME}>
   $<BUILD_INTERFACE:${DDalphaAMG_SRC_DIR}/src>
   $<BUILD_INTERFACE:${DDalphaAMG_SRC_DIR}/include>
   $<BUILD_INTERFACE:${CMAKE_BINARY_DIR}/DDalphaAMG>)
diff --git a/cmake/DDalphaAMG-Config.cmake.in b/cmake/DDalphaAMG-Config.cmake.in
index 539b644a0..abb7d45d3 100644
--- a/cmake/DDalphaAMG-Config.cmake.in
+++ b/cmake/DDalphaAMG-Config.cmake.in
@@ -34,9 +34,7 @@ if (NOT TARGET DDalphaAMG::DDalphaAMG)
     set(DDalphaAMG_ENABLE_COARSE_RES @DDalphaAMG_ENABLE_COARSE_RES@)
   endif()
   if (@DDalphaAMG_ENABLE_SINGLE_ALLREDUCE_ARNOLDI@)
-    set(  DDalphaAMG_ENABLE_SINGLE_ALLREDUCE_ARNOLDI
-      @DDalphaAMG_ENABLE_SINGLE_ALLREDUCE_ARNOLDI@
-)
+    set(DDalphaAMG_ENABLE_SINGLE_ALLREDUCE_ARNOLDI @DDalphaAMG_ENABLE_SINGLE_ALLREDUCE_ARNOLDI@)
   endif()
   if (@DDalphaAMG_ENABLE_OMP@)
     set(DDalphaAMG_ENABLE_OMP @DDalphaAMG_ENABLE_OMP@)
diff --git a/src/lib/CMakeLists.txt b/src/lib/CMakeLists.txt
index 700dabdf4..8ae48d31b 100644
--- a/src/lib/CMakeLists.txt
+++ b/src/lib/CMakeLists.txt
@@ -431,7 +431,7 @@ target_compile_definitions(
 
 target_include_directories(
   hmc
-  PUBLIC $<INSTALL_INTERFACE:include/tmlqcd>
+  PUBLIC $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME}>
          $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
          $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>)
 

From 010a063eafbb57fccc0e6b82fc58b843339a84fe Mon Sep 17 00:00:00 2001
From: Mathieu Taillefumier <mathieu.taillefumier@free.fr>
Date: Fri, 20 Mar 2026 12:33:11 +0100
Subject: [PATCH 25/80] Revert C99 complex changes in gmres_precon.c

---
 src/lib/solver/gmres_precon.c | 147 +++++++++++-----------------------
 1 file changed, 47 insertions(+), 100 deletions(-)

diff --git a/src/lib/solver/gmres_precon.c b/src/lib/solver/gmres_precon.c
index de84eff0d..9497b8b14 100644
--- a/src/lib/solver/gmres_precon.c
+++ b/src/lib/solver/gmres_precon.c
@@ -44,7 +44,6 @@
 #ifdef HAVE_CONFIG_H
 #include <tmlqcd_config.h>
 #endif
-#include <complex.h>
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -76,6 +75,7 @@ static double *s;
 
 int gmres_precon(spinor *const P, spinor *const Q, const int m, const int max_restarts,
                  const double eps_sq, const int rel_prec, const int N, matrix_mult f) {
+  int restart, i, j, k;
   double beta, eps, norm;
   complex tmp1, tmp2;
   spinor **solver_field = NULL;
@@ -93,22 +93,22 @@ int gmres_precon(spinor *const P, spinor *const Q, const int m, const int max_re
 
   /*   assign(solver_field[1], P, N); */
   zero_spinor_field(solver_field[1], N);
-  for (int restart = 0; restart < max_restarts; restart++) {
+  for (restart = 0; restart < max_restarts; restart++) {
     /* r_0=Q-AP  (b=Q, x+0=P) */
     f(solver_field[1], solver_field[1]);
     diff(solver_field[1], Q, solver_field[1], N);
     /* v_0=r_0/||r_0|| */
-    alpha[0] = sqrt(square_norm(solver_field[1], N, 1)) + 0.0I;
+    alpha[0].re = sqrt(square_norm(solver_field[1], N, 1));
 
     /*     if(alpha[0].re == 0.){  */
     /*        assign(P, solver_field[1], N);  */
     /*        return(restart*m);  */
     /*     }  */
 
-    if (creal(alpha[0]) != 0.) {
-      mul_r(V[0], 1. / creal(alpha[0]), solver_field[1], N);
+    if (alpha[0].re != 0.) {
+      mul_r(V[0], 1. / alpha[0].re, solver_field[1], N);
 
-      for (int j = 0; j < m; j++) {
+      for (j = 0; j < m; j++) {
         /* solver_field[1]=A*v_j */
 
         f(solver_field[1], V[j]);
@@ -116,13 +116,13 @@ int gmres_precon(spinor *const P, spinor *const Q, const int m, const int max_re
         /* Set h_ij and omega_j */
         /* solver_field[0] <- omega_j */
         assign(solver_field[0], solver_field[1], N);
-        for (int i = 0; i <= j; i++) {
+        for (i = 0; i <= j; i++) {
           H[i][j] = scalar_prod(V[i], solver_field[0], N, 1);
           assign_diff_mul(solver_field[0], V[i], H[i][j], N);
         }
 
         _complex_set(H[j + 1][j], sqrt(square_norm(solver_field[0], N, 1)), 0.);
-        for (int i = 0; i < j; i++) {
+        for (i = 0; i < j; i++) {
           tmp1 = H[i][j];
           tmp2 = H[i + 1][j];
           _mult_real(H[i][j], tmp2, s[i]);
@@ -133,7 +133,7 @@ int gmres_precon(spinor *const P, spinor *const Q, const int m, const int max_re
 
         /* Set beta, s, c, alpha[j],[j+1] */
         beta = sqrt(_complex_square_norm(H[j][j]) + _complex_square_norm(H[j + 1][j]));
-        s[j] = creal(H[j + 1][j]) / beta;
+        s[j] = H[j + 1][j].re / beta;
         _mult_real(c[j], H[j][j], 1. / beta);
         _complex_set(H[j][j], beta, 0.);
         _mult_real(alpha[j + 1], alpha[j], s[j]);
@@ -146,19 +146,19 @@ int gmres_precon(spinor *const P, spinor *const Q, const int m, const int max_re
                  alpha[j + 1].re * alpha[j + 1].re);
           fflush(stdout);
         }
-        if (((creal(alpha[j + 1]) <= eps) && (rel_prec == 0)) ||
-            (creal((alpha[j + 1]) <= eps * norm) && (rel_prec == 1))) {
-          _mult_real(alpha[j], alpha[j], 1. / creal(H[j][j]));
+        if (((alpha[j + 1].re <= eps) && (rel_prec == 0)) ||
+            ((alpha[j + 1].re <= eps * norm) && (rel_prec == 1))) {
+          _mult_real(alpha[j], alpha[j], 1. / H[j][j].re);
           assign_add_mul(solver_field[1], V[j], alpha[j], N);
-          for (int i = j - 1; i >= 0; i--) {
-            for (int k = i + 1; k <= j; k++) {
+          for (i = j - 1; i >= 0; i--) {
+            for (k = i + 1; k <= j; k++) {
               _mult_assign_complex(tmp1, H[i][k], alpha[k]);
               _diff_complex(alpha[i], tmp1);
             }
-            _mult_real(alpha[i], alpha[i], 1. / creal(H[i][i]));
+            _mult_real(alpha[i], alpha[i], 1. / H[i][i].re);
             assign_add_mul(solver_field[1], V[i], alpha[i], N);
           }
-          for (int i = 0; i < m; i++) {
+          for (i = 0; i < m; i++) {
             alpha[i].im = 0.;
           }
           assign(P, solver_field[1], N);
@@ -168,24 +168,24 @@ int gmres_precon(spinor *const P, spinor *const Q, const int m, const int max_re
         /* if not */
         else {
           if (j != m - 1) {
-            mul_r(V[(j + 1)], 1. / creal(H[j + 1][j]), solver_field[0], N);
+            mul_r(V[(j + 1)], 1. / H[j + 1][j].re, solver_field[0], N);
           }
         }
       }
 
       j = m - 1;
       /* prepare for restart */
-      _mult_real(alpha[j], alpha[j], 1. / creal(H[j][j]));
+      _mult_real(alpha[j], alpha[j], 1. / H[j][j].re);
       assign_add_mul(solver_field[1], V[j], alpha[j], N);
-      for (int i = j - 1; i >= 0; i--) {
-        for (int k = i + 1; k <= j; k++) {
+      for (i = j - 1; i >= 0; i--) {
+        for (k = i + 1; k <= j; k++) {
           _mult_assign_complex(tmp1, H[i][k], alpha[k]);
           _diff_complex(alpha[i], tmp1);
         }
-        _mult_real(alpha[i], alpha[i], 1. / creal(H[i][i]));
+        _mult_real(alpha[i], alpha[i], 1. / H[i][i].re);
         assign_add_mul(solver_field[1], V[i], alpha[i], N);
       }
-      for (int i = 0; i < m; i++) {
+      for (i = 0; i < m; i++) {
         alpha[i].im = 0.;
       }
     }
@@ -234,7 +234,6 @@ static void init_pgmres(const int _M, const int _V) {
 }
 
 complex scalar_prod_nocom(spinor *const S, spinor *const R, const int N) {
-#ifdef __STDC_NO_COMPLEX__
   int ix;
   static double ks, kc, ds, tr, ts, tt;
   spinor *s, *r;
@@ -305,91 +304,39 @@ complex scalar_prod_nocom(spinor *const S, spinor *const R, const int N) {
 
   c.im = kc;
   return (c);
-#else
-  double complex ks, kc;
-
-  ks = 0.0 + 0.0I;
-  kc = 0.0 + 0.0I;
-  for (int ix = 0; ix < N; ix++) {
-    spinor *s = (spinor *)S + ix;
-    spinor *r = (spinor *)R + ix;
-    double complex ds = (*r).s0.c0 * (*r).s0.c0 +
-      (*r).s0.c1 * (*r).s0.c1 +
-      (*r).s0.c2 * (*r).s0.c2 +
-      (*r).s1.c0 * (*r).s1.c0 +
-      (*r).s1.c1 * (*r).s1.c1 +
-      (*r).s1.c2 * (*r).s1.c2 +
-      (*r).s2.c0 * (*r).s2.c0 +
-      (*r).s2.c1 * (*r).s2.c1 +
-      (*r).s2.c2 * (*r).s2.c2 +
-      (*r).s3.c0 * (*r).s3.c0 +
-      (*r).s3.c1 * (*r).s3.c1 +
-      (*r).s3.c2 * (*r).s3.c2;
-
-    double complex tr = ds + kc;
-    double complex ts = tr + ks;
-    double complex tt = ts - ks;
-    ks = ts;
-    kc = tr - tt;
-  }
-  return  ks + kc;
-#endif
 }
 
-
 double square_norm_nocom(spinor *const P, const int N) {
-  double ks = 0.0;
-  double kc = 0.0;
+  int ix;
+  static double ks, kc, ds, tr, ts, tt;
+  spinor *s;
+
+  ks = 0.0;
+  kc = 0.0;
 
-#ifdef __STDC_NO_COMPLEX__
   /* Change due to even-odd preconditioning : VOLUME   to VOLUME/2 */
-  for (int ix = 0; ix < N; ix++) {
-    spinor *s = P + ix;
-
-    double ds = (*s).s0.c0.re * (*s).s0.c0.re + (*s).s0.c0.im * (*s).s0.c0.im +
-                (*s).s0.c1.re * (*s).s0.c1.re + (*s).s0.c1.im * (*s).s0.c1.im +
-                (*s).s0.c2.re * (*s).s0.c2.re + (*s).s0.c2.im * (*s).s0.c2.im +
-                (*s).s1.c0.re * (*s).s1.c0.re + (*s).s1.c0.im * (*s).s1.c0.im +
-                (*s).s1.c1.re * (*s).s1.c1.re + (*s).s1.c1.im * (*s).s1.c1.im +
-                (*s).s1.c2.re * (*s).s1.c2.re + (*s).s1.c2.im * (*s).s1.c2.im +
-                (*s).s2.c0.re * (*s).s2.c0.re + (*s).s2.c0.im * (*s).s2.c0.im +
-                (*s).s2.c1.re * (*s).s2.c1.re + (*s).s2.c1.im * (*s).s2.c1.im +
-                (*s).s2.c2.re * (*s).s2.c2.re + (*s).s2.c2.im * (*s).s2.c2.im +
-                (*s).s3.c0.re * (*s).s3.c0.re + (*s).s3.c0.im * (*s).s3.c0.im +
-                (*s).s3.c1.re * (*s).s3.c1.re + (*s).s3.c1.im * (*s).s3.c1.im +
-                (*s).s3.c2.re * (*s).s3.c2.re + (*s).s3.c2.im * (*s).s3.c2.im;
-
-    double tr = ds + kc;
-    double ts = tr + ks;
-    double tt = ts - ks;
+  for (ix = 0; ix < N; ix++) {
+    s = P + ix;
+
+    ds = (*s).s0.c0.re * (*s).s0.c0.re + (*s).s0.c0.im * (*s).s0.c0.im +
+         (*s).s0.c1.re * (*s).s0.c1.re + (*s).s0.c1.im * (*s).s0.c1.im +
+         (*s).s0.c2.re * (*s).s0.c2.re + (*s).s0.c2.im * (*s).s0.c2.im +
+         (*s).s1.c0.re * (*s).s1.c0.re + (*s).s1.c0.im * (*s).s1.c0.im +
+         (*s).s1.c1.re * (*s).s1.c1.re + (*s).s1.c1.im * (*s).s1.c1.im +
+         (*s).s1.c2.re * (*s).s1.c2.re + (*s).s1.c2.im * (*s).s1.c2.im +
+         (*s).s2.c0.re * (*s).s2.c0.re + (*s).s2.c0.im * (*s).s2.c0.im +
+         (*s).s2.c1.re * (*s).s2.c1.re + (*s).s2.c1.im * (*s).s2.c1.im +
+         (*s).s2.c2.re * (*s).s2.c2.re + (*s).s2.c2.im * (*s).s2.c2.im +
+         (*s).s3.c0.re * (*s).s3.c0.re + (*s).s3.c0.im * (*s).s3.c0.im +
+         (*s).s3.c1.re * (*s).s3.c1.re + (*s).s3.c1.im * (*s).s3.c1.im +
+         (*s).s3.c2.re * (*s).s3.c2.re + (*s).s3.c2.im * (*s).s3.c2.im;
+
+    tr = ds + kc;
+    ts = tr + ks;
+    tt = ts - ks;
     ks = ts;
     kc = tr - tt;
   }
   kc = ks + kc;
-#else
-  /* Change due to even-odd preconditioning : VOLUME   to VOLUME/2 */
-  for (int ix = 0; ix < N; ix++) {
-    spinor *s = P + ix;
-    double complex ds =  (*s).s0.c0 * conj((*s).s0.c0) +
-      (*s).s0.c1 * conj((*s).s0.c1) +
-      (*s).s0.c2 * conj((*s).s0.c2) +
-      (*s).s1.c0 * conj((*s).s1.c0) +
-      (*s).s1.c1 * conj((*s).s1.c1) +
-      (*s).s1.c2 * conj((*s).s1.c2) +
-      (*s).s2.c0 * conj((*s).s2.c0) +
-      (*s).s2.c1 * conj((*s).s2.c1) +
-      (*s).s2.c2 * conj((*s).s2.c2) +
-      (*s).s3.c0 * conj((*s).s3.c0) +
-      (*s).s3.c1 * conj((*s).s3.c1) +
-      (*s).s3.c2 * conj((*s).s3.c2);
-    
-    double tr = creal(ds) + kc;
-    double ts = tr + ks;
-    double tt = ts - ks;
-    ks = ts;
-    kc = tr - tt;
-  }
-#endif
-
   return kc;
 }

From 9bb11644604ced4542ddbd0fa5a45894c339270f Mon Sep 17 00:00:00 2001
From: Taillefumier Mathieu <29380261+mtaillefumier@users.noreply.github.com>
Date: Fri, 20 Mar 2026 12:51:02 +0100
Subject: [PATCH 26/80] Add cmake version 4 to environments.yaml

---
 .ci/uenv-recipes/tmlqcd/daint-gh200/environments.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.ci/uenv-recipes/tmlqcd/daint-gh200/environments.yaml b/.ci/uenv-recipes/tmlqcd/daint-gh200/environments.yaml
index 41be6341d..a4390d942 100644
--- a/.ci/uenv-recipes/tmlqcd/daint-gh200/environments.yaml
+++ b/.ci/uenv-recipes/tmlqcd/daint-gh200/environments.yaml
@@ -10,6 +10,7 @@ gcc-env:
   - lemonio
   - c-lime
   - openblas
+  - cmake@4
   - cuda
   variants:
   - +mpi

From 7350295d09e241ffeb3ecc4ebb9fcc283989c3a0 Mon Sep 17 00:00:00 2001
From: chaoos <chaoos@users.noreply.github.com>
Date: Fri, 20 Mar 2026 13:51:56 +0100
Subject: [PATCH 27/80] Update UENV_TAG to version v0.0.7

---
 .ci/include/cscs/00-variables.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.ci/include/cscs/00-variables.yml b/.ci/include/cscs/00-variables.yml
index 27bb44047..c4db27021 100644
--- a/.ci/include/cscs/00-variables.yml
+++ b/.ci/include/cscs/00-variables.yml
@@ -9,5 +9,5 @@
 variables:
   UENV_NAME: tmlqcd
   UENV_VERSION: experimental
-  UENV_TAG: v0.0.6
+  UENV_TAG: v0.0.7
 

From 51dc1e51ba6488089c8f1714a28e3b38083dbfb3 Mon Sep 17 00:00:00 2001
From: Mathieu Taillefumier <mathieu.taillefumier@free.fr>
Date: Fri, 20 Mar 2026 15:36:57 +0100
Subject: [PATCH 28/80] Downgrade CMake from 4 to 3 in the daint-gh200 uenv

---
 .ci/uenv-recipes/tmlqcd/daint-gh200/environments.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.ci/uenv-recipes/tmlqcd/daint-gh200/environments.yaml b/.ci/uenv-recipes/tmlqcd/daint-gh200/environments.yaml
index a4390d942..fbd514f02 100644
--- a/.ci/uenv-recipes/tmlqcd/daint-gh200/environments.yaml
+++ b/.ci/uenv-recipes/tmlqcd/daint-gh200/environments.yaml
@@ -10,7 +10,7 @@ gcc-env:
   - lemonio
   - c-lime
   - openblas
-  - cmake@4
+  - cmake@3
   - cuda
   variants:
   - +mpi

From 8041586a9a3b82f2c97c3cfb96a08c0b7d4b75cd Mon Sep 17 00:00:00 2001
From: Mathieu Taillefumier <mathieu.taillefumier@free.fr>
Date: Mon, 23 Mar 2026 08:05:46 +0100
Subject: [PATCH 29/80] Fix QUDA with HIP: drop unconditional enable_language(CUDA)

---
 CMakeLists.txt | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index fc633d805..59737df6a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -155,10 +155,10 @@ cmake_dependent_option(
   "TM_USE_DDalphaAMG" OFF)
 
 cmake_dependent_option(
-  DDalphaAMG_ENABLE_SCHWARZ_RES "Enable paramount output support" OFF 
+  DDalphaAMG_ENABLE_SCHWARZ_RES "Enable paramount output support" OFF
   "TM_USE_DDalphaAMG" OFF)
 
-cmake_dependent_option(DDalphaAMG_ENABLE_OMP  "Enable OpenMP support" ON
+cmake_dependent_option(DDalphaAMG_ENABLE_OMP "Enable OpenMP support" ON
                        "TM_USE_DDalphaAMG" OFF)
 
 cmake_dependent_option(
@@ -241,9 +241,10 @@ if(TM_USE_HDF5)
 endif()
 
 if(TM_USE_LEMON)
-  FetchContent_Declare(lemon
+  FetchContent_Declare(
+    lemon
     GIT_REPOSITORY https://github.com/etmc/lemon
-    GIT_TAG        187de3435d604251e078eb083016131f035d6a51
+    GIT_TAG 187de3435d604251e078eb083016131f035d6a51
     FIND_PACKAGE_ARGS NAMES lemon)
   FetchContent_MakeAvailable(lemon)
 endif()
@@ -252,8 +253,7 @@ find_package(CLime REQUIRED)
 set(TM_USE_LIME ON)
 
 if(TM_USE_QUDA)
-	enable_language(CUDA)
-	find_package(QUDA REQUIRED config)
+  find_package(QUDA REQUIRED config)
 endif()
 
 if(TM_USE_SHMEM)
@@ -386,14 +386,14 @@ write_basic_package_version_file(
   COMPATIBILITY SameMajorVersion)
 
 configure_file("${PROJECT_SOURCE_DIR}/cmake/tmlQCD-config.cmake.in"
-  "${PROJECT_BINARY_DIR}/tmlQCD-config.cmake" @ONLY)
+               "${PROJECT_BINARY_DIR}/tmlQCD-config.cmake" @ONLY)
 
 install(FILES "${PROJECT_BINARY_DIR}/tmlQCD-config.cmake"
-  "${PROJECT_BINARY_DIR}/tmlQCDConfigVersion.cmake"
-  DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/tmlQCD")
+              "${PROJECT_BINARY_DIR}/tmlQCDConfigVersion.cmake"
+        DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/tmlQCD")
 
 install(FILES "${PROJECT_BINARY_DIR}/libtmlQCD.pc"
-  DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
+        DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
 
 install(
   DIRECTORY "${PROJECT_SOURCE_DIR}/cmake"

From 0c4210aa22e59f77c3fa53718fe5bb32bd26280f Mon Sep 17 00:00:00 2001
From: Mathieu Taillefumier <mathieu.taillefumier@free.fr>
Date: Mon, 23 Mar 2026 08:19:30 +0100
Subject: [PATCH 30/80] Update the README.md

---
 README    | 364 ------------------------------------------------------
 README.md |  18 ++-
 2 files changed, 15 insertions(+), 367 deletions(-)
 delete mode 100644 README

diff --git a/README b/README
deleted file mode 100644
index f7bf3a896..000000000
--- a/README
+++ /dev/null
@@ -1,364 +0,0 @@
-Here are some remarks collected in order to configure, compile and
-install the tmLQCD programme suit. For more information, also about running
-the code please read the documentation in the doc sub-directory. 
-
-CONFIGURE and COMPILE
-
-It is recommended to build the code not in the source directory but in
-a separate directory.
-
-The lime library (tested with version 1.2.3) is needed to compile the
-program. Please download it at
-
-http://usqcd.jlab.org/usqcd-software/c-lime/
-
-Configure and compile lime (for documentation see
-http://usqcd.jlab.org/usqcd-docs/c-lime/) first.
-Then you should use the configure option --with-lime=dir for the
-tmLQCD to set the correct directory where to find lime (see below). 
-
-For more documentation please change into the doc directory and type
-latex main.tex
-and see the sections for configuring, installing and testing the code.
-
-Here we have gathered some examples for some standard architectures.
-Building the tmLQCD executables is a three step procedure:
-
-****************************************************************************
-
-1) configure:
-
-In your build directory type
-
-path-to-the-sources/configure --help
-
-to get an overview of the available options and switches. In
-particular check out the prefix option for your installation path. 
-What follows now are some examples for a few standard architectures.
-
-- a scalar build on a P4 machine would look like:
-
-path-to-the-sources/configure --disable-mpi --enable-sse2 --enable-p4 \
-  --enable-gaugecopy --disable-newdiracop --with-limedir=<path-to-lime> \
-  --with-lapack="<linker options needed for lapack>" \
-  CC=<cc>
-
-- Opteron with SSE2:
-
-path-to-the-sources/configure --disable-mpi --enable-sse2 --enable-opteron \
-  --enable-gaugecopy --disable-newdiracop --with-limedir=<path-to-lime> \
-  --with-lapack="<linker options needed for lapack>" \
-  CC=<cc>
-
-- A MPI parallel (4dims) build on a P4 cluster:
-
-path-to-the-sources/configure --enable-mpi --enable-sse2 --enable-p4 \
-  --with-mpidimension=4 --enable-gaugecopy --disable-newdiracop \
-  --with-limedir=<path-to-lime> --with-lapack="<linker options needed for lapack>" \
-  CC=<mpicc>
-
-- on the Munich Altix machine:
-
-path-to-the-sources/configure --enable-mpi --with-mpidimension=4 \
-  --with-limedir=<path-to-lime> --enable-newdiracop \
-  --disable-shmem --with-lapack="<linker options needed for lapack>" \
-  CC=mpicc CFLAGS="-mcpu=itanium2 -O3 -g -c99 -mtune=itanium2" 
-
-for lapack on this machine please type
-module load mkl
-
-
-- on the HLRB ice installation use
-
-path-to-the-sources/configure --enable-mpi --with-mpidimension=4 \
-   --disable-sse2 --disable-p4  --with-limedir=<path-to-lime> \
-   --enable-newdiracop --with-lapack="<linker options needed for lapack>" \
-   CC="mpicc -std=c99" CFLAGS="-g" \
-
-where it is again important to use the Intel C compiler! 
-
-for lapack first load the module mkl and then use
-
---with-lapack="-L$LIBRARY_PATH -llapack -lblas"
-
-You may enable or disable other configure options as needed. See the
-documentation for more details.
-
-****************************************************************************
-
-2) make
-
-type `make` in your build directory.
-
-If there appears no error message during compilation you should end up
-with a few executable in the build directory, namely `hmc_tm`,
-`invert` and `invert_doublet`.
-
-****************************************************************************
-
-3) make install
-
-type `make install`
-
-to get the executables installed.
-
-
-
-****************************************************************************
-****************************************************************************
-
-in the following we provide a "codemap", giving a short explanation
-for the contents of each c-file:
-
-****************************************************************************
-top directory: apart from the main routines all routines are compiled into
-	       the run-time library libhmc.
-
-DML_crc32.c: invert, invert_doublet, hmc_tm
-	     some helper functions to compute the SCIDAC 
-	     checksum
-D_psi.c:     invert, invert_doublet, hmc_tm
-	     Wilson twisted mass Dirac operator, not even/odd 
-	     preconditioned 
-Hopping_Matrix.c: invert, invert_doublet, hmc_tm
-	     Hopping matrix for the even/odd preconditioned 
-	     Dirac operator
-Hopping_Matrix_nocom.c: benchmark
-	     Hopping matrix for the even/odd preconditioned 
-	     Dirac operator, communication switched off
-Nondegenerate_Matrix.c: invert_doublet, hmc_tm
-	     operators needed for even/odd preconditioning 
-	     the non-degenerate flavour doublet Dirac operator
-Ptilde_nd.c: hmc_tm
-	     the more precise polynomial $\tilde P$ needed for 
-	     the PHMC for the non-degenerate flavour doublet
-benchmark.c: main routine
-	     benchmark code for D_psi and Hopping_Matrix
-block.c:     experimental
-boundary.c:  invert, invert_doublet, hmc_tm
-	     implements the twisted boundary conditions for the
-	     spinor fields
-chebyshev_polynomial.c: experimental
-chebyshev_polynomial_nd.c: hmc_tm
-	     implements the generation of coefficients for the 
-	     chebyshev polynomial using the clenshaw recursion 
-	     relation
-deriv_Sb.c:  hmc_tm
-	     the variation of Q=gamma_5 D with respect to the 
-	     gauge fields in the even/odd case 
-deriv_Sb_D_psi.c: hmc_tm
-	     the variation of Q=gamma_5 D with respect to the 
-	     gauge fields in the non even/odd case 
-det_monomial.c: hmc_tm
-	     implements the functions needed for a det monomial
-detratio_monomial.c: hmc_tm
-	     implements the functions needed for a detratio monomial
-poly_monomial.c: hmc_tm
-             implements function needed for a POLY monomial 
-             (PHMC for light degenerate quarks)
-dml.c:       invert, invert_doublet, hmc_tm
-	     some helper functions to compute the SCIDAC 
-	     checksum
-double2single.c: main routine
-	     can convert a gauge field from double to single precision
-single2double.c: main routine
-	     can convert a gauge field from single to double precision
-eigenvalues_bi.c: hmc_tm
-	     computes eigenvalues of the mass non-degenerate two flavour 
-	     Dirac operatoe
-expo.c:      hmc_tm
-	     implements the exponetial function of an su(3) element
-gamma.c:     invert, invert_doublet, hmc_tm
-	     implements multiplication of gamma matrices and some useful
-	     combination of those with a spinor field
-gauge_io.c:  invert, invert_doublet, hmc_tm
-	     IO routines for gauge fields 
-gauge_monomial.c: hmc_tm
-	     implements the functions needed for a gauge monomial
-gen_sources.c: invert, invert_doublet, hmc_tm
-	     implements the generation of source spinor fields
-geometry_eo.c: invert, invert_doublet, hmc_tm
-	     anything related to gauge and spinor field geometry
-get_rectangle_staples.c: hmc_tm
-             computes rectangular staples of gauge links as needed for
-	     e.g. the Iwasaki gauge action and its derivative
-get_staples.c: hmc_tm
-             computes plaquette staples of gauge links as needed for
-	     for all gauge actions and their derivatives
-getopt.c:    invert, invert_doublet, hmc_tm
-	     needed for command line options
-hmc_tm.c:    main routine
-	     hmc_tm executable
-hybrid_update.c: hmc_tm
-	     implements the functions for the gauge field update and
-	     the momenta update
-init_bispinor_field.c 
-init_chi_copy.c
-init_chi_spinor_field.c
-init_dirac_halfspinor.c
-init_gauge_field.c
-init_gauge_tmp.c
-init_geometry_indices.c
-init_moment_field.c
-init_spinor_field.c
-init_stout_smear_vars.c: invert, invert_doublet, hmc_tm
-	     provide routines to allocate memory for the corresponding
-	     objects
-integrator.c: hmc_tm
-	     implements the routines needed for the integrator in the
-	     MD udpate
-invert.c:    main routine
-	     invert executable
-invert_doublet.c: main routine
-	     invert_doublet executable
-invert_doublet_eo.c: invert_doublet
-	     performs an inversion of the flavour doublet operator using
-	     even/odd preconditioning and the CG solver
-invert_eo.c: invert
-	     performs an inversion of the Wilson twisted mass Dirac operator
-	     using a solver as specified in the input file. Depending on the 
-	     input file even/odd preconditioning is used or not
-io.c:        invert, invert_doublet, hmc_tm
-	     helper routines: some deprecated IO routines for gauge and spinor 
-	     spinor fields, and the routine writing the initial stdout message
-	     of the executables
-io_utils.c:  invert, invert_doublet, hmc_tm
-	     IO helper routines related to swap endian and checksums
-linsolve.c:  hmc_tm
-	     CG and bicgstab solvers as used only in the HMC
-little_D.c:  experimental
-measure_rectangles.c: hmc_tm
-	     computes the gauge action related to the rectangular part
-monomial.c:  hmc_tm
-             provides the definition for monomials and initialisation functions
-mpi_init.c:  invert, invert_doublet, hmc_tm, benchmark
-	     MPI initialisation routine
-ndpoly_monomial.c: hmc_tm
-	     implements the functions needed for a ndpoly monomial
-observables.c: hmc_tm, invert, invert_doublet
-	     computes the gauge action related to the Wilson plaquette part
-online_measurement.c: hmc_tm
-	     anything related to online measurements
-phmc.c       hmc_tm
-	     functions and variables as needed for the PHC
-polyakov_loop.c: hmc_tm
-	     measures the polyakov loop
-propagator_io.c: invert, invert_doublet, hmc_tm
-	     functions related to spinor field IO
-ranlxd.c:    invert, invert_doublet, hmc_tm
-	     RANLUX random number generator (64 Bit)
-ranlxs.c:    invert, invert_doublet, hmc_tm
-	     RANLUX random number generator (32 Bit)
-read_input.l: invert, invert_doublet, hmc_tm
-             definition of the input file parser (flex)
-reweighting_factor.c: experimental
-reweighting_factor_nd.c: experimental
-sighandler.c: invert, invert_doublet, hmc_tm
-	     handles signal related to illegal instructions
-start.c:     invert, invert_doublet, hmc_tm
-	     functions needed to give initial values to gauge and spinor fields
-stout_smear.c: invert, invert_doublet
-	     functions to stout smear a given gauge configuration
-stout_smear_force.c: experimental
-tm_operators.c: invert, invert_doublet, hmc_tm
-	     operators needed for even/odd preconditioning the Wilson
-	     twisted mass Dirac operator
-update_backward_gauge.c: invert, invert_doublet, hmc_tm
-	     functions to update the gauge copy
-update_momenta.c: hmc_tm
-	     function to update the momenta in the HMC MD part
-update_tm.c: hmc_tm
-	     the HMC MD part
-xchange_2fields.c: invert, invert_doublet, hmc_tm
-	     implements the MPI communication of two even/odd spinor fields
-	     at once
-xchange_deri.c: hmc_tm
-	     implements the MPI communication of derivatives
-xchange_field.c: invert, invert_doublet, hmc_tm
-	     implements the MPI communication of a single even/odd spinor
-	     field
-xchange_gauge.c: invert, invert_doublet, hmc_tm
-	     implements the MPI communication of the gauge field
-xchange_halffield.c: invert, invert_doublet, hmc_tm
-	     implements the MPI communication of a half spinor field
-xchange_lexicfield.c: invert, invert_doublet, hmc_tm
-	     implements the MPI communication of a single (full) spinor
-	     field
-
-****************************************************************************
-the linalg directory: all routines here are compiled into the liblinalg
-                      runtime library
-                      capital letters are spinor fields, others scalars
-add.c:                Q = R + S
-assign.c:             R = S
-assign_add_mul.c:     P = P + c Q with c complex
-assign_add_mul_r.c:   P = P + c Q with c real
-assign_add_mul_add_mul.c:   R = R + c1*S + c2*U with c1 and c2 complex variables
-assign_add_mul_add_mul_r.c: R = R + c1*S + c2*U with c1 and c2 real variables
-assign_diff_mul.c:    S=S-c*Q
-assign_mul_add_mul_add_mul_add_mul_r.c: R = c1*R + c2*S + c3*U + c4*V
-			 		with c1, c2, c3, c4 real variables
-assign_mul_add_mul_add_mul_r.c:         R = c1*R + c2*S + c3*U 
-					with c1, c2 and c3 real variables
-assign_mul_add_mul_r.c:     R = c1*R + c2*S , c1 and c2 are real constants 
-assign_mul_add_r.c:         R = c*R + S  c is a real constant
-assign_mul_bra_add_mul_ket_add.c:       R = c2*(R + c1*S) + (*U)
-					with c1 and c2 complex variables
-assign_mul_bra_add_mul_ket_add_r.c:     R = c2*(R + c1*S) + (*U)
-					with c1 and c2 complex variables
-assign_mul_bra_add_mul_r.c:             R = c1*(R + c2*S)
-					with c1 and c2 complex variables
-comp_decomp.c:                          Splits the Bi-spinor R in the spinors S and T 
-convert_eo_to_lexic.c:                  convert to even odd spinors to one full spinor
-diff.c:                 Q = R - S
-diff_and_square_norm.c: Q = R - S and ||Q||^2
-mattimesvec.c:          w = M*v for complex vectors w,v and and complex square matrix M
-mul.c:                  R = c*S, for complex c
-mul_r.c:                R = c*S, for real c
-mul_add_mul.c:          R = c1*S + c2*U , c1 and c2 are complex constants
-mul_add_mul_r.c         R = c1*S + c2*U , c1 and c2 are real constants
-mul_diff_mul.c:         R = c1*S - c2*U , c1 and c2 are complex constants
-mul_diff_mul_r.c        R = c1*S - c2*U , c1 and c2 are real constants
-mul_diff_r.c            R = c1*S - U , c1 is a real constant 
-scalar_prod.c:          c = (R, S)
-scalar_prod_i.c:        c = Im(R, S)
-scalar_prod_r.c:        c = Re(R, S)
-square_and_prod_r.c:    Returns Re(R,S) and the square norm of S
-square_norm.c:          c = ||Q||^2
-
-****************************************************************************
-solver directory: all routines here are compiled into the libsolver
-                  runtime library
-		  the solvers are for spinor fields, if not indicated
-		  otherwise.
-
-Msap.c:                 experimental SAP preconditioner
-bicgstab_complex.c:     BiCGstab for complex fields
-bicgstabell.c:          experimental
-cg_her.c :              CG solver for hermitian operators
-cg_her_nd.c:            CG solver for hermitian heavy doublet operators
-cgs_real.c:             CGS solver
-chrono_guess.c:         routines for the chronological solver
-dfl_projector.c:        experimental
-diagonalise_general_matrix.c:  subroutine to diagonalise a complex n times n
-                               matrix. Input is a complex matrix in _C_ like
-                               order. Output is again _C_ like. Uses lapack
-eigenvalues.c           compute the nr_of_eigenvalues lowest eigenvalues
-                        of (gamma5*D)^2
-fgmres.c:               FGMRES (flexible GMRES) solver
-gcr.c:                  GCR solver
-gcr4complex.c:          GCR solver for complex fields
-generate_dfl_subspace.c: experimental
-gmres.c:                GMRES solver
-gmres_dr.c:             GMRES-DR solver
-gmres_precon.c:         GMRES usable for preconditioning other solvers (experimental)
-gram-schmidt.c:         Gram-Schmidt orthonormalisation routines
-jdher.c:                Jacobi Davidson for hermitian matrices (to compute EVs)
-lu_solve.c:             compute the inverse of a matrix with LU decomposition
-mr.c:                   MR solver
-pcg_her.c:              PCG solver
-poly_precon.c:          polynomial preconditioner using Chebysheff polynomials
-			with complex argument
-quicksort.c:            a quicksort routine
-sub_low_ev.c:           routines to subtract exactly computed eigenvectors from
-			a given spinor field
diff --git a/README.md b/README.md
index 84f1e2172..f01ab8284 100644
--- a/README.md
+++ b/README.md
@@ -101,7 +101,19 @@ cmake -DCMAKE_INSTALL_PREFIX=/my_path \
 Note that the command assumes that QUDA is compiled with `CUDA` support. AMD GPU
 are also supported after replacing `-DTM_USE_CUDA=ON` with
 `-DTM_USE_HIP=ON` and compiling `QUDA` with `HIP` support. The ROCM architecture is defined by the variable
-`CMAKE_HIP_ARCHITECTURES=gfxxxx`.
+`CMAKE_HIP_ARCHITECTURES=gfxxxx`. An extra parameter `-DCMAKE_CXX_COMPILER=clang++` is needed because `QUDA` uses the `ROCM clang++`
+compiler internally and the build will fail if `gcc` or any other compiler is used at
+link time. This option only affects the linking behavior, not the compilation. The cmake command line for HIP/ROCM support is then
+```bash
+cmake -DCMAKE_INSTALL_PREFIX=/my_path \
+    -DCMAKE_PREFIX_PATH="/my_c_line_path;/my_lemon_path;/my_quda_path" \
+    -DTM_USE_MPI=ON \
+    -DTM_USE_LEMON=ON \
+    -DTM_USE_QUDA=ON \
+    -DTM_USE_HIP=ON \
+    -DCMAKE_HIP_ARCHITECTURES=gfx90a \
+    -DCMAKE_CXX_COMPILER=/opt/rocm/bin/clang++ ..
+```
 
 `QPhiX` and/or `DDalphaAMG` support can be added with
 
@@ -120,8 +132,8 @@ cmake -DCMAKE_INSTALL_PREFIX=/my_path \
       -DTM_USE_OMP=ON ..
 '''
 
-`QPhiX` cmake config support is incomplete and requires both the QPhiX
-and QMP installation directories to work properly.
+`QPhiX` cmake config support is incomplete and requires both the `QPhiX`
+and `QMP` installation directories to work properly.
 
 `CMake` has several relevant specific options that control the build. Compiler
 options are defined by the variable `CMAKE_C_FLAGS` and `CMAKE_CXX_FLAGS`. CUDA and HIP compilations options are controlled by their

From cfe38bd16d659df21bc2e01e409e915826db74d2 Mon Sep 17 00:00:00 2001
From: Mathieu Taillefumier <mathieu.taillefumier@free.fr>
Date: Mon, 23 Mar 2026 08:41:04 +0100
Subject: [PATCH 31/80] update lemon hash tag to the latest version

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 59737df6a..9eb04bbf0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -244,7 +244,7 @@ if(TM_USE_LEMON)
   FetchContent_Declare(
     lemon
     GIT_REPOSITORY https://github.com/etmc/lemon
-    GIT_TAG 187de3435d604251e078eb083016131f035d6a51
+    GIT_TAG 42c2d99dae9c04ca1f09c532a8f9bcb1bb667528
     FIND_PACKAGE_ARGS NAMES lemon)
   FetchContent_MakeAvailable(lemon)
 endif()

From 5c00e8aa99bf9c9c38fae932902d70f2418fbfc8 Mon Sep 17 00:00:00 2001
From: Mathieu Taillefumier <mathieu.taillefumier@free.fr>
Date: Tue, 24 Mar 2026 09:52:18 +0100
Subject: [PATCH 32/80] Fix typos

---
 .../daint-gh200/repo/packages/tmlqcd/package.py | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/tmlqcd/package.py b/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/tmlqcd/package.py
index 13fb3238e..74ea1d069 100644
--- a/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/tmlqcd/package.py
+++ b/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/tmlqcd/package.py
@@ -2,14 +2,16 @@
 #
 # SPDX-License-Identifier: (Apache-2.0 OR MIT)
 
-from spack_repo.builtin.build_systems.cmake import CmakePackage
+from spack_repo.builtin.build_systems import cmake
+from spack_repo.builtin.build_systems.cmake import CMakePackage, generator
 from spack_repo.builtin.build_systems.rocm import ROCmPackage
 from spack_repo.builtin.build_systems.cuda import CudaPackage
 
 from spack.package import *
 
-class Tmlqcd(CmakePackage, CudaPackage, ROCmPackage):
-"""Base class for building tmlQCD."""
+
+class Tmlqcd(CMakePackage, CudaPackage, ROCmPackage):
+    """Base class for building tmlQCD."""
 
     homepage = "https://www.itkp.uni-bonn.de/~urbach/software.html"
     url = "https://github.com/etmc/tmLQCD/archive/refs/tags/rel-5-1-6.tar.gz"
@@ -47,8 +49,8 @@ class Tmlqcd(CmakePackage, CudaPackage, ROCmPackage):
     variant("half_spinor", default=True, description="Use a Dirac operator with half-spinor")
     variant("shared", default=False, description="Enable shared library")
     variant("shmem", default=False, description="Use shmem API")
-    variant("quda", default=True, description="Enable the QUDA library", when="+cuda",)
-    variant("quda", default=True, description="Enable the QUDA library", when="+rocm",)
+    variant("quda", default=True, description="Enable the QUDA library", when="+cuda")
+    variant("quda", default=True, description="Enable the QUDA library", when="+rocm")
     variant(
         "QPhiX", default=False, description="Enable the QPhiX library for Intel Xeon and Xeon Phis"
     )
@@ -77,7 +79,7 @@ class Tmlqcd(CmakePackage, CudaPackage, ROCmPackage):
     depends_on("lapack")
     depends_on("pkgconfig", type="build")
 
-     # dependencies
+    # dependencies
     depends_on("mpi", when="+mpi")
     depends_on("lemon-io", when="+lemon")
 
@@ -93,6 +95,7 @@ class Tmlqcd(CmakePackage, CudaPackage, ROCmPackage):
 
     depends_on("fftw-api@3", when="+fftw")
 
+
 class CMakeBuilder(cmake.CMakeBuilder):
     def cmake_args(self):
         spec = self.spec
@@ -101,7 +104,7 @@ def cmake_args(self):
             self.define_from_variant("TM_USE_LEMON", "lemon"),
             self.define_from_variant("TM_USE_MPI", "mpi"),
             self.define_from_variant("TM_USE_QUDA", "quda"),
-            self.define_from_variant("TM_USE_CUDA","cuda"),
+            self.define_from_variant("TM_USE_CUDA", "cuda"),
             self.define_from_variant("TM_USE_HIP", "cuda"),
             self.define_from_variant("TM_USE_FFTW", "fftw"),
             self.define_from_variant("TM_FIXEDVOLUME", "fixed_volume"),

From a3f85c7bbefd0a3664536f67b1f2d729b5f3202b Mon Sep 17 00:00:00 2001
From: Mathieu Taillefumier <mathieu.taillefumier@free.fr>
Date: Tue, 24 Mar 2026 10:01:32 +0100
Subject: [PATCH 33/80] Remove tmLQCD package.py

---
 .../repo/packages/tmlqcd/package.py           | 116 ------------------
 1 file changed, 116 deletions(-)
 delete mode 100644 .ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/tmlqcd/package.py

diff --git a/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/tmlqcd/package.py b/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/tmlqcd/package.py
deleted file mode 100644
index 74ea1d069..000000000
--- a/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/tmlqcd/package.py
+++ /dev/null
@@ -1,116 +0,0 @@
-# Copyright Spack Project Developers. See COPYRIGHT file for details.
-#
-# SPDX-License-Identifier: (Apache-2.0 OR MIT)
-
-from spack_repo.builtin.build_systems import cmake
-from spack_repo.builtin.build_systems.cmake import CMakePackage, generator
-from spack_repo.builtin.build_systems.rocm import ROCmPackage
-from spack_repo.builtin.build_systems.cuda import CudaPackage
-
-from spack.package import *
-
-
-class Tmlqcd(CMakePackage, CudaPackage, ROCmPackage):
-    """Base class for building tmlQCD."""
-
-    homepage = "https://www.itkp.uni-bonn.de/~urbach/software.html"
-    url = "https://github.com/etmc/tmLQCD/archive/refs/tags/rel-5-1-6.tar.gz"
-    git = "https://github.com/etmc/tmLQCD.git"
-    license("GPL-3.0-or-later")
-
-    maintainers("mtaillefumier")
-    version("master", branch="master")
-
-    variant("lemon", default=False, description="Enable the lemon backend")
-    variant("mpi", default=True, description="Enable mpi support")
-    variant("DDalphaAMG", default=False, description="Enable DAlphaAMG support")
-    variant("openmp", default=True, description="Enable OpenMP")
-    variant("fftw", default=True, description="Enable FFTW interface")
-    variant(
-        "persistent_mpi",
-        default=True,
-        description="Enable persistent mpi calls for spinor and gauge fields",
-        when="+mpi",
-    )
-    variant(
-        "nonblocking_mpi",
-        default=True,
-        description="Enable non-blocking mpi calls for spinor and gauge fields",
-        when="+mpi",
-    )
-    variant("fixedvolume", default=True, description="Enable fixed volume at compile time")
-    variant(
-        "alignment",
-        default="auto",
-        values=("none", "auto", "16", "32", "64"),
-        description="Automatically or expliclty align arrays",
-    )
-    variant("gauge_copy", default=True, description="Enable gauge field copy")
-    variant("half_spinor", default=True, description="Use a Dirac operator with half-spinor")
-    variant("shared", default=False, description="Enable shared library")
-    variant("shmem", default=False, description="Use shmem API")
-    variant("quda", default=True, description="Enable the QUDA library", when="+cuda")
-    variant("quda", default=True, description="Enable the QUDA library", when="+rocm")
-    variant(
-        "QPhiX", default=False, description="Enable the QPhiX library for Intel Xeon and Xeon Phis"
-    )
-    variant(
-        "mpi_dimensions",
-        default="4",
-        values=("1", "2", "3", "4", "x", "xy", "xyz"),
-        description="number of dimensions the mpi processes are distributed. the default is parallelization over all four dimensions txyz",
-        when="+mpi",
-    )
-
-    generator("ninja")
-
-    # language dependencies
-    depends_on("c", type="build")
-    depends_on("cxx", type="build")
-    depends_on("fortran", type="build")
-
-    # conflicts
-    conflicts("+cuda", when="cuda_arch=none")
-    conflicts("+rocm", when="amdgpu_target=none")
-
-    # hard dependencies
-    depends_on("c-lime")
-    depends_on("blas")
-    depends_on("lapack")
-    depends_on("pkgconfig", type="build")
-
-    # dependencies
-    depends_on("mpi", when="+mpi")
-    depends_on("lemon-io", when="+lemon")
-
-    with when("+quda"):
-        depends_on(
-            "quda+twisted_mass+twisted_clover+clover+ndeg_twisted_clover+ndeg_twisted_mass+wilson+qdp+staggered+usqcd+multigrid"
-        )
-
-        depends_on("quda+mpi", when="+mpi")
-        depends_on("quda+cuda", when="+cuda")
-        depends_on("quda+rocm", when="+rocm")
-        depends_on("quda+nvshmem", when="+shmem")
-
-    depends_on("fftw-api@3", when="+fftw")
-
-
-class CMakeBuilder(cmake.CMakeBuilder):
-    def cmake_args(self):
-        spec = self.spec
-        args = [
-            self.define_from_variant("DBUILD_SHARED_LIBS", "shared"),
-            self.define_from_variant("TM_USE_LEMON", "lemon"),
-            self.define_from_variant("TM_USE_MPI", "mpi"),
-            self.define_from_variant("TM_USE_QUDA", "quda"),
-            self.define_from_variant("TM_USE_CUDA", "cuda"),
-            self.define_from_variant("TM_USE_HIP", "cuda"),
-            self.define_from_variant("TM_USE_FFTW", "fftw"),
-            self.define_from_variant("TM_FIXEDVOLUME", "fixed_volume"),
-            self.define_from_variant("TM_USE_OMP", "openmp"),
-            self.define_from_variant("TM_USE_SHMEM", "shmem"),
-            self.define_from_variant("TM_USE_GAUGE_COPY", "gauge_copy"),
-            self.define_from_variant("TM_USE_HALFSPINOR", "half_spinor"),
-        ]
-        return args

From 4420f6a104f9abc75827fa68b9f925040e5184db Mon Sep 17 00:00:00 2001
From: Mathieu Taillefumier <mathieu.taillefumier@free.fr>
Date: Tue, 24 Mar 2026 10:08:52 +0100
Subject: [PATCH 34/80] Remove autotools support from package.py

---
 .../daint-gh200/repo/packages/lemonio/package.py      | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/lemonio/package.py b/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/lemonio/package.py
index aa8ac2013..82c48545b 100755
--- a/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/lemonio/package.py
+++ b/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/lemonio/package.py
@@ -8,22 +8,24 @@
 
 from spack.package import *
 
-class Lemonio(AutotoolsPackage, CMakePackage):
+
+class Lemonio(CMakePackage):
     """LEMON: Lightweight Parallel I/O library for Lattice QCD."""
 
     homepage = "https://github.com/etmc/lemon"
-    git      = "https://github.com/etmc/lemon.git"
+    git = "https://github.com/etmc/lemon.git"
     license("GPL-3.0-or-later")
 
-    version('master', branch='master')
+    version("master", branch="master")
 
     depends_on("libtool", type="build", when="@master build_system=cmake")
     depends_on("cmake@4", type="build", when="master build_system=cmake")
 
-    depends_on('mpi')
+    depends_on("mpi")
 
     generator("ninja")
 
+
 class CMakeBuilder(cmake.CMakeBuilder):
     def cmake_args(self):
         spec = self.spec
@@ -31,4 +33,3 @@ def cmake_args(self):
             self.define_from_variant("DBUILD_SHARED_LIBS", "shared"),
         ]
         return args
-

From 25a19cc9aed0974982b7cb4d3932de4ba00455e3 Mon Sep 17 00:00:00 2001
From: Bartosz Kostrzewa <bartosz_kostrzewa@fastmail.com>
Date: Tue, 24 Mar 2026 11:02:10 +0100
Subject: [PATCH 35/80] use stringent solver precisions for ddalphaamg CI
 workflow

---
 ...mple-hmc-ddalphaamg-tmcloverdetratio.input | 10 ++---
 .../sample-hmc_nf211_tmclover_quda.input      |  1 -
 .../onlinemeas.000006                         |  4 +-
 .../onlinemeas.000008                         | 14 +++----
 .../onlinemeas.000010                         | 18 ++++-----
 .../onlinemeas.000012                         | 18 ++++-----
 .../onlinemeas.000014                         | 30 +++++++-------
 .../onlinemeas.000016                         | 30 +++++++-------
 .../onlinemeas.000018                         | 30 +++++++-------
 .../output.data                               | 40 +++++++++----------
 .../return_check.data                         | 10 ++---
 doc/sample-output/hmc_nf211_tmclover_quda     |  1 -
 12 files changed, 102 insertions(+), 104 deletions(-)
 delete mode 120000 doc/sample-input/sample-hmc_nf211_tmclover_quda.input
 delete mode 120000 doc/sample-output/hmc_nf211_tmclover_quda

diff --git a/doc/sample-input/sample-hmc-ddalphaamg-tmcloverdetratio.input b/doc/sample-input/sample-hmc-ddalphaamg-tmcloverdetratio.input
index 083932f19..4f1ac0af4 100644
--- a/doc/sample-input/sample-hmc-ddalphaamg-tmcloverdetratio.input
+++ b/doc/sample-input/sample-hmc-ddalphaamg-tmcloverdetratio.input
@@ -52,8 +52,8 @@ BeginMonomial CLOVERDET
   # nominator shift
   rho = 0.1
   kappa = 0.138
-  AcceptancePrecision =  1.e-20
-  ForcePrecision = 1.e-14
+  AcceptancePrecision =  1.e-24
+  ForcePrecision = 1.e-22
   Name = cloverdet
   solver = ddalphaamg
 EndMonomial
@@ -67,8 +67,8 @@ BeginMonomial CLOVERDETRATIO
   rho2 = 0.1
   CSW = 1.00
   kappa = 0.138
-  AcceptancePrecision =  1.e-20
-  ForcePrecision = 1.e-16
+  AcceptancePrecision =  1.e-24
+  ForcePrecision = 1.e-22
   Name = cloverdetratio
   solver = ddalphaamg
 EndMonomial
@@ -91,7 +91,7 @@ BeginOperator CLOVER
   2KappaMu = 0.01
   CSW = 1.00
   kappa = 0.138
-  SolverPrecision = 1e-16
+  SolverPrecision = 1e-24
   MaxSolverIterations = 1000
   useevenodd = no
   solver = ddalphaamg
diff --git a/doc/sample-input/sample-hmc_nf211_tmclover_quda.input b/doc/sample-input/sample-hmc_nf211_tmclover_quda.input
deleted file mode 120000
index 9587e4499..000000000
--- a/doc/sample-input/sample-hmc_nf211_tmclover_quda.input
+++ /dev/null
@@ -1 +0,0 @@
-../.ci/sample-input/quda.in
\ No newline at end of file
diff --git a/doc/sample-output/hmc-ddalphaamg-tmcloverdetratio/onlinemeas.000006 b/doc/sample-output/hmc-ddalphaamg-tmcloverdetratio/onlinemeas.000006
index 4734cef39..e39a749b5 100644
--- a/doc/sample-output/hmc-ddalphaamg-tmcloverdetratio/onlinemeas.000006
+++ b/doc/sample-output/hmc-ddalphaamg-tmcloverdetratio/onlinemeas.000006
@@ -3,12 +3,12 @@
 1  1  2  4.420688e-01  4.450500e-01
 1  1  3  7.635246e-02  7.292410e-02
 1  1  4  2.547268e-02  0.000000e+00
-2  1  0  -6.505429e-01  0.000000e+00
+2  1  0  -6.505430e-01  0.000000e+00
 2  1  1  1.641175e+00  -1.692241e+00
 2  1  2  2.578280e-01  -2.469258e-01
 2  1  3  3.891519e-02  -3.979994e-02
 2  1  4  -1.066065e-03  0.000000e+00
-6  1  0  -8.320553e-02  0.000000e+00
+6  1  0  -8.320554e-02  0.000000e+00
 6  1  1  1.107951e-01  -1.188322e-01
 6  1  2  1.658398e-02  -2.028692e-02
 6  1  3  3.560260e-03  -3.057380e-03
diff --git a/doc/sample-output/hmc-ddalphaamg-tmcloverdetratio/onlinemeas.000008 b/doc/sample-output/hmc-ddalphaamg-tmcloverdetratio/onlinemeas.000008
index 5ab1cdbb4..b7d02ad69 100644
--- a/doc/sample-output/hmc-ddalphaamg-tmcloverdetratio/onlinemeas.000008
+++ b/doc/sample-output/hmc-ddalphaamg-tmcloverdetratio/onlinemeas.000008
@@ -2,14 +2,14 @@
 1  1  1  3.340338e+00  2.839302e+00
 1  1  2  6.626364e-01  4.646722e-01
 1  1  3  1.371573e-01  8.789097e-02
-1  1  4  4.330242e-02  0.000000e+00
+1  1  4  4.330241e-02  0.000000e+00
 2  1  0  1.081758e+00  0.000000e+00
 2  1  1  1.893688e+00  -1.631633e+00
 2  1  2  3.752751e-01  -2.454020e-01
-2  1  3  7.408655e-02  -4.233174e-02
-2  1  4  7.188709e-03  0.000000e+00
-6  1  0  9.491140e-01  0.000000e+00
+2  1  3  7.408654e-02  -4.233174e-02
+2  1  4  7.188707e-03  0.000000e+00
+6  1  0  9.491139e-01  0.000000e+00
 6  1  1  1.554260e-01  -1.443985e-01
-6  1  2  3.971019e-02  -2.313646e-02
-6  1  3  7.194545e-03  -3.899612e-03
-6  1  4  9.475720e-04  0.000000e+00
+6  1  2  3.971018e-02  -2.313646e-02
+6  1  3  7.194543e-03  -3.899613e-03
+6  1  4  9.475708e-04  0.000000e+00
diff --git a/doc/sample-output/hmc-ddalphaamg-tmcloverdetratio/onlinemeas.000010 b/doc/sample-output/hmc-ddalphaamg-tmcloverdetratio/onlinemeas.000010
index 228672c7b..7a5a54ece 100644
--- a/doc/sample-output/hmc-ddalphaamg-tmcloverdetratio/onlinemeas.000010
+++ b/doc/sample-output/hmc-ddalphaamg-tmcloverdetratio/onlinemeas.000010
@@ -1,15 +1,15 @@
 1  1  0  3.387886e+01  0.000000e+00
 1  1  1  2.788626e+00  2.999193e+00
-1  1  2  4.787638e-01  5.208716e-01
+1  1  2  4.787637e-01  5.208716e-01
 1  1  3  1.020219e-01  1.109699e-01
 1  1  4  4.286604e-02  0.000000e+00
-2  1  0  -9.232348e-01  0.000000e+00
+2  1  0  -9.232351e-01  0.000000e+00
 2  1  1  1.498371e+00  -1.651027e+00
 2  1  2  2.458386e-01  -2.892127e-01
-2  1  3  5.026310e-02  -5.299146e-02
-2  1  4  9.781858e-05  0.000000e+00
-6  1  0  7.982990e-01  0.000000e+00
-6  1  1  1.062982e-01  -1.420299e-01
-6  1  2  2.513887e-02  -2.502436e-02
-6  1  3  6.373549e-03  -6.618316e-03
-6  1  4  -3.151778e-04  0.000000e+00
+2  1  3  5.026309e-02  -5.299147e-02
+2  1  4  9.781639e-05  0.000000e+00
+6  1  0  7.982995e-01  0.000000e+00
+6  1  1  1.062981e-01  -1.420299e-01
+6  1  2  2.513885e-02  -2.502437e-02
+6  1  3  6.373541e-03  -6.618322e-03
+6  1  4  -3.151829e-04  0.000000e+00
diff --git a/doc/sample-output/hmc-ddalphaamg-tmcloverdetratio/onlinemeas.000012 b/doc/sample-output/hmc-ddalphaamg-tmcloverdetratio/onlinemeas.000012
index f2523546f..10e85c917 100644
--- a/doc/sample-output/hmc-ddalphaamg-tmcloverdetratio/onlinemeas.000012
+++ b/doc/sample-output/hmc-ddalphaamg-tmcloverdetratio/onlinemeas.000012
@@ -1,15 +1,15 @@
 1  1  0  3.382234e+01  0.000000e+00
 1  1  1  2.995261e+00  2.875812e+00
-1  1  2  5.063795e-01  5.100208e-01
-1  1  3  1.006529e-01  1.174076e-01
-1  1  4  4.782682e-02  0.000000e+00
-2  1  0  -2.748838e-01  0.000000e+00
+1  1  2  5.063796e-01  5.100208e-01
+1  1  3  1.006530e-01  1.174076e-01
+1  1  4  4.782684e-02  0.000000e+00
+2  1  0  -2.748842e-01  0.000000e+00
 2  1  1  1.546980e+00  -1.573491e+00
-2  1  2  2.491686e-01  -2.772142e-01
-2  1  3  4.337945e-02  -6.241266e-02
-2  1  4  -6.321273e-03  0.000000e+00
+2  1  2  2.491687e-01  -2.772142e-01
+2  1  3  4.337949e-02  -6.241267e-02
+2  1  4  -6.321260e-03  0.000000e+00
 6  1  0  5.387689e-01  0.000000e+00
 6  1  1  1.311453e-01  -1.202091e-01
 6  1  2  2.469061e-02  -3.317881e-02
-6  1  3  1.976094e-03  -8.455000e-03
-6  1  4  -1.927750e-03  0.000000e+00
+6  1  3  1.976102e-03  -8.454999e-03
+6  1  4  -1.927752e-03  0.000000e+00
diff --git a/doc/sample-output/hmc-ddalphaamg-tmcloverdetratio/onlinemeas.000014 b/doc/sample-output/hmc-ddalphaamg-tmcloverdetratio/onlinemeas.000014
index b19dceb50..cd7513818 100644
--- a/doc/sample-output/hmc-ddalphaamg-tmcloverdetratio/onlinemeas.000014
+++ b/doc/sample-output/hmc-ddalphaamg-tmcloverdetratio/onlinemeas.000014
@@ -1,15 +1,15 @@
-1  1  0  3.384332e+01  0.000000e+00
-1  1  1  3.058971e+00  2.867482e+00
-1  1  2  5.927879e-01  5.535211e-01
-1  1  3  1.463143e-01  1.372395e-01
-1  1  4  6.632476e-02  0.000000e+00
-2  1  0  6.828284e-01  0.000000e+00
-2  1  1  1.675028e+00  -1.532287e+00
-2  1  2  3.123937e-01  -3.063247e-01
-2  1  3  6.949756e-02  -6.162975e-02
-2  1  4  7.048824e-03  0.000000e+00
-6  1  0  -2.533447e-01  0.000000e+00
-6  1  1  1.515393e-01  -1.275537e-01
-6  1  2  2.346966e-02  -4.019053e-02
-6  1  3  1.433944e-03  -1.463813e-02
-6  1  4  -7.198556e-03  0.000000e+00
+1  1  0  3.384331e+01  0.000000e+00
+1  1  1  3.058970e+00  2.867480e+00
+1  1  2  5.927875e-01  5.535203e-01
+1  1  3  1.463141e-01  1.372392e-01
+1  1  4  6.632463e-02  0.000000e+00
+2  1  0  6.828323e-01  0.000000e+00
+2  1  1  1.675028e+00  -1.532286e+00
+2  1  2  3.123934e-01  -3.063241e-01
+2  1  3  6.949739e-02  -6.162966e-02
+2  1  4  7.048780e-03  0.000000e+00
+6  1  0  -2.533367e-01  0.000000e+00
+6  1  1  1.515395e-01  -1.275537e-01
+6  1  2  2.346976e-02  -4.019037e-02
+6  1  3  1.433995e-03  -1.463802e-02
+6  1  4  -7.198496e-03  0.000000e+00
diff --git a/doc/sample-output/hmc-ddalphaamg-tmcloverdetratio/onlinemeas.000016 b/doc/sample-output/hmc-ddalphaamg-tmcloverdetratio/onlinemeas.000016
index 3c9999abe..c51999157 100644
--- a/doc/sample-output/hmc-ddalphaamg-tmcloverdetratio/onlinemeas.000016
+++ b/doc/sample-output/hmc-ddalphaamg-tmcloverdetratio/onlinemeas.000016
@@ -1,15 +1,15 @@
-1  1  0  3.350499e+01  0.000000e+00
-1  1  1  2.932334e+00  2.982039e+00
-1  1  2  5.434805e-01  5.351988e-01
-1  1  3  1.330346e-01  1.288597e-01
-1  1  4  6.230389e-02  0.000000e+00
-2  1  0  -5.539881e-01  0.000000e+00
-2  1  1  1.574741e+00  -1.569952e+00
-2  1  2  2.688100e-01  -2.687716e-01
-2  1  3  5.300017e-02  -5.329829e-02
-2  1  4  -3.085424e-03  0.000000e+00
-6  1  0  -7.089285e-01  0.000000e+00
-6  1  1  1.479719e-01  -1.378143e-01
-6  1  2  3.012575e-02  -2.458313e-02
-6  1  3  1.285082e-02  -2.411722e-03
-6  1  4  4.275350e-03  0.000000e+00
+1  1  0  3.350500e+01  0.000000e+00
+1  1  1  2.932339e+00  2.982047e+00
+1  1  2  5.434813e-01  5.352003e-01
+1  1  3  1.330350e-01  1.288600e-01
+1  1  4  6.230422e-02  0.000000e+00
+2  1  0  -5.539986e-01  0.000000e+00
+2  1  1  1.574744e+00  -1.569958e+00
+2  1  2  2.688097e-01  -2.687722e-01
+2  1  3  5.299989e-02  -5.329831e-02
+2  1  4  -3.085467e-03  0.000000e+00
+6  1  0  -7.089245e-01  0.000000e+00
+6  1  1  1.479736e-01  -1.378157e-01
+6  1  2  3.012610e-02  -2.458331e-02
+6  1  3  1.285099e-02  -2.411671e-03
+6  1  4  4.275483e-03  0.000000e+00
diff --git a/doc/sample-output/hmc-ddalphaamg-tmcloverdetratio/onlinemeas.000018 b/doc/sample-output/hmc-ddalphaamg-tmcloverdetratio/onlinemeas.000018
index 82a17be37..296f3a58a 100644
--- a/doc/sample-output/hmc-ddalphaamg-tmcloverdetratio/onlinemeas.000018
+++ b/doc/sample-output/hmc-ddalphaamg-tmcloverdetratio/onlinemeas.000018
@@ -1,15 +1,15 @@
-1  1  0  3.327833e+01  0.000000e+00
-1  1  1  2.914884e+00  2.887613e+00
-1  1  2  5.422550e-01  5.327642e-01
-1  1  3  1.342023e-01  1.336877e-01
-1  1  4  6.497096e-02  0.000000e+00
-2  1  0  2.431595e-01  0.000000e+00
-2  1  1  1.491172e+00  -1.524832e+00
-2  1  2  2.682825e-01  -2.606945e-01
-2  1  3  5.792593e-02  -5.123251e-02
-2  1  4  4.446912e-03  0.000000e+00
-6  1  0  -6.232563e-01  0.000000e+00
-6  1  1  1.495484e-01  -1.463705e-01
-6  1  2  3.212356e-02  -2.883108e-02
-6  1  3  1.268960e-02  -2.096584e-03
-6  1  4  5.938212e-03  0.000000e+00
+1  1  0  3.327825e+01  0.000000e+00
+1  1  1  2.914904e+00  2.887607e+00
+1  1  2  5.422698e-01  5.327645e-01
+1  1  3  1.342080e-01  1.336886e-01
+1  1  4  6.497243e-02  0.000000e+00
+2  1  0  2.431385e-01  0.000000e+00
+2  1  1  1.491198e+00  -1.524830e+00
+2  1  2  2.682909e-01  -2.606948e-01
+2  1  3  5.792842e-02  -5.123145e-02
+2  1  4  4.448118e-03  0.000000e+00
+6  1  0  -6.233064e-01  0.000000e+00
+6  1  1  1.495564e-01  -1.463691e-01
+6  1  2  3.212387e-02  -2.883197e-02
+6  1  3  1.268921e-02  -2.097509e-03
+6  1  4  5.937257e-03  0.000000e+00
diff --git a/doc/sample-output/hmc-ddalphaamg-tmcloverdetratio/output.data b/doc/sample-output/hmc-ddalphaamg-tmcloverdetratio/output.data
index 420c5bd08..82ded2a49 100644
--- a/doc/sample-output/hmc-ddalphaamg-tmcloverdetratio/output.data
+++ b/doc/sample-output/hmc-ddalphaamg-tmcloverdetratio/output.data
@@ -1,20 +1,20 @@
-00000000 0.291802998166 0.150813159478 8.600084e-01 11 216 17 116 1 1.695511e+01
-00000001 0.367103026382 0.123429505793 8.838839e-01 6 100 10 61 1 8.344895e+00
-00000002 0.411116783387 0.045573902534 9.554490e-01 6 121 11 65 1 9.167332e+00
-00000003 0.437627142550 0.260037508415 7.710227e-01 6 125 12 65 1 9.248657e+00
-00000004 0.461210881231 -0.087496153400 1.091438e+00 12 250 18 130 1 1.827640e+01
-00000005 0.477370074683 0.331285967965 7.179998e-01 7 125 12 69 1 9.230436e+00
-00000006 0.490500781082 0.211017029075 8.097603e-01 7 137 13 78 1 9.551435e+00
-00000007 0.499497899365 -0.271498625284 1.311929e+00 7 145 13 78 1 9.687941e+00
-00000008 0.507695934800 -0.453122057246 1.573216e+00 14 300 21 156 1 1.978807e+01
-00000009 0.516327346026 -0.140664545065 1.151038e+00 7 150 14 78 1 9.840636e+00
-00000010 0.521231381097 0.051098995079 9.501846e-01 8 150 14 78 1 9.848677e+00
-00000011 0.529386494343 -0.099410380527 1.104519e+00 8 150 15 78 1 9.740137e+00
-00000012 0.529386494343 0.376606119881 6.861863e-01 16 300 22 176 0 2.002176e+01
-00000013 0.531891500121 -0.145672763511 1.156818e+00 8 150 15 85 1 1.024416e+01
-00000014 0.536421758394 -0.180197251221 1.197454e+00 8 150 15 91 1 1.036149e+01
-00000015 0.540314511899 -0.014164886897 1.014266e+00 8 151 16 91 1 1.031493e+01
-00000016 0.545384396635 0.456731404050 6.333504e-01 16 346 24 182 1 2.143623e+01
-00000017 0.549339344577 -0.381480879502 1.464452e+00 8 175 16 91 1 1.008381e+01
-00000018 0.548855930680 0.308666362994 7.344258e-01 8 175 16 91 1 1.027304e+01
-00000019 0.551922552298 -0.257831496596 1.294121e+00 9 175 16 91 1 1.029375e+01
+00000000 0.291802998166 0.150813205437 8.600083e-01 13 310 20 148 1 1.796150e+01
+00000001 0.367103026382 0.123429431545 8.838840e-01 6 150 12 77 1 9.083342e+00
+00000002 0.411116783383 0.045573922136 9.554490e-01 7 162 12 78 1 9.169988e+00
+00000003 0.437627142549 0.260037513702 7.710227e-01 7 175 14 78 1 9.476499e+00
+00000004 0.461210881240 -0.087496158702 1.091438e+00 14 350 21 176 1 2.006502e+01
+00000005 0.477370074736 0.331285968350 7.179998e-01 8 175 14 91 1 9.868406e+00
+00000006 0.490500781150 0.211016940603 8.097603e-01 8 197 15 91 1 1.009657e+01
+00000007 0.499497899357 -0.271498597692 1.311929e+00 8 200 16 91 1 1.018464e+01
+00000008 0.507695934126 -0.453122351633 1.573217e+00 16 400 24 200 1 2.089432e+01
+00000009 0.516327346842 -0.140664533460 1.151038e+00 9 200 16 104 1 1.039694e+01
+00000010 0.521231381453 0.051098763556 9.501848e-01 9 219 17 104 1 1.061809e+01
+00000011 0.529386496220 -0.099409891257 1.104519e+00 9 225 17 104 1 1.063195e+01
+00000012 0.529386496220 0.376609569126 6.861839e-01 18 450 26 208 0 2.172998e+01
+00000013 0.531891491082 -0.145673731306 1.156819e+00 9 225 18 104 1 1.213700e+01
+00000014 0.536421760877 -0.180205391949 1.197463e+00 9 225 18 104 1 1.241186e+01
+00000015 0.540314479338 -0.014163550569 1.014264e+00 9 225 18 113 1 1.196440e+01
+00000016 0.545384434240 0.456736123271 6.333474e-01 19 450 27 234 1 2.358034e+01
+00000017 0.549339288645 -0.381464882069 1.464428e+00 10 228 19 117 1 1.220460e+01
+00000018 0.548856488255 0.308746327179 7.343670e-01 10 247 19 117 1 1.213587e+01
+00000019 0.551923185293 -0.257738529083 1.294000e+00 10 244 19 117 1 1.161498e+01
diff --git a/doc/sample-output/hmc-ddalphaamg-tmcloverdetratio/return_check.data b/doc/sample-output/hmc-ddalphaamg-tmcloverdetratio/return_check.data
index 9baf9a7c7..8bb8c4b85 100644
--- a/doc/sample-output/hmc-ddalphaamg-tmcloverdetratio/return_check.data
+++ b/doc/sample-output/hmc-ddalphaamg-tmcloverdetratio/return_check.data
@@ -1,5 +1,5 @@
-00000000 ddh = -2.6085e-08 ddh/dh = -1.7296e-07 ddh/H = -1.9734e-13 ddU= 2.6765e-11
-00000004 ddh = 7.9704e-09 ddh/dh = -9.1094e-08 ddh/H = 4.5374e-14 ddU= 1.0448e-11
-00000008 ddh = 6.6077e-08 ddh/dh = -1.4583e-07 ddh/H = 3.5941e-13 ddU= 3.3593e-11
-00000012 ddh = 2.1891e-08 ddh/dh = 5.8126e-08 ddh/H = 1.1640e-13 ddU= 4.4380e-11
-00000016 ddh = 1.7062e-07 ddh/dh = 3.7356e-07 ddh/H = 9.0165e-13 ddU= 1.1339e-10
+00000000 ddh = 2.5750e-11 ddh/dh = 1.7074e-10 ddh/H = 1.9480e-16 ddU= 3.9489e-15
+00000004 ddh = 1.4438e-11 ddh/dh = -1.6502e-10 ddh/H = 8.2195e-17 ddU= 4.0239e-15
+00000008 ddh = -1.1642e-10 ddh/dh = 2.5692e-10 ddh/H = -6.3322e-16 ddU= 6.5245e-15
+00000012 ddh = 1.2739e-10 ddh/dh = 3.3824e-10 ddh/H = 6.7734e-16 ddU= 5.1587e-15
+00000016 ddh = 3.2742e-11 ddh/dh = 7.1686e-11 ddh/H = 1.7303e-16 ddU= 9.5886e-15
diff --git a/doc/sample-output/hmc_nf211_tmclover_quda b/doc/sample-output/hmc_nf211_tmclover_quda
deleted file mode 120000
index c4ac0d4ff..000000000
--- a/doc/sample-output/hmc_nf211_tmclover_quda
+++ /dev/null
@@ -1 +0,0 @@
-../.ci/sample-output/cscs-test
\ No newline at end of file

From 7fc11ef0c46946cd9169ba4957b1a16ea07cc7bb Mon Sep 17 00:00:00 2001
From: Bartosz Kostrzewa <bartosz_kostrzewa@fastmail.com>
Date: Tue, 24 Mar 2026 11:15:11 +0100
Subject: [PATCH 36/80] make the DDalphaAMG workflow more stringent

---
 .github/workflows/ddalphaamg-build.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ddalphaamg-build.yaml b/.github/workflows/ddalphaamg-build.yaml
index 0d475e219..a2fef7241 100644
--- a/.github/workflows/ddalphaamg-build.yaml
+++ b/.github/workflows/ddalphaamg-build.yaml
@@ -139,10 +139,10 @@ jobs:
         working-directory: ${{github.workspace}}/main/build
         run: |
           refpath=${{github.workspace}}/main/doc/sample-output/hmc-ddalphaamg-tmcloverdetratio
-          numdiff -r 1e-4 -X 1:10 -X 1:5-8 -X 2:10 -X 2:5-8 output.data ${refpath}/output.data
+          numdiff -r 1e-6 -X 1:10 -X 1:5-8 -X 2:10 -X 2:5-8 output.data ${refpath}/output.data
           for i in $(seq 0 2 18); do \
             f=onlinemeas.$(printf %06d $i); \
-            numdiff -r 1e-5 ${f} ${refpath}/${f}; \
+            numdiff -r 1e-6 ${f} ${refpath}/${f}; \
           done
 
 

From 24364342736ea25161be105ad49dcc9324107de0 Mon Sep 17 00:00:00 2001
From: Mathieu Taillefumier <mathieu.taillefumier@free.fr>
Date: Tue, 24 Mar 2026 09:52:18 +0100
Subject: [PATCH 37/80] Update the CI/CD

---
 .ci/include/cscs/00-variables.yml             |   2 +-
 .ci/include/cscs/01-test-templates.yml        |   2 +-
 .../tmlqcd/daint-gh200/compilers.yaml         |   2 +-
 .../tmlqcd/daint-gh200/config.yaml            |   4 +-
 .../tmlqcd/daint-gh200/environments.yaml      |   3 +-
 .../repo/packages/lemonio/package.py          |   4 +-
 .../repo/packages/tmlqcd/package.py           | 116 ++++++++++++++++++
 .../tmlqcd/daint-gh200/repo/repo.yaml         |   1 +
 CMakeLists.txt                                |   7 +-
 DDalphaAMG/CMakeLists.txt                     |  10 +-
 cmake/tmlQCD.pc.in                            |  10 ++
 src/bin/CMakeLists.txt                        |   2 +
 12 files changed, 148 insertions(+), 15 deletions(-)
 mode change 100755 => 100644 .ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/lemonio/package.py
 create mode 100644 .ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/tmlqcd/package.py
 create mode 100644 cmake/tmlQCD.pc.in

diff --git a/.ci/include/cscs/00-variables.yml b/.ci/include/cscs/00-variables.yml
index c4db27021..382960fac 100644
--- a/.ci/include/cscs/00-variables.yml
+++ b/.ci/include/cscs/00-variables.yml
@@ -10,4 +10,3 @@ variables:
   UENV_NAME: tmlqcd
-  UENV_VERSION: experimental
+  UENV_VERSION: v1
   UENV_TAG: v0.0.7
-
diff --git a/.ci/include/cscs/01-test-templates.yml b/.ci/include/cscs/01-test-templates.yml
index e993f8aaa..16b187ab5 100644
--- a/.ci/include/cscs/01-test-templates.yml
+++ b/.ci/include/cscs/01-test-templates.yml
@@ -27,12 +27,12 @@ include:
               -DTM_USE_OMP=ON \
               -DTM_USE_QUDA=ON \
               -DTM_USE_LEMON=ON \
-              -DTM_ENABLE_ALIGNMENT=32 \
               -DTM_USE_GAUGE_COPY=ON \
               -DTM_USE_HALFSPINOR=ON \
               -DCMAKE_INSTALL_PREFIX=../install_dir ..
         make
         make install
+        cd ..
         touch preparation-done-${CI_JOB_ID}
       fi
     - while test ! -f preparation-done-${CI_JOB_ID}; do sleep 5; done
diff --git a/.ci/uenv-recipes/tmlqcd/daint-gh200/compilers.yaml b/.ci/uenv-recipes/tmlqcd/daint-gh200/compilers.yaml
index 840d9974d..8bd185e43 100644
--- a/.ci/uenv-recipes/tmlqcd/daint-gh200/compilers.yaml
+++ b/.ci/uenv-recipes/tmlqcd/daint-gh200/compilers.yaml
@@ -1,2 +1,2 @@
 gcc:
-  version: "14.2"
+  version: "14.3"
diff --git a/.ci/uenv-recipes/tmlqcd/daint-gh200/config.yaml b/.ci/uenv-recipes/tmlqcd/daint-gh200/config.yaml
index 3ec694351..b15e4e7ad 100644
--- a/.ci/uenv-recipes/tmlqcd/daint-gh200/config.yaml
+++ b/.ci/uenv-recipes/tmlqcd/daint-gh200/config.yaml
@@ -2,10 +2,10 @@ name: tmlqcd
 store: /user-environment
 spack:
   repo: https://github.com/spack/spack.git
-  commit: releases/v1.0
+  commit: releases/v1.1
   packages:
     repo: https://github.com/spack/spack-packages.git
-    commit: releases/v2025.11
+    #commit: 
 modules: true
 description: "tmLQCD is a freely available software suite providing a set of tools to be used in lattice QCD simulations."
 version: 2
diff --git a/.ci/uenv-recipes/tmlqcd/daint-gh200/environments.yaml b/.ci/uenv-recipes/tmlqcd/daint-gh200/environments.yaml
index fbd514f02..fd4c1568f 100644
--- a/.ci/uenv-recipes/tmlqcd/daint-gh200/environments.yaml
+++ b/.ci/uenv-recipes/tmlqcd/daint-gh200/environments.yaml
@@ -2,6 +2,7 @@ gcc-env:
   compiler: [gcc]
   network:
       mpi: cray-mpich@8.1.32 +cuda
+      specs: ['libfabric@2.4.0+cuda']
   unify: true
   specs:
   - python@3.12
@@ -10,7 +11,7 @@ gcc-env:
   - lemonio
   - c-lime
   - openblas
-  - cmake@3
+  - cmake@3.31
   - cuda
   variants:
   - +mpi
diff --git a/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/lemonio/package.py b/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/lemonio/package.py
old mode 100755
new mode 100644
index 82c48545b..ff3367b26
--- a/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/lemonio/package.py
+++ b/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/lemonio/package.py
@@ -17,9 +17,9 @@ class Lemonio(CMakePackage):
     license("GPL-3.0-or-later")
 
     version("master", branch="master")
-
+    variant("shared", default=False, description="Build shared library")
     depends_on("libtool", type="build", when="@master build_system=cmake")
-    depends_on("cmake@4", type="build", when="master build_system=cmake")
+    depends_on("cmake@3.28:", type="build", when="@master build_system=cmake")
 
     depends_on("mpi")
 
diff --git a/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/tmlqcd/package.py b/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/tmlqcd/package.py
new file mode 100644
index 000000000..74ea1d069
--- /dev/null
+++ b/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/tmlqcd/package.py
@@ -0,0 +1,116 @@
+# Copyright Spack Project Developers. See COPYRIGHT file for details.
+#
+# SPDX-License-Identifier: (Apache-2.0 OR MIT)
+
+from spack_repo.builtin.build_systems import cmake
+from spack_repo.builtin.build_systems.cmake import CMakePackage, generator
+from spack_repo.builtin.build_systems.rocm import ROCmPackage
+from spack_repo.builtin.build_systems.cuda import CudaPackage
+
+from spack.package import *
+
+
+class Tmlqcd(CMakePackage, CudaPackage, ROCmPackage):
+    """tmLQCD: a software suite providing tools for lattice QCD simulations."""
+
+    homepage = "https://www.itkp.uni-bonn.de/~urbach/software.html"
+    url = "https://github.com/etmc/tmLQCD/archive/refs/tags/rel-5-1-6.tar.gz"
+    git = "https://github.com/etmc/tmLQCD.git"
+    license("GPL-3.0-or-later")
+
+    maintainers("mtaillefumier")
+    version("master", branch="master")
+
+    variant("lemon", default=False, description="Enable the lemon backend")
+    variant("mpi", default=True, description="Enable mpi support")
+    variant("DDalphaAMG", default=False, description="Enable DDalphaAMG support")
+    variant("openmp", default=True, description="Enable OpenMP")
+    variant("fftw", default=True, description="Enable FFTW interface")
+    variant(
+        "persistent_mpi",
+        default=True,
+        description="Enable persistent mpi calls for spinor and gauge fields",
+        when="+mpi",
+    )
+    variant(
+        "nonblocking_mpi",
+        default=True,
+        description="Enable non-blocking mpi calls for spinor and gauge fields",
+        when="+mpi",
+    )
+    variant("fixedvolume", default=True, description="Enable fixed volume at compile time")
+    variant(
+        "alignment",
+        default="auto",
+        values=("none", "auto", "16", "32", "64"),
+        description="Automatically or explicitly align arrays",
+    )
+    variant("gauge_copy", default=True, description="Enable gauge field copy")
+    variant("half_spinor", default=True, description="Use a Dirac operator with half-spinor")
+    variant("shared", default=False, description="Enable shared library")
+    variant("shmem", default=False, description="Use shmem API")
+    variant("quda", default=True, description="Enable the QUDA library", when="+cuda")
+    variant("quda", default=True, description="Enable the QUDA library", when="+rocm")
+    variant(
+        "QPhiX", default=False, description="Enable the QPhiX library for Intel Xeon and Xeon Phis"
+    )
+    variant(
+        "mpi_dimensions",
+        default="4",
+        values=("1", "2", "3", "4", "x", "xy", "xyz"),
+        description="number of dimensions the mpi processes are distributed. the default is parallelization over all four dimensions txyz",
+        when="+mpi",
+    )
+
+    generator("ninja")
+
+    # language dependencies
+    depends_on("c", type="build")
+    depends_on("cxx", type="build")
+    depends_on("fortran", type="build")
+
+    # conflicts: a GPU backend must be given an explicit target architecture
+    conflicts("+cuda", when="cuda_arch=none")
+    conflicts("+rocm", when="amdgpu_target=none")
+
+    # hard dependencies
+    depends_on("c-lime")
+    depends_on("blas")
+    depends_on("lapack")
+    depends_on("pkgconfig", type="build")
+
+    # optional dependencies ("lemonio" is the recipe shipped in this repo)
+    depends_on("mpi", when="+mpi")
+    depends_on("lemonio", when="+lemon")
+
+    with when("+quda"):
+        depends_on(
+            "quda+twisted_mass+twisted_clover+clover+ndeg_twisted_clover+ndeg_twisted_mass+wilson+qdp+staggered+usqcd+multigrid"
+        )
+
+        depends_on("quda+mpi", when="+mpi")
+        depends_on("quda+cuda", when="+cuda")
+        depends_on("quda+rocm", when="+rocm")
+        depends_on("quda+nvshmem", when="+shmem")
+
+    depends_on("fftw-api@3", when="+fftw")
+
+
+class CMakeBuilder(cmake.CMakeBuilder):
+    def cmake_args(self):
+        """Return the CMake cache definitions derived from the package variants."""
+        args = [
+            self.define_from_variant("BUILD_SHARED_LIBS", "shared"),
+            self.define_from_variant("TM_USE_LEMON", "lemon"),
+            self.define_from_variant("TM_USE_MPI", "mpi"),
+            self.define_from_variant("TM_USE_QUDA", "quda"),
+            self.define_from_variant("TM_USE_CUDA", "cuda"),
+            self.define_from_variant("TM_USE_HIP", "rocm"),
+            self.define_from_variant("TM_USE_FFTW", "fftw"),
+            self.define_from_variant("TM_FIXEDVOLUME", "fixedvolume"),
+            self.define_from_variant("TM_USE_OMP", "openmp"),
+            self.define_from_variant("TM_USE_SHMEM", "shmem"),
+            self.define_from_variant("TM_USE_GAUGE_COPY", "gauge_copy"),
+            self.define_from_variant("TM_USE_HALFSPINOR", "half_spinor"),
+        ]
+        return args
diff --git a/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/repo.yaml b/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/repo.yaml
index f08fa46a4..7070c57de 100644
--- a/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/repo.yaml
+++ b/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/repo.yaml
@@ -1,2 +1,3 @@
 repo:
   namespace: apps
+  api: v2.2
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9eb04bbf0..3bed56be6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -4,7 +4,7 @@ project(
   tmlqcd
   DESCRIPTION "tmlQCD"
   HOMEPAGE_URL "https://github.com/etmc/tmLQCD"
-  VERSION "6.0.0"
+  VERSION "6.0.2"
   LANGUAGES C CXX)
 
 # include our cmake snippets
@@ -388,11 +388,14 @@ write_basic_package_version_file(
 configure_file("${PROJECT_SOURCE_DIR}/cmake/tmlQCD-config.cmake.in"
                "${PROJECT_BINARY_DIR}/tmlQCD-config.cmake" @ONLY)
 
+configure_file("${PROJECT_SOURCE_DIR}/cmake/tmlQCD.pc.in"
+               "${PROJECT_BINARY_DIR}/tmlQCD.pc" @ONLY)
+
 install(FILES "${PROJECT_BINARY_DIR}/tmlQCD-config.cmake"
               "${PROJECT_BINARY_DIR}/tmlQCDConfigVersion.cmake"
         DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/tmlQCD")
 
-install(FILES "${PROJECT_BINARY_DIR}/libtmlQCD.pc"
+install(FILES "${PROJECT_BINARY_DIR}/tmlQCD.pc"
         DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
 
 install(
diff --git a/DDalphaAMG/CMakeLists.txt b/DDalphaAMG/CMakeLists.txt
index 2f428a177..2a80851a5 100644
--- a/DDalphaAMG/CMakeLists.txt
+++ b/DDalphaAMG/CMakeLists.txt
@@ -167,7 +167,7 @@ target_link_libraries(
 
 target_include_directories(
   DDalphaAMG
-  PUBLIC $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME}>
+  PUBLIC $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/DDalphaAMG>
   $<BUILD_INTERFACE:${DDalphaAMG_SRC_DIR}/src>
   $<BUILD_INTERFACE:${DDalphaAMG_SRC_DIR}/include>
   $<BUILD_INTERFACE:${CMAKE_BINARY_DIR}/DDalphaAMG>)
@@ -188,8 +188,8 @@ target_compile_definitions(
   $<$<CONFIG:Debug>:DEBUG>
   SSE)
 
-install(FILES "${CMAKE_SOURCE_DIR}/deps/DDalphaAMG/DDalphaAMG.h"
-  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME}")
+install(FILES "${CMAKE_SOURCE_DIR}/deps/lib/DDalphaAMG.h"
+  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DDalphaAMG")
 
 write_basic_package_version_file(
   "${PROJECT_BINARY_DIR}/DDalphaAMGonfigVersion.cmake"
@@ -203,14 +203,14 @@ install(TARGETS DDalphaAMG
 install(EXPORT DDalphaAMG_targets
   FILE DDalphaAMG-Targets.cmake
   NAMESPACE DDalphaAMG::
-  DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}")
+  DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/DDalphaAMG")
 
 configure_file("${PROJECT_SOURCE_DIR}/cmake/DDalphaAMG-Config.cmake.in"
   "${PROJECT_BINARY_DIR}/DDalphaAMG-Config.cmake" @ONLY)
 
 install(FILES "${PROJECT_BINARY_DIR}/DDalphaAMG-Config.cmake"
   "${PROJECT_BINARY_DIR}/DDalphaAMG-ConfigVersion.cmake"
-  DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/tmlQCD")
+  DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/DDalphaAMG")
 
 
 # add_library(tmlqcd::DDalphaAMG alias DDalphaAMG)
diff --git a/cmake/tmlQCD.pc.in b/cmake/tmlQCD.pc.in
new file mode 100644
index 000000000..1660120d3
--- /dev/null
+++ b/cmake/tmlQCD.pc.in
@@ -0,0 +1,10 @@
+prefix=@CMAKE_INSTALL_PREFIX@
+exec_prefix=${prefix}
+libdir=${prefix}/@CMAKE_INSTALL_LIBDIR@
+includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
+
+Name: tmlQCD
+Description: tmlQCD
+Version: @PROJECT_VERSION@
+Cflags: -I"${includedir}/tmlqcd"
+Libs: -L"${libdir}" -lhmc
diff --git a/src/bin/CMakeLists.txt b/src/bin/CMakeLists.txt
index 2f135ddae..c03624ac1 100644
--- a/src/bin/CMakeLists.txt
+++ b/src/bin/CMakeLists.txt
@@ -42,3 +42,5 @@ if(TM_ENABLE_TESTS)
                  LINKER_LANGUAGE "CXX")
   endforeach()
 endif()
+
+install(TARGETS ${tmlqcd_prog} RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}")

From e641c5290e911cfc842b242f7a22bed8e7293b1f Mon Sep 17 00:00:00 2001
From: Mathieu Taillefumier <mathieu.taillefumier@free.fr>
Date: Thu, 26 Mar 2026 18:51:08 +0100
Subject: [PATCH 38/80] Use CMake 3.26 as default

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3bed56be6..9aae0097b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.30)
+cmake_minimum_required(VERSION 3.26)
 
 project(
   tmlqcd

From c35a98f59159777d2f4530bbac7df37f372fde15 Mon Sep 17 00:00:00 2001
From: Mathieu Taillefumier <mathieu.taillefumier@free.fr>
Date: Fri, 27 Mar 2026 09:07:02 +0100
Subject: [PATCH 39/80] Fix compilation issues

- Restored hopping.h
- Rename hmc to tmlqcd
- Fix TM_USE_DDalphaAMG when necessary
- Remove header guards in git_hash.c
---
 CMakeLists.txt                             |   4 +-
 cmake/git_hash.c.in                        |   5 -
 src/bin/CMakeLists.txt                     |   4 +-
 src/lib/CMakeLists.txt                     |  14 +-
 src/lib/DDalphaAMG_interface.c             |  71 ++++---
 src/lib/operator/Hopping_Matrix.c          |   1 +
 src/lib/operator/hopping.h                 | 211 +++++++++++----------
 src/lib/operator/hopping_body_dbl.c        | 179 +++++++++++++++++
 src/lib/operator/tm_times_Hopping_Matrix.c |   6 +-
 src/lib/quda_interface.c                   |   6 +-
 src/lib/solver/jdher.c                     |   4 +-
 src/lib/xchange/xchange_field.c            |   2 +-
 src/lib/xchange/xchange_halffield.c        |  18 +-
 13 files changed, 349 insertions(+), 176 deletions(-)
 create mode 100644 src/lib/operator/hopping_body_dbl.c

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9aae0097b..46e37f0d9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -158,7 +158,7 @@ cmake_dependent_option(
   DDalphaAMG_ENABLE_SCHWARZ_RES "Enable paramount output support" OFF
   "TM_USE_DDalphaAMG" OFF)
 
-cmake_dependent_option(DDalphaAMG_ENABLE_OMP "Enable OpenMP support" ON
+cmake_dependent_option(DDalphaAMG_ENABLE_OMP "Enable OpenMP support" ${TM_USE_OMP}
                        "TM_USE_DDalphaAMG" OFF)
 
 cmake_dependent_option(
@@ -178,7 +178,7 @@ find_package(FLEX REQUIRED)
 set(PACKAGE_NAME ${PROJECT_DESCRIPTION})
 set(PACKAGE_VERSION ${PROJECT_VERSION})
 set(PACKAGE_TARNAME "tmlqcd")
-set(PACKAGE_BUGREPORT "curbach@gmx.de")
+set(PACKAGE_BUGREPORT "https://github.com/etmc/tmLQCD")
 set(PACKAGE_STRING "${PROJECT_DESCRIPTION} ${PROJECT_VERSION}")
 
 set(ALIGN " ")
diff --git a/cmake/git_hash.c.in b/cmake/git_hash.c.in
index 912085abb..b73d81cd4 100644
--- a/cmake/git_hash.c.in
+++ b/cmake/git_hash.c.in
@@ -1,6 +1 @@
-#ifndef _GIT_HASH_H
-#define _GIT_HASH_H
-
 const char git_hash[] = "@TM_SHA@";
-
-#endif /* _GIT_HASH_H */
diff --git a/src/bin/CMakeLists.txt b/src/bin/CMakeLists.txt
index c03624ac1..f641f8b18 100644
--- a/src/bin/CMakeLists.txt
+++ b/src/bin/CMakeLists.txt
@@ -1,4 +1,4 @@
-list(APPEND tmlqcd_prog "benchmark;deriv_mg_tune;hmc_tm;offline_measurement")
+list(APPEND tmlqcd_prog "invert;benchmark;deriv_mg_tune;hmc_tm;offline_measurement")
 
 include_directories(
   $<BUILD_INTERFACE:${CMAKE_BINARY_DIR}>
@@ -10,7 +10,7 @@ include_directories(
 foreach(_prog ${tmlqcd_prog})
   add_executable(${_prog} "${_prog}.c")
 
-  target_link_libraries(${_prog} PUBLIC hmc)
+  target_link_libraries(${_prog} PUBLIC tmlqcd)
   set_target_properties(
     ${_prog}
     PROPERTIES IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
diff --git a/src/lib/CMakeLists.txt b/src/lib/CMakeLists.txt
index 8ae48d31b..937836ec1 100644
--- a/src/lib/CMakeLists.txt
+++ b/src/lib/CMakeLists.txt
@@ -394,16 +394,16 @@ endif()
 # space at all
 
 if(BUILD_SHARED_LIBS)
-  add_library(hmc SHARED "${ALL_SRC};${FLEX_tmlqcd_input_read_OUTPUTS}")
+  add_library(tmlqcd SHARED "${ALL_SRC};${FLEX_tmlqcd_input_read_OUTPUTS}")
 else()
-  add_library(hmc STATIC "${ALL_SRC};${FLEX_tmlqcd_input_read_OUTPUTS}")
+  add_library(tmlqcd STATIC "${ALL_SRC};${FLEX_tmlqcd_input_read_OUTPUTS}")
 endif()
 
-set_target_properties(hmc PROPERTIES VERSION ${PROJECT_VERSION} SOVERSION 1)
+set_target_properties(tmlqcd PROPERTIES VERSION ${PROJECT_VERSION} SOVERSION 1)
 
 # define a library and add the dependencies
 target_link_libraries(
-  hmc
+  tmlqcd
   PUBLIC $<$<BOOL:${TM_USE_DDalphaAMG}>:DDalphaAMG>
          $<$<BOOL:${TM_USE_QPHIX}>:tmlqcd::qphix>
          $<$<BOOL:${TM_USE_FFTW}>:tmlqcd::fftw3>
@@ -427,16 +427,16 @@ target_link_libraries(
          m)
 
 target_compile_definitions(
-  hmc PUBLIC HAVE_CONFIG_H $<$<BOOL:${TM_USE_HIP}>:${TM_GPU_PLATFORM_DFLAGS}>)
+  tmlqcd PUBLIC HAVE_CONFIG_H $<$<BOOL:${TM_USE_HIP}>:${TM_GPU_PLATFORM_DFLAGS}>)
 
 target_include_directories(
-  hmc
+  tmlqcd
   PUBLIC $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME}>
          $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
          $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>)
 
 install(
-  TARGETS hmc
+  TARGETS tmlqcd
   EXPORT tmlqcd_targets
   LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}")
 
diff --git a/src/lib/DDalphaAMG_interface.c b/src/lib/DDalphaAMG_interface.c
index 5c49ddc66..a4b1d61cf 100644
--- a/src/lib/DDalphaAMG_interface.c
+++ b/src/lib/DDalphaAMG_interface.c
@@ -17,7 +17,7 @@
  * You should have received a copy of the GNU General Public License
  * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
  *
- * Interface for TM_USE_DDalphaAMG
+ * Interface for DDalphaAMG
  *
  *******************************************************************************/
 
@@ -43,47 +43,47 @@ double mg_dtau_update;
 double mg_rho_update;
 
 void MG_init(void) {
-  printf("ERROR: MG_init called but TM_USE_DDalphaAMG library not included.\n");
+  printf("ERROR: MG_init called but DDalphaAMG library not included.\n");
   exit(1);
 }
 
 void MG_update_gauge(double step) {
-  printf("ERROR: MG_update_gauge called but TM_USE_DDalphaAMG library not included.\n");
+  printf("ERROR: MG_update_gauge called but DDalphaAMG library not included.\n");
   exit(1);
 }
 
 void MG_update_mu(double mu_tmLQCD, double odd_tmLQCD) {
-  printf("ERROR: MG_update_mu called but TM_USE_DDalphaAMG library not included.\n");
+  printf("ERROR: MG_update_mu called but DDalphaAMG library not included.\n");
   exit(1);
 }
 
 void MG_reset(void) {
-  printf("ERROR: MG_reset called but TM_USE_DDalphaAMG library not included.\n");
+  printf("ERROR: MG_reset called but DDalphaAMG library not included.\n");
   exit(1);
 }
 
 void MG_finalize(void) {
-  printf("ERROR: MG_finalize called but TM_USE_DDalphaAMG library not included.\n");
+  printf("ERROR: MG_finalize called but DDalphaAMG library not included.\n");
   exit(1);
 }
 
 int MG_solver(spinor *const phi_new, spinor *const phi_old, const double precision,
               const int max_iter, const int rel_prec, const int N, su3 **gf, matrix_mult f) {
-  printf("ERROR: MG_solver called but TM_USE_DDalphaAMG library not included.\n");
+  printf("ERROR: MG_solver called but DDalphaAMG library not included.\n");
   exit(1);
 }
 
 int MG_solver_eo(spinor *const Even_new, spinor *const Odd_new, spinor *const Even,
                  spinor *const Odd, const double precision, const int max_iter, const int rel_prec,
                  const int N, su3 **gf, matrix_mult_full f_full) {
-  printf("ERROR: MG_solver_eo called but TM_USE_DDalphaAMG library not included.\n");
+  printf("ERROR: MG_solver_eo called but DDalphaAMG library not included.\n");
   exit(1);
 }
 
 int MG_solver_nd(spinor *const up_new, spinor *const dn_new, spinor *const up_old,
                  spinor *const dn_old, const double precision, const int max_iter,
                  const int rel_prec, const int N, su3 **gf, matrix_mult_nd f) {
-  printf("ERROR: MG_solver_nd called but TM_USE_DDalphaAMG library not included.\n");
+  printf("ERROR: MG_solver_nd called but DDalphaAMG library not included.\n");
   exit(1);
 }
 
@@ -207,7 +207,7 @@ static inline int MG_check(spinor *const phi_new, spinor *const phi_old, const i
           "ERROR: something bad happened... MG converged giving the wrong solution!! Trying to "
           "restart... \n");
       printf(
-          "ERROR contd: || s - f_{tmLQC} * f_{TM_USE_DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = "
+          "ERROR contd: || s - f_{tmLQC} * f_{DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = "
           "%e > %e "
           "\n",
           differ[0], differ[1], differ[0] / differ[1], precision);
@@ -216,9 +216,8 @@ static inline int MG_check(spinor *const phi_new, spinor *const phi_old, const i
   }
 
   if (g_debug_level > 0 && g_proc_id == 0)
-    printf(
-        "MGTEST:  || s - f_{tmLQC} * f_{TM_USE_DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e \n",
-        differ[0], differ[1], differ[0] / differ[1]);
+    printf("MGTEST:  || s - f_{tmLQC} * f_{DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e \n",
+           differ[0], differ[1], differ[0] / differ[1]);
 
   return 1;
 }
@@ -259,7 +258,7 @@ static inline int MG_check_nd(spinor *const up_new, spinor *const dn_new, spinor
           "ERROR: something bad happened... MG converged giving the wrong solution!! Trying to "
           "restart... \n");
       printf(
-          "ERROR contd: || s - f_{tmLQC} * f_{TM_USE_DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = "
+          "ERROR contd: || s - f_{tmLQC} * f_{DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = "
           "%e > %e "
           "\n",
           differ[0], differ[1], differ[0] / differ[1], precision);
@@ -268,9 +267,8 @@ static inline int MG_check_nd(spinor *const up_new, spinor *const dn_new, spinor
   }
 
   if (g_debug_level > 0 && g_proc_id == 0)
-    printf(
-        "MGTEST:  || s - f_{tmLQC} * f_{TM_USE_DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e \n",
-        differ[0], differ[1], differ[0] / differ[1]);
+    printf("MGTEST:  || s - f_{tmLQC} * f_{DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e \n",
+           differ[0], differ[1], differ[0] / differ[1]);
 
   return 1;
 }
@@ -308,7 +306,7 @@ static inline int MG_mms_check_nd(spinor **const up_new, spinor **const dn_new,
             "ERROR: something bad happened... MG converged giving the wrong solution!! Trying to "
             "restart... \n");
         printf(
-            "ERROR contd: || s - f_{tmLQC} * f_{TM_USE_DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = "
+            "ERROR contd: || s - f_{tmLQC} * f_{DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = "
             "%e > "
             "%e \n",
             differ[0], differ[1], differ[0] / differ[1], precision[i]);
@@ -318,9 +316,8 @@ static inline int MG_mms_check_nd(spinor **const up_new, spinor **const dn_new,
     }
 
     if (g_debug_level > 0 && g_proc_id == 0)
-      printf(
-          "MGTEST:  || s - f_{tmLQC} * f_{TM_USE_DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e \n",
-          differ[0], differ[1], differ[0] / differ[1]);
+      printf("MGTEST:  || s - f_{tmLQC} * f_{DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e \n",
+             differ[0], differ[1], differ[0] / differ[1]);
   }
 
   finalize_solver(check_vect, 2);
@@ -349,7 +346,7 @@ static int MG_pre_solve(su3 **gf) {
   if (mg_initialized == 0) {
     MG_init();
     mg_initialized = 1;
-    if (g_proc_id == 0) printf("TM_USE_DDalphaAMG initialized\n");
+    if (g_proc_id == 0) printf("DDalphaAMG initialized\n");
     MPI_Barrier(MPI_COMM_WORLD);
   }
 
@@ -357,41 +354,41 @@ static int MG_pre_solve(su3 **gf) {
     DDalphaAMG_set_configuration((double *)&(gf[0][0]), &mg_status);
     mg_update_gauge = 0;
     if (mg_status.success && g_proc_id == 0)
-      printf("TM_USE_DDalphaAMG cnfg set, plaquette %e\n", mg_status.info);
+      printf("DDalphaAMG cnfg set, plaquette %e\n", mg_status.info);
     else if (g_proc_id == 0)
       printf("ERROR: configuration updating did not run correctly");
   }
 
   if (mg_do_setup == 1) {
     if (mg_setup_mu_set) {
-      if (g_proc_id == 0) printf("TM_USE_DDalphaAMG using mu=%f during setup\n", mg_setup_mu);
+      if (g_proc_id == 0) printf("DDalphaAMG using mu=%f during setup\n", mg_setup_mu);
       MG_update_mu(mg_setup_mu, 0);
     } else
       MG_update_mu(g_mu, 0);
-    if (g_proc_id == 0) printf("TM_USE_DDalphaAMG running setup\n");
+    if (g_proc_id == 0) printf("DDalphaAMG running setup\n");
     DDalphaAMG_setup(&mg_status);
     mg_do_setup = 0;
     mg_tau = gauge_tau;
     if (mg_status.success && g_proc_id == 0)
-      printf("TM_USE_DDalphaAMG setup ran, time %.2f sec (%.2f %% on coarse grid)\n",
-             mg_status.time, 100. * (mg_status.coarse_time / mg_status.time));
+      printf("DDalphaAMG setup ran, time %.2f sec (%.2f %% on coarse grid)\n", mg_status.time,
+             100. * (mg_status.coarse_time / mg_status.time));
     else if (g_proc_id == 0)
       printf("ERROR: setup procedure did not run correctly");
   }
 
   if (mg_update_setup > 0) {
     if (mg_setup_mu_set) {
-      if (g_proc_id == 0) printf("TM_USE_DDalphaAMG using mu=%f during setup\n", mg_setup_mu);
+      if (g_proc_id == 0) printf("DDalphaAMG using mu=%f during setup\n", mg_setup_mu);
       MG_update_mu(mg_setup_mu, 0);
     } else
       MG_update_mu(g_mu, 0);
-    if (g_proc_id == 0) printf("TM_USE_DDalphaAMG updating setup\n");
+    if (g_proc_id == 0) printf("DDalphaAMG updating setup\n");
     DDalphaAMG_update_setup(mg_update_setup, &mg_status);
     mg_update_setup = 0;
     mg_tau = gauge_tau;
     if (mg_status.success && g_proc_id == 0)
-      printf("TM_USE_DDalphaAMG setup ran, time %.2f sec (%.2f %% on coarse grid)\n",
-             mg_status.time, 100. * (mg_status.coarse_time / mg_status.time));
+      printf("DDalphaAMG setup ran, time %.2f sec (%.2f %% on coarse grid)\n", mg_status.time,
+             100. * (mg_status.coarse_time / mg_status.time));
     else if (g_proc_id == 0)
       printf("ERROR: setup updating did not run correctly");
   }
@@ -401,7 +398,7 @@ static int MG_pre_solve(su3 **gf) {
 
 static int MG_solve(spinor *const phi_new, spinor *const phi_old, const double precision,
                     const int N, matrix_mult f) {
-  // for rescaling  convention in TM_USE_DDalphaAMG: (4+m)*\delta_{x,y} in tmLQCD: 1*\delta_{x,y} ->
+  // for rescaling  convention in DDalphaAMG: (4+m)*\delta_{x,y} in tmLQCD: 1*\delta_{x,y} ->
   // rescale by 1/4+m
   double mg_scale = 0.5 / g_kappa;
   double *old = (double *)phi_old;
@@ -535,7 +532,7 @@ static int MG_solve(spinor *const phi_new, spinor *const phi_old, const double p
 
 static int MG_solve_nd(spinor *up_new, spinor *dn_new, spinor *const up_old, spinor *const dn_old,
                        const double precision, const int N, matrix_mult_nd f) {
-  // for rescaling  convention in TM_USE_DDalphaAMG: (4+m)*\delta_{x,y} in tmLQCD: 1*\delta_{x,y} ->
+  // for rescaling  convention in DDalphaAMG: (4+m)*\delta_{x,y} in tmLQCD: 1*\delta_{x,y} ->
   // rescale by 1/4+m moreover in the nd case, the tmLQCD is multiplied by phmc_invmaxev
   double mg_scale = 0.5 / g_kappa / phmc_invmaxev;
   double sqnorm;
@@ -809,7 +806,7 @@ static int MG_solve_nd(spinor *up_new, spinor *dn_new, spinor *const up_old, spi
                                          // 0 and shift
              f == Qsw_pm_ndpsi ||        // (Gamma5 Dh tau1)^2 - Schur complement squared
              f == Qsw_pm_ndpsi_shift) {  // (Gamma5 Dh tau1)^2 - Schur complement squared with shift
-    // TM_USE_DDalphaAMG: tau1 gamma5 Dh tau1 gamma5 Dh
+    // DDalphaAMG: tau1 gamma5 Dh tau1 gamma5 Dh
     // tmLQCD:          gamma5 Dh tau1 gamma5 Dh tau1
     if (init_guess) {
       mul_gamma5(old1, VOLUME);
@@ -906,7 +903,7 @@ static int MG_solve_nd(spinor *up_new, spinor *dn_new, spinor *const up_old, spi
 static int MG_mms_solve_nd(spinor **const up_new, spinor **const dn_new, spinor *const up_old,
                            spinor *const dn_old, const double *shifts, const int no_shifts,
                            double *precision, const int N, matrix_mult_nd f) {
-  // for rescaling  convention in TM_USE_DDalphaAMG: (4+m)*\delta_{x,y} in tmLQCD: 1*\delta_{x,y} ->
+  // for rescaling  convention in DDalphaAMG: (4+m)*\delta_{x,y} in tmLQCD: 1*\delta_{x,y} ->
   // rescale by 1/4+m moreover in the nd case, the tmLQCD is multiplied by phmc_invmaxev
   double mg_scale = 0.5 / g_kappa / phmc_invmaxev;
   double *old1 = (double *)up_old;
@@ -1007,7 +1004,7 @@ static int MG_mms_solve_nd(spinor **const up_new, spinor **const dn_new, spinor
                                          // 0 and shift
              f == Qsw_pm_ndpsi_shift) {  // (Gamma5 Dh tau1)^2 - Schur complement squared with shift
     mg_scale *= mg_scale;
-    // TM_USE_DDalphaAMG: tau1 gamma5 Dh tau1 gamma5 Dh
+    // DDalphaAMG: tau1 gamma5 Dh tau1 gamma5 Dh
     // tmLQCD:          gamma5 Dh tau1 gamma5 Dh tau1
     DDalphaAMG_solve_ms_doublet_squared_odd(new2, old2, new1, old1, mg_even_shifts, mg_odd_shifts,
                                             no_shifts, precision, &mg_status);
@@ -1116,7 +1113,7 @@ void MG_init() {
   mg_params.conf_index_fct = conf_index_fct;
   mg_params.vector_index_fct = vector_index_fct;
 
-  /* in TM_USE_DDalphaAMG
+  /* in DDalphaAMG
    * Printing level:
    *  -1: silent (errors or warnings)
    *   0: minimal //default
diff --git a/src/lib/operator/Hopping_Matrix.c b/src/lib/operator/Hopping_Matrix.c
index 41b85aa21..d25c3d961 100644
--- a/src/lib/operator/Hopping_Matrix.c
+++ b/src/lib/operator/Hopping_Matrix.c
@@ -106,6 +106,7 @@ void Hopping_Matrix(const int ieo, spinor* const l, spinor* const k) {
   {
 #endif
 
+#include "hopping.h"
 #include "operator/hopping_body_dbl.inc"
 
 #ifdef TM_USE_OMP
diff --git a/src/lib/operator/hopping.h b/src/lib/operator/hopping.h
index f3b948c49..5027f1976 100644
--- a/src/lib/operator/hopping.h
+++ b/src/lib/operator/hopping.h
@@ -28,125 +28,126 @@
 #ifndef _HOPPING_H
 #define _HOPPING_H
 
-#define _declare_regs()      \
+#define _declare_regs()	     \
   su3_vector ALIGN psi, chi; \
   spinor ALIGN temp;
 
-#define _hop_t_p()                      \
-  _vector_add(psi, sp->s0, sp->s2);     \
-  _su3_multiply(chi, (*up), psi);       \
-  _complex_times_vector(psi, ka0, chi); \
-  _vector_assign(temp.s0, psi);         \
-  _vector_assign(temp.s2, psi);         \
-  _vector_add(psi, sp->s1, sp->s3);     \
-  _su3_multiply(chi, (*up), psi);       \
-  _complex_times_vector(psi, ka0, chi); \
-  _vector_assign(temp.s1, psi);         \
-  _vector_assign(temp.s3, psi);
+#define _hop_t_p()				\
+  _vector_add(psi,sp->s0,sp->s2);		\
+  _su3_multiply(chi,(*up),psi);			\
+  _complex_times_vector(psi,ka0,chi);		\
+  _vector_assign(temp.s0,psi);			\
+  _vector_assign(temp.s2,psi);			\
+  _vector_add(psi,sp->s1,sp->s3);		\
+  _su3_multiply(chi,(*up),psi);			\
+  _complex_times_vector(psi,ka0,chi);		\
+  _vector_assign(temp.s1,psi);			\
+  _vector_assign(temp.s3,psi);
+  
+#define _hop_t_m()				\
+  _vector_sub(psi,sm->s0,sm->s2);		\
+  _su3_inverse_multiply(chi,(*um),psi);		\
+  _complexcjg_times_vector(psi,ka0,chi);	\
+  _vector_add_assign(temp.s0,psi);		\
+  _vector_sub_assign(temp.s2,psi);		\
+  _vector_sub(psi,sm->s1,sm->s3);		\
+  _su3_inverse_multiply(chi,(*um),psi);		\
+  _complexcjg_times_vector(psi,ka0,chi);	\
+  _vector_add_assign(temp.s1,psi);		\
+  _vector_sub_assign(temp.s3,psi);
 
-#define _hop_t_m()                         \
-  _vector_sub(psi, sm->s0, sm->s2);        \
-  _su3_inverse_multiply(chi, (*um), psi);  \
-  _complexcjg_times_vector(psi, ka0, chi); \
-  _vector_add_assign(temp.s0, psi);        \
-  _vector_sub_assign(temp.s2, psi);        \
-  _vector_sub(psi, sm->s1, sm->s3);        \
-  _su3_inverse_multiply(chi, (*um), psi);  \
-  _complexcjg_times_vector(psi, ka0, chi); \
-  _vector_add_assign(temp.s1, psi);        \
-  _vector_sub_assign(temp.s3, psi);
+#define _hop_x_p()				\
+  _vector_i_add(psi,sp->s0,sp->s3);		\
+  _su3_multiply(chi,(*up),psi);			\
+  _complex_times_vector(psi,ka1,chi);		\
+  _vector_add_assign(temp.s0,psi);		\
+  _vector_i_sub_assign(temp.s3,psi);		\
+  _vector_i_add(psi,sp->s1,sp->s2);		\
+  _su3_multiply(chi,(*up),psi);			\
+  _complex_times_vector(psi,ka1,chi);		\
+  _vector_add_assign(temp.s1,psi);		\
+  _vector_i_sub_assign(temp.s2,psi);
 
-#define _hop_x_p()                      \
-  _vector_i_add(psi, sp->s0, sp->s3);   \
-  _su3_multiply(chi, (*up), psi);       \
-  _complex_times_vector(psi, ka1, chi); \
-  _vector_add_assign(temp.s0, psi);     \
-  _vector_i_sub_assign(temp.s3, psi);   \
-  _vector_i_add(psi, sp->s1, sp->s2);   \
-  _su3_multiply(chi, (*up), psi);       \
-  _complex_times_vector(psi, ka1, chi); \
-  _vector_add_assign(temp.s1, psi);     \
-  _vector_i_sub_assign(temp.s2, psi);
+#define _hop_x_m()				\
+  _vector_i_sub(psi,sm->s0,sm->s3);		\
+  _su3_inverse_multiply(chi,(*um),psi);		\
+  _complexcjg_times_vector(psi,ka1,chi);	\
+  _vector_add_assign(temp.s0,psi);		\
+  _vector_i_add_assign(temp.s3,psi);		\
+  _vector_i_sub(psi,sm->s1,sm->s2);		\
+  _su3_inverse_multiply(chi,(*um),psi);		\
+  _complexcjg_times_vector(psi,ka1,chi);	\
+  _vector_add_assign(temp.s1,psi);		\
+  _vector_i_add_assign(temp.s2,psi);
 
-#define _hop_x_m()                         \
-  _vector_i_sub(psi, sm->s0, sm->s3);      \
-  _su3_inverse_multiply(chi, (*um), psi);  \
-  _complexcjg_times_vector(psi, ka1, chi); \
-  _vector_add_assign(temp.s0, psi);        \
-  _vector_i_add_assign(temp.s3, psi);      \
-  _vector_i_sub(psi, sm->s1, sm->s2);      \
-  _su3_inverse_multiply(chi, (*um), psi);  \
-  _complexcjg_times_vector(psi, ka1, chi); \
-  _vector_add_assign(temp.s1, psi);        \
-  _vector_i_add_assign(temp.s2, psi);
+#define _hop_y_p()				\
+  _vector_add(psi,sp->s0,sp->s3);		\
+  _su3_multiply(chi,(*up),psi);			\
+  _complex_times_vector(psi,ka2,chi);		\
+  _vector_add_assign(temp.s0,psi);		\
+  _vector_add_assign(temp.s3,psi);		\
+  _vector_sub(psi,sp->s1,sp->s2);		\
+  _su3_multiply(chi,(*up),psi);			\
+  _complex_times_vector(psi,ka2,chi);		\
+  _vector_add_assign(temp.s1,psi);		\
+  _vector_sub_assign(temp.s2,psi);
 
-#define _hop_y_p()                      \
-  _vector_add(psi, sp->s0, sp->s3);     \
-  _su3_multiply(chi, (*up), psi);       \
-  _complex_times_vector(psi, ka2, chi); \
-  _vector_add_assign(temp.s0, psi);     \
-  _vector_add_assign(temp.s3, psi);     \
-  _vector_sub(psi, sp->s1, sp->s2);     \
-  _su3_multiply(chi, (*up), psi);       \
-  _complex_times_vector(psi, ka2, chi); \
-  _vector_add_assign(temp.s1, psi);     \
-  _vector_sub_assign(temp.s2, psi);
+#define _hop_y_m()				\
+  _vector_sub(psi,sm->s0,sm->s3);		\
+  _su3_inverse_multiply(chi,(*um),psi);		\
+  _complexcjg_times_vector(psi,ka2,chi);	\
+  _vector_add_assign(temp.s0,psi);		\
+  _vector_sub_assign(temp.s3,psi);		\
+  _vector_add(psi,sm->s1,sm->s2);		\
+  _su3_inverse_multiply(chi,(*um),psi);		\
+  _complexcjg_times_vector(psi,ka2,chi);	\
+  _vector_add_assign(temp.s1,psi);		\
+  _vector_add_assign(temp.s2,psi);
 
-#define _hop_y_m()                         \
-  _vector_sub(psi, sm->s0, sm->s3);        \
-  _su3_inverse_multiply(chi, (*um), psi);  \
-  _complexcjg_times_vector(psi, ka2, chi); \
-  _vector_add_assign(temp.s0, psi);        \
-  _vector_sub_assign(temp.s3, psi);        \
-  _vector_add(psi, sm->s1, sm->s2);        \
-  _su3_inverse_multiply(chi, (*um), psi);  \
-  _complexcjg_times_vector(psi, ka2, chi); \
-  _vector_add_assign(temp.s1, psi);        \
-  _vector_add_assign(temp.s2, psi);
+#define _hop_z_p()				\
+  _vector_i_add(psi,sp->s0,sp->s2);		\
+  _su3_multiply(chi,(*up),psi);			\
+  _complex_times_vector(psi,ka3,chi);		\
+  _vector_add_assign(temp.s0,psi);		\
+  _vector_i_sub_assign(temp.s2,psi);		\
+  _vector_i_sub(psi,sp->s1,sp->s3);		\
+  _su3_multiply(chi,(*up),psi);			\
+  _complex_times_vector(psi,ka3,chi);		\
+  _vector_add_assign(temp.s1,psi);		\
+  _vector_i_add_assign(temp.s3,psi);
 
-#define _hop_z_p()                      \
-  _vector_i_add(psi, sp->s0, sp->s2);   \
-  _su3_multiply(chi, (*up), psi);       \
-  _complex_times_vector(psi, ka3, chi); \
-  _vector_add_assign(temp.s0, psi);     \
-  _vector_i_sub_assign(temp.s2, psi);   \
-  _vector_i_sub(psi, sp->s1, sp->s3);   \
-  _su3_multiply(chi, (*up), psi);       \
-  _complex_times_vector(psi, ka3, chi); \
-  _vector_add_assign(temp.s1, psi);     \
-  _vector_i_add_assign(temp.s3, psi);
-
-#define _hop_z_m()                         \
-  _vector_i_sub(psi, sm->s0, sm->s2);      \
-  _su3_inverse_multiply(chi, (*um), psi);  \
-  _complexcjg_times_vector(psi, ka3, chi); \
-  _vector_add_assign(temp.s0, psi);        \
-  _vector_i_add_assign(temp.s2, psi);      \
-  _vector_i_add(psi, sm->s1, sm->s3);      \
-  _su3_inverse_multiply(chi, (*um), psi);  \
-  _complexcjg_times_vector(psi, ka3, chi); \
-  _vector_add_assign(temp.s1, psi);        \
+#define _hop_z_m()				\
+  _vector_i_sub(psi,sm->s0,sm->s2);		\
+  _su3_inverse_multiply(chi,(*um),psi);		\
+  _complexcjg_times_vector(psi,ka3,chi);	\
+  _vector_add_assign(temp.s0, psi);		\
+  _vector_i_add_assign(temp.s2, psi);		\
+  _vector_i_add(psi,sm->s1,sm->s3);		\
+  _su3_inverse_multiply(chi,(*um),psi);		\
+  _complexcjg_times_vector(psi,ka3,chi);	\
+  _vector_add_assign(temp.s1, psi);		\
   _vector_i_sub_assign(temp.s3, psi);
 
-#define _hop_mul_g5_cmplx_and_store()                 \
-  _complex_times_vector(rn->s0, cfactor, temp.s0);    \
-  _complex_times_vector(rn->s1, cfactor, temp.s1);    \
-  _complexcjg_times_vector(rn->s2, cfactor, temp.s2); \
+#define _hop_mul_g5_cmplx_and_store()			\
+  _complex_times_vector(rn->s0, cfactor, temp.s0);	\
+  _complex_times_vector(rn->s1, cfactor, temp.s1);	\
+  _complexcjg_times_vector(rn->s2, cfactor, temp.s2);	\
   _complexcjg_times_vector(rn->s3, cfactor, temp.s3);
 
-#define _g5_cmplx_sub_hop_and_g5store()           \
-  _complex_times_vector(psi, cfactor, pn->s0);    \
-  _vector_sub(rn->s0, psi, temp.s0);              \
-  _complex_times_vector(chi, cfactor, pn->s1);    \
-  _vector_sub(rn->s1, chi, temp.s1);              \
-  _complexcjg_times_vector(psi, cfactor, pn->s2); \
-  _vector_sub(rn->s2, temp.s2, psi);              \
-  _complexcjg_times_vector(chi, cfactor, pn->s3); \
+#define _g5_cmplx_sub_hop_and_g5store()			\
+  _complex_times_vector(psi, cfactor, pn->s0);		\
+  _vector_sub(rn->s0, psi, temp.s0);			\
+  _complex_times_vector(chi, cfactor, pn->s1);		\
+  _vector_sub(rn->s1, chi, temp.s1);			\
+  _complexcjg_times_vector(psi, cfactor, pn->s2);	\
+  _vector_sub(rn->s2, temp.s2, psi);			\
+  _complexcjg_times_vector(chi, cfactor, pn->s3);	\
   _vector_sub(rn->s3, temp.s3, chi);
 
-#define _store_res()               \
-  _vector_assign(rn->s0, temp.s0); \
-  _vector_assign(rn->s1, temp.s1); \
-  _vector_assign(rn->s2, temp.s2); \
+#define _store_res()				\
+  _vector_assign(rn->s0, temp.s0);		\
+  _vector_assign(rn->s1, temp.s1);		\
+  _vector_assign(rn->s2, temp.s2);		\
   _vector_assign(rn->s3, temp.s3);
+
 #endif
diff --git a/src/lib/operator/hopping_body_dbl.c b/src/lib/operator/hopping_body_dbl.c
new file mode 100644
index 000000000..62e303d84
--- /dev/null
+++ b/src/lib/operator/hopping_body_dbl.c
@@ -0,0 +1,179 @@
+/**********************************************************************
+ *
+ *
+ * Copyright (C) 2012 Carsten Urbach, Bartosz Kostrzewa
+ *
+ * This file is based on an implementation of the Dirac operator 
+ * written by Martin Luescher, modified by Martin Hasenbusch in 2002 
+ * and modified and extended by Carsten Urbach from 2003-2008
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ **********************************************************************/
+#include "hopping.h"
+
+  int ioff;
+  int * hi;
+  su3 * restrict ALIGN up;
+  su3 * restrict ALIGN um;
+  spinor * restrict ALIGN sp;
+  spinor * restrict ALIGN sm;
+  spinor * restrict ALIGN rn;
+  
+  _declare_regs();
+
+  if(ieo == 0){
+    ioff = 0;
+  } 
+  else{
+    ioff = (VOLUME+RAND)/2;
+  }
+
+#ifndef TM_USE_OMP
+  hi = &g_hi[16*ioff];
+
+#  if ((defined _GAUGE_COPY))
+  up=&g_gauge_field_copy[ioff][0];
+#  else
+  up=&g_gauge_field[(*hi)][0];
+#  endif
+  hi++;
+  sp=k+(*hi);
+  hi++;
+#endif
+
+  /**************** loop over all lattice sites ******************/
+#ifdef TM_USE_OMP
+#  pragma omp for
+#endif
+  for(int icx = ioff; icx < (VOLUME/2+ioff); icx++){
+#ifdef TM_USE_OMP
+    hi = &g_hi[16*icx];
+#  if ((defined _GAUGE_COPY))
+    up=&g_gauge_field_copy[icx][0];
+#  else
+    up=&g_gauge_field[(*hi)][0];
+#  endif
+    hi++;
+    sp=k+(*hi);
+    hi++;
+#endif
+    rn=l+(icx-ioff);
+#ifdef _TM_SUB_HOP
+    pn=p+(icx-ioff);
+#endif
+    /*********************** direction +t ************************/
+#    if (!defined _GAUGE_COPY)
+    um=&g_gauge_field[(*hi)][0]; 
+#    else
+    um=up+1;
+#    endif
+    hi++;
+    sm=k+(*hi);
+    hi+=2;
+
+    _hop_t_p();
+
+    /*********************** direction -t ************************/
+#    if ((defined _GAUGE_COPY))
+    up=um+1;
+#    else
+    up+=1;
+#    endif
+    sp=k+(*hi);
+    hi++;
+    
+    _hop_t_m();
+
+    /*********************** direction +1 ************************/
+#    ifndef _GAUGE_COPY
+    um=&g_gauge_field[(*hi)][1]; 
+#    else
+    um = up+1;
+#    endif
+    hi++;
+    sm=k+(*hi);
+    hi+=2;
+
+    _hop_x_p();
+
+    /*********************** direction -1 ************************/
+#    if ((defined _GAUGE_COPY))
+    up=um+1;
+#    else
+    up+=1;
+#    endif
+    sp=k+(*hi);
+    hi++;
+
+    _hop_x_m();
+
+    /*********************** direction +2 ************************/
+#    ifndef _GAUGE_COPY
+    um=&g_gauge_field[(*hi)][2]; 
+#    else
+    um= up+1;
+#    endif
+    hi++;
+    sm=k+(*hi);
+    hi+=2;
+
+    _hop_y_p();
+
+    /*********************** direction -2 ************************/
+#    if ((defined _GAUGE_COPY))
+    up=um+1;
+#    else
+    up+=1;
+#    endif
+    sp=k+(*hi);
+    hi++;
+
+    _hop_y_m();
+
+    /*********************** direction +3 ************************/
+#    ifndef _GAUGE_COPY
+    um=&g_gauge_field[(*hi)][3]; 
+#    else
+    um=up+1;
+#    endif
+    hi++;
+    sm=k+(*hi);
+    hi++;
+
+    _hop_z_p();
+
+    /*********************** direction -3 ************************/
+#ifndef TM_USE_OMP
+#  if ((defined _GAUGE_COPY))
+    up=um+1;
+#  else
+    up=&g_gauge_field[(*hi)][0];
+#  endif
+    hi++;
+    sp=k+(*hi);
+    hi++;
+#endif
+    _hop_z_m();
+
+#ifdef _MUL_G5_CMPLX
+    _hop_mul_g5_cmplx_and_store();
+#elif defined _TM_SUB_HOP
+    _g5_cmplx_sub_hop_and_g5store();
+#else
+    _store_res();
+#endif
+  }
diff --git a/src/lib/operator/tm_times_Hopping_Matrix.c b/src/lib/operator/tm_times_Hopping_Matrix.c
index eaeb92a93..cbd8132b1 100644
--- a/src/lib/operator/tm_times_Hopping_Matrix.c
+++ b/src/lib/operator/tm_times_Hopping_Matrix.c
@@ -52,7 +52,7 @@
 //
 
 #if (defined TM_USE_HALFSPINOR && !defined _NO_COMM)
-#include "operator/halfspinor_hopping.h"
+#include "halfspinor_hopping.h"
 
 void tm_times_Hopping_Matrix(const int ieo, spinor* const l, spinor* const k,
                              complex double const cfactor) {
@@ -69,7 +69,7 @@ void tm_times_Hopping_Matrix(const int ieo, spinor* const l, spinor* const k,
 #endif
 
 #define _MUL_G5_CMPLX
-#include "operator/halfspinor_body.inc"
+#include "halfspinor_body.inc"
 #undef _MUL_G5_CMPLX
 
 #ifdef TM_USE_OMP
@@ -96,7 +96,7 @@ void tm_times_Hopping_Matrix(const int ieo, spinor* const l, spinor* const k,
   {
 #endif
 #define _MUL_G5_CMPLX
-#include "operator/hopping_body_dbl.c"
+#include "hopping_body_dbl.c"
 #undef _MUL_G5_CMPLX
 #ifdef TM_USE_OMP
   } /* OpenMP closing brace */
diff --git a/src/lib/quda_interface.c b/src/lib/quda_interface.c
index 674b84677..17898ea8a 100644
--- a/src/lib/quda_interface.c
+++ b/src/lib/quda_interface.c
@@ -2546,8 +2546,8 @@ void compute_gauge_derivative_quda(monomial *const mnl, hamiltonian_field_t *con
   reset_quda_gauge_state(&quda_gauge_state);
 
   tm_stopwatch_push(&g_timers, "computeGaugeForceQuda", "");
-  computeGaugeForceQuda((void *)mom_quda, (void *)gauge_quda, path_buf, (int *)path_length, loop_coeff,
-                        num_paths, max_length, 1.0, &f_gauge_param);
+  computeGaugeForceQuda((void *)mom_quda, (void *)gauge_quda, path_buf, (int *)path_length,
+                        loop_coeff, num_paths, max_length, 1.0, &f_gauge_param);
   tm_stopwatch_pop(&g_timers, 0, 1, "TM_QUDA");
 
   free(path_buf);
@@ -2907,7 +2907,7 @@ void quda_mg_tune_params(void *spinorOut, void *spinorIn, const int max_iter) {
   int cur_tuning_lvl = mg_n_level - 1;
   int cur_lvl_tuning_steps = get_lvl_tuning_steps(&quda_mg_tuning_plan, cur_tuning_lvl);
   int steps_done_in_cur_dir = 0;
-  int i = 0; 
+  int i = 0;
   tm_QudaMGTuningDirection_t cur_tuning_dir = TM_MG_TUNE_MU_FACTOR;
 
   // when tuning over multiple configurations, we tune on the first config based
diff --git a/src/lib/solver/jdher.c b/src/lib/solver/jdher.c
index bbe25d11a..2ed9b2246 100644
--- a/src/lib/solver/jdher.c
+++ b/src/lib/solver/jdher.c
@@ -127,8 +127,8 @@ void jdher(int n, int lda, double tau, double tol, int kmax, int jmax, int jmin,
    * initialize with NULL, so we can free even unallocated ptrs */
   double *s = NULL, *resnrm = NULL, *resnrm_old = NULL, *dtemp = NULL, *rwork = NULL;
 
-  _Complex double *V = NULL, *Vtmp = NULL, *U = NULL, *M = NULL, *Z = NULL, *Res,
-                  *eigwork = NULL, *temp1_ = NULL, *temp1;
+  _Complex double *V = NULL, *Vtmp = NULL, *U = NULL, *M = NULL, *Z = NULL, *Res, *eigwork = NULL,
+                  *temp1_ = NULL, *temp1;
 
   int *idx1 = NULL, *idx2 = NULL, *convind = NULL, *keepind = NULL, *solvestep = NULL,
       *actcorrits = NULL;
diff --git a/src/lib/xchange/xchange_field.c b/src/lib/xchange/xchange_field.c
index ee463ec17..217631f8e 100644
--- a/src/lib/xchange/xchange_field.c
+++ b/src/lib/xchange/xchange_field.c
@@ -336,7 +336,7 @@ void xchange_field(spinor* const l, const int ieo) {
 
 /* Here comes the naive version */
 /* Using MPI_Sendrecv */
-#else /* TM_NON_BLOCKING TM_USE_SHMEM */
+#else   /* TM_NON_BLOCKING TM_USE_SHMEM */
 /* exchanges the field  l */
 void xchange_field(spinor* const l, const int ieo) {
 
diff --git a/src/lib/xchange/xchange_halffield.c b/src/lib/xchange/xchange_halffield.c
index d3d19794a..9d7c21f29 100644
--- a/src/lib/xchange/xchange_halffield.c
+++ b/src/lib/xchange/xchange_halffield.c
@@ -51,15 +51,15 @@ MPI_Request prequests[16];
 void init_xchange_halffield() {
 #ifdef TM_USE_MPI
 
-/* #ifdef TM_PARALLELT */
-/*   int reqcount = 4; */
-/* #elif defined TM_PARALLELXT */
-/*   int reqcount = 8; */
-/* #elif defined TM_PARALLELXYT */
-/*   int reqcount = 12; */
-/* #elif defined TM_PARALLELXYZT */
-/*   int reqcount = 16; */
-/* #endif */
+  /* #ifdef TM_PARALLELT */
+  /*   int reqcount = 4; */
+  /* #elif defined TM_PARALLELXT */
+  /*   int reqcount = 8; */
+  /* #elif defined TM_PARALLELXYT */
+  /*   int reqcount = 12; */
+  /* #elif defined TM_PARALLELXYZT */
+  /*   int reqcount = 16; */
+  /* #endif */
 
   /* send the data to the neighbour on the right in t direction */
   /* recieve the data from the neighbour on the left in t direction */

From 65725e2bb0b626a2be3a0cde9c136bdf843c08c1 Mon Sep 17 00:00:00 2001
From: Taillefumier Mathieu <29380261+mtaillefumier@users.noreply.github.com>
Date: Fri, 27 Mar 2026 09:13:55 +0100
Subject: [PATCH 40/80] Update src/lib/init/init_stout_smear_vars.c

Co-authored-by: chaoos <chaoos@users.noreply.github.com>
---
 src/lib/init/init_stout_smear_vars.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/lib/init/init_stout_smear_vars.c b/src/lib/init/init_stout_smear_vars.c
index 92b5889a5..5b69005b2 100644
--- a/src/lib/init/init_stout_smear_vars.c
+++ b/src/lib/init/init_stout_smear_vars.c
@@ -84,9 +84,6 @@ int init_stout_smear_vars(const int V, const int stout_no_iter) {
 
   /* int i, k, x, mu; */
 
-  /* i = 0; */
-  /* k = 0; */
-  /* mu = 0; */
 
   if (g_exposu3_no_c == 0) init_exposu3();
 

From 6970e4d5b7b2de245bc5e83bd5089a82b6d5fd40 Mon Sep 17 00:00:00 2001
From: Mathieu Taillefumier <mathieu.taillefumier@free.fr>
Date: Fri, 27 Mar 2026 09:07:02 +0100
Subject: [PATCH 41/80] Fix compilation issues

- Restored hopping.h
- Rename hmc with tmlqcd
- Fix TM_USE_DDalphaAMG when necessary
- Remove header guards in git_hash.c
- Update Spack package
---
 .../tmlqcd/daint-gh200/repo/packages/tmlqcd/package.py           | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/tmlqcd/package.py b/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/tmlqcd/package.py
index 74ea1d069..1c76b2991 100644
--- a/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/tmlqcd/package.py
+++ b/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/tmlqcd/package.py
@@ -91,7 +91,6 @@ class Tmlqcd(CMakePackage, CudaPackage, ROCmPackage):
         depends_on("quda+mpi", when="+mpi")
         depends_on("quda+cuda", when="+cuda")
         depends_on("quda+rocm", when="+rocm")
-        depends_on("quda+nvshmem", when="+shmem")
 
     depends_on("fftw-api@3", when="+fftw")
 

From 86544321e2ee411304babe72cafb556da047fedc Mon Sep 17 00:00:00 2001
From: Taillefumier Mathieu <29380261+mtaillefumier@users.noreply.github.com>
Date: Fri, 27 Mar 2026 09:18:19 +0100
Subject: [PATCH 42/80] Update
 .ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/tmlqcd/package.py

Co-authored-by: chaoos <chaoos@users.noreply.github.com>
---
 .../tmlqcd/daint-gh200/repo/packages/tmlqcd/package.py           | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/tmlqcd/package.py b/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/tmlqcd/package.py
index 1c76b2991..6cb679f4f 100644
--- a/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/tmlqcd/package.py
+++ b/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/tmlqcd/package.py
@@ -72,6 +72,7 @@ class Tmlqcd(CMakePackage, CudaPackage, ROCmPackage):
     # conflicts
     conflicts("+cuda", when="cuda_arch=none")
     conflicts("+rocm", when="amdgpu_target=none")
+    conflicts("+cuda +rocm", msg="CUDA and ROCm support are mutually exclusive")
 
     # hard dependencies
     depends_on("c-lime")

From 716e0214842f171a9ee4941039676d98cdcdb715 Mon Sep 17 00:00:00 2001
From: Mathieu Taillefumier <mathieu.taillefumier@free.fr>
Date: Fri, 27 Mar 2026 10:13:00 +0100
Subject: [PATCH 43/80] Remove comment

---
 src/lib/xchange/xchange_halffield.c | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/src/lib/xchange/xchange_halffield.c b/src/lib/xchange/xchange_halffield.c
index 9d7c21f29..8b56d6593 100644
--- a/src/lib/xchange/xchange_halffield.c
+++ b/src/lib/xchange/xchange_halffield.c
@@ -50,17 +50,6 @@ MPI_Request prequests[16];
 /* 2. */
 void init_xchange_halffield() {
 #ifdef TM_USE_MPI
-
-  /* #ifdef TM_PARALLELT */
-  /*   int reqcount = 4; */
-  /* #elif defined TM_PARALLELXT */
-  /*   int reqcount = 8; */
-  /* #elif defined TM_PARALLELXYT */
-  /*   int reqcount = 12; */
-  /* #elif defined TM_PARALLELXYZT */
-  /*   int reqcount = 16; */
-  /* #endif */
-
   /* send the data to the neighbour on the right in t direction */
   /* recieve the data from the neighbour on the left in t direction */
   MPI_Send_init((void*)(sendBuffer), LX * LY * LZ * 12 / 2, MPI_DOUBLE, g_nb_t_up, 81, g_cart_grid,

From 919d565bdcf9a08ef8333773db833de033bbe9d0 Mon Sep 17 00:00:00 2001
From: Mathieu Taillefumier <mathieu.taillefumier@free.fr>
Date: Fri, 27 Mar 2026 10:23:22 +0100
Subject: [PATCH 44/80] Removed duplicate file

---
 src/lib/operator/hopping_body_dbl.c        | 179 ---------------------
 src/lib/operator/hopping_body_dbl.inc      |   1 +
 src/lib/operator/tm_sub_Hopping_Matrix.c   |   2 +-
 src/lib/operator/tm_times_Hopping_Matrix.c |   2 +-
 4 files changed, 3 insertions(+), 181 deletions(-)
 delete mode 100644 src/lib/operator/hopping_body_dbl.c

diff --git a/src/lib/operator/hopping_body_dbl.c b/src/lib/operator/hopping_body_dbl.c
deleted file mode 100644
index 62e303d84..000000000
--- a/src/lib/operator/hopping_body_dbl.c
+++ /dev/null
@@ -1,179 +0,0 @@
-/**********************************************************************
- *
- *
- * Copyright (C) 2012 Carsten Urbach, Bartosz Kostrzewa
- *
- * This file is based on an implementation of the Dirac operator 
- * written by Martin Luescher, modified by Martin Hasenbusch in 2002 
- * and modified and extended by Carsten Urbach from 2003-2008
- *
- * This file is part of tmLQCD.
- *
- * tmLQCD is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- * 
- * tmLQCD is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- * 
- * You should have received a copy of the GNU General Public License
- * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
- *
- **********************************************************************/
-#include "hopping.h"
-
-  int ioff;
-  int * hi;
-  su3 * restrict ALIGN up;
-  su3 * restrict ALIGN um;
-  spinor * restrict ALIGN sp;
-  spinor * restrict ALIGN sm;
-  spinor * restrict ALIGN rn;
-  
-  _declare_regs();
-
-  if(ieo == 0){
-    ioff = 0;
-  } 
-  else{
-    ioff = (VOLUME+RAND)/2;
-  }
-
-#ifndef TM_USE_OMP
-  hi = &g_hi[16*ioff];
-
-#  if ((defined _GAUGE_COPY))
-  up=&g_gauge_field_copy[ioff][0];
-#  else
-  up=&g_gauge_field[(*hi)][0];
-#  endif
-  hi++;
-  sp=k+(*hi);
-  hi++;
-#endif
-
-  /**************** loop over all lattice sites ******************/
-#ifdef TM_USE_OMP
-#  pragma omp for
-#endif
-  for(int icx = ioff; icx < (VOLUME/2+ioff); icx++){
-#ifdef TM_USE_OMP
-    hi = &g_hi[16*icx];
-#  if ((defined _GAUGE_COPY))
-    up=&g_gauge_field_copy[icx][0];
-#  else
-    up=&g_gauge_field[(*hi)][0];
-#  endif
-    hi++;
-    sp=k+(*hi);
-    hi++;
-#endif
-    rn=l+(icx-ioff);
-#ifdef _TM_SUB_HOP
-    pn=p+(icx-ioff);
-#endif
-    /*********************** direction +t ************************/
-#    if (!defined _GAUGE_COPY)
-    um=&g_gauge_field[(*hi)][0]; 
-#    else
-    um=up+1;
-#    endif
-    hi++;
-    sm=k+(*hi);
-    hi+=2;
-
-    _hop_t_p();
-
-    /*********************** direction -t ************************/
-#    if ((defined _GAUGE_COPY))
-    up=um+1;
-#    else
-    up+=1;
-#    endif
-    sp=k+(*hi);
-    hi++;
-    
-    _hop_t_m();
-
-    /*********************** direction +1 ************************/
-#    ifndef _GAUGE_COPY
-    um=&g_gauge_field[(*hi)][1]; 
-#    else
-    um = up+1;
-#    endif
-    hi++;
-    sm=k+(*hi);
-    hi+=2;
-
-    _hop_x_p();
-
-    /*********************** direction -1 ************************/
-#    if ((defined _GAUGE_COPY))
-    up=um+1;
-#    else
-    up+=1;
-#    endif
-    sp=k+(*hi);
-    hi++;
-
-    _hop_x_m();
-
-    /*********************** direction +2 ************************/
-#    ifndef _GAUGE_COPY
-    um=&g_gauge_field[(*hi)][2]; 
-#    else
-    um= up+1;
-#    endif
-    hi++;
-    sm=k+(*hi);
-    hi+=2;
-
-    _hop_y_p();
-
-    /*********************** direction -2 ************************/
-#    if ((defined _GAUGE_COPY))
-    up=um+1;
-#    else
-    up+=1;
-#    endif
-    sp=k+(*hi);
-    hi++;
-
-    _hop_y_m();
-
-    /*********************** direction +3 ************************/
-#    ifndef _GAUGE_COPY
-    um=&g_gauge_field[(*hi)][3]; 
-#    else
-    um=up+1;
-#    endif
-    hi++;
-    sm=k+(*hi);
-    hi++;
-
-    _hop_z_p();
-
-    /*********************** direction -3 ************************/
-#ifndef TM_USE_OMP
-#  if ((defined _GAUGE_COPY))
-    up=um+1;
-#  else
-    up=&g_gauge_field[(*hi)][0];
-#  endif
-    hi++;
-    sp=k+(*hi);
-    hi++;
-#endif
-    _hop_z_m();
-
-#ifdef _MUL_G5_CMPLX
-    _hop_mul_g5_cmplx_and_store();
-#elif defined _TM_SUB_HOP
-    _g5_cmplx_sub_hop_and_g5store();
-#else
-    _store_res();
-#endif
-  }
diff --git a/src/lib/operator/hopping_body_dbl.inc b/src/lib/operator/hopping_body_dbl.inc
index e34469f4e..41682fd31 100644
--- a/src/lib/operator/hopping_body_dbl.inc
+++ b/src/lib/operator/hopping_body_dbl.inc
@@ -23,6 +23,7 @@
  * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
  *
  **********************************************************************/
+#include "hopping.h"
 
 #include "hopping.h"
 
diff --git a/src/lib/operator/tm_sub_Hopping_Matrix.c b/src/lib/operator/tm_sub_Hopping_Matrix.c
index a163d26e5..f999d42aa 100644
--- a/src/lib/operator/tm_sub_Hopping_Matrix.c
+++ b/src/lib/operator/tm_sub_Hopping_Matrix.c
@@ -98,7 +98,7 @@ void tm_sub_Hopping_Matrix(const int ieo, spinor* const l, spinor* p, spinor* co
 #define _TM_SUB_HOP
     spinor* pn;
 
-#include "operator/hopping_body_dbl.c"
+#include "operator/hopping_body_dbl.inc"
 
 #undef _TM_SUB_HOP
 #ifdef TM_USE_OMP
diff --git a/src/lib/operator/tm_times_Hopping_Matrix.c b/src/lib/operator/tm_times_Hopping_Matrix.c
index cbd8132b1..a83f2febb 100644
--- a/src/lib/operator/tm_times_Hopping_Matrix.c
+++ b/src/lib/operator/tm_times_Hopping_Matrix.c
@@ -96,7 +96,7 @@ void tm_times_Hopping_Matrix(const int ieo, spinor* const l, spinor* const k,
   {
 #endif
 #define _MUL_G5_CMPLX
-#include "hopping_body_dbl.c"
+#include "hopping_body_dbl.inc"
 #undef _MUL_G5_CMPLX
 #ifdef TM_USE_OMP
   } /* OpenMP closing brace */

From bf2b0487b8f0da24b150894c8ae965eb7771c0f6 Mon Sep 17 00:00:00 2001
From: Bartosz Kostrzewa <bartosz_kostrzewa@fastmail.com>
Date: Fri, 27 Mar 2026 13:27:13 +0100
Subject: [PATCH 45/80] include hopping.h in tm_[sub,times]_Hopping_Matrix.c
 and include hopping_body_dbl.inc

---
 src/lib/operator/tm_sub_Hopping_Matrix.c   | 2 ++
 src/lib/operator/tm_times_Hopping_Matrix.c | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/src/lib/operator/tm_sub_Hopping_Matrix.c b/src/lib/operator/tm_sub_Hopping_Matrix.c
index f999d42aa..ef64be0c1 100644
--- a/src/lib/operator/tm_sub_Hopping_Matrix.c
+++ b/src/lib/operator/tm_sub_Hopping_Matrix.c
@@ -79,6 +79,8 @@ void tm_sub_Hopping_Matrix(const int ieo, spinor* const l, spinor* const p, spin
 }
 
 #elif (!defined _NO_COMM && !defined TM_USE_HALFSPINOR)
+#include "hopping.h"
+
 void tm_sub_Hopping_Matrix(const int ieo, spinor* const l, spinor* p, spinor* const k,
                            complex double const cfactor) {
 #ifdef TM_USE_GAUGE_COPY
diff --git a/src/lib/operator/tm_times_Hopping_Matrix.c b/src/lib/operator/tm_times_Hopping_Matrix.c
index a83f2febb..1de2a872c 100644
--- a/src/lib/operator/tm_times_Hopping_Matrix.c
+++ b/src/lib/operator/tm_times_Hopping_Matrix.c
@@ -79,6 +79,8 @@ void tm_times_Hopping_Matrix(const int ieo, spinor* const l, spinor* const k,
 }
 
 #elif (!defined _NO_COMM && !defined TM_USE_HALFSPINOR)
+#include "hopping.h"
+
 void tm_times_Hopping_Matrix(const int ieo, spinor* const l, spinor* const k,
                              double complex const cfactor) {
 #ifdef TM_USE_GAUGE_COPY

From 215b8cd3c159dfad4f47d7cbd819a70e685a32b4 Mon Sep 17 00:00:00 2001
From: Roman Gruber <groman@beverin-ln001.cscs.ch>
Date: Mon, 30 Mar 2026 15:59:25 +0200
Subject: [PATCH 46/80] adjusted beverin pipeline

---
 .ci/build-rocm.sh                      | 49 ++++++++++++++++++++++++++
 .ci/cscs_beverin_pipeline.yml          | 31 ++++++++++++++--
 .ci/include/cscs/01-test-templates.yml | 32 ++++++-----------
 3 files changed, 89 insertions(+), 23 deletions(-)
 create mode 100644 .ci/build-rocm.sh

diff --git a/.ci/build-rocm.sh b/.ci/build-rocm.sh
new file mode 100644
index 000000000..2ce6d58c7
--- /dev/null
+++ b/.ci/build-rocm.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+
+set -xeuo pipefail
+
+echo "VARIABLE = ${VARIABLE:-}"
+
+export SPACK_SYSTEM_CONFIG_PATH=/user-environment/config
+export CICD_SRC_DIR=$PWD
+export QUDA_SRC_DIR=$PWD/deps/src/quda
+export SPACK_PYTHON=$(which python3.6) # must be <=3.12, system python is 3.6
+
+# QUDA git, branch and commit
+export QUDA_GIT_REPO="${QUDA_GIT_REPO:=https://github.com/lattice/quda.git}"
+export QUDA_GIT_BRANCH="${QUDA_GIT_BRANCH:=develop}"
+export QUDA_GIT_COMMIT="${QUDA_GIT_COMMIT:=$(git ls-remote ${QUDA_GIT_REPO} refs/heads/${QUDA_GIT_BRANCH} | awk '{print $1}')}"
+
+# make sure we keep the stage directory
+spack config --scope=user add config:build_stage:/dev/shm/spack-stage
+# we might need to install dependencies too, e.g. nlcglib in case of API changes
+spack config --scope=user add config:install_tree:root:/dev/shm/spack-stage
+
+spack env create -d ./spack-env
+
+# add local repository with current tmlqcd recipe
+spack -e ./spack-env repo add $REPO
+
+spack -e ./spack-env config add "packages:all:variants:[amdgpu_target=${ROCM_ARCH},amdgpu_target_sram_ecc=${ROCM_ARCH},+rocm]"
+
+spack -e ./spack-env add $SPEC
+
+# for tmlqcd use local src instead of fetch git
+spack -e ./spack-env develop -p ${CICD_SRC_DIR} tmlqcd@cicd
+
+# for quda use local src instead of fetching from git, to be able to test against
+# a different repo, branch or commit, and to cope with the quda develop branch
+# being a moving target
+spack -e ./spack-env develop -p ${QUDA_SRC_DIR} quda@cicd
+
+# display spack.yaml
+cat ./spack-env/spack.yaml
+
+spack -e ./spack-env concretize
+spack -e ./spack-env install
+
+# ask spack for the build directory of tmlqcd
+builddir=$(spack -e ./spack-env location -b tmlqcd)
+
+# archive the spack build directory (builddir.tar is kept as a CI artifact)
+tar -cf builddir.tar $builddir
diff --git a/.ci/cscs_beverin_pipeline.yml b/.ci/cscs_beverin_pipeline.yml
index e06b39867..efa5398ac 100644
--- a/.ci/cscs_beverin_pipeline.yml
+++ b/.ci/cscs_beverin_pipeline.yml
@@ -3,16 +3,43 @@ include:
   - local: '/.ci/include/cscs/00-variables.yml'
   - local: '/.ci/include/cscs/01-test-templates.yml'
 
+
 stages:
+  - prepare
   - build
   - test
 
-build-quda/uenv/beverin-mi300:
-  stage: build
+
+build-base/uenv/beverin-mi300:
+  stage: prepare
   extends: [.uenv-builder-beverin-mi300, .beverin-mi300-secrets]
   variables:
     UENV_RECIPE: .ci/uenv-recipes/tmlqcd/beverin-mi300
     SLURM_TIMELIMIT: "08:00:00"
+
+
+build-tmlqcd/uenv/beverin-mi300:
+  stage: build
+  extends: [.uenv-runner-beverin-mi300, .beverin-mi300-secrets]
+  needs: ["build-base/uenv/beverin-mi300"]
+  image: $UENV_NAME/$UENV_VERSION:$CI_PIPELINE_ID
+  artifacts:
+    paths:
+      - builddir.tar
+  variables:
+    SPEC: "tmlqcd@cicd ~openmp +lemon +quda ^quda@cicd +qdp +multigrid +twisted_clover +twisted_mass"
+    ROCM_ARCH: gfx942
+    REPO: "./.ci/uenv-recipes/tmlqcd/beverin-mi300/repo/"
+  script:
+    - echo "YAML VARIABLE = $VARIABLE"
+    - |
+      # use the spack repo/commit recorded in the uenv metadata; build with a tmpfs over $HOME
+      git clone --filter=tree:0 $(jq -r .spack.repo /user-environment/meta/configure.json) /dev/shm/spack-clone
+      git -C /dev/shm/spack-clone checkout $(jq -r .spack.commit /user-environment/meta/configure.json)
+      source /dev/shm/spack-clone/share/spack/setup-env.sh
+      bwrap --dev-bind / / --tmpfs ~ -- bash ./.ci/build-rocm.sh
+
+
 test/beverin-mi300:
   extends: [.uenv-runner-beverin-mi300, .test/hmc, .beverin-mi300-vars, .beverin-mi300-secrets]
   variables:
diff --git a/.ci/include/cscs/01-test-templates.yml b/.ci/include/cscs/01-test-templates.yml
index 45e49b0fa..7c851ca8c 100644
--- a/.ci/include/cscs/01-test-templates.yml
+++ b/.ci/include/cscs/01-test-templates.yml
@@ -8,36 +8,26 @@ include:
   variables:
     WITH_UENV_VIEW: "default"
   before_script:
+    - echo "before VARIABLE = $VARIABLE"
     - |
       if test "${SLURM_PROCID}" -eq "0"; then
-        export CC="$(which mpicc)"
-        export CXX="$(which mpicxx)"
-        mkdir -p install_dir
-        autoconf
-        ./configure \
-          --enable-quda_experimental \
-          --enable-mpi \
-          --enable-omp \
-          --with-mpidimension=4 \
-          --disable-sse2 \
-          --disable-sse3 \
-          --enable-alignment=32 \
-          --with-qudadir="/user-environment/env/default" \
-          --with-limedir="/user-environment/env/default" \
-          --with-lemondir="/user-environment/env/default" \
-          --with-lapack="-lopenblas -L/user-environment/env/default/lib" \
-          --with-cudadir="/user-environment/env/default/lib64" \
-          --prefix="$(pwd)/install_dir"
-        make
-        make install
+        ls -la
+        tar xf ./builddir.tar -C /
         touch preparation-done-${CI_JOB_ID}
       fi
-    - while test ! -f preparation-done-${CI_JOB_ID}; do sleep 5; done
+    - |
+      while test ! -f preparation-done-${CI_JOB_ID}; do sleep 5; done
+      stagedir=$(find /dev/shm/spack-stage -type d -name 'spack-stage-tmlqcd-*')
+      echo "stagedir: $stagedir"
 
 
 .test/hmc:
   extends: .test/base
   script:
+    - echo "script VARIABLE = $VARIABLE"
+    - ls -la
+    - which numdiff
+    - which hmc_tm
     - ./install_dir/bin/hmc_tm -f "${INPUT_FILE}"
     - |
       if test "${SLURM_PROCID}" -eq "0"; then

From e99c3785248ec5d9eb24fd531b7ddef78a3e05c6 Mon Sep 17 00:00:00 2001
From: Roman Gruber <groman@beverin-ln001.cscs.ch>
Date: Mon, 30 Mar 2026 16:00:09 +0200
Subject: [PATCH 47/80] fixed tmlqcd environment

---
 .../tmlqcd/beverin-mi300/compilers.yaml       |  4 +++-
 .../tmlqcd/beverin-mi300/config.yaml          |  7 +++---
 .../tmlqcd/beverin-mi300/environments.yaml    | 22 +++++++++++--------
 .../tmlqcd/beverin-mi300/post-install         |  8 +++++++
 4 files changed, 27 insertions(+), 14 deletions(-)
 mode change 100644 => 100755 .ci/uenv-recipes/tmlqcd/beverin-mi300/environments.yaml
 create mode 100755 .ci/uenv-recipes/tmlqcd/beverin-mi300/post-install

diff --git a/.ci/uenv-recipes/tmlqcd/beverin-mi300/compilers.yaml b/.ci/uenv-recipes/tmlqcd/beverin-mi300/compilers.yaml
index 840d9974d..38a8faa0b 100644
--- a/.ci/uenv-recipes/tmlqcd/beverin-mi300/compilers.yaml
+++ b/.ci/uenv-recipes/tmlqcd/beverin-mi300/compilers.yaml
@@ -1,2 +1,4 @@
 gcc:
-  version: "14.2"
+  version: "13"
+llvm-amdgpu:
+  version: "6.3.3"
\ No newline at end of file
diff --git a/.ci/uenv-recipes/tmlqcd/beverin-mi300/config.yaml b/.ci/uenv-recipes/tmlqcd/beverin-mi300/config.yaml
index 3ec694351..7fbeacb99 100644
--- a/.ci/uenv-recipes/tmlqcd/beverin-mi300/config.yaml
+++ b/.ci/uenv-recipes/tmlqcd/beverin-mi300/config.yaml
@@ -2,10 +2,9 @@ name: tmlqcd
 store: /user-environment
 spack:
   repo: https://github.com/spack/spack.git
-  commit: releases/v1.0
+  commit: releases/v1.1
   packages:
     repo: https://github.com/spack/spack-packages.git
-    commit: releases/v2025.11
-modules: true
-description: "tmLQCD is a freely available software suite providing a set of tools to be used in lattice QCD simulations."
+    commit: 5f20b9190596e0b875141e8cee03f0d3847ad65c
+description: "tmLQCD dependencies for CSCS CI."
 version: 2
diff --git a/.ci/uenv-recipes/tmlqcd/beverin-mi300/environments.yaml b/.ci/uenv-recipes/tmlqcd/beverin-mi300/environments.yaml
old mode 100644
new mode 100755
index 07307771a..d0799a646
--- a/.ci/uenv-recipes/tmlqcd/beverin-mi300/environments.yaml
+++ b/.ci/uenv-recipes/tmlqcd/beverin-mi300/environments.yaml
@@ -1,24 +1,28 @@
 gcc-env:
-  compiler: [gcc]
+  compiler: [gcc, llvm-amdgpu]
   network:
-      mpi: cray-mpich@8.1.32 +rocm
+      mpi: cray-mpich@8.1.32+rocm
       specs: [ 'libfabric@2.3 +rocm' ]
-  unify: true
+  unify: when_possible
+  duplicates:
+    strategy: full
   specs:
-  - python@3.12
-  - numdiff
-  - quda@develop +qdp +multigrid +twisted_clover +twisted_mass
-  - lemonio
-  - c-lime
-  - openblas
+  # add GPU-specific packages here
   - hip@6.3.3 ^mesa@23.3.6
+  - llvm-amdgpu
+  # tmlqcd and quda are not required, since we build their newest commits in the
+  # next step. Although, we want all dependencies
+  - tmlqcd +lemon +quda
+  - numdiff
   variants:
   - +mpi
   - +rocm
   - amdgpu_target=gfx942
+  - amdgpu_target_sram_ecc=gfx942
   views:
     default:
       link: roots
+      exclude: ["llvm"]
       uenv:
         add_compilers: true
         prefix_paths:
diff --git a/.ci/uenv-recipes/tmlqcd/beverin-mi300/post-install b/.ci/uenv-recipes/tmlqcd/beverin-mi300/post-install
new file mode 100755
index 000000000..a5e932cc5
--- /dev/null
+++ b/.ci/uenv-recipes/tmlqcd/beverin-mi300/post-install
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+set -e
+
+# remove offending environment variables
+jq '.views["default"].env.values.scalar["HIPCC_LINK_FLAGS_APPEND"]|="" | .views["default"].env.values.scalar["HIPCC_COMPILE_FLAGS_APPEND"]|="" ' /user-environment/meta/env.json > /tmp/env.json
+# copy file back to destination
+cp /tmp/env.json /user-environment/meta/env.json
\ No newline at end of file

From 8fe54981bbb0b61ca7732070955b21b81e4e5218 Mon Sep 17 00:00:00 2001
From: Roman Gruber <groman@beverin-ln001.cscs.ch>
Date: Mon, 30 Mar 2026 16:00:35 +0200
Subject: [PATCH 48/80] lemonio is now a CmakePackage in spack

---
 .ci/spack_packages/lemonio/package.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)
 mode change 100755 => 100644 .ci/spack_packages/lemonio/package.py

diff --git a/.ci/spack_packages/lemonio/package.py b/.ci/spack_packages/lemonio/package.py
old mode 100755
new mode 100644
index d70cac492..a3e66f64e
--- a/.ci/spack_packages/lemonio/package.py
+++ b/.ci/spack_packages/lemonio/package.py
@@ -2,12 +2,13 @@
 #
 # SPDX-License-Identifier: (Apache-2.0 OR MIT)
 
-from spack_repo.builtin.build_systems.autotools import AutotoolsPackage
+from spack_repo.builtin.build_systems import cmake
+from spack_repo.builtin.build_systems.cmake import CMakePackage, generator
 
 
 from spack.package import *
 
-class Lemonio(AutotoolsPackage):
+class Lemonio(CMakePackage):
     """LEMON: Lightweight Parallel I/O library for Lattice QCD."""
 
     homepage = "https://github.com/etmc/lemon"
@@ -16,13 +17,13 @@ class Lemonio(AutotoolsPackage):
 
     version('master', branch='master')
 
-    depends_on("autoconf", type="build", when="@master build_system=autotools")
-    depends_on("automake", type="build", when="@master build_system=autotools")
-    depends_on("libtool", type="build", when="@master build_system=autotools")
+    depends_on("c", type="build")
+    depends_on("cxx", type="build")
+    depends_on("fortran", type="build")
 
     depends_on('mpi')
+    generator("ninja")
 
     def configure_args(self):
         args = []
-        args.append('CC={0}'.format(self.spec['mpi'].mpicc))
         return args

From 13ec95d7a0deed0a3c7fb077964f4d900e8ce0e5 Mon Sep 17 00:00:00 2001
From: Roman Gruber <groman@beverin-ln001.cscs.ch>
Date: Mon, 30 Mar 2026 16:01:07 +0200
Subject: [PATCH 49/80] tmlqcd spack package

---
 .ci/spack_packages/tmlqcd/package.py | 119 +++++++++++++++++++++++++++
 1 file changed, 119 insertions(+)
 create mode 100755 .ci/spack_packages/tmlqcd/package.py

diff --git a/.ci/spack_packages/tmlqcd/package.py b/.ci/spack_packages/tmlqcd/package.py
new file mode 100755
index 000000000..78a94340b
--- /dev/null
+++ b/.ci/spack_packages/tmlqcd/package.py
@@ -0,0 +1,119 @@
+# Copyright Spack Project Developers. See COPYRIGHT file for details.
+#
+# SPDX-License-Identifier: (Apache-2.0 OR MIT)
+
+from spack_repo.builtin.build_systems import cmake
+from spack_repo.builtin.build_systems.cmake import CMakePackage, generator
+from spack_repo.builtin.build_systems.rocm import ROCmPackage
+from spack_repo.builtin.build_systems.cuda import CudaPackage
+
+from spack.package import *
+
+
+class Tmlqcd(CMakePackage, CudaPackage, ROCmPackage):
+    """Base class for building tmlQCD."""
+
+    homepage = "https://www.itkp.uni-bonn.de/~urbach/software.html"
+    url = "https://github.com/etmc/tmLQCD/archive/refs/tags/rel-5-1-6.tar.gz"
+    git = "https://github.com/etmc/tmLQCD.git"
+    license("GPL-3.0-or-later")
+
+    maintainers("mtaillefumier")
+    version("master", branch="master")
+
+    variant("lemon", default=False, description="Enable the lemon backend")
+    variant("mpi", default=True, description="Enable mpi support")
+    variant("DDalphaAMG", default=False, description="Enable DAlphaAMG support")
+    variant("openmp", default=True, description="Enable OpenMP")
+    variant("fftw", default=True, description="Enable FFTW interface")
+    variant(
+        "persistent_mpi",
+        default=True,
+        description="Enable persistent mpi calls for spinor and gauge fields",
+        when="+mpi",
+    )
+    variant(
+        "nonblocking_mpi",
+        default=True,
+        description="Enable non-blocking mpi calls for spinor and gauge fields",
+        when="+mpi",
+    )
+    variant("fixedvolume", default=True, description="Enable fixed volume at compile time")
+    variant(
+        "alignment",
+        default="auto",
+        values=("none", "auto", "16", "32", "64"),
+        description="Automatically or expliclty align arrays",
+    )
+    variant("gauge_copy", default=True, description="Enable gauge field copy")
+    variant("half_spinor", default=True, description="Use a Dirac operator with half-spinor")
+    variant("shared", default=False, description="Enable shared library")
+    variant("shmem", default=False, description="Use shmem API")
+    variant("quda", default=True, description="Enable the QUDA library", when="+cuda")
+    variant("quda", default=True, description="Enable the QUDA library", when="+rocm")
+    variant(
+        "QPhiX", default=False, description="Enable the QPhiX library for Intel Xeon and Xeon Phis"
+    )
+    variant(
+        "mpi_dimensions",
+        default="4",
+        values=("1", "2", "3", "4", "x", "xy", "xyz"),
+        description="number of dimensions the mpi processes are distributed. the default is parallelization over all four dimensions txyz",
+        when="+mpi",
+    )
+
+    generator("ninja")
+
+    # language dependencies
+    depends_on("c", type="build")
+    depends_on("cxx", type="build")
+    depends_on("fortran", type="build")
+
+    # conflicts
+    conflicts("+cuda", when="cuda_arch=none")
+    conflicts("+rocm", when="amdgpu_target=none")
+    conflicts("+cuda +rocm", msg="CUDA and ROCm support are mutually exclusive")
+
+    # hard dependencies
+    depends_on("c-lime")
+    depends_on("blas")
+    depends_on("lapack")
+    depends_on("pkgconfig", type="build")
+
+    # dependencies
+    depends_on("mpi", when="+mpi")
+    depends_on("lemonio", when="+lemon")
+
+    with when("+quda"):
+        depends_on(
+            "quda+shared+twisted_mass+twisted_clover+clover+ndeg_twisted_clover+ndeg_twisted_mass+wilson+qdp+multigrid"
+        )
+
+        depends_on("quda+mpi", when="+mpi")
+        depends_on("quda+cuda", when="+cuda")
+        depends_on("quda+rocm", when="+rocm")
+
+    depends_on("fftw-api@3", when="+fftw")
+
+
+class CMakeBuilder(cmake.CMakeBuilder):
+    """Translate the Tmlqcd variants into CMake command-line definitions."""
+    def cmake_args(self):
+        args = [
+            self.define_from_variant("BUILD_SHARED_LIBS", "shared"),
+            self.define_from_variant("TM_USE_LEMON", "lemon"),
+            self.define_from_variant("TM_USE_MPI", "mpi"),
+            self.define_from_variant("TM_USE_QUDA", "quda"),
+            self.define_from_variant("TM_USE_CUDA", "cuda"),
+            self.define_from_variant("TM_USE_HIP", "rocm"),
+            self.define_from_variant("TM_USE_FFTW", "fftw"),
+            self.define_from_variant("TM_USE_OMP", "openmp"),
+            self.define_from_variant("TM_USE_SHMEM", "shmem"),
+            self.define_from_variant("TM_USE_GAUGE_COPY", "gauge_copy"),
+            self.define_from_variant("TM_USE_HALFSPINOR", "half_spinor"),
+        ]
+        # hip is only pulled in (via ROCmPackage) for +rocm; spec["hip"] raises KeyError otherwise.
+        if self.spec.satisfies("+rocm"):
+            hipcc = self.spec["hip"].hipcc
+            args += ["-DCMAKE_{0}_COMPILER={1}".format(lang, hipcc) for lang in ("C", "CXX")]
+        return args

From c9d6609459ca63f10e2df4ce91eef921e6aa6afb Mon Sep 17 00:00:00 2001
From: Roman Gruber <groman@beverin-ln001.cscs.ch>
Date: Mon, 30 Mar 2026 16:01:26 +0200
Subject: [PATCH 50/80] added quda spack package

---
 .ci/spack_packages/quda/package.py | 200 +++++++++++++++++++++++++++++
 1 file changed, 200 insertions(+)
 create mode 100644 .ci/spack_packages/quda/package.py

diff --git a/.ci/spack_packages/quda/package.py b/.ci/spack_packages/quda/package.py
new file mode 100644
index 000000000..c554fabbb
--- /dev/null
+++ b/.ci/spack_packages/quda/package.py
@@ -0,0 +1,200 @@
+# Copyright Spack Project Developers. See COPYRIGHT file for details.
+#
+# SPDX-License-Identifier: (Apache-2.0 OR MIT)
+
+from spack_repo.builtin.build_systems.cmake import CMakePackage, generator
+from spack_repo.builtin.build_systems.cuda import CudaPackage
+from spack_repo.builtin.build_systems.rocm import ROCmPackage
+
+from spack.package import *
+
+
+class Quda(CMakePackage, CudaPackage, ROCmPackage):
+    """QUDA is a library for performing calculations in lattice QCD on GPUs."""
+
+    homepage = "https://lattice.github.io/quda/"
+    url = "https://github.com/lattice/quda/archive/refs/tags/v1.1.0.tar.gz"
+    git = "https://github.com/lattice/quda.git"
+
+    tags = ["hep", "lattice"]
+
+    maintainers("chaoos", "mtaillefumier")
+
+    license("MIT OR BSD-3-Clause", checked_by="chaoos")
+
+    version("develop", branch="develop")
+
+    # git describe --tags --match 'v*' 18bf43ed40c75ae276e55bb8ddf2f64aa5510c37
+    version(
+        "1.1.0-4597-g18bf43ed4", preferred=True, commit="18bf43ed40c75ae276e55bb8ddf2f64aa5510c37"
+    )
+
+    version("1.1.0", sha256="b4f635c993275010780ea09d8e593e0713a6ca1af1db6cc86c64518714fcc745")
+
+    # build dependencies
+    generator("ninja")
+    depends_on("cmake@3.18:", type="build")
+    depends_on("ninja", type="build")
+    depends_on("c", type="build")
+    depends_on("cxx", type="build")
+    depends_on("fortran", type="build", when="+tifr")
+    depends_on("fortran", type="build", when="+bqcd")
+
+    variant("shared", default=True, description="Build shared libraries")
+    variant(
+        "backwards", default=False, description="Enable stacktrace generation using backwards-cpp"
+    )
+    variant("mpi", default=False, description="Enable MPI support")
+    variant("qmp", default=False, description="Enable QMP")
+    variant("qio", default=False, description="Enable QIO", when="+qmp")
+    variant("openqcd", default=False, description="Enable openQCD interface")
+    variant("milc", default=False, description="Enable MILC interface")
+    variant("qdp", default=False, description="Enable QDP interface")
+    variant("bqcd", default=False, description="Enable BQCD interface")
+    variant("cps", default=False, description="Enable CPS interface")
+    variant("qdpjit", default=False, description="Enable QDPJIT interface")
+    variant("tifr", default=False, description="Enable TIFR interface")
+    variant("multigrid", default=False, description="Enable multigrid")
+    variant("nvshmem", default=False, description="Enable NVSHMEM", when="+cuda")
+    variant("openmp", default=False, description="Enable openmp support")
+    variant("clover", default=False, description="Build clover Dirac operators")
+    variant(
+        "clover_hasenbusch", default=False, description="Build clover Hasenbusch twist operators"
+    )
+    variant("domain_wall", default=False, description="Build domain wall Dirac operators")
+    variant("laplace", default=False, description="Build laplace operator")
+    variant(
+        "ndeg_twisted_clover",
+        default=False,
+        description="Build non-degenerate twisted clover Dirac operators",
+    )
+    variant(
+        "ndeg_twisted_mass",
+        default=False,
+        description="Build non-degenerate twisted mass Dirac operators",
+    )
+    variant("staggered", default=False, description="Build staggered Dirac operators")
+    variant("twisted_clover", default=False, description="Build twisted clover Dirac operators")
+    variant("twisted_mass", default=False, description="Build twisted mass Dirac operators")
+    variant("wilson", default=True, description="Build Wilson Dirac operators")
+    variant("usqcd", default=False, description="Download and build usqcd", when="+qmp")
+    variant("eigen", default=True, description="Enable eigen support")
+
+    with when("+multigrid"):
+        variant(
+            "mg_mrhs_list",
+            default="16",
+            multi=True,
+            description="The list of multi-rhs sizes that get compiled",
+        )
+        variant(
+            "mg_nvec_list",
+            default="6,24,32",
+            multi=True,
+            description="The list of null space vector sizes that get compiled",
+        )
+
+    # dependencies
+    depends_on("mpi", when="+mpi")
+    depends_on("cuda", when="+cuda")
+    depends_on("nvshmem", when="+nvshmem")
+    depends_on("gdrcopy", when="+nvshmem")
+
+    with when("+rocm"):
+        depends_on("hip")
+        depends_on("hipblas")
+        depends_on("hipfft")
+        depends_on("hiprand")
+        depends_on("hipcub")
+
+    conflicts("+qmp +mpi", msg="Specifying both QMP and MPI might result in undefined behavior")
+    conflicts("+cuda +rocm", msg="CUDA and ROCm support are mutually exclusive")
+    conflicts("~cuda ~rocm", msg="Either CUDA or ROCm support is required")
+    conflicts("cuda_arch=none", when="+cuda", msg="Please indicate a cuda_arch value")
+    conflicts("amdgpu_target=none", when="+rocm", msg="Please indicate a amdgpu_target value")
+    conflicts(
+        "+nvshmem", when="~mpi ~qmp", msg="NVSHMEM requires either +mpi or +qmp to be enabled"
+    )
+
+    # CMAKE_BUILD_TYPE
+    variant(
+        "build_type",
+        default="STRICT",
+        description="The build type to build",
+        values=("STRICT", "RELEASE", "DEVEL", "DEBUG", "HOSTDEBUG", "SANITIZE"),
+    )
+
+    def cmake_args(self):
+        """Return the CMake cache definitions derived from the concretized spec."""
+        if self.spec.satisfies("+cuda"):
+            target = "CUDA"
+            cuda_archs = self.spec.variants["cuda_arch"].value
+            arch = " ".join(f"sm_{i}" for i in cuda_archs)
+        elif self.spec.satisfies("+rocm"):
+            target = "HIP"
+            arch = self.spec.variants["amdgpu_target"].value
+
+        args = [
+            self.define("QUDA_BUILD_ALL_TESTS", False),
+            self.define("QUDA_TARGET_TYPE", target),
+            self.define("QUDA_GPU_ARCH", arch),
+            self.define("QUDA_PRECISION", 14),
+            self.define("QUDA_RECONSTRUCT", 7),
+            self.define_from_variant("QUDA_DOWNLOAD_USQCD", "usqcd"),
+            self.define("QUDA_DIRAC_DEFAULT_OFF", True),
+            self.define_from_variant("QUDA_DIRAC_CLOVER", "clover"),
+            self.define_from_variant("QUDA_DIRAC_CLOVER_HASENBUSCH", "clover_hasenbusch"),
+            self.define_from_variant("QUDA_DIRAC_DOMAIN_WALL", "domain_wall"),
+            self.define_from_variant("QUDA_DIRAC_LAPLACE", "laplace"),
+            self.define_from_variant("QUDA_DIRAC_NDEG_TWISTED_CLOVER", "ndeg_twisted_clover"),
+            self.define_from_variant("QUDA_DIRAC_NDEG_TWISTED_MASS", "ndeg_twisted_mass"),
+            self.define_from_variant("QUDA_DIRAC_STAGGERED", "staggered"),
+            self.define_from_variant("QUDA_DIRAC_TWISTED_CLOVER", "twisted_clover"),
+            self.define_from_variant("QUDA_DIRAC_TWISTED_MASS", "twisted_mass"),
+            self.define_from_variant("QUDA_DIRAC_WILSON", "wilson"),
+            self.define_from_variant("QUDA_MPI", "mpi"),
+            self.define_from_variant("QUDA_QMP", "qmp"),
+            self.define_from_variant("QUDA_QIO", "qio"),
+            self.define_from_variant("QUDA_INTERFACE_OPENQCD", "openqcd"),
+            self.define_from_variant("QUDA_INTERFACE_MILC", "milc"),
+            self.define_from_variant("QUDA_INTERFACE_QDP", "qdp"),
+            self.define_from_variant("QUDA_INTERFACE_BQCD", "bqcd"),
+            self.define_from_variant("QUDA_INTERFACE_CPS", "cps"),
+            self.define_from_variant("QUDA_INTERFACE_QDPJIT", "qdpjit"),
+            self.define_from_variant("QUDA_INTERFACE_TIFR", "tifr"),
+            self.define_from_variant("QUDA_MULTIGRID", "multigrid"),
+            self.define_from_variant("QUDA_NVSHMEM", "nvshmem"),
+            self.define_from_variant("QUDA_OPENMP", "openmp"),
+            self.define_from_variant("QUDA_BACKWARDS", "backwards"),
+            self.define_from_variant("QUDA_USE_EIGEN", "eigen"),
+            self.define_from_variant("QUDA_BUILD_SHAREDLIB", "shared"),
+            self.define_from_variant("BUILD_SHARED_LIBS", "shared"),
+        ]
+        if self.spec.satisfies("+multigrid"):
+            args.append(
+                self.define(
+                    "QUDA_MULTIGRID_NVEC_LIST", ",".join(self.spec.variants["mg_nvec_list"].value)
+                )
+            )
+            args.append(
+                self.define(
+                    "QUDA_MULTIGRID_MRHS_LIST", ",".join(self.spec.variants["mg_mrhs_list"].value)
+                )
+            )
+
+        if self.spec.satisfies("+nvshmem"):
+            args.append(self.define("QUDA_NVSHMEM_HOME", self.spec["nvshmem"].prefix))
+            args.append(self.define("QUDA_GDRCOPY_HOME", self.spec["gdrcopy"].prefix))
+
+        if self.spec.satisfies("+cuda"):
+            args.append(self.define("QUDA_GPU_ARCH_SUFFIX", "real"))  # real or virtual
+        elif self.spec.satisfies("+rocm"):
+            args.append(self.define("CMAKE_C_COMPILER", self.spec["hip"].hipcc))
+            args.append(self.define("CMAKE_CXX_COMPILER", self.spec["hip"].hipcc))
+
+            # required when building on a machine with no AMD GPU present
+            args.append(self.define("AMDGPU_TARGETS", arch))
+
+            # suppress _GLIBCXX17_DEPRECATED warnings when compiling c++17
+            args.append(self.define("CMAKE_CXX_FLAGS", "-Wno-deprecated-declarations"))
+        return args

From 0dce3daaf9377e14ef5b2690bce27bf446bcd420 Mon Sep 17 00:00:00 2001
From: Roman Gruber <groman@beverin-ln001.cscs.ch>
Date: Mon, 30 Mar 2026 16:02:23 +0200
Subject: [PATCH 51/80] add beverin sample input file

---
 .ci/cscs_beverin_pipeline.yml                 |   2 +-
 .../sample-hmc-quda-cscs-beverin.input        | 274 ++++++++++++++++++
 2 files changed, 275 insertions(+), 1 deletion(-)
 create mode 100644 doc/sample-input/sample-hmc-quda-cscs-beverin.input

diff --git a/.ci/cscs_beverin_pipeline.yml b/.ci/cscs_beverin_pipeline.yml
index efa5398ac..17105519f 100644
--- a/.ci/cscs_beverin_pipeline.yml
+++ b/.ci/cscs_beverin_pipeline.yml
@@ -43,7 +43,7 @@ build-tmlqcd/uenv/beverin-mi300:
 test/beverin-mi300:
   extends: [.uenv-runner-beverin-mi300, .test/hmc, .beverin-mi300-vars, .beverin-mi300-secrets]
   variables:
-    INPUT_FILE: "doc/sample-input/sample-hmc-quda-cscs.input"
+    INPUT_FILE: "doc/sample-input/sample-hmc-quda-cscs-beverin.input"
     REFPATH: "doc/sample-output/hmc-quda-cscs"
     QUDA_ENABLE_TUNING: 0 # disable tuning
     QUDA_ENABLE_GDR: 1 # enable GPU-Direct RDMA
diff --git a/doc/sample-input/sample-hmc-quda-cscs-beverin.input b/doc/sample-input/sample-hmc-quda-cscs-beverin.input
new file mode 100644
index 000000000..e98f2a924
--- /dev/null
+++ b/doc/sample-input/sample-hmc-quda-cscs-beverin.input
@@ -0,0 +1,274 @@
+L=24
+T=48
+
+NrXProcs = 1
+NrYProcs = 1
+NrZprocs = 1
+
+ompnumthreads=32
+
+BarrierMonomialsConverge = yes
+
+Measurements = 20
+
+thermalisationsweeps = 0
+
+seed=146555
+
+Startcondition = hot
+InitialStoreCounter = 0
+
+2KappaMu = 0.0023801411000
+2KappaMuBar = 0.03875149727400
+2KappaEpsBar = 0.04103923289600
+CSW = 1.7112
+kappa =    0.1400083
+NSave = 10
+ThetaT = 1.0
+UseEvenOdd = yes
+userelativeprecision=yes
+
+ReversibilityCheck = no
+ReversibilityCheckIntervall = 10
+
+DebugLevel = 2
+
+ReproduceRandomNumbers = yes
+RanluxdLevel = 2
+
+BeginMeasurement CORRELATORS
+  Frequency = 2
+EndMeasurement
+
+BeginExternalInverter QUDA
+  Pipeline = 0
+  gcrNkrylov = 24
+  MGNumberOfLevels = 3
+  MGNumberOfVectors = 24, 32
+  MGSetupSolver = cg
+  MGSetup2KappaMu = 0.0023801411000
+  MGVerbosity = silent, silent, silent
+  MGSetupSolverTolerance = 5e-7, 5e-7
+  MGSetupMaxSolverIterations = 1500, 1500
+  MGCoarseSolverType = gcr, gcr, cagcr
+  MGSmootherType = cagcr, cagcr, cagcr
+  MGBlockSizesX = 3,2
+  MGBlockSizesY = 3,2
+  MGBlockSizesZ = 4,3
+  MGBlockSizesT = 3,2
+  MGResetSetupMDUThreshold = 1.0
+  MGRefreshSetupMDUThreshold = 0.0149
+  MGRefreshSetupMaxSolverIterations = 40, 40
+ 
+  MGCoarseMuFactor = 1.4, 2.4, 110.0
+  MGCoarseMaxSolverIterations = 45, 40, 10
+  MGCoarseSolverTolerance = 0.1, 0.35, 0.45
+  MGSmootherPostIterations = 2, 3, 6
+  MGSmootherPreIterations = 2, 0, 1
+  MGSmootherTolerance = 0.1, 0.1, 0.2
+  MGOverUnderRelaxationFactor = 0.90, 0.85, 1.00  
+EndExternalInverter
+
+BeginMonomial GAUGE
+  Type = Iwasaki
+  beta = 1.745
+  Timescale = 0
+  UseExternalLibrary = quda
+EndMonomial
+
+BeginMonomial CLOVERDET
+  Timescale = 1
+  kappa =    0.1400083
+  2KappaMu = 0.0023801411000
+  CSW = 1.7112
+  rho = 0.45
+  MaxSolverIterations = 5000
+  AcceptancePrecision =  1.e-23
+  ForcePrecision = 1.e-19
+  Name = cloverdetlight
+  solver= cg
+  UseExternalInverter = quda
+  UseSloppyPrecision = half
+  UseExternalLibrary = quda
+EndMonomial
+
+BeginMonomial CLOVERDETRATIO
+  Timescale = 1
+  kappa =    0.1400083
+  2KappaMu = 0.0023801411000
+  rho = 0.030
+  rho2 = 0.45
+  CSW = 1.7112
+  MaxSolverIterations = 1000
+  AcceptancePrecision =  1.e-23
+  ForcePrecision = 1.e-19
+  Name = cloverdetratio1light
+
+  solver= cg
+  UseExternalInverter = quda
+  UseSloppyPrecision = half
+  UseExternalLibrary = quda
+EndMonomial
+
+BeginMonomial CLOVERDETRATIO
+  Timescale = 2
+  kappa =    0.1400083
+  2KappaMu = 0.0023801411000
+  rho = 0.0030
+  # rho2 = 0.045
+  rho2 = 0.030
+  CSW = 1.7112
+  MaxSolverIterations = 1000
+  AcceptancePrecision =  1.e-23
+  ForcePrecision = 1.e-20
+  Name = cloverdetratio2light
+  solver = mg
+  UseExternalInverter = quda
+  UseSloppyPrecision = single
+
+  HB_solver = cg
+  HB_usesloppyprecision = half
+  HB_UseExternalInverter = quda
+  HB_MaxSolverIterations = 3000
+  UseExternalLibrary = quda
+EndMonomial
+
+
+BeginMonomial CLOVERDETRATIO
+  Timescale = 3
+  kappa =    0.1400083
+  2KappaMu = 0.0023801411000
+  rho = 0.0
+  rho2 = 0.0030
+  CSW = 1.7112
+  MaxSolverIterations = 1000
+  AcceptancePrecision =  1.e-23
+  ForcePrecision = 1.e-20
+  Name = cloverdetratio3light
+  solver = mg
+  UseExternalInverter = quda
+  UseSloppyPrecision = single
+  UseExternalLibrary = quda
+EndMonomial
+
+
+BeginMonomial NDCLOVERRAT
+  Timescale = 1
+  kappa =    0.1400083
+  CSW = 1.7112
+  AcceptancePrecision =  1e-23
+  ForcePrecision = 1e-19
+  StildeMin = 0.0000376
+  StildeMax = 4.7
+  Name = ndcloverrat1
+  DegreeOfRational = 10
+  Cmin = 0
+  Cmax = 3
+  ComputeEVFreq = 0
+  2KappaMuBar = 0.03875149727400
+  2KappaEpsBar = 0.04103923289600
+  AddTrLog = yes
+  solver= cgmmsnd
+  UseExternalInverter = quda
+  UseSloppyPrecision = single
+  RefinementPrecision = half
+  MaxSolverIterations = 5000
+EndMonomial
+
+BeginMonomial NDCLOVERRAT
+  Timescale = 2
+  kappa =    0.1400083
+  CSW = 1.7112
+  AcceptancePrecision =  1e-23
+  ForcePrecision = 1e-19
+  StildeMin = 0.0000376
+  StildeMax = 4.7
+  Name = ndcloverrat2
+  DegreeOfRational = 10
+  Cmin = 4
+  Cmax = 6
+  ComputeEVFreq = 0
+  2KappaMuBar = 0.03875149727400
+  2KappaEpsBar = 0.04103923289600
+  AddTrLog = no
+  solver= cgmmsnd
+  UseExternalInverter = quda
+  UseSloppyPrecision = single
+  RefinementPrecision = half
+  MaxSolverIterations = 5000
+EndMonomial
+
+BeginMonomial NDCLOVERRAT
+  Timescale = 3
+  kappa =    0.1400083
+  CSW = 1.7112
+  AcceptancePrecision =  1e-23
+  ForcePrecision = 1e-19
+  StildeMin = 0.0000376
+  StildeMax = 4.7
+  Name = ndcloverrat3
+  DegreeOfRational = 10
+  Cmin = 7
+  Cmax = 9
+  ComputeEVFreq = 0
+  2KappaMuBar = 0.03875149727400
+  2KappaEpsBar = 0.04103923289600
+  AddTrLog = no
+  solver= cgmmsnd
+  UseExternalInverter = quda
+  UseSloppyPrecision = single
+  RefinementPrecision = half
+  MaxSolverIterations = 5000
+EndMonomial
+
+BeginMonomial NDCLOVERRATCOR
+  Timescale = 1
+  kappa =    0.1400083
+  CSW = 1.7112
+  AcceptancePrecision =  1e-23
+  ForcePrecision = 1e-19
+  StildeMin = 0.0000376
+  StildeMax = 4.7
+  Name = ndcloverratcor
+  DegreeOfRational = 10
+  ComputeEVFreq = 0
+  #UseExternalEigsolver = quda
+  2KappaMuBar = 0.03875149727400
+  2KappaEpsBar = 0.04103923289600
+  solver= cgmmsnd
+  UseExternalInverter = quda
+  UseSloppyPrecision = single
+  RefinementPrecision = half
+  MaxSolverIterations = 15000
+EndMonomial
+
+BeginIntegrator
+  Type0 = 2MN
+  Type1 = 2MN
+  Type2 = 2MN
+  Type3 = 2MN
+  IntegrationSteps0 = 1
+  IntegrationSteps1 = 1
+  IntegrationSteps2 = 1
+  IntegrationSteps3 = 1
+  tau = 0.03
+  Lambda0 =  0.19318332750
+  Lambda1 = 0.194
+  Lambda2 = 0.196
+  Lambda2 = 0.198
+  NumberOfTimescales =  4
+  MonitorForces = no
+EndIntegrator
+
+BeginOperator CLOVER
+  CSW = 1.7112
+  kappa =    0.1400083
+  2KappaMu = 0.0023801411000
+  SolverPrecision = 1e-20
+  useevenodd = yes
+  useexternalinverter = quda
+  usesloppyprecision = single
+  solver = mg
+  MaxSolverIterations = 500
+EndOperator
+

From 679963d5e7bb0c53992b205e4f1a208b74a011b1 Mon Sep 17 00:00:00 2001
From: Roman Gruber <groman@beverin-ln001.cscs.ch>
Date: Mon, 30 Mar 2026 16:08:08 +0200
Subject: [PATCH 52/80] fix yaml syntax error

---
 .ci/cscs_beverin_pipeline.yml          |  2 +-
 .ci/include/cscs/01-test-templates.yml | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/.ci/cscs_beverin_pipeline.yml b/.ci/cscs_beverin_pipeline.yml
index 17105519f..4b8e7030c 100644
--- a/.ci/cscs_beverin_pipeline.yml
+++ b/.ci/cscs_beverin_pipeline.yml
@@ -31,9 +31,9 @@ build-tmlqcd/uenv/beverin-mi300:
     ROCM_ARCH: gfx942
     REPO: "./.ci/uenv-recipes/tmlqcd/beverin-mi300/repo/"
   script:
-    - echo "YAML VARIABLE = $VARIABLE"
     - |
       #
+      echo "YAML VARIABLE = $VARIABLE"
       git clone --filter=tree:0 $(jq -r .spack.repo /user-environment/meta/configure.json) /dev/shm/spack-clone
       git -C /dev/shm/spack-clone checkout $(jq -r .spack.commit /user-environment/meta/configure.json)
       source /dev/shm/spack-clone/share/spack/setup-env.sh
diff --git a/.ci/include/cscs/01-test-templates.yml b/.ci/include/cscs/01-test-templates.yml
index 7c851ca8c..b43577c94 100644
--- a/.ci/include/cscs/01-test-templates.yml
+++ b/.ci/include/cscs/01-test-templates.yml
@@ -8,8 +8,8 @@ include:
   variables:
     WITH_UENV_VIEW: "default"
   before_script:
-    - echo "before VARIABLE = $VARIABLE"
     - |
+      echo "before VARIABLE = $VARIABLE"
       if test "${SLURM_PROCID}" -eq "0"; then
         ls -la
         tar xf ./builddir.tar -C /
@@ -24,12 +24,12 @@ include:
 .test/hmc:
   extends: .test/base
   script:
-    - echo "script VARIABLE = $VARIABLE"
-    - ls -la
-    - which numdiff
-    - which hmc_tm
-    - ./install_dir/bin/hmc_tm -f "${INPUT_FILE}"
     - |
+      echo "script VARIABLE = $VARIABLE"
+      ls -la
+      which numdiff
+      which hmc_tm
+      ./install_dir/bin/hmc_tm -f "${INPUT_FILE}"
       if test "${SLURM_PROCID}" -eq "0"; then
         echo "Check the results on SLURM_PROCID=${SLURM_PROCID} ..."
         numdiff -r 1.2e-6 -X 1:22 -X 1:5-21 -X 2:22 -X 2:5-21 output.data ${REFPATH}/output.data

From f6068028f28443e8293c6b4979d27a72e3b21395 Mon Sep 17 00:00:00 2001
From: Roman Gruber <groman@beverin-ln001.cscs.ch>
Date: Mon, 30 Mar 2026 16:16:13 +0200
Subject: [PATCH 53/80] bump ci version

---
 .ci/include/cscs/00-variables.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.ci/include/cscs/00-variables.yml b/.ci/include/cscs/00-variables.yml
index bddb06752..24d135c51 100644
--- a/.ci/include/cscs/00-variables.yml
+++ b/.ci/include/cscs/00-variables.yml
@@ -9,7 +9,7 @@
 variables:
   UENV_NAME: tmlqcd
   UENV_VERSION: experimental
-  UENV_TAG: v0.0.6
+  UENV_TAG: v0.0.8
 
 
 # These are the firecrest id and secret for the beverin pipeline

From c83eb697d1225922874513f37e7ecaaf43a9386d Mon Sep 17 00:00:00 2001
From: Roman Gruber <groman@beverin-ln001.cscs.ch>
Date: Tue, 31 Mar 2026 12:23:48 +0200
Subject: [PATCH 54/80] fix job dependencies

---
 .ci/build-rocm.sh             | 8 ++++++--
 .ci/cscs_beverin_pipeline.yml | 3 ++-
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/.ci/build-rocm.sh b/.ci/build-rocm.sh
index 2ce6d58c7..1788347fe 100644
--- a/.ci/build-rocm.sh
+++ b/.ci/build-rocm.sh
@@ -5,15 +5,19 @@ set -xeuo pipefail
 echo "VARIABLE = $VARIABLE"
 
 export SPACK_SYSTEM_CONFIG_PATH=/user-environment/config
+export SPACK_PYTHON=$(which python3.6) # must be <=3.12, system python is 3.6
 export CICD_SRC_DIR=$PWD
 export QUDA_SRC_DIR=$PWD/deps/src/quda
-export SPACK_PYTHON=$(which python3.6) # must be <=3.12, system python is 3.6
 
 # QUDA git, branch and commit
 export QUDA_GIT_REPO="${QUDA_GIT_REPO:=https://github.com/lattice/quda.git}"
 export QUDA_GIT_BRANCH="${QUDA_GIT_BRANCH:=develop}"
 export QUDA_GIT_COMMIT="${QUDA_GIT_COMMIT:=$(git ls-remote ${QUDA_GIT_REPO} refs/heads/${QUDA_GIT_BRANCH} | awk '{print $1}')}"
 
+# obtain QUDA
+git clone -b ${QUDA_GIT_BRANCH} ${QUDA_GIT_REPO} ${QUDA_SRC_DIR}
+git -C ${QUDA_SRC_DIR} checkout ${QUDA_GIT_COMMIT}
+
 # make sure we keep the stage direcorty
 spack config --scope=user add config:build_stage:/dev/shm/spack-stage
 # we might need to install dependencies too, e.g. nlcglib in case of API changes
@@ -24,7 +28,7 @@ spack env create -d ./spack-env
 # add local repository with current tmlqcd recipe
 spack -e ./spack-env repo add $REPO
 
-spack -e ./spack-env config add "packages:all:variants:[amdgpu_target=${ROCM_ARCH},amdgpu_target_sram_ecc=${ROCM_ARCH},+rocm]"
+spack -e ./spack-env config add "packages:all:variants:[amdgpu_target=${ROCM_ARCH},amdgpu_target_sram_ecc=${ROCM_ARCH},+rocm,+mpi]"
 
 spack -e ./spack-env add $SPEC
 
diff --git a/.ci/cscs_beverin_pipeline.yml b/.ci/cscs_beverin_pipeline.yml
index 4b8e7030c..4206a1c1f 100644
--- a/.ci/cscs_beverin_pipeline.yml
+++ b/.ci/cscs_beverin_pipeline.yml
@@ -21,7 +21,7 @@ build-base/uenv/beverin-mi300:
 build-tmlqcd/uenv/beverin-mi300:
   stage: build
   extends: [.uenv-runner-beverin-mi300, .beverin-mi300-secrets]
-  needs: ["build-base/uenv/beverin-mi300"]
+  needs: [build-base/uenv/beverin-mi300]
   image: $UENV_NAME/$UENV_VERSION:$CI_PIPELINE_ID
   artifacts:
     paths:
@@ -42,6 +42,7 @@ build-tmlqcd/uenv/beverin-mi300:
 
 test/beverin-mi300:
   extends: [.uenv-runner-beverin-mi300, .test/hmc, .beverin-mi300-vars, .beverin-mi300-secrets]
+  needs: [build-tmlqcd/uenv/beverin-mi300]
   variables:
     INPUT_FILE: "doc/sample-input/sample-hmc-quda-cscs-beverin.input"
     REFPATH: "doc/sample-output/hmc-quda-cscs"

From b45067db8392b856d16feaaf9eadd18e525a8f62 Mon Sep 17 00:00:00 2001
From: Roman Gruber <groman@beverin-ln001.cscs.ch>
Date: Tue, 31 Mar 2026 17:26:39 +0200
Subject: [PATCH 55/80] add patch to quda's spack package to not expose
 ${ROCM_PATH}/include/hipfft explicitly

---
 .ci/spack_packages/quda/package.py                  |  3 +++
 .../quda/quda-hipfft-public-to-private.patch        | 13 +++++++++++++
 2 files changed, 16 insertions(+)
 create mode 100644 .ci/spack_packages/quda/quda-hipfft-public-to-private.patch

diff --git a/.ci/spack_packages/quda/package.py b/.ci/spack_packages/quda/package.py
index c554fabbb..ad34b3fdb 100644
--- a/.ci/spack_packages/quda/package.py
+++ b/.ci/spack_packages/quda/package.py
@@ -31,6 +31,9 @@ class Quda(CMakePackage, CudaPackage, ROCmPackage):
 
     version("1.1.0", sha256="b4f635c993275010780ea09d8e593e0713a6ca1af1db6cc86c64518714fcc745")
 
+    # don't expose ${ROCM_PATH}/include/hipfft explicitly, spack finds it alone
+    patch("quda-hipfft-public-to-private.patch", when="@:")
+
     # build dependencies
     generator("ninja")
     depends_on("cmake@3.18:", type="build")
diff --git a/.ci/spack_packages/quda/quda-hipfft-public-to-private.patch b/.ci/spack_packages/quda/quda-hipfft-public-to-private.patch
new file mode 100644
index 000000000..127944195
--- /dev/null
+++ b/.ci/spack_packages/quda/quda-hipfft-public-to-private.patch
@@ -0,0 +1,13 @@
+diff --git a/lib/targets/hip/target_hip.cmake b/lib/targets/hip/target_hip.cmake
+index 6bdca1127..b22a469a1 100644
+--- a/lib/targets/hip/target_hip.cmake
++++ b/lib/targets/hip/target_hip.cmake
+@@ -132,7 +132,7 @@ set_source_files_properties( ${QUDA_CU_OBJS} PROPERTIES LANGUAGE HIP)
+ target_link_libraries(quda PUBLIC hip::hiprand roc::rocrand hip::hipcub roc::rocprim_hip)
+ target_link_libraries(quda PUBLIC roc::hipblas roc::rocblas)
+ 
+-target_include_directories(quda PUBLIC ${ROCM_PATH}/hipfft/include)
++target_include_directories(quda PRIVATE ${ROCM_PATH}/hipfft/include)
+ target_link_libraries(quda PUBLIC hip::hipfft)
+ 
+ add_subdirectory(targets/hip)

From 3420f79e4fe30f8c9486a84015aa8242aed34dd3 Mon Sep 17 00:00:00 2001
From: Roman Gruber <groman@beverin-ln001.cscs.ch>
Date: Tue, 31 Mar 2026 17:27:38 +0200
Subject: [PATCH 56/80] use hipcc in tmlqcd spack package if +rocm, use
 llvm-openmp if +rocm+openmp

---
 .ci/spack_packages/tmlqcd/package.py | 26 ++++++++++++++++++++++----
 1 file changed, 22 insertions(+), 4 deletions(-)

diff --git a/.ci/spack_packages/tmlqcd/package.py b/.ci/spack_packages/tmlqcd/package.py
index 78a94340b..54a564bdc 100755
--- a/.ci/spack_packages/tmlqcd/package.py
+++ b/.ci/spack_packages/tmlqcd/package.py
@@ -15,12 +15,16 @@ class Tmlqcd(CMakePackage, CudaPackage, ROCmPackage):
 
     homepage = "https://www.itkp.uni-bonn.de/~urbach/software.html"
     url = "https://github.com/etmc/tmLQCD/archive/refs/tags/rel-5-1-6.tar.gz"
-    git = "https://github.com/etmc/tmLQCD.git"
+    git = "https://github.com/mtaillefumier/tmLQCD.git"
     license("GPL-3.0-or-later")
 
     maintainers("mtaillefumier")
     version("master", branch="master")
 
+    # todo: remove this version as soon as
+    # https://github.com/etmc/tmLQCD/pull/664 is merged
+    version("cmake_support", branch="cmake_support")
+
     variant("lemon", default=False, description="Enable the lemon backend")
     variant("mpi", default=True, description="Enable mpi support")
     variant("DDalphaAMG", default=False, description="Enable DAlphaAMG support")
@@ -84,6 +88,8 @@ class Tmlqcd(CMakePackage, CudaPackage, ROCmPackage):
     depends_on("mpi", when="+mpi")
     depends_on("lemonio", when="+lemon")
 
+    depends_on("llvm-openmp", when="+rocm+openmp")
+
     with when("+quda"):
         depends_on(
             "quda+shared+twisted_mass+twisted_clover+clover+ndeg_twisted_clover+ndeg_twisted_mass+wilson+qdp+multigrid"
@@ -98,7 +104,6 @@ class Tmlqcd(CMakePackage, CudaPackage, ROCmPackage):
 
 class CMakeBuilder(cmake.CMakeBuilder):
     def cmake_args(self):
-        spec = self.spec
         args = [
             self.define_from_variant("BUILD_SHARED_LIBS", "shared"),
             self.define_from_variant("TM_USE_LEMON", "lemon"),
@@ -113,7 +118,20 @@ def cmake_args(self):
             self.define_from_variant("TM_USE_HALFSPINOR", "half_spinor"),
         ]
 
-        args.append("-DCMAKE_C_COMPILER={0}".format(self.spec["hip"].hipcc))
-        args.append("-DCMAKE_CXX_COMPILER={0}".format(self.spec["hip"].hipcc))
+        # Use hipcc in case of a ROCm build
+        if "+rocm" in self.spec:
+            hip = self.spec["hip"]
+            args.append(self.define("CMAKE_C_COMPILER", hip.hipcc))
+            args.append(self.define("CMAKE_CXX_COMPILER", hip.hipcc))
+
+            # help hipcc find openmp
+            if "+openmp" in self.spec:
+                omp = self.spec["llvm-openmp"]
+                args.append(self.define("OpenMP_C_FLAGS", "-fopenmp"))
+                args.append(self.define("OpenMP_CXX_FLAGS", "-fopenmp"))
+                args.append(self.define("OpenMP_C_LIB_NAMES", "omp"))
+                args.append(self.define("OpenMP_CXX_LIB_NAMES", "omp"))
+                args.append(self.define("OpenMP_omp_LIBRARY", "{0}/libomp.so".format(omp.prefix.lib)))
+                args.append(self.define("OpenMP_CXX_INCLUDE_DIR", omp.prefix.include))
 
         return args

From 86c19c4a5bebbc42310dc77328f337a25124673e Mon Sep 17 00:00:00 2001
From: Roman Gruber <groman@beverin-ln001.cscs.ch>
Date: Tue, 31 Mar 2026 17:28:27 +0200
Subject: [PATCH 57/80] add back openmp to ci pipeline

---
 .ci/cscs_beverin_pipeline.yml                           | 2 +-
 .ci/uenv-recipes/tmlqcd/beverin-mi300/environments.yaml | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.ci/cscs_beverin_pipeline.yml b/.ci/cscs_beverin_pipeline.yml
index 4206a1c1f..a6681a843 100644
--- a/.ci/cscs_beverin_pipeline.yml
+++ b/.ci/cscs_beverin_pipeline.yml
@@ -27,7 +27,7 @@ build-tmlqcd/uenv/beverin-mi300:
     paths:
       - builddir.tar
   variables:
-    SPEC: "tmlqcd@cicd ~openmp +lemon +quda ^quda@cicd +qdp +multigrid +twisted_clover +twisted_mass"
+    SPEC: "tmlqcd@cicd +lemon +quda ^quda@cicd +qdp +multigrid +twisted_clover +twisted_mass"
     ROCM_ARCH: gfx942
     REPO: "./.ci/uenv-recipes/tmlqcd/beverin-mi300/repo/"
   script:
diff --git a/.ci/uenv-recipes/tmlqcd/beverin-mi300/environments.yaml b/.ci/uenv-recipes/tmlqcd/beverin-mi300/environments.yaml
index d0799a646..1b861a3cb 100755
--- a/.ci/uenv-recipes/tmlqcd/beverin-mi300/environments.yaml
+++ b/.ci/uenv-recipes/tmlqcd/beverin-mi300/environments.yaml
@@ -11,8 +11,8 @@ gcc-env:
   - hip@6.3.3 ^mesa@23.3.6
   - llvm-amdgpu
   # tmlqcd and quda are not required, since we build their newest commits in the
-  # next step. Although, we want all dependencies
-  - tmlqcd +lemon +quda
+  # build stage. Although, we want all their dependencies in the base uenv.
+  - tmlqcd@cmake_support +lemon +quda ^quda@develop +qdp +multigrid +twisted_clover +twisted_mass
   - numdiff
   variants:
   - +mpi

From 7238b329aa9e6ba61e312f18f276e7f8ebc13a61 Mon Sep 17 00:00:00 2001
From: Roman Gruber <groman@beverin-ln001.cscs.ch>
Date: Tue, 31 Mar 2026 17:29:23 +0200
Subject: [PATCH 58/80] corrected path in .test/hmc

---
 .ci/include/cscs/01-test-templates.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.ci/include/cscs/01-test-templates.yml b/.ci/include/cscs/01-test-templates.yml
index b43577c94..fb317d3fe 100644
--- a/.ci/include/cscs/01-test-templates.yml
+++ b/.ci/include/cscs/01-test-templates.yml
@@ -29,7 +29,7 @@ include:
       ls -la
       which numdiff
       which hmc_tm
-      ./install_dir/bin/hmc_tm -f "${INPUT_FILE}"
+      hmc_tm -f "${INPUT_FILE}"
       if test "${SLURM_PROCID}" -eq "0"; then
         echo "Check the results on SLURM_PROCID=${SLURM_PROCID} ..."
         numdiff -r 1.2e-6 -X 1:22 -X 1:5-21 -X 2:22 -X 2:5-21 output.data ${REFPATH}/output.data

From 9ef2d9f2e1fdee83e28978630818a35d9518cbb2 Mon Sep 17 00:00:00 2001
From: Roman Gruber <groman@beverin-ln001.cscs.ch>
Date: Tue, 31 Mar 2026 19:44:12 +0200
Subject: [PATCH 59/80] added todo note

---
 .ci/spack_packages/tmlqcd/package.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.ci/spack_packages/tmlqcd/package.py b/.ci/spack_packages/tmlqcd/package.py
index 54a564bdc..5ed85c341 100755
--- a/.ci/spack_packages/tmlqcd/package.py
+++ b/.ci/spack_packages/tmlqcd/package.py
@@ -15,6 +15,8 @@ class Tmlqcd(CMakePackage, CudaPackage, ROCmPackage):
 
     homepage = "https://www.itkp.uni-bonn.de/~urbach/software.html"
     url = "https://github.com/etmc/tmLQCD/archive/refs/tags/rel-5-1-6.tar.gz"
+
+    # todo: change this back to etmc as soon as cmake PR is merged
     git = "https://github.com/mtaillefumier/tmLQCD.git"
     license("GPL-3.0-or-later")
 

From a5452aeb5e9051fe3f700bac869366d424e5953d Mon Sep 17 00:00:00 2001
From: Roman Gruber <groman@beverin-ln001.cscs.ch>
Date: Tue, 31 Mar 2026 19:44:31 +0200
Subject: [PATCH 60/80] correct uenv image passing

---
 .ci/cscs_beverin_pipeline.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.ci/cscs_beverin_pipeline.yml b/.ci/cscs_beverin_pipeline.yml
index a6681a843..3b4873e6d 100644
--- a/.ci/cscs_beverin_pipeline.yml
+++ b/.ci/cscs_beverin_pipeline.yml
@@ -22,7 +22,7 @@ build-tmlqcd/uenv/beverin-mi300:
   stage: build
   extends: [.uenv-runner-beverin-mi300, .beverin-mi300-secrets]
   needs: [build-base/uenv/beverin-mi300]
-  image: $UENV_NAME/$UENV_VERSION:$CI_PIPELINE_ID
+  image: ${UENV_NAME}/${UENV_VERSION}:${UENV_TAG}
   artifacts:
     paths:
       - builddir.tar

From e171c666e790d94ac72defdef65b4bb7aad53ea8 Mon Sep 17 00:00:00 2001
From: Roman Gruber <groman@beverin-ln001.cscs.ch>
Date: Tue, 31 Mar 2026 19:49:41 +0200
Subject: [PATCH 61/80] added debug prints in ci

---
 .ci/cscs_beverin_pipeline.yml          | 16 +++++++++-------
 .ci/include/cscs/01-test-templates.yml | 11 ++++++-----
 2 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/.ci/cscs_beverin_pipeline.yml b/.ci/cscs_beverin_pipeline.yml
index 3b4873e6d..9ecf2bb82 100644
--- a/.ci/cscs_beverin_pipeline.yml
+++ b/.ci/cscs_beverin_pipeline.yml
@@ -31,13 +31,15 @@ build-tmlqcd/uenv/beverin-mi300:
     ROCM_ARCH: gfx942
     REPO: "./.ci/uenv-recipes/tmlqcd/beverin-mi300/repo/"
   script:
-    - |
-      #
-      echo "YAML VARIABLE = $VARIABLE"
-      git clone --filter=tree:0 $(jq -r .spack.repo /user-environment/meta/configure.json) /dev/shm/spack-clone
-      git -C /dev/shm/spack-clone checkout $(jq -r .spack.commit /user-environment/meta/configure.json)
-      source /dev/shm/spack-clone/share/spack/setup-env.sh
-      bwrap --dev-bind / / --tmpfs ~ -- ./ci/build-rocm.sh
+    - echo "YAML VARIABLE = $VARIABLE"
+    - git clone --filter=tree:0 $(jq -r .spack.repo /user-environment/meta/configure.json) /dev/shm/spack-clone
+    - git -C /dev/shm/spack-clone checkout $(jq -r .spack.commit /user-environment/meta/configure.json)
+    - source /dev/shm/spack-clone/share/spack/setup-env.sh
+    - pwd
+    - ls -la
+    - find
+    - ls -la ./.ci/build-rocm.sh
+    - bwrap --dev-bind / / --tmpfs ~ -- ./.ci/build-rocm.sh
 
 
 test/beverin-mi300:
diff --git a/.ci/include/cscs/01-test-templates.yml b/.ci/include/cscs/01-test-templates.yml
index fb317d3fe..c932ad18a 100644
--- a/.ci/include/cscs/01-test-templates.yml
+++ b/.ci/include/cscs/01-test-templates.yml
@@ -17,18 +17,18 @@ include:
       fi
     - |
       while test ! -f preparation-done-${CI_JOB_ID}; do sleep 5; done
-      stagedir=$(find /dev/shm/spack-stage -type d -name spack-stage-sirius-*)
+      stagedir=$(find /dev/shm/spack-stage -type d -name spack-stage-tmlqcd-*)
       echo "stagedir: $stagedir"
 
 
 .test/hmc:
   extends: .test/base
   script:
+    - echo "script VARIABLE = $VARIABLE"
+    - ls -la
+    - which numdiff
+    - which hmc_tm
     - |
-      echo "script VARIABLE = $VARIABLE"
-      ls -la
-      which numdiff
-      which hmc_tm
       hmc_tm -f "${INPUT_FILE}"
       if test "${SLURM_PROCID}" -eq "0"; then
         echo "Check the results on SLURM_PROCID=${SLURM_PROCID} ..."
@@ -38,3 +38,4 @@ include:
           numdiff -r 5e-4 ${f} ${REFPATH}/${f};
         done
       fi
+    - echo "done"

From 3af655cf03c356564d510a080b8889da27278369 Mon Sep 17 00:00:00 2001
From: Roman Gruber <groman@beverin-ln001.cscs.ch>
Date: Tue, 31 Mar 2026 19:52:23 +0200
Subject: [PATCH 62/80] make executable

---
 .ci/build-rocm.sh | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 mode change 100644 => 100755 .ci/build-rocm.sh

diff --git a/.ci/build-rocm.sh b/.ci/build-rocm.sh
old mode 100644
new mode 100755

From 75f25b4a76fe18d501888d2f81a2165a89f26580 Mon Sep 17 00:00:00 2001
From: Roman Gruber <groman@beverin-ln001.cscs.ch>
Date: Tue, 31 Mar 2026 20:11:53 +0200
Subject: [PATCH 63/80] scope=spack

---
 .ci/build-rocm.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.ci/build-rocm.sh b/.ci/build-rocm.sh
index 1788347fe..b7ab14a6b 100755
--- a/.ci/build-rocm.sh
+++ b/.ci/build-rocm.sh
@@ -19,9 +19,9 @@ git clone -b ${QUDA_GIT_BRANCH} ${QUDA_GIT_REPO} ${QUDA_SRC_DIR}
 git -C ${QUDA_SRC_DIR} checkout ${QUDA_GIT_COMMIT}
 
 # make sure we keep the stage direcorty
-spack config --scope=user add config:build_stage:/dev/shm/spack-stage
+spack config --scope=spack add config:build_stage:/dev/shm/spack-stage
 # we might need to install dependencies too, e.g. nlcglib in case of API changes
-spack config --scope=user add config:install_tree:root:/dev/shm/spack-stage
+spack config --scope=spack add config:install_tree:root:/dev/shm/spack-stage
 
 spack env create -d ./spack-env
 

From 0ca0f5e41fea6f73e140e4006982e67df7f381ea Mon Sep 17 00:00:00 2001
From: Roman Gruber <groman@beverin-ln001.cscs.ch>
Date: Tue, 31 Mar 2026 20:12:12 +0200
Subject: [PATCH 64/80] increase slurm timelimit to 1h

---
 .ci/cscs_beverin_pipeline.yml | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/.ci/cscs_beverin_pipeline.yml b/.ci/cscs_beverin_pipeline.yml
index 9ecf2bb82..f049301ae 100644
--- a/.ci/cscs_beverin_pipeline.yml
+++ b/.ci/cscs_beverin_pipeline.yml
@@ -30,15 +30,12 @@ build-tmlqcd/uenv/beverin-mi300:
     SPEC: "tmlqcd@cicd +lemon +quda ^quda@cicd +qdp +multigrid +twisted_clover +twisted_mass"
     ROCM_ARCH: gfx942
     REPO: "./.ci/uenv-recipes/tmlqcd/beverin-mi300/repo/"
+    SLURM_TIMELIMIT: "01:00:00"
   script:
     - echo "YAML VARIABLE = $VARIABLE"
     - git clone --filter=tree:0 $(jq -r .spack.repo /user-environment/meta/configure.json) /dev/shm/spack-clone
     - git -C /dev/shm/spack-clone checkout $(jq -r .spack.commit /user-environment/meta/configure.json)
     - source /dev/shm/spack-clone/share/spack/setup-env.sh
-    - pwd
-    - ls -la
-    - find
-    - ls -la ./.ci/build-rocm.sh
     - bwrap --dev-bind / / --tmpfs ~ -- ./.ci/build-rocm.sh
 
 

From c65f4a854ccb1d5e1646d2cd6912ac7388a6d6c6 Mon Sep 17 00:00:00 2001
From: Roman Gruber <groman@beverin-ln001.cscs.ch>
Date: Wed, 1 Apr 2026 16:14:11 +0200
Subject: [PATCH 65/80] fixed path in test job

---
 .ci/cscs_beverin_pipeline.yml          |  4 ++--
 .ci/include/cscs/01-test-templates.yml | 23 +++++++++++++++--------
 2 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/.ci/cscs_beverin_pipeline.yml b/.ci/cscs_beverin_pipeline.yml
index f049301ae..5cf233a08 100644
--- a/.ci/cscs_beverin_pipeline.yml
+++ b/.ci/cscs_beverin_pipeline.yml
@@ -43,10 +43,10 @@ test/beverin-mi300:
   extends: [.uenv-runner-beverin-mi300, .test/hmc, .beverin-mi300-vars, .beverin-mi300-secrets]
   needs: [build-tmlqcd/uenv/beverin-mi300]
   variables:
-    INPUT_FILE: "doc/sample-input/sample-hmc-quda-cscs-beverin.input"
+    COMMAND: "hmc_tm -f doc/sample-input/sample-hmc-quda-cscs-beverin.input"
     REFPATH: "doc/sample-output/hmc-quda-cscs"
     QUDA_ENABLE_TUNING: 0 # disable tuning
-    QUDA_ENABLE_GDR: 1 # enable GPU-Direct RDMA
+    QUDA_ENABLE_P2P: 0 # disable P2P
     SLURM_JOB_NUM_NODES: 2
     SLURM_NTASKS: 8
     SLURM_TIMELIMIT: "00:30:00"
diff --git a/.ci/include/cscs/01-test-templates.yml b/.ci/include/cscs/01-test-templates.yml
index c932ad18a..31d03be62 100644
--- a/.ci/include/cscs/01-test-templates.yml
+++ b/.ci/include/cscs/01-test-templates.yml
@@ -8,28 +8,35 @@ include:
   variables:
     WITH_UENV_VIEW: "default"
   before_script:
+    - echo "before VARIABLE = $VARIABLE"
     - |
-      echo "before VARIABLE = $VARIABLE"
       if test "${SLURM_PROCID}" -eq "0"; then
         ls -la
         tar xf ./builddir.tar -C /
         touch preparation-done-${CI_JOB_ID}
       fi
-    - |
-      while test ! -f preparation-done-${CI_JOB_ID}; do sleep 5; done
-      stagedir=$(find /dev/shm/spack-stage -type d -name spack-stage-tmlqcd-*)
-      echo "stagedir: $stagedir"
+    - while test ! -f preparation-done-${CI_JOB_ID}; do sleep 5; done
 
 
 .test/hmc:
   extends: .test/base
   script:
     - echo "script VARIABLE = $VARIABLE"
-    - ls -la
     - which numdiff
-    - which hmc_tm
     - |
-      hmc_tm -f "${INPUT_FILE}"
+      stagedir="$(find /dev/shm/spack-stage -type d -name "spack-stage-tmlqcd-cicd-*")"
+      bindir="$(find $stagedir/spack-build-*/src/bin -type d -name bin)"
+      export PATH=:${bindir}:$PATH
+      if [ $SLURM_LOCALID -eq 0 ]; then
+        echo "stagedir = $stagedir"
+        echo "bindir = $bindir"
+        which hmc_tm
+        ldd $(which hmc_tm)
+        echo "executing ${COMMAND}"
+      fi
+
+      ${COMMAND}
+
       if test "${SLURM_PROCID}" -eq "0"; then
         echo "Check the results on SLURM_PROCID=${SLURM_PROCID} ..."
         numdiff -r 1.2e-6 -X 1:22 -X 1:5-21 -X 2:22 -X 2:5-21 output.data ${REFPATH}/output.data

From bbe7ae0233491224aa29ca64e6fc0c0402158c90 Mon Sep 17 00:00:00 2001
From: Roman Gruber <groman@beverin-ln001.cscs.ch>
Date: Wed, 1 Apr 2026 17:28:08 +0200
Subject: [PATCH 66/80] add quda to artifacts, add libquda dir to
 LD_LIBRARY_PATH in CI

---
 .ci/build-rocm.sh                      |  5 +++--
 .ci/include/cscs/01-test-templates.yml | 10 +++++-----
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/.ci/build-rocm.sh b/.ci/build-rocm.sh
index b7ab14a6b..15a23e307 100755
--- a/.ci/build-rocm.sh
+++ b/.ci/build-rocm.sh
@@ -47,7 +47,8 @@ spack -e ./spack-env concretize
 spack -e ./spack-env install
 
 # the tar pipe below expects a relative path
-builddir=$(spack -e ./spack-env location -b tmlqcd)
+builddir_tmlqcd=$(spack -e ./spack-env location -b tmlqcd)
+builddir_quda=$(spack -e ./spack-env location -b quda)
 
 # create a symlink to spack build directory (keep in artifacts)
-tar -cf builddir.tar $builddir
+tar -cf builddir.tar $builddir_tmlqcd $builddir_quda
diff --git a/.ci/include/cscs/01-test-templates.yml b/.ci/include/cscs/01-test-templates.yml
index 31d03be62..84cac8654 100644
--- a/.ci/include/cscs/01-test-templates.yml
+++ b/.ci/include/cscs/01-test-templates.yml
@@ -11,7 +11,6 @@ include:
     - echo "before VARIABLE = $VARIABLE"
     - |
       if test "${SLURM_PROCID}" -eq "0"; then
-        ls -la
         tar xf ./builddir.tar -C /
         touch preparation-done-${CI_JOB_ID}
       fi
@@ -24,15 +23,16 @@ include:
     - echo "script VARIABLE = $VARIABLE"
     - which numdiff
     - |
-      stagedir="$(find /dev/shm/spack-stage -type d -name "spack-stage-tmlqcd-cicd-*")"
-      bindir="$(find $stagedir/spack-build-*/src/bin -type d -name bin)"
+      bindir=$(echo /dev/shm/spack-stage/groman/spack-stage-tmlqcd-cicd-*/spack-build-*/src/bin)
+      libdir=$(dirname $(echo /dev/shm/spack-stage/groman/spack-stage-quda-cicd-*/spack-build-*/lib/libquda.so))
       export PATH=:${bindir}:$PATH
+      export LD_LIBRARY_PATH=:${libdir}:$LD_LIBRARY_PATH
       if [ $SLURM_LOCALID -eq 0 ]; then
-        echo "stagedir = $stagedir"
         echo "bindir = $bindir"
+        echo "libdir = $libdir"
         which hmc_tm
         ldd $(which hmc_tm)
-        echo "executing ${COMMAND}"
+        echo "executing command: ${COMMAND}"
       fi
 
       ${COMMAND}

From 1cd1c8a96977b86c39620f18888bd8ef2e81773b Mon Sep 17 00:00:00 2001
From: Roman Gruber <groman@beverin-ln001.cscs.ch>
Date: Wed, 1 Apr 2026 18:03:29 +0200
Subject: [PATCH 67/80] adjust bindir path, and SLURM_LOCALID -> SLURM_PROCID

---
 .ci/include/cscs/01-test-templates.yml | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/.ci/include/cscs/01-test-templates.yml b/.ci/include/cscs/01-test-templates.yml
index 84cac8654..c73347d42 100644
--- a/.ci/include/cscs/01-test-templates.yml
+++ b/.ci/include/cscs/01-test-templates.yml
@@ -23,13 +23,15 @@ include:
     - echo "script VARIABLE = $VARIABLE"
     - which numdiff
     - |
-      bindir=$(echo /dev/shm/spack-stage/groman/spack-stage-tmlqcd-cicd-*/spack-build-*/src/bin)
-      libdir=$(dirname $(echo /dev/shm/spack-stage/groman/spack-stage-quda-cicd-*/spack-build-*/lib/libquda.so))
+      bindir=$(echo /dev/shm/spack-stage/*/spack-stage-tmlqcd-cicd-*/spack-build-*/src/bin)
+      libdir=$(dirname $(echo /dev/shm/spack-stage/*/spack-stage-quda-cicd-*/spack-build-*/lib/libquda.so))
       export PATH=:${bindir}:$PATH
       export LD_LIBRARY_PATH=:${libdir}:$LD_LIBRARY_PATH
-      if [ $SLURM_LOCALID -eq 0 ]; then
+      if test "${SLURM_PROCID}" -eq "0"; then
         echo "bindir = $bindir"
         echo "libdir = $libdir"
+        ls -la $bindir
+        ls -la $libdir
         which hmc_tm
         ldd $(which hmc_tm)
         echo "executing command: ${COMMAND}"

From e82eeab2049065741b308e8f2b948a0c10adf176 Mon Sep 17 00:00:00 2001
From: Roman Gruber <groman@beverin-ln001.cscs.ch>
Date: Wed, 1 Apr 2026 18:04:04 +0200
Subject: [PATCH 68/80] test job timelimit 30min -> 1h

---
 .ci/cscs_beverin_pipeline.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.ci/cscs_beverin_pipeline.yml b/.ci/cscs_beverin_pipeline.yml
index 5cf233a08..4ec489a7d 100644
--- a/.ci/cscs_beverin_pipeline.yml
+++ b/.ci/cscs_beverin_pipeline.yml
@@ -49,4 +49,4 @@ test/beverin-mi300:
     QUDA_ENABLE_P2P: 0 # disable P2P
     SLURM_JOB_NUM_NODES: 2
     SLURM_NTASKS: 8
-    SLURM_TIMELIMIT: "00:30:00"
+    SLURM_TIMELIMIT: "01:00:00"

From 4c777bf24800eaa9a7b70c991724a122a9a18f82 Mon Sep 17 00:00:00 2001
From: Roman Gruber <groman@beverin-ln001.cscs.ch>
Date: Wed, 1 Apr 2026 18:46:47 +0200
Subject: [PATCH 69/80] local procid = 0 has to unpack

---
 .ci/include/cscs/01-test-templates.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.ci/include/cscs/01-test-templates.yml b/.ci/include/cscs/01-test-templates.yml
index c73347d42..9708cd495 100644
--- a/.ci/include/cscs/01-test-templates.yml
+++ b/.ci/include/cscs/01-test-templates.yml
@@ -10,7 +10,7 @@ include:
   before_script:
     - echo "before VARIABLE = $VARIABLE"
     - |
-      if test "${SLURM_PROCID}" -eq "0"; then
+      if test "${SLURM_LOCALID}" -eq "0"; then
         tar xf ./builddir.tar -C /
         touch preparation-done-${CI_JOB_ID}
       fi

From 3f7d789a6e9dcba3f4537c8de6d0d32676b0f74a Mon Sep 17 00:00:00 2001
From: Roman Gruber <groman@beverin-ln001.cscs.ch>
Date: Thu, 2 Apr 2026 09:46:52 +0200
Subject: [PATCH 70/80] more verbosity in CI

---
 .ci/include/cscs/01-test-templates.yml | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/.ci/include/cscs/01-test-templates.yml b/.ci/include/cscs/01-test-templates.yml
index 9708cd495..0d326f8b7 100644
--- a/.ci/include/cscs/01-test-templates.yml
+++ b/.ci/include/cscs/01-test-templates.yml
@@ -27,15 +27,11 @@ include:
       libdir=$(dirname $(echo /dev/shm/spack-stage/*/spack-stage-quda-cicd-*/spack-build-*/lib/libquda.so))
       export PATH=:${bindir}:$PATH
       export LD_LIBRARY_PATH=:${libdir}:$LD_LIBRARY_PATH
-      if test "${SLURM_PROCID}" -eq "0"; then
-        echo "bindir = $bindir"
-        echo "libdir = $libdir"
-        ls -la $bindir
-        ls -la $libdir
-        which hmc_tm
-        ldd $(which hmc_tm)
-        echo "executing command: ${COMMAND}"
-      fi
+      echo "bindir = $bindir"
+      echo "libdir = $libdir"
+      which hmc_tm
+      ldd $(which hmc_tm)
+      echo "executing command: ${COMMAND}"
 
       ${COMMAND}
 

From 9d1ac3663d343bc0a70f15a66bacd2ad7f8d9943 Mon Sep 17 00:00:00 2001
From: Roman Gruber <groman@beverin-ln001.cscs.ch>
Date: Thu, 2 Apr 2026 09:47:09 +0200
Subject: [PATCH 71/80] adjust beverin input file to match daint input file

---
 doc/sample-input/sample-hmc-quda-cscs-beverin.input | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/sample-input/sample-hmc-quda-cscs-beverin.input b/doc/sample-input/sample-hmc-quda-cscs-beverin.input
index e98f2a924..f332083f5 100644
--- a/doc/sample-input/sample-hmc-quda-cscs-beverin.input
+++ b/doc/sample-input/sample-hmc-quda-cscs-beverin.input
@@ -1,8 +1,8 @@
 L=24
 T=48
 
-NrXProcs = 1
-NrYProcs = 1
+NrXProcs = 2
+NrYProcs = 2
 NrZprocs = 1
 
 ompnumthreads=32

From 4d2dd2e8e66cda7b39620e5476dc533ad73486ab Mon Sep 17 00:00:00 2001
From: Roman Gruber <groman@beverin-ln001.cscs.ch>
Date: Thu, 2 Apr 2026 11:03:35 +0200
Subject: [PATCH 72/80] removed noise in CI output, added after_script for
 output check

---
 .ci/build-rocm.sh                      |  2 --
 .ci/cscs_beverin_pipeline.yml          |  8 +++----
 .ci/include/cscs/01-test-templates.yml | 32 +++++++++-----------------
 3 files changed, 15 insertions(+), 27 deletions(-)

diff --git a/.ci/build-rocm.sh b/.ci/build-rocm.sh
index 15a23e307..ca2019ca5 100755
--- a/.ci/build-rocm.sh
+++ b/.ci/build-rocm.sh
@@ -2,8 +2,6 @@
 
 set -xeuo pipefail
 
-echo "VARIABLE = $VARIABLE"
-
 export SPACK_SYSTEM_CONFIG_PATH=/user-environment/config
 export SPACK_PYTHON=$(which python3.6) # must be <=3.12, system python is 3.6
 export CICD_SRC_DIR=$PWD
diff --git a/.ci/cscs_beverin_pipeline.yml b/.ci/cscs_beverin_pipeline.yml
index 4ec489a7d..1022a8482 100644
--- a/.ci/cscs_beverin_pipeline.yml
+++ b/.ci/cscs_beverin_pipeline.yml
@@ -28,11 +28,10 @@ build-tmlqcd/uenv/beverin-mi300:
       - builddir.tar
   variables:
     SPEC: "tmlqcd@cicd +lemon +quda ^quda@cicd +qdp +multigrid +twisted_clover +twisted_mass"
-    ROCM_ARCH: gfx942
     REPO: "./.ci/uenv-recipes/tmlqcd/beverin-mi300/repo/"
+    ROCM_ARCH: gfx942
     SLURM_TIMELIMIT: "01:00:00"
   script:
-    - echo "YAML VARIABLE = $VARIABLE"
     - git clone --filter=tree:0 $(jq -r .spack.repo /user-environment/meta/configure.json) /dev/shm/spack-clone
     - git -C /dev/shm/spack-clone checkout $(jq -r .spack.commit /user-environment/meta/configure.json)
     - source /dev/shm/spack-clone/share/spack/setup-env.sh
@@ -40,13 +39,14 @@ build-tmlqcd/uenv/beverin-mi300:
 
 
 test/beverin-mi300:
-  extends: [.uenv-runner-beverin-mi300, .test/hmc, .beverin-mi300-vars, .beverin-mi300-secrets]
+  extends: [.uenv-runner-beverin-mi300, .test/base, .beverin-mi300-vars, .beverin-mi300-secrets]
   needs: [build-tmlqcd/uenv/beverin-mi300]
   variables:
-    COMMAND: "hmc_tm -f doc/sample-input/sample-hmc-quda-cscs-beverin.input"
     REFPATH: "doc/sample-output/hmc-quda-cscs"
     QUDA_ENABLE_TUNING: 0 # disable tuning
     QUDA_ENABLE_P2P: 0 # disable P2P
     SLURM_JOB_NUM_NODES: 2
     SLURM_NTASKS: 8
     SLURM_TIMELIMIT: "01:00:00"
+  script:
+    - hmc_tm -f doc/sample-input/sample-hmc-quda-cscs-beverin.input
diff --git a/.ci/include/cscs/01-test-templates.yml b/.ci/include/cscs/01-test-templates.yml
index 0d326f8b7..d32921c09 100644
--- a/.ci/include/cscs/01-test-templates.yml
+++ b/.ci/include/cscs/01-test-templates.yml
@@ -8,33 +8,18 @@ include:
   variables:
     WITH_UENV_VIEW: "default"
   before_script:
-    - echo "before VARIABLE = $VARIABLE"
     - |
       if test "${SLURM_LOCALID}" -eq "0"; then
         tar xf ./builddir.tar -C /
         touch preparation-done-${CI_JOB_ID}
       fi
     - while test ! -f preparation-done-${CI_JOB_ID}; do sleep 5; done
-
-
-.test/hmc:
-  extends: .test/base
-  script:
-    - echo "script VARIABLE = $VARIABLE"
-    - which numdiff
+    - bindir=$(echo /dev/shm/spack-stage/*/spack-stage-tmlqcd-cicd-*/spack-build-*/src/bin)
+    - libdir=$(dirname $(echo /dev/shm/spack-stage/*/spack-stage-quda-cicd-*/spack-build-*/lib/libquda.so))
+    - export PATH=:${bindir}:$PATH
+    - export LD_LIBRARY_PATH=:${libdir}:$LD_LIBRARY_PATH
+  after_script:
     - |
-      bindir=$(echo /dev/shm/spack-stage/*/spack-stage-tmlqcd-cicd-*/spack-build-*/src/bin)
-      libdir=$(dirname $(echo /dev/shm/spack-stage/*/spack-stage-quda-cicd-*/spack-build-*/lib/libquda.so))
-      export PATH=:${bindir}:$PATH
-      export LD_LIBRARY_PATH=:${libdir}:$LD_LIBRARY_PATH
-      echo "bindir = $bindir"
-      echo "libdir = $libdir"
-      which hmc_tm
-      ldd $(which hmc_tm)
-      echo "executing command: ${COMMAND}"
-
-      ${COMMAND}
-
       if test "${SLURM_PROCID}" -eq "0"; then
         echo "Check the results on SLURM_PROCID=${SLURM_PROCID} ..."
         numdiff -r 1.2e-6 -X 1:22 -X 1:5-21 -X 2:22 -X 2:5-21 output.data ${REFPATH}/output.data
@@ -43,4 +28,9 @@ include:
           numdiff -r 5e-4 ${f} ${REFPATH}/${f};
         done
       fi
-    - echo "done"
+
+
+.test/hmc:
+  extends: .test/base
+  script:
+    - ${COMMAND}

From db0446b112256b16d424250d303c4a9c06d90f22 Mon Sep 17 00:00:00 2001
From: Roman Gruber <groman@beverin-ln001.cscs.ch>
Date: Thu, 2 Apr 2026 11:14:53 +0200
Subject: [PATCH 73/80] moved hmc check into after_script

---
 .ci/cscs_beverin_pipeline.yml                 | 19 +++++--------------
 .ci/include/cscs/01-build-templates.yml       | 17 +++++++++++++++++
 ...st-templates.yml => 02-test-templates.yml} | 10 ++++------
 3 files changed, 26 insertions(+), 20 deletions(-)
 create mode 100644 .ci/include/cscs/01-build-templates.yml
 rename .ci/include/cscs/{01-test-templates.yml => 02-test-templates.yml} (97%)

diff --git a/.ci/cscs_beverin_pipeline.yml b/.ci/cscs_beverin_pipeline.yml
index 1022a8482..17ae64a7f 100644
--- a/.ci/cscs_beverin_pipeline.yml
+++ b/.ci/cscs_beverin_pipeline.yml
@@ -1,7 +1,8 @@
 include:
   - remote: 'https://gitlab.com/cscs-ci/recipes/-/raw/master/templates/v2/.ci-ext.yml'
   - local: '/.ci/include/cscs/00-variables.yml'
-  - local: '/.ci/include/cscs/01-test-templates.yml'
+  - local: '/.ci/include/cscs/01-build-templates.yml'
+  - local: '/.ci/include/cscs/02-test-templates.yml'
 
 
 stages:
@@ -19,27 +20,17 @@ build-base/uenv/beverin-mi300:
 
 
 build-tmlqcd/uenv/beverin-mi300:
-  stage: build
-  extends: [.uenv-runner-beverin-mi300, .beverin-mi300-secrets]
+  extends: [.uenv-runner-beverin-mi300, .build/base, .beverin-mi300-secrets]
   needs: [build-base/uenv/beverin-mi300]
-  image: ${UENV_NAME}/${UENV_VERSION}:${UENV_TAG}
-  artifacts:
-    paths:
-      - builddir.tar
   variables:
     SPEC: "tmlqcd@cicd +lemon +quda ^quda@cicd +qdp +multigrid +twisted_clover +twisted_mass"
     REPO: "./.ci/uenv-recipes/tmlqcd/beverin-mi300/repo/"
-    ROCM_ARCH: gfx942
+    VARIANTS: "amdgpu_target=gfx942,amdgpu_target_sram_ecc=gfx942,+rocm,+mpi"
     SLURM_TIMELIMIT: "01:00:00"
-  script:
-    - git clone --filter=tree:0 $(jq -r .spack.repo /user-environment/meta/configure.json) /dev/shm/spack-clone
-    - git -C /dev/shm/spack-clone checkout $(jq -r .spack.commit /user-environment/meta/configure.json)
-    - source /dev/shm/spack-clone/share/spack/setup-env.sh
-    - bwrap --dev-bind / / --tmpfs ~ -- ./.ci/build-rocm.sh
 
 
 test/beverin-mi300:
-  extends: [.uenv-runner-beverin-mi300, .test/base, .beverin-mi300-vars, .beverin-mi300-secrets]
+  extends: [.uenv-runner-beverin-mi300, .test/hmc, .beverin-mi300-secrets]
   needs: [build-tmlqcd/uenv/beverin-mi300]
   variables:
     REFPATH: "doc/sample-output/hmc-quda-cscs"
diff --git a/.ci/include/cscs/01-build-templates.yml b/.ci/include/cscs/01-build-templates.yml
new file mode 100644
index 000000000..6c4a87174
--- /dev/null
+++ b/.ci/include/cscs/01-build-templates.yml
@@ -0,0 +1,17 @@
+include:
+  - remote: 'https://gitlab.com/cscs-ci/recipes/-/raw/master/templates/v2/.ci-ext.yml'
+
+
+.build/base:
+  stage: build
+  image: ${UENV_NAME}/${UENV_VERSION}:${UENV_TAG}
+  artifacts:
+    paths:
+      - builddir.tar
+  variables:
+    SLURM_TIMELIMIT: "01:00:00"
+  script:
+    - git clone --filter=tree:0 $(jq -r .spack.repo /user-environment/meta/configure.json) /dev/shm/spack-clone
+    - git -C /dev/shm/spack-clone checkout $(jq -r .spack.commit /user-environment/meta/configure.json)
+    - source /dev/shm/spack-clone/share/spack/setup-env.sh
+    - bwrap --dev-bind / / --tmpfs ~ -- ./.ci/build.sh
diff --git a/.ci/include/cscs/01-test-templates.yml b/.ci/include/cscs/02-test-templates.yml
similarity index 97%
rename from .ci/include/cscs/01-test-templates.yml
rename to .ci/include/cscs/02-test-templates.yml
index d32921c09..2d12cec43 100644
--- a/.ci/include/cscs/01-test-templates.yml
+++ b/.ci/include/cscs/02-test-templates.yml
@@ -18,6 +18,10 @@ include:
     - libdir=$(dirname $(echo /dev/shm/spack-stage/*/spack-stage-quda-cicd-*/spack-build-*/lib/libquda.so))
     - export PATH=:${bindir}:$PATH
     - export LD_LIBRARY_PATH=:${libdir}:$LD_LIBRARY_PATH
+
+
+.test/hmc:
+  extends: .test/base
   after_script:
     - |
       if test "${SLURM_PROCID}" -eq "0"; then
@@ -28,9 +32,3 @@ include:
           numdiff -r 5e-4 ${f} ${REFPATH}/${f};
         done
       fi
-
-
-.test/hmc:
-  extends: .test/base
-  script:
-    - ${COMMAND}

From 9c7a5b6c55fcbb5da30a816bbf7fca70d0617dd8 Mon Sep 17 00:00:00 2001
From: Roman Gruber <groman@beverin-ln001.cscs.ch>
Date: Thu, 2 Apr 2026 11:15:20 +0200
Subject: [PATCH 74/80] generic build script

---
 .ci/build.sh | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)
 create mode 100644 .ci/build.sh

diff --git a/.ci/build.sh b/.ci/build.sh
new file mode 100644
index 000000000..d9dc14ba2
--- /dev/null
+++ b/.ci/build.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+
+set -xeuo pipefail
+
+export SPACK_SYSTEM_CONFIG_PATH="/user-environment/config"
+export SPACK_PYTHON="$(which python3.6)" # must be <=3.12, system python is 3.6
+export CICD_SRC_DIR="$PWD"
+export QUDA_SRC_DIR="$PWD/deps/src/quda"
+
+# QUDA git, branch and commit
+export QUDA_GIT_REPO="${QUDA_GIT_REPO:=https://github.com/lattice/quda.git}"
+export QUDA_GIT_BRANCH="${QUDA_GIT_BRANCH:=develop}"
+export QUDA_GIT_COMMIT="${QUDA_GIT_COMMIT:=$(git ls-remote ${QUDA_GIT_REPO} refs/heads/${QUDA_GIT_BRANCH} | awk '{print $1}')}"
+
+# obtain QUDA
+git clone -b "${QUDA_GIT_BRANCH}" "${QUDA_GIT_REPO}" "${QUDA_SRC_DIR}"
+git -C "${QUDA_SRC_DIR}" checkout "${QUDA_GIT_COMMIT}"
+
+# make sure we keep the stage directory
+spack config --scope=spack add config:build_stage:/dev/shm/spack-stage
+# we might need to install dependencies too, e.g. nlcglib in case of API changes
+spack config --scope=spack add config:install_tree:root:/dev/shm/spack-stage
+
+spack env create -d ./spack-env
+
+# add local repository with current tmlqcd recipe
+spack -e ./spack-env repo add "${REPO}"
+
+spack -e ./spack-env config add "packages:all:variants:[${VARIANTS}]"
+
+spack -e ./spack-env add "${SPEC}"
+
+# for tmlqcd use local src instead of fetch git
+spack -e ./spack-env develop -p "${CICD_SRC_DIR}" tmlqcd@cicd
+
+# for quda use the local src instead of fetching from git, to be able to test
+# against a different repo, branch or commit, and to cope with the quda develop
+# branch being a moving target
+spack -e ./spack-env develop -p "${QUDA_SRC_DIR}" quda@cicd
+
+# display spack.yaml
+cat ./spack-env/spack.yaml
+
+spack -e ./spack-env concretize
+spack -e ./spack-env install
+
+# look up the spack build directories of tmlqcd and quda (archived below)
+builddir_tmlqcd=$(spack -e ./spack-env location -b tmlqcd)
+builddir_quda=$(spack -e ./spack-env location -b quda)
+
+# archive both build directories into builddir.tar (kept as a CI artifact)
+tar -cf builddir.tar $builddir_tmlqcd $builddir_quda

From 3894d48b2538a0f656dcf5d0cf0e10262b27966c Mon Sep 17 00:00:00 2001
From: Roman Gruber <groman@beverin-ln001.cscs.ch>
Date: Thu, 2 Apr 2026 11:21:07 +0200
Subject: [PATCH 75/80] adjusted permissions

---
 .ci/build.sh | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 mode change 100644 => 100755 .ci/build.sh

diff --git a/.ci/build.sh b/.ci/build.sh
old mode 100644
new mode 100755

From 97d3110756912db1fe2fea0d8a952783ec5783a7 Mon Sep 17 00:00:00 2001
From: Roman Gruber <groman@beverin-ln001.cscs.ch>
Date: Thu, 2 Apr 2026 11:30:46 +0200
Subject: [PATCH 76/80] adjusted CSCS gh200 pipeline to match mi300 pipeline

---
 .ci/build-rocm.sh                             | 52 -------------------
 .ci/cscs_default_pipeline.yml                 | 32 +++++++++---
 .ci/include/cscs/00-variables.yml             | 14 -----
 .../tmlqcd/beverin-mi300/config.yaml          |  2 +-
 .../tmlqcd/daint-gh200/config.yaml            |  2 +-
 .../tmlqcd/daint-gh200/environments.yaml      |  8 +--
 6 files changed, 28 insertions(+), 82 deletions(-)
 delete mode 100755 .ci/build-rocm.sh

diff --git a/.ci/build-rocm.sh b/.ci/build-rocm.sh
deleted file mode 100755
index ca2019ca5..000000000
--- a/.ci/build-rocm.sh
+++ /dev/null
@@ -1,52 +0,0 @@
-#!/bin/bash
-
-set -xeuo pipefail
-
-export SPACK_SYSTEM_CONFIG_PATH=/user-environment/config
-export SPACK_PYTHON=$(which python3.6) # must be <=3.12, system python is 3.6
-export CICD_SRC_DIR=$PWD
-export QUDA_SRC_DIR=$PWD/deps/src/quda
-
-# QUDA git, branch and commit
-export QUDA_GIT_REPO="${QUDA_GIT_REPO:=https://github.com/lattice/quda.git}"
-export QUDA_GIT_BRANCH="${QUDA_GIT_BRANCH:=develop}"
-export QUDA_GIT_COMMIT="${QUDA_GIT_COMMIT:=$(git ls-remote ${QUDA_GIT_REPO} refs/heads/${QUDA_GIT_BRANCH} | awk '{print $1}')}"
-
-# obtain QUDA
-git clone -b ${QUDA_GIT_BRANCH} ${QUDA_GIT_REPO} ${QUDA_SRC_DIR}
-git -C ${QUDA_SRC_DIR} checkout ${QUDA_GIT_COMMIT}
-
-# make sure we keep the stage direcorty
-spack config --scope=spack add config:build_stage:/dev/shm/spack-stage
-# we might need to install dependencies too, e.g. nlcglib in case of API changes
-spack config --scope=spack add config:install_tree:root:/dev/shm/spack-stage
-
-spack env create -d ./spack-env
-
-# add local repository with current tmlqcd recipe
-spack -e ./spack-env repo add $REPO
-
-spack -e ./spack-env config add "packages:all:variants:[amdgpu_target=${ROCM_ARCH},amdgpu_target_sram_ecc=${ROCM_ARCH},+rocm,+mpi]"
-
-spack -e ./spack-env add $SPEC
-
-# for tmlqcd use local src instead of fetch git
-spack -e ./spack-env develop -p ${CICD_SRC_DIR} tmlqcd@cicd
-
-# for quda use local src instead of fetch git, to be able to tests against
-# differnt repo, branch, commit and also to support that quda branch develop is
-# a moving target
-spack -e ./spack-env develop -p ${QUDA_SRC_DIR} quda@cicd
-
-# display spack.yaml
-cat ./spack-env/spack.yaml
-
-spack -e ./spack-env concretize
-spack -e ./spack-env install
-
-# the tar pipe below expects a relative path
-builddir_tmlqcd=$(spack -e ./spack-env location -b tmlqcd)
-builddir_quda=$(spack -e ./spack-env location -b quda)
-
-# create a symlink to spack build directory (keep in artifacts)
-tar -cf builddir.tar $builddir_tmlqcd $builddir_quda
diff --git a/.ci/cscs_default_pipeline.yml b/.ci/cscs_default_pipeline.yml
index 4bb78aca0..048117779 100644
--- a/.ci/cscs_default_pipeline.yml
+++ b/.ci/cscs_default_pipeline.yml
@@ -1,25 +1,43 @@
 include:
   - remote: 'https://gitlab.com/cscs-ci/recipes/-/raw/master/templates/v2/.ci-ext.yml'
   - local: '/.ci/include/cscs/00-variables.yml'
-  - local: '/.ci/include/cscs/01-test-templates.yml'
+  - local: '/.ci/include/cscs/01-build-templates.yml'
+  - local: '/.ci/include/cscs/02-test-templates.yml'
+
 
 stages:
+  - prepare
   - build
   - test
 
-build-quda/uenv/daint-gh200:
-  stage: build
+
+build-base/uenv/daint-gh200:
+  stage: prepare
   extends: .uenv-builder-daint-gh200
   variables:
     UENV_RECIPE: .ci/uenv-recipes/tmlqcd/daint-gh200
+    SLURM_TIMELIMIT: "04:00:00"
+
+
+build-tmlqcd/uenv/daint-gh200:
+  extends: [.uenv-runner-daint-gh200, .build/base]
+  needs: [build-base/uenv/daint-gh200]
+  variables:
+    SPEC: "tmlqcd@cicd +lemon +quda ^quda@cicd +qdp +multigrid +twisted_clover +twisted_mass"
+    REPO: "./.ci/uenv-recipes/tmlqcd/daint-gh200/repo/"
+    VARIANTS: "cuda_arch=90,+cuda,+mpi"
+    SLURM_TIMELIMIT: "01:00:00"
+
 
 test/daint-gh200:
-  extends: [.uenv-runner-daint-gh200, .test/hmc, .daint-gh200-vars]
+  extends: [.uenv-runner-daint-gh200, .test/hmc]
+  needs: [build-tmlqcd/uenv/daint-gh200]
   variables:
-    INPUT_FILE: "doc/sample-input/sample-hmc-quda-cscs.input"
     REFPATH: "doc/sample-output/hmc-quda-cscs"
     QUDA_ENABLE_TUNING: 0 # disable tuning
-    QUDA_ENABLE_GDR: 1 # enable GPU-Direct RDMA
+    QUDA_ENABLE_GDR: 0 # disable GPU-Direct RDMA
     SLURM_JOB_NUM_NODES: 2
     SLURM_NTASKS: 8
-    SLURM_TIMELIMIT: "00:30:00"
+    SLURM_TIMELIMIT: "01:00:00"
+  script:
+    - hmc_tm -f doc/sample-input/sample-hmc-quda-cscs.input
diff --git a/.ci/include/cscs/00-variables.yml b/.ci/include/cscs/00-variables.yml
index 033560f69..bdafc1ae0 100644
--- a/.ci/include/cscs/00-variables.yml
+++ b/.ci/include/cscs/00-variables.yml
@@ -16,17 +16,3 @@ variables:
   variables:
     F7T_CLIENT_ID: $F7T_TDS_CONSUMER_KEY
     F7T_CLIENT_SECRET: $F7T_TDS_CONSUMER_SECRET
-
-# Compiler flags for the GH200 nodes
-.daint-gh200-vars:
-  variables:
-    CFLAGS: "-O3 -fopenmp -mtune=neoverse-v2 -mcpu=neoverse-v2"
-    CXXFLAGS: "-O3 -fopenmp -mtune=neoverse-v2 -mcpu=neoverse-v2"
-    LDFLAGS: "-fopenmp"
-
-# Compiler flags for the Mi300A nodes
-.beverin-mi300-vars:
-  variables:
-    CFLAGS: "-O3 -fopenmp -mtune=znver4 -mcpu=znver4"
-    CXXFLAGS: "-O3 -fopenmp -mtune=znver4 -mcpu=znver4"
-    LDFLAGS: "-fopenmp"
diff --git a/.ci/uenv-recipes/tmlqcd/beverin-mi300/config.yaml b/.ci/uenv-recipes/tmlqcd/beverin-mi300/config.yaml
index 7fbeacb99..b4b3c6495 100644
--- a/.ci/uenv-recipes/tmlqcd/beverin-mi300/config.yaml
+++ b/.ci/uenv-recipes/tmlqcd/beverin-mi300/config.yaml
@@ -6,5 +6,5 @@ spack:
   packages:
     repo: https://github.com/spack/spack-packages.git
     commit: 5f20b9190596e0b875141e8cee03f0d3847ad65c
-description: "tmLQCD dependencies for CSCS CI."
+description: "tmLQCD dependencies for CSCS CI on MI300A."
 version: 2
diff --git a/.ci/uenv-recipes/tmlqcd/daint-gh200/config.yaml b/.ci/uenv-recipes/tmlqcd/daint-gh200/config.yaml
index b15e4e7ad..f6b76048e 100644
--- a/.ci/uenv-recipes/tmlqcd/daint-gh200/config.yaml
+++ b/.ci/uenv-recipes/tmlqcd/daint-gh200/config.yaml
@@ -7,5 +7,5 @@ spack:
     repo: https://github.com/spack/spack-packages.git
     #commit: 
 modules: true
-description: "tmLQCD is a freely available software suite providing a set of tools to be used in lattice QCD simulations."
+description: "tmLQCD dependencies for CSCS CI on GH200."
 version: 2
diff --git a/.ci/uenv-recipes/tmlqcd/daint-gh200/environments.yaml b/.ci/uenv-recipes/tmlqcd/daint-gh200/environments.yaml
index fd4c1568f..cafba6582 100644
--- a/.ci/uenv-recipes/tmlqcd/daint-gh200/environments.yaml
+++ b/.ci/uenv-recipes/tmlqcd/daint-gh200/environments.yaml
@@ -5,14 +5,8 @@ gcc-env:
       specs: ['libfabric@2.4.0+cuda']
   unify: true
   specs:
-  - python@3.12
+  - tmlqcd@cmake_support +lemon +quda ^quda@develop +qdp +multigrid +twisted_clover +twisted_mass
   - numdiff
-  - quda@develop +qdp +multigrid +twisted_clover +twisted_mass
-  - lemonio
-  - c-lime
-  - openblas
-  - cmake@3.31
-  - cuda
   variants:
   - +mpi
   - +cuda

From 0929eaf386d9f17983233c14060e79cc00a4df76 Mon Sep 17 00:00:00 2001
From: Roman Gruber <groman@beverin-ln001.cscs.ch>
Date: Thu, 2 Apr 2026 12:21:21 +0200
Subject: [PATCH 77/80] remove v2.2 from daint repo.yaml

---
 .ci/uenv-recipes/tmlqcd/daint-gh200/repo/repo.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/repo.yaml b/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/repo.yaml
index 7070c57de..f08fa46a4 100644
--- a/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/repo.yaml
+++ b/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/repo.yaml
@@ -1,3 +1,2 @@
 repo:
   namespace: apps
-  api: v2.2
\ No newline at end of file

From 3427c5078d87be3a49438411873e462a024eec08 Mon Sep 17 00:00:00 2001
From: Roman Gruber <groman@beverin-ln001.cscs.ch>
Date: Thu, 2 Apr 2026 13:36:29 +0200
Subject: [PATCH 78/80] tidy up and move numdiff check back to script (from
 after_script)

---
 .ci/cscs_beverin_pipeline.yml          | 19 ++++++++++++++-----
 .ci/cscs_default_pipeline.yml          | 19 ++++++++++++++-----
 .ci/include/cscs/02-test-templates.yml | 14 --------------
 3 files changed, 28 insertions(+), 24 deletions(-)

diff --git a/.ci/cscs_beverin_pipeline.yml b/.ci/cscs_beverin_pipeline.yml
index 17ae64a7f..72e6e1211 100644
--- a/.ci/cscs_beverin_pipeline.yml
+++ b/.ci/cscs_beverin_pipeline.yml
@@ -21,19 +21,19 @@ build-base/uenv/beverin-mi300:
 
 build-tmlqcd/uenv/beverin-mi300:
   extends: [.uenv-runner-beverin-mi300, .build/base, .beverin-mi300-secrets]
-  needs: [build-base/uenv/beverin-mi300]
+  needs: build-base/uenv/beverin-mi300
   variables:
     SPEC: "tmlqcd@cicd +lemon +quda ^quda@cicd +qdp +multigrid +twisted_clover +twisted_mass"
-    REPO: "./.ci/uenv-recipes/tmlqcd/beverin-mi300/repo/"
+    REPO: .ci/uenv-recipes/tmlqcd/beverin-mi300/repo
     VARIANTS: "amdgpu_target=gfx942,amdgpu_target_sram_ecc=gfx942,+rocm,+mpi"
     SLURM_TIMELIMIT: "01:00:00"
 
 
 test/beverin-mi300:
-  extends: [.uenv-runner-beverin-mi300, .test/hmc, .beverin-mi300-secrets]
-  needs: [build-tmlqcd/uenv/beverin-mi300]
+  extends: [.uenv-runner-beverin-mi300, .test/base, .beverin-mi300-secrets]
+  needs: build-tmlqcd/uenv/beverin-mi300
   variables:
-    REFPATH: "doc/sample-output/hmc-quda-cscs"
+    REFPATH: doc/sample-output/hmc-quda-cscs
     QUDA_ENABLE_TUNING: 0 # disable tuning
     QUDA_ENABLE_P2P: 0 # disable P2P
     SLURM_JOB_NUM_NODES: 2
@@ -41,3 +41,12 @@ test/beverin-mi300:
     SLURM_TIMELIMIT: "01:00:00"
   script:
     - hmc_tm -f doc/sample-input/sample-hmc-quda-cscs-beverin.input
+    - |
+      if test "${SLURM_PROCID}" -eq "0"; then
+        echo "Check the results on SLURM_PROCID=${SLURM_PROCID} ..."
+        numdiff -r 1.2e-6 -X 1:22 -X 1:5-21 -X 2:22 -X 2:5-21 output.data ${REFPATH}/output.data
+        for i in $(seq 0 2 18); do
+          f=onlinemeas.$(printf %06d $i);
+          numdiff -r 5e-4 ${f} ${REFPATH}/${f};
+        done
+      fi
diff --git a/.ci/cscs_default_pipeline.yml b/.ci/cscs_default_pipeline.yml
index 048117779..91ae03045 100644
--- a/.ci/cscs_default_pipeline.yml
+++ b/.ci/cscs_default_pipeline.yml
@@ -21,19 +21,19 @@ build-base/uenv/daint-gh200:
 
 build-tmlqcd/uenv/daint-gh200:
   extends: [.uenv-runner-daint-gh200, .build/base]
-  needs: [build-base/uenv/daint-gh200]
+  needs: build-base/uenv/daint-gh200
   variables:
     SPEC: "tmlqcd@cicd +lemon +quda ^quda@cicd +qdp +multigrid +twisted_clover +twisted_mass"
-    REPO: "./.ci/uenv-recipes/tmlqcd/daint-gh200/repo/"
+    REPO: .ci/uenv-recipes/tmlqcd/daint-gh200/repo
     VARIANTS: "cuda_arch=90,+cuda,+mpi"
     SLURM_TIMELIMIT: "01:00:00"
 
 
 test/daint-gh200:
-  extends: [.uenv-runner-daint-gh200, .test/hmc]
-  needs: [build-tmlqcd/uenv/daint-gh200]
+  extends: [.uenv-runner-daint-gh200, .test/base]
+  needs: build-tmlqcd/uenv/daint-gh200
   variables:
-    REFPATH: "doc/sample-output/hmc-quda-cscs"
+    REFPATH: doc/sample-output/hmc-quda-cscs
     QUDA_ENABLE_TUNING: 0 # disable tuning
     QUDA_ENABLE_GDR: 0 # enable GPU-Direct RDMA
     SLURM_JOB_NUM_NODES: 2
@@ -41,3 +41,12 @@ test/daint-gh200:
     SLURM_TIMELIMIT: "01:00:00"
   script:
     - hmc_tm -f doc/sample-input/sample-hmc-quda-cscs.input
+    - |
+      if test "${SLURM_PROCID}" -eq "0"; then
+        echo "Check the results on SLURM_PROCID=${SLURM_PROCID} ..."
+        numdiff -r 1.2e-6 -X 1:22 -X 1:5-21 -X 2:22 -X 2:5-21 output.data ${REFPATH}/output.data
+        for i in $(seq 0 2 18); do
+          f=onlinemeas.$(printf %06d $i);
+          numdiff -r 5e-4 ${f} ${REFPATH}/${f};
+        done
+      fi
diff --git a/.ci/include/cscs/02-test-templates.yml b/.ci/include/cscs/02-test-templates.yml
index 2d12cec43..d70a8a92c 100644
--- a/.ci/include/cscs/02-test-templates.yml
+++ b/.ci/include/cscs/02-test-templates.yml
@@ -18,17 +18,3 @@ include:
     - libdir=$(dirname $(echo /dev/shm/spack-stage/*/spack-stage-quda-cicd-*/spack-build-*/lib/libquda.so))
     - export PATH=:${bindir}:$PATH
     - export LD_LIBRARY_PATH=:${libdir}:$LD_LIBRARY_PATH
-
-
-.test/hmc:
-  extends: .test/base
-  after_script:
-    - |
-      if test "${SLURM_PROCID}" -eq "0"; then
-        echo "Check the results on SLURM_PROCID=${SLURM_PROCID} ..."
-        numdiff -r 1.2e-6 -X 1:22 -X 1:5-21 -X 2:22 -X 2:5-21 output.data ${REFPATH}/output.data
-        for i in $(seq 0 2 18); do
-          f=onlinemeas.$(printf %06d $i);
-          numdiff -r 5e-4 ${f} ${REFPATH}/${f};
-        done
-      fi

From 8a7042d672402bde40478fb047b6f0c7ea40b098 Mon Sep 17 00:00:00 2001
From: Roman Gruber <groman@beverin-ln001.cscs.ch>
Date: Thu, 2 Apr 2026 13:38:51 +0200
Subject: [PATCH 79/80] add back brackets and quotes

---
 .ci/cscs_beverin_pipeline.yml | 8 ++++----
 .ci/cscs_default_pipeline.yml | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/.ci/cscs_beverin_pipeline.yml b/.ci/cscs_beverin_pipeline.yml
index 72e6e1211..aa980c2c8 100644
--- a/.ci/cscs_beverin_pipeline.yml
+++ b/.ci/cscs_beverin_pipeline.yml
@@ -21,19 +21,19 @@ build-base/uenv/beverin-mi300:
 
 build-tmlqcd/uenv/beverin-mi300:
   extends: [.uenv-runner-beverin-mi300, .build/base, .beverin-mi300-secrets]
-  needs: build-base/uenv/beverin-mi300
+  needs: [build-base/uenv/beverin-mi300]
   variables:
     SPEC: "tmlqcd@cicd +lemon +quda ^quda@cicd +qdp +multigrid +twisted_clover +twisted_mass"
-    REPO: .ci/uenv-recipes/tmlqcd/beverin-mi300/repo
+    REPO: "./.ci/uenv-recipes/tmlqcd/beverin-mi300/repo/"
     VARIANTS: "amdgpu_target=gfx942,amdgpu_target_sram_ecc=gfx942,+rocm,+mpi"
     SLURM_TIMELIMIT: "01:00:00"
 
 
 test/beverin-mi300:
   extends: [.uenv-runner-beverin-mi300, .test/base, .beverin-mi300-secrets]
-  needs: build-tmlqcd/uenv/beverin-mi300
+  needs: [build-tmlqcd/uenv/beverin-mi300]
   variables:
-    REFPATH: doc/sample-output/hmc-quda-cscs
+    REFPATH: "doc/sample-output/hmc-quda-cscs"
     QUDA_ENABLE_TUNING: 0 # disable tuning
     QUDA_ENABLE_P2P: 0 # disable P2P
     SLURM_JOB_NUM_NODES: 2
diff --git a/.ci/cscs_default_pipeline.yml b/.ci/cscs_default_pipeline.yml
index 91ae03045..c5a336df6 100644
--- a/.ci/cscs_default_pipeline.yml
+++ b/.ci/cscs_default_pipeline.yml
@@ -21,19 +21,19 @@ build-base/uenv/daint-gh200:
 
 build-tmlqcd/uenv/daint-gh200:
   extends: [.uenv-runner-daint-gh200, .build/base]
-  needs: build-base/uenv/daint-gh200
+  needs: [build-base/uenv/daint-gh200]
   variables:
     SPEC: "tmlqcd@cicd +lemon +quda ^quda@cicd +qdp +multigrid +twisted_clover +twisted_mass"
-    REPO: .ci/uenv-recipes/tmlqcd/daint-gh200/repo
+    REPO: "./.ci/uenv-recipes/tmlqcd/daint-gh200/repo/"
     VARIANTS: "cuda_arch=90,+cuda,+mpi"
     SLURM_TIMELIMIT: "01:00:00"
 
 
 test/daint-gh200:
   extends: [.uenv-runner-daint-gh200, .test/base]
-  needs: build-tmlqcd/uenv/daint-gh200
+  needs: [build-tmlqcd/uenv/daint-gh200]
   variables:
-    REFPATH: doc/sample-output/hmc-quda-cscs
+    REFPATH: "doc/sample-output/hmc-quda-cscs"
     QUDA_ENABLE_TUNING: 0 # disable tuning
     QUDA_ENABLE_GDR: 0 # enable GPU-Direct RDMA
     SLURM_JOB_NUM_NODES: 2

From da1082ad2708036a3db24db26fc481571e738c7c Mon Sep 17 00:00:00 2001
From: Roman Gruber <groman@beverin-ln001.cscs.ch>
Date: Thu, 2 Apr 2026 15:47:53 +0200
Subject: [PATCH 80/80] updated README with new features

---
 .ci/README.md | 30 +++++++++++++++++++++++-------
 1 file changed, 23 insertions(+), 7 deletions(-)

diff --git a/.ci/README.md b/.ci/README.md
index 057d45127..42b6b59fd 100644
--- a/.ci/README.md
+++ b/.ci/README.md
@@ -5,24 +5,39 @@ This document describes the external pipeline executed through CSCS.
 The pipeline can be triggered by commenting on a pull request with
 
 ```
-cscs-ci run default  # runs the default pipeline
+cscs-ci run default  # runs the default pipeline (on GH200 nodes @ CSCS)
+cscs-ci run beverin  # runs the beverin pipeline (on MI300A nodes @ CSCS)
 ```
 
 An automatic trigger on all merge-requests is currently disabled.
 
-This pipeline has 2 stages: `build` and `test`.
+This pipeline has 3 stages: `prepare`, `build` and `test`.
 
-The `build` stage builds a uenv image that includes all necessary compilers, MPI libraries and other dependecies to build QUDA and tmLQCD against QUDA. In this stage, QUDA is built correctly for the GH200 machine at CSCS with all required build flags for production runs. The uenv recipe can be found [here](uenv-recipes/tmlqcd/daint-gh200).
+## `prepare` stage
 
-In the `test` stage, the aforementioned uenv image is loaded, tmLQCD is built and linked against the QUDA library that is inside the image. Finally a minimal HMC is executed and checked against some reference data.
+The `prepare` stage builds a uenv image that includes all necessary compilers, MPI libraries and other dependencies to build QUDA and tmLQCD against QUDA. The uenv recipe can be found [here for GH200](uenv-recipes/tmlqcd/daint-gh200) and [here for MI300A](uenv-recipes/tmlqcd/beverin-mi300).
 
-## Force recompilation of quda
+## `build` stage
+
+In the `build` stage, the aforementioned uenv image is loaded and tmLQCD and QUDA are built from their spack packages, using the dependencies from the base image. This stage exposes an artifact with the tmLQCD/QUDA binaries. For tmLQCD, the current branch is compiled. For QUDA, the following environment variables are respected:
+
+  * `QUDA_GIT_REPO`: the git repository URL to use as source (defaults to `https://github.com/lattice/quda.git`)
+  * `QUDA_GIT_BRANCH`: the git branch to compile (defaults to `develop`)
+  * `QUDA_GIT_COMMIT`: the git commit to compile (defaults to the current head commit of `QUDA_GIT_BRANCH`)
+
+Then QUDA is cloned and compiled, completely bypassing the spack compile cache.
+
+## `test` stage
+
+In the `test` stage, the aforementioned uenv image is loaded and tmLQCD and QUDA are unpacked from the artifact. Finally, a minimal HMC is executed and checked against some reference data.
+
+## Force recompilation of base image in `prepare` stage
 
 Remove the build cache:
 
 ```bash
-/capstor/scratch/cscs/${USER}/uenv-cache/user-environment/build_cache/linux-sles15-neoverse_v2/gcc-13.2.0/quda-*
-/capstor/scratch/cscs/${USER}/uenv-cache/user-environment/build_cache/linux-sles15-neoverse_v2-gcc-13.2.0-quda*
+/capstor/scratch/cscs/${USER}/uenv-cache/user-environment/build_cache/linux-sles15-neoverse_v2/gcc-13.2.0/tmlqcd-*
+/capstor/scratch/cscs/${USER}/uenv-cache/user-environment/build_cache/linux-sles15-neoverse_v2-gcc-13.2.0-tmlqcd*
 ```
 
 Or increment the the version counter tag in [.ci/include/cscs/00-variables.yml](include/cscs/00-variables.yml):
@@ -46,3 +61,4 @@ and commit.
 * [CSCS Uenv Writing Documentation](https://eth-cscs.github.io/alps-uenv/)
 * [CSCS Status Page](https://status.cscs.ch/)
 * [CSCS Spack Base Containers](https://github.com/orgs/eth-cscs/packages/container/package/docker-ci-ext%2Fspack-base-containers%2Fspack-build)
+* [SIRIUS CI/CD](https://github.com/electronic-structure/SIRIUS/tree/develop/ci), on which this pipeline is based