diff --git a/.github/workflows/linux_build_and_test.yaml b/.github/workflows/linux_build_and_test.yaml
index 313126c..6dd8aff 100644
--- a/.github/workflows/linux_build_and_test.yaml
+++ b/.github/workflows/linux_build_and_test.yaml
@@ -2,9 +2,9 @@ name: linux-build-and-test
 
 on:
   push:
-    branches: [ main ]
+    branches: [ main, develop ]
   pull_request:
-    branches: [ main ]
+    branches: [ main, develop ]
 
 jobs:
   linux-build-and-test:
@@ -17,17 +17,11 @@ jobs:
     - name: Set up CMake
       uses: jwlawson/actions-setup-cmake@v2
 
-    - name: Install OpenEXR (Ubuntu)
-      run: sudo apt-get update && sudo apt-get install -y libopenexr-dev libimath-dev
-      
-    - name: Install dependencies
-      run: sudo apt-get update && sudo apt-get install -y g++ ninja-build
-
     - name: Configure
-      run: cmake -S . -B build -G Ninja
+      run: cmake --preset linux-cpu-debug
 
     - name: Build
-      run: cmake --build build --parallel --config Release
+      run: cmake --build --preset linux-cpu-debug --parallel 4  # Makefiles are serial by default; dependencies are now built from source
 
     - name: Run tests
-      run: cd build && ctest --output-on-failure --verbose
+      run: ctest --preset linux-cpu-debug
diff --git a/.gitignore b/.gitignore
index 349d393..5e3c449 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,19 +2,17 @@
 Thumbs.db
 /_deps/
 /CMakeFiles/
-/build/
-/build-debug/
-/build-release/
-/build-mac-debug/
-
+/build*
 /Testing/
 *.dSYM
 /.vscode/
 /.VSCodeCounter/
+/vcpkg/
+/vcpkg_installed/
 
 tests/test_files/
 
 CTestTestfile.cmake
 Makefile
 cmake_install.cmake
-
+cuda-keyring_1.1-1_all.deb
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ea3ba71..5a879fa 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,146 +1,145 @@
-cmake_minimum_required(VERSION 3.23)
+cmake_minimum_required(VERSION 3.26)
 
-project(DenoiseMachineX
-    VERSION 0.1.0
-    LANGUAGES CXX
-)
+project(DenoiseMachineX VERSION 0.1.0 LANGUAGES CXX)
+
+option(BUILD_CUDA       "Build CUDA backends"             ON)
+option(BUILD_TESTING    "Build tests"                     ON)
+
+option(DMX_ENABLE_HEAVY_TESTS "Enable heavy/slow tests"   ON)
+if(DMX_ENABLE_HEAVY_TESTS)
+    add_compile_definitions(DMX_ENABLE_HEAVY_TESTS=1)
+else()
+    add_compile_definitions(DMX_ENABLE_HEAVY_TESTS=0)
+endif()
 
-# ---- Options
-option(BUILD_CUDA     "Build CUDA backends" OFF)
-option(BUILD_TESTING  "Build tests"          ON)
 
-# ---- C++ setup
 set(CMAKE_CXX_STANDARD 20)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 
-set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -g")
+# Compile-time minimum log level (default: 0 = Trace)
+set(DMX_MIN_LOG_LEVEL "0" CACHE STRING "Compile-time minimum log level")
+add_compile_definitions(DMX_MIN_LOG_LEVEL=${DMX_MIN_LOG_LEVEL})
 
-# Output dirs (bin/ lib/)
+# Output dirs
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
 set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
 set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
 
-# ---- Dependencies: OpenEXR + Imath (prefer CONFIG, fallback to modules)
-# Try config packages first (v3+)
-find_package(Imath CONFIG QUIET)
-find_package(OpenEXR CONFIG QUIET)
-
-# Fallback to classic find modules (v2.x dev packages)
-if(NOT Imath_FOUND)
-    find_package(Imath CONFIG REQUIRED)       # provides Imath::Imath and/or IlmBase::Imath
-endif()
-if(NOT OpenEXR_FOUND)
-    find_package(OpenEXR CONFIG REQUIRED)     # provides OpenEXR::OpenEXR or legacy targets
+if(POLICY CMP0169)
+  cmake_policy(SET CMP0169 OLD)
 endif()
 
+include(FetchContent)
 
-if(NOT OpenEXR_FOUND OR NOT Imath_FOUND)
-    message(FATAL_ERROR
-        "\nOpenEXR and/or Imath not found!\n"
-        "Please install them before configuring this project.\n"
-        "On macOS, run:\n"
-        "    brew install openexr imath\n"
-        "On Ubuntu/Debian, run:\n"
-        "    sudo apt install libopenexr-dev libimath-dev\n"
-        "On Windows (vcpkg):\n"
-        "    vcpkg install openexr\n"
-    )
-endif()
-
-# Include headers
-include_directories(${CMAKE_SOURCE_DIR}/include)
-
-# ---- Sources (CPP always; CU optional when BUILD_CUDA=ON)
-file(GLOB_RECURSE SRC_CPP CONFIGURE_DEPENDS
-  ${CMAKE_SOURCE_DIR}/src/*.cpp
+FetchContent_Declare(Imath
+  GIT_REPOSITORY https://github.com/AcademySoftwareFoundation/Imath.git
+  GIT_TAG v3.1.10
+)
+FetchContent_MakeAvailable(Imath)
+FetchContent_Declare(OpenEXR
+  GIT_REPOSITORY https://github.com/AcademySoftwareFoundation/openexr.git
+  GIT_TAG v3.2.4
 )
+FetchContent_MakeAvailable(OpenEXR)
 
-# keep CLI separate so we can add it explicitly to the exe
+# ---- Sources
+file(GLOB_RECURSE SRC_CPP CONFIGURE_DEPENDS ${CMAKE_SOURCE_DIR}/src/*.cpp)
+file(GLOB_RECURSE SRC_CU  CONFIGURE_DEPENDS ${CMAKE_SOURCE_DIR}/src/*.cu)
 set(CLI_MAIN ${CMAKE_SOURCE_DIR}/cli/main.cpp)
 
-
 if(BUILD_CUDA)
   enable_language(CUDA)
   find_package(CUDAToolkit REQUIRED)
   add_compile_definitions(DMX_ENABLE_CUDA=1)
-  file(GLOB_RECURSE SRC_CU CONFIGURE_DEPENDS
-    ${CMAKE_SOURCE_DIR}/src/*.cu
-  )
+  if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+    set(CMAKE_CUDA_ARCHITECTURES "75;86;87;89;90;120" CACHE STRING "CUDA architectures")
+  endif()
 else()
   add_compile_definitions(DMX_ENABLE_CUDA=0)
   set(SRC_CU)
 endif()
 
+# ---- Executable
+add_executable(dmxdenoiser
+  ${CLI_MAIN}
+  ${SRC_CPP}
+  ${SRC_CU}
+)
 
-# ---- Executable (DIRECTLY links all sources → static registrars run)
-add_executable(dmxdenoiser ${SRC_CPP} ${SRC_CU} ${CLI_MAIN})
-
-
-# CLI executable
-#add_executable(dmxdenoiser ${SRC_FILES} cli/main.cpp)
-
-target_include_directories(dmxdenoiser
-    PUBLIC
-        ${CMAKE_SOURCE_DIR}/include
+target_include_directories(dmxdenoiser PRIVATE
+  ${CMAKE_SOURCE_DIR}/include
+  ${imath_SOURCE_DIR}/src
+  ${imath_BINARY_DIR}/config
+  ${openexr_SOURCE_DIR}/src/lib
+  ${openexr_BINARY_DIR}/config
 )
 
-# Link OpenEXR/Imath via whatever targets exist
 target_link_libraries(dmxdenoiser
-    PUBLIC
-        $<$<TARGET_EXISTS:OpenEXR::OpenEXR>:OpenEXR::OpenEXR>
-        $<$<TARGET_EXISTS:Imath::Imath>:Imath::Imath>
-        $<$<TARGET_EXISTS:Imath::Half>:Imath::Half>
-        $<$<TARGET_EXISTS:OpenEXR::IlmImf>:OpenEXR::IlmImf>
-        $<$<TARGET_EXISTS:IlmBase::Imath>:IlmBase::Imath>
-        $<$<TARGET_EXISTS:IlmBase::Half>:IlmBase::Half>
-        $<$<TARGET_EXISTS:IlmBase::IlmThread>:IlmBase::IlmThread>
-        $<$<TARGET_EXISTS:Iex::Iex>:Iex::Iex>
+  PRIVATE
+    Imath::Imath
+    OpenEXR::OpenEXR
 )
 
 if(BUILD_CUDA)
   target_link_libraries(dmxdenoiser PRIVATE CUDA::cudart CUDA::cuda_driver)
-  set_target_properties(dmxdenoiser PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+  set_target_properties(dmxdenoiser PROPERTIES
+    CUDA_SEPARABLE_COMPILATION ON
+    CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES}"
+  )
 endif()
 
-
+# Silence MSVC deprecation spam from OpenEXR threadpool atomics
+if(MSVC)
+  add_compile_definitions(_SILENCE_CXX20_OLD_SHARED_PTR_ATOMIC_SUPPORT_DEPRECATION_WARNING)
+endif()
 
 # ---- Tests
 if(BUILD_TESTING)
-    include(FetchContent)
-    include(CTest)
-    enable_testing()
-    
-    FetchContent_Declare(
-      googletest
-      GIT_REPOSITORY https://github.com/google/googletest.git
-      GIT_TAG        v1.14.0
+  include(CTest)
+  enable_testing()
+  
+  FetchContent_Declare(googletest
+    GIT_REPOSITORY https://github.com/google/googletest.git
+    GIT_TAG v1.14.0
+  )
+  FetchContent_MakeAvailable(googletest)
+
+  file(GLOB TEST_SOURCES CONFIGURE_DEPENDS ${CMAKE_SOURCE_DIR}/tests/*.cpp)
+  foreach(test_src ${TEST_SOURCES})
+    get_filename_component(test_name ${test_src} NAME_WE)
+    add_executable(${test_name}
+      ${SRC_CPP}
+      ${SRC_CU}
+      ${test_src}
     )
-    FetchContent_MakeAvailable(googletest)
-    
-    file(GLOB TEST_SOURCES CONFIGURE_DEPENDS ${CMAKE_SOURCE_DIR}/tests/*.cpp)
-    
-    foreach(test_src ${TEST_SOURCES})
-      get_filename_component(test_name ${test_src} NAME_WE)
-      # Build each test with the same src set (direct linkage)
-      add_executable(${test_name} ${SRC_CPP} ${SRC_CU} ${test_src})
-      target_include_directories(${test_name} PRIVATE ${CMAKE_SOURCE_DIR}/include)
-      target_link_libraries(${test_name} PRIVATE
-        gtest_main
-        $<$<TARGET_EXISTS:OpenEXR::OpenEXR>:OpenEXR::OpenEXR>
-        $<$<TARGET_EXISTS:Imath::Imath>:Imath::Imath>
-        $<$<TARGET_EXISTS:Imath::Half>:Imath::Half>
-        $<$<TARGET_EXISTS:OpenEXR::IlmImf>:OpenEXR::IlmImf>
-        $<$<TARGET_EXISTS:IlmBase::Imath>:IlmBase::Imath>
-        $<$<TARGET_EXISTS:IlmBase::Half>:IlmBase::Half>
-        $<$<TARGET_EXISTS:IlmBase::IlmThread>:IlmBase::IlmThread>
-        $<$<TARGET_EXISTS:Iex::Iex>:Iex::Iex>
+
+    target_include_directories(${test_name} PRIVATE
+      ${CMAKE_SOURCE_DIR}/include
+      ${imath_SOURCE_DIR}/src
+      ${imath_BINARY_DIR}/config
+      ${openexr_SOURCE_DIR}/src/lib
+      ${openexr_BINARY_DIR}/config
+    )
+
+    target_link_libraries(${test_name} PRIVATE
+      gtest_main
+      Imath::Imath
+      OpenEXR::OpenEXR
+    )
+
+    if(BUILD_CUDA)
+      target_link_libraries(${test_name} PRIVATE CUDA::cudart CUDA::cuda_driver)
+      set_target_properties(${test_name} PROPERTIES
+        CUDA_SEPARABLE_COMPILATION ON
+        CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES}"
       )
-      if(BUILD_CUDA)
-        target_link_libraries(${test_name} PRIVATE CUDA::cudart CUDA::cuda_driver)
-        set_target_properties(${test_name} PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
-      endif()
-      add_test(NAME ${test_name} COMMAND ${test_name})
-    endforeach()
-endif()
+    endif()
 
+    if(MSVC)
+      target_compile_definitions(${test_name}  # executables have no consumers, so PRIVATE like every other target_* call here
+        PRIVATE _SILENCE_CXX20_OLD_SHARED_PTR_ATOMIC_SUPPORT_DEPRECATION_WARNING)
+    endif()
 
+    add_test(NAME ${test_name} COMMAND ${test_name})
+  endforeach()
+endif()
diff --git a/CMakePresets.json b/CMakePresets.json
index 9c29a88..9cdb161 100644
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -5,50 +5,212 @@
         "minor": 23
     },
     "configurePresets": [
+    {
+        "name": "mac-cpu-dev",
+        "displayName": "macOS CPU Dev",
+        "generator": "",
+        "binaryDir": "build-mac-cpu-dev",
+        "cacheVariables": {
+            "CMAKE_BUILD_TYPE": "Debug",
+            "BUILD_CUDA": "OFF",
+            "BUILD_TESTING": "ON",
+            "DMX_ENABLE_HEAVY_TESTS": "ON",
+            "DMX_MIN_LOG_LEVEL": "0"
+        }
+    },
     {
         "name": "mac-cpu-debug",
         "displayName": "macOS CPU Debug",
-        "generator": "Ninja",
-        "binaryDir": "build-mac-debug",
+        "generator": "",
+        "binaryDir": "build-mac-cpu-debug",
+        "cacheVariables": {
+            "CMAKE_BUILD_TYPE": "Debug",
+            "BUILD_CUDA": "OFF",
+            "BUILD_TESTING": "ON",
+            "DMX_ENABLE_HEAVY_TESTS": "OFF",
+            "DMX_MIN_LOG_LEVEL": "1"
+        }
+    },
+    {
+        "name": "linux-cpu-dev",
+        "displayName": "Linux CPU Dev",
+        "generator": "Unix Makefiles",
+        "binaryDir": "build-linux-cpu-dev",
+        "cacheVariables": {
+            "CMAKE_BUILD_TYPE": "Debug",
+            "BUILD_CUDA": "OFF",
+            "BUILD_TESTING": "ON",
+            "DMX_ENABLE_HEAVY_TESTS": "ON",
+            "DMX_MIN_LOG_LEVEL": "0"
+        }
+    },
+    {
+        "name": "linux-cpu-debug",
+        "displayName": "Linux CPU Debug",
+        "generator": "Unix Makefiles",
+        "binaryDir": "build-linux-cpu-debug",
         "cacheVariables": {
             "CMAKE_BUILD_TYPE": "Debug",
             "BUILD_CUDA": "OFF",
-            "BUILD_TESTING": "ON"
+            "BUILD_TESTING": "ON",
+            "DMX_ENABLE_HEAVY_TESTS": "OFF",
+            "DMX_MIN_LOG_LEVEL": "1"
+        }
+    },
+    {
+        "name": "linux-cpu-release",
+        "displayName": "Linux CPU Release",
+        "generator": "Unix Makefiles",
+        "binaryDir": "build-linux-cpu-release",
+        "cacheVariables": {
+            "CMAKE_BUILD_TYPE": "Release",
+            "BUILD_CUDA": "OFF",
+            "BUILD_TESTING": "OFF"
         }
     },
     {
-        "name": "linux-cuda-release",
+        "name": "linux-gpu-dev",
+        "displayName": "Linux CUDA Dev",
+        "generator": "Unix Makefiles",
+        "binaryDir": "build-linux-gpu-dev",
+        "cacheVariables": {
+            "CMAKE_BUILD_TYPE": "Debug",
+            "BUILD_CUDA": "ON",
+            "BUILD_TESTING": "ON",
+            "DMX_ENABLE_HEAVY_TESTS": "ON",
+            "DMX_MIN_LOG_LEVEL": "0",
+            "CMAKE_CUDA_COMPILER": "nvcc",
+            "CMAKE_CUDA_ARCHITECTURES": "75;86"
+        }
+    },
+    {
+        "name": "linux-gpu-debug",
+        "displayName": "Linux CUDA Debug",
+        "generator": "Unix Makefiles",
+        "binaryDir": "build-linux-gpu-debug",
+        "cacheVariables": {
+            "CMAKE_BUILD_TYPE": "Debug",
+            "BUILD_CUDA": "ON",
+            "BUILD_TESTING": "ON",
+            "DMX_ENABLE_HEAVY_TESTS": "OFF",
+            "DMX_MIN_LOG_LEVEL": "1",
+            "CMAKE_CUDA_COMPILER": "nvcc",
+            "CMAKE_CUDA_ARCHITECTURES": "75;86"
+        }
+    },
+    {
+        "name": "linux-gpu-release",
         "displayName": "Linux CUDA Release",
-        "generator": "Ninja",
-        "binaryDir": "build-linux-cuda",
+        "generator": "Unix Makefiles",
+        "binaryDir": "build-linux-gpu-release",
         "cacheVariables": {
             "CMAKE_BUILD_TYPE": "Release",
             "BUILD_CUDA": "ON",
-            "BUILD_TESTING": "ON"
+            "BUILD_TESTING": "OFF",
+            "CMAKE_CUDA_COMPILER": "nvcc",
+            "CMAKE_CUDA_ARCHITECTURES": "75;86"
         }
     },
     {
-        "name": "windows-cuda",
+        "name": "windows-gpu-dev",
+        "displayName": "Windows CUDA Dev",
+        "generator": "Ninja Multi-Config",
+        "binaryDir": "${sourceDir}/build-win-gpu-dev",
+        "cacheVariables": {
+            "CMAKE_BUILD_TYPE": "Debug",
+            "BUILD_TESTING": "ON",
+            "BUILD_CUDA": "ON",
+            "DMX_MIN_LOG_LEVEL": "0",
+            "CMAKE_C_COMPILER": "C:/Program Files (x86)/Microsoft Visual Studio/2022/BuildTools/VC/Tools/MSVC/14.44.35207/bin/Hostx64/x64/cl.exe",
+            "CMAKE_CXX_COMPILER": "C:/Program Files (x86)/Microsoft Visual Studio/2022/BuildTools/VC/Tools/MSVC/14.44.35207/bin/Hostx64/x64/cl.exe",
+            "CMAKE_CUDA_COMPILER": "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.0/bin/nvcc.exe",
+            "CMAKE_CUDA_ARCHITECTURES": "native"
+        }
+    },
+    {
+        "name": "windows-gpu-debug",
         "displayName": "Windows CUDA Debug",
         "generator": "Ninja Multi-Config",
-        "binaryDir": "${sourceDir}/build-win-cuda",
+        "binaryDir": "${sourceDir}/build-win-gpu-debug",
+        "cacheVariables": {
+            "CMAKE_BUILD_TYPE": "Debug",
+            "BUILD_TESTING": "ON",
+            "BUILD_CUDA": "ON",
+            "DMX_MIN_LOG_LEVEL": "1",
+            "CMAKE_C_COMPILER": "C:/Program Files (x86)/Microsoft Visual Studio/2022/BuildTools/VC/Tools/MSVC/14.44.35207/bin/Hostx64/x64/cl.exe",
+            "CMAKE_CXX_COMPILER": "C:/Program Files (x86)/Microsoft Visual Studio/2022/BuildTools/VC/Tools/MSVC/14.44.35207/bin/Hostx64/x64/cl.exe",
+            "CMAKE_CUDA_COMPILER": "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.0/bin/nvcc.exe",
+            "CMAKE_CUDA_ARCHITECTURES": "native"
+        }
+    },
+    {
+        "name": "windows-gpu-release",
+        "displayName": "Windows CUDA Release",
+        "generator": "Ninja Multi-Config",
+        "binaryDir": "${sourceDir}/build-win-gpu-release",
         "cacheVariables": {
             "CMAKE_BUILD_TYPE": "Release",
             "BUILD_TESTING": "ON",
             "BUILD_CUDA": "ON",
+            "CMAKE_C_COMPILER": "C:/Program Files (x86)/Microsoft Visual Studio/2022/BuildTools/VC/Tools/MSVC/14.44.35207/bin/Hostx64/x64/cl.exe",
+            "CMAKE_CXX_COMPILER": "C:/Program Files (x86)/Microsoft Visual Studio/2022/BuildTools/VC/Tools/MSVC/14.44.35207/bin/Hostx64/x64/cl.exe",
             "CMAKE_CUDA_COMPILER": "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.0/bin/nvcc.exe",
             "CMAKE_CUDA_ARCHITECTURES": "native"
         }
     }
     ],
     "buildPresets": [
+        { "name": "mac-cpu-dev", "configurePreset": "mac-cpu-dev" },
         { "name": "mac-cpu-debug", "configurePreset": "mac-cpu-debug" },
-        { "name": "linux-cuda-release", "configurePreset": "linux-cuda-release" },
-        { "name": "windows-cuda-debug", "configurePreset": "windows-cuda", "configuration": "Debug" }
+        { "name": "linux-cpu-debug", "configurePreset": "linux-cpu-debug" },
+        { "name": "linux-cpu-release", "configurePreset": "linux-cpu-release" },
+        { "name": "linux-gpu-debug", "configurePreset": "linux-gpu-debug" },
+        { "name": "linux-gpu-release", "configurePreset": "linux-gpu-release" },
+        { "name": "windows-gpu-debug", "configurePreset": "windows-gpu-debug", "configuration": "Debug" },
+        { "name": "windows-gpu-release", "configurePreset": "windows-gpu-release", "configuration": "Release" }
     ],
     "testPresets": [
-        { "name": "mac-cpu-debug", "configurePreset": "mac-cpu-debug" },
-        { "name": "linux-cuda-release", "configurePreset": "linux-cuda-release" },
-        { "name": "windows-cuda_debug-ctest", "configurePreset": "windows-cuda",  "configuration": "Debug" }
+        { 
+            "name": "mac-cpu-dev", 
+            "displayName": "Mac CPU Dev Tests",
+            "configurePreset": "mac-cpu-dev",
+            "output": 
+            {
+                "outputOnFailure": true,
+                "verbosity": "default"
+            },
+            "execution": {
+                "jobs": 16
+            }
+        },
+        { 
+            "name": "mac-cpu-debug", 
+            "displayName": "Mac CPU Debug Tests",
+            "configurePreset": "mac-cpu-debug",
+            "output": 
+            {
+                "outputOnFailure": true,
+                "verbosity": "default"
+            },
+            "execution": {
+                "jobs": 16
+            }
+        },
+        { 
+            "name": "linux-cpu-debug", 
+            "displayName": "Linux CPU Debug Tests",
+            "configurePreset": "linux-cpu-debug",
+            "output": 
+            {
+                "outputOnFailure": true,
+                "verbosity": "verbose"
+            },
+            "execution": {
+                "jobs": 16
+            }
+        },
+        { "name": "linux-gpu-release", "configurePreset": "linux-gpu-release" },
+        { "name": "windows-gpu-debug", "configurePreset": "windows-gpu-debug", "configuration": "Debug" },
+        { "name": "windows-gpu-release", "configurePreset": "windows-gpu-release", "configuration": "Release" }
     ]
 }
diff --git a/cli/main.cpp b/cli/main.cpp
index 1c56143..392f744 100644
--- a/cli/main.cpp
+++ b/cli/main.cpp
@@ -1,11 +1,19 @@
 #include <dmxdenoiser/DMXImage.hpp>
+#include <dmxdenoiser/Logger.hpp>
 #include <iostream>
 
+using namespace dmxdenoiser;
+
 int main(int argc, char** argv)
 {
-    std::cout << "DMX Denoiser run.\nArgs: ";
+    std::cout << "DMX Denoiser v0.1.0.\n";
+
+    // Init log
+    DMX_LOG_INIT(DMX_MIN_LOG_LEVEL, &std::clog, "./dmxdenoiser.log");
+
+    // parse params
     for(int i = 0; i < argc; ++i)
-        std::cout << argv[i] << ' ';
+            std::cout << argv[i] << ' ';
 
     return 0;
 }
diff --git a/command b/command
index 5a0cb99..48da18c 100644
--- a/command
+++ b/command
@@ -11,6 +11,12 @@
     --output filtered.####.exr
 
 
+# Install openexr on Windows
+git clone https://github.com/microsoft/vcpkg.git
+cd vcpkg
+.\bootstrap-vcpkg.bat
+.\vcpkg\vcpkg.exe install --triplet x64-windows
+
 grep CMAKE_BUILD_TYPE build-release/CMakeCache.txt
 # should print: CMAKE_BUILD_TYPE:STRING=Release
 
@@ -25,3 +31,43 @@ cmake -S . -B build/debug   -DCMAKE_BUILD_TYPE=Debug
 Release build & run tests:
 cmake -S . -B build-release -DCMAKE_BUILD_TYPE=Release
 (cmake --build build-release -j 8 && cd build-release && ctest --output-on-failure --verbose -j 8 && cd .. && cd ..)
+
+# Windows
+# 1. cleanup:
+if (Test-Path build) { Remove-Item build -Recurse -Force }
+if (Test-Path CMakeCache.txt) { Remove-Item CMakeCache.txt }
+if (Test-Path CMakeFiles) { Remove-Item CMakeFiles -Recurse -Force }
+
+cmake -S . -B build -G "Visual Studio 17 2022" -A x64 `
+  -DCMAKE_TOOLCHAIN_FILE="$PWD\vcpkg\scripts\buildsystems\vcpkg.cmake"
+cmake --build build -j 8 --config Release
+
+# Build and run:
+call "C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\Common7\Tools\VsDevCmd.bat" -arch=x64 -host_arch=x64
+cmake -S . -B build -G Ninja `
+  -DCMAKE_TOOLCHAIN_FILE="$PWD/vcpkg/scripts/buildsystems/vcpkg.cmake" `
+  -DCMAKE_CXX_COMPILER="C:/Program Files (x86)/Microsoft Visual Studio/2022/BuildTools/VC/Tools/MSVC/14.44.35207/bin/Hostx64/x64/cl.exe" `
+  -DCMAKE_CUDA_COMPILER="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.0/bin/nvcc.exe" `
+  -DCMAKE_CUDA_ARCHITECTURES=native
+cmake --build build -j8
+cd build && ctest --output-on-failure --verbose -j 8 && cd ..
+
+call "C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\Common7\Tools\VsDevCmd.bat" -arch=x64 -host_arch=x64 && ^
+cmake -S . -B build -G Ninja ^
+  -DCMAKE_BUILD_TYPE=Release ^
+  -DCMAKE_TOOLCHAIN_FILE="%CD%\vcpkg\scripts\buildsystems\vcpkg.cmake" ^
+  -DCMAKE_CUDA_COMPILER="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.0/bin/nvcc.exe" ^
+  -DCMAKE_CUDA_ARCHITECTURES=native && ^
+cmake --build build -j16  && ^
+ctest --test-dir build --output-on-failure --verbose -j16
+
+
+cmake --preset mac-cpu-debug
+cmake --build --preset mac-cpu-debug -j16
+ctest --preset mac-cpu-debug --parallel 16 --verbose
+ctest -R ConvolutionFilter_test --preset mac-cpu-debug --parallel 16 --verbose
+
+cmake --preset mac-cpu-dev
+cmake --build --preset mac-cpu-dev -j16
+ctest --preset mac-cpu-dev --parallel 16 --verbose
+ctest -R ConvolutionFilter_test --preset mac-cpu-dev --parallel 16 --verbose
diff --git a/config.json b/config.json
index 274a0d8..70c471f 100644
--- a/config.json
+++ b/config.json
@@ -1,5 +1,4 @@
 {
-
     "gpu": 0,
     "input_mapping": {
         "beauty": "default",
@@ -20,5 +19,4 @@
         "level": "debug",
         "outputFolder": "log"
     }
-        
 }
diff --git a/include/dmxdenoiser/Config.hpp b/include/dmxdenoiser/Config.hpp
index 94a1aa4..fc93e29 100644
--- a/include/dmxdenoiser/Config.hpp
+++ b/include/dmxdenoiser/Config.hpp
@@ -11,12 +11,11 @@ namespace dmxdenoiser {
     #define DMX_DEBUG_BUILD 0
 #endif
 
-
 #if !defined(DMX_ENABLE_CUDA)
     #define DMX_ENABLE_CUDA 0
 #endif
 
-#if defined(__CUDAACC___)
+#if defined(__CUDACC__) || defined(__CUDA_ARCH__) // __CUDACC__: set by nvcc for CUDA sources (host and device passes)
     #define DMX_CPU_GPU __host__ __device__
     #if defined(_MSC_VER)
         #define DMX_INLINE __forceinline
@@ -31,3 +30,4 @@ namespace dmxdenoiser {
         #define DMX_INLINE inline __attribute__((always_inline))
     #endif
 #endif
+
diff --git a/include/dmxdenoiser/DMXError.hpp b/include/dmxdenoiser/DMXError.hpp
index 489848c..380f872 100644
--- a/include/dmxdenoiser/DMXError.hpp
+++ b/include/dmxdenoiser/DMXError.hpp
@@ -17,3 +17,4 @@ enum class DMXError
     #include <stdexcept>
     #define DMX_TRAP(MSG) std::runtime_error(MSG)
 #endif
+
diff --git a/include/dmxdenoiser/DMXImageView.hpp b/include/dmxdenoiser/DMXImageView.hpp
index cca23b0..9dcc046 100644
--- a/include/dmxdenoiser/DMXImageView.hpp
+++ b/include/dmxdenoiser/DMXImageView.hpp
@@ -19,7 +19,7 @@ namespace dmxdenoiser
         int numChannels = DEFAULT_NUM_CHANNELS;
 
         DMX_CPU_GPU
-        bool inBounds(int x, int y, int frame, int layer) {
+        bool inBounds(int x, int y, int frame, int layer) const {
             return  (x >= 0 && x < width ) &&
                     (y >= 0 && y < height ) &&
                     (frame >= 0 && frame < numFrames ) &&
@@ -27,19 +27,18 @@ namespace dmxdenoiser
         }
 
         DMX_CPU_GPU 
-        PixelRGBAView at(int x, int y, int frame, int layer) {
+        float* at(int x, int y, int frame, int layer) {
             int index = getIndex(x, y, frame, layer);
-            return PixelRGBAView{ data[index], data[index + 1], data[index + 2], data[index + 3] };
+            return &data[index];
         }
 
         DMX_CPU_GPU 
-        PixelRGBA get(int x, int y, int frame, int layer) {
-            int index = getIndex(x, y, frame, layer);
-            return PixelRGBA{ data[index], data[index + 1], data[index + 2], data[index + 3] };
+        const float* at(int x, int y, int frame, int layer) const {
+            return &data[getIndex(x, y, frame, layer)]; // getIndex() is const, so no const_cast is needed
         }
-    
+
     private:
-        DMX_CPU_GPU int getIndex(int x, int y, int frame, int layer) {
+        DMX_CPU_GPU int getIndex(int x, int y, int frame, int layer) const {
             return (((frame * numLayers + layer) * height + y) * width + x) * numChannels;
         }
 
diff --git a/include/dmxdenoiser/Filter.hpp b/include/dmxdenoiser/Filter.hpp
index 2fac232..c4f92ad 100644
--- a/include/dmxdenoiser/Filter.hpp
+++ b/include/dmxdenoiser/Filter.hpp
@@ -45,7 +45,7 @@ namespace dmxdenoiser
 
     private:
         // Apply implementation of the filter
-        virtual void applyFilter(const DMXImage& in, DMXImage& out) const = 0; 
+        virtual void applyFilter(const DMXImage& in, DMXImage& out) const = 0;
     };
 
     inline void Filter::resetParams() {
diff --git a/include/dmxdenoiser/Logger.hpp b/include/dmxdenoiser/Logger.hpp
index ac9a910..66bd2e3 100644
--- a/include/dmxdenoiser/Logger.hpp
+++ b/include/dmxdenoiser/Logger.hpp
@@ -15,10 +15,21 @@
 #include <string_view>
 #include <utility>
 
+#ifndef DMX_MIN_LOG_LEVEL
+#define DMX_MIN_LOG_LEVEL 0
+#endif
+
 namespace dmxdenoiser
 {
 
-    enum class LogLevel { Trace, Debug, Info, Warning, Error, Critical};
+    enum class LogLevel { 
+        Trace, 
+        Debug, 
+        Info, 
+        Warning, 
+        Error, 
+        Critical
+    };
 
     inline constexpr std::string_view ToString(LogLevel level) {
         switch (level) {
@@ -55,10 +66,10 @@ namespace dmxdenoiser
             return std::string(buf);
         }
         
-        void init(LogLevel minLevel, std::ostream* os = nullptr, const std::string& filePath = "") {
+        void init(int minLevel, std::ostream* os = nullptr, const std::string& filePath = "") {
             std::lock_guard<std::mutex> lock{m_mutex};
             m_initialized = true;
-            m_minLevel = static_cast<int>(minLevel);
+            m_minLevel = minLevel;
             m_filePath = filePath;
             m_out = os;
             if (m_fileStream.is_open())       // close old file if any
@@ -93,6 +104,10 @@ namespace dmxdenoiser
             }
         }
 
+        void init(LogLevel minLevel, std::ostream* os = nullptr, const std::string& filePath = "") {
+            init(static_cast<int>(minLevel), os, filePath);
+        }
+
         void shutdown() {
             std::lock_guard<std::mutex> lock{m_mutex};
             if (m_fileStream.is_open())
@@ -109,18 +124,7 @@ namespace dmxdenoiser
         void Log(LogLevel level, std::string_view tag, Args&&... args) {
             if (static_cast<int>(level) < m_minLevel)
                 return;
-            /*
-            auto now = std::chrono::system_clock::now();
-            auto t = std::chrono::system_clock::to_time_t(now);
-            std::tm tm{};
-            #if defined(_WIN32)
-                localtime_s(&tm, &t);
-            #else
-                localtime_r(&t, &tm);
-            #endif
-            char buf[24];
-            std::strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S", &tm);
-            */
+
             std::string localTime_{this->localTime()};
 
             // Build message text once (handles empty varargs too), with neat spacing
@@ -165,20 +169,18 @@ namespace dmxdenoiser
 } // namespace dmxdenoiser
 
 // Initialize logger
-#define DMX_LOG_INIT(LOGLEVEL, OSTREAMPTR, FILENAME) \
-    dmxdenoiser::Logger::instance().init((LOGLEVEL), (OSTREAMPTR), (FILENAME)); \
-    dmxdenoiser::Logger::instance().Log(dmxdenoiser::LogLevel::Debug, "Logger", "Logging system initialized")
 
 #if DMX_DEBUG_BUILD
+    #define DMX_LOG_INIT(LOGLEVEL, OSTREAMPTR, FILENAME) /* do/while(0): expands to exactly one statement */ \
+        do { dmxdenoiser::Logger::instance().init((LOGLEVEL), (OSTREAMPTR), (FILENAME)); \
+             dmxdenoiser::Logger::instance().Log(dmxdenoiser::LogLevel::Debug, "Logger", "Logging system initialized"); } while (0)
     #define DMX_LOG_TRACE(tag, ...) dmxdenoiser::Logger::instance().Log(dmxdenoiser::LogLevel::Trace, (tag), ##__VA_ARGS__)
-#else
-    #define DMX_LOG_TRACE(tag, ...) ((void)0)
-#endif
-
-#if DMX_DEBUG_BUILD
     #define DMX_LOG_DEBUG(tag, ...) dmxdenoiser::Logger::instance().Log(dmxdenoiser::LogLevel::Debug, (tag), ##__VA_ARGS__)
 #else
+    #define DMX_LOG_INIT(LOGLEVEL, OSTREAMPTR, FILENAME) \
+        dmxdenoiser::Logger::instance().init((LOGLEVEL), (OSTREAMPTR), (FILENAME))
     #define DMX_LOG_DEBUG(tag, ...) ((void)0)
+    #define DMX_LOG_TRACE(tag, ...) ((void)0)
 #endif
 
 #define DMX_LOG_INFO(tag, ...) dmxdenoiser::Logger::instance().Log(dmxdenoiser::LogLevel::Info, (tag), ##__VA_ARGS__)
diff --git a/include/dmxdenoiser/ThreadPool.hpp b/include/dmxdenoiser/ThreadPool.hpp
index 12dfda1..07e967c 100644
--- a/include/dmxdenoiser/ThreadPool.hpp
+++ b/include/dmxdenoiser/ThreadPool.hpp
@@ -84,7 +84,7 @@ namespace dmxdenoiser
                 }
             );
         }
-        DMX_LOG_INFO("ThreadPool", "ThreadPool ", this, " created with ", threads, " threads.");
+        DMX_LOG_DEBUG("ThreadPool", "ThreadPool ", this, " created with ", threads, " threads.");
     }
 
     template<class F, class... Args>
@@ -118,7 +118,7 @@ namespace dmxdenoiser
         for(std::thread& worker : m_workers)
             worker.join();
         
-        DMX_LOG_INFO("ThreadPool", "ThreadPool ", this, " stopped.");
+        DMX_LOG_TRACE("ThreadPool", "ThreadPool ", this, " stopped.");
     }
 
 } // namespace dmxdenoiser
diff --git a/include/dmxdenoiser/filters/BoxFilter.hpp b/include/dmxdenoiser/filters/BoxFilter.hpp
deleted file mode 100644
index 85dec12..0000000
--- a/include/dmxdenoiser/filters/BoxFilter.hpp
+++ /dev/null
@@ -1,33 +0,0 @@
-// BoxFilter.hpp
-#pragma once
-
-#include <dmxdenoiser/DMXImage.hpp>
-#include <dmxdenoiser/Filter.hpp>
-
-namespace dmxdenoiser
-{
-    /*
-    /// BoxFilter parameters:
-    /// int kernel
-    struct BoxFilter : public Filter
-    {
-        // Parameters
-        int kernel = 1;
-        
-        // Required: unique filter name
-        static constexpr const char* StaticClassName() { return "BoxFilter"; }
-        const char* Name() const override { return StaticClassName(); };
-
-        BoxFilter() = default;
-        BoxFilter(const ParamDictionary& params) { setParams(params); };
-        ~BoxFilter() override = default;
-
-        void setParams(const ParamDictionary& params) override;
-        void apply(DMXImage& img) const override;
-
-        std::string ToString() const override;
-    };
-
-    void applySimpleBoxFilter(DMXImage& img);
-    */
-} // namespace dmxdenoiser
diff --git a/include/dmxdenoiser/filters/ConvolutionCUDA.hpp b/include/dmxdenoiser/filters/ConvolutionCUDA.hpp
new file mode 100644
index 0000000..e0836c1
--- /dev/null
+++ b/include/dmxdenoiser/filters/ConvolutionCUDA.hpp
@@ -0,0 +1,15 @@
+// ConvolutionCUDA.hpp
+#pragma once
+
+#include <dmxdenoiser/DMXImage.hpp>
+#include <dmxdenoiser/DMXImageView.hpp>
+#include <vector>
+// NOTE(review): Kernel2D must also be declared before this point; include its defining header (not visible in this patch).
+
+namespace dmxdenoiser
+{
+
+    void convolve2D_CUDA(const DMXImage& in_, DMXImage& out_, const std::vector<int>& frames_, const std::vector<int>& layers_,
+                                    const Kernel2D& kernel_, float strength, bool filterAlpha);
+
+} // namespace dmxdenoiser
diff --git a/include/dmxdenoiser/utils/NumericUtils.hpp b/include/dmxdenoiser/utils/NumericUtils.hpp
index bd6a49d..66a2cf9 100644
--- a/include/dmxdenoiser/utils/NumericUtils.hpp
+++ b/include/dmxdenoiser/utils/NumericUtils.hpp
@@ -37,14 +37,16 @@ namespace dmxdenoiser
         return abs_c(a - b) < epsilon;
     }
 
-    DMX_CPU_GPU
-    constexpr inline float clampf(float x, float lo, float hi) {
-        return (x > hi) ? hi : ((x < lo) ? lo : x);
+    DMX_CPU_GPU /*constexpr*/ inline int clampi(int x, int min, int max) { // Clamp x into [min, max].
+        return (x > max) ? max : ((x < min) ? min : x);
     }
 
-    // Blend two floats 
-    DMX_CPU_GPU
-    constexpr inline float floatsBlend(float a, float b, float t) noexcept { 
+    DMX_CPU_GPU /*constexpr*/ inline float clampf(float x, float min, float max) { // Clamp x into [min, max].
+        return (x > max) ? max : ((x < min) ? min : x);
+    }
+
+    // Linearly blend from a (t = 0) to b (t = 1); t is clamped to [0, 1] first.
+    DMX_CPU_GPU /*constexpr*/ inline float floatsBlend(float a, float b, float t) noexcept { 
         t = clampf(t, 0.0f, 1.0f);
         return a + (b - a) * t;
     } 
diff --git a/src/filters/BoxFilter.cpp b/src/filters/BoxFilter.cpp
deleted file mode 100644
index 02ad065..0000000
--- a/src/filters/BoxFilter.cpp
+++ /dev/null
@@ -1,41 +0,0 @@
-#include <dmxdenoiser/FilterFactory.hpp>
-#include <dmxdenoiser/FilterKernels.hpp>
-#include <dmxdenoiser/filters/BoxFilter.hpp>
-
-#include <optional>
-#include <iostream>
-#include <vector>
-
-// BoxFilter parameters:
-// int kernel
-
-namespace dmxdenoiser
-{   
-    /*
-    void BoxFilter::setParams(const ParamDictionary& params)
-    {
-        auto kernel_param = params.getSingleParam<int>("kernel");
-        if (kernel_param)
-            kernel = *kernel_param;
-        else
-            throw std::runtime_error("Missing required parameter 'kernel'");
-    };
-
-    void BoxFilter::apply(
-            DMXImage& img, 
-            const std::vector<int>& frames, 
-            const std::vector<std::string>& layers) const
-    {
-        std::cout << "Filtering DMXImage... \n";
-    };
-
-    std::string BoxFilter::ToString() const
-    {
-        return "BoxFilter: kernel=" + std::to_string(kernel);
-    };
-
-    REGISTER_FILTER(BoxFilter)
-    */
-} // namespace dmxdenoiser
-
-
diff --git a/src/filters/ConvolutionCUDA.cu b/src/filters/ConvolutionCUDA.cu
new file mode 100644
index 0000000..4fcf33f
--- /dev/null
+++ b/src/filters/ConvolutionCUDA.cu
@@ -0,0 +1,152 @@
+// ConvolutionCUDA.cu
+// CUDA backend of the 2D convolution filter: device kernel plus host-side launcher.
+#include <dmxdenoiser/filters/ConvolutionCUDA.hpp>
+#include <dmxdenoiser/DMXImage.hpp>
+#include <dmxdenoiser/DMXImageView.hpp>
+#include <dmxdenoiser/Kernel2D.hpp>
+#include <dmxdenoiser/utils/NumericUtils.hpp>
+#include <dmxdenoiser/Pixel.hpp>
+
+#include <cstddef>
+#include <stdexcept>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include <cuda_runtime.h>
+
+// Evaluates a CUDA runtime call and throws std::runtime_error with the CUDA error string on failure.
+#define CUDA_CHECK(x) do { cudaError_t _e = (x); if (_e != cudaSuccess) \
+    throw std::runtime_error(std::string("CUDA error: ")+cudaGetErrorString(_e)); } while(0)
+
+namespace dmxdenoiser
+{
+
+    namespace
+    {
+        // Owns a single cudaMalloc allocation and releases it on scope exit, so device
+        // memory is not leaked when CUDA_CHECK throws part-way through the host function.
+        template <typename T>
+        struct DeviceBuffer
+        {
+            T* ptr = nullptr;
+
+            explicit DeviceBuffer(std::size_t count) { CUDA_CHECK(cudaMalloc(&ptr, count * sizeof(T))); }
+            ~DeviceBuffer() { cudaFree(ptr); }
+
+            DeviceBuffer(const DeviceBuffer&) = delete;
+            DeviceBuffer& operator=(const DeviceBuffer&) = delete;
+        };
+    } // namespace
+
+    // One thread per output pixel; blockIdx.z enumerates the (frame, layer) pairs.
+    // frames/layers hold framesSize/layersSize indices, kernel holds kernelSize*kernelSize
+    // row-major weights. Taps outside the image are clamped to the nearest edge pixel.
+    // NOTE(review): reads/writes four floats per pixel, i.e. assumes RGBA storage - confirm.
+    __global__ void convolve2D_CUDA_kernel(DMXImageView in, DMXImageView out, const int* frames, int framesSize, const int* layers, int layersSize,
+                                    const float* kernel, int kernelSize, float strength, bool filterAlpha)
+    {
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+        const int s = blockIdx.z;
+
+        // Threads of partially filled edge blocks fall outside the image.
+        if (x >= in.width || y >= in.height) return;
+
+        const int total = framesSize * layersSize;
+        if (s >= total) return;
+
+        // Decode the flat z index into a (frame, layer) pair.
+        const int frame = frames[s / layersSize];
+        const int layer = layers[s % layersSize];
+        const int offset = kernelSize >> 1;
+
+        const float* orig = in.at(x, y, frame, layer);
+        float sum_r = 0.0f; float sum_g = 0.0f; float sum_b = 0.0f; float sum_a = 0.0f;
+        for(int ky = -offset; ky <= offset; ++ky)
+        {
+            const int py = clampi(y + ky, 0, in.height - 1);
+            const int krow = (ky + offset)*kernelSize;
+            for(int kx = -offset; kx <= offset; ++kx)
+            {
+                const int px = clampi(x + kx, 0, in.width - 1);
+                const float w = kernel[krow + (kx + offset)];
+                const float* p = in.at(px, py, frame, layer);
+                sum_r += w * p[0];
+                sum_g += w * p[1];
+                sum_b += w * p[2];
+                sum_a += w * p[3];
+            }
+        }
+
+        float* dist = out.at(x, y, frame, layer);
+        dist[0] = floatsBlend(orig[0], sum_r, strength);
+        dist[1] = floatsBlend(orig[1], sum_g, strength);
+        dist[2] = floatsBlend(orig[2], sum_b, strength);
+        // With filterAlpha off, alpha is passed through untouched.
+        dist[3] = filterAlpha ? floatsBlend(orig[3], sum_a, strength) : orig[3];
+    }
+
+    // Host launcher: uploads the image, kernel and index lists, runs the convolution on the
+    // device and moves the result into out_. Throws std::runtime_error on any CUDA failure.
+    void convolve2D_CUDA(const DMXImage& in_, DMXImage& out_, const std::vector<int>& frames_, const std::vector<int>& layers_,
+                                    const Kernel2D& kernel_, float strength, bool filterAlpha)
+    {
+        // Work on a host copy so out_ is only replaced once the whole pipeline succeeded.
+        DMXImage out_tmp = in_;
+
+        const int framesSize = static_cast<int>(frames_.size());
+        const int layersSize = static_cast<int>(layers_.size());
+        const int kernelSize = kernel_.size();
+        const std::size_t kernelCount = static_cast<std::size_t>(kernelSize) * kernelSize;
+        const std::size_t N = in_.data().size();
+
+        // Nothing selected (or an empty image): the result is simply the input copy.
+        if (framesSize == 0 || layersSize == 0 || N == 0)
+        {
+            out_ = std::move(out_tmp);
+            return;
+        }
+
+        DeviceBuffer<int> d_frames(frames_.size());
+        DeviceBuffer<int> d_layers(layers_.size());
+        DeviceBuffer<float> d_kernel(kernelCount);
+        DeviceBuffer<float> d_in(N);
+        DeviceBuffer<float> d_out(N);
+
+        CUDA_CHECK(cudaMemcpy(d_frames.ptr, frames_.data(), framesSize*sizeof(int), cudaMemcpyHostToDevice));
+        CUDA_CHECK(cudaMemcpy(d_layers.ptr, layers_.data(), layersSize*sizeof(int), cudaMemcpyHostToDevice));
+        CUDA_CHECK(cudaMemcpy(d_kernel.ptr, kernel_.m_data.data(), kernelCount*sizeof(float), cudaMemcpyHostToDevice));
+        CUDA_CHECK(cudaMemcpy(d_in.ptr, in_.data().data(), N*sizeof(float), cudaMemcpyHostToDevice));
+        // Seed the output with the input: the kernel writes only the selected frame/layer
+        // slices, so without this every other slice would come back as uninitialized memory.
+        CUDA_CHECK(cudaMemcpy(d_out.ptr, d_in.ptr, N*sizeof(float), cudaMemcpyDeviceToDevice));
+
+        DMXImageView in;
+        in.width = in_.width();
+        in.height = in_.height();
+        in.numLayers = in_.numLayers();
+        in.numFrames = in_.numFrames();
+        in.numChannels = in_.numChannels();
+        in.data = d_in.ptr;
+        DMXImageView out = in;
+        out.data = d_out.ptr;
+
+        const dim3 threads(16, 16, 1);
+        const dim3 blocks(
+            (in_.width() + threads.x - 1)/threads.x,
+            (in_.height() + threads.y - 1)/threads.y,
+            static_cast<unsigned int>(framesSize * layersSize)
+        );
+
+        convolve2D_CUDA_kernel<<<blocks, threads>>>(in, out, d_frames.ptr, framesSize, d_layers.ptr,
+            layersSize, d_kernel.ptr, kernelSize, strength, filterAlpha);
+        CUDA_CHECK(cudaGetLastError());      // launch/configuration errors
+        CUDA_CHECK(cudaDeviceSynchronize()); // errors raised while the kernel ran
+
+        CUDA_CHECK(cudaMemcpy(out_tmp.data().data(), d_out.ptr, N*sizeof(float), cudaMemcpyDeviceToHost));
+
+        out_ = std::move(out_tmp);
+    }
+
+} // namespace dmxdenoiser
diff --git a/src/filters/ConvolutionFilter.cpp b/src/filters/ConvolutionFilter.cpp
index c899645..fa8418d 100644
--- a/src/filters/ConvolutionFilter.cpp
+++ b/src/filters/ConvolutionFilter.cpp
@@ -3,15 +3,12 @@
 #include <dmxdenoiser/StringConversions.hpp>
 #include <dmxdenoiser/FilterFactory.hpp>
 #include <dmxdenoiser/FilterKernels.hpp>
+#include <dmxdenoiser/filters/ConvolutionCUDA.hpp>
 #include <dmxdenoiser/filters/ConvolutionFilter.hpp>
 #include <dmxdenoiser/Logger.hpp>
 #include <dmxdenoiser/Parallel.hpp>
 #include <dmxdenoiser/utils/NumericUtils.hpp>
 
-#if DMX_ENABLE_CUDA
-    #include <dmxdenoiser/ConvolutionCUDA.cu>
-#endif
-
 #include <optional>
 #include <cstdint>
 #include <iostream>
@@ -107,7 +104,7 @@ namespace dmxdenoiser
             DMX_LOG_ERROR("ConvolutionFilter", "setParams(): Missing required parameter 'kernel'");
             throw std::runtime_error("ConvolutionFilter::setParams(): Missing required parameter 'kernel'");
         }
-        DMX_LOG_INFO("ConvolutionFilter", "Setup filter settings:\nParameters:\n", paramsInfo);
+        DMX_LOG_DEBUG("ConvolutionFilter", "Setup filter settings:\nParameters:\n", paramsInfo);
     };
 
     void ConvolutionFilter::convolveCPU(const DMXImage& input, DMXImage& output) const
@@ -116,10 +113,10 @@ namespace dmxdenoiser
         if(!pool)
             DMX_LOG_WARNING("ConvolutionFilter", "convolveCPU(): no ThreadPool available; running single-threaded");
 
-        int width = input.width();
-        int height = input.height();
-        int ksize = m_kernel.size();
-        int offset = ksize/2;
+        const int width = input.width();
+        const int height = input.height();
+        const int ksize = m_kernel.size();
+        const int offset = ksize/2;
 
         std::vector<int> framesIndices;
         // If no specific frames were set, process all frames by default.
@@ -134,8 +131,9 @@ namespace dmxdenoiser
                 if(requestedFrame < input.numFrames())
                     framesIndices.push_back(requestedFrame);
                 else
-                    DMX_LOG_WARNING("ConvolutionFilter", "setParams(): requested frame ", 
-                        requestedFrame, " not found; skipping");
+                    DMX_LOG_WARNING("ConvolutionFilter",
+                        "convolveCPU(): requested frame ", requestedFrame,
+                        " out of range for input; skipping");
             }
         }
 
@@ -153,6 +151,12 @@ namespace dmxdenoiser
             }
         }
 
+        if (framesIndices.empty() || layerIndices.empty()) {
+            DMX_LOG_WARNING("ConvolutionFilter",
+                "convolveCPU(): no valid frames or layers to process; skipping");
+            return; // nothing to filter: output is left exactly as the caller passed it
+        }
+
         for(int frameIdx = 0; frameIdx < framesIndices.size(); ++frameIdx)
         {
             int frame = framesIndices[frameIdx];
@@ -165,12 +169,14 @@ namespace dmxdenoiser
                         PixelRGBA orig = input.get(to_int(x), to_int(y), frame, layer);
                         PixelRGBA sum = {0.0f, 0.0f, 0.0f, 0.0f};
                         for(int ky = -offset; ky <= offset; ++ky)
+                        {
+                            int py = std::clamp(to_int(y) + ky, 0, height - 1);
                             for(int kx = -offset; kx <= offset; ++kx)
                             {
                                 int px = std::clamp(to_int(x) + kx, 0, width - 1);
-                                int py = std::clamp(to_int(y) + ky, 0, height - 1);
                                 sum += m_kernel(ky + offset, kx + offset) * input.get(px, py, frame, layer);
                             }
+                        }
                         sum = blendPixels(orig, sum, m_strength, m_filterAlpha);
                         output.at(to_int(x), to_int(y), frame, layer) = sum;
                     }
@@ -182,7 +188,50 @@ namespace dmxdenoiser
     void ConvolutionFilter::convolveGPU(const DMXImage& input, DMXImage& output) const
     {
         #if DMX_ENABLE_CUDA
-            // GPU logic
+            // Resolve which frames and layers to process, then hand the whole job to the
+            // CUDA implementation. Invalid requests are logged and skipped; if nothing
+            // valid remains the call is a no-op and output is left untouched.
+
+            std::vector<int> framesIndices;
+            // If no specific frames were set, process all frames by default.
+            if (m_frames.empty())
+            {
+                for (int i = 0; i < input.numFrames(); ++i) // Add all frames
+                    framesIndices.push_back(i);
+            } else {
+                for (int i = 0; i < m_frames.size(); ++i)
+                {
+                    int requestedFrame = m_frames[i];
+                    if(requestedFrame >= 0 && requestedFrame < input.numFrames()) // negative indices would read out of bounds on the device
+                        framesIndices.push_back(requestedFrame);
+                    else
+                        DMX_LOG_WARNING("ConvolutionFilter",
+                            "convolveGPU(): requested frame ", requestedFrame,
+                            " out of range for input; skipping");
+                }
+            }
+
+            std::vector<int> layerIndices;
+            // If no specific layers were set, use input.getFilteringLayersIndices().
+            if (m_layers.empty()) {
+                layerIndices = input.getFilteringLayersIndices();
+            } else {
+                for (const auto& layer : m_layers)
+                {
+                    if (input.hasLayer(layer))
+                        layerIndices.push_back(input.getLayerIndex(layer));
+                    else
+                        DMX_LOG_WARNING("ConvolutionFilter", "convolveGPU(): requested layer '", layer, "' not found; skipping");
+                }
+            }
+
+            if (framesIndices.empty() || layerIndices.empty()) {
+                DMX_LOG_WARNING("ConvolutionFilter",
+                    "convolveGPU(): no valid frames or layers to process; skipping");
+                return;
+            }
+            convolve2D_CUDA(input, output, framesIndices, layerIndices, m_kernel, m_strength, m_filterAlpha);
+
         #else
             DMX_LOG_ERROR("ConvolutionFilter", "convolveGPU(): no CUDA build");
             throw std::runtime_error("convolveGPU(): no CUDA build");
@@ -212,6 +261,10 @@ namespace dmxdenoiser
             this->convolveGPU(in, out);
         }  else if (m_backend == Backend::METAL) {
             this->convolveMETAL(in, out);
+        } else {
+            DMX_LOG_ERROR("ConvolutionFilter",
+                "applyFilter(): Unsupported backend: ", dmxdenoiser::ToString(m_backend));
+            throw std::runtime_error("ConvolutionFilter::applyFilter(): Unsupported backend");
         }
     };
 
diff --git a/src/filters/NLMFilter.cpp b/src/filters/NLMFilter.cpp
index 610d0ed..2f12281 100644
--- a/src/filters/NLMFilter.cpp
+++ b/src/filters/NLMFilter.cpp
@@ -8,10 +8,6 @@
 #include <dmxdenoiser/Parallel.hpp>
 #include <dmxdenoiser/utils/NumericUtils.hpp>
 
-#if DMX_ENABLE_CUDA
-    #include <dmxdenoiser/NLMFilterCUDA.cu>
-#endif
-
 #include <optional>
 #include <cstdint>
 #include <cmath>
diff --git a/tests/AssertLogContains.hpp b/tests/AssertLogContains.hpp
index e8adb4b..f8d826e 100644
--- a/tests/AssertLogContains.hpp
+++ b/tests/AssertLogContains.hpp
@@ -20,35 +20,39 @@ namespace dmxdenoiser
     template<typename... Args>
     void assertLogContains(std::string_view logFilePath, Args&&... args)
     {
-        static_assert((is_string_type<Args> && ...), "All arguments must be std::string or const char*");
-
-        ASSERT_TRUE(std::filesystem::exists(logFilePath));
-        ASSERT_GT(std::filesystem::file_size(logFilePath), 0u);
-        std::ifstream ifile{std::string(logFilePath)};
-        ASSERT_TRUE(ifile.good());
-        std::string logText{};
-        std::string line{};
-        while(std::getline(ifile, line))
-            logText += line;
-        (([&](){ EXPECT_NE(logText.find(std::forward<Args>(args)), std::string::npos); }() ), ...);
+        #if DMX_DEBUG_BUILD
+            static_assert((is_string_type<Args> && ...), "All arguments must be std::string or const char*");
+
+            ASSERT_TRUE(std::filesystem::exists(logFilePath));
+            ASSERT_GT(std::filesystem::file_size(logFilePath), 0u);
+            std::ifstream ifile{std::string(logFilePath)};
+            ASSERT_TRUE(ifile.good());
+            std::string logText{};
+            std::string line{};
+            while(std::getline(ifile, line))
+                logText += line;
+            (([&](){ EXPECT_NE(logText.find(std::forward<Args>(args)), std::string::npos); }() ), ...);
+        #endif
     }
 
     template<typename... Args>
     void assertLogDoesNotContain(std::string_view logFilePath, Args&&... args)
     {
-        static_assert((is_string_type<Args> && ...), "All arguments must be std::string or const char*");
-
-        ASSERT_TRUE(std::filesystem::exists(logFilePath));
-        ASSERT_GT(std::filesystem::file_size(logFilePath), 0u);
-        std::ifstream ifile{std::string(logFilePath)};
-        ASSERT_TRUE(ifile.good());
-        std::string logText{};
-        std::string line{};
-        while(std::getline(ifile, line))
-            logText += line;
-        (([&](){ EXPECT_EQ(logText.find(std::forward<Args>(args)), 
-            std::string::npos) << "Log contains substring '"
-            << std::forward<Args>(args) << "' (should be absent)"; }() ), ...);
+        #if DMX_DEBUG_BUILD
+            static_assert((is_string_type<Args> && ...), "All arguments must be std::string or const char*");
+        
+            ASSERT_TRUE(std::filesystem::exists(logFilePath));
+            ASSERT_GT(std::filesystem::file_size(logFilePath), 0u);
+            std::ifstream ifile{std::string(logFilePath)};
+            ASSERT_TRUE(ifile.good());
+            std::string logText{};
+            std::string line{};
+            while(std::getline(ifile, line))
+                logText += line;
+            (([&](){ EXPECT_EQ(logText.find(std::forward<Args>(args)), 
+                std::string::npos) << "Log contains substring '"
+                << std::forward<Args>(args) << "' (should be absent)"; }() ), ...);
+        #endif
     }
 
 } // namespace dmxdenoiser
diff --git a/tests/BuildInfo_test.cpp b/tests/BuildInfo_test.cpp
index 8e97994..708cb74 100644
--- a/tests/BuildInfo_test.cpp
+++ b/tests/BuildInfo_test.cpp
@@ -19,7 +19,7 @@ class BuildInfoTest : public ::testing::Test {
 
     void SetUp() override {
         removeLogFile();
-        DMX_LOG_INIT(LogLevel::Trace, &std::clog, this->getLogPath());
+        DMX_LOG_INIT(DMX_MIN_LOG_LEVEL, &std::clog, this->getLogPath());
     }
 
     void TearDown() override {
diff --git a/tests/ConvolutionFilter_test.cpp b/tests/ConvolutionFilter_test.cpp
index c253941..e41ab67 100644
--- a/tests/ConvolutionFilter_test.cpp
+++ b/tests/ConvolutionFilter_test.cpp
@@ -30,7 +30,7 @@ class ConvolutionFilterTest : public ::testing::Test {
 
     void SetUp() override {
         removeLogFile();
-        DMX_LOG_INIT(LogLevel::Trace, &std::clog, this->getLogPath());
+        DMX_LOG_INIT(DMX_MIN_LOG_LEVEL, &std::clog, this->getLogPath());
     }
 
     void TearDown() override {
@@ -256,3 +256,27 @@ TEST_F(ConvolutionFilterTest, ApplyGaussianFilterKernelToTheImageParallelSingleT
     auto gaussianKernel = FilterKernels::getGaussianKernel(3, sigma);
     applyFilterToImageFile(filename, outputFileName, gaussianKernel);
 }
+
+#if DMX_ENABLE_HEAVY_TESTS // large-kernel runs on the palm image; only built when heavy tests are enabled
+TEST_F(ConvolutionFilterTest, ApplyGaussianFilterKernelToPalmImageKernel17x17CPU)
+{
+    ThreadPool threadPool(0);
+    std::string filename = "../examples/palm_pixel_art.exr";
+    std::string outputFileName = "../tests/test_files/palm_pixel_art_cpu_convo_gaussian_sigma100_17x17.exr";
+    float sigma = 100.0f; // output file name mirrors kernel size and sigma
+    auto gaussianKernel = FilterKernels::getGaussianKernel(17, sigma);
+    applyFilterToImageFile(filename, outputFileName, gaussianKernel, &threadPool, Backend::CPU);
+}
+
+#if DMX_ENABLE_CUDA
+TEST_F(ConvolutionFilterTest, ApplyGaussianFilterKernelToPalmImageKernel17x17GPU)
+{
+    std::string filename = "../examples/palm_pixel_art.exr";
+    std::string outputFileName = "../tests/test_files/palm_pixel_art_gpu_convo_gaussian_sigma100_17x17.exr";
+    float sigma = 100.0f; // same kernel as the CPU test above so the two outputs can be compared
+    auto gaussianKernel = FilterKernels::getGaussianKernel(17, sigma);
+    applyFilterToImageFile(filename, outputFileName, gaussianKernel, nullptr, Backend::GPU);
+}
+#endif // DMX_ENABLE_CUDA
+
+#endif // DMX_ENABLE_HEAVY_TESTS
diff --git a/tests/NLMFilter_test.cpp b/tests/NLMFilter_test.cpp
index 3504f14..86812a9 100644
--- a/tests/NLMFilter_test.cpp
+++ b/tests/NLMFilter_test.cpp
@@ -112,6 +112,7 @@ TEST_F(NLMFilterTest, ApplyNLMFilterKernelToTheImageRabbit)
                             sigmaBeauty, sigmaAlbedo, sigmaNormal, sigmaDepth, &threadPool, { {"beauty", "default"} });
 }
 
+#if DMX_ENABLE_HEAVY_TESTS
 TEST_F(NLMFilterTest, ApplyNLMFilterKernelToTheImageForest)
 {
     ThreadPool threadPool(0);
@@ -130,3 +131,4 @@ TEST_F(NLMFilterTest, ApplyNLMFilterKernelToTheImageForest)
                             outputFileName, radius, patchRadius, sigmaBeauty, sigmaAlbedo, 
                             sigmaNormal, sigmaDepth, &threadPool);
 }
+#endif
diff --git a/todo b/todo
index 5e5015c..bc85235 100644
--- a/todo
+++ b/todo
@@ -1,44 +1,48 @@
-NML filter: CPU CUDA
-ParseArgs: parsing inputs
+Filter: avoid storing frameIndex / layerIndex as members - pass the relevant frames/layers (or indices) to applyFilter().
+Filters: refactor hardcoding including Logs - use Name() method, Backend - repeating in every filter. backend methods -> universal
+Filters: Checking Logs can pick from ConvoFilter
+UMImage: unified memory allocation for CUDA
+Convo and NLM CPU GPU: try to simplify pixel operations and make
+ConvolutionFilter and all filters: ToString() ??
 Move function definitions from headers to cpp files where possible
+
 DMXImage: Duplicate or copy layers from one image to another (for Filter::convolve2D())
 DMXImage: Frame count should be odd.
 DMXImage: frame map (to compute mv guidience)
-DenoiserParams: params
-Cli: full pipeline
-Temporal Filter: CPU CUDA
-ImageIOExr: put header into ImageInfo, then ImageInfo -> DMXImage as a map -> Write correct header metadata
-Pixel, ConvoCPU: simplify, pixel operations
-UMImage: unified memory allocation for CUDA
-DMXError: error handling on cuda (boundCheck etc.) CUDA_CHECK
 DMXImage: rename Data to ptr_to_data — calling x.data().data() just to retrieve a pointer feels awkward.
-ML Samples: video about collecting process
-RgbaSample_test: read channels, compression, check sample -> move it to lib
-ImageIOExr: separate dataWindow and displayWindow in params, fix read, write and ImageInfo to read actual display resolution
-ConvolutionFilter: ToString()
-BuildInfo: data from cmake
+ImageIOExr: separate dataWindow and displayWindow in params, fix read, write and ImageInfo to read actual display resolution ??
+Store EXR header in ImageInfo, pass it to DMXImage as metadata map, and write it back to preserve correct header.
+
+DenoiserParams: params
+ImageSequence: Add multi-frame logic (frame logic)
+CLI: full pipeline
+
+NLM filter: CUDA
+Temporal Filter: CPU CUDA, MV estimator/generator based on MSE and mv guidance
+ConvolutionFilter: add METAL backend implementations
+
+makefile: create filter, build, quality-check, format, etc.
+tools/python wrapper
 Git actions: debug / release
-All code: place Trace Debag in the code
-Cmake: level of verbosity
-Cmake: Release?
-Cmake: Config: debug level - verbosity
+BuildInfo: data from cmake, changelog, version
+Release pipeline
+
+GetParameter: add defaulting
+ParseArgs: parsing inputs
+
+
+ML spatial
+ML Samples: video about collecting process
+
 Parallel: estimate time of parallel - Debug
 ThreadPool: ToString, Backend res -> threadPool - >> to string
-Kernel2D: split into separate cpp and hpp files
-ImageIO: Add unit tests after refactor
-ImageIOExr: Add unit tests for generated layers and frames
-ImageInfo: Update ToString() to print parameters
-ImageSequence: Add multi-frame logic (frame logic)
-ImageIO unit tests: Add frames tests
-Kernel2D unit tests: Add tests for name method
-ImageIOExr - header ACES
+ImageInfo: Update ToString()
+
+
 GUI / async display
-ML spatial
-simple temporalFilter
 noise analyzer
-MV estimator/generator based on MSE and mv guidence
 PSNR/SSIM basic metrics
-ConvolutionFilter: add METAL backend implementations
+
 
         O-------------------O
         |                   |
@@ -52,6 +56,3 @@ ConvolutionFilter: add METAL backend implementations
         |        CLI        |
         O-------------------O
 
-
-Addressed:
-ParamDictionary: default value instead throwing error - addressed. No needed
diff --git a/tools/build.py b/tools/build.py
new file mode 100644
index 0000000..8d09656
--- /dev/null
+++ b/tools/build.py
@@ -0,0 +1,35 @@
+"""Command-line entry point for the project build helper."""
+
+import argparse
+
+
+def build_parser():
+    """Create the argument parser for the builder CLI.
+
+    Returns:
+        argparse.ArgumentParser: parser that accepts an optional ``--preset`` value.
+    """
+    parser = argparse.ArgumentParser(
+        description="Universal project builder CLI"
+    )
+
+    parser.add_argument("--preset")
+    return parser
+
+
+def main(argv=None):
+    """Parse the command line and return the parsed arguments.
+
+    Args:
+        argv: argument list to parse; ``None`` means ``sys.argv[1:]``.
+
+    Returns:
+        argparse.Namespace: the parsed options (currently only ``preset``).
+    """
+    # Parse right away so that --help and argument validation take effect;
+    # building the parser alone does not touch the command line.
+    return build_parser().parse_args(argv)
+
+
+if __name__ == "__main__":
+    main()