From baae7454a30b4034cd446e9188e78a6d0bef893f Mon Sep 17 00:00:00 2001 From: RecRanger <168371178+RecRanger@users.noreply.github.com> Date: Sun, 4 Jan 2026 18:09:48 -0700 Subject: [PATCH 1/4] Run CI on all matrix OSes --- .github/workflows/main.yml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 68576ec..5196373 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -7,6 +7,7 @@ on: branches: ["main", "master"] pull_request: branches: ["main", "master"] + workflow_dispatch: # Enable manual trigger. permissions: contents: read @@ -15,12 +16,9 @@ jobs: build: strategy: matrix: - # Update this as needed: - # Common platforms: ["ubuntu-latest", "macos-latest", "windows-latest"] - os: ["ubuntu-latest"] + os: ["ubuntu-latest", "macos-latest", "windows-latest"] python-version: ["3.10", "3.11", "3.12", "3.13"] - # Linux only by default. Use ${{ matrix.os }} for other OSes. runs-on: ${{ matrix.os }} steps: From 69b4dc5fafce2a84a737f2b6600d967153ad69e2 Mon Sep 17 00:00:00 2001 From: RecRanger <168371178+RecRanger@users.noreply.github.com> Date: Sun, 4 Jan 2026 18:12:00 -0700 Subject: [PATCH 2/4] Install libmagic on macOS --- .github/workflows/main.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 5196373..d6d1c8a 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -31,6 +31,12 @@ jobs: # Important for versioning plugins: fetch-depth: 0 + - name: Install libmagic (macOS only) + if: matrix.os == 'macos-latest' + run: | + brew update + brew install libmagic + - name: Install uv (official Astral action) uses: astral-sh/setup-uv@v5 with: From 387ee42e783d290e1f37a37d24e2d7db06d3342e Mon Sep 17 00:00:00 2001 From: RecRanger <168371178+RecRanger@users.noreply.github.com> Date: Sun, 4 Jan 2026 18:30:29 -0700 Subject: [PATCH 3/4] Switch to `Path.read_bytes()` --- .github/workflows/main.yml | 1 + README.md | 2 +- src/folder_indexer/indexer.py | 19 +++++++++---------- tests/test_main_cli.py | 3 ++- 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index d6d1c8a..21876d6 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -20,6 +20,7 @@ jobs: python-version: ["3.10", "3.11", "3.12", "3.13"] runs-on: ${{ matrix.os }} + timeout-minutes: 5 steps: # Generally following uv docs: diff --git a/README.md b/README.md index 375c699..a60838d 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ The output parquet file (`file_index.parquet`) has the following columns: * folder_path * file_name * file_size_bytes - * md5_hash_hex + * md5_hex * sha256_base64 * date_created * date_modified diff --git a/src/folder_indexer/indexer.py b/src/folder_indexer/indexer.py index 286f413..da4d0bb 100644 --- a/src/folder_indexer/indexer.py +++ b/src/folder_indexer/indexer.py @@ -282,19 +282,17 @@ def get_file_info( magic_file_type_1 = magic_worker.from_file(file_path) # SHA256 hash, if file size < 100 KiB - with add_time_taken(time_taken_log, "sha256_hash_1"): + with add_time_taken(time_taken_log, "sha256_base64"): sha256_base64 = None if file_size < BIG_FILE_SIZE_THRESHOLD_BYTES: - with file_path.open("rb") as f: - sha256_hash = hashlib.sha256(f.read()).digest() - sha256_base64 = base64.b64encode(sha256_hash).decode("utf-8") + sha256_hash = hashlib.sha256(file_path.read_bytes()).digest() + sha256_base64 = base64.b64encode(sha256_hash).decode("utf-8") - with add_time_taken(time_taken_log, "md5_hash_hex_1"): - md5_hash_hex = None + with add_time_taken(time_taken_log, "md5_hex"): + md5_hex = None if file_size < BIG_FILE_SIZE_THRESHOLD_BYTES: - with file_path.open("rb") as f: - md5_hash = hashlib.md5(f.read()).digest() - md5_hash_hex = binascii.hexlify(md5_hash).decode("utf-8") + md5_hash = hashlib.md5(file_path.read_bytes()).digest() + md5_hex = binascii.hexlify(md5_hash).decode("utf-8") # Append file information to the list with add_time_taken(time_taken_log, "append_to_list"): @@ -306,7 +304,7 @@ def get_file_info( "folder_path": file_path_stripped.parent.as_posix(), "file_name": file_path_stripped.name, "file_size_bytes": file_size, - "md5_hex": md5_hash_hex, + "md5_hex": md5_hex, "sha256_base64": sha256_base64, "date_created": date_created, "date_modified": date_modified, @@ -328,6 +326,7 @@ def save_to_parquet(file_data: list[dict[str, Any]], output_folder: Path) -> Non "folder_path": pl.String, "file_name": pl.String, "file_size_bytes": pl.UInt64, + "md5_hex": pl.String, "sha256_base64": pl.String, "date_created": pl.Datetime, "date_modified": pl.Datetime, diff --git a/tests/test_main_cli.py b/tests/test_main_cli.py index 77405d4..5ec53fb 100644 --- a/tests/test_main_cli.py +++ b/tests/test_main_cli.py @@ -19,13 +19,14 @@ def test_with_single_file(tmp_path: Path) -> None: assert (tmp_path / "file_index.parquet").is_file() df = pl.read_parquet(tmp_path / "file_index.parquet") - assert df.shape == (1, 12) + assert df.shape == (1, 13) assert df.columns == [ "file_path", "folder_path", "file_name", "file_size_bytes", + "md5_hex", "sha256_base64", "date_created", "date_modified", From 9f7d41e83c36f5f0fdedfbe178ab18fe876896fe Mon Sep 17 00:00:00 2001 From: RecRanger <168371178+RecRanger@users.noreply.github.com> Date: Sun, 4 Jan 2026 18:38:01 -0700 Subject: [PATCH 4/4] Disable CI on Windows --- .github/workflows/main.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 21876d6..0ec8690 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -16,7 +16,8 @@ jobs: build: strategy: matrix: - os: ["ubuntu-latest", "macos-latest", "windows-latest"] + # TODO: Fix failing CI on Windows + os: ["ubuntu-latest", "macos-latest"] python-version: ["3.10", "3.11", "3.12", "3.13"] runs-on: ${{ matrix.os }}