diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 68576ec..0ec8690 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -7,6 +7,7 @@ on: branches: ["main", "master"] pull_request: branches: ["main", "master"] + workflow_dispatch: # Enable manual trigger. permissions: contents: read @@ -15,13 +16,12 @@ jobs: build: strategy: matrix: - # Update this as needed: - # Common platforms: ["ubuntu-latest", "macos-latest", "windows-latest"] - os: ["ubuntu-latest"] + # TODO: Fix failing CI on Windows + os: ["ubuntu-latest", "macos-latest"] python-version: ["3.10", "3.11", "3.12", "3.13"] - # Linux only by default. Use ${{ matrix.os }} for other OSes. runs-on: ${{ matrix.os }} + timeout-minutes: 5 steps: # Generally following uv docs: @@ -33,6 +33,12 @@ jobs: # Important for versioning plugins: fetch-depth: 0 + - name: Install libmagic (macOS only) + if: matrix.os == 'macos-latest' + run: | + brew update + brew install libmagic + - name: Install uv (official Astral action) uses: astral-sh/setup-uv@v5 with: diff --git a/README.md b/README.md index 375c699..a60838d 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ The output parquet file (`file_index.parquet`) has the following columns: * folder_path * file_name * file_size_bytes - * md5_hash_hex + * md5_hex * sha256_base64 * date_created * date_modified diff --git a/src/folder_indexer/indexer.py b/src/folder_indexer/indexer.py index 286f413..da4d0bb 100644 --- a/src/folder_indexer/indexer.py +++ b/src/folder_indexer/indexer.py @@ -282,19 +282,17 @@ def get_file_info( magic_file_type_1 = magic_worker.from_file(file_path) # SHA256 hash, if file size < 100 KiB - with add_time_taken(time_taken_log, "sha256_hash_1"): + with add_time_taken(time_taken_log, "sha256_base64"): sha256_base64 = None if file_size < BIG_FILE_SIZE_THRESHOLD_BYTES: - with file_path.open("rb") as f: - sha256_hash = hashlib.sha256(f.read()).digest() - sha256_base64 = base64.b64encode(sha256_hash).decode("utf-8") + sha256_hash = hashlib.sha256(file_path.read_bytes()).digest() + sha256_base64 = base64.b64encode(sha256_hash).decode("utf-8") - with add_time_taken(time_taken_log, "md5_hash_hex_1"): - md5_hash_hex = None + with add_time_taken(time_taken_log, "md5_hex"): + md5_hex = None if file_size < BIG_FILE_SIZE_THRESHOLD_BYTES: - with file_path.open("rb") as f: - md5_hash = hashlib.md5(f.read()).digest() - md5_hash_hex = binascii.hexlify(md5_hash).decode("utf-8") + md5_hash = hashlib.md5(file_path.read_bytes()).digest() + md5_hex = binascii.hexlify(md5_hash).decode("utf-8") # Append file information to the list with add_time_taken(time_taken_log, "append_to_list"): @@ -306,7 +304,7 @@ def get_file_info( "folder_path": file_path_stripped.parent.as_posix(), "file_name": file_path_stripped.name, "file_size_bytes": file_size, - "md5_hex": md5_hash_hex, + "md5_hex": md5_hex, "sha256_base64": sha256_base64, "date_created": date_created, "date_modified": date_modified, @@ -328,6 +326,7 @@ def save_to_parquet(file_data: list[dict[str, Any]], output_folder: Path) -> Non "folder_path": pl.String, "file_name": pl.String, "file_size_bytes": pl.UInt64, + "md5_hex": pl.String, "sha256_base64": pl.String, "date_created": pl.Datetime, "date_modified": pl.Datetime, diff --git a/tests/test_main_cli.py b/tests/test_main_cli.py index 77405d4..5ec53fb 100644 --- a/tests/test_main_cli.py +++ b/tests/test_main_cli.py @@ -19,13 +19,14 @@ def test_with_single_file(tmp_path: Path) -> None: assert (tmp_path / "file_index.parquet").is_file() df = pl.read_parquet(tmp_path / "file_index.parquet") - assert df.shape == (1, 12) + assert df.shape == (1, 13) assert df.columns == [ "file_path", "folder_path", "file_name", "file_size_bytes", + "md5_hex", "sha256_base64", "date_created", "date_modified",