diff --git a/README.md b/README.md index d5d4e91..fd96945 100644 --- a/README.md +++ b/README.md @@ -488,6 +488,9 @@ By default, the plugin reads and parses file content when loading as follows: - `.jsonl` — Each line is parsed as a JSON object - All other file types — Loads as raw text or binary content +Compressed files with a `.gz`, `.bz2`, or `.xz` extension are decompressed transparently. The inner file type +determines which reader and read mode are used (e.g. `data.json.gz` uses `json.load`, `data.txt.bz2` loads as plain text). + ### Customizing defaults You can customize this behavior by specifying a file reader that accepts a file-like object returned by `open()`. diff --git a/src/pytest_data_loader/loaders/impl.py b/src/pytest_data_loader/loaders/impl.py index 7c6662c..d90f218 100644 --- a/src/pytest_data_loader/loaders/impl.py +++ b/src/pytest_data_loader/loaders/impl.py @@ -19,7 +19,10 @@ from pytest_data_loader.paths import ( check_and_track_dir, check_circular_symlink, + compression_aware_open, + get_effective_suffix, get_matching_paths, + is_compressed_path, resolve_relative_path, split_glob_path, ) @@ -191,20 +194,24 @@ def __init__(self, *args: Any, gidx: int | None = None, **kwargs: Any): self.file_reader = self.load_attrs.reader self.read_options = self.load_attrs.read_options if not self.file_reader: - if registered_reader := FileReader.get_registered_reader(self.load_attrs.search_from, self.path.suffix): + if registered_reader := FileReader.get_registered_reader( + self.load_attrs.search_from, get_effective_suffix(self.path) + ): self.file_reader = registered_reader.reader if not self.read_options: self.read_options = registered_reader.read_options assert isinstance(self.read_options, HashableDict) self._effective_read_mode: str | None = None - self._is_streamable = self.file_reader is not None or all( - # non-structured text data can be read line by line - [ - self.path.suffix in FileLoader.STREAMABLE_FILE_TYPES, - self.read_mode != "rb", - 
self.load_attrs.onload_func is None, - self.load_attrs.parametrizer_func is None, - ] + self._is_streamable = not is_compressed_path(self.path) and ( + self.file_reader is not None + or all( + [ + get_effective_suffix(self.path) in FileLoader.STREAMABLE_FILE_TYPES, + self.read_mode != "rb", + self.load_attrs.onload_func is None, + self.load_attrs.parametrizer_func is None, + ] + ) ) # Caches used by data loaders. @@ -462,7 +469,7 @@ def _get_file_obj(self) -> IO[Any]: """Get file object from cache or open a new one and cache it""" f = self._cached_file_objects.get((self.path, self.read_options)) if not f or f.closed: - f = open(self.path, **self.read_options) + f = compression_aware_open(self.path, **self.read_options) self._cached_file_objects[(self.path, self.read_options)] = f f.seek(0) return f @@ -506,7 +513,7 @@ def inspect_part_data(pos: int, part: Any) -> None: else: commit(pos, part) - with open(self.path, **self.read_options) as f: + with compression_aware_open(self.path, **self.read_options) as f: if self.file_reader: # NOTE: Do NOT use _read_reader_and_split here to get the split data. Closing the file will invalidate # the cached part data generated by the file reader and cause issues when loading part data later. 
@@ -534,7 +541,7 @@ def _read_file(self) -> str | bytes: if self.read_mode == "auto": # Detect read mode based on sampled data is_binary = False - with open(self.path, "rb") as f: + with compression_aware_open(self.path, mode="rb") as f: chunk = f.read(4096) if chunk: @@ -553,7 +560,7 @@ def _read_file(self) -> str | bytes: if self.read_mode == "r" and "encoding" not in read_options: read_options["encoding"] = "utf-8" - with open(self.path, **read_options) as f: + with compression_aware_open(self.path, **read_options) as f: return f.read() @requires_loader(DataLoaderType.PARAMETRIZE) diff --git a/src/pytest_data_loader/paths.py b/src/pytest_data_loader/paths.py index de1ecd6..06c04ac 100644 --- a/src/pytest_data_loader/paths.py +++ b/src/pytest_data_loader/paths.py @@ -1,15 +1,26 @@ from __future__ import annotations +import bz2 import errno import glob +import gzip +import lzma import os import re +from collections.abc import Callable from functools import lru_cache from pathlib import Path -from typing import Literal +from typing import IO, Any, Literal from pytest_data_loader.exceptions import DataNotFound +_COMPRESSION_OPENERS: dict[str, Callable[..., IO[Any]]] = { + ".gz": gzip.open, + ".bz2": bz2.open, + ".xz": lzma.open, +} +SUPPORTED_COMPRESSION_EXTENSIONS: tuple[str, ...] = tuple(_COMPRESSION_OPENERS) + @lru_cache def resolve_relative_path( @@ -188,3 +199,46 @@ def split_glob_path(path: Path) -> tuple[Path, str]: base = Path(*parts[:split]) pattern = str(Path(*parts[split:])) return base, pattern + + +def is_compressed_path(path: Path) -> bool: + """Return whether the given path is a supported compressed file (.gz/.bz2/.xz). + + :param path: File path to inspect + """ + return path.suffix.lower() in SUPPORTED_COMPRESSION_EXTENSIONS + + +def get_effective_suffix(path: Path) -> str: + """Return the format-bearing suffix of path, skipping a trailing compression suffix when present. 
+ + :param path: File path to inspect + + Examples: + Path("data.json.gz") -> ".json" + Path("data.csv.bz2") -> ".csv" + Path("data.json") -> ".json" + Path("data.gz") -> ".gz" (no inner suffix to expose) + """ + suffixes = path.suffixes + if len(suffixes) >= 2 and is_compressed_path(path): + return suffixes[-2] + return path.suffix + + +def compression_aware_open(path: Path, **open_kwargs: Any) -> IO[Any]: + """Open a file, routing through gzip.open()/bz2.open()/lzma.open() when the suffix matches. + + For compression openers "r" means binary (unlike builtin open() where "r" means text). This function normalizes + the mode so that "r" and "rt" both produce a text-mode stream, matching the semantics of builtin open(). + + :param path: File path to open + :param open_kwargs: Keyword arguments forwarded to the opener (mode, encoding, errors, newline) + """ + opener = _COMPRESSION_OPENERS.get(path.suffix.lower()) + if opener is None: + return open(path, **open_kwargs) + mode = open_kwargs.get("mode") or "r" + # Compression openers treat "r" as binary. Map to "rt" so callers get text mode, matching builtin open. + open_kwargs["mode"] = "rt" if mode in ("r", "rt") else mode + return opener(path, **open_kwargs) diff --git a/tests/.gitattributes b/tests/.gitattributes new file mode 100644 index 0000000..675746c --- /dev/null +++ b/tests/.gitattributes @@ -0,0 +1,2 @@ +# For tests in Windows - Force LF for test data text files so their bytes match compressed counterparts (e.g. 
text.txt vs text.txt.gz) +**/data/** text=auto eol=lf diff --git a/tests/data/files/compressed/comma.csv.xz b/tests/data/files/compressed/comma.csv.xz new file mode 100644 index 0000000..8f6a94a Binary files /dev/null and b/tests/data/files/compressed/comma.csv.xz differ diff --git a/tests/data/files/compressed/data.jsonl.bz2 b/tests/data/files/compressed/data.jsonl.bz2 new file mode 100644 index 0000000..971cdb2 Binary files /dev/null and b/tests/data/files/compressed/data.jsonl.bz2 differ diff --git a/tests/data/files/compressed/image.jpg.gz b/tests/data/files/compressed/image.jpg.gz new file mode 100644 index 0000000..86abcce Binary files /dev/null and b/tests/data/files/compressed/image.jpg.gz differ diff --git a/tests/data/files/compressed/object.json.gz b/tests/data/files/compressed/object.json.gz new file mode 100644 index 0000000..0e9a86e Binary files /dev/null and b/tests/data/files/compressed/object.json.gz differ diff --git a/tests/data/files/compressed/text.txt.gz b/tests/data/files/compressed/text.txt.gz new file mode 100644 index 0000000..3451cd4 Binary files /dev/null and b/tests/data/files/compressed/text.txt.gz differ diff --git a/tests/data/files/compressed/yaml.yml.gz b/tests/data/files/compressed/yaml.yml.gz new file mode 100644 index 0000000..e813768 Binary files /dev/null and b/tests/data/files/compressed/yaml.yml.gz differ diff --git a/tests/data/files/jsonl/data.jsonl b/tests/data/files/jsonl/data.jsonl index 070b519..f679fd2 100644 --- a/tests/data/files/jsonl/data.jsonl +++ b/tests/data/files/jsonl/data.jsonl @@ -1,3 +1,3 @@ {"name": "Alice", "age": 30} {"name": "Bob", "age": 25} -{"name": "Charlie", "age": 35} +{"name": "Charlie", "age": 35} \ No newline at end of file diff --git a/tests/data/files/jsonl/data2.jsonl b/tests/data/files/jsonl/data2.jsonl index 18c8a49..98e7999 100644 --- a/tests/data/files/jsonl/data2.jsonl +++ b/tests/data/files/jsonl/data2.jsonl @@ -1,2 +1,2 @@ {"city": "Tokyo", "country": "Japan"} -{"city": "Paris", 
"country": "France"} +{"city": "Paris", "country": "France"} \ No newline at end of file diff --git a/tests/data/files/yaml/yaml.yml b/tests/data/files/yaml/yaml.yml index cf5b1b0..320ba9c 100644 --- a/tests/data/files/yaml/yaml.yml +++ b/tests/data/files/yaml/yaml.yml @@ -27,4 +27,4 @@ services: depends_on: - db volumes: - pgdata: + pgdata: \ No newline at end of file diff --git a/tests/data/files/yaml/yaml_documents.yml b/tests/data/files/yaml/yaml_documents.yml index 8e907d2..3c5d527 100644 --- a/tests/data/files/yaml/yaml_documents.yml +++ b/tests/data/files/yaml/yaml_documents.yml @@ -30,4 +30,4 @@ spec: - configMapRef: name: app-config - secretRef: - name: app-secret + name: app-secret \ No newline at end of file diff --git a/tests/paths.py b/tests/paths.py index cfc2024..0296df1 100644 --- a/tests/paths.py +++ b/tests/paths.py @@ -35,6 +35,15 @@ PATH_JPEG_FILE = Path(IMAGE_DIR, "image.jpg") PATH_HIDDEN_FILE = Path(SOME_DIR, ".hidden_file") PATH_HIDDEN_DIR = Path(SOME_DIR, ".hidden_dir") +PATH_COMPRESSED_FILE_DIR = Path(FILES_DIR, "compressed") +PATH_JSON_FILE_GZ = PATH_COMPRESSED_FILE_DIR / f"{PATH_JSON_FILE_OBJECT.name}.gz" +PATH_JSONL_FILE_BZ2 = PATH_COMPRESSED_FILE_DIR / f"{PATH_JSONL_FILE.name}.bz2" +PATH_CSV_FILE_XZ = PATH_COMPRESSED_FILE_DIR / f"{PATH_CSV_FILE.name}.xz" +PATH_TEXT_FILE_GZ = PATH_COMPRESSED_FILE_DIR / f"{PATH_TEXT_FILE.name}.gz" +PATH_JPEG_FILE_GZ = PATH_COMPRESSED_FILE_DIR / f"{PATH_JPEG_FILE.name}.gz" +PATH_YAML_FILE_GZ = PATH_COMPRESSED_FILE_DIR / f"{PATH_YAML_FILE.name}.gz" +PATHS_COMPRESSED_TEXT_FILES = [PATH_JSON_FILE_GZ, PATH_JSONL_FILE_BZ2, PATH_CSV_FILE_XZ, PATH_TEXT_FILE_GZ] +PATHS_COMPRESSED_BINARY_FILES = [PATH_JPEG_FILE_GZ] PATHS_TEXT_FILES = [ PATH_TEXT_FILE, PATH_JSON_FILE_SCALAR, diff --git a/tests/tests_loader/test_load_file.py b/tests/tests_loader/test_load_file.py index e2fcb22..0a16151 100644 --- a/tests/tests_loader/test_load_file.py +++ b/tests/tests_loader/test_load_file.py @@ -8,19 +8,23 @@ from tests.paths 
import ( ABS_PATH_LOADER_DIR, PATH_JPEG_FILE, + PATH_JPEG_FILE_GZ, + PATH_JSON_FILE_GZ, PATH_JSON_FILE_NESTED_OBJECT, PATH_JSON_FILE_OBJECT, PATH_TEXT_FILE, + PATH_TEXT_FILE_GZ, ) pytestmark = pytest.mark.loaders # NOTE: # - lazy_loading option is separately tested in another test using pytester -# - This file covers 3 types of data types the plugin handles differently: +# - This file covers 4 types of data types the plugin handles differently: # - text file (no file reader) # - json file (with default file reader) # - binary file +# - compressed files (.gz, .bz2, .xz) for the above # Text file @@ -132,3 +136,32 @@ def test_load_binary_file_with_id(request: FixtureRequest, data: bytes) -> None: def test_load_binary_file_with_marks(request: FixtureRequest, data: bytes) -> None: """Test @load loader with the marks option using binary file""" assert "foo" in {m.name for m in request.node.own_markers} + + +# Compressed files +@load("data", PATH_TEXT_FILE_GZ) +def test_load_compressed_text_file(data: str) -> None: + """Test that @load with a .txt.gz file returns decompressed file data""" + assert isinstance(data, str) + assert data == (ABS_PATH_LOADER_DIR / PATH_TEXT_FILE).read_text() + + +@load("data", PATH_JSON_FILE_GZ) +def test_load_compressed_json_file(data: dict[str, Any]) -> None: + """Test that @load with a .json.gz file resolves to the default json.load reader transparently""" + assert isinstance(data, dict) + assert data == json.loads((ABS_PATH_LOADER_DIR / PATH_JSON_FILE_OBJECT).read_text()) + + +@load("data", PATH_JPEG_FILE_GZ) +def test_load_compressed_autodetects_binary_mode(data: bytes) -> None: + """Test that @load with a .jpg.gz file auto-detects binary mode from decompressed content""" + assert isinstance(data, bytes) + assert data == (ABS_PATH_LOADER_DIR / PATH_JPEG_FILE).read_bytes() + + +@load("data", PATH_TEXT_FILE_GZ, read_options={"mode": "rb"}) +def test_load_compressed_text_with_force_binary(data: bytes) -> None: + """Test that @load with a 
.txt.gz file in binary mode returns decompressed bytes""" + assert isinstance(data, bytes) + assert data == (ABS_PATH_LOADER_DIR / PATH_TEXT_FILE).read_bytes() diff --git a/tests/tests_loader/test_parametrize_dir.py b/tests/tests_loader/test_parametrize_dir.py index e667a0f..0014662 100644 --- a/tests/tests_loader/test_parametrize_dir.py +++ b/tests/tests_loader/test_parametrize_dir.py @@ -1,11 +1,25 @@ +import json from pathlib import Path +from typing import Any import pytest from pytest import FixtureRequest from pytest_data_loader import parametrize_dir +from pytest_data_loader.paths import get_effective_suffix from pytest_data_loader.types import LoadedDataType -from tests.paths import ABS_PATH_LOADER_DIR, IMAGE_DIR, PATH_TEXT_FILE_DIR, SOME_DIR, SOME_DIR_INNER +from tests.paths import ( + ABS_PATH_LOADER_DIR, + IMAGE_DIR, + PATH_COMPRESSED_FILE_DIR, + PATH_JPEG_FILE, + PATH_JSON_FILE_OBJECT, + PATH_TEXT_FILE, + PATH_TEXT_FILE_DIR, + PATH_YAML_FILE, + SOME_DIR, + SOME_DIR_INNER, +) from .helper import get_parametrized_test_idx @@ -118,3 +132,23 @@ def test_parametrize_dir_multi_dirs_recursive(request: FixtureRequest, data: str idx = get_parametrized_test_idx(request, "data") all_expected = ["data0", "data1", "data2", "data3", "data4", "data5", "line0\nline1\nline2"] assert data == all_expected[idx] + + +@parametrize_dir( + ("file_path", "data"), + PATH_COMPRESSED_FILE_DIR, + filter=lambda p: get_effective_suffix(p) in (".txt", ".json", ".yml", ".jpg"), +) +def test_parametrize_dir_with_compressed_files(file_path: Path, data: Any) -> None: + """Test @parametrize_dir loader with compressed files in the directory""" + effective_suffix = get_effective_suffix(file_path) + if effective_suffix == ".txt": + assert data == (ABS_PATH_LOADER_DIR / PATH_TEXT_FILE).read_text() + elif effective_suffix == ".json": + assert data == json.loads((ABS_PATH_LOADER_DIR / PATH_JSON_FILE_OBJECT).read_text()) + elif effective_suffix == ".yml": + assert data == (ABS_PATH_LOADER_DIR / 
PATH_YAML_FILE).read_text() + elif effective_suffix == ".jpg": + assert data == (ABS_PATH_LOADER_DIR / PATH_JPEG_FILE).read_bytes() + else: + raise NotImplementedError("Add test") diff --git a/tests/tests_loader/test_parametrize_file.py b/tests/tests_loader/test_parametrize_file.py index adc3c4c..829fbcb 100644 --- a/tests/tests_loader/test_parametrize_file.py +++ b/tests/tests_loader/test_parametrize_file.py @@ -7,12 +7,16 @@ from tests.paths import ( ABS_PATH_LOADER_DIR, PATH_JPEG_FILE, + PATH_JPEG_FILE_GZ, PATH_JSON_FILE_ARRAY, + PATH_JSON_FILE_GZ, PATH_JSON_FILE_NESTED_OBJECT, PATH_JSON_FILE_OBJECT, PATH_JSON_FILE_SCALAR, PATH_TEXT_FILE, + PATH_TEXT_FILE_GZ, PATH_YAML_FILE, + PATH_YAML_FILE_GZ, ) from .helper import get_parametrized_test_idx @@ -21,11 +25,12 @@ # NOTE: # - lazy_loading option is separately tested in another test using pytester -# - This file covers 4 types of data types the plugin handles differently: +# - This file covers 5 types of data types the plugin handles differently: # - text file (non-structured file, streamable) # - json file (structured file, streamable via the default file reader) # - yaml file (structured file, non-streamable) # - binary file +# - compressed files (.gz, .bz2, .xz) for the above # Text file @@ -267,6 +272,60 @@ def test_parametrize_binary_file_with_marks(request: FixtureRequest, data: bytes assert request.node.get_closest_marker("foo") +# Compressed files +@parametrize("data", PATH_TEXT_FILE_GZ) +def test_parametrize_compressed_text_file(request: FixtureRequest, data: str) -> None: + """Test that @parametrize loader with a .txt.gz file yields one line per parametrized test""" + assert isinstance(data, str) + idx = get_parametrized_test_idx(request, "data") + assert data == f"line{idx}" + + +@parametrize("data", PATH_JSON_FILE_GZ) +def test_parametrize_compressed_json_file(request: FixtureRequest, data: tuple[str, str]) -> None: + """Test that @parametrize loader with a .json.gz file yields one (key, value) tuple per top-level item""" + assert 
isinstance(data, tuple) + idx = get_parametrized_test_idx(request, "data") + assert data == (f"key{idx}", f"value{idx}") + + +@parametrize( + "data", + PATH_YAML_FILE_GZ, + processor=lambda i, *_: str(i), + marks=lambda i, *_: pytest.mark.foo if i % 2 else None, + ids=lambda i, *_: str(i), +) +def test_parametrize_compressed_yaml_file(request: FixtureRequest, data: str) -> None: + """Test @parametrize loader with .yml.gz file""" + assert isinstance(data, str) + idx = get_parametrized_test_idx(request, "data") + assert data == str(idx) + mark = request.node.get_closest_marker("foo") + if idx % 2: + assert mark is not None + else: + assert mark is None + + +@parametrize("data", PATH_JPEG_FILE_GZ, parametrizer=lambda d: _split_jpeg(d)) # noqa: PLW0108 +def test_parametrize_compressed_binary_file_with_parametrizer(request: FixtureRequest, data: bytes) -> None: + """Test @parametrize loader with the parametrizer using compressed binary file""" + assert isinstance(data, bytes) + idx = get_parametrized_test_idx(request, "data") + assert idx in range(3) + if idx == 0: + # Chunk 0 should start with SOI + assert data.startswith(b"\xff\xd8") + elif idx == 1: + # Second chunk must start with SOS + assert data.startswith(b"\xff\xda") + else: + # Last chunk must be EOI + assert data == b"\xff\xd9" + + +# Multi-path @parametrize("data", [PATH_TEXT_FILE, PATH_JSON_FILE_ARRAY]) def test_parametrize_multi_files(request: FixtureRequest, data: str) -> None: """Test @parametrize loader with a list of file paths concatenates all parametrized data""" diff --git a/tests/tests_plugin/test_compression_aware_loading.py b/tests/tests_plugin/test_compression_aware_loading.py new file mode 100644 index 0000000..6b3dd4b --- /dev/null +++ b/tests/tests_plugin/test_compression_aware_loading.py @@ -0,0 +1,126 @@ +import gzip +from pathlib import Path + +import pytest +from pytest import ExitCode, Pytester + +from pytest_data_loader.constants import DEFAULT_LOADER_DIR_NAME +from 
pytest_data_loader.paths import SUPPORTED_COMPRESSION_EXTENSIONS, compression_aware_open + +pytestmark = pytest.mark.plugin + + +class TestCompressionAwareLoading: + """Tests for compression-aware loading""" + + @pytest.fixture(autouse=True) + def data_dir(self, pytester: Pytester) -> Path: + return pytester.mkdir(DEFAULT_LOADER_DIR_NAME) + + @pytest.mark.parametrize("ext", SUPPORTED_COMPRESSION_EXTENSIONS) + def test_load_compressed_file(self, pytester: Pytester, data_dir: Path, ext: str) -> None: + """Test that @load with a compressed file returns the decompressed content""" + text_payload = "line1\nline2\n" + compressed_path = data_dir / f"text.txt{ext}" + with compression_aware_open(compressed_path, mode="wt", encoding="utf-8") as f: + f.write(text_payload) + + pytester.makepyfile(f""" + import pytest_data_loader + + @pytest_data_loader.load("data", {str(compressed_path)!r}) + def test_func(data): + assert isinstance(data, str) + assert data.splitlines() == {text_payload.splitlines()!r} + """) + + result = pytester.runpytest("-v") + assert result.ret == ExitCode.OK + result.assert_outcomes(passed=1) + + def test_parametrize_compressed_file(self, pytester: Pytester, data_dir: Path) -> None: + """Test that @parametrize with a compressed file returns the parametrized decompressed content""" + lines = ["alpha", "beta", "gamma"] + gz_path = data_dir / "test.txt.gz" + with gzip.open(gz_path, "wt", encoding="utf-8") as f: + f.write("\n".join(lines)) + + pytester.makepyfile(f""" + import pytest_data_loader + + @pytest_data_loader.parametrize("data", {str(gz_path)!r}) + def test_func(data): + assert data in {lines!r} + """) + + result = pytester.runpytest("-v") + assert result.ret == ExitCode.OK + result.assert_outcomes(passed=len(lines)) + + def test_parametrize_dir_compressed_files(self, pytester: Pytester, data_dir: Path) -> None: + """Test that @parametrize_dir with compressed files returns each file's decompressed content""" + sub_dir = data_dir / "dir" + 
sub_dir.mkdir() + for ext in SUPPORTED_COMPRESSION_EXTENSIONS: + path = sub_dir / f"test.txt{ext}" + with compression_aware_open(path, mode="wt", encoding="utf-8") as f: + f.write("test\n") + + pytester.makepyfile(f""" + import pytest_data_loader + + @pytest_data_loader.parametrize_dir("data", {sub_dir.name!r}) + def test_func(data): + assert data == "test" + """) + + result = pytester.runpytest("-v") + assert result.ret == ExitCode.OK + result.assert_outcomes(passed=len(SUPPORTED_COMPRESSION_EXTENSIONS)) + + def test_load_compressed_file_with_reader(self, pytester: Pytester, data_dir: Path) -> None: + """Test that a reader passed via the reader option is applied to a compressed file""" + payload = "key1: value1\nkey2: value2\n" + gz_path = data_dir / "data.yml.gz" + with gzip.open(gz_path, "wt", encoding="utf-8") as f: + f.write(payload) + + pytester.makepyfile(f""" + import yaml + import pytest_data_loader + + @pytest_data_loader.load("data", {str(gz_path.name)!r}, reader=yaml.safe_load) + def test_func(data): + assert data == yaml.safe_load({payload!r}) + """) + + result = pytester.runpytest("-v") + assert result.ret == ExitCode.OK + result.assert_outcomes(passed=1) + + def test_compressed_file_with_registered_reader(self, pytester: Pytester, data_dir: Path) -> None: + """Test that a reader registered for the inner suffix (.yml) is applied to a compressed file""" + payload = "key1: value1\nkey2: value2\n" + gz_path = data_dir / "data.yml.gz" + with gzip.open(gz_path, "wt", encoding="utf-8") as f: + f.write(payload) + + pytester.makeconftest(""" + import yaml + import pytest_data_loader + + pytest_data_loader.register_reader(".yml", yaml.safe_load) + """) + + pytester.makepyfile(f""" + import yaml + import pytest_data_loader + + @pytest_data_loader.load("data", {str(gz_path.name)!r}) + def test_func(data): + assert data == yaml.safe_load({payload!r}) + """) + + result = pytester.runpytest("-v") + assert result.ret == ExitCode.OK + result.assert_outcomes(passed=1) diff --git 
a/tests/tests_unit/test_file_loader.py b/tests/tests_unit/test_file_loader.py index 549dc7e..02fbce0 100644 --- a/tests/tests_unit/test_file_loader.py +++ b/tests/tests_unit/test_file_loader.py @@ -1,4 +1,6 @@ import gc +import gzip +import json from collections.abc import Callable from functools import _CacheInfo from pathlib import Path @@ -8,14 +10,19 @@ from pytest_data_loader import load, parametrize, parametrize_dir from pytest_data_loader.loaders.impl import FileLoader +from pytest_data_loader.paths import SUPPORTED_COMPRESSION_EXTENSIONS, compression_aware_open, get_effective_suffix from pytest_data_loader.types import DataLoader, DataLoaderLoadAttrs, LazyLoadedData, LazyLoadedPartData, LoadedData from tests.paths import ( ABS_PATH_LOADER_DIR, PATH_JSON_FILE_ARRAY, + PATH_JSON_FILE_GZ, PATH_JSONL_FILE, PATH_TEXT_FILE, + PATH_TEXT_FILE_GZ, PATH_XML_FILE, PATHS_BINARY_FILES, + PATHS_COMPRESSED_BINARY_FILES, + PATHS_COMPRESSED_TEXT_FILES, PATHS_TEXT_FILES, ) @@ -26,7 +33,9 @@ class TestFileLoader: """Tests for file loader with various file types and loading modes.""" @pytest.mark.parametrize("is_abs_path", [False, True]) - @pytest.mark.parametrize("path", [*PATHS_TEXT_FILES, *PATHS_BINARY_FILES]) + @pytest.mark.parametrize( + "path", [*PATHS_TEXT_FILES, *PATHS_BINARY_FILES, *PATHS_COMPRESSED_TEXT_FILES, *PATHS_COMPRESSED_BINARY_FILES] + ) @pytest.mark.parametrize("lazy_loading", [True, False]) @pytest.mark.parametrize("loader", [load, parametrize, parametrize_dir]) def test_file_loader(self, loader: DataLoader, lazy_loading: bool, path: Path, is_abs_path: bool) -> None: @@ -39,7 +48,10 @@ def test_file_loader(self, loader: DataLoader, lazy_loading: bool, path: Path, i load_from = ABS_PATH_LOADER_DIR if loader == parametrize_dir: path = path.parent - is_binary = abs_file_path.relative_to(ABS_PATH_LOADER_DIR) in PATHS_BINARY_FILES + is_binary = abs_file_path.relative_to(ABS_PATH_LOADER_DIR) in ( + *PATHS_BINARY_FILES, + *PATHS_COMPRESSED_BINARY_FILES, + ) marks 
= (pytest.mark.foo, pytest.mark.bar) load_attrs = DataLoaderLoadAttrs( loader=loader, @@ -54,7 +66,7 @@ def test_file_loader(self, loader: DataLoader, lazy_loading: bool, path: Path, i ) file_loader = FileLoader(abs_file_path, load_attrs, load_from=load_from, strip_trailing_whitespace=True) - if path.suffix == ".json": + if get_effective_suffix(abs_file_path) == ".json": assert file_loader.file_reader is not None loaded_data = file_loader.load() @@ -167,22 +179,31 @@ def _make_load_attrs( parametrizer_func=parametrizer, ) - @pytest.mark.parametrize("path", [*PATHS_TEXT_FILES, *PATHS_BINARY_FILES]) + @pytest.mark.parametrize( + "path", [*PATHS_TEXT_FILES, *PATHS_BINARY_FILES, *PATHS_COMPRESSED_TEXT_FILES, *PATHS_COMPRESSED_BINARY_FILES] + ) @pytest.mark.parametrize("loader", [load, parametrize, parametrize_dir]) def test_lazy_loading_cache_state_transitions(self, loader: DataLoader, path: Path) -> None: """Test that lazy loading correctly transitions cache state before and after resolve and clear_cache.""" abs_file_path = ABS_PATH_LOADER_DIR / path - parametrizer: Callable[..., Any] | None = (lambda x: [x]) if path in PATHS_BINARY_FILES else None + binary_files = (*PATHS_BINARY_FILES, *PATHS_COMPRESSED_BINARY_FILES) + parametrizer: Callable[..., Any] | None = (lambda x: [x]) if path in binary_files else None load_attrs = self._make_load_attrs(loader, path, lazy_loading=True, parametrizer=parametrizer) file_loader = FileLoader( abs_file_path, load_attrs, load_from=ABS_PATH_LOADER_DIR, strip_trailing_whitespace=True ) - if path.suffix == ".json": + if get_effective_suffix(abs_file_path) == ".json": assert file_loader.file_reader is not None lazy_loaded_data = file_loader._load_lazily() - assert file_loader._cached_file_objects == {} + # Non-streamable @parametrize files call _load_now() at collection (to count items). When a + # file_reader is present, _load_now() uses _get_file_obj() which caches the open handle. 
+ # All other paths leave _cached_file_objects empty after _load_lazily(). + if loader == parametrize and not file_loader.is_streamable and file_loader.file_reader is not None: + assert len(file_loader._cached_file_objects) == 1 + else: + assert file_loader._cached_file_objects == {} # For streamable @parametrize files, no lru_cache wrapper is created at load time. # For all other cases, the file_loader is registered eagerly to ensure cleanup even if tests are skipped. if loader == parametrize and file_loader.is_streamable: @@ -206,9 +227,14 @@ def test_lazy_loading_cache_state_transitions(self, loader: DataLoader, path: Pa # The result of _read_reader_and_split() should be cached per reader assert file_loader.file_reader in file_loader._cached_reader_and_split else: - # The file object and resolver should not be cached, but the file loader function used in resolver - # should be cached - assert len(file_loader._cached_file_objects) == 0 + # The file loader function (lru_cache wrapper) should be populated, but the resolver itself + # is not cached. For non-streamable files WITH a file_reader (e.g. compressed JSON/JSONL), + # _load_now uses _get_file_obj() so _cached_file_objects is populated. For those without a + # reader (e.g. XML, CSV), _read_file() is used and _cached_file_objects stays empty. 
+ if file_loader.file_reader is not None: + assert len(file_loader._cached_file_objects) == 1 + else: + assert len(file_loader._cached_file_objects) == 0 assert not hasattr(lazy_data.resolver, "cache_info") assert len(file_loader._cached_functions) == 1 file_loader_func = next(iter(file_loader._cached_functions)) @@ -368,3 +394,127 @@ def test_weakref_finalize_clears_cache_on_gc(self) -> None: assert cached_file_objects_ref == {} assert cached_file_loaders_ref == set() assert cached_reader_split_ref == {} + + +class TestFileLoaderWithCompressedFiles: + """Tests for compression-aware file loading (.gz/.bz2/.xz).""" + + def _make_load_attrs(self, loader: DataLoader, path: Path, *, lazy_loading: bool = False) -> DataLoaderLoadAttrs: + """Create minimal DataLoaderLoadAttrs for the given loader and path. + + :param loader: The data loader to use + :param path: Relative path to the test data file + :param lazy_loading: Whether to use lazy loading + """ + return DataLoaderLoadAttrs( + loader=loader, + search_from=Path(__file__), + fixture_names=("file_path", "data"), + path=path, + lazy_loading=lazy_loading, + ) + + def test_get_effective_suffix_returns_inner_suffix_for_compressed_paths(self) -> None: + """Test that get_effective_suffix strips the compression suffix to expose the inner format suffix""" + assert get_effective_suffix(Path("data.json.gz")) == ".json" + assert get_effective_suffix(Path("data.csv.bz2")) == ".csv" + assert get_effective_suffix(Path("data.txt.xz")) == ".txt" + assert get_effective_suffix(Path("data.JSON.GZ")) == ".JSON" + + def test_get_effective_suffix_returns_suffix_for_non_compressed_paths(self) -> None: + """Test that get_effective_suffix is a no-op for non-compressed paths""" + assert get_effective_suffix(Path("data.json")) == ".json" + assert get_effective_suffix(Path("data.txt")) == ".txt" + + def test_get_effective_suffix_returns_gz_when_no_inner_suffix(self) -> None: + """Test that get_effective_suffix returns the compression suffix 
itself when there is no inner suffix""" + assert get_effective_suffix(Path("data.gz")) == ".gz" + assert get_effective_suffix(Path("data.bz2")) == ".bz2" + assert get_effective_suffix(Path("data.xz")) == ".xz" + + def test_compression_aware_open_routes_gz_through_gzip(self, tmp_path: Path) -> None: + """Test that compression_aware_open opens .gz files via gzip and returns decompressed text""" + payload = "hello compressed world\n" + gz_path = tmp_path / "test.txt.gz" + with gzip.open(gz_path, "wt") as f: + f.write(payload) + + with compression_aware_open(gz_path) as f: + assert f.read() == payload + + def test_compressed_json_resolves_to_default_json_reader(self) -> None: + """Test that FileLoader for a .json.gz file resolves to the default json.load reader""" + abs_path = ABS_PATH_LOADER_DIR / PATH_JSON_FILE_GZ + load_attrs = self._make_load_attrs(load, PATH_JSON_FILE_GZ) + file_loader = FileLoader(abs_path, load_attrs, load_from=ABS_PATH_LOADER_DIR) + + assert file_loader.file_reader is json.load + + def test_compressed_file_disables_streaming(self) -> None: + """Test that FileLoader marks compressed files as non-streamable to avoid O(n) seeks""" + abs_path = ABS_PATH_LOADER_DIR / PATH_TEXT_FILE_GZ + load_attrs = self._make_load_attrs(parametrize, PATH_TEXT_FILE_GZ) + file_loader = FileLoader(abs_path, load_attrs, load_from=ABS_PATH_LOADER_DIR) + + assert not file_loader.is_streamable + + def test_compressed_binary_autodetect_via_decompressed_chunk(self, tmp_path: Path) -> None: + """Test that binary auto-detection probes decompressed bytes, not the gzip magic bytes""" + binary_payload = bytes(range(256)) + gz_path = tmp_path / "binary.dat.gz" + with gzip.open(gz_path, "wb") as f: + f.write(binary_payload) + + load_attrs = DataLoaderLoadAttrs( + loader=load, + search_from=Path(__file__), + fixture_names=("data",), + path=gz_path, + lazy_loading=False, + ) + file_loader = FileLoader(gz_path, load_attrs) + loaded = file_loader.load() + + assert isinstance(loaded, 
LoadedData) + assert loaded.data == binary_payload + assert file_loader.read_mode == "rb" + + def test_compressed_text_autodetect_via_decompressed_chunk(self, tmp_path: Path) -> None: + """Test that text auto-detection probes decompressed bytes and resolves to text mode""" + text_payload = "hello from compressed text\n" + gz_path = tmp_path / "text.dat.gz" + with gzip.open(gz_path, "wt", encoding="utf-8") as f: + f.write(text_payload) + + load_attrs = DataLoaderLoadAttrs( + loader=load, + search_from=Path(__file__), + fixture_names=("data",), + path=gz_path, + lazy_loading=False, + ) + file_loader = FileLoader(gz_path, load_attrs) + loaded = file_loader.load() + + assert isinstance(loaded, LoadedData) + assert loaded.data == text_payload + assert file_loader.read_mode == "r" + + @pytest.mark.parametrize("ext", [x.upper() for x in SUPPORTED_COMPRESSION_EXTENSIONS]) + def test_compressed_uppercase_suffix_is_routed(self, tmp_path: Path, ext: str) -> None: + """Test that uppercase compression suffixes are routed the same as lowercase""" + payload = "case insensitive\n" + path = tmp_path / f"data.txt{ext}" + with compression_aware_open(path, mode="wt") as f: + f.write(payload) + load_attrs = DataLoaderLoadAttrs( + loader=load, + search_from=Path(__file__), + fixture_names=("data",), + path=path, + lazy_loading=False, + ) + file_loader = FileLoader(path, load_attrs) + loaded = file_loader.load() + assert isinstance(loaded, LoadedData) + assert loaded.data == payload