From 65d648080b3884b3ced0ecb6428ad8521013f9b0 Mon Sep 17 00:00:00 2001 From: Yugo Kato Date: Thu, 14 May 2026 09:58:51 -0700 Subject: [PATCH] Add support of compression aware loading for .gz, .bz2, .xz files --- README.md | 3 + src/pytest_data_loader/loaders/impl.py | 33 ++-- src/pytest_data_loader/paths.py | 56 +++++- tests/.gitattributes | 2 + tests/data/files/compressed/comma.csv.xz | Bin 0 -> 108 bytes tests/data/files/compressed/data.jsonl.bz2 | Bin 0 -> 85 bytes tests/data/files/compressed/image.jpg.gz | Bin 0 -> 1610 bytes tests/data/files/compressed/object.json.gz | Bin 0 -> 55 bytes tests/data/files/compressed/text.txt.gz | Bin 0 -> 32 bytes tests/data/files/compressed/yaml.yml.gz | Bin 0 -> 327 bytes tests/data/files/jsonl/data.jsonl | 2 +- tests/data/files/jsonl/data2.jsonl | 2 +- tests/data/files/yaml/yaml.yml | 2 +- tests/data/files/yaml/yaml_documents.yml | 2 +- tests/paths.py | 9 + tests/tests_loader/test_load_file.py | 35 +++- tests/tests_loader/test_parametrize_dir.py | 36 +++- tests/tests_loader/test_parametrize_file.py | 61 ++++++- .../test_compression_aware_loading.py | 126 +++++++++++++ tests/tests_unit/test_file_loader.py | 170 ++++++++++++++++-- 20 files changed, 508 insertions(+), 31 deletions(-) create mode 100644 tests/.gitattributes create mode 100644 tests/data/files/compressed/comma.csv.xz create mode 100644 tests/data/files/compressed/data.jsonl.bz2 create mode 100644 tests/data/files/compressed/image.jpg.gz create mode 100644 tests/data/files/compressed/object.json.gz create mode 100644 tests/data/files/compressed/text.txt.gz create mode 100644 tests/data/files/compressed/yaml.yml.gz create mode 100644 tests/tests_plugin/test_compression_aware_loading.py diff --git a/README.md b/README.md index d5d4e91..fd96945 100644 --- a/README.md +++ b/README.md @@ -488,6 +488,9 @@ By default, the plugin reads and parses file content when loading as follows: - `.jsonl` — Each line is parsed as a JSON object - All other file types — Loads as raw 
text or binary content +Compressed files with a `.gz`, `.bz2`, or `.xz` extension are decompressed transparently. The inner file type +determines which reader and read mode are used (e.g. `data.json.gz` uses `json.load`, `data.txt.bz2` loads as plain text). + ### Customizing defaults You can customize this behavior by specifying a file reader that accepts a file-like object returned by `open()`. diff --git a/src/pytest_data_loader/loaders/impl.py b/src/pytest_data_loader/loaders/impl.py index 7c6662c..d90f218 100644 --- a/src/pytest_data_loader/loaders/impl.py +++ b/src/pytest_data_loader/loaders/impl.py @@ -19,7 +19,10 @@ from pytest_data_loader.paths import ( check_and_track_dir, check_circular_symlink, + compression_aware_open, + get_effective_suffix, get_matching_paths, + is_compressed_path, resolve_relative_path, split_glob_path, ) @@ -191,20 +194,24 @@ def __init__(self, *args: Any, gidx: int | None = None, **kwargs: Any): self.file_reader = self.load_attrs.reader self.read_options = self.load_attrs.read_options if not self.file_reader: - if registered_reader := FileReader.get_registered_reader(self.load_attrs.search_from, self.path.suffix): + if registered_reader := FileReader.get_registered_reader( + self.load_attrs.search_from, get_effective_suffix(self.path) + ): self.file_reader = registered_reader.reader if not self.read_options: self.read_options = registered_reader.read_options assert isinstance(self.read_options, HashableDict) self._effective_read_mode: str | None = None - self._is_streamable = self.file_reader is not None or all( - # non-structured text data can be read line by line - [ - self.path.suffix in FileLoader.STREAMABLE_FILE_TYPES, - self.read_mode != "rb", - self.load_attrs.onload_func is None, - self.load_attrs.parametrizer_func is None, - ] + self._is_streamable = not is_compressed_path(self.path) and ( + self.file_reader is not None + or all( + [ + get_effective_suffix(self.path) in FileLoader.STREAMABLE_FILE_TYPES, + self.read_mode != 
"rb", + self.load_attrs.onload_func is None, + self.load_attrs.parametrizer_func is None, + ] + ) ) # Caches used by data loaders. @@ -462,7 +469,7 @@ def _get_file_obj(self) -> IO[Any]: """Get file object from cache or open a new one and cache it""" f = self._cached_file_objects.get((self.path, self.read_options)) if not f or f.closed: - f = open(self.path, **self.read_options) + f = compression_aware_open(self.path, **self.read_options) self._cached_file_objects[(self.path, self.read_options)] = f f.seek(0) return f @@ -506,7 +513,7 @@ def inspect_part_data(pos: int, part: Any) -> None: else: commit(pos, part) - with open(self.path, **self.read_options) as f: + with compression_aware_open(self.path, **self.read_options) as f: if self.file_reader: # NOTE: Do NOT use _read_reader_and_split here to get the split data. Closing the file will invalidate # the cached part data generated by the file reader and cause issues when loading part data later. @@ -534,7 +541,7 @@ def _read_file(self) -> str | bytes: if self.read_mode == "auto": # Detect read mode based on sampled data is_binary = False - with open(self.path, "rb") as f: + with compression_aware_open(self.path, mode="rb") as f: chunk = f.read(4096) if chunk: @@ -553,7 +560,7 @@ def _read_file(self) -> str | bytes: if self.read_mode == "r" and "encoding" not in read_options: read_options["encoding"] = "utf-8" - with open(self.path, **read_options) as f: + with compression_aware_open(self.path, **read_options) as f: return f.read() @requires_loader(DataLoaderType.PARAMETRIZE) diff --git a/src/pytest_data_loader/paths.py b/src/pytest_data_loader/paths.py index de1ecd6..06c04ac 100644 --- a/src/pytest_data_loader/paths.py +++ b/src/pytest_data_loader/paths.py @@ -1,15 +1,26 @@ from __future__ import annotations +import bz2 import errno import glob +import gzip +import lzma import os import re +from collections.abc import Callable from functools import lru_cache from pathlib import Path -from typing import Literal 
+from typing import IO, Any, Literal from pytest_data_loader.exceptions import DataNotFound +_COMPRESSION_OPENERS: dict[str, Callable[..., IO[Any]]] = { + ".gz": gzip.open, + ".bz2": bz2.open, + ".xz": lzma.open, +} +SUPPORTED_COMPRESSION_EXTENSIONS: tuple[str, ...] = tuple(_COMPRESSION_OPENERS) + @lru_cache def resolve_relative_path( @@ -188,3 +199,46 @@ def split_glob_path(path: Path) -> tuple[Path, str]: base = Path(*parts[:split]) pattern = str(Path(*parts[split:])) return base, pattern + + +def is_compressed_path(path: Path) -> bool: + """Return whether the given path is a supported compressed file (.gz/.bz2/.xz). + + :param path: File path to inspect + """ + return path.suffix.lower() in SUPPORTED_COMPRESSION_EXTENSIONS + + +def get_effective_suffix(path: Path) -> str: + """Return the format-bearing suffix of path, skipping a trailing compression suffix when present. + + :param path: File path to inspect + + Examples: + Path("data.json.gz") -> ".json" + Path("data.csv.bz2") -> ".csv" + Path("data.json") -> ".json" + Path("data.gz") -> ".gz" (no inner suffix to expose) + """ + suffixes = path.suffixes + if len(suffixes) >= 2 and is_compressed_path(path): + return suffixes[-2] + return path.suffix + + +def compression_aware_open(path: Path, **open_kwargs: Any) -> IO[Any]: + """Open a file, routing through gzip.open()/bz2.open()/lzma.open() when the suffix matches. + + For compression openers "r" means binary (unlike builtin open() where "r" means text). This function normalizes + the mode so that "r" and "rt" both produce a text-mode stream, matching the semantics of builtin open(). + + :param path: File path to open + :param open_kwargs: Keyword arguments forwarded to the opener (mode, encoding, errors, newline) + """ + opener = _COMPRESSION_OPENERS.get(path.suffix.lower()) + if opener is None: + return open(path, **open_kwargs) + mode = open_kwargs.get("mode") or "r" + # Compression openers treat "r" as binary. 
Map to "rt" so callers get text mode, matching builtin open. + open_kwargs["mode"] = "rt" if mode in ("r", "rt") else mode + return opener(path, **open_kwargs) diff --git a/tests/.gitattributes b/tests/.gitattributes new file mode 100644 index 0000000..675746c --- /dev/null +++ b/tests/.gitattributes @@ -0,0 +1,2 @@ +# For tests in Windows - Force LF for test data text files so their bytes match compressed counterparts (e.g. text.txt vs text.txt.gz) +**/data/** text=auto eol=lf diff --git a/tests/data/files/compressed/comma.csv.xz b/tests/data/files/compressed/comma.csv.xz new file mode 100644 index 0000000000000000000000000000000000000000..8f6a94a95d61a6d71006fb10fb49b092547ed364 GIT binary patch literal 108 zcmexsUKJ6=z`*kC+7>q^21Q0O1_p)_{ill`FnBWP#xj(zyZp>wfNsrmKaFNB#OzmH~A#FnIJ!o)+@9=w@K_^m_TyWV-ye M&$Wyo36{tx0BD{n-2eap literal 0 HcmV?d00001 diff --git a/tests/data/files/compressed/data.jsonl.bz2 b/tests/data/files/compressed/data.jsonl.bz2 new file mode 100644 index 0000000000000000000000000000000000000000..971cdb22ce32e534cd3deaf6ff12a0ccf7a12647 GIT binary patch literal 85 zcmV-b0IL5&T4*^jL0KkKS>I*Yxc~qt-GBfPPy|{KH~>25kP09GP@n(+$&fasGy^~b r#U#kMV{Q^xqD)HAYSAIjBkx)V9C%MQniFG_E%>{VDZ+$?`!2=E=(Hf# literal 0 HcmV?d00001 diff --git a/tests/data/files/compressed/image.jpg.gz b/tests/data/files/compressed/image.jpg.gz new file mode 100644 index 0000000000000000000000000000000000000000..86abcce8c3b1981db0561e3bde1c4c129a792519 GIT binary patch literal 1610 zcmV-Q2DSMgiwFP!00002|CLaCRMK}8|Ni(P4>J&7S(ssf_?V{nnnsdnC~O@?ohGvu zMn1zJiPX%QspZT?Dk}3aHGHkKElXFXHM164o_o$c z_j5j<^SM_$tQ`mXTsDUdAP4|xrGWMs2-%Ss6A8f1oxlbFfB{Ho1JD6~^8k24#D8Ws zWCdWbszVt5JFf!zZ)+a|Z-7OiF=!MPgTd(PV%Ol-;c;u%;>?VU3F|hSlgSq5Bofue z#g0m&Taifi%q?^`cLsw&v175lJlHN<8Sa0ZK)SlRI6WK@k0-iYk}Tc-H?%ha0RtR> z9}H~<2m%BXAZ;fwTgj*cLI0))C;%aK&@ckC!n)`K2%&?3*1!n#>Tw7dAazg#0}HgF 
ziK7=UL?Ag{&}U@4Ij@pTp|c{+R;|cPAprl^as@CFLaZuHtrHQ&1^FAz=I6d88Sab~YNf>^dK+H!+C4tyZUpm@J**pCZwaMjCHWminZnG_LH849$)WgsC%kO;tP-yU~J^stq!&kZcRM+kf(lcw< zOyN1?3!*fyt-zzA{E*UuVPrWuH8$Xo#O}DX%}3}g%ReC5rT%D(PrdlESICG-S_SqM zW#h72cFbInp?AjgbI)66ETYa%{A4OlE>><0Q%|E`E39k3KIU^!eo2n6(0F7!L{6T^ zecVKE4yaY?uDzf#CfTn4VOE1mIalVo;~Qy9|F;ffRugHmxUewzrm<;JsSmyGRV6Fk zP(63Lffxk+1yS*OBT+`uM} zp$aTAEkTY$`KG;l!kc-P4=wPCaAwnUH~$4BO8n*w;{kr! zytegjFzt)d{X#gG`2~9drr36e>}@FzX-SHH`p3nB#K`%}#Ui^a5*|@RhOEn?p-CrV z`2FbqKSs^=9gKz!E?UOsTFra5T01#9cPKIysY#SFTg|e0`#n9;^`je#9FOdFI^1<) zeLgjLiD_)jvBsb;g6UguGoBXbYPNh_-BsN}A2-!a7G%m9YDc@AOT9%gc244+5fK^7 zqHO@Uo6t$m9bUgQ7QHqRG(U(E=;U|eqVC(|BvuJO>}Brcc_;Te_ssBM@%6p>(E70h zr~8?S1Bqw%YXRE2+$p5M^?ix6M$1NZ3}XSQ$8rs{uX#7oc&{cJ}Zb6Ltn(zK8M0#^>Xuqh}&%lnL`uW7)g#xBW2M|mD5xfK{HtFDV;D0jFd IX$Mpe02Pc9^8f$< literal 0 HcmV?d00001 diff --git a/tests/data/files/compressed/text.txt.gz b/tests/data/files/compressed/text.txt.gz new file mode 100644 index 0000000000000000000000000000000000000000..3451cd46eb708b60fcb1cbd7c9f5c6d8a4ed1e2c GIT binary patch literal 32 kcmb2|=3oE;rvIl;p7Az)aEftKTf!v{hPE|Z?g|3M0kP-{WdHyG literal 0 HcmV?d00001 diff --git a/tests/data/files/compressed/yaml.yml.gz b/tests/data/files/compressed/yaml.yml.gz new file mode 100644 index 0000000000000000000000000000000000000000..e813768af0596a2dbe3ac66b59e235ca1b7b7a47 GIT binary patch literal 327 zcmV-N0l5AjiwFo_8U<Y%Y0iYyf>x(Qbk;6n*zA8ee=GWkV9vCu?BFL?;2s z9!aqcX+oiE0q5VZbm*8Fduf|<&b{Z{+m0#C>k5H|=d2bp%~Z$FOrse9M9TgGAa8j| z5pbYEEFf#u=x+{yCouP1*G2s?w-~EBuIfs#s=0{*e8L-?gjpKzk*U{Bp^OWJt#L$> zXnm;_UDF)-OYa4WE6r^sWs+4?XC~p@&QeBc0WL_vMjfAh3!7&{RD%F1FXXDHVnxT~ zw2i|gOOr50pbzuNL^w&_k8xl;hLn$rV1vLta95YDV58;8f|7 None: def test_load_binary_file_with_marks(request: FixtureRequest, data: bytes) -> None: """Test @load loader with the marks option using binary file""" assert "foo" in {m.name for m in request.node.own_markers} + + +# Compressed files +@load("data", PATH_TEXT_FILE_GZ) +def 
test_load_compressed_text_file(data: str) -> None: + """Test that @load with a .txt.gz file returns decompressed file data""" + assert isinstance(data, str) + assert data == (ABS_PATH_LOADER_DIR / PATH_TEXT_FILE).read_text() + + +@load("data", PATH_JSON_FILE_GZ) +def test_load_compressed_json_file(data: dict[str, Any]) -> None: + """Test that @load with a .json.gz file resolves to the default json.load reader transparently""" + assert isinstance(data, dict) + assert data == json.loads((ABS_PATH_LOADER_DIR / PATH_JSON_FILE_OBJECT).read_text()) + + +@load("data", PATH_JPEG_FILE_GZ) +def test_load_compressed_autodetects_binary_mode(data: bytes) -> None: + """Test that @load with a .jpg.gz file auto-detects binary mode from decompressed content""" + assert isinstance(data, bytes) + assert data == (ABS_PATH_LOADER_DIR / PATH_JPEG_FILE).read_bytes() + + +@load("data", PATH_TEXT_FILE_GZ, read_options={"mode": "rb"}) +def test_load_compressed_text_with_force_binary(data: bytes) -> None: + """Test that @load with a .txt.gz file in binary mode returns decompressed bytes""" + assert isinstance(data, bytes) + assert data == (ABS_PATH_LOADER_DIR / PATH_TEXT_FILE).read_bytes() diff --git a/tests/tests_loader/test_parametrize_dir.py b/tests/tests_loader/test_parametrize_dir.py index e667a0f..0014662 100644 --- a/tests/tests_loader/test_parametrize_dir.py +++ b/tests/tests_loader/test_parametrize_dir.py @@ -1,11 +1,25 @@ +import json from pathlib import Path +from typing import Any import pytest from pytest import FixtureRequest from pytest_data_loader import parametrize_dir +from pytest_data_loader.paths import get_effective_suffix from pytest_data_loader.types import LoadedDataType -from tests.paths import ABS_PATH_LOADER_DIR, IMAGE_DIR, PATH_TEXT_FILE_DIR, SOME_DIR, SOME_DIR_INNER +from tests.paths import ( + ABS_PATH_LOADER_DIR, + IMAGE_DIR, + PATH_COMPRESSED_FILE_DIR, + PATH_JPEG_FILE, + PATH_JSON_FILE_OBJECT, + PATH_TEXT_FILE, + PATH_TEXT_FILE_DIR, + PATH_YAML_FILE, + 
SOME_DIR, + SOME_DIR_INNER, +) from .helper import get_parametrized_test_idx @@ -118,3 +132,23 @@ def test_parametrize_dir_multi_dirs_recursive(request: FixtureRequest, data: str idx = get_parametrized_test_idx(request, "data") all_expected = ["data0", "data1", "data2", "data3", "data4", "data5", "line0\nline1\nline2"] assert data == all_expected[idx] + + +@parametrize_dir( + ("file_path", "data"), + PATH_COMPRESSED_FILE_DIR, + filter=lambda p: get_effective_suffix(p) in (".txt", ".json", ".yml", ".jpg"), +) +def test_parametrize_dir_with_compressed_files(file_path: Path, data: Any) -> None: + """Test @parametrize_dir loader with compressed files in the directory""" + effective_suffix = get_effective_suffix(file_path) + if effective_suffix == ".txt": + assert data == (ABS_PATH_LOADER_DIR / PATH_TEXT_FILE).read_text() + elif effective_suffix == ".json": + assert data == json.loads((ABS_PATH_LOADER_DIR / PATH_JSON_FILE_OBJECT).read_text()) + elif effective_suffix == ".yml": + assert data == (ABS_PATH_LOADER_DIR / PATH_YAML_FILE).read_text() + elif effective_suffix == ".jpg": + assert data == (ABS_PATH_LOADER_DIR / PATH_JPEG_FILE).read_bytes() + else: + raise NotImplementedError("Add test") diff --git a/tests/tests_loader/test_parametrize_file.py b/tests/tests_loader/test_parametrize_file.py index adc3c4c..829fbcb 100644 --- a/tests/tests_loader/test_parametrize_file.py +++ b/tests/tests_loader/test_parametrize_file.py @@ -7,12 +7,16 @@ from tests.paths import ( ABS_PATH_LOADER_DIR, PATH_JPEG_FILE, + PATH_JPEG_FILE_GZ, PATH_JSON_FILE_ARRAY, + PATH_JSON_FILE_GZ, PATH_JSON_FILE_NESTED_OBJECT, PATH_JSON_FILE_OBJECT, PATH_JSON_FILE_SCALAR, PATH_TEXT_FILE, + PATH_TEXT_FILE_GZ, PATH_YAML_FILE, + PATH_YAML_FILE_GZ, ) from .helper import get_parametrized_test_idx @@ -21,11 +25,12 @@ # NOTE: # - lazy_loading option is separately tested in another test using pytester -# - This file covers 4 types of data types the plugin handles differently: +# - This file covers 5 types of 
data types the plugin handles differently: # - text file (non-structured file, streamable) # - json file (structured file, streamable via the default file reader) # - yaml file (structured file, non-streamable) # - binary file +# - compressed files (.gz, .bz2, .xz) for the above # Text file @@ -267,6 +272,60 @@ def test_parametrize_binary_file_with_marks(request: FixtureRequest, data: bytes assert request.node.get_closest_marker("foo") +# Compressed files +@parametrize("data", PATH_TEXT_FILE_GZ) +def test_parametrize_compressed_text_file(request: FixtureRequest, data: str) -> None: + """Test that @parametrize loader with a .txt.gz file yields each line""" + assert isinstance(data, str) + idx = get_parametrized_test_idx(request, "data") + assert data == f"line{idx}" + + +@parametrize("data", PATH_JSON_FILE_GZ) +def test_parametrize_compressed_json_file(request: FixtureRequest, data: tuple[str, str]) -> None: + """Test that @parametrize loader with a .json.gz file yields one (key, value) tuple per item""" + assert isinstance(data, tuple) + idx = get_parametrized_test_idx(request, "data") + assert data == (f"key{idx}", f"value{idx}") + + +@parametrize( + "data", + PATH_YAML_FILE_GZ, + processor=lambda i, *_: str(i), + marks=lambda i, *_: pytest.mark.foo if i % 2 else None, + ids=lambda i, *_: str(i), +) +def test_parametrize_compressed_yaml_file(request: FixtureRequest, data: str) -> None: + """Test @parametrize loader with .yml.gz file""" + assert isinstance(data, str) + idx = get_parametrized_test_idx(request, "data") + assert data == str(idx) + mark = request.node.get_closest_marker("foo") + if idx % 2: + assert mark is not None + else: + assert mark is None + + +@parametrize("data", PATH_JPEG_FILE_GZ, parametrizer=lambda d: _split_jpeg(d)) # noqa: PLW0108 +def test_parametrize_compressed_binary_file_with_parametrizer(request: FixtureRequest, data: bytes) -> None: + """Test @parametrize loader with the parametrizer using compressed binary file""" + assert isinstance(data, 
bytes) + idx = get_parametrized_test_idx(request, "data") + assert idx in range(3) + if idx == 0: + # Chunk 0 should start with SOI + assert data.startswith(b"\xff\xd8") + elif idx == 1: + # Second chunk must start with SOS + assert data.startswith(b"\xff\xda") + else: + # Last chunk must be EOI + assert data == b"\xff\xd9" + + +# Multi-path @parametrize("data", [PATH_TEXT_FILE, PATH_JSON_FILE_ARRAY]) def test_parametrize_multi_files(request: FixtureRequest, data: str) -> None: """Test @parametrize loader with a list of file paths concatenates all parametrized data""" diff --git a/tests/tests_plugin/test_compression_aware_loading.py b/tests/tests_plugin/test_compression_aware_loading.py new file mode 100644 index 0000000..6b3dd4b --- /dev/null +++ b/tests/tests_plugin/test_compression_aware_loading.py @@ -0,0 +1,126 @@ +import gzip +from pathlib import Path + +import pytest +from pytest import ExitCode, Pytester + +from pytest_data_loader.constants import DEFAULT_LOADER_DIR_NAME +from pytest_data_loader.paths import SUPPORTED_COMPRESSION_EXTENSIONS, compression_aware_open + +pytestmark = pytest.mark.plugin + + +class TestCompressionAwareLoading: + """Tests for compression aware loading""" + + @pytest.fixture(autouse=True) + def data_dir(self, pytester: Pytester) -> Path: + return pytester.mkdir(DEFAULT_LOADER_DIR_NAME) + + @pytest.mark.parametrize("ext", SUPPORTED_COMPRESSION_EXTENSIONS) + def test_load_compressed_file(self, pytester: Pytester, data_dir: Path, ext: str) -> None: + """Test that @load with a compressed file returns the decompressed content""" + text_payload = "line1\nline2\n" + compressed_path = data_dir / f"text.txt{ext}" + with compression_aware_open(compressed_path, mode="wt", encoding="utf-8") as f: + f.write(text_payload) + + pytester.makepyfile(f""" + import pytest_data_loader + + @pytest_data_loader.load("data", {str(compressed_path)!r}) + def test_func(data): + assert isinstance(data, str) + assert data.splitlines() == 
{text_payload.splitlines()!r} + """) + + result = pytester.runpytest("-v") + assert result.ret == ExitCode.OK + result.assert_outcomes(passed=1) + + def test_parametrize_compressed_file(self, pytester: Pytester, data_dir: Path) -> None: + """Test that @parametrize with a compressed file returns the parametrized decompressed content""" + lines = ["alpha", "beta", "gamma"] + gz_path = data_dir / "test.txt.gz" + with gzip.open(gz_path, "wt", encoding="utf-8") as f: + f.write("\n".join(lines)) + + pytester.makepyfile(f""" + import pytest_data_loader + + @pytest_data_loader.parametrize("data", {str(gz_path)!r}) + def test_func(data): + assert data in {lines!r} + """) + + result = pytester.runpytest("-v") + assert result.ret == ExitCode.OK + result.assert_outcomes(passed=len(lines)) + + def test_parametrize_dir_compressed_files(self, pytester: Pytester, data_dir: Path) -> None: + """Test that @parametrize_dir with compressed files returns the decompressed file content""" + sub_dir = data_dir / "dir" + sub_dir.mkdir() + for ext in SUPPORTED_COMPRESSION_EXTENSIONS: + path = sub_dir / f"test.txt{ext}" + with compression_aware_open(path, mode="wt", encoding="utf-8") as f: + f.write("test\n") + + pytester.makepyfile(f""" + import pytest_data_loader + + @pytest_data_loader.parametrize_dir("data", {sub_dir.name!r}) + def test_func(data): + assert data == "test" + """) + + result = pytester.runpytest("-v") + assert result.ret == ExitCode.OK + result.assert_outcomes(passed=len(SUPPORTED_COMPRESSION_EXTENSIONS)) + + def test_load_compressed_file_with_reader(self, pytester: Pytester, data_dir: Path) -> None: + """Test that a specified reader is effective to a compressed file""" + payload = "key1: value1\nkey2: value2\n" + gz_path = data_dir / "data.yml.gz" + with gzip.open(gz_path, "wt", encoding="utf-8") as f: + f.write(payload) + + pytester.makepyfile(f""" + import yaml + import pytest_data_loader + + @pytest_data_loader.load("data", {str(gz_path.name)!r}, 
reader=yaml.safe_load) + def test_func(data): + assert data == yaml.safe_load({payload!r}) + """) + + result = pytester.runpytest("-v") + assert result.ret == ExitCode.OK + result.assert_outcomes(passed=1) + + def test_compressed_file_with_registered_reader(self, pytester: Pytester, data_dir: Path) -> None: + """Test that a registered reader is effective to a compressed file""" + payload = "key1: value1\nkey2: value2\n" + gz_path = data_dir / "data.yml.gz" + with gzip.open(gz_path, "wt", encoding="utf-8") as f: + f.write(payload) + + pytester.makeconftest(""" + import yaml + import pytest_data_loader + + pytest_data_loader.register_reader(".yml", yaml.safe_load) + """) + + pytester.makepyfile(f""" + import yaml + import pytest_data_loader + + @pytest_data_loader.load("data", {str(gz_path.name)!r}) + def test_func(data): + assert data == yaml.safe_load({payload!r}) + """) + + result = pytester.runpytest("-v") + assert result.ret == ExitCode.OK + result.assert_outcomes(passed=1) diff --git a/tests/tests_unit/test_file_loader.py b/tests/tests_unit/test_file_loader.py index 549dc7e..02fbce0 100644 --- a/tests/tests_unit/test_file_loader.py +++ b/tests/tests_unit/test_file_loader.py @@ -1,4 +1,6 @@ import gc +import gzip +import json from collections.abc import Callable from functools import _CacheInfo from pathlib import Path @@ -8,14 +10,19 @@ from pytest_data_loader import load, parametrize, parametrize_dir from pytest_data_loader.loaders.impl import FileLoader +from pytest_data_loader.paths import SUPPORTED_COMPRESSION_EXTENSIONS, compression_aware_open, get_effective_suffix from pytest_data_loader.types import DataLoader, DataLoaderLoadAttrs, LazyLoadedData, LazyLoadedPartData, LoadedData from tests.paths import ( ABS_PATH_LOADER_DIR, PATH_JSON_FILE_ARRAY, + PATH_JSON_FILE_GZ, PATH_JSONL_FILE, PATH_TEXT_FILE, + PATH_TEXT_FILE_GZ, PATH_XML_FILE, PATHS_BINARY_FILES, + PATHS_COMPRESSED_BINARY_FILES, + PATHS_COMPRESSED_TEXT_FILES, PATHS_TEXT_FILES, ) @@ -26,7 +33,9 @@ 
class TestFileLoader: """Tests for file loader with various file types and loading modes.""" @pytest.mark.parametrize("is_abs_path", [False, True]) - @pytest.mark.parametrize("path", [*PATHS_TEXT_FILES, *PATHS_BINARY_FILES]) + @pytest.mark.parametrize( + "path", [*PATHS_TEXT_FILES, *PATHS_BINARY_FILES, *PATHS_COMPRESSED_TEXT_FILES, *PATHS_COMPRESSED_BINARY_FILES] + ) @pytest.mark.parametrize("lazy_loading", [True, False]) @pytest.mark.parametrize("loader", [load, parametrize, parametrize_dir]) def test_file_loader(self, loader: DataLoader, lazy_loading: bool, path: Path, is_abs_path: bool) -> None: @@ -39,7 +48,10 @@ def test_file_loader(self, loader: DataLoader, lazy_loading: bool, path: Path, i load_from = ABS_PATH_LOADER_DIR if loader == parametrize_dir: path = path.parent - is_binary = abs_file_path.relative_to(ABS_PATH_LOADER_DIR) in PATHS_BINARY_FILES + is_binary = abs_file_path.relative_to(ABS_PATH_LOADER_DIR) in ( + *PATHS_BINARY_FILES, + *PATHS_COMPRESSED_BINARY_FILES, + ) marks = (pytest.mark.foo, pytest.mark.bar) load_attrs = DataLoaderLoadAttrs( loader=loader, @@ -54,7 +66,7 @@ def test_file_loader(self, loader: DataLoader, lazy_loading: bool, path: Path, i ) file_loader = FileLoader(abs_file_path, load_attrs, load_from=load_from, strip_trailing_whitespace=True) - if path.suffix == ".json": + if get_effective_suffix(abs_file_path) == ".json": assert file_loader.file_reader is not None loaded_data = file_loader.load() @@ -167,22 +179,31 @@ def _make_load_attrs( parametrizer_func=parametrizer, ) - @pytest.mark.parametrize("path", [*PATHS_TEXT_FILES, *PATHS_BINARY_FILES]) + @pytest.mark.parametrize( + "path", [*PATHS_TEXT_FILES, *PATHS_BINARY_FILES, *PATHS_COMPRESSED_TEXT_FILES, *PATHS_COMPRESSED_BINARY_FILES] + ) @pytest.mark.parametrize("loader", [load, parametrize, parametrize_dir]) def test_lazy_loading_cache_state_transitions(self, loader: DataLoader, path: Path) -> None: """Test that lazy loading correctly transitions cache state before and after 
resolve and clear_cache.""" abs_file_path = ABS_PATH_LOADER_DIR / path - parametrizer: Callable[..., Any] | None = (lambda x: [x]) if path in PATHS_BINARY_FILES else None + binary_files = (*PATHS_BINARY_FILES, *PATHS_COMPRESSED_BINARY_FILES) + parametrizer: Callable[..., Any] | None = (lambda x: [x]) if path in binary_files else None load_attrs = self._make_load_attrs(loader, path, lazy_loading=True, parametrizer=parametrizer) file_loader = FileLoader( abs_file_path, load_attrs, load_from=ABS_PATH_LOADER_DIR, strip_trailing_whitespace=True ) - if path.suffix == ".json": + if get_effective_suffix(abs_file_path) == ".json": assert file_loader.file_reader is not None lazy_loaded_data = file_loader._load_lazily() - assert file_loader._cached_file_objects == {} + # Non-streamable @parametrize files call _load_now() at collection (to count items). When a + # file_reader is present, _load_now() uses _get_file_obj() which caches the open handle. + # All other paths leave _cached_file_objects empty after _load_lazily(). + if loader == parametrize and not file_loader.is_streamable and file_loader.file_reader is not None: + assert len(file_loader._cached_file_objects) == 1 + else: + assert file_loader._cached_file_objects == {} # For streamable @parametrize files, no lru_cache wrapper is created at load time. # For all other cases, the file_loader is registered eagerly to ensure cleanup even if tests are skipped. 
if loader == parametrize and file_loader.is_streamable: @@ -206,9 +227,14 @@ def test_lazy_loading_cache_state_transitions(self, loader: DataLoader, path: Pa # The result of _read_reader_and_split() should be cached per reader assert file_loader.file_reader in file_loader._cached_reader_and_split else: - # The file object and resolver should not be cached, but the file loader function used in resolver - # should be cached - assert len(file_loader._cached_file_objects) == 0 + # The file loader function (lru_cache wrapper) should be populated, but the resolver itself + # is not cached. For non-streamable files WITH a file_reader (e.g. compressed JSON/JSONL), + # _load_now uses _get_file_obj() so _cached_file_objects is populated. For those without a + # reader (e.g. XML, CSV), _read_file() is used and _cached_file_objects stays empty. + if file_loader.file_reader is not None: + assert len(file_loader._cached_file_objects) == 1 + else: + assert len(file_loader._cached_file_objects) == 0 assert not hasattr(lazy_data.resolver, "cache_info") assert len(file_loader._cached_functions) == 1 file_loader_func = next(iter(file_loader._cached_functions)) @@ -368,3 +394,127 @@ def test_weakref_finalize_clears_cache_on_gc(self) -> None: assert cached_file_objects_ref == {} assert cached_file_loaders_ref == set() assert cached_reader_split_ref == {} + + +class TestFileLoaderWithCompressedFiles: + """Tests for compression-aware file loading (.gz/.bz2/.xz).""" + + def _make_load_attrs(self, loader: DataLoader, path: Path, *, lazy_loading: bool = False) -> DataLoaderLoadAttrs: + """Create minimal DataLoaderLoadAttrs for the given loader and path. 
+ + :param loader: The data loader to use + :param path: Relative path to the test data file + :param lazy_loading: Whether to use lazy loading + """ + return DataLoaderLoadAttrs( + loader=loader, + search_from=Path(__file__), + fixture_names=("file_path", "data"), + path=path, + lazy_loading=lazy_loading, + ) + + def test_get_effective_suffix_returns_inner_suffix_for_compressed_paths(self) -> None: + """Test that get_effective_suffix strips the compression suffix to expose the inner format suffix""" + assert get_effective_suffix(Path("data.json.gz")) == ".json" + assert get_effective_suffix(Path("data.csv.bz2")) == ".csv" + assert get_effective_suffix(Path("data.txt.xz")) == ".txt" + assert get_effective_suffix(Path("data.JSON.GZ")) == ".JSON" + + def test_get_effective_suffix_returns_suffix_for_non_compressed_paths(self) -> None: + """Test that get_effective_suffix is a no-op for non-compressed paths""" + assert get_effective_suffix(Path("data.json")) == ".json" + assert get_effective_suffix(Path("data.txt")) == ".txt" + + def test_get_effective_suffix_returns_gz_when_no_inner_suffix(self) -> None: + """Test that get_effective_suffix returns the compression suffix itself when there is no inner suffix""" + assert get_effective_suffix(Path("data.gz")) == ".gz" + assert get_effective_suffix(Path("data.bz2")) == ".bz2" + assert get_effective_suffix(Path("data.xz")) == ".xz" + + def test_compression_aware_open_routes_gz_through_gzip(self, tmp_path: Path) -> None: + """Test that compression_aware_open opens .gz files via gzip and returns decompressed text""" + payload = "hello compressed world\n" + gz_path = tmp_path / "test.txt.gz" + with gzip.open(gz_path, "wt") as f: + f.write(payload) + + with compression_aware_open(gz_path) as f: + assert f.read() == payload + + def test_compressed_json_resolves_to_default_json_reader(self) -> None: + """Test that FileLoader for a .json.gz file resolves to the default json.load reader""" + abs_path = ABS_PATH_LOADER_DIR / 
PATH_JSON_FILE_GZ + load_attrs = self._make_load_attrs(load, PATH_JSON_FILE_GZ) + file_loader = FileLoader(abs_path, load_attrs, load_from=ABS_PATH_LOADER_DIR) + + assert file_loader.file_reader is json.load + + def test_compressed_file_disables_streaming(self) -> None: + """Test that FileLoader marks compressed files as non-streamable to avoid O(n) seeks""" + abs_path = ABS_PATH_LOADER_DIR / PATH_TEXT_FILE_GZ + load_attrs = self._make_load_attrs(parametrize, PATH_TEXT_FILE_GZ) + file_loader = FileLoader(abs_path, load_attrs, load_from=ABS_PATH_LOADER_DIR) + + assert not file_loader.is_streamable + + def test_compressed_binary_autodetect_via_decompressed_chunk(self, tmp_path: Path) -> None: + """Test that binary auto-detection probes decompressed bytes, not the gzip magic bytes""" + binary_payload = bytes(range(256)) + gz_path = tmp_path / "binary.dat.gz" + with gzip.open(gz_path, "wb") as f: + f.write(binary_payload) + + load_attrs = DataLoaderLoadAttrs( + loader=load, + search_from=Path(__file__), + fixture_names=("data",), + path=gz_path, + lazy_loading=False, + ) + file_loader = FileLoader(gz_path, load_attrs) + loaded = file_loader.load() + + assert isinstance(loaded, LoadedData) + assert loaded.data == binary_payload + assert file_loader.read_mode == "rb" + + def test_compressed_text_autodetect_via_decompressed_chunk(self, tmp_path: Path) -> None: + """Test that text auto-detection probes decompressed bytes and resolves to text mode""" + text_payload = "hello from compressed text\n" + gz_path = tmp_path / "text.dat.gz" + with gzip.open(gz_path, "wt", encoding="utf-8") as f: + f.write(text_payload) + + load_attrs = DataLoaderLoadAttrs( + loader=load, + search_from=Path(__file__), + fixture_names=("data",), + path=gz_path, + lazy_loading=False, + ) + file_loader = FileLoader(gz_path, load_attrs) + loaded = file_loader.load() + + assert isinstance(loaded, LoadedData) + assert loaded.data == text_payload + assert file_loader.read_mode == "r" + + 
@pytest.mark.parametrize("ext", [x.upper() for x in SUPPORTED_COMPRESSION_EXTENSIONS]) + def test_compressed_uppercase_suffix_is_routed(self, tmp_path: Path, ext: str) -> None: + """Test that uppercase compression suffixes are routed the same as lowercase""" + payload = "case insensitive\n" + path = tmp_path / f"data.txt{ext}" + with compression_aware_open(path, mode="wt") as f: + f.write(payload) + load_attrs = DataLoaderLoadAttrs( + loader=load, + search_from=Path(__file__), + fixture_names=("data",), + path=path, + lazy_loading=False, + ) + file_loader = FileLoader(path, load_attrs) + loaded = file_loader.load() + assert isinstance(loaded, LoadedData) + assert loaded.data == payload