Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -488,6 +488,9 @@ By default, the plugin reads and parses file content when loading as follows:
- `.jsonl` — Each line is parsed as a JSON object
- All other file types — Loads as raw text or binary content

Compressed files with a `.gz`, `.bz2`, or `.xz` extension are decompressed transparently. The inner file type
determines which reader and read mode are used (e.g. `data.json.gz` uses `json.load`, `data.txt.bz2` loads as plain text).

### Customizing defaults

You can customize this behavior by specifying a file reader that accepts a file-like object returned by `open()`.
Expand Down
33 changes: 20 additions & 13 deletions src/pytest_data_loader/loaders/impl.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,10 @@
from pytest_data_loader.paths import (
check_and_track_dir,
check_circular_symlink,
compression_aware_open,
get_effective_suffix,
get_matching_paths,
is_compressed_path,
resolve_relative_path,
split_glob_path,
)
Expand Down Expand Up @@ -191,20 +194,24 @@ def __init__(self, *args: Any, gidx: int | None = None, **kwargs: Any):
self.file_reader = self.load_attrs.reader
self.read_options = self.load_attrs.read_options
if not self.file_reader:
if registered_reader := FileReader.get_registered_reader(self.load_attrs.search_from, self.path.suffix):
if registered_reader := FileReader.get_registered_reader(
self.load_attrs.search_from, get_effective_suffix(self.path)
):
self.file_reader = registered_reader.reader
if not self.read_options:
self.read_options = registered_reader.read_options
assert isinstance(self.read_options, HashableDict)
self._effective_read_mode: str | None = None
self._is_streamable = self.file_reader is not None or all(
# non-structured text data can be read line by line
[
self.path.suffix in FileLoader.STREAMABLE_FILE_TYPES,
self.read_mode != "rb",
self.load_attrs.onload_func is None,
self.load_attrs.parametrizer_func is None,
]
self._is_streamable = not is_compressed_path(self.path) and (
self.file_reader is not None
or all(
[
get_effective_suffix(self.path) in FileLoader.STREAMABLE_FILE_TYPES,
self.read_mode != "rb",
self.load_attrs.onload_func is None,
self.load_attrs.parametrizer_func is None,
]
)
)

# Caches used by data loaders.
Expand Down Expand Up @@ -462,7 +469,7 @@ def _get_file_obj(self) -> IO[Any]:
"""Get file object from cache or open a new one and cache it"""
f = self._cached_file_objects.get((self.path, self.read_options))
if not f or f.closed:
f = open(self.path, **self.read_options)
f = compression_aware_open(self.path, **self.read_options)
self._cached_file_objects[(self.path, self.read_options)] = f
f.seek(0)
return f
Expand Down Expand Up @@ -506,7 +513,7 @@ def inspect_part_data(pos: int, part: Any) -> None:
else:
commit(pos, part)

with open(self.path, **self.read_options) as f:
with compression_aware_open(self.path, **self.read_options) as f:
if self.file_reader:
# NOTE: Do NOT use _read_reader_and_split here to get the split data. Closing the file will invalidate
# the cached part data generated by the file reader and cause issues when loading part data later.
Expand Down Expand Up @@ -534,7 +541,7 @@ def _read_file(self) -> str | bytes:
if self.read_mode == "auto":
# Detect read mode based on sampled data
is_binary = False
with open(self.path, "rb") as f:
with compression_aware_open(self.path, mode="rb") as f:
chunk = f.read(4096)

if chunk:
Expand All @@ -553,7 +560,7 @@ def _read_file(self) -> str | bytes:
if self.read_mode == "r" and "encoding" not in read_options:
read_options["encoding"] = "utf-8"

with open(self.path, **read_options) as f:
with compression_aware_open(self.path, **read_options) as f:
return f.read()

@requires_loader(DataLoaderType.PARAMETRIZE)
Expand Down
56 changes: 55 additions & 1 deletion src/pytest_data_loader/paths.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,26 @@
from __future__ import annotations

import bz2
import errno
import glob
import gzip
import lzma
import os
import re
from collections.abc import Callable
from functools import lru_cache
from pathlib import Path
from typing import Literal
from typing import IO, Any, Literal

from pytest_data_loader.exceptions import DataNotFound

# Maps a compression file extension (lowercase, with the leading dot) to the stdlib function used to open it.
_COMPRESSION_OPENERS: dict[str, Callable[..., IO[Any]]] = dict(
    [
        (".gz", gzip.open),
        (".bz2", bz2.open),
        (".xz", lzma.open),
    ]
)
# Extensions recognized as compressed files, in the same order as the opener table above.
SUPPORTED_COMPRESSION_EXTENSIONS: tuple[str, ...] = tuple(_COMPRESSION_OPENERS.keys())


@lru_cache
def resolve_relative_path(
Expand Down Expand Up @@ -188,3 +199,46 @@ def split_glob_path(path: Path) -> tuple[Path, str]:
base = Path(*parts[:split])
pattern = str(Path(*parts[split:]))
return base, pattern


def is_compressed_path(path: Path) -> bool:
    """Tell whether the path ends with one of the supported compression extensions (.gz/.bz2/.xz).

    The final suffix is lowercased before the lookup, so the check is case-insensitive.

    :param path: File path to inspect
    """
    final_suffix = path.suffix.lower()
    return final_suffix in SUPPORTED_COMPRESSION_EXTENSIONS


def get_effective_suffix(path: Path) -> str:
    """Return the suffix that identifies the file format, ignoring a trailing compression suffix if there is one.

    :param path: File path to inspect

    Examples:
        Path("data.json.gz") -> ".json"
        Path("data.csv.bz2") -> ".csv"
        Path("data.json") -> ".json"
        Path("data.gz") -> ".gz" (no inner suffix to expose)
    """
    if not is_compressed_path(path):
        return path.suffix
    all_suffixes = path.suffixes
    # Only look behind the compression suffix when an inner suffix actually exists
    return all_suffixes[-2] if len(all_suffixes) > 1 else path.suffix


def compression_aware_open(path: Path, **open_kwargs: Any) -> IO[Any]:
    """Open a file, routing through gzip.open()/bz2.open()/lzma.open() when the suffix matches.

    For compression openers a mode without an explicit "b" or "t" flag (e.g. "r") means binary, unlike builtin
    open() where it means text. This function normalizes the mode so that any mode lacking both flags produces a
    text-mode stream, matching the semantics of builtin open(). Modes that already carry "b" or "t" are forwarded
    unchanged. Paths without a supported compression suffix are passed straight to builtin open().

    :param path: File path to open
    :param open_kwargs: Keyword arguments forwarded to the opener (mode, encoding, errors, newline)
    :return: The file object returned by builtin open() or by the matching compression opener
    """
    opener = _COMPRESSION_OPENERS.get(path.suffix.lower())
    if opener is None:
        return open(path, **open_kwargs)
    # A missing/empty mode falls back to "r", the default of builtin open()
    mode = open_kwargs.get("mode") or "r"
    # Compression openers default to binary. Add "t" whenever the caller did not pick "b" or "t" explicitly so
    # that "r" (and likewise "w"/"a"/"x") yields a text stream, exactly as builtin open() would.
    if "b" not in mode and "t" not in mode:
        mode += "t"
    open_kwargs["mode"] = mode
    return opener(path, **open_kwargs)
2 changes: 2 additions & 0 deletions tests/.gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# For tests on Windows: force LF for test data text files so their bytes match their compressed counterparts (e.g. text.txt vs text.txt.gz)
**/data/** text=auto eol=lf
Binary file added tests/data/files/compressed/comma.csv.xz
Binary file not shown.
Binary file added tests/data/files/compressed/data.jsonl.bz2
Binary file not shown.
Binary file added tests/data/files/compressed/image.jpg.gz
Binary file not shown.
Binary file added tests/data/files/compressed/object.json.gz
Binary file not shown.
Binary file added tests/data/files/compressed/text.txt.gz
Binary file not shown.
Binary file added tests/data/files/compressed/yaml.yml.gz
Binary file not shown.
2 changes: 1 addition & 1 deletion tests/data/files/jsonl/data.jsonl
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
{"name": "Alice", "age": 30}
{"name": "Bob", "age": 25}
{"name": "Charlie", "age": 35}
{"name": "Charlie", "age": 35}
2 changes: 1 addition & 1 deletion tests/data/files/jsonl/data2.jsonl
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
{"city": "Tokyo", "country": "Japan"}
{"city": "Paris", "country": "France"}
{"city": "Paris", "country": "France"}
2 changes: 1 addition & 1 deletion tests/data/files/yaml/yaml.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,4 +27,4 @@ services:
depends_on:
- db
volumes:
pgdata:
pgdata:
2 changes: 1 addition & 1 deletion tests/data/files/yaml/yaml_documents.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,4 +30,4 @@ spec:
- configMapRef:
name: app-config
- secretRef:
name: app-secret
name: app-secret
9 changes: 9 additions & 0 deletions tests/paths.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,15 @@
PATH_JPEG_FILE = Path(IMAGE_DIR, "image.jpg")
PATH_HIDDEN_FILE = Path(SOME_DIR, ".hidden_file")
PATH_HIDDEN_DIR = Path(SOME_DIR, ".hidden_dir")
# Compressed fixture files. Each one lives in the "compressed" directory and is named after an existing fixture
# file with a compression extension (.gz/.bz2/.xz) appended to its file name.
PATH_COMPRESSED_FILE_DIR = Path(FILES_DIR, "compressed")
PATH_JSON_FILE_GZ = PATH_COMPRESSED_FILE_DIR / f"{PATH_JSON_FILE_OBJECT.name}.gz"
PATH_JSONL_FILE_BZ2 = PATH_COMPRESSED_FILE_DIR / f"{PATH_JSONL_FILE.name}.bz2"
PATH_CSV_FILE_XZ = PATH_COMPRESSED_FILE_DIR / f"{PATH_CSV_FILE.name}.xz"
PATH_TEXT_FILE_GZ = PATH_COMPRESSED_FILE_DIR / f"{PATH_TEXT_FILE.name}.gz"
PATH_JPEG_FILE_GZ = PATH_COMPRESSED_FILE_DIR / f"{PATH_JPEG_FILE.name}.gz"
PATH_YAML_FILE_GZ = PATH_COMPRESSED_FILE_DIR / f"{PATH_YAML_FILE.name}.gz"
PATHS_COMPRESSED_TEXT_FILES = [PATH_JSON_FILE_GZ, PATH_JSONL_FILE_BZ2, PATH_CSV_FILE_XZ, PATH_TEXT_FILE_GZ]
PATHS_COMPRESSED_BINARY_FILES = [PATH_JPEG_FILE_GZ]
PATHS_TEXT_FILES = [
PATH_TEXT_FILE,
PATH_JSON_FILE_SCALAR,
Expand Down
35 changes: 34 additions & 1 deletion tests/tests_loader/test_load_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,19 +8,23 @@
from tests.paths import (
ABS_PATH_LOADER_DIR,
PATH_JPEG_FILE,
PATH_JPEG_FILE_GZ,
PATH_JSON_FILE_GZ,
PATH_JSON_FILE_NESTED_OBJECT,
PATH_JSON_FILE_OBJECT,
PATH_TEXT_FILE,
PATH_TEXT_FILE_GZ,
)

pytestmark = pytest.mark.loaders

# NOTE:
# - lazy_loading option is separately tested in another test using pytester
# - This file covers 3 types of data types the plugin handles differently:
# - This file covers 4 kinds of data the plugin handles differently:
# - text file (no file reader)
# - json file (with default file reader)
# - binary file
# - compressed files (.gz, .bz2, .xz) for the above


# Text file
Expand Down Expand Up @@ -132,3 +136,32 @@ def test_load_binary_file_with_id(request: FixtureRequest, data: bytes) -> None:
def test_load_binary_file_with_marks(request: FixtureRequest, data: bytes) -> None:
"""Test @load loader with the marks option using binary file"""
assert "foo" in {m.name for m in request.node.own_markers}


# Compressed files
@load("data", PATH_TEXT_FILE_GZ)
def test_load_compressed_text_file(data: str) -> None:
    """Verify that @load on a .txt.gz file yields the decompressed text content"""
    assert isinstance(data, str)
    # The uncompressed counterpart of the fixture holds the expected content
    uncompressed_path = ABS_PATH_LOADER_DIR / PATH_TEXT_FILE
    assert data == uncompressed_path.read_text()


@load("data", PATH_JSON_FILE_GZ)
def test_load_compressed_json_file(data: dict[str, Any]) -> None:
    """Verify that @load on a .json.gz file is parsed by the default json.load reader transparently"""
    assert isinstance(data, dict)
    # Parse the uncompressed counterpart to get the expected object
    uncompressed_path = ABS_PATH_LOADER_DIR / PATH_JSON_FILE_OBJECT
    assert data == json.loads(uncompressed_path.read_text())


@load("data", PATH_JPEG_FILE_GZ)
def test_load_compressed_autodetects_binary_mode(data: bytes) -> None:
    """Verify that @load on a .jpg.gz file detects binary mode from the decompressed content"""
    assert isinstance(data, bytes)
    # The uncompressed image holds the expected raw bytes
    uncompressed_path = ABS_PATH_LOADER_DIR / PATH_JPEG_FILE
    assert data == uncompressed_path.read_bytes()


@load("data", PATH_TEXT_FILE_GZ, read_options={"mode": "rb"})
def test_load_compressed_text_with_force_binary(data: bytes) -> None:
    """Verify that @load on a .txt.gz file opened with mode "rb" yields the decompressed bytes"""
    assert isinstance(data, bytes)
    # Compare against the raw bytes of the uncompressed counterpart
    uncompressed_path = ABS_PATH_LOADER_DIR / PATH_TEXT_FILE
    assert data == uncompressed_path.read_bytes()
36 changes: 35 additions & 1 deletion tests/tests_loader/test_parametrize_dir.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,25 @@
import json
from pathlib import Path
from typing import Any

import pytest
from pytest import FixtureRequest

from pytest_data_loader import parametrize_dir
from pytest_data_loader.paths import get_effective_suffix
from pytest_data_loader.types import LoadedDataType
from tests.paths import ABS_PATH_LOADER_DIR, IMAGE_DIR, PATH_TEXT_FILE_DIR, SOME_DIR, SOME_DIR_INNER
from tests.paths import (
ABS_PATH_LOADER_DIR,
IMAGE_DIR,
PATH_COMPRESSED_FILE_DIR,
PATH_JPEG_FILE,
PATH_JSON_FILE_OBJECT,
PATH_TEXT_FILE,
PATH_TEXT_FILE_DIR,
PATH_YAML_FILE,
SOME_DIR,
SOME_DIR_INNER,
)

from .helper import get_parametrized_test_idx

Expand Down Expand Up @@ -118,3 +132,23 @@ def test_parametrize_dir_multi_dirs_recursive(request: FixtureRequest, data: str
idx = get_parametrized_test_idx(request, "data")
all_expected = ["data0", "data1", "data2", "data3", "data4", "data5", "line0\nline1\nline2"]
assert data == all_expected[idx]


@parametrize_dir(
    ("file_path", "data"),
    PATH_COMPRESSED_FILE_DIR,
    filter=lambda p: get_effective_suffix(p) in (".txt", ".json", ".yml", ".jpg"),
)
def test_parametrize_dir_with_compressed_files(file_path: Path, data: Any) -> None:
    """Verify @parametrize_dir against a directory of compressed files

    Each loaded value is compared with the content of the matching uncompressed fixture file.
    """
    # Expected content is produced lazily so only the fixture for the current suffix is read
    expected_by_suffix = {
        ".txt": lambda: (ABS_PATH_LOADER_DIR / PATH_TEXT_FILE).read_text(),
        ".json": lambda: json.loads((ABS_PATH_LOADER_DIR / PATH_JSON_FILE_OBJECT).read_text()),
        ".yml": lambda: (ABS_PATH_LOADER_DIR / PATH_YAML_FILE).read_text(),
        ".jpg": lambda: (ABS_PATH_LOADER_DIR / PATH_JPEG_FILE).read_bytes(),
    }
    load_expected = expected_by_suffix.get(get_effective_suffix(file_path))
    if load_expected is None:
        raise NotImplementedError("Add test")
    assert data == load_expected()
Loading
Loading