Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,20 @@ Conda users can install from conda-forge:

conda install -c conda-forge python-blosc2

Command line tools
==================

Two CLI tools are installed along with the package:

- ``b2view``: an interactive terminal browser (TUI) for TreeStore bundles
(``.b2d`` directories or ``.b2z`` files), with paged views of NDArray and
CTable data of any size
(`walkthrough <https://www.blosc.org/python-blosc2/getting_started/b2view.html>`_).
- ``parquet-to-blosc2``: converts Parquet files to Blosc2 columnar table
stores, and back
(`walkthrough <https://www.blosc.org/python-blosc2/getting_started/parquet_to_blosc2.html>`_;
requires ``pip install "blosc2[parquet]"``).

Documentation
=============

Expand Down
85 changes: 72 additions & 13 deletions bench/tree-store.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@
Benchmark for TreeStore hierarchical creation, opening, and listing.

Creates a hierarchy of N1 levels, each with N2 NDArray leaves and one
CTable (4 cols: bool, int, float, string) with N5 rows. Leaf ``N``
CTable (20 cols: bool, int, float, string plus 16 numeric columns) with
N5 rows. Leaf ``N``
receives an *N*-dimensional array (leaf0 is 0‑d, leaf1 is 1‑d, …) with
each side ``int(MAX_ELEMS ** (1/N))`` so that no array exceeds MAX_ELEMS
elements. Everything is written to ``tree-store.b2z`` and the script
Expand All @@ -33,13 +34,58 @@

# ── Row schema for the CTable ────────────────────────────────────────────

# 4 base columns plus 16 extra numeric ones (v04..v19), wide enough to
# exceed the data panel viewport of b2view.
NCOLS = 20


# Row schema handed to blosc2.CTable by this benchmark: four base columns
# (a, b, c, d) followed by sixteen numeric columns v04..v19.  Among the
# v-columns, even-numbered ones are declared int64 and odd-numbered ones
# float64.
@dataclasses.dataclass
class _Row:
    # Base columns: one each of bool, int64, float64 and string.
    a: bool = blosc2.field(blosc2.bool(), default=False)
    b: int = blosc2.field(blosc2.int64(), default=0)
    c: float = blosc2.field(blosc2.float64(), default=0.0)
    # Plain annotation with a default; no explicit blosc2.field() spec here.
    d: str = ""
    # Extra numeric columns, alternating int64 / float64.
    v04: int = blosc2.field(blosc2.int64(), default=0)
    v05: float = blosc2.field(blosc2.float64(), default=0.0)
    v06: int = blosc2.field(blosc2.int64(), default=0)
    v07: float = blosc2.field(blosc2.float64(), default=0.0)
    v08: int = blosc2.field(blosc2.int64(), default=0)
    v09: float = blosc2.field(blosc2.float64(), default=0.0)
    v10: int = blosc2.field(blosc2.int64(), default=0)
    v11: float = blosc2.field(blosc2.float64(), default=0.0)
    v12: int = blosc2.field(blosc2.int64(), default=0)
    v13: float = blosc2.field(blosc2.float64(), default=0.0)
    v14: int = blosc2.field(blosc2.int64(), default=0)
    v15: float = blosc2.field(blosc2.float64(), default=0.0)
    v16: int = blosc2.field(blosc2.int64(), default=0)
    v17: float = blosc2.field(blosc2.float64(), default=0.0)
    v18: int = blosc2.field(blosc2.int64(), default=0)
    v19: float = blosc2.field(blosc2.float64(), default=0.0)


def ctable_values(nrows: int) -> dict[str, np.ndarray]:
    """Return deterministic column values for the CTable, keyed by column name.

    Row *i* of every column is predictable.  Tests (e.g.
    tests/b2view/test_basics.py) rely on these formulas to check that a
    given viewport shows the expected values:

    - a: i % 2 == 0
    - b: i
    - c: i * 1.5
    - d: "str_%06d" % i (zero-padded to at least 6 digits, never truncated)
    - v{k}, even k: i * k
    - v{k}, odd k: linspace(0, k, nrows)[i], which equals
      i * k / (nrows - 1) whenever nrows > 1

    Parameters
    ----------
    nrows
        Number of rows to generate for every column.

    Returns
    -------
    dict
        Column name -> 1-D array of length ``nrows``, in schema order
        (``a``, ``b``, ``c``, ``d``, then ``v04`` .. ``v{NCOLS - 1}``).
    """
    i = np.arange(nrows)
    if i.size:
        # Format through "%06d" instead of casting to a fixed-width "U6"
        # dtype: that cast silently drops digits once i >= 1_000_000.
        labels = np.char.mod("str_%06d", i)
    else:
        # Keep a string dtype for an empty table as well.
        labels = np.empty(0, dtype="U10")
    values: dict[str, np.ndarray] = {
        "a": i % 2 == 0,
        "b": i,
        "c": i * 1.5,
        "d": labels,
    }
    for k in range(4, NCOLS):
        # Even columns are integer multiples of the row index; odd columns
        # are an evenly spaced float ramp from 0 to k.
        values[f"v{k:02d}"] = i * k if k % 2 == 0 else np.linspace(0, k, num=nrows)
    return values


# ── Helpers ──────────────────────────────────────────────────────────────
Expand Down Expand Up @@ -87,9 +133,16 @@ def create_store(
max_elems: int,
nrows: int,
no_vlmeta: bool = False,
output: str = OUTPUT_FILE,
verbose: bool = True,
) -> tuple[float, int]:
"""Create the TreeStore; return (wall_clock, total_elements_written)."""
_clean(OUTPUT_FILE)

def log(*args, **kwargs):
if verbose:
print(*args, **kwargs)

_clean(output)

# Pre-build one array per unique dimensionality (leaf ``i`` → *i*‑d).
leaf_arrays_np: dict[int, np.ndarray] = {}
Expand All @@ -109,25 +162,30 @@ def create_store(
total_elements = sum(leaf_arrays_np[ndim].size for ndim in range(nleaves)) * nlevels

# Pre-populate a single CTable that we will copy for every level.
# Columns are filled from vectorized, predictable sequences (arange /
# linspace flavored) so they are fast to build and compress very well.
tmpl_table = blosc2.CTable(_Row, expected_size=nrows, validate=False)
rows = [(i % 2 == 0, i, float(i) * 1.5, f"str_{i:06d}") for i in range(nrows)]
tmpl_table.extend(rows, validate=False)
cols = ctable_values(nrows)
struct = np.empty(nrows, dtype=[(name, vals.dtype) for name, vals in cols.items()])
for name, vals in cols.items():
struct[name] = vals
tmpl_table.extend(struct, validate=False)

print(
log(
f"\nCreating TreeStore with {nlevels} level(s), "
f"{nleaves} leave(s) each, {nrows} CTable row(s) per level..."
)
print(f" Max elements per leaf: {max_elems:,}")
log(f" Max elements per leaf: {max_elems:,}")
for ndim in range(min(nleaves, 10)):
shape = _leaf_shape(ndim, max_elems)
nelem = int(np.prod(shape)) if shape else 1
print(f" leaf{ndim}: shape={shape}, elements={nelem:,}, uncompressed={_fmt_bytes(nelem * 8)}")
log(f" leaf{ndim}: shape={shape}, elements={nelem:,}, uncompressed={_fmt_bytes(nelem * 8)}")
if nleaves > 10:
print(f" ... ({nleaves - 10} more)")
print(f" CTable rows: {nrows} | uncompressed table size: {_fmt_bytes(tmpl_table.nbytes)}")
log(f" ... ({nleaves - 10} more)")
log(f" CTable rows: {nrows} | uncompressed table size: {_fmt_bytes(tmpl_table.nbytes)}")

t0 = time.perf_counter()
tstore = blosc2.TreeStore(OUTPUT_FILE, mode="w")
tstore = blosc2.TreeStore(output, mode="w")

try:
if not no_vlmeta:
Expand Down Expand Up @@ -160,12 +218,12 @@ def create_store(
ct = tstore[table_key]
ct.vlmeta["description"] = f"Level {level} CTable"
ct.vlmeta["author"] = "blosc2"
ct.vlmeta["ncols"] = 4
ct.vlmeta["ncols"] = tmpl_table.ncols
ct.vlmeta["has_index"] = True
ct.vlmeta["tags_list"] = ["benchmark", "testing", f"level_{level}"]

if (level + 1) % max(1, nlevels // 10) == 0 or level == nlevels - 1:
print(f" Level {level + 1}/{nlevels} done ({time.perf_counter() - t0:.2f}s so far)")
log(f" Level {level + 1}/{nlevels} done ({time.perf_counter() - t0:.2f}s so far)")
finally:
tstore.close()

Expand Down Expand Up @@ -308,7 +366,8 @@ def main() -> None:
for d in range(args.nleaves)
)
total_data_bytes = (
total_elements * 8 + args.nlevels * args.nrows * (1 + 8 + 8 + 16) # rough for table
# rough per-row table size: bool + int64 + float64 + str + 16 numeric cols
total_elements * 8 + args.nlevels * args.nrows * (1 + 8 + 8 + 16 + 16 * 8)
)
file_size = os.path.getsize(OUTPUT_FILE)

Expand Down
106 changes: 106 additions & 0 deletions doc/getting_started/b2view.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
b2view: Browse TreeStore Bundles in the Terminal
================================================

The ``b2view`` CLI opens an interactive terminal browser (TUI) for Blosc2
TreeStore bundles, either sparse directories (``.b2d``) or compact
zip-backed files (``.b2z``). It shows the tree of groups and nodes, the
metadata and vlmeta of the selected node, and a paged view of the data
itself — NDArrays of any dimensionality as well as CTables.

``b2view`` is installed with python-blosc2; no extra dependencies are
needed.

Step 1 — Create a sample store
------------------------------

Run the snippet below once to produce ``sample.b2z`` with a couple of
arrays and some metadata:

.. code-block:: python

import blosc2

with blosc2.TreeStore("sample.b2z", mode="w") as tstore:
tstore.vlmeta["author"] = "me"
a = blosc2.linspace(0, 1, num=1_000_000, shape=(1000, 1000))
a.vlmeta["description"] = "a 2-D linspace"
tstore["/dense/a"] = a
tstore["/dense/b"] = blosc2.arange(10_000, shape=(10, 100, 10))

Any existing TreeStore bundle works too — for instance the output of the
``parquet-to-blosc2`` converter (see :doc:`parquet_to_blosc2`).

Step 2 — Open it
----------------

.. code-block:: console

b2view sample.b2z

The screen is split into four panels: the **tree** of the bundle on the
left, and **meta**, **vlmeta** and **data** panels for the node selected
in the tree. Move between panels with ``tab`` / ``shift+tab``, maximize
the focused one with ``m`` (``r`` restores it), and quit with ``q``.

By default the mouse is left to the terminal, so selecting and copying text
works as in any other command line program. Pass ``--mouse`` to let b2view
capture it instead: panels become clickable and the wheel scrolls the data
grid (paging at the boundaries), at the cost of native text selection.

You can also jump straight to a node and panel:

.. code-block:: console

b2view sample.b2z /dense/a --panel data

Step 3 — Navigate the data panel
--------------------------------

The data panel pages through objects far larger than the screen. Press
``?`` at any time for the full key reference; the essentials are:

================================ =============================================
Key Action
================================ =============================================
``up`` / ``down`` move the cursor; pages at the edges
``pageup`` / ``pagedown`` previous / next page of rows
``t`` / ``b`` first / last row
``g`` go to a row number
``left`` / ``right`` move across columns; pages at the edges
``s`` / ``e`` (``home``/``end``) first / last column window
``c`` go to a column index or name
================================ =============================================

For N-D arrays, press ``d`` to enter *dim mode*: ``left`` / ``right``
select the active dimension, ``up`` / ``down`` change its fixed index (or
scroll the viewport), ``enter`` toggles a dimension between fixed and
navigable, and ``escape`` leaves dim mode.

Step 4 — Filter CTable rows
---------------------------

On a CTable node, press ``f`` and type a filter expression to page through
only the matching rows — the same expressions ``CTable.where()`` accepts,
including dotted nested column names and ``and`` / ``or``:

.. code-block:: text

payment.tips > 100 and trip.km > 0 and trip.sec > 0

The data header shows the active filter and the matching row count; all
navigation (paging, ``g``, ``t`` / ``b``) then operates on the filtered
rows. Press ``escape`` (or submit an empty expression) to go back to the
unfiltered table; each node remembers its filter for the session.

Columns can be filtered too: press ``/`` and type a case-insensitive
substring (e.g. ``payment``) to show only the matching columns — column
paging and the ``c`` goto-column modal then operate on that subset. Row
and column filters combine freely; ``escape`` clears them one layer at a
time (row filter first, then columns).

CLI options
-----------

``--preview-rows N`` and ``--preview-cols N`` bound the size of each data
page (20 rows by 10 columns by default), and ``--panel`` chooses the panel
focused on startup (``tree``, ``meta``, ``vlmeta`` or ``data``).
1 change: 1 addition & 0 deletions doc/getting_started/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ Getting Started
tutorials
dsl_syntax
parquet_to_blosc2
b2view
2 changes: 1 addition & 1 deletion doc/getting_started/installation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ Source code

git clone https://github.com/Blosc/python-blosc2/
cd python-blosc2
pip install .[test] # install with test dependencies
pip install . --group test # install with test dependencies

That's all. You can now proceed to the testing section.

Expand Down
5 changes: 4 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ dependencies = [
"numexpr>=2.14.1; platform_machine != 'wasm32'",
"pydantic",
"requests",
"rich",
"textual",
"threadpoolctl; platform_machine != 'wasm32'",
]
version = "4.4.4.dev0"
Expand All @@ -50,7 +52,6 @@ documentation = "https://www.blosc.org/python-blosc2/python-blosc2.html"

[project.optional-dependencies]
parquet = ["pyarrow"]
tui = ["textual", "rich"]

[project.scripts]
parquet-to-blosc2 = "blosc2.cli.parquet_to_blosc2:main"
Expand All @@ -74,6 +75,8 @@ dev = [
]
test = [
"pytest",
# for the b2view Pilot tests
"pytest-asyncio",
"psutil; platform_machine != 'wasm32'",
# torch is optional because it is quite large (but will still be used if found)
# "torch; platform_machine != 'wasm32'",
Expand Down
1 change: 1 addition & 0 deletions pytest.ini
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ testpaths =
markers =
heavy: tests that take a long time to complete.
network: tests that require network access.
tui: b2view Textual UI tests; each one boots a headless app session.

filterwarnings =
error
Expand Down
Loading
Loading