Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 11 additions & 3 deletions .github/workflows/deploy-ghpage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,18 @@ jobs:
- name: Checkout
uses: actions/checkout@v6
- uses: actions/checkout@v6
- uses: pnpm/action-setup@v4
- name: Setup pnpm
uses: pnpm/action-setup@v4
with:
run_install: true
package_json_file: packages/package.json
version: 9.10.0
- name: Setup Node.js with pnpm cache
uses: actions/setup-node@v6
with:
cache: 'pnpm'
cache-dependency-path: 'packages/pnpm-lock.yaml'
- name: Install pnpm dependencies
working-directory: packages
run: pnpm install --frozen-lockfile
- uses: astral-sh/setup-uv@v7
- name: Run tests
working-directory: ./packages/buckaroo-js-core
Expand Down
15 changes: 12 additions & 3 deletions .github/workflows/publish-storybook.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,16 +24,25 @@ jobs:
runs-on: depot-ubuntu-latest
steps:
- uses: actions/checkout@v6
- uses: pnpm/action-setup@v4
- name: Setup pnpm
uses: pnpm/action-setup@v4
with:
run_install: true
package_json_file: packages/package.json
version: 9.10.0
- name: Setup Node.js with pnpm cache
uses: actions/setup-node@v6
with:
cache: 'pnpm'
cache-dependency-path: 'packages/pnpm-lock.yaml'
- name: Install pnpm dependencies
working-directory: packages
run: pnpm install --frozen-lockfile
- uses: astral-sh/setup-uv@v7
- name: Run tests
working-directory: ./packages/buckaroo-js-core
run: |
pnpm build-storybook
- name: Deploy
if: github.ref == 'refs/heads/main'
uses: peaceiris/actions-gh-pages@v4
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,7 @@ ipydatagrid/nbextension/*
buckaroo/nbextension/*
buckaroo/labextension/*
buckaroo/static/*.js
buckaroo/static/*.js.map
buckaroo/static/*.css
docs/*.js
docs/*.js.map
Expand Down
1 change: 1 addition & 0 deletions buckaroo/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from ._version import __version__
from .buckaroo_widget import BuckarooWidget, BuckarooInfiniteWidget, AutocleaningBuckaroo
from .dataflow.widget_extension_utils import DFViewer
from .artifact import prepare_buckaroo_artifact, to_html, artifact_to_json
from .widget_utils import is_in_ipython, is_in_marimo, enable, disable, determine_jupter_env
from .read_utils import read
try:
Expand Down
202 changes: 202 additions & 0 deletions buckaroo/artifact.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
"""Static embedding artifact generation for buckaroo.

Generate self-contained artifacts from DataFrames that can be rendered
in static HTML without a notebook kernel or server.

Both df_data and summary_stats_data are serialized as parquet b64
for compact transport. The JS side decodes them via resolveDFDataAsync().
"""
import base64
import json
from io import BytesIO
from pathlib import Path

import pandas as pd

from buckaroo.serialization_utils import (
prepare_df_for_serialization,
_json_encode_cell,
)
from buckaroo.dataflow.widget_extension_utils import configure_buckaroo
from buckaroo.buckaroo_widget import BuckarooWidget


def _df_to_parquet_b64_tagged(df: pd.DataFrame) -> dict:
    """Serialize a DataFrame to a tagged parquet-b64 payload.

    Parquet bytes are produced with the pyarrow engine and base64-encoded.
    Object/category columns are JSON-encoded cell-by-cell (the same
    convention as sd_to_parquet_b64) so the JS side can decode everything
    uniformly via parseParquetRow().

    Returns
    -------
    dict
        ``{'format': 'parquet_b64', 'data': '<base64 string>'}``
    """
    prepared = prepare_df_for_serialization(df)
    if not isinstance(df.index, pd.MultiIndex):
        prepared['level_0'] = prepared['index']

    # pandas 3.0+ may back string columns with PyArrow; downcast those to
    # plain object dtype before the per-cell JSON encoding pass.
    for name in prepared.columns:
        col_dtype = prepared[name].dtype
        if (pd.api.types.is_string_dtype(col_dtype)
                and not pd.api.types.is_object_dtype(col_dtype)):
            prepared[name] = prepared[name].astype('object')

    # JSON-encode object/category cells; the index columns are skipped
    # because the JS side consumes them as-is (no JSON.parse).
    untouched = ('index', 'level_0')
    for name in prepared.select_dtypes(['object', 'category']).columns:
        if name not in untouched:
            prepared[name] = prepared[name].apply(_json_encode_cell)

    buffer = BytesIO()
    prepared.to_parquet(buffer, engine='pyarrow')
    encoded = base64.b64encode(buffer.getvalue()).decode('ascii')
    return {'format': 'parquet_b64', 'data': encoded}


def prepare_buckaroo_artifact(df, column_config_overrides=None,
                              extra_pinned_rows=None, pinned_rows=None,
                              extra_analysis_klasses=None,
                              analysis_klasses=None):
    """Generate a static artifact dict from a DataFrame.

    The artifact carries everything a buckaroo table needs to render with
    no server or kernel.  Both df_data and summary_stats_data are
    serialized as parquet b64 for compact transport.

    Parameters
    ----------
    df : pd.DataFrame, pl.DataFrame, str, or Path
        The data source.  Strings and Paths are read as files.
    column_config_overrides : dict, optional
        Column-specific display configuration overrides.
    extra_pinned_rows, pinned_rows : list, optional
        Additional or replacement pinned summary rows.
    extra_analysis_klasses, analysis_klasses : list, optional
        Additional or replacement analysis classes.

    Returns
    -------
    dict
        Keys 'df_data', 'df_viewer_config', 'summary_stats_data';
        df_data and summary_stats_data are
        ``{format: 'parquet_b64', data: '...'}`` tagged dicts.
    """
    # A str/Path argument means "load this file first".
    if isinstance(df, (str, Path)):
        df = _read_file(Path(df))

    # Pick the widget class: polars frames (eager or lazy) get the polars
    # widget; LazyFrames are collected eagerly before use.
    widget_base = BuckarooWidget
    try:
        import polars as pl
        if isinstance(df, (pl.DataFrame, pl.LazyFrame)):
            from buckaroo.polars_buckaroo import PolarsBuckarooWidget
            widget_base = PolarsBuckarooWidget
            if isinstance(df, pl.LazyFrame):
                df = df.collect()
    except ImportError:
        pass

    configured_cls = configure_buckaroo(
        widget_base,
        extra_pinned_rows=extra_pinned_rows, pinned_rows=pinned_rows,
        extra_analysis_klasses=extra_analysis_klasses,
        analysis_klasses=analysis_klasses)

    widget = configured_cls(df, column_config_overrides=column_config_overrides)

    viewer_config = widget.df_display_args['dfviewer_special']['df_viewer_config']
    # 'all_stats' is already a parquet-b64 tagged dict; pass it through.
    summary_stats = widget.df_data_dict['all_stats']

    # The widget keeps the processed data on its inner dataflow object;
    # sample + coerce to pandas, then serialize as parquet b64.
    from buckaroo.serialization_utils import force_to_pandas
    sample_df = force_to_pandas(
        widget.sampling_klass.serialize_sample(widget.dataflow.processed_df))

    return {
        'df_data': _df_to_parquet_b64_tagged(sample_df),
        'df_viewer_config': viewer_config,
        'summary_stats_data': summary_stats,
    }


def _read_file(path: Path):
"""Read a file into a DataFrame, trying polars first, then pandas."""
suffix = path.suffix.lower()
try:
import polars as pl
if suffix == '.parquet':
return pl.read_parquet(path)
elif suffix == '.csv':
return pl.read_csv(path)
elif suffix in ('.json', '.jsonl', '.ndjson'):
return pl.read_ndjson(path)
Comment on lines +136 to +137

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Read .json paths with a JSON parser in the Polars branch

The Polars path routes .json files through pl.read_ndjson, which expects newline-delimited JSON; standard JSON files (array/object form) that work in the pandas fallback will fail whenever Polars is installed. This makes prepare_buckaroo_artifact(<json path>) behavior environment-dependent and breaks valid .json inputs.

Useful? React with 👍 / 👎.

else:
return pl.read_csv(path)
except ImportError:
if suffix == '.parquet':
return pd.read_parquet(path)
elif suffix == '.csv':
return pd.read_csv(path)
elif suffix in ('.json', '.jsonl', '.ndjson'):
return pd.read_json(path, lines=(suffix in ('.jsonl', '.ndjson')))
else:
return pd.read_csv(path)


def artifact_to_json(artifact: dict) -> str:
    """Serialize an artifact dict to a JSON string.

    ``default=str`` stringifies values the json module cannot encode
    natively (e.g. numpy scalars) instead of raising.
    """
    serialized = json.dumps(artifact, default=str)
    return serialized


_HTML_TEMPLATE = """\
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>{title}</title>
<link rel="stylesheet" href="static-embed.css">
<style>
html, body {{ margin: 0; padding: 0; height: 100%; }}
#root {{ width: 100%; height: 100vh; }}
</style>
</head>
<body>
<div id="root"></div>
<script>window.__BUCKAROO_ARTIFACT__ = {artifact_json};</script>

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Escape embedded artifact JSON before injecting into script

The HTML template writes artifact_json directly into an inline <script> tag, but json.dumps does not escape </script> sequences; if a dataframe-derived field in df_viewer_config (for example a column/header name) contains that substring, the browser terminates the script early and treats the remainder as executable HTML/JS. This creates an XSS vector whenever untrusted data is rendered to static HTML.

Useful? React with 👍 / 👎.

<script type="module" src="static-embed.js"></script>
</body>
</html>
"""


def to_html(df, title="Buckaroo", **kwargs) -> str:
    """Generate an HTML string that renders a buckaroo table.

    The HTML references ``static-embed.js`` and ``static-embed.css``
    which must be served alongside it (produced by the JS build).

    Parameters
    ----------
    df : pd.DataFrame, pl.DataFrame, str, or Path
        The data source.
    title : str
        HTML page title (HTML-escaped before insertion).
    **kwargs
        Passed through to prepare_buckaroo_artifact().

    Returns
    -------
    str
        Complete HTML document string.
    """
    import html as _html

    artifact = prepare_buckaroo_artifact(df, **kwargs)
    # json.dumps does not escape "</script>": a dataframe-derived value
    # containing it would terminate the inline <script> early and let
    # untrusted data execute as HTML/JS.  "<\/" is a valid JSON escape,
    # so the payload still parses identically in the browser.
    safe_json = artifact_to_json(artifact).replace('</', '<\\/')
    return _HTML_TEMPLATE.format(
        title=_html.escape(title),
        artifact_json=safe_json,
    )
1 change: 1 addition & 0 deletions packages/buckaroo-js-core/jest.config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ export default {
"\\.(css|less|sass|scss)$": "identity-obj-proxy",
"^.+\\.svg$": "jest-transformer-svg",
"^@/(.*)$": "<rootDir>/src/$1",
"^lodash-es$": "lodash",
},

testMatch: ["!**/*.spec.ts", "**/*.test.ts", "**/*.test.tsx"],
Expand Down
13 changes: 5 additions & 8 deletions packages/buckaroo-js-core/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,10 @@
"/dist"
],
"dependencies": {
"@ag-grid-community/client-side-row-model": "^32.3.3",
"@ag-grid-community/core": "^32.3.3",
"@ag-grid-community/infinite-row-model": "^32.3.3",
"@ag-grid-community/react": "^32.3.3",
"@ag-grid-community/styles": "^32.3.3",
"@ag-grid-community/theming": "^32.3.3",
"ag-grid-community": "^35.1.0",
"ag-grid-react": "^35.1.0",
"hyparquet": "^1.8.2",
"lodash": "^4.17.21",
"lodash-es": "^4.17.21",
"recharts": "^2.13.1"
},
"devDependencies": {
Expand All @@ -55,7 +51,8 @@
"@testing-library/jest-dom": "^6.6.3",
"@testing-library/react": "^16.1.0",
"@types/jest": "^29.5.14",
"@types/lodash": "^4.17.13",
"@types/lodash-es": "^4.17.12",
"lodash": "^4.17.21",
"@types/node": "^22.15.3",
"@types/react": "^18.3.12",
"@types/react-dom": "^18.3.1",
Expand Down
37 changes: 37 additions & 0 deletions packages/buckaroo-js-core/playwright.config.static-embed.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
// Playwright configuration for the static-embed smoke tests.
// Serves the built artifacts out of buckaroo/static and runs the
// static-embed.spec.ts suite against them in Chromium.
import { defineConfig, devices } from '@playwright/test';
import * as path from 'path';
import { fileURLToPath } from 'url';

// __dirname is not defined in ESM; reconstruct it from import.meta.url.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
const PORT = 8766;
// Directory containing the static embed build output (served below).
const STATIC_DIR = path.resolve(__dirname, '../../buckaroo/static');

export default defineConfig({
testDir: './pw-tests',
testMatch: ['static-embed.spec.ts'],
// Single worker, no parallelism: the tests share one static server.
fullyParallel: false,
forbidOnly: !!process.env.CI,
retries: 0,
workers: 1,
reporter: 'html',
use: {
baseURL: `http://localhost:${PORT}`,
trace: 'on-first-retry',
...devices['Desktop Chrome'],
},
// Per-test timeout in ms.
timeout: 60_000,

projects: [
{
name: 'chromium-static-embed',
use: { ...devices['Desktop Chrome'] },
},
],

// Static file server started before the run; reused locally, fresh in CI.
webServer: {
command: `npx --yes serve -l ${PORT} ${STATIC_DIR} --no-clipboard`,
port: PORT,
reuseExistingServer: !process.env.CI,
timeout: 15_000,
},
});
Loading
Loading