Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/checks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -402,7 +402,7 @@ jobs:
name: Server Playwright Tests
needs: [BuildWheel]
runs-on: depot-ubuntu-latest
timeout-minutes: 10
timeout-minutes: 15
steps:
- uses: actions/checkout@v6
- name: Install uv
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -276,3 +276,4 @@ Have you had a good experience with this project? Why not share some love and co
We welcome [issue reports](../../issues); be sure to choose the proper issue template for your issue, so that we can be sure you're providing the necessary information.



104 changes: 56 additions & 48 deletions buckaroo/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,15 @@
import inspect
import platform
from ._version import __version__
from .buckaroo_widget import BuckarooWidget, BuckarooInfiniteWidget, AutocleaningBuckaroo
from .dataflow.widget_extension_utils import DFViewer
from .widget_utils import is_in_ipython, is_in_marimo, enable, disable, determine_jupter_env
try:
from .buckaroo_widget import BuckarooWidget, BuckarooInfiniteWidget, AutocleaningBuckaroo
from .widget_utils import is_in_ipython, is_in_marimo, enable, disable, determine_jupter_env
from .dataflow.widget_extension_utils import DFViewer
_HAS_PANDAS = True
except ImportError:

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Restrict import fallback to missing pandas only

Catching a blanket ImportError here suppresses all widget-stack import failures, not just the intended “pandas not installed” case. If any transitive import regression (or missing core dependency) raises ImportError, buckaroo will silently set _HAS_PANDAS=False and disable notebook exports/initialization instead of surfacing the real error, which makes breakages much harder to detect and debug.

Useful? React with 👍 / 👎.

# buckaroo_widget, widget_utils, and widget_extension_utils require pandas;
# skip in server-only mode
_HAS_PANDAS = False
from .read_utils import read
try:
from .file_cache.cache_utils import (
Expand All @@ -23,30 +29,31 @@



def is_notebook_compatible():
jupyter_env = determine_jupter_env()
if jupyter_env == "jupyter-notebook":
try:
if _HAS_PANDAS:
def is_notebook_compatible():
jupyter_env = determine_jupter_env()
if jupyter_env == "jupyter-notebook":
try:
import notebook
return notebook.version_info[0] >= 6
except:
pass
return False
else:
return True

def warn_on_incompatible():
if not is_notebook_compatible():
import notebook
return notebook.version_info[0] >= 6
except:
pass
return False
else:
return True
print("Buckaroo is compatible with jupyter notebook > 6, or jupyterlab >3.6.0")
print("You seem to be executing this in jupyter notebook version %r" % str(notebook.__version__))
print("You can upgrade to notebook 7 by running 'pip install --upgrade notebook'")
print("Or you can try running jupyter lab with 'jupyter lab'")

def warn_on_incompatible():
if not is_notebook_compatible():
import notebook
print("Buckaroo is compatible with jupyter notebook > 6, or jupyterlab >3.6.0")
print("You seem to be executing this in jupyter notebook version %r" % str(notebook.__version__))
print("You can upgrade to notebook 7 by running 'pip install --upgrade notebook'")
print("Or you can try running jupyter lab with 'jupyter lab'")



def debug_packages():
print("Selected Jupyter core packages...")
from .widget_utils import determine_jupter_env
jupyter_env = determine_jupter_env()
print("executing in %s " % jupyter_env)
packages = [
Expand Down Expand Up @@ -106,33 +113,34 @@ def is_running_in_mp_timeout() -> bool:
return False

has_initted = False
try:
if is_in_marimo():
print("Buckaroo has been enabled as the default DataFrame viewer. To return to default dataframe visualization use `from buckaroo.marimo_utils import marimo_unmonkeypatch; marimo_unmonkeypatch()`")
from buckaroo.marimo_utils import marimo_monkeypatch
marimo_monkeypatch()

elif is_in_ipython():
enable()
print("Buckaroo has been enabled as the default DataFrame viewer. To return to default dataframe visualization use `from buckaroo import disable; disable()`")

else:
if not is_running_in_mp_timeout() and not has_initted:
print("must be running inside ipython to enable default display via enable()")
warn_on_incompatible()
if _HAS_PANDAS:
try:
import polars
if not platform.system() == "Windows":
from buckaroo.read_utils import read, read_df
if is_in_marimo():
print("Buckaroo has been enabled as the default DataFrame viewer. To return to default dataframe visualization use `from buckaroo.marimo_utils import marimo_unmonkeypatch; marimo_unmonkeypatch()`")
from buckaroo.marimo_utils import marimo_monkeypatch
marimo_monkeypatch()

elif is_in_ipython():
enable()
print("Buckaroo has been enabled as the default DataFrame viewer. To return to default dataframe visualization use `from buckaroo import disable; disable()`")

else:
#FIXME post some error message here explaining that these features aren't available on windows
pass
if not is_running_in_mp_timeout() and not has_initted:
print("must be running inside ipython to enable default display via enable()")
warn_on_incompatible()
try:
import polars
if not platform.system() == "Windows":
from buckaroo.read_utils import read, read_df
else:
#FIXME post some error message here explaining that these features aren't available on windows
pass

except ImportError:
#polars is not installed; skip making read available as a base import
pass
except:
print("error enabling buckaroo as default display formatter for dataframes (ignore message during testing/builds")
finally:
has_initted = True
except ImportError:
#polars is not installed; skip making read available as a base import
pass
except:
print("error enabling buckaroo as default display formatter for dataframes (ignore message during testing/builds")
finally:
has_initted = True

10 changes: 5 additions & 5 deletions buckaroo/buckaroo_widget.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
from .pluggable_analysis_framework.col_analysis import ColAnalysis
from buckaroo.extension_utils import copy_extend

from .serialization_utils import EMPTY_DF_WHOLE, check_and_fix_df, pd_to_obj, to_parquet, sd_to_parquet_b64
from .serialization_utils import EMPTY_DF_WHOLE, check_and_fix_df, pd_to_obj, to_arrow_ipc, sd_to_ipc_b64
from .dataflow.dataflow import CustomizableDataflow
from .dataflow.dataflow_extras import (Sampling, exception_protect)
from .dataflow.styling_core import (ComponentConfig, DFViewerConfig, DisplayArgs, OverrideColumnConfig, PinnedRowConfig, StylingAnalysis, merge_column_config, EMPTY_DFVIEWER_CONFIG)
Expand Down Expand Up @@ -242,7 +242,7 @@ def _sd_to_jsondf(self, sd):

Exists so this can be overridden for polars/geopandas.
"""
return sd_to_parquet_b64(sd)
return sd_to_ipc_b64(sd)



Expand Down Expand Up @@ -395,11 +395,11 @@ def _handle_payload_args(self, new_payload_args):
converted_sort_column = processed_sd[sort]['orig_col_name']
sorted_df = processed_df.sort_values(by=[converted_sort_column], ascending=ascending)
slice_df = sorted_df[start:end]
self.send({ "type": "infinite_resp", 'key':new_payload_args, 'data':[], 'length':len(processed_df)}, [to_parquet(slice_df)])
self.send({ "type": "infinite_resp", 'key':new_payload_args, 'data':[], 'length':len(processed_df)}, [to_arrow_ipc(slice_df)])
else:
slice_df = processed_df[start:end]
self.send({ "type": "infinite_resp", 'key':new_payload_args,
'data': [], 'length':len(processed_df)}, [to_parquet(slice_df) ])
'data': [], 'length':len(processed_df)}, [to_arrow_ipc(slice_df) ])

second_pa = new_payload_args.get('second_request')
if not second_pa:
Expand All @@ -409,7 +409,7 @@ def _handle_payload_args(self, new_payload_args):
extra_df = processed_df[extra_start:extra_end]
self.send(
{"type": "infinite_resp", 'key':second_pa, 'data':[], 'length':len(processed_df)},
[to_parquet(extra_df)]
[to_arrow_ipc(extra_df)]
)
except Exception as e:
logger.error(e)
Expand Down
6 changes: 3 additions & 3 deletions buckaroo/customizations/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,15 +34,15 @@ def get_mode(ser):

# but in jupyterlite envs, we have a recent version of pandas
# without this problem
if not pd.api.types.is_numeric():
if not pd.api.types.is_numeric_dtype(ser):
return np.nan
mode_raw = ser.mode()
if len(mode_raw) == 0:
return np.nan
return mode_raw.values[0]

try:
if not pd.api.types.is_numeric():
if not pd.api.types.is_numeric_dtype(ser):
return np.nan
mode_raw = ser.mode()
if len(mode_raw) == 0:
Expand Down
4 changes: 2 additions & 2 deletions buckaroo/dataflow/column_executor_dataflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from buckaroo.file_cache.multiprocessing_executor import MultiprocessingExecutor
from buckaroo.file_cache.paf_column_executor import PAFColumnExecutor
from .abc_dataflow import ABCDataflow
from buckaroo.serialization_utils import sd_to_parquet_b64
from buckaroo.serialization_utils import sd_to_ipc_b64

logger = logging.getLogger("buckaroo.dataflow")

Expand Down Expand Up @@ -272,7 +272,7 @@ def _listener(note: ProgressNotification) -> None:
current_summary = self.summary_sd.copy() if self.summary_sd else {}
current_summary.update(aggregated_summary)
self.summary_sd = current_summary
self.df_data_dict = {'main': [], 'all_stats': sd_to_parquet_b64(current_summary), 'empty': []}
self.df_data_dict = {'main': [], 'all_stats': sd_to_ipc_b64(current_summary), 'empty': []}
# Update merged_sd as stats come in (important for async executors)
# Merge with existing to preserve any cached columns
current_merged = self.merged_sd.copy() if self.merged_sd else {}
Expand Down
4 changes: 2 additions & 2 deletions buckaroo/dataflow/dataflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from traitlets import Unicode, Any, observe, Dict

from buckaroo.pluggable_analysis_framework.col_analysis import ColAnalysis, SDType
from ..serialization_utils import pd_to_obj, sd_to_parquet_b64
from ..serialization_utils import pd_to_obj, sd_to_ipc_b64
from buckaroo.pluggable_analysis_framework.utils import (filter_analysis)
from buckaroo.pluggable_analysis_framework.df_stats_v2 import DfStatsV2
from .autocleaning import SentinelAutocleaning
Expand Down Expand Up @@ -420,7 +420,7 @@ def _sd_to_jsondf(self, sd:SDType):

Exists so this can be overridden for polars/geopandas.
"""
return sd_to_parquet_b64(sd)
return sd_to_ipc_b64(sd)

def _df_to_obj(self, df:pd.DataFrame) -> TDict[str, TAny]:
return pd_to_obj(self.sampling_klass.serialize_sample(df))
Expand Down
6 changes: 4 additions & 2 deletions buckaroo/df_util.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import pandas as pd
from typing import Iterable, Union, List, Tuple, Dict
from __future__ import annotations
from typing import Iterable, Union, List, Tuple, Dict, TYPE_CHECKING
from typing_extensions import TypeAlias

if TYPE_CHECKING:
import pandas as pd

ColIdentifier:TypeAlias = Union[Iterable[str], str]

Expand Down
4 changes: 2 additions & 2 deletions buckaroo/geopandas_buckaroo.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from buckaroo.customizations.styling import DefaultMainStyling, StylingAnalysis
from buckaroo.pluggable_analysis_framework.pluggable_analysis_framework import ColAnalysis
from .dataflow.dataflow_extras import (Sampling)
from buckaroo.serialization_utils import pd_to_obj, sd_to_parquet_b64
from buckaroo.serialization_utils import pd_to_obj, sd_to_ipc_b64
from buckaroo.customizations.analysis import (TypingStats)
import geopandas

Expand Down Expand Up @@ -65,7 +65,7 @@ def _sd_to_jsondf(self, sd):
temp_sd = sd.copy()
if 'index' in temp_sd:
del temp_sd['index']
return sd_to_parquet_b64(temp_sd)
return sd_to_ipc_b64(temp_sd)

class GeopandasBuckarooWidget(GeopandasBase):
pass
Expand Down
27 changes: 15 additions & 12 deletions buckaroo/lazy_infinite_polars_widget.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
import datetime
from datetime import timedelta
from typing import Any, Dict, List, Optional, Type
from io import BytesIO
from pathlib import Path
import os
import traceback
Expand All @@ -29,7 +28,7 @@
from buckaroo.styling_helpers import obj_, pinned_histogram
from .pluggable_analysis_framework.polars_analysis_management import PolarsAnalysis
from .df_util import old_col_new_col
from .serialization_utils import sd_to_parquet_b64
from .serialization_utils import sd_to_ipc_b64
from buckaroo.file_cache.base import AbstractFileCache, Executor as _SyncExec, ExecutorLog # type: ignore
from buckaroo.file_cache.multiprocessing_executor import MultiprocessingExecutor as _ParExec
from buckaroo.file_cache.cache_utils import get_global_file_cache, get_global_executor_log
Expand Down Expand Up @@ -718,8 +717,8 @@ def _listener(note):
# Ensure summary is ready for initial display (checks if computation completed synchronously)
summary_sd = self.ensure_initial_summary_for_display(initial_summary_sd)
summary_rows = self._summary_to_rows(summary_sd)
if isinstance(summary_rows, dict) and summary_rows.get('format') == 'parquet_b64':
logger.info("Initial all_stats prepared as parquet_b64, b64_len=%s", len(summary_rows.get('data', '')))
if isinstance(summary_rows, dict) and summary_rows.get('format') in ('ipc_b64', 'parquet_b64'):
logger.info("Initial all_stats prepared as %s, b64_len=%s", summary_rows.get('format'), len(summary_rows.get('data', '')))
else:
logger.info(
"Initial all_stats prepared: len=%s sample=%s",
Expand Down Expand Up @@ -765,7 +764,7 @@ def _summary_to_rows(self, summary: Dict[str, Dict[str, Any]]):
"""Convert summary dict to parquet-b64 tagged payload (or JSON fallback)."""
if not summary:
return []
return sd_to_parquet_b64(summary)
return sd_to_ipc_b64(summary)

# selection and retry now delegated to dataflow
def _build_column_config(self, summary: Dict[str, Dict[str, Any]]) -> List[Dict[str, Any]]:
Expand All @@ -792,11 +791,15 @@ def _prepare_df_for_serialization(self, df: pl.DataFrame) -> pl.DataFrame:
select_clauses.append(pl.col(orig).alias(rw))
return df.select(select_clauses)

def _to_parquet(self, df: pl.DataFrame) -> bytes:
out = BytesIO()
self._prepare_df_for_serialization(df).write_parquet(out, compression='uncompressed')
out.seek(0)
return out.read()
def _to_arrow_ipc(self, df: pl.DataFrame) -> bytes:
import pyarrow as pa
import pyarrow.ipc as ipc
table = self._prepare_df_for_serialization(df).to_arrow()
sink = pa.BufferOutputStream()
writer = ipc.new_stream(sink, table.schema)
writer.write_table(table)
writer.close()
return sink.getvalue().to_pybytes()

def _handle_payload_args(self, new_payload_args: Dict[str, Any]) -> None:
start, end = new_payload_args.get('start', 0), new_payload_args.get('end', 0)
Expand Down Expand Up @@ -831,7 +834,7 @@ def _handle_payload_args(self, new_payload_args: Dict[str, Any]) -> None:
start, end, len(slice_df), self.df_meta['total_rows']
)
self.send({"type": "infinite_resp", 'key': new_payload_args, 'data': [], 'length': self.df_meta['total_rows']},
[self._to_parquet(slice_df)])
[self._to_arrow_ipc(slice_df)])

second_pa = new_payload_args.get('second_request')
if second_pa:
Expand All @@ -847,7 +850,7 @@ def _handle_payload_args(self, new_payload_args: Dict[str, Any]) -> None:
s2, e2, len(slice2), self.df_meta['total_rows']
)
self.send({"type": "infinite_resp", 'key': second_pa, 'data': [], 'length': self.df_meta['total_rows']},
[self._to_parquet(slice2)])
[self._to_arrow_ipc(slice2)])
except Exception as e:
stack_trace = traceback.format_exc()
self.send({"type": "infinite_resp", 'key': new_payload_args, 'data': [], 'error_info': stack_trace, 'length': 0}, [])
Expand Down
Loading