From fcb108997fb15395106f9cf356ea6f84a25100e1 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 17 Feb 2026 14:30:37 +0000 Subject: [PATCH 1/3] Initial plan From 0fd4ebdc2958e6c07159f1ad678bb8d24afd3d85 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 17 Feb 2026 14:37:59 +0000 Subject: [PATCH 2/3] Fix sampling for Array columns with List or nested Array inner types Co-authored-by: borchero <22455425+borchero@users.noreply.github.com> --- dataframely/columns/array.py | 19 ++++++++++++++++++- tests/columns/test_sample.py | 18 ++++++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/dataframely/columns/array.py b/dataframely/columns/array.py index e1004493..36285215 100644 --- a/dataframely/columns/array.py +++ b/dataframely/columns/array.py @@ -126,9 +126,26 @@ def _sample_unchecked(self, generator: Generator, n: int) -> pl.Series: n_elements = n * math.prod(self.shape) all_elements = self.inner.sample(generator, n_elements) + # For complex inner types (List, Array), reshape doesn't work properly + # because it tries to reshape based on primitive element count, not the + # count of complex elements. We need to manually chunk and construct. + inner_dtype = self.inner.dtype + if isinstance(inner_dtype, (pl.List, pl.Array)): + # Chunk the elements into groups of shape size + chunk_size = math.prod(self.shape) + chunks = [] + for i in range(n): + start = i * chunk_size + chunk = all_elements.slice(start, chunk_size).to_list() + chunks.append(chunk) + result = pl.Series(chunks, dtype=pl.Array(inner_dtype, self.shape)) + else: + # For scalar and struct types, reshape works correctly + result = all_elements.reshape((n, *self.shape)) + # Finally, apply a null mask return generator._apply_null_mask( - all_elements.reshape((n, *self.shape)), + result, null_probability=self._null_probability, ) diff --git a/tests/columns/test_sample.py b/tests/columns/test_sample.py index 2c51d1c1..245e0f38 100644 --- a/tests/columns/test_sample.py +++ b/tests/columns/test_sample.py @@ -199,6 +199,24 @@ def test_sample_struct(generator: Generator) -> None: assert len(samples) == 10_000 +@pytest.mark.parametrize(("arr_size", "n_samples"), [(1, 1), (2, 1), (3, 2), (2, 10)]) +def test_sample_array_list(arr_size: int, n_samples: int, generator: Generator) -> None: + """Test sampling for Array(List(...)) columns.""" + column = dy.Array(dy.List(dy.Bool()), arr_size) + samples = sample_and_validate(column, generator, n=n_samples) + assert len(samples) == n_samples + + +def test_sample_nested_array(generator: Generator) -> None: + """Test sampling for Array(Array(...)) columns.""" + column = dy.Array(dy.Array(dy.Int64(), 2), 3) + samples = sample_and_validate(column, generator, n=10) + assert len(samples) == 10 + # Check that the shape is correct (accounting for nulls) + non_null_lengths = samples.arr.len().drop_nulls() + assert all(non_null_lengths == 3) + + # --------------------------------------- UTILS -------------------------------------- # From 4a8f138f66d1c4f0400f7e33a7696be0fae60f47 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 17 Feb 2026 14:46:52 +0000 Subject: [PATCH 3/3] Use self.dtype instead of reconstructing dtype in Array sampling Co-authored-by: borchero <22455425+borchero@users.noreply.github.com> --- dataframely/columns/array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataframely/columns/array.py b/dataframely/columns/array.py index 36285215..5d582a3d 100644 --- a/dataframely/columns/array.py +++ b/dataframely/columns/array.py @@ -138,7 +138,7 @@ def _sample_unchecked(self, generator: Generator, n: int) -> pl.Series: start = i * chunk_size chunk = all_elements.slice(start, chunk_size).to_list() chunks.append(chunk) - result = pl.Series(chunks, dtype=pl.Array(inner_dtype, self.shape)) + result = pl.Series(chunks, dtype=self.dtype) else: # For scalar and struct types, reshape works correctly result = all_elements.reshape((n, *self.shape))