From a210f2ce34d8fdb654f2089ebbac3a2ee8081842 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 4 May 2026 13:19:01 -0600 Subject: [PATCH] fix: include per-column details in exportBatch row count mismatch error When columns in a batch have mismatched row counts, the error message now reports each column's index, row count, and vector class name instead of just the distinct row counts. This helps diagnose the root cause of issues like #4211. --- .../org/apache/comet/vector/NativeUtil.scala | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/common/src/main/scala/org/apache/comet/vector/NativeUtil.scala b/common/src/main/scala/org/apache/comet/vector/NativeUtil.scala index 45245121a0..72e989839c 100644 --- a/common/src/main/scala/org/apache/comet/vector/NativeUtil.scala +++ b/common/src/main/scala/org/apache/comet/vector/NativeUtil.scala @@ -112,7 +112,7 @@ class NativeUtil { arrayAddrs: Array[Long], schemaAddrs: Array[Long], batch: ColumnarBatch): Int = { - val numRows = mutable.ArrayBuffer.empty[Int] + val numRows = mutable.ArrayBuffer.empty[(Int, Int, String)] (0 until batch.numCols()).foreach { index => batch.column(index) match { @@ -122,7 +122,7 @@ class NativeUtil { val valueVector = valuesVector.getValueVector // Use the selection vector's logical row count - numRows += selectionVector.numValues() + numRows += ((selectionVector.numValues(), index, selectionVector.getClass.getSimpleName)) val provider = if (valueVector.getField.getDictionary != null) { valuesVector.getDictionaryProvider @@ -143,7 +143,7 @@ class NativeUtil { case a: CometVector => val valueVector = a.getValueVector - numRows += valueVector.getValueCount + numRows += ((valueVector.getValueCount, index, a.getClass.getSimpleName)) val provider = if (valueVector.getField.getDictionary != null) { a.getDictionaryProvider @@ -168,9 +168,13 @@ class NativeUtil { } } - if (numRows.distinct.length > 1) { + val distinctRowCounts = numRows.map(_._1).distinct + if (distinctRowCounts.length > 1) { + val details = numRows + .map { case (rows, idx, className) => s"col[$idx]=$rows ($className)" } + .mkString(", ") throw new SparkException( - s"Number of rows in each column should be the same, but got [${numRows.distinct}]") + s"Number of rows in each column should be the same, but got [$details]") } // `ColumnarBatch.numRows` might return a different number than the actual number of rows in @@ -179,7 +183,7 @@ class NativeUtil { // logical number of rows which is less than actual number of rows due to row deletion. // Similarly, CometSelectionVector represents a different number of logical rows than the // underlying vector. - numRows.headOption.getOrElse(batch.numRows()) + numRows.headOption.map(_._1).getOrElse(batch.numRows()) } /**