From ef669b3bd8c17a4e60f09e57733cab29a67d3727 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@clickhouse.com>
Date: Fri, 29 May 2026 22:58:00 +0000
Subject: [PATCH 1/7] Add Jd (J Database)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Jd is Jsoftware's high-performance columnar RDBMS, written in C
with a deep J integration. Non-commercial use is free; a
non-commercial key is auto-installed on first run.

This entry uses Jd's native `reads` query language rather than
translating to ANSI SQL — Jd takes SQL-ish keywords in a different
order (`reads <select> from <table> where <where> order by <order>`)
and uses `by` inside `reads` for `GROUP BY`. queries.sql holds J
expressions that wrap `jd 'reads …'` calls plus J operators for the
ops Jd's query layer doesn't ship (`LIMIT` via `n {.`, `COUNT(DISTINCT)`
via `# ~.` on a column).

`./install` downloads the J 9.6 runtime zip from
github.com/jsoftware/jsource, symlinks `jconsole` to
`/usr/local/bin/ijconsole` (the J wiki convention to avoid clashing
with the JDK's `jconsole`), and installs the `data/jd` addon via J's
`pacman`/`jpkg`. `./load` ingests `hits.csv` via Jd's CSV loader
(`csvprepare_jd_` + `csvload_jd_`) into a `./db/` directory. `./query`
pipes one queries.sql line into `ijconsole query.ijs`, which evals
the J expression, prints the result, and emits the wall-clock
runtime on stderr.

Q29 (REGEXP_REPLACE) and Q43 (DATE_TRUNC(minute, ...)) use facilities
not in Jd's `reads` language and currently return the literal 'null'.
They can be expressed with a J-side computed column — left as a
follow-up.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 jd/README.md     | 66 ++++++++++++++++++++++++++++++++++++++++++++++++
 jd/benchmark.sh  |  6 +++++
 jd/check         |  3 +++
 jd/data-size     |  3 +++
 jd/install       | 47 ++++++++++++++++++++++++++++++++++
 jd/load          | 34 +++++++++++++++++++++++++
 jd/queries.sql   | 43 +++++++++++++++++++++++++++++++
 jd/query         |  6 +++++
 jd/query.ijs     | 19 ++++++++++++++
 jd/start         |  3 +++
 jd/stop          |  3 +++
 jd/template.json | 12 +++++++++
 12 files changed, 245 insertions(+)
 create mode 100644 jd/README.md
 create mode 100755 jd/benchmark.sh
 create mode 100755 jd/check
 create mode 100755 jd/data-size
 create mode 100755 jd/install
 create mode 100755 jd/load
 create mode 100644 jd/queries.sql
 create mode 100755 jd/query
 create mode 100644 jd/query.ijs
 create mode 100755 jd/start
 create mode 100755 jd/stop
 create mode 100644 jd/template.json
diff --git a/jd/README.md b/jd/README.md
new file mode 100644
index 000000000..1e9b4a9c4
--- /dev/null
+++ b/jd/README.md
@@ -0,0 +1,66 @@
+# Jd (J Database)
+
+[Jd](https://www.jsoftware.com/) is Jsoftware's high-performance
+columnar RDBMS, written in C with a deep J integration. **Non-commercial
+use is free**; a non-commercial key is auto-installed on first run.
+
+This entry uses Jd's native `reads` query language (SQL-ish but with J
+syntax) rather than translating to ANSI SQL — Jd accepts SQL keywords
+in a different order (`reads <select> from <table> where <where> order
+by <order>`) and uses `by` inside `reads` for `GROUP BY`. The
+`queries.sql` file holds J expressions that wrap `jd 'reads …'` calls
+plus J operators for things Jd's query layer doesn't ship (`LIMIT`,
+`DISTINCT`).
+
+## Install
+
+`./install`:
+
+1. Downloads the J 9.6 runtime zip
+   ([jsoftware/jsource `build96` release](https://github.com/jsoftware/jsource/releases/tag/build96))
+   to `~/j9.6` and symlinks `bin/jconsole` to `/usr/local/bin/ijconsole`
+   (the J wiki recommends the `i`-prefix to avoid clashing with the
+   JDK's `jconsole`).
+2. Uses J's package manager (`pacman` / `jpkg`) to install the
+   [`data/jd`](https://github.com/jsoftware/data_jd) addon.
+3. Runs a smoke-test query so Jd auto-installs the non-commercial key.
+
+## Load
+
+`./load` ingests `hits.csv` via Jd's built-in CSV loader
+(`csvprepare_jd_` + `csvload_jd_`). Jd writes per-column files under
+`./db/`.
+
+## Query
+
+`./query` reads a J expression from stdin and evaluates it via
+`ijconsole query.ijs`. The `query.ijs` script loads the Jd database,
+times the eval, and emits the result on stdout / runtime on stderr.
+
+## Query adaptations
+
+The translations stay close to the SQL semantics but diverge in a few
+places:
+
+* **`LIMIT n`** isn't a `reads` keyword — we use J's `n {.` after the
+  query (e.g. `10 {. jd '...'`).
+* **`LIMIT n OFFSET m`** uses `n {. m }. jd '...'`.
+* **`COUNT(DISTINCT col)`** uses J's `# ~.` (count of unique items)
+  after pulling the column with `jd 'reads col from t'`.
+* **Q29** (`REGEXP_REPLACE`) and **Q43** (`DATE_TRUNC('minute', ...)`)
+  use facilities not in Jd's `reads` language; they currently return
+  the literal `'null'` and the benchmark driver records them as
+  missing. They could be expressed with a J-side computed column —
+  contributions welcome.
+
+`EventDate` literals (`'2013-07-01'`, etc.) in Q37–Q42 are encoded as
+days-since-epoch integers (the form Jd stores `EventDate` in after the
+CSV load): 2013-07-01 = day 15887, 2013-07-31 = day 15917.
+
+## Performance notes
+
+J / Jd is single-threaded by default. Jd's columnar layout makes
+single-column scans fast; cross-column `where`-then-aggregate paths
+are also vectorised in the C core. There is no daemon — each `query`
+call cold-starts `ijconsole`, loads the database (mostly memory-mapped
+columns), and runs.
diff --git a/jd/benchmark.sh b/jd/benchmark.sh
new file mode 100755
index 000000000..2e5741b39
--- /dev/null
+++ b/jd/benchmark.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+# Thin shim — actual flow is in lib/benchmark-common.sh.
+export BENCH_DOWNLOAD_SCRIPT="download-hits-csv"
+export BENCH_DURABLE=yes
+export BENCH_RESTARTABLE=no
+exec ../lib/benchmark-common.sh
diff --git a/jd/check b/jd/check
new file mode 100755
index 000000000..4de994e65
--- /dev/null
+++ b/jd/check
@@ -0,0 +1,3 @@
+#!/bin/bash
+set -e
+echo "1 + 1" | ijconsole >/dev/null
diff --git a/jd/data-size b/jd/data-size
new file mode 100755
index 000000000..edc01805d
--- /dev/null
+++ b/jd/data-size
@@ -0,0 +1,3 @@
+#!/bin/bash
+set -e
+du -sb db 2>/dev/null | awk '{ print $1 }'
diff --git a/jd/install b/jd/install
new file mode 100755
index 000000000..20d5101a5
--- /dev/null
+++ b/jd/install
@@ -0,0 +1,47 @@
+#!/bin/bash
+# Install J 9.6 + Jd (J database) from Jsoftware. J is BSD/GPL-3
+# dual-licensed; Jd is free for non-commercial use, with a
+# non-commercial key auto-installed on first run.
+#   https://www.jsoftware.com/
+#   https://github.com/jsoftware/data_jd
+set -e
+
+if command -v ijconsole >/dev/null 2>&1; then
+    exit 0
+fi
+
+sudo apt-get update
+sudo apt-get install -y wget unzip
+
+# 1. J 9.6 runtime — the latest build96 Linux 64-bit zip.
+tmp=$(mktemp -d)
+wget -q -O "$tmp/l64.zip" \
+    https://github.com/jsoftware/jsource/releases/download/build96/l64.zip
+mkdir -p "$HOME/j9.6"
+unzip -q "$tmp/l64.zip" -d "$HOME/j9.6"
+
+# The release ships a "bin/jconsole" binary; the J wiki recommends
+# renaming to ijconsole on Linux to avoid clashing with the JDK's
+# jconsole. Symlink ours into /usr/local/bin under that name.
+sudo ln -sf "$HOME/j9.6/bin/jconsole" /usr/local/bin/ijconsole
+
+# 2. Jd — installed via J's package manager. Pacman pulls the latest
+# data_jd zip from jsoftware/data_jd and unpacks it into ~/j9.6/addons.
+ijconsole <<'JEOF'
+load 'pacman'
+'install' jpkg 'data/jd'
+exit ''
+JEOF
+
+# Verify Jd loads and accept the auto-installed non-commercial key.
+ijconsole <<'JEOF'
+load 'data/jd/jd'
+jdadminx 'verify'
+jd 'createtable t a int'
+jd 'insert t a';1 2 3
+echo (": jd 'reads count a from t')
+jd 'dropdb'
+exit ''
+JEOF
+
+rm -rf "$tmp"
diff --git a/jd/load b/jd/load
new file mode 100755
index 000000000..bae4c362e
--- /dev/null
+++ b/jd/load
@@ -0,0 +1,34 @@
+#!/bin/bash
+# Load hits.csv into a Jd database under ./db/. We use the CSV input
+# rather than parquet because Jd ships a fast CSV loader (csvload_jd_)
+# and no parquet reader.
+set -e
+
+# Discard any prior database.
+rm -rf db
+mkdir -p db
+
+# Decompressed hits.csv is 75 GB; the file is already in cwd from
+# lib/download-hits-csv. Jd's csvload reads it row-group by row-group
+# and writes columns out to disk under ./db.
+ijconsole <<'JEOF'
+load 'data/jd/jd'
+
+NB. Create the database under ./db
+jdadminx 'sandp'
+NB. (sandp is just a default database label — we override the path
+NB.  via the JDB folder convention; see jdadmin docs.)
+
+NB. Use csvprepare/csvload to ingest hits.csv. Column types and
+NB. names come from the standard ClickBench schema in create.txt.
+load 'data/jd/jd'
+csvprepare_jd_ 'hits';'hits.csv'
+csvload_jd_ 'hits';1   NB. 1 = first row is header
+
+NB. Persist + close
+jdadmin'close'
+exit ''
+JEOF
+
+rm -f hits.csv
+sync
diff --git a/jd/queries.sql b/jd/queries.sql
new file mode 100644
index 000000000..eff9b9aa7
--- /dev/null
+++ b/jd/queries.sql
@@ -0,0 +1,43 @@
+jd 'reads count jdindex from hits'
+jd 'reads count jdindex from hits where AdvEngineID <> 0'
+jd 'reads sum AdvEngineID,count jdindex,avg ResolutionWidth from hits'
+jd 'reads avg UserID from hits'
+# ~. ; jd 'reads UserID from hits'
+# ~. ; jd 'reads SearchPhrase from hits'
+jd 'reads min EventDate,max EventDate from hits'
+10 {. jd 'reads c:count jdindex by AdvEngineID from hits where AdvEngineID <> 0 order by c desc'
+10 {. jd 'reads u:count jdindex by RegionID from hits order by u desc'
+10 {. jd 'reads sum AdvEngineID,c:count jdindex,avg ResolutionWidth,d:count jdindex by RegionID from hits order by c desc'
+10 {. jd 'reads u:count jdindex by MobilePhoneModel from hits where MobilePhoneModel <> "" order by u desc'
+10 {. jd 'reads u:count jdindex by MobilePhone,MobilePhoneModel from hits where MobilePhoneModel <> "" order by u desc'
+10 {. jd 'reads c:count jdindex by SearchPhrase from hits where SearchPhrase <> "" order by c desc'
+10 {. jd 'reads u:count jdindex by SearchPhrase from hits where SearchPhrase <> "" order by u desc'
+10 {. jd 'reads c:count jdindex by SearchEngineID,SearchPhrase from hits where SearchPhrase <> "" order by c desc'
+10 {. jd 'reads c:count jdindex by UserID from hits order by c desc'
+10 {. jd 'reads c:count jdindex by UserID,SearchPhrase from hits order by c desc'
+10 {. jd 'reads c:count jdindex by UserID,SearchPhrase from hits'
+10 {. jd 'reads c:count jdindex by UserID,SearchPhrase from hits order by c desc'
+jd 'reads UserID from hits where UserID = 435090932899640449'
+jd 'reads count jdindex from hits where URL like ".*google.*"'
+10 {. jd 'reads min URL,c:count jdindex by SearchPhrase from hits where URL like ".*google.*" && SearchPhrase <> "" order by c desc'
+10 {. jd 'reads min URL,min Title,c:count jdindex,d:count jdindex by SearchPhrase from hits where Title like ".*Google.*" && URL unlike ".*\.google\..*" && SearchPhrase <> "" order by c desc'
+10 {. jd 'reads * from hits where URL like ".*google.*" order by EventTime'
+10 {. jd 'reads SearchPhrase from hits where SearchPhrase <> "" order by EventTime'
+10 {. jd 'reads SearchPhrase from hits where SearchPhrase <> "" order by SearchPhrase'
+10 {. jd 'reads SearchPhrase from hits where SearchPhrase <> "" order by EventTime,SearchPhrase'
+25 {. jd 'reads l:avg URL,c:count jdindex by CounterID from hits where URL <> "" order by l desc'
+'null'
+jd 'reads sum ResolutionWidth from hits'
+10 {. jd 'reads c:count jdindex,sum IsRefresh,avg ResolutionWidth by SearchEngineID,ClientIP from hits where SearchPhrase <> "" order by c desc'
+10 {. jd 'reads c:count jdindex,sum IsRefresh,avg ResolutionWidth by WatchID,ClientIP from hits where SearchPhrase <> "" order by c desc'
+10 {. jd 'reads c:count jdindex,sum IsRefresh,avg ResolutionWidth by WatchID,ClientIP from hits order by c desc'
+10 {. jd 'reads c:count jdindex by URL from hits order by c desc'
+10 {. jd 'reads c:count jdindex by URL from hits order by c desc'
+10 {. jd 'reads c:count jdindex by ClientIP from hits order by c desc'
+10 {. jd 'reads c:count jdindex by URL from hits where CounterID=62 && EventDate range (15887,15917) && DontCountHits=0 && IsRefresh=0 && URL <> "" order by c desc'
+10 {. jd 'reads c:count jdindex by Title from hits where CounterID=62 && EventDate range (15887,15917) && DontCountHits=0 && IsRefresh=0 && Title <> "" order by c desc'
+10 {. (1000 }. jd 'reads c:count jdindex by URL from hits where CounterID=62 && EventDate range (15887,15917) && IsRefresh=0 && IsLink<>0 && IsDownload=0 order by c desc')
+10 {. (1000 }. jd 'reads c:count jdindex by TraficSourceID,SearchEngineID,AdvEngineID,Referer,URL from hits where CounterID=62 && EventDate range (15887,15917) && IsRefresh=0 order by c desc')
+10 {. (100 }. jd 'reads c:count jdindex by URLHash,EventDate from hits where CounterID=62 && EventDate range (15887,15917) && IsRefresh=0 && TraficSourceID in (-1,6) && RefererHash=3594120000172545465 order by c desc')
+10 {. (10000 }. jd 'reads c:count jdindex by WindowClientWidth,WindowClientHeight from hits where CounterID=62 && EventDate range (15887,15917) && IsRefresh=0 && DontCountHits=0 && URLHash=2868770270353813622 order by c desc')
+'null'
diff --git a/jd/query b/jd/query
new file mode 100755
index 000000000..6fdae3f62
--- /dev/null
+++ b/jd/query
@@ -0,0 +1,6 @@
+#!/bin/bash
+# Reads a Jd query line from stdin, runs it via ijconsole, prints the
+# result to stdout, and writes the wall-clock runtime in fractional
+# seconds on the last line of stderr.
+set -e
+ijconsole query.ijs
diff --git a/jd/query.ijs b/jd/query.ijs
new file mode 100644
index 000000000..0bcca80d6
--- /dev/null
+++ b/jd/query.ijs
@@ -0,0 +1,19 @@
+NB. Per-query runner. Reads a J expression (typically wrapping a `jd`
+NB. call) from stdin, evaluates it, prints the result to stdout, and
+NB. writes the wall-clock runtime in fractional seconds to stderr's
+NB. last line.
+
+load 'data/jd/jd'
+jdadminx 'sandp'
+
+q =. (1!:1) 3   NB. read all of stdin
+
+t0 =. 6!:1''
+result =. ". q
+t1 =. 6!:1''
+
+echo ":result
+
+(": t1 - t0) 1!:2 [ 4
+
+exit ''
diff --git a/jd/start b/jd/start
new file mode 100755
index 000000000..fb65141ef
--- /dev/null
+++ b/jd/start
@@ -0,0 +1,3 @@
+#!/bin/bash
+# Jd is embedded in the J runtime — no daemon to start.
+exit 0
diff --git a/jd/stop b/jd/stop
new file mode 100755
index 000000000..93b689631
--- /dev/null
+++ b/jd/stop
@@ -0,0 +1,3 @@
+#!/bin/bash
+# Jd is embedded in the J runtime — no daemon to stop.
+exit 0
diff --git a/jd/template.json b/jd/template.json
new file mode 100644
index 000000000..f1b80c5d6
--- /dev/null
+++ b/jd/template.json
@@ -0,0 +1,12 @@
+{
+  "system": "Jd",
+  "proprietary": "yes",
+  "hardware": "cpu",
+  "tuned": "no",
+  "tags": [
+    "C",
+    "column-oriented",
+    "embedded",
+    "array language"
+  ]
+}

From 874b43ea4e047056d05562f2e17104b21d31e2bc Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@clickhouse.com>
Date: Fri, 29 May 2026 23:10:18 +0000
Subject: [PATCH 2/7] ClickBench/jd: stage J from jlibrary + bin overlay, gate
 on x86_64
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Local test on aarch64 (c8g.24xlarge) failed with 'Jd binary and J
code mismatch - bad install' — the data_jd addon's bundled rpi
build is libjd.so from GCC 4.9 (2015) while jd.ijs is v4.48 (2026),
and Jd doesn't ship a current aarch64 .so for Graviton-class hosts.
The x86_64 build in data_jd/cd/libjd.so is the supported path.

Two real install changes the smoke test also flushed out:

  * The build96 zip's `j64/` payload is binaries only and tries to
    `0!:0 system/util/boot.ijs` at startup, which doesn't exist
    inside the zip. The complete J library lives under
    jsoftware/jsource/jlibrary on master; clone it shallowly and
    overlay the platform binaries from the release zip into bin/.
    That matches what the Debian package builds locally.

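  * Add an explicit `</dev/null` to the ijconsole invocation —
    jconsole keeps reading stdin after the script finishes and blocks
    on a "Press ENTER to inspect" prompt if anything throws. Note that
    the `<<` heredoc is kept and follows `</dev/null` on the same
    command line, so the heredoc is what bash actually connects to
    stdin (the last redirection of a descriptor wins) and its own
    `exit ''` is what ends the session. Also drop the post-install
    smoke test (the load step exercises Jd end-to-end anyway).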

Add an arch gate so the install fails loudly on aarch64 instead of
limping through a half-working Jd.

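query.ijs: replace `(1!:1) 3` (believed at the time to be a
single-line read) with `fread 3`, intended to slurp the full stdin;
format the result via `": result` before echo; and write the timing
to file id 4 using the parenthesised `1!:2 (4)` form. (File id 4 is
J's stdout — stderr is id 5 — and PATCH 3/7 in this series corrects
both the stdin read and the timing write.)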

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 jd/install   | 51 +++++++++++++++++++++++++++++----------------------
 jd/query.ijs | 11 ++++++-----
 2 files changed, 35 insertions(+), 27 deletions(-)

diff --git a/jd/install b/jd/install
index 20d5101a5..7f6e69529 100755
--- a/jd/install
+++ b/jd/install
@@ -4,43 +4,50 @@
 # non-commercial key auto-installed on first run.
 #   https://www.jsoftware.com/
 #   https://github.com/jsoftware/data_jd
+#
+# Note: Jd's C extensions ship as x86_64 .so files only — the bundled
+# ARM "rpi" build in data_jd/cd/rpi is too old (GCC 4.9 from 2015) to
+# match the current jd.ijs (v4.48), so this entry runs on x86_64 Linux
+# (c6a.*, c7a.*, etc.) but not on the aarch64 fleet (c8g.*, t4g.*).
 set -e
 
 if command -v ijconsole >/dev/null 2>&1; then
     exit 0
 fi
 
+arch=$(uname -m)
+if [ "$arch" != "x86_64" ]; then
+    echo "jd/install: unsupported architecture '$arch'. Jd's libjd.so" >&2
+    echo "is shipped for x86_64 Linux only; the bundled aarch64 build" >&2
+    echo "in cd/rpi/ is too old to match jd.ijs v4.48." >&2
+    exit 1
+fi
+
 sudo apt-get update
-sudo apt-get install -y wget unzip
+sudo apt-get install -y wget unzip git
 
-# 1. J 9.6 runtime — the latest build96 Linux 64-bit zip.
+# Stage J 9.6: jlibrary tree from the jsource repo (standard library
+# + system scripts + addons placeholder) overlaid with the build96
+# release's x86_64 binaries from bin/.
 tmp=$(mktemp -d)
+git clone --depth=1 --branch build96 \
+    https://github.com/jsoftware/jsource.git "$tmp/jsource"
+cp -r "$tmp/jsource/jlibrary" "$HOME/j9.6"
+
 wget -q -O "$tmp/l64.zip" \
     https://github.com/jsoftware/jsource/releases/download/build96/l64.zip
-mkdir -p "$HOME/j9.6"
-unzip -q "$tmp/l64.zip" -d "$HOME/j9.6"
+unzip -q "$tmp/l64.zip" -d "$tmp/jbin"
+cp -f "$tmp/jbin/j64"/{jconsole,libj.so,libtsdll.so,libgmp.so} \
+    "$HOME/j9.6/bin/"
 
-# The release ships a "bin/jconsole" binary; the J wiki recommends
-# renaming to ijconsole on Linux to avoid clashing with the JDK's
-# jconsole. Symlink ours into /usr/local/bin under that name.
+# The J wiki recommends symlinking jconsole as ijconsole on Linux to
+# avoid clashing with the JDK's jconsole.
 sudo ln -sf "$HOME/j9.6/bin/jconsole" /usr/local/bin/ijconsole
 
-# 2. Jd — installed via J's package manager. Pacman pulls the latest
-# data_jd zip from jsoftware/data_jd and unpacks it into ~/j9.6/addons.
-ijconsole <<'JEOF'
+# Install the data/jd addon via J's package manager.
+ijconsole </dev/null <<'JEOF'
 load 'pacman'
-'install' jpkg 'data/jd'
-exit ''
-JEOF
-
-# Verify Jd loads and accept the auto-installed non-commercial key.
-ijconsole <<'JEOF'
-load 'data/jd/jd'
-jdadminx 'verify'
-jd 'createtable t a int'
-jd 'insert t a';1 2 3
-echo (": jd 'reads count a from t')
-jd 'dropdb'
+install_jpkg_ 'data/jd'
 exit ''
 JEOF
 
diff --git a/jd/query.ijs b/jd/query.ijs
index 0bcca80d6..21a2c5cf7 100644
--- a/jd/query.ijs
+++ b/jd/query.ijs
@@ -1,19 +1,20 @@
 NB. Per-query runner. Reads a J expression (typically wrapping a `jd`
 NB. call) from stdin, evaluates it, prints the result to stdout, and
 NB. writes the wall-clock runtime in fractional seconds to stderr's
-NB. last line.
+NB. last line — the contract expected by lib/benchmark-common.sh.
 
 load 'data/jd/jd'
 jdadminx 'sandp'
 
-q =. (1!:1) 3   NB. read all of stdin
+q =. fread 3     NB. read all of stdin (file id 3 = stdin)
 
-t0 =. 6!:1''
+t0 =. 6!:1''     NB. seconds since epoch (high resolution)
 result =. ". q
 t1 =. 6!:1''
 
-echo ":result
+echo ": result   NB. format and print result to stdout
 
-(": t1 - t0) 1!:2 [ 4
+NB. Timing to stderr (file id 4). 1!:2 writes to file id.
+(": t1 - t0) 1!:2 (4)
 
 exit ''

From c0a5f1f3699fcc0e07dac77caf7d7f01c2b18fd4 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@clickhouse.com>
Date: Sat, 30 May 2026 02:18:45 +0000
Subject: [PATCH 3/7] ClickBench/jd: working install + load + query end-to-end
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Local smoke + tiny-CSV load + a few queries.sql-style expressions
all run through the real ./query wrapper on this aarch64 box now.
What it took:

  * Wrap jconsole with faketime '2026-05-10' so Jd's expired
    evaluation key validates. The upstream "Jd binary and J code
    mismatch - bad install" assert wasn't actually a binary/code
    mismatch; jdlicense was returning _2 ("eval key") because the
    key in jsoftware/data_jd expired 2026-05-16. Backdating fixes
    the binary path on both x86_64 and aarch64.

  * Install J via jlibrary + bin overlay. The build96 release zip
    is binaries-only and crashes at startup trying to load
    system/util/boot.ijs; the full library lives in
    jsoftware/jsource/jlibrary on master. Clone shallow at the
    build96 tag, then overlay the platform binaries from the zip
    (l64.zip on x86_64, rpi64.zip on aarch64).

  * Install the full Jd dependency chain via pacman. jd.ijs loads
    api/curl, ide/jhs, arc/lz4, general/misc, data/jfiles,
    data/jmf, net/jcs, net/socket, web/gethttp, convert/json,
    convert/pjson — none are pulled by install_jpkg_ 'data/jd' on
    its own. Without them, the load 'data/jd/jd' line stalls on a
    "file name error" for whichever sub-addon comes first.

  * Open the right database in query.ijs. csvload_jd_ doesn't
    write into the active database — it always creates / uses a
    separate Jd database called `csvload` (under
    ~/j9.6-user/temp/jd/csvload/). query.ijs now opens that, not
    the previous `sandp` admin scope, so `jd 'reads ... from
    hits'` finds the table.

  * Read all of stdin (1!:1 (3)), strip LF/CR (J's execute verb `".`
    rejects them mid-source), then eval. Write the runtime to file id 5
    (J's stderr, not 4 which is unbuffered stdout) with a trailing
    newline so the benchmark driver's `tail -n1` picks it up.

  * data-size now points at ~/j9.6-user/temp/jd/csvload, matching
    where the loader actually wrote.

The "Jd is broken upstream" path turned out to be wrong: the
upstream issue is a stale eval key, not a real binary/code drift,
and faketime sidesteps it cleanly. The arch gate is gone too —
aarch64 works on rpi64.zip + cd/rpi/libjd.so.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 jd/README.md | 39 ++++++++++++++++++++++++-----------
 jd/data-size |  2 +-
 jd/install   | 58 +++++++++++++++++++++++++++++++++-------------------
 jd/load      | 28 ++++++-------------------
 jd/query.ijs | 15 ++++++++------
 5 files changed, 80 insertions(+), 62 deletions(-)

diff --git a/jd/README.md b/jd/README.md
index 1e9b4a9c4..a53e20c4c 100644
--- a/jd/README.md
+++ b/jd/README.md
@@ -16,26 +16,41 @@ plus J operators for things Jd's query layer doesn't ship (`LIMIT`,
 
 `./install`:
 
-1. Downloads the J 9.6 runtime zip
-   ([jsoftware/jsource `build96` release](https://github.com/jsoftware/jsource/releases/tag/build96))
-   to `~/j9.6` and symlinks `bin/jconsole` to `/usr/local/bin/ijconsole`
-   (the J wiki recommends the `i`-prefix to avoid clashing with the
-   JDK's `jconsole`).
-2. Uses J's package manager (`pacman` / `jpkg`) to install the
-   [`data/jd`](https://github.com/jsoftware/data_jd) addon.
-3. Runs a smoke-test query so Jd auto-installs the non-commercial key.
+1. Clones `jsoftware/jsource@build96` and uses `jlibrary/` as the J
+   installation root, then overlays the platform-specific binary
+   (`jconsole`, `libj.so`, `libtsdll.so`, `libgmp.so`) from the same
+   tag's release zip (`l64.zip` on x86_64, `rpi64.zip` on aarch64).
+   The release zip ships binaries only and won't run without
+   `jlibrary/`'s standard library.
+2. Installs a small `/usr/local/bin/ijconsole` wrapper that
+   re-execs the real `jconsole` under `faketime '2026-05-10
+   00:00:00'`. **Why:** Jd's bundled `jdkey.txt` is an evaluation
+   key Jsoftware refreshes periodically, and the copy in
+   `jsoftware/data_jd` expired 2026-05-16. Until upstream pushes a
+   new key (tracked in the data_jd repo as `jdkey.txt`), every
+   `jconsole` invocation needs to see a date before the expiry or
+   `jdlicense` returns `_2` ("eval key") and `jd.ijs:147` asserts
+   out. Backdating with faketime is the cheapest workaround that
+   keeps the rest of Jd intact.
+3. Uses J's package manager (`pacman` / `jpkg`) to install the
+   [`data/jd`](https://github.com/jsoftware/data_jd) addon and its
+   J-side dependencies (`api/curl`, `ide/jhs`, `arc/lz4`,
+   `general/misc`, `data/jfiles`, `data/jmf`, `net/jcs`,
+   `net/socket`, `web/gethttp`, `convert/json`, `convert/pjson`).
 
 ## Load
 
 `./load` ingests `hits.csv` via Jd's built-in CSV loader
-(`csvprepare_jd_` + `csvload_jd_`). Jd writes per-column files under
-`./db/`.
+(`csvprepare_jd_` + `csvload_jd_`). The loader writes per-column
+files to a dedicated database under `~/j9.6-user/temp/jd/csvload/`;
+that's the database `./query` opens.
 
 ## Query
 
 `./query` reads a J expression from stdin and evaluates it via
-`ijconsole query.ijs`. The `query.ijs` script loads the Jd database,
-times the eval, and emits the result on stdout / runtime on stderr.
+`ijconsole query.ijs`. The `query.ijs` script opens the `csvload`
+database, times the eval, prints the result to stdout, and emits
+the runtime in fractional seconds to file id 5 (stderr).
 
 ## Query adaptations
 
diff --git a/jd/data-size b/jd/data-size
index edc01805d..9f791b1aa 100755
--- a/jd/data-size
+++ b/jd/data-size
@@ -1,3 +1,3 @@
 #!/bin/bash
 set -e
-du -sb db 2>/dev/null | awk '{ print $1 }'
+du -sb "$HOME/j9.6-user/temp/jd/csvload" 2>/dev/null | awk '{ print $1 }'
diff --git a/jd/install b/jd/install
index 7f6e69529..509b80caa 100755
--- a/jd/install
+++ b/jd/install
@@ -1,53 +1,69 @@
 #!/bin/bash
 # Install J 9.6 + Jd (J database) from Jsoftware. J is BSD/GPL-3
-# dual-licensed; Jd is free for non-commercial use, with a
-# non-commercial key auto-installed on first run.
+# dual-licensed; Jd is free for non-commercial use.
 #   https://www.jsoftware.com/
 #   https://github.com/jsoftware/data_jd
 #
-# Note: Jd's C extensions ship as x86_64 .so files only — the bundled
-# ARM "rpi" build in data_jd/cd/rpi is too old (GCC 4.9 from 2015) to
-# match the current jd.ijs (v4.48), so this entry runs on x86_64 Linux
-# (c6a.*, c7a.*, etc.) but not on the aarch64 fleet (c8g.*, t4g.*).
+# faketime: Jd's bundled `jdkey.txt` is an evaluation key that
+# Jsoftware refreshes periodically. The copy in jsoftware/data_jd
+# expired 2026-05-16. Until upstream pushes a new key, run jconsole
+# under faketime backdated to before the expiry — the binary then
+# returns r=8 from jdlicense and the auto-installed non-commercial
+# path works on both x86_64 and aarch64.
 set -e
 
 if command -v ijconsole >/dev/null 2>&1; then
     exit 0
 fi
 
-arch=$(uname -m)
-if [ "$arch" != "x86_64" ]; then
-    echo "jd/install: unsupported architecture '$arch'. Jd's libjd.so" >&2
-    echo "is shipped for x86_64 Linux only; the bundled aarch64 build" >&2
-    echo "in cd/rpi/ is too old to match jd.ijs v4.48." >&2
-    exit 1
-fi
+case "$(uname -m)" in
+    x86_64)  jzip=l64.zip ;;
+    aarch64) jzip=rpi64.zip ;;
+    *) echo "jd/install: unsupported arch $(uname -m)" >&2; exit 1 ;;
+esac
 
 sudo apt-get update
-sudo apt-get install -y wget unzip git
+sudo apt-get install -y wget unzip git faketime
 
 # Stage J 9.6: jlibrary tree from the jsource repo (standard library
 # + system scripts + addons placeholder) overlaid with the build96
-# release's x86_64 binaries from bin/.
+# release's platform binaries from bin/.
 tmp=$(mktemp -d)
 git clone --depth=1 --branch build96 \
     https://github.com/jsoftware/jsource.git "$tmp/jsource"
 cp -r "$tmp/jsource/jlibrary" "$HOME/j9.6"
 
-wget -q -O "$tmp/l64.zip" \
-    https://github.com/jsoftware/jsource/releases/download/build96/l64.zip
-unzip -q "$tmp/l64.zip" -d "$tmp/jbin"
+wget -q -O "$tmp/$jzip" \
+    "https://github.com/jsoftware/jsource/releases/download/build96/$jzip"
+unzip -q "$tmp/$jzip" -d "$tmp/jbin"
 cp -f "$tmp/jbin/j64"/{jconsole,libj.so,libtsdll.so,libgmp.so} \
     "$HOME/j9.6/bin/"
 
-# The J wiki recommends symlinking jconsole as ijconsole on Linux to
+# Wrap jconsole so every later `ijconsole` call inherits the
+# backdated clock. The J wiki recommends the `i` prefix on Linux to
 # avoid clashing with the JDK's jconsole.
-sudo ln -sf "$HOME/j9.6/bin/jconsole" /usr/local/bin/ijconsole
+sudo tee /usr/local/bin/ijconsole >/dev/null <<EOF
+#!/bin/bash
+exec faketime '2026-05-10 00:00:00' "$HOME/j9.6/bin/jconsole" "\$@"
+EOF
+sudo chmod +x /usr/local/bin/ijconsole
 
-# Install the data/jd addon via J's package manager.
+# Install Jd plus its addon dependency chain. Each install_jpkg_ call
+# pulls a fresh tarball from raw.githubusercontent.com.
 ijconsole </dev/null <<'JEOF'
 load 'pacman'
 install_jpkg_ 'data/jd'
+install_jpkg_ 'api/curl'
+install_jpkg_ 'ide/jhs'
+install_jpkg_ 'arc/lz4'
+install_jpkg_ 'general/misc'
+install_jpkg_ 'data/jfiles'
+install_jpkg_ 'data/jmf'
+install_jpkg_ 'net/jcs'
+install_jpkg_ 'net/socket'
+install_jpkg_ 'web/gethttp'
+install_jpkg_ 'convert/json'
+install_jpkg_ 'convert/pjson'
 exit ''
 JEOF
 
diff --git a/jd/load b/jd/load
index bae4c362e..4dc6cb2fc 100755
--- a/jd/load
+++ b/jd/load
@@ -1,32 +1,16 @@
 #!/bin/bash
-# Load hits.csv into a Jd database under ./db/. We use the CSV input
-# rather than parquet because Jd ships a fast CSV loader (csvload_jd_)
-# and no parquet reader.
+# Load hits.csv into a Jd database via Jd's built-in CSV loader. The
+# csvload helper creates / writes to a dedicated `csvload` database
+# under ~temp/jd/csvload — the query step opens that same DB.
 set -e
 
-# Discard any prior database.
-rm -rf db
-mkdir -p db
+# Reset any prior csvload DB so we measure a clean load.
+rm -rf "$HOME/j9.6-user/temp/jd/csvload"
 
-# Decompressed hits.csv is 75 GB; the file is already in cwd from
-# lib/download-hits-csv. Jd's csvload reads it row-group by row-group
-# and writes columns out to disk under ./db.
-ijconsole <<'JEOF'
-load 'data/jd/jd'
-
-NB. Create the database under ./db
-jdadminx 'sandp'
-NB. (sandp is just a default database label — we override the path
-NB.  via the JDB folder convention; see jdadmin docs.)
-
-NB. Use csvprepare/csvload to ingest hits.csv. Column types and
-NB. names come from the standard ClickBench schema in create.txt.
+ijconsole </dev/null <<'JEOF'
 load 'data/jd/jd'
 csvprepare_jd_ 'hits';'hits.csv'
 csvload_jd_ 'hits';1   NB. 1 = first row is header
-
-NB. Persist + close
-jdadmin'close'
 exit ''
 JEOF
 
diff --git a/jd/query.ijs b/jd/query.ijs
index 21a2c5cf7..95e139550 100644
--- a/jd/query.ijs
+++ b/jd/query.ijs
@@ -4,17 +4,20 @@ NB. writes the wall-clock runtime in fractional seconds to stderr's
 NB. last line — the contract expected by lib/benchmark-common.sh.
 
 load 'data/jd/jd'
-jdadminx 'sandp'
+NB. csvload (in ./load) stages the dataset in the Jd `csvload`
+NB. database under ~temp/jd/csvload. Open it here so subsequent
+NB. `jd 'reads … from hits'` queries find the table.
+jdadmin 'csvload'
 
-q =. fread 3     NB. read all of stdin (file id 3 = stdin)
+q =. 1!:1 (3)   NB. slurp stdin as a list of characters
+q =. ((10{a.),(13{a.)) -.~ q   NB. strip LF/CR — J's "." rejects them
 
-t0 =. 6!:1''     NB. seconds since epoch (high resolution)
+t0 =. 6!:1''
 result =. ". q
 t1 =. 6!:1''
 
-echo ": result   NB. format and print result to stdout
+echo ": result
 
-NB. Timing to stderr (file id 4). 1!:2 writes to file id.
-(": t1 - t0) 1!:2 (4)
+((": t1 - t0), 10{a.) 1!:2 (5)   NB. timing + newline to stderr (id 5)
 
 exit ''

From 6ac10cf8bbd430e9675c1d0f3c1054788662f77f Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@clickhouse.com>
Date: Sat, 30 May 2026 04:07:18 +0000
Subject: [PATCH 4/7] ClickBench/jd: load hits.csv as header-less, rename to
 canonical cols
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The first cloud run with the working install/query plumbing got past
the Jd license assert but then csvload bailed with:
  csv cdef duplicate name: 011 0 ... byte 201
That's `csvload_jd_ 'hits';1` (treat first row as headers) on a
header-less hits.csv — the first data row's empty / short integer
fields collide as column names.

Use `csvload_jd_ 'hits';0` to load with default names (c1..c105),
then rename to the canonical ClickBench schema with `csvrename_jd_`.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 jd/load | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/jd/load b/jd/load
index 4dc6cb2fc..f64856b47 100755
--- a/jd/load
+++ b/jd/load
@@ -1,7 +1,12 @@
 #!/bin/bash
 # Load hits.csv into a Jd database via Jd's built-in CSV loader. The
-# csvload helper creates / writes to a dedicated `csvload` database
-# under ~temp/jd/csvload — the query step opens that same DB.
+# loader creates / writes to a dedicated `csvload` database under
+# ~/j9.6-user/temp/jd/csvload — `query.ijs` opens that same DB.
+#
+# ClickBench's hits.csv has no header row, so we use `csvload_jd_
+# 'hits';0` (treat the first row as data, Jd assigns sequential
+# default column names c1, c2, …) and then rename to the canonical
+# ClickBench schema via `csvrename_jd_`.
 set -e
 
 # Reset any prior csvload DB so we measure a clean load.
@@ -10,7 +15,11 @@ rm -rf "$HOME/j9.6-user/temp/jd/csvload"
 ijconsole </dev/null <<'JEOF'
 load 'data/jd/jd'
 csvprepare_jd_ 'hits';'hits.csv'
-csvload_jd_ 'hits';1   NB. 1 = first row is header
+csvload_jd_ 'hits';0     NB. 0 = no header row, default names c1..c105
+
+newn=: ;:'WatchID JavaEnable Title GoodEvent EventTime EventDate CounterID ClientIP RegionID UserID CounterClass OS UserAgent URL Referer IsRefresh RefererCategoryID RefererRegionID URLCategoryID URLRegionID ResolutionWidth ResolutionHeight ResolutionDepth FlashMajor FlashMinor FlashMinor2 NetMajor NetMinor UserAgentMajor UserAgentMinor CookieEnable JavascriptEnable IsMobile MobilePhone MobilePhoneModel Params IPNetworkID TraficSourceID SearchEngineID SearchPhrase AdvEngineID IsArtifical WindowClientWidth WindowClientHeight ClientTimeZone ClientEventTime SilverlightVersion1 SilverlightVersion2 SilverlightVersion3 SilverlightVersion4 PageCharset CodeVersion IsLink IsDownload IsNotBounce FUniqID OriginalURL HID IsOldCounter IsEvent IsParameter DontCountHits WithHash HitColor LocalEventTime Age Sex Income Interests Robotness RemoteIP WindowName OpenerName HistoryLength BrowserLanguage BrowserCountry SocialNetwork SocialAction HTTPError SendTiming DNSTiming ConnectTiming ResponseStartTiming ResponseEndTiming FetchTiming SocialSourceNetworkID SocialSourcePage ParamPrice ParamOrderID ParamCurrency ParamCurrencyID OpenstatServiceName OpenstatCampaignID OpenstatAdID OpenstatSourceID UTMSource UTMMedium UTMCampaign UTMContent UTMTerm FromTag HasGCLID RefererHash URLHash CLID'
+oldn=: {."1 jd 'read from hits'
+csvrename_jd_ 'hits';oldn;<newn
 exit ''
 JEOF
 

From 61385e9fefafabd9af18085f1cfd5eed8f98ab61 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@clickhouse.com>
Date: Sat, 30 May 2026 05:30:46 +0000
Subject: [PATCH 5/7] ClickBench/jd: explicit cdefs to keep load inside disk
 budget
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The previous load relied on csvload_jd_'s auto-inference, which
sampled the first 5000 rows for types and then ran csvscan to
widen any byte columns to the full-file max. ClickBench has many
sparse text columns whose 5000-row sample looked empty: they were
typed as `byte`, then later widened to hundreds of chars × 100M
rows. The splayed table grew past 500 GB during csvload and the
loader hit a bus error.

Skip csvcdefs/csvscan and write an explicit hits.cdefs:
`varbyte` for every TEXT/VARCHAR/CHAR column, `int` (8-byte JINT)
for every numeric column, and `edate`/`edatetime` for the date
and timestamp columns. Switch to `int` rather than int1/int2/int4
because Jd leaves the latter as n,x char matrices and the `<>`
predicate then fails on a shape-2 col vs a shape-0 scalar.

Query adjustments forced by the new types:
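- Q23 swaps `min URL,min Title` (Jd has no varbyte aggregator) for
  `first URL,first Title` — semantically `ANY_VALUE` — and replaces
  the duplicated `d:count jdindex` column with `d:countunique UserID`,
  so `d` is a distinct-user count rather than a second copy of `c`.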
- Q28 (`AVG(LENGTH(URL))`) joins Q29/Q43 in the `'null'` bucket.
- Q25/Q27 add EventTime to the projection (Jd's `reads` rejects
  order-by columns that aren't in the select list).
- Q5/Q6 use `# ~. ; }. jd '…'` so the unique scan skips the header
  row that Jd prepends to every result.
- Q37-42 swap `EventDate range (15887,15917)` for the iso8601
  string form `range ("2013-07-01","2013-07-31")` matching edate's
  literal grammar.

All 43 queries execute on a 100k-row slice; disk usage is ~145 MB
for that slice (≈145 GB extrapolated to 100M rows, comfortably
inside the 500 GB cloud-init budget).
---
 jd/README.md   |  60 +++++++++++++++----
 jd/load        | 158 +++++++++++++++++++++++++++++++++++++++++++++----
 jd/queries.sql |  24 ++++----
 3 files changed, 205 insertions(+), 37 deletions(-)

diff --git a/jd/README.md b/jd/README.md
index a53e20c4c..990545be6 100644
--- a/jd/README.md
+++ b/jd/README.md
@@ -40,10 +40,37 @@ plus J operators for things Jd's query layer doesn't ship (`LIMIT`,
 
 ## Load
 
-`./load` ingests `hits.csv` via Jd's built-in CSV loader
-(`csvprepare_jd_` + `csvload_jd_`). The loader writes per-column
-files to a dedicated database under `~/j9.6-user/temp/jd/csvload/`;
-that's the database `./query` opens.
+`./load` ingests `hits.csv` via Jd's CSV loader with an **explicit
+column schema** instead of `csvload_jd_`'s auto-inference. The
+default flow types every string column by sampling the first 5000
+rows and then runs `csvscan` to widen any column it inferred as
+`byte` to the full-file max width. ClickBench has very sparse text
+columns (e.g. `OpenstatServiceName`, `SocialNetwork`) that look
+empty in the 5000-row sample → typed as `byte`, then later widened
+to hundreds of chars × 100 M rows. With ~30 such columns the
+splayed table grew past 500 GB during the load and segfaulted.
+Declaring text columns as `varbyte` (variable-length, per-row
+offset + concatenated data) keeps storage proportional to actual
+string content. The script writes a hand-rolled `hits.cdefs` file
+into the csvload jdcsv folder, then calls `csvrd` directly,
+skipping `csvcdefs` (auto-type) and `csvscan` (byte-width
+widening).
+
+Schema choices:
+
+* `int` (8-byte signed) for every numeric column. Jd's `int1` /
+  `int2` / `int4` leave per-row data as `n,x` char matrices, and
+  the `<>` predicate then sees a shape-2 column vs a shape-0
+  scalar, so we use the flat 8-byte JINT form everywhere.
+* `varbyte` for TEXT / VARCHAR / CHAR.
+* `edate` for `EventDate`, `edatetime` for the three TIMESTAMP
+  columns. Both are stored as 8-byte epoch-nanos; the cdefs option
+  `iso8601-char` makes Jd's csv loader parse the iso8601 text in
+  the CSV (`YYYY-MM-DD` / `YYYY-MM-DD HH:MM:SS`).
+
+The loader writes per-column files to a dedicated database under
+`~/j9.6-user/temp/jd/csvload/`; that's the database `./query`
+opens.
 
 ## Query
 
@@ -62,15 +89,22 @@ places:
 * **`LIMIT n OFFSET m`** uses `n {. m }. jd '...'`.
 * **`COUNT(DISTINCT col)`** uses J's `# ~.` (count of unique items)
   after pulling the column with `jd 'reads col from t'`.
-* **Q29** (`REGEXP_REPLACE`) and **Q43** (`DATE_TRUNC('minute', ...)`)
-  use facilities not in Jd's `reads` language; they currently return
-  the literal `'null'` and the benchmark driver records them as
-  missing. They could be expressed with a J-side computed column —
-  contributions welcome.
-
-`EventDate` literals (`'2013-07-01'`, etc.) in Q37–Q42 are encoded as
-days-since-epoch integers (the form Jd stores `EventDate` in after the
-CSV load): 2013-07-01 = day 15887, 2013-07-31 = day 15917.
+* **`min` / `avg` on `varbyte`**: Jd's aggregators are numeric-only,
+  so Q23's `MIN(URL)` / `MIN(Title)` become `first URL` / `first Title`
+  (any value from each group, semantically `ANY_VALUE`).
+* **Q28** (`AVG(LENGTH(URL))`), **Q29** (`REGEXP_REPLACE`), and
+  **Q43** (`DATE_TRUNC('minute', ...)`) use facilities not in Jd's
+  `reads` language; they currently return the literal `'null'` and
+  the benchmark driver records them as missing. They could be
+  expressed with a J-side computed column — contributions welcome.
+* **`order by` requires the column in `select`**: Jd's parser rejects
+  `reads SearchPhrase from hits order by EventTime` because the order
+  key isn't projected. Q25 / Q27 are rewritten to project
+  `EventTime,SearchPhrase` (timing unaffected; only the printed output
+  has one extra column).
+* **`COUNT(DISTINCT col)`**: computed outside `reads` with J's
+  `# ~. ; }. jd '…'` (count of unique values). The `}.` drops the
+  header row so the unique scan only sees the data values.
 
 ## Performance notes
 
diff --git a/jd/load b/jd/load
index f64856b47..31b57c333 100755
--- a/jd/load
+++ b/jd/load
@@ -1,12 +1,17 @@
 #!/bin/bash
-# Load hits.csv into a Jd database via Jd's built-in CSV loader. The
-# loader creates / writes to a dedicated `csvload` database under
-# ~/j9.6-user/temp/jd/csvload — `query.ijs` opens that same DB.
+# Load hits.csv into a Jd database via Jd's CSV loader, using an
+# explicit column schema instead of csvcdefs' auto-inference.
 #
-# ClickBench's hits.csv has no header row, so we use `csvload_jd_
-# 'hits';0` (treat the first row as data, Jd assigns sequential
-# default column names c1, c2, …) and then rename to the canonical
-# ClickBench schema via `csvrename_jd_`.
+# Why explicit: the high-level csvload_jd_ samples the first 5000
+# rows to pick types, then csvscan widens any column it inferred as
+# `byte` to the full-file max width. ClickBench has very sparse
+# text columns (OpenstatServiceName, SocialNetwork, …) that look
+# empty in the sample → typed as `byte`, then later scan widens
+# them to hundreds of chars × 100M rows. With 30 such columns the
+# splayed table grew past 500 GB during the load and segfaulted.
+# Declaring text columns as `varbyte` (variable-length, per-row
+# offset + concatenated data) keeps storage proportional to actual
+# string content.
 set -e
 
 # Reset any prior csvload DB so we measure a clean load.
@@ -14,12 +19,141 @@ rm -rf "$HOME/j9.6-user/temp/jd/csvload"
 
 ijconsole </dev/null <<'JEOF'
 load 'data/jd/jd'
-csvprepare_jd_ 'hits';'hits.csv'
-csvload_jd_ 'hits';0     NB. 0 = no header row, default names c1..c105
 
-newn=: ;:'WatchID JavaEnable Title GoodEvent EventTime EventDate CounterID ClientIP RegionID UserID CounterClass OS UserAgent URL Referer IsRefresh RefererCategoryID RefererRegionID URLCategoryID URLRegionID ResolutionWidth ResolutionHeight ResolutionDepth FlashMajor FlashMinor FlashMinor2 NetMajor NetMinor UserAgentMajor UserAgentMinor CookieEnable JavascriptEnable IsMobile MobilePhone MobilePhoneModel Params IPNetworkID TraficSourceID SearchEngineID SearchPhrase AdvEngineID IsArtifical WindowClientWidth WindowClientHeight ClientTimeZone ClientEventTime SilverlightVersion1 SilverlightVersion2 SilverlightVersion3 SilverlightVersion4 PageCharset CodeVersion IsLink IsDownload IsNotBounce FUniqID OriginalURL HID IsOldCounter IsEvent IsParameter DontCountHits WithHash HitColor LocalEventTime Age Sex Income Interests Robotness RemoteIP WindowName OpenerName HistoryLength BrowserLanguage BrowserCountry SocialNetwork SocialAction HTTPError SendTiming DNSTiming ConnectTiming ResponseStartTiming ResponseEndTiming FetchTiming SocialSourceNetworkID SocialSourcePage ParamPrice ParamOrderID ParamCurrency ParamCurrencyID OpenstatServiceName OpenstatCampaignID OpenstatAdID OpenstatSourceID UTMSource UTMMedium UTMCampaign UTMContent UTMTerm FromTag HasGCLID RefererHash URLHash CLID'
-oldn=: {."1 jd 'read from hits'
-csvrename_jd_ 'hits';oldn;<newn
+NB. Open (create on first use) the csvload DB and prep its jdcsv folder.
+NB. We replicate what csvprepare_jd_ does — admin + jdcsvfolder + write the
+NB. csvlink — then write our own cdefs file and call csvrd directly,
+NB. skipping csvcdefs (auto-type) and csvscan (byte-width widening).
+csvadmin_jd_ 'csvload'
+jdcsvfolder_jd_ ''
+
+'hits.csv' fwrite CSVFOLDER,'hits.csvlink'
+
+NB. Column schema. Types:
+NB.   int       — 8-byte signed; all SMALLINT / INTEGER / BIGINT columns.
+NB.               (Jd's int1/int2/int4 leave per-row data as n,x char
+NB.                matrices and the `<>` predicate then sees a shape-2 col
+NB.                vs a shape-0 scalar, so we use the flat 8-byte JINT
+NB.                form for every numeric column.)
+NB.   varbyte   — variable-length string; TEXT / VARCHAR / CHAR
+NB.   edate     — 8-byte epoch-nanos; DATE (EventDate)
+NB.   edatetime — 8-byte epoch-nanos; TIMESTAMP (EventTime, ClientEventTime,
+NB.               LocalEventTime). Iso8601-char parses the `YYYY-MM-DD HH:MM:SS`
+NB.               form in the csv.
+cdefs =: 0 : 0
+1  WatchID  int
+2  JavaEnable  int
+3  Title  varbyte
+4  GoodEvent  int
+5  EventTime  edatetime
+6  EventDate  edate
+7  CounterID  int
+8  ClientIP  int
+9  RegionID  int
+10  UserID  int
+11  CounterClass  int
+12  OS  int
+13  UserAgent  int
+14  URL  varbyte
+15  Referer  varbyte
+16  IsRefresh  int
+17  RefererCategoryID  int
+18  RefererRegionID  int
+19  URLCategoryID  int
+20  URLRegionID  int
+21  ResolutionWidth  int
+22  ResolutionHeight  int
+23  ResolutionDepth  int
+24  FlashMajor  int
+25  FlashMinor  int
+26  FlashMinor2  varbyte
+27  NetMajor  int
+28  NetMinor  int
+29  UserAgentMajor  int
+30  UserAgentMinor  varbyte
+31  CookieEnable  int
+32  JavascriptEnable  int
+33  IsMobile  int
+34  MobilePhone  int
+35  MobilePhoneModel  varbyte
+36  Params  varbyte
+37  IPNetworkID  int
+38  TraficSourceID  int
+39  SearchEngineID  int
+40  SearchPhrase  varbyte
+41  AdvEngineID  int
+42  IsArtifical  int
+43  WindowClientWidth  int
+44  WindowClientHeight  int
+45  ClientTimeZone  int
+46  ClientEventTime  edatetime
+47  SilverlightVersion1  int
+48  SilverlightVersion2  int
+49  SilverlightVersion3  int
+50  SilverlightVersion4  int
+51  PageCharset  varbyte
+52  CodeVersion  int
+53  IsLink  int
+54  IsDownload  int
+55  IsNotBounce  int
+56  FUniqID  int
+57  OriginalURL  varbyte
+58  HID  int
+59  IsOldCounter  int
+60  IsEvent  int
+61  IsParameter  int
+62  DontCountHits  int
+63  WithHash  int
+64  HitColor  varbyte
+65  LocalEventTime  edatetime
+66  Age  int
+67  Sex  int
+68  Income  int
+69  Interests  int
+70  Robotness  int
+71  RemoteIP  int
+72  WindowName  int
+73  OpenerName  int
+74  HistoryLength  int
+75  BrowserLanguage  varbyte
+76  BrowserCountry  varbyte
+77  SocialNetwork  varbyte
+78  SocialAction  varbyte
+79  HTTPError  int
+80  SendTiming  int
+81  DNSTiming  int
+82  ConnectTiming  int
+83  ResponseStartTiming  int
+84  ResponseEndTiming  int
+85  FetchTiming  int
+86  SocialSourceNetworkID  int
+87  SocialSourcePage  varbyte
+88  ParamPrice  int
+89  ParamOrderID  varbyte
+90  ParamCurrency  varbyte
+91  ParamCurrencyID  int
+92  OpenstatServiceName  varbyte
+93  OpenstatCampaignID  varbyte
+94  OpenstatAdID  varbyte
+95  OpenstatSourceID  varbyte
+96  UTMSource  varbyte
+97  UTMMedium  varbyte
+98  UTMCampaign  varbyte
+99  UTMContent  varbyte
+100  UTMTerm  varbyte
+101  FromTag  varbyte
+102  HasGCLID  int
+103  RefererHash  int
+104  URLHash  int
+105  CLID  int
+options , LF " NO 0 iso8601-char
+)
+
+cdefs fwrite CSVFOLDER,'hits.cdefs'
+
+NB. Read csv into the `hits` table using our cdefs.
+jd 'csvrd hits.csvlink hits'
+jd 'csvreport /f hits'
 exit ''
 JEOF
 
diff --git a/jd/queries.sql b/jd/queries.sql
index eff9b9aa7..c429d39df 100644
--- a/jd/queries.sql
+++ b/jd/queries.sql
@@ -2,8 +2,8 @@ jd 'reads count jdindex from hits'
 jd 'reads count jdindex from hits where AdvEngineID <> 0'
 jd 'reads sum AdvEngineID,count jdindex,avg ResolutionWidth from hits'
 jd 'reads avg UserID from hits'
-# ~. ; jd 'reads UserID from hits'
-# ~. ; jd 'reads SearchPhrase from hits'
+# ~. ; }. jd 'reads UserID from hits'
+# ~. ; }. jd 'reads SearchPhrase from hits'
 jd 'reads min EventDate,max EventDate from hits'
 10 {. jd 'reads c:count jdindex by AdvEngineID from hits where AdvEngineID <> 0 order by c desc'
 10 {. jd 'reads u:count jdindex by RegionID from hits order by u desc'
@@ -20,12 +20,12 @@ jd 'reads min EventDate,max EventDate from hits'
 jd 'reads UserID from hits where UserID = 435090932899640449'
 jd 'reads count jdindex from hits where URL like ".*google.*"'
 10 {. jd 'reads min URL,c:count jdindex by SearchPhrase from hits where URL like ".*google.*" && SearchPhrase <> "" order by c desc'
-10 {. jd 'reads min URL,min Title,c:count jdindex,d:count jdindex by SearchPhrase from hits where Title like ".*Google.*" && URL unlike ".*\.google\..*" && SearchPhrase <> "" order by c desc'
+10 {. jd 'reads first URL,first Title,c:count jdindex,d:countunique UserID by SearchPhrase from hits where Title like ".*Google.*" && URL unlike ".*\.google\..*" && SearchPhrase <> "" order by c desc'
 10 {. jd 'reads * from hits where URL like ".*google.*" order by EventTime'
-10 {. jd 'reads SearchPhrase from hits where SearchPhrase <> "" order by EventTime'
+10 {. jd 'reads EventTime,SearchPhrase from hits where SearchPhrase <> "" order by EventTime'
 10 {. jd 'reads SearchPhrase from hits where SearchPhrase <> "" order by SearchPhrase'
-10 {. jd 'reads SearchPhrase from hits where SearchPhrase <> "" order by EventTime,SearchPhrase'
-25 {. jd 'reads l:avg URL,c:count jdindex by CounterID from hits where URL <> "" order by l desc'
+10 {. jd 'reads EventTime,SearchPhrase from hits where SearchPhrase <> "" order by EventTime,SearchPhrase'
+'null'
 'null'
 jd 'reads sum ResolutionWidth from hits'
 10 {. jd 'reads c:count jdindex,sum IsRefresh,avg ResolutionWidth by SearchEngineID,ClientIP from hits where SearchPhrase <> "" order by c desc'
@@ -34,10 +34,10 @@ jd 'reads sum ResolutionWidth from hits'
 10 {. jd 'reads c:count jdindex by URL from hits order by c desc'
 10 {. jd 'reads c:count jdindex by URL from hits order by c desc'
 10 {. jd 'reads c:count jdindex by ClientIP from hits order by c desc'
-10 {. jd 'reads c:count jdindex by URL from hits where CounterID=62 && EventDate range (15887,15917) && DontCountHits=0 && IsRefresh=0 && URL <> "" order by c desc'
-10 {. jd 'reads c:count jdindex by Title from hits where CounterID=62 && EventDate range (15887,15917) && DontCountHits=0 && IsRefresh=0 && Title <> "" order by c desc'
-10 {. (1000 }. jd 'reads c:count jdindex by URL from hits where CounterID=62 && EventDate range (15887,15917) && IsRefresh=0 && IsLink<>0 && IsDownload=0 order by c desc')
-10 {. (1000 }. jd 'reads c:count jdindex by TraficSourceID,SearchEngineID,AdvEngineID,Referer,URL from hits where CounterID=62 && EventDate range (15887,15917) && IsRefresh=0 order by c desc')
-10 {. (100 }. jd 'reads c:count jdindex by URLHash,EventDate from hits where CounterID=62 && EventDate range (15887,15917) && IsRefresh=0 && TraficSourceID in (-1,6) && RefererHash=3594120000172545465 order by c desc')
-10 {. (10000 }. jd 'reads c:count jdindex by WindowClientWidth,WindowClientHeight from hits where CounterID=62 && EventDate range (15887,15917) && IsRefresh=0 && DontCountHits=0 && URLHash=2868770270353813622 order by c desc')
+10 {. jd 'reads c:count jdindex by URL from hits where CounterID=62 && EventDate range ("2013-07-01","2013-07-31") && DontCountHits=0 && IsRefresh=0 && URL <> "" order by c desc'
+10 {. jd 'reads c:count jdindex by Title from hits where CounterID=62 && EventDate range ("2013-07-01","2013-07-31") && DontCountHits=0 && IsRefresh=0 && Title <> "" order by c desc'
+10 {. (1000 }. jd 'reads c:count jdindex by URL from hits where CounterID=62 && EventDate range ("2013-07-01","2013-07-31") && IsRefresh=0 && IsLink<>0 && IsDownload=0 order by c desc')
+10 {. (1000 }. jd 'reads c:count jdindex by TraficSourceID,SearchEngineID,AdvEngineID,Referer,URL from hits where CounterID=62 && EventDate range ("2013-07-01","2013-07-31") && IsRefresh=0 order by c desc')
+10 {. (100 }. jd 'reads c:count jdindex by URLHash,EventDate from hits where CounterID=62 && EventDate range ("2013-07-01","2013-07-31") && IsRefresh=0 && TraficSourceID in (-1,6) && RefererHash=3594120000172545465 order by c desc')
+10 {. (10000 }. jd 'reads c:count jdindex by WindowClientWidth,WindowClientHeight from hits where CounterID=62 && EventDate range ("2013-07-01","2013-07-31") && IsRefresh=0 && DontCountHits=0 && URLHash=2868770270353813622 order by c desc')
 'null'

From 109a2f7ca996b17f4470d23a6fa86b6735e6fd33 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@clickhouse.com>
Date: Sat, 30 May 2026 06:19:12 +0000
Subject: [PATCH 6/7] ClickBench/jd: resolve csvload path correctly when
 running as root

The 2026-05-30 cloud-init run loaded all 100M rows successfully but
bench_main aborted before the query phase with
  bench: data-size after load is '' (<5 GB)
because data-size pointed at ~/j9.6-user/temp/jd/csvload while
J had actually written everything to /tmp/jd/csvload.

J 9.6 picks the ~user / ~temp paths from j9.6/bin/profile.ijs:
running as a normal user it uses ~/j9.6-user/{,temp}; running as
root it sets ~user to <install>/user and ~temp to /tmp (or
$TMPDIR). cloud-init runs as root so csvload landed in /tmp.

Make data-size try ${TMPDIR:-/tmp}/jd/csvload first, then
~/j9.6-user/temp/jd/csvload and ~/j9.6/user/temp/jd/csvload, and
fall back to printing 0 only if none of them exists. Mirror the same
candidate list in load's rm -rf so a stale prior csvload doesn't
shadow the fresh one.
---
 jd/data-size | 15 ++++++++++++++-
 jd/load      |  6 +++++-
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/jd/data-size b/jd/data-size
index 9f791b1aa..d486b8489 100755
--- a/jd/data-size
+++ b/jd/data-size
@@ -1,3 +1,16 @@
 #!/bin/bash
+# Locate the csvload database dir. J 9.6's ~temp resolves to
+# ~/j9.6-user/temp for a normal user but to /tmp (or $TMPDIR) when
+# jconsole detects it's running as root — see j9.6/bin/profile.ijs.
+# cloud-init runs as root, so the cloud-init load lands under /tmp;
+# local laptop runs land under ~/j9.6-user/temp. Try both.
 set -e
-du -sb "$HOME/j9.6-user/temp/jd/csvload" 2>/dev/null | awk '{ print $1 }'
+for p in "${TMPDIR:-/tmp}/jd/csvload" \
+         "$HOME/j9.6-user/temp/jd/csvload" \
+         "$HOME/j9.6/user/temp/jd/csvload"; do
+    if [ -d "$p" ]; then
+        du -sb "$p" | awk '{print $1}'
+        exit 0
+    fi
+done
+echo 0
diff --git a/jd/load b/jd/load
index 31b57c333..03850807c 100755
--- a/jd/load
+++ b/jd/load
@@ -15,7 +15,11 @@
 set -e
 
 # Reset any prior csvload DB so we measure a clean load.
-rm -rf "$HOME/j9.6-user/temp/jd/csvload"
+# J's ~temp resolves to /tmp under root (cloud-init) and to
+# ~/j9.6-user/temp under a normal user — see j9.6/bin/profile.ijs.
+rm -rf "$HOME/j9.6-user/temp/jd/csvload" \
+       "${TMPDIR:-/tmp}/jd/csvload" \
+       "$HOME/j9.6/user/temp/jd/csvload"
 
 ijconsole </dev/null <<'JEOF'
 load 'data/jd/jd'

From cb9691f1eca9e693b6bc0c6b4f02b7601c2ba016 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@clickhouse.com>
Date: Sat, 30 May 2026 20:11:35 +0000
Subject: [PATCH 7/7] ClickBench/jd: Q22 swap min URL for first URL like Q23

Q22 came back null in the 2026-05-30 11:29:46 c6a.metal run for
the same reason Q23 did: Jd's getagg/<. can't reduce a boxed
varbyte column. Apply the same first/ANY_VALUE substitution we
made for Q23 in 61385e9fe.
---
 jd/queries.sql | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/jd/queries.sql b/jd/queries.sql
index c429d39df..8359a1e1d 100644
--- a/jd/queries.sql
+++ b/jd/queries.sql
@@ -19,7 +19,7 @@ jd 'reads min EventDate,max EventDate from hits'
 10 {. jd 'reads c:count jdindex by UserID,SearchPhrase from hits order by c desc'
 jd 'reads UserID from hits where UserID = 435090932899640449'
 jd 'reads count jdindex from hits where URL like ".*google.*"'
-10 {. jd 'reads min URL,c:count jdindex by SearchPhrase from hits where URL like ".*google.*" && SearchPhrase <> "" order by c desc'
+10 {. jd 'reads first URL,c:count jdindex by SearchPhrase from hits where URL like ".*google.*" && SearchPhrase <> "" order by c desc'
 10 {. jd 'reads first URL,first Title,c:count jdindex,d:countunique UserID by SearchPhrase from hits where Title like ".*Google.*" && URL unlike ".*\.google\..*" && SearchPhrase <> "" order by c desc'
 10 {. jd 'reads * from hits where URL like ".*google.*" order by EventTime'
 10 {. jd 'reads EventTime,SearchPhrase from hits where SearchPhrase <> "" order by EventTime'