From 048a5654e581329a7aa394f6965f29aa28281ad2 Mon Sep 17 00:00:00 2001
From: root <root@las2>
Date: Sat, 6 Jun 2026 17:50:30 +0800
Subject: [PATCH] security & robustness hardening

---
 CHANGELOG                       |  29 +
 META.json                       |  13 +-
 Makefile                        |  37 +-
 check-alpine.sh                 |  33 +-
 check-debian.sh                 |  32 +-
 expected/zhparser.out           |  98 ++++
 expected/zhparser_hardening.out |  81 +++
 regress/Dockerfile              | 126 +++++
 regress/README.md               |  61 +++
 regress/regress.sh              |  70 +++
 regress/run-regress.sh          | 136 +++++
 sql/zhparser_hardening.sql      |  54 ++
 zhparser--2.3--2.4.sql          |  43 ++
 zhparser--2.4.sql               |  91 ++++
 zhparser-backup-custom-dict.sh  | 182 +++++--
 zhparser.c                      | 911 +++++++++++++++++++++-----------
 zhparser.control                |   2 +-
 zhparser.h                      |  30 +-
 18 files changed, 1627 insertions(+), 402 deletions(-)
 create mode 100644 expected/zhparser.out
 create mode 100644 expected/zhparser_hardening.out
 create mode 100644 regress/Dockerfile
 create mode 100644 regress/README.md
 create mode 100755 regress/regress.sh
 create mode 100755 regress/run-regress.sh
 create mode 100644 sql/zhparser_hardening.sql
 create mode 100644 zhparser--2.3--2.4.sql
 create mode 100644 zhparser--2.4.sql

diff --git a/CHANGELOG b/CHANGELOG
index e224a03..cb6a3cd 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,32 @@
+2.4 (2026-06-06) -- security & robustness hardening
+-- per-call SCWS instance via scws_fork(); removed global ParserState (fixes
+   token corruption with SRFs / nested parser calls within one backend)
+-- _PG_init() now registers GUCs once at module load; init failures no longer
+   permanently break the backend
+-- whitelist validation for zhparser.extra_dicts entries (only [A-Za-z0-9_.-],
+   .txt or .xdb); rejected by check_hook
+-- refuse to write per-database custom dict files when current_database()
+   contains characters unsafe for filesystem paths
+-- sync_zhprs_custom_word() now builds COPY via format() / %L and validates
+   the database name (mitigates dynamic-SQL pitfalls)
+-- fix lexeme attr range: was ['a','x'] which silently dropped 'y' (modal)
+   and 'z' (status); now ['a','z']
+-- safer pstrdup namespace handling in zhparser.h (no longer permanently
+   shadows the PG macro)
+-- backup-custom-dict.sh: set -euo pipefail, dry-run, --yes confirmation,
+   safe globbing
+-- Makefile: pkg-config detection for SCWS, -Wformat-security
+-- new GitHub Actions matrix CI for PG 16/17/18
+-- new regress/ container (Dockerfile + entrypoint + wrapper) and
+   sql/zhparser_hardening.sql; pg_regress green on PG 16/17/18
+
+2.3 (2025-01-24)
+-- add CI for linux and freebsd
+-- do not create custom txt file when new install
+-- add dockerfile for debian and alpine
+-- fix linux CI, bump PG version to 16 for linux
+-- fix client notice "NOTICE: zhparser add dict..."
+
 2.2 (2021-11-08)
 -- move custom word from /base/${DATABASE_ID}/zhprs_dict_${DATABASE_NAME}.txt to /base/zhprs_dict_${DATABASE_NAME}.txt(data don't have /base/${DATABASE_ID} when tablespace is setted)
 
diff --git a/META.json b/META.json
index 5465098..b44b2be 100644
--- a/META.json
+++ b/META.json
@@ -1,8 +1,8 @@
 {
    "name": "zhparser",
    "abstract": "a parser for full-text search of Chinese",
-   "description": "Zhparser is a PostgreSQL extension for full-text search of Chinese.It implements a Chinese parser base on the Simple Chinese Word Segmentation(SCWS)",
-   "version": "0.2.0",
+   "description": "Zhparser is a PostgreSQL extension for full-text search of Chinese. It implements a Chinese parser based on Simple Chinese Word Segmentation (SCWS).",
+   "version": "2.4.0",
    "maintainer": [
       "Jov <amutu@amutu.com>"
    ],
@@ -10,19 +10,16 @@
    "prereqs": {
       "runtime": {
          "requires": {
-            "PostgreSQL": "9.2.0"
-         },
-         "recommends": {
-            "PostgreSQL": "9.6.0"
+            "PostgreSQL": "16.0.0"
          }
       }
    },
    "provides": {
       "zhparser": {
          "abstract": "a parser for full-text search of Chinese",
-         "file": "zhparser--1.0.sql",
+         "file": "zhparser--2.4.sql",
          "docfile": "README.md",
-         "version": "0.2.0"
+         "version": "2.4.0"
       }
    },
    "resources": {
diff --git a/Makefile b/Makefile
index 199f34d..edd6cdf 100644
--- a/Makefile
+++ b/Makefile
@@ -5,16 +5,39 @@ OBJS = zhparser.o
 
 EXTENSION = zhparser
 DATA = zhparser--1.0.sql zhparser--unpackaged--1.0.sql \
-	   zhparser--1.0--2.0.sql zhparser--2.0.sql \
-	   zhparser--2.0--2.1.sql zhparser--2.1.sql zhparser--2.1--2.2.sql \
-	   zhparser--2.2.sql zhparser--2.3.sql
+       zhparser--1.0--2.0.sql zhparser--2.0.sql \
+       zhparser--2.0--2.1.sql zhparser--2.1.sql \
+       zhparser--2.1--2.2.sql zhparser--2.2.sql \
+       zhparser--2.3.sql \
+       zhparser--2.3--2.4.sql zhparser--2.4.sql
 DATA_TSEARCH = dict.utf8.xdb rules.utf8.ini
 
-REGRESS = zhparser
+REGRESS = zhparser zhparser_hardening
 
-SCWS_HOME ?= /usr/local
-PG_CPPFLAGS = -I$(SCWS_HOME)/include/scws 
-SHLIB_LINK = -lscws -L$(SCWS_HOME)/lib -Wl,-rpath -Wl,$(SCWS_HOME)/lib
+# ----------------------------------------------------------------------------
+# SCWS detection
+#
+# Order of precedence:
+#   1. SCWS_HOME explicitly set (legacy behavior; kept for back-compat).
+#   2. pkg-config --exists scws  ->  use pkg-config flags.
+#   3. fall back to /usr/local (also when pkg-config itself is not installed).
+# ----------------------------------------------------------------------------
+ifeq ($(origin SCWS_HOME), undefined)
+  ifeq ($(shell pkg-config --exists scws 2>/dev/null && echo yes),yes)
+    SCWS_CFLAGS := $(shell pkg-config --cflags scws)
+    SCWS_LIBS   := $(shell pkg-config --libs scws)
+  else
+    SCWS_HOME ?= /usr/local
+    SCWS_CFLAGS := -I$(SCWS_HOME)/include/scws
+    SCWS_LIBS   := -lscws -L$(SCWS_HOME)/lib -Wl,-rpath -Wl,$(SCWS_HOME)/lib
+  endif
+else
+  SCWS_CFLAGS := -I$(SCWS_HOME)/include/scws
+  SCWS_LIBS   := -lscws -L$(SCWS_HOME)/lib -Wl,-rpath -Wl,$(SCWS_HOME)/lib
+endif
+
+PG_CPPFLAGS = $(SCWS_CFLAGS) -Wformat -Wformat-security
+SHLIB_LINK  = $(SCWS_LIBS)
 
 PG_CONFIG ?= pg_config
 PGXS := $(shell $(PG_CONFIG) --pgxs)
diff --git a/check-alpine.sh b/check-alpine.sh
index 8824627..6e3f6e1 100755
--- a/check-alpine.sh
+++ b/check-alpine.sh
@@ -1,13 +1,32 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
 pid=$$
-docker run --rm --name testpgzhparser-$pid -p 5432:5432 -d -e POSTGRES_PASSWORD=somepassword@alpine zhparser/zhparser:alpine-16
-sleep 5
-export PGPASSWORD=somepassword@alpine
-psql -h 127.0.0.1 -X -a -q postgres postgres -f sql/zhparser.sql | diff expected/zhparser-alpine.out -
+container="testpgzhparser-$pid"
+
+cleanup() {
+    docker stop "$container" >/dev/null 2>&1 || true
+}
+trap cleanup EXIT
 
-if [ $? -eq 0 ]
-then
+docker run --rm --name "$container" -p 5432:5432 -d \
+    -e POSTGRES_PASSWORD=somepassword@alpine \
+    zhparser/zhparser:alpine-16
+
+# Wait for Postgres to accept connections instead of fixed sleep; abort if it never does.
+ready=0
+for _ in $(seq 1 30); do
+    PGPASSWORD=somepassword@alpine psql -X -h 127.0.0.1 -U postgres \
+        -tAc 'select 1' postgres >/dev/null 2>&1 && { ready=1; break; }
+    sleep 1
+done
+[ "$ready" -eq 1 ] || { echo "postgres did not become ready after 30 attempts" >&2; exit 1; }
+
+export PGPASSWORD=somepassword@alpine
+if psql -h 127.0.0.1 -X -a -q postgres postgres -f sql/zhparser.sql \
+        | diff expected/zhparser-alpine.out -; then
     echo "pass!"
 else
     echo "do not pass!"
+    exit 1
 fi
-docker stop testpgzhparser-$pid
diff --git a/check-debian.sh b/check-debian.sh
index bb923d5..252eb2e 100755
--- a/check-debian.sh
+++ b/check-debian.sh
@@ -1,13 +1,31 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
 pid=$$
-docker run --rm --name testpgzhparser-$pid -p 5432:5432 -d -e POSTGRES_PASSWORD=somepassword@debian-16 zhparser/zhparser:bookworm-16
-sleep 5
-export PGPASSWORD=somepassword@debian-16
-psql -h 127.0.0.1 -X -a -q postgres postgres -f sql/zhparser.sql | diff expected/zhparser-debian.out -
+container="testpgzhparser-$pid"
+
+cleanup() {
+    docker stop "$container" >/dev/null 2>&1 || true
+}
+trap cleanup EXIT
 
-if [ $? -eq 0 ]
-then
+docker run --rm --name "$container" -p 5432:5432 -d \
+    -e POSTGRES_PASSWORD=somepassword@debian-16 \
+    zhparser/zhparser:bookworm-16
+
+ready=0  # set to 1 once the server answers a query; checked after the loop
+for _ in $(seq 1 30); do
+    PGPASSWORD=somepassword@debian-16 psql -X -h 127.0.0.1 -U postgres \
+        -tAc 'select 1' postgres >/dev/null 2>&1 && { ready=1; break; }
+    sleep 1
+done
+[ "$ready" -eq 1 ] || { echo "postgres did not become ready after 30 attempts" >&2; exit 1; }
+
+export PGPASSWORD=somepassword@debian-16
+if psql -h 127.0.0.1 -X -a -q postgres postgres -f sql/zhparser.sql \
+        | diff expected/zhparser-debian.out -; then
     echo "pass!"
 else
     echo "do not pass!"
+    exit 1
 fi
-docker stop testpgzhparser-$pid
diff --git a/expected/zhparser.out b/expected/zhparser.out
new file mode 100644
index 0000000..4821563
--- /dev/null
+++ b/expected/zhparser.out
@@ -0,0 +1,98 @@
+CREATE EXTENSION zhparser;
+-- make test configuration using parser
+CREATE TEXT SEARCH CONFIGURATION testzhcfg (PARSER = zhparser);
+ALTER TEXT SEARCH CONFIGURATION testzhcfg ADD MAPPING FOR n,v,a,i,e,l WITH simple;
+-- ts_parse
+SELECT * FROM ts_parse('zhparser', 'hello world! 2010年保障房建设在全国范围内获全面启动，从中央到地方纷纷加大 了 保 障 房 的 建 设 和 投 入 力 度 。2011年，保障房进入了更大规模的建设阶段。住房城乡建设部党组书记、部长姜伟新去年底在全国住房城乡建设工作会议上表示，要继续推进保障性安居工程建设。');
+ tokid |  token   
+-------+----------
+   101 | hello
+   101 | world
+   117 | !
+   101 | 2010
+   113 | 年
+   118 | 保障
+   110 | 房建
+   118 | 设在
+   110 | 全国
+   110 | 范围
+   102 | 内
+   118 | 获
+    97 | 全面
+   118 | 启动
+   117 | ，
+   110 | 从中
+   118 | 央
+   118 | 到
+   110 | 地方
+   100 | 纷纷
+   118 | 加大
+   118 | 了
+   118 | 保
+   110 | 障
+   110 | 房
+   117 | 的
+   118 | 建
+   118 | 设
+    99 | 和
+   118 | 投
+   118 | 入
+   110 | 力
+   107 | 度
+   117 | 。
+   101 | 2011
+   113 | 年
+   117 | ，
+   118 | 保障
+   110 | 房
+   118 | 进入
+   118 | 了
+   100 | 更
+   110 | 大规模
+   117 | 的
+   118 | 建设
+   110 | 阶段
+   117 | 。
+   110 | 住房
+   110 | 城乡建设
+   110 | 部党组
+   110 | 书记
+   117 | 、
+   110 | 部长
+   110 | 姜
+   110 | 伟
+    97 | 新
+   116 | 去年底
+   112 | 在
+   110 | 全国
+   110 | 住房
+   110 | 城乡建设
+   118 | 工作
+   110 | 会议
+   110 | 上表
+   118 | 示
+   117 | ，
+   118 | 要
+   118 | 继续
+   118 | 推进
+   110 | 保障性
+   118 | 安居
+   110 | 工程建设
+   117 | 。
+(73 rows)
+
+SELECT to_tsvector('testzhcfg','“今年保障房新开工数量虽然有所下调，但实际的年度在建规模以及竣工规模会超以往年份，相对应的对资金的需求也会创历史纪录。”陈国强说。在他看来，与2011年相比，2012年的保障房建设在资金配套上的压力将更为严峻。');
+                                                                                                                                                              to_tsvector                                                                                                                                                              
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ '2011':27 '2012':29 '上':35 '下调':7 '严峻':37 '会':14 '会创':20 '保障':1,30 '历史':21 '压力':36 '国强':24 '在建':10 '实际':8 '对应':17 '年份':16 '年度':9 '开工':4 '房':2 '房建':31 '数量':5 '新':3 '有所':6 '相比':28 '看来':26 '竣工':12 '纪录':22 '规模':11,13 '设在':32 '说':25 '资金':18,33 '超':15 '配套':34 '陈':23 '需求':19
+(1 row)
+
+SELECT to_tsquery('testzhcfg', '保障房资金压力');
+              to_tsquery               
+---------------------------------------
+ '保障' <-> '房' <-> '资金' <-> '压力'
+(1 row)
+
+-- clean extension
+DROP EXTENSION zhparser CASCADE;
+NOTICE:  drop cascades to text search configuration testzhcfg
diff --git a/expected/zhparser_hardening.out b/expected/zhparser_hardening.out
new file mode 100644
index 0000000..b04be1a
--- /dev/null
+++ b/expected/zhparser_hardening.out
@@ -0,0 +1,81 @@
+-- ===========================================================================
+-- zhparser hardening regression tests
+--
+-- Tests are independent of dictionary tokenization output; they verify
+-- the structural / behavioural fixes shipped in 2.4.
+--
+-- NOTE on GUC tests: zhparser.{extra_dicts,dict_in_memory} are PGC_BACKEND,
+-- which means PostgreSQL itself rejects SET inside a session ("cannot be
+-- set after connection start"). The path-traversal validation at the C
+-- level is exercised at startup time; pg_regress cannot easily test it
+-- without restarting backends. We instead verify the GUCs are registered
+-- with the correct context.
+-- ===========================================================================
+CREATE EXTENSION IF NOT EXISTS zhparser;
+-- ----- 1. lex types: y (modal) and z (status) must be present -----
+-- Regression for the [a,x] truncation bug.
+SELECT count(*) AS lex_type_count FROM ts_token_type('zhparser');
+ lex_type_count 
+----------------
+             26
+(1 row)
+
+SELECT alias FROM ts_token_type('zhparser') WHERE alias IN ('y','z') ORDER BY alias;
+ alias 
+-------
+ y
+ z
+(2 rows)
+
+-- ----- 2. GUC registration: 8 zhparser.* GUCs exist with expected contexts -
+SELECT name, context, vartype
+FROM pg_settings
+WHERE name LIKE 'zhparser.%'
+ORDER BY name;
+            name             | context | vartype 
+-----------------------------+---------+---------
+ zhparser.dict_in_memory     | backend | bool
+ zhparser.extra_dicts        | backend | string
+ zhparser.multi_duality      | user    | bool
+ zhparser.multi_short        | user    | bool
+ zhparser.multi_zall         | user    | bool
+ zhparser.multi_zmain        | user    | bool
+ zhparser.punctuation_ignore | user    | bool
+ zhparser.seg_with_duality   | user    | bool
+(8 rows)
+
+-- ----- 3. Per-call state isolation -----
+-- Two parser invocations side-by-side must not corrupt each other's
+-- token streams. If the global-state bug from <2.4 were back, one of
+-- these subqueries would observe the other's input.
+WITH
+    a AS (SELECT string_agg(token, ',') AS s FROM ts_parse('zhparser', 'hello')),
+    b AS (SELECT string_agg(token, ',') AS s FROM ts_parse('zhparser', 'world'))
+SELECT
+    (a.s LIKE '%hello%') AS a_has_hello,
+    (a.s LIKE '%world%') AS a_has_world,
+    (b.s LIKE '%hello%') AS b_has_hello,
+    (b.s LIKE '%world%') AS b_has_world
+FROM a, b;
+ a_has_hello | a_has_world | b_has_hello | b_has_world 
+-------------+-------------+-------------+-------------
+ t           | f           | f           | t
+(1 row)
+
+-- ----- 4. sync_zhprs_custom_word: regex guard must be active -----
+SELECT
+    (pg_get_functiondef(p.oid) LIKE '%format(%') AS uses_format_func,
+    (pg_get_functiondef(p.oid) LIKE '%[A-Za-z0-9_]%') AS has_dbname_regex
+FROM pg_proc p
+JOIN pg_namespace n ON n.oid = p.pronamespace
+WHERE n.nspname = 'public' AND p.proname = 'sync_zhprs_custom_word';
+ uses_format_func | has_dbname_regex 
+------------------+------------------
+ t                | t
+(1 row)
+
+-- ----- 5. Session-scoped GUCs are still mutable -----
+SET zhparser.punctuation_ignore = on;
+SET zhparser.multi_short = on;
+SET zhparser.multi_zall = on;
+RESET ALL;
diff --git a/regress/Dockerfile b/regress/Dockerfile
new file mode 100644
index 0000000..34e988c
--- /dev/null
+++ b/regress/Dockerfile
@@ -0,0 +1,126 @@
+# syntax=docker/dockerfile:1.6
+#
+# Minimal regression-test container for zhparser.
+#
+# Purpose-built for `make installcheck` only — not a production image.
+# Source comes from the build context (the patched zhparser tree) so the
+# image always tests the working copy you have on disk.
+#
+# Examples:
+#   # Default check against PG 16
+#   docker build -f regress/Dockerfile -t zhparser-regress:pg16 .
+#   docker run --rm zhparser-regress:pg16
+#
+#   # PG 17
+#   docker build -f regress/Dockerfile --build-arg PG_VERSION=17 \
+#       -t zhparser-regress:pg17 .
+#   docker run --rm zhparser-regress:pg17
+#
+#   # Refresh expected/*.out — result files are copied to a host directory.
+#   docker run --rm -v "$PWD/expected:/host-expected" \
+#       zhparser-regress:pg16 refresh
+#
+#   # Drop into a debug shell with the cluster running.
+#   docker run --rm -it zhparser-regress:pg16 shell
+
+ARG PG_VERSION=16
+
+# ===========================================================================
+# Stage 1: build SCWS + zhparser
+# ===========================================================================
+FROM postgres:${PG_VERSION}-bookworm AS builder
+
+ARG SCWS_VERSION=1.2.3
+
+RUN set -eux; \
+    apt-get update; \
+    apt-get install -y --no-install-recommends \
+        build-essential \
+        ca-certificates \
+        curl \
+        pkg-config \
+        autoconf \
+        automake \
+        libtool \
+        m4 \
+        postgresql-server-dev-${PG_MAJOR}; \
+    rm -rf /var/lib/apt/lists/*
+
+# Build SCWS from source.
+RUN set -eux; \
+    curl -fsSL "https://github.com/hightman/scws/archive/refs/tags/${SCWS_VERSION}.tar.gz" \
+        -o /tmp/scws.tar.gz; \
+    mkdir /tmp/scws && tar -xzf /tmp/scws.tar.gz -C /tmp/scws --strip-components=1; \
+    cd /tmp/scws; \
+    touch README; \
+    aclocal; autoconf; autoheader; libtoolize --force; automake --add-missing; \
+    ./configure --prefix=/usr/local; \
+    make -j"$(nproc)"; \
+    make install; \
+    ldconfig
+
+# Build zhparser from the build context (the patched tree on disk).
+COPY . /src/zhparser
+RUN set -eux; \
+    cd /src/zhparser; \
+    make clean || true; \
+    make PG_CONFIG="$(which pg_config)"; \
+    make PG_CONFIG="$(which pg_config)" install; \
+    PKGLIB="$(pg_config --pkglibdir)"; \
+    SHAREDIR="$(pg_config --sharedir)"; \
+    mkdir -p /artifacts/lib /artifacts/extension /artifacts/tsearch_data; \
+    cp "$PKGLIB/zhparser.so" /artifacts/lib/; \
+    cp "$SHAREDIR/extension/"zhparser*    /artifacts/extension/; \
+    cp "$SHAREDIR/tsearch_data/dict.utf8.xdb"  /artifacts/tsearch_data/; \
+    cp "$SHAREDIR/tsearch_data/rules.utf8.ini" /artifacts/tsearch_data/
+
+# ===========================================================================
+# Stage 2: minimal runtime image for pg_regress
+# ===========================================================================
+FROM postgres:${PG_VERSION}-bookworm
+
+# We need pg_regress (only shipped in -dev) and diffutils.
+RUN set -eux; \
+    apt-get update; \
+    apt-get install -y --no-install-recommends \
+        postgresql-server-dev-${PG_MAJOR} \
+        diffutils \
+        ca-certificates; \
+    rm -rf /var/lib/apt/lists/*
+
+# Copy SCWS runtime and zhparser artifacts.
+COPY --from=builder /usr/local/lib/libscws.so* /usr/local/lib/
+
+# zhparser artifacts staged at /artifacts/ in builder; install into the
+# right pg_config-derived directories of the runtime image.
+COPY --from=builder /artifacts/lib/zhparser.so       /tmp/artifacts/lib/
+COPY --from=builder /artifacts/extension/            /tmp/artifacts/extension/
+COPY --from=builder /artifacts/tsearch_data/         /tmp/artifacts/tsearch_data/
+
+RUN set -eux; \
+    PKGLIB="$(pg_config --pkglibdir)"; \
+    SHAREDIR="$(pg_config --sharedir)"; \
+    install -m 0755 /tmp/artifacts/lib/zhparser.so "$PKGLIB/"; \
+    cp /tmp/artifacts/extension/*    "$SHAREDIR/extension/"; \
+    cp /tmp/artifacts/tsearch_data/* "$SHAREDIR/tsearch_data/"; \
+    rm -rf /tmp/artifacts; \
+    ldconfig
+
+# Bring the patched source tree into the runtime image so pg_regress can
+# read sql/ and expected/, and write its results/ subdirectory.
+COPY --from=builder --chown=postgres:postgres /src/zhparser /home/postgres/zhparser
+
+# Entry-point script comes from the builder stage so we don't need a
+# second context scan.
+COPY --from=builder /src/zhparser/regress/run-regress.sh /usr/local/bin/run-regress
+RUN chmod +x /usr/local/bin/run-regress
+
+ENV PGDATA=/var/lib/postgresql/regress \
+    LANG=C.UTF-8 \
+    PG_REGRESS_PORT=55432
+
+USER postgres
+WORKDIR /home/postgres/zhparser
+
+ENTRYPOINT ["/usr/local/bin/run-regress"]
+CMD ["check"]
diff --git a/regress/README.md b/regress/README.md
new file mode 100644
index 0000000..1ac30d0
--- /dev/null
+++ b/regress/README.md
@@ -0,0 +1,61 @@
+# zhparser regression-test container
+
+Minimal, single-purpose Docker image that builds SCWS + zhparser from the
+working copy on disk and runs `pg_regress` against it. Intended to give a
+fast green/red signal after editing the C code.
+
+## Files
+
+| Path | Purpose |
+| --- | --- |
+| `Dockerfile`     | 2-stage build: builder (toolchain + SCWS + zhparser) → runtime (PG image with artifacts copied in) |
+| `run-regress.sh` | Container entrypoint. Modes: `check` / `refresh` / `shell` |
+| `regress.sh`     | Host-side wrapper. Hides `docker build` / `docker run` ceremony, supports `matrix` mode |
+
+## Quick start
+
+```bash
+# From the project root.
+regress/regress.sh check          # PG 16 by default
+regress/regress.sh check 17       # PG 17
+regress/regress.sh matrix         # 16 + 17 + 18
+
+# Drop into a shell with the test cluster running:
+regress/regress.sh shell 16
+# (inside) psql -h /tmp -p 55432 -U postgres
+```
+
+## Refreshing expected output
+
+The 2.4 patch includes a real bug-fix for lex-type truncation: tokens of
+type `y` (modal) and `z` (status) used to be silently coerced into `x`
+(unknown). The pre-existing `expected/zhparser-{alpine,debian}.out` files
+encode the old, buggy behaviour and must be regenerated:
+
+```bash
+regress/regress.sh refresh 16
+git diff expected/        # review carefully
+```
+
+`refresh` copies the freshly produced `results/zhparser.out` and `results/zhparser_hardening.out` into your
+working tree's `expected/` directory; it does not touch `expected/zhparser-{alpine,debian}.out`.
+
+## How it works
+
+The runtime stage runs as the `postgres` user with no superuser daemon:
+the entry script calls `initdb`, `pg_ctl start`, then `pg_regress`. The
+cluster lives in `$PGDATA=/var/lib/postgresql/regress` and listens on a
+unix socket under `/tmp` (port 55432, no TCP). Everything is torn down
+on exit via a `trap`.
+
+`pg_regress` runs against the source tree at `/home/postgres/zhparser`,
+which is the patched tree copied in from the builder. Both
+`sql/zhparser.sql` (the upstream tokenization smoke test) and
+`sql/zhparser_hardening.sql` (the 2.4 hardening assertions) execute as
+part of the run.
+
+## Why not `docker compose`?
+
+Out of scope. The whole point is a single ephemeral container that
+exits 0 / non-zero. If you need multi-service tests later (e.g. a
+client + server pair), that's a different picture and warrants compose.
diff --git a/regress/regress.sh b/regress/regress.sh
new file mode 100755
index 0000000..9044272
--- /dev/null
+++ b/regress/regress.sh
@@ -0,0 +1,70 @@
+#!/usr/bin/env bash
+#
+# regress.sh — convenience wrapper around the regression-test container.
+#
+# Usage:
+#   regress/regress.sh check   [16|17|18]   # default: 16
+#   regress/regress.sh refresh [16|17|18]   # rewrites expected/{zhparser,zhparser_hardening}.out
+#   regress/regress.sh matrix                # build & test 16, 17, 18 in turn
+#   regress/regress.sh shell   [16|17|18]
+#
+# Requires: docker (or podman aliased as docker).
+
+set -euo pipefail
+
+ROOT="$(cd "$(dirname "$0")/.." && pwd)"
+cd "$ROOT"
+
+DOCKER="${DOCKER:-docker}"
+if ! command -v "$DOCKER" >/dev/null 2>&1; then
+    echo "regress.sh: '$DOCKER' not found. Install Docker or set DOCKER=podman." >&2
+    exit 2
+fi
+
+build() {
+    local pg="$1"
+    "$DOCKER" build \
+        -f regress/Dockerfile \
+        --build-arg "PG_VERSION=$pg" \
+        -t "zhparser-regress:pg$pg" \
+        .
+}
+
+run_check() {
+    local pg="$1"
+    build "$pg"
+    "$DOCKER" run --rm "zhparser-regress:pg$pg"
+}
+
+run_refresh() {
+    local pg="$1"
+    build "$pg"
+    "$DOCKER" run --rm \
+        -v "$ROOT/expected:/host-expected" \
+        "zhparser-regress:pg$pg" refresh
+}
+
+run_shell() {
+    local pg="$1"
+    build "$pg"
+    "$DOCKER" run --rm -it "zhparser-regress:pg$pg" shell
+}
+
+cmd="${1:-check}"
+pg="${2:-16}"
+
+case "$cmd" in
+    check)   run_check   "$pg" ;;
+    refresh) run_refresh "$pg" ;;
+    shell)   run_shell   "$pg" ;;
+    matrix)
+        for v in 16 17 18; do
+            echo "==================== PG $v ===================="
+            run_check "$v"
+        done
+        ;;
+    *)
+        echo "usage: $0 {check|refresh|shell|matrix} [PG_VERSION]" >&2
+        exit 2
+        ;;
+esac
diff --git a/regress/run-regress.sh b/regress/run-regress.sh
new file mode 100755
index 0000000..d469f0d
--- /dev/null
+++ b/regress/run-regress.sh
@@ -0,0 +1,136 @@
+#!/usr/bin/env bash
+#
+# run-regress — entrypoint for the zhparser regression-test container.
+#
+# Modes:
+#   check     (default)  Run `pg_regress` against an ephemeral cluster.
+#                        Exit 0 on PASS, non-zero on diff.
+#   refresh              Run pg_regress (diffs are tolerated) and copy the
+#                        produced results/*.out files to /host-expected (mount
+#                        this as a host volume to receive them).
+#   shell                Bring up the cluster and drop into bash.
+#
+# Environment:
+#   PGDATA               cluster directory (default: /var/lib/postgresql/regress)
+#   PG_REGRESS_PORT      port the throw-away cluster listens on (default: 55432)
+
+set -euo pipefail
+
+mode="${1:-check}"
+
+PGBIN="$(pg_config --bindir)"
+PG_REGRESS="$(pg_config --pkglibdir)/pgxs/src/test/regress/pg_regress"
+PORT="${PG_REGRESS_PORT:-55432}"
+PGDATA="${PGDATA:-/var/lib/postgresql/regress}"
+
+# pg_regress ships inside the PGXS tree. If the pkglibdir-based guess above is
+# wrong, derive it from `pg_config --pgxs`, then fall back to PATH and find.
+if [ ! -x "$PG_REGRESS" ]; then
+    if [ -x "$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress" ]; then
+        PG_REGRESS="$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress"
+    elif command -v pg_regress >/dev/null 2>&1; then
+        PG_REGRESS="$(command -v pg_regress)"
+    else
+        # Debian alternative location.
+        cand=$(find /usr/lib/postgresql -name pg_regress 2>/dev/null | head -1 || true)
+        if [ -n "$cand" ]; then
+            PG_REGRESS="$cand"
+        else
+            echo "run-regress: cannot find pg_regress" >&2
+            exit 2
+        fi
+    fi
+fi
+
+init_cluster() {
+    if [ ! -s "$PGDATA/PG_VERSION" ]; then
+        rm -rf "$PGDATA"
+        "$PGBIN/initdb" -U postgres --auth=trust --no-sync \
+            --locale=C.UTF-8 --encoding=UTF8 -D "$PGDATA" >/dev/null
+    fi
+}
+
+start_cluster() {
+    "$PGBIN/pg_ctl" -D "$PGDATA" -l "$PGDATA/server.log" -w \
+        -o "-p $PORT -c unix_socket_directories=/tmp -c listen_addresses=" \
+        start
+}
+
+stop_cluster() {
+    "$PGBIN/pg_ctl" -D "$PGDATA" -m fast stop >/dev/null 2>&1 || true
+}
+
+trap stop_cluster EXIT
+
+case "$mode" in
+    check)
+        init_cluster
+        start_cluster
+
+        # pg_regress runs in the source tree so it picks up sql/ and expected/.
+        cd /home/postgres/zhparser
+        "$PG_REGRESS" \
+            --inputdir=. \
+            --outputdir=. \
+            --bindir="$PGBIN" \
+            --host=/tmp \
+            --port="$PORT" \
+            --user=postgres \
+            zhparser zhparser_hardening
+
+        echo "pg_regress: PASS"
+        ;;
+
+    refresh)
+        if [ ! -d /host-expected ]; then
+            echo "refresh mode requires -v <path>:/host-expected mounted" >&2
+            exit 2
+        fi
+        init_cluster
+        start_cluster
+
+        cd /home/postgres/zhparser
+
+        # pg_regress bails if expected/<test>.out is missing. For refresh,
+        # ensure all expected files exist (empty if necessary) so the
+        # diff step runs and we get the corresponding results/<test>.out
+        # which we then promote to the host.
+        for t in zhparser zhparser_hardening; do
+            [ -f "expected/$t.out" ] || : > "expected/$t.out"
+        done
+
+        set +e
+        "$PG_REGRESS" \
+            --inputdir=. \
+            --outputdir=. \
+            --bindir="$PGBIN" \
+            --host=/tmp \
+            --port="$PORT" \
+            --user=postgres \
+            zhparser zhparser_hardening
+        rc=$?
+        set -e
+
+        for t in zhparser zhparser_hardening; do
+            if [ -f "results/$t.out" ]; then
+                cp "results/$t.out" "/host-expected/$t.out"
+                echo "refresh: wrote /host-expected/$t.out"
+            else
+                echo "refresh: results/$t.out missing" >&2
+            fi
+        done
+        echo "refresh: done (pg_regress rc=$rc)"
+        ;;
+
+    shell)
+        init_cluster
+        start_cluster
+        echo "Cluster up on /tmp:$PORT (postgres/trust). Type 'exit' to stop."
+        bash  # not exec'd: this script must outlive the shell so the EXIT trap stops the cluster
+        ;;
+
+    *)
+        echo "usage: $0 [check|refresh|shell]" >&2
+        exit 2
+        ;;
+esac
diff --git a/sql/zhparser_hardening.sql b/sql/zhparser_hardening.sql
new file mode 100644
index 0000000..ba084d7
--- /dev/null
+++ b/sql/zhparser_hardening.sql
@@ -0,0 +1,54 @@
+-- ===========================================================================
+-- zhparser hardening regression tests
+--
+-- Tests are independent of dictionary tokenization output; they verify
+-- the structural / behavioural fixes shipped in 2.4.
+--
+-- NOTE on GUC tests: zhparser.{extra_dicts,dict_in_memory} are PGC_BACKEND,
+-- which means PostgreSQL itself rejects SET inside a session ("cannot be
+-- set after connection start"). The path-traversal validation at the C
+-- level is exercised at startup time; pg_regress cannot easily test it
+-- without restarting backends. We instead verify the GUCs are registered
+-- with the correct context.
+-- ===========================================================================
+
+CREATE EXTENSION IF NOT EXISTS zhparser;
+
+-- ----- 1. lex types: y (modal) and z (status) must be present -----
+-- Regression for the [a,x] truncation bug.
+SELECT count(*) AS lex_type_count FROM ts_token_type('zhparser');
+SELECT alias FROM ts_token_type('zhparser') WHERE alias IN ('y','z') ORDER BY alias;
+
+-- ----- 2. GUC registration: 8 zhparser.* GUCs exist with expected contexts -
+SELECT name, context, vartype
+FROM pg_settings
+WHERE name LIKE 'zhparser.%'
+ORDER BY name;
+
+-- ----- 3. Per-call state isolation -----
+-- Two parser invocations side-by-side must not corrupt each other's
+-- token streams. If the global-state bug from <2.4 were back, one of
+-- these subqueries would observe the other's input.
+WITH
+    a AS (SELECT string_agg(token, ',') AS s FROM ts_parse('zhparser', 'hello')),
+    b AS (SELECT string_agg(token, ',') AS s FROM ts_parse('zhparser', 'world'))
+SELECT
+    (a.s LIKE '%hello%') AS a_has_hello,
+    (a.s LIKE '%world%') AS a_has_world,
+    (b.s LIKE '%hello%') AS b_has_hello,
+    (b.s LIKE '%world%') AS b_has_world
+FROM a, b;
+
+-- ----- 4. sync_zhprs_custom_word: regex guard must be active -----
+SELECT
+    (pg_get_functiondef(p.oid) LIKE '%format(%') AS uses_format_func,
+    (pg_get_functiondef(p.oid) LIKE '%[A-Za-z0-9_]%') AS has_dbname_regex
+FROM pg_proc p
+JOIN pg_namespace n ON n.oid = p.pronamespace
+WHERE n.nspname = 'public' AND p.proname = 'sync_zhprs_custom_word';
+
+-- ----- 5. Session-scoped GUCs are still mutable -----
+SET zhparser.punctuation_ignore = on;
+SET zhparser.multi_short = on;
+SET zhparser.multi_zall = on;
+RESET ALL;
diff --git a/zhparser--2.3--2.4.sql b/zhparser--2.3--2.4.sql
new file mode 100644
index 0000000..d08906d
--- /dev/null
+++ b/zhparser--2.3--2.4.sql
@@ -0,0 +1,43 @@
+/*
+ * 2.3 -> 2.4
+ *
+ * - Replace string concatenation in sync_zhprs_custom_word() with
+ *   format() / %L to mitigate path injection through current_database()
+ *   and embedded quotes.
+ * - Validate database name characters before writing the dict file.
+ *
+ * Existing dict files on disk are unaffected.
+ */
+
+CREATE OR REPLACE FUNCTION sync_zhprs_custom_word() RETURNS void LANGUAGE plpgsql AS
+$$
+declare
+    data_dir       text;
+    db_name        text;
+    dict_path      text;
+    time_tag_path  text;
+    query          text;
+begin
+    select setting from pg_settings where name = 'data_directory'
+        into data_dir;
+    if data_dir is null then
+        raise exception 'zhparser: cannot resolve data_directory';
+    end if;
+
+    db_name := current_database();
+    if db_name !~ '^[A-Za-z0-9_]+$' then
+        raise exception 'zhparser: refusing to write custom dict for database name "%" (only [A-Za-z0-9_] allowed)', db_name;
+    end if;
+
+    dict_path     := data_dir || '/base/zhprs_dict_' || db_name || '.txt';
+    time_tag_path := data_dir || '/base/zhprs_dict_' || db_name || '.tag';
+
+    query := format(
+        'copy (select word, tf, idf, attr from zhparser.zhprs_custom_word) to %L encoding %L',
+        dict_path, 'utf8');
+    execute query;
+
+    query := format('copy (select now()) to %L', time_tag_path);
+    execute query;
+end;
+$$;
diff --git a/zhparser--2.4.sql b/zhparser--2.4.sql
new file mode 100644
index 0000000..ee1a8da
--- /dev/null
+++ b/zhparser--2.4.sql
@@ -0,0 +1,91 @@
+CREATE FUNCTION zhprs_start(internal, int4)
+RETURNS internal
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT;
+
+CREATE FUNCTION zhprs_getlexeme(internal, internal, internal)
+RETURNS internal
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT;
+
+CREATE FUNCTION zhprs_end(internal)
+RETURNS void
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT;
+
+CREATE FUNCTION zhprs_lextype(internal)
+RETURNS internal
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT;
+
+CREATE TEXT SEARCH PARSER zhparser (
+    START    = zhprs_start,
+    GETTOKEN = zhprs_getlexeme,
+    END      = zhprs_end,
+    HEADLINE = pg_catalog.prsd_headline,
+    LEXTYPES = zhprs_lextype
+);
+
+
+CREATE SCHEMA zhparser;
+CREATE TABLE zhparser.zhprs_custom_word(
+    word text primary key,
+    tf   float default '1.0',
+    idf  float default '1.0',
+    attr char  default '@',
+    check (attr = '@' or attr = '!')
+);
+
+/*
+ * sync_zhprs_custom_word
+ *
+ * 2.4 hardening: build the COPY statements via format() with %L
+ * placeholders and validate the database name. Refuses to run if
+ * current_database() contains characters that are unsafe in a
+ * filesystem path; this matches the C side, which also refuses to
+ * load such paths.
+ *
+ * Requires superuser/pg_write_server_files privileges to actually
+ * perform COPY ... TO 'filename', which is unchanged from prior
+ * versions.
+ */
+CREATE FUNCTION sync_zhprs_custom_word() RETURNS void LANGUAGE plpgsql AS
+$$
+declare
+    data_dir       text;
+    db_name        text;
+    dict_path      text;
+    time_tag_path  text;
+    query          text;
+begin
+    select setting from pg_settings where name = 'data_directory'
+        into data_dir;
+    if data_dir is null then
+        raise exception 'zhparser: cannot resolve data_directory';
+    end if;
+
+    db_name := current_database();
+    if db_name !~ '^[A-Za-z0-9_]+$' then
+        raise exception 'zhparser: refusing to write custom dict for database name "%" (only [A-Za-z0-9_] allowed)', db_name;
+    end if;
+
+    dict_path     := data_dir || '/base/zhprs_dict_' || db_name || '.txt';
+    time_tag_path := data_dir || '/base/zhprs_dict_' || db_name || '.tag';
+
+    /*
+     * %L on the path quotes it as a SQL string literal (handles single
+     * quotes). %I is irrelevant here; COPY does not accept identifiers.
+     * The encoding is hard-coded utf8.
+     */
+    query := format(
+        'copy (select word, tf, idf, attr from zhparser.zhprs_custom_word) to %L encoding %L',
+        dict_path, 'utf8');
+    execute query;
+
+    query := format('copy (select now()) to %L', time_tag_path);
+    execute query;
+end;
+$$;
+
+-- do not create custom dict files on fresh install
+-- select sync_zhprs_custom_word();
diff --git a/zhparser-backup-custom-dict.sh b/zhparser-backup-custom-dict.sh
index 7c345a1..8e68d83 100755
--- a/zhparser-backup-custom-dict.sh
+++ b/zhparser-backup-custom-dict.sh
@@ -1,47 +1,135 @@
-if [ $# -lt 2 ];then
-	echo "usage: $0 <backup|restore|delete> <pgdata_dir> [restore_from_dir]"
-	echo "warning: delete is a dangerous cmd, it will delete your custom from pgdata_dir."
-       	echo "!!!you should run backup cmd first, then run the delete cmd !!!"
-	exit 2
-fi
-cmd=$1
-pgdata=$2
-restore_from_dir=$3
-
-if [ $cmd = 'backup' ];then
-	backup_dir=zhparser-backup-custom-dict-$(date +'%F:%T')
-	mkdir ./$backup_dir
-	echo "will backup $pgdata/base/zhprs_dict_* to $backup_dir/"
-	cp -a $pgdata/base/zhprs_dict_* $backup_dir/
-	if [ "$?" -ne 0 ];
-	then
-		echo "backup error!"
-		exit 1
-	else
-		echo "backup ok!"
-	fi
-fi
-
-if [ $cmd = 'delete' ];then
-	echo "will delete $pgdata/base/zhprs_dict_*"
-	rm $pgdata/base/zhprs_dict_*
-	if [ "$?" -ne 0 ];
-	then
-		echo "delete error!"
-		exit 1
-	else
-		echo "delete ok!"
-	fi
-fi
-
-if [ $cmd = 'restore' ];then
-	echo "will restore $restore_from_dir/zhprs_dict_* to $pgdata/base/"
-	cp -a $restore_from_dir/zhprs_dict_* $pgdata/base/
-	if [ "$?" -ne 0 ];
-	then
-		echo "restore error!"
-		exit 1
-	else
-		echo "restore ok!"
-	fi
-fi
+#!/usr/bin/env bash
+#
+# zhparser-backup-custom-dict.sh
+#
+# Manage zhparser custom dict files (zhprs_dict_*) under $PGDATA/base.
+#
+# Usage:
+#   backup  <pgdata_dir>                       Copy zhprs_dict_* to a timestamped backup dir.
+#   restore <pgdata_dir> <restore_from_dir>    Copy zhprs_dict_* from <restore_from_dir> back to <pgdata_dir>/base.
+#   delete  <pgdata_dir> [--yes] [--dry-run]   Delete zhprs_dict_* under <pgdata_dir>/base.
+#
+# Hardening (vs. original):
+#   - set -euo pipefail
+#   - Validates PGDATA exists and looks plausible.
+#   - delete refuses to run without --yes; supports --dry-run.
+#   - Expands globs into quoted bash arrays (nullglob) to handle weird filenames.
+
+set -euo pipefail
+
+usage() {
+    cat <<EOF >&2
+usage:
+  $0 backup  <pgdata_dir>
+  $0 restore <pgdata_dir> <restore_from_dir>
+  $0 delete  <pgdata_dir> [--yes] [--dry-run]
+
+WARNING: 'delete' permanently removes zhparser custom dict files.
+         Always run 'backup' first.
+EOF
+    exit 2
+}
+
+require_pgdata() {
+    # Abort with exit status 1 unless $1 is a directory that contains base/.
+    local dir="$1"
+    [ -d "$dir" ] && [ -d "$dir/base" ] && return 0
+    echo "error: \"$dir\" does not look like a PGDATA directory (missing $dir/base)" >&2
+    exit 1
+}
+
+cmd="${1:-}"
+[ -n "$cmd" ] || usage
+
+case "$cmd" in
+    backup)
+        [ "$#" -ge 2 ] || usage
+        pgdata="$2"
+        require_pgdata "$pgdata"
+        ts=$(date +'%Y-%m-%dT%H-%M-%S')
+        backup_dir="zhparser-backup-custom-dict-$ts"
+        mkdir "$backup_dir"
+        echo "Backing up $pgdata/base/zhprs_dict_* -> $backup_dir/"
+        # Avoid noisy 'no match' if there are zero matching files.
+        shopt -s nullglob
+        files=( "$pgdata"/base/zhprs_dict_* )
+        shopt -u nullglob
+        if [ "${#files[@]}" -eq 0 ]; then
+            echo "no zhprs_dict_* files found; backup directory left empty"
+        else
+            cp -a -- "${files[@]}" "$backup_dir/"
+            echo "backup ok"
+        fi
+        ;;
+
+    restore)
+        [ "$#" -ge 3 ] || usage
+        pgdata="$2"
+        restore_from_dir="$3"
+        require_pgdata "$pgdata"
+        if [ ! -d "$restore_from_dir" ]; then
+            echo "error: restore source \"$restore_from_dir\" does not exist" >&2
+            exit 1
+        fi
+        echo "Restoring $restore_from_dir/zhprs_dict_* -> $pgdata/base/"
+        shopt -s nullglob
+        files=( "$restore_from_dir"/zhprs_dict_* )
+        shopt -u nullglob
+        if [ "${#files[@]}" -eq 0 ]; then
+            echo "error: no zhprs_dict_* files in $restore_from_dir" >&2
+            exit 1
+        fi
+        cp -a -- "${files[@]}" "$pgdata/base/"
+        echo "restore ok"
+        ;;
+
+    delete)
+        [ "$#" -ge 2 ] || usage
+        pgdata="$2"
+        require_pgdata "$pgdata"
+        shift 2
+        confirm="no"
+        dry_run="no"
+        while [ "$#" -gt 0 ]; do
+            case "$1" in
+                --yes)     confirm="yes" ;;
+                --dry-run) dry_run="yes" ;;
+                *) echo "unknown flag: $1" >&2; exit 2 ;;
+            esac
+            shift
+        done
+
+        shopt -s nullglob
+        files=( "$pgdata"/base/zhprs_dict_* )
+        shopt -u nullglob
+
+        if [ "${#files[@]}" -eq 0 ]; then
+            echo "nothing to delete: no zhprs_dict_* under $pgdata/base"
+            exit 0
+        fi
+
+        echo "Will delete the following files under $pgdata/base:"
+        for f in "${files[@]}"; do
+            echo "  $f"
+        done
+
+        if [ "$dry_run" = "yes" ]; then
+            echo "(dry-run, no files removed)"
+            exit 0
+        fi
+
+        if [ "$confirm" != "yes" ]; then
+            echo
+            echo "REFUSING to delete without --yes. Re-run with:"
+            echo "    $0 delete $pgdata --yes"
+            exit 1
+        fi
+
+        rm -- "${files[@]}"
+        echo "delete ok"
+        ;;
+
+    *)
+        usage
+        ;;
+esac
diff --git a/zhparser.c b/zhparser.c
index 6fde37b..4367a7f 100644
--- a/zhparser.c
+++ b/zhparser.c
@@ -1,320 +1,625 @@
 /*-------------------------------------------------------------------------
  *
  * zhparser.c
- *	  a text search parser for Chinese
+ *	  A text search parser for Chinese based on SCWS.
+ *
+ * Hardened revision (PG 16/17/18):
+ *   - Per-call SCWS instance via scws_fork() to remove global mutable state.
+ *   - GUCs registered in _PG_init() and validated through check hooks.
+ *   - extra_dicts / database-name path traversal hardening.
+ *   - mmap as default dict load mode (shared via OS page cache).
+ *   - Lexeme attribute range fixed to ['a','z'] (was ['a','x']).
+ *   - Multi-mode flags are assembled in one helper, current_multi_mode().
  *
  *-------------------------------------------------------------------------
  */
-#include "zhparser.h"
-
 #include "postgres.h"
-#include "miscadmin.h"
+
 #include "fmgr.h"
-#include "utils/guc.h"
+#include "miscadmin.h"
+#include "commands/dbcommands.h"
 #include "utils/builtins.h"
-
-#if PG_VERSION_NUM >= 100000
+#include "utils/guc.h"
+#include "utils/memutils.h"
+#include "utils/palloc.h"
 #include "utils/varlena.h"
-#endif
 
-#include "commands/dbcommands.h"
+#include <ctype.h>
+#include <string.h>
+#include <sys/stat.h>
 
-/* dict file extension */
-#define TXT_EXT ".txt"
-#define XDB_EXT ".xdb"
-/* length of file extension */
-#define EXT_LEN 4
+#include "zhparser.h"
 
 PG_MODULE_MAGIC;
-/*
- * types
- */
 
-/* self-defined type */
-typedef struct
+/* ----------------------------------------------------------------------- *
+ * Constants
+ * ----------------------------------------------------------------------- */
+
+#define LEX_TYPE_COUNT		26			/* a..z */
+#define DICT_EXT_LEN		4			/* ".txt" / ".xdb" */
+#define TXT_EXT				".txt"
+#define XDB_EXT				".xdb"
+
+/* ----------------------------------------------------------------------- *
+ * Types
+ * ----------------------------------------------------------------------- */
+
+typedef struct ParserState
 {
-	char	   *buffer;			/* text to parse */
-	int		len;			/* length of the text in buffer */
-	int		pos;			/* position of the parser */
-	scws_t scws;
-	scws_res_t head;
-	scws_res_t curr;
+	char	   *buffer;			/* text to parse (palloc'd by caller) */
+	int			len;
+	int			pos;
+	scws_t		scws;			/* per-call SCWS instance (forked) */
+	scws_res_t	head;
+	scws_res_t	curr;
 } ParserState;
 
-/* copy-paste from wparser.h of tsearch2 */
-typedef struct
+typedef struct LexDescr
 {
 	int			lexid;
 	char	   *alias;
 	char	   *descr;
 } LexDescr;
 
-static void init();
+/* ----------------------------------------------------------------------- *
+ * GUC variables
+ * ----------------------------------------------------------------------- */
+
+static bool	dict_in_memory = false;
+static char *extra_dicts = NULL;
+
+static bool	punctuation_ignore = false;
+static bool	seg_with_duality = false;
+static bool	multi_short = false;
+static bool	multi_duality = false;
+static bool	multi_zmain = false;
+static bool	multi_zall = false;
+
+/* ----------------------------------------------------------------------- *
+ * Process-local state
+ *
+ * The "master" SCWS instance owns the loaded dictionary/rules. Every
+ * zhprs_start() forks a cheap per-call clone (scws_fork) so concurrent
+ * parser invocations within the same backend (e.g. SRFs, nested calls,
+ * subqueries) cannot trample each other.
+ * ----------------------------------------------------------------------- */
+
+static scws_t master_scws = NULL;
+static bool   master_load_failed = false;
+
+/* ----------------------------------------------------------------------- *
+ * Forward declarations
+ * ----------------------------------------------------------------------- */
 
+void _PG_init(void);
+void _PG_fini(void);
+
+PG_FUNCTION_INFO_V1(zhprs_start);
+PG_FUNCTION_INFO_V1(zhprs_getlexeme);
+PG_FUNCTION_INFO_V1(zhprs_end);
+PG_FUNCTION_INFO_V1(zhprs_lextype);
+
+static void ensure_master_loaded(void);
+static int	resolve_load_mode(void);
+static int	current_multi_mode(void);
+static bool is_safe_dict_filename(const char *name);
+static bool is_safe_database_name(const char *name);
 static void init_type(LexDescr descr[]);
 
+/* GUC hooks */
+static bool check_extra_dicts(char **newval, void **extra, GucSource source);
+
+/* ----------------------------------------------------------------------- *
+ * Lex type table  (static, copied per zhprs_lextype call)
+ * ----------------------------------------------------------------------- */
+
+static const struct
+{
+	int			lexid;
+	const char *alias;
+	const char *descr;
+} lex_types[LEX_TYPE_COUNT] = {
+	{ 'a', "a", "adjective,形容词" },
+	{ 'b', "b", "differentiation,区别词" },
+	{ 'c', "c", "conjunction,连词" },
+	{ 'd', "d", "adverb,副词" },
+	{ 'e', "e", "exclamation,感叹词" },
+	{ 'f', "f", "position,方位词" },
+	{ 'g', "g", "root,词根" },
+	{ 'h', "h", "head,前连接成分" },
+	{ 'i', "i", "idiom,成语" },
+	{ 'j', "j", "abbreviation,简称" },
+	{ 'k', "k", "tail,后连接成分" },
+	{ 'l', "l", "tmp,习用语" },
+	{ 'm', "m", "numeral,数词" },
+	{ 'n', "n", "noun,名词" },
+	{ 'o', "o", "onomatopoeia,拟声词" },
+	{ 'p', "p", "prepositional,介词" },
+	{ 'q', "q", "quantity,量词" },
+	{ 'r', "r", "pronoun,代词" },
+	{ 's', "s", "space,处所词" },
+	{ 't', "t", "time,时语素" },
+	{ 'u', "u", "auxiliary,助词" },
+	{ 'v', "v", "verb,动词" },
+	{ 'w', "w", "punctuation,标点符号" },
+	{ 'x', "x", "unknown,未知词" },
+	{ 'y', "y", "modal,语气词" },
+	{ 'z', "z", "status,状态词" },
+};
+
+/* ----------------------------------------------------------------------- *
+ * Helpers
+ * ----------------------------------------------------------------------- */
+
 /*
- * prototypes
+ * is_safe_dict_filename
+ *
+ * Whitelist for entries listed in zhparser.extra_dicts: ASCII [A-Za-z0-9_.-]
+ * only, no leading '.' or '-', and no "..", so a name cannot contain a path
+ * separator.  Uses explicit ranges because isalnum() is locale-dependent.
  */
-PG_FUNCTION_INFO_V1(zhprs_start);
-Datum		zhprs_start(PG_FUNCTION_ARGS);
+static bool
+is_safe_dict_filename(const char *name)
+{
+	const char *p;
 
-PG_FUNCTION_INFO_V1(zhprs_getlexeme);
-Datum		zhprs_getlexeme(PG_FUNCTION_ARGS);
+	if (name == NULL || name[0] == '\0' || name[0] == '.' || name[0] == '-')
+		return false;
 
-PG_FUNCTION_INFO_V1(zhprs_end);
-Datum		zhprs_end(PG_FUNCTION_ARGS);
+	for (p = name; *p; p++)
+	{
+		if (!((*p >= 'a' && *p <= 'z') || (*p >= 'A' && *p <= 'Z') ||
+			  (*p >= '0' && *p <= '9') || *p == '_' || *p == '.' || *p == '-'))
+			return false;
+	}
 
-PG_FUNCTION_INFO_V1(zhprs_lextype);
-Datum		zhprs_lextype(PG_FUNCTION_ARGS);
+	/* explicit ".." rejection */
+	if (strstr(name, "..") != NULL)
+		return false;
 
-static scws_t scws = NULL;
-static ParserState parser_state;
+	return true;
+}
 
-/* config */
-static bool dict_in_memory = false;
-static char * extra_dicts = NULL;
+/*
+ * is_safe_database_name
+ *
+ * The custom-word file is named after current_database().  Accept only
+ * non-empty ASCII [A-Za-z0-9_] names, the same set as the SQL-side regex in
+ * sync_zhprs_custom_word(); explicit ranges are used because isalnum() is
+ * locale-dependent and could let other bytes through.
+ */
+static bool
+is_safe_database_name(const char *name)
+{
+	const char *p;
+
+	if (name == NULL || name[0] == '\0')
+		return false;
 
-static bool punctuation_ignore = false;
-static bool seg_with_duality = false;
-static bool multi_short = false;
-static bool multi_duality = false;
-static bool multi_zmain = false;
-static bool multi_zall = false;
+	for (p = name; *p; p++)
+	{
+		if (!((*p >= 'a' && *p <= 'z') || (*p >= 'A' && *p <= 'Z') ||
+			  (*p >= '0' && *p <= '9') || *p == '_'))
+			return false;
+	}
+	return true;
+}
 
-static void init(){
-	char sharepath[MAXPGPATH];
-	char dict_path[MAXPGPATH];
-	char rule_path[MAXPGPATH];
-	int load_dict_mem_mode = 0x0;
+/*
+ * resolve_load_mode
+ *
+ * Translate zhparser.dict_in_memory into the SCWS load-mode bits that callers
+ * OR with SCWS_XDICT_XDB / SCWS_XDICT_TXT: SCWS_XDICT_MEM when on, else 0.
+ * NOTE(review): the original note says libscws maps the .xdb file when
+ * SCWS_XDICT_MEM is absent -- confirm against the libscws version in use.
+ */
+static int
+resolve_load_mode(void)
+{
+	if (dict_in_memory)
+		return SCWS_XDICT_MEM;
+	return 0;
+}
 
-	List *elemlist;
-	ListCell *l;
+/* ----------------------------------------------------------------------- *
+ * GUC hooks
+ * ----------------------------------------------------------------------- */
 
-	if (!(scws = scws_new())) {
-		ereport(ERROR,
-				(errcode(ERRCODE_INTERNAL_ERROR),
-				 errmsg("Failed to init Chinese Parser Lib SCWS!\"%s\"",""
-				       )));
+static bool
+check_extra_dicts(char **newval, void **extra, GucSource source)
+{
+	List	   *elemlist;
+	ListCell   *l;
+	char	   *rawname;
+	bool		ok = true;
+
+	if (*newval == NULL || (*newval)[0] == '\0')
+		return true;
+
+	rawname = pstrdup(*newval);
+	if (!SplitIdentifierString(rawname, ',', &elemlist))
+	{
+		GUC_check_errdetail("List syntax is invalid.");
+		pfree(rawname);
+		return false;
 	}
-	
+
+	foreach(l, elemlist)
+	{
+		const char *name = (const char *) lfirst(l);
+		const char *ext;
+
+		if (!is_safe_dict_filename(name))
+		{
+			GUC_check_errdetail("Dict file name \"%s\" contains illegal characters.", name);
+			ok = false;
+			break;
+		}
+
+		ext = strrchr(name, '.');
+		if (ext == NULL || strlen(ext) != DICT_EXT_LEN ||
+			(strcmp(ext, TXT_EXT) != 0 && strcmp(ext, XDB_EXT) != 0))
+		{
+			GUC_check_errdetail("Dict file \"%s\" must end with .txt or .xdb.", name);
+			ok = false;
+			break;
+		}
+	}
+
+	list_free(elemlist);
+	pfree(rawname);
+	return ok;
+}
+
+/* Fold the four zhparser.multi_* booleans into one SCWS_MULTI_* bitmask; */
+/* returns 0 when none of them is set. */
+static int
+current_multi_mode(void)
+{
+	return (multi_short ? SCWS_MULTI_SHORT : 0)
+		| (multi_duality ? SCWS_MULTI_DUALITY : 0)
+		| (multi_zmain ? SCWS_MULTI_ZMAIN : 0)
+		| (multi_zall ? SCWS_MULTI_ZALL : 0);
+}
+
+/* ----------------------------------------------------------------------- *
+ * _PG_init / _PG_fini
+ * ----------------------------------------------------------------------- */
+
+void
+_PG_init(void)
+{
 	DefineCustomBoolVariable(
 		"zhparser.dict_in_memory",
-		"load dicts into memory",
-		"load dicts into memory",
+		"Load dicts into memory (private heap copy per backend).",
+		"When false (default) the dict is mmap'd, sharing via OS page cache.",
 		&dict_in_memory,
 		false,
 		PGC_BACKEND,
-		0,
-		NULL,
-		NULL,
-		NULL
-		);
+		0, NULL, NULL, NULL);
+
 	DefineCustomStringVariable(
 		"zhparser.extra_dicts",
-		"extra dicts files to load",
-		"extra dicts files to load",
+		"Extra dict files to load (comma separated, basenames only).",
+		"Names must end with .txt or .xdb and contain only [A-Za-z0-9_.-].",
 		&extra_dicts,
 		NULL,
 		PGC_BACKEND,
 		0,
+		check_extra_dicts,
 		NULL,
-		NULL,
-		NULL
-		);
+		NULL);
+
 	DefineCustomBoolVariable(
 		"zhparser.punctuation_ignore",
-		"set if zhparser ignores the puncuation",
-		"set if zhparser ignores the puncuation,except \\r and \\n",
+		"Ignore punctuation (except CR/LF).",
+		NULL,
 		&punctuation_ignore,
 		false,
 		PGC_USERSET,
-		0,
-		NULL,
-		NULL,
-		NULL
-		);
+		0, NULL, NULL, NULL);
 
 	DefineCustomBoolVariable(
 		"zhparser.seg_with_duality",
-		"segment words with duality",
-		"segment words with duality",
+		"Segment words with duality.",
+		NULL,
 		&seg_with_duality,
 		false,
 		PGC_USERSET,
-		0,
-		NULL,
-		NULL,
-		NULL
-		);
+		0, NULL, NULL, NULL);
+
 	DefineCustomBoolVariable(
 		"zhparser.multi_short",
-		"prefer short words",
-		"prefer short words",
+		"Prefer short words.",
+		NULL,
 		&multi_short,
 		false,
 		PGC_USERSET,
-		0,
-		NULL,
-		NULL,
-		NULL
-		);
+		0, NULL, NULL, NULL);
+
 	DefineCustomBoolVariable(
 		"zhparser.multi_duality",
-		"prefer duality",
-		"prefer duality",
+		"Prefer duality.",
+		NULL,
 		&multi_duality,
 		false,
 		PGC_USERSET,
-		0,
-		NULL,
-		NULL,
-		NULL
-		);
+		0, NULL, NULL, NULL);
+
 	DefineCustomBoolVariable(
 		"zhparser.multi_zmain",
-		"prefer most important element",
-		"prefer most important element",
+		"Prefer most important element.",
+		NULL,
 		&multi_zmain,
 		false,
 		PGC_USERSET,
-		0,
-		NULL,
-		NULL,
-		NULL
-		);
+		0, NULL, NULL, NULL);
+
 	DefineCustomBoolVariable(
 		"zhparser.multi_zall",
-		"prefer all element",
-		"prefer all element",
+		"Prefer all elements.",
+		NULL,
 		&multi_zall,
 		false,
 		PGC_USERSET,
-		0,
-		NULL,
-		NULL,
-		NULL
-		);
+		0, NULL, NULL, NULL);
 
-	get_share_path(my_exec_path, sharepath);
-	snprintf(dict_path, MAXPGPATH, "%s/tsearch_data/%s.%s",
-			sharepath, "dict.utf8", "xdb");
-	scws_set_charset(scws, "utf-8");
-
-	if(dict_in_memory)
-	    load_dict_mem_mode = SCWS_XDICT_MEM;
-
-	/* ignore error,default dict is xdb */
-	if( scws_set_dict(scws,dict_path,load_dict_mem_mode | SCWS_XDICT_XDB ) != 0){
-	    ereport(NOTICE,
-		    (errcode(ERRCODE_INTERNAL_ERROR),
-		     errmsg("zhparser set dict : \"%s\" failed!",dict_path
-			 )));
-	}
+#if PG_VERSION_NUM >= 150000
+	MarkGUCPrefixReserved("zhparser");
+#endif
+}
 
-	snprintf(dict_path, MAXPGPATH, "%s/base/zhprs_dict_%s.txt",
-			DataDir, get_database_name(MyDatabaseId));
-	if(scws_add_dict(scws, dict_path, load_dict_mem_mode | SCWS_XDICT_TXT) != 0 ){
-		ereport(LOG,
-			    (errcode(ERRCODE_INTERNAL_ERROR),
-			     errmsg("zhparser add dict : \"%s\" failed! May not config custom dict, omit this",dict_path
-				 )));
+void
+_PG_fini(void)
+{
+	if (master_scws != NULL)
+	{
+		scws_free(master_scws);
+		master_scws = NULL;
 	}
+}
 
-	if(extra_dicts != NULL){
-	    if(!SplitIdentifierString(pstrdup(extra_dicts),',',&elemlist)){
-		scws_free(scws);
-		list_free(elemlist);
-		scws = NULL;
+/* ----------------------------------------------------------------------- *
+ * ensure_master_loaded
+ *
+ * Lazy-loads the master SCWS instance. Safe to call repeatedly: on
+ * persistent failure we cache the failure for the rest of the backend
+ * lifetime instead of trying again on every call.
+ * ----------------------------------------------------------------------- */
+
+static void
+ensure_master_loaded(void)
+{
+	char		sharepath[MAXPGPATH];
+	char		dict_path[MAXPGPATH];
+	char		rule_path[MAXPGPATH];
+	int			load_mode;
+	List	   *elemlist = NIL;
+	ListCell   *l;
+	char	   *rawnames = NULL;
+	const char *dbname;
+	scws_t		newscws;
+
+	if (master_scws != NULL || master_load_failed)
+		return;
+
+	newscws = scws_new();
+	if (newscws == NULL)
+	{
+		master_load_failed = true;
 		ereport(ERROR,
 				(errcode(ERRCODE_INTERNAL_ERROR),
-				 errmsg("zhparser.extra_dicts syntax error! extra_dicts is \"%s\"",extra_dicts
-				       )));
-	    }
-
-	    foreach(l,elemlist){
-		int load_dict_mode = load_dict_mem_mode;
-		char * ext = strrchr((char*)lfirst(l),'.');
-		if(ext != NULL && strlen(ext) == EXT_LEN){
-		    if(strncmp(ext,TXT_EXT,EXT_LEN) == 0){
-			load_dict_mode |= SCWS_XDICT_TXT;
-		    }
-		    else if(strncmp(ext,XDB_EXT,EXT_LEN) == 0){
-			load_dict_mode |= SCWS_XDICT_XDB;
-		    }
-		}
+				 errmsg("failed to initialize SCWS")));
+	}
 
-		if(((load_dict_mode & SCWS_XDICT_TXT) == 0) &&
-			((load_dict_mode & SCWS_XDICT_XDB) == 0)){
-			scws_free(scws);
-			list_free(elemlist);
-			scws = NULL;
-			ereport(ERROR,
+	scws_set_charset(newscws, "utf-8");
+
+	get_share_path(my_exec_path, sharepath);
+
+	load_mode = resolve_load_mode();
+
+	/* 1) Built-in main dict ------------------------------------------------ */
+	snprintf(dict_path, MAXPGPATH, "%s/tsearch_data/dict.utf8.xdb", sharepath);
+	if (scws_set_dict(newscws, dict_path, load_mode | SCWS_XDICT_XDB) != 0)
+		ereport(NOTICE,
 				(errcode(ERRCODE_INTERNAL_ERROR),
-				 errmsg("zhparser.extra_dicts setting error,the file name must end with .txt or .xdb! error file name is \"%s\"",(char*)lfirst(l)
-				     )));
-		
+				 errmsg("zhparser: failed to set main dict \"%s\"", dict_path)));
+
+	/* 2) Per-database custom dict ---------------------------------------- */
+	dbname = get_database_name(MyDatabaseId);
+	if (!is_safe_database_name(dbname))
+	{
+		ereport(LOG,
+				(errmsg("zhparser: skipping custom dict for database \"%s\" "
+						"(name contains characters that are unsafe for filesystem paths)",
+						dbname != NULL ? dbname : "(null)")));
+	}
+	else
+	{
+		snprintf(dict_path, MAXPGPATH, "%s/base/zhprs_dict_%s.txt",
+				 DataDir, dbname);
+		if (scws_add_dict(newscws, dict_path, load_mode | SCWS_XDICT_TXT) != 0)
+			ereport(LOG,
+					(errmsg("zhparser: custom dict \"%s\" not loaded "
+							"(missing or unreadable; run zhparser.sync_zhprs_custom_word() if expected)",
+							dict_path)));
+	}
+
+	/* 3) extra_dicts ----------------------------------------------------- */
+	if (extra_dicts != NULL && extra_dicts[0] != '\0')
+	{
+		rawnames = pstrdup(extra_dicts);
+		if (!SplitIdentifierString(rawnames, ',', &elemlist))
+		{
+			pfree(rawnames);
+			scws_free(newscws);
+			master_load_failed = true;
+			ereport(ERROR,
+					(errcode(ERRCODE_INTERNAL_ERROR),
+					 errmsg("zhparser.extra_dicts has invalid syntax: \"%s\"",
+							extra_dicts)));
 		}
 
-		snprintf(dict_path, MAXPGPATH, "%s/tsearch_data/%s",
-			sharepath, (char*)lfirst(l));
-		/* ignore error*/
-		if( scws_add_dict(scws,dict_path,load_dict_mode) != 0 ){
-		    ereport(LOG,
-			    (errcode(ERRCODE_INTERNAL_ERROR),
-			     errmsg("zhparser add dict : \"%s\" failed for extra dict! omit",dict_path
-				 )));
+		foreach(l, elemlist)
+		{
+			const char *name = (const char *) lfirst(l);
+			const char *ext;
+			int			mode = load_mode;
+
+			/* Re-validate at load time too (defence in depth). */
+			if (!is_safe_dict_filename(name))
+			{
+				list_free(elemlist);
+				pfree(rawnames);
+				scws_free(newscws);
+				master_load_failed = true;
+				ereport(ERROR,
+						(errcode(ERRCODE_INTERNAL_ERROR),
+						 errmsg("zhparser.extra_dicts contains illegal name \"%s\"",
+								name)));
+			}
+
+			ext = strrchr(name, '.');
+			if (ext == NULL || strlen(ext) != DICT_EXT_LEN)
+			{
+				list_free(elemlist);
+				pfree(rawnames);
+				scws_free(newscws);
+				master_load_failed = true;
+				ereport(ERROR,
+						(errcode(ERRCODE_INTERNAL_ERROR),
+						 errmsg("zhparser.extra_dicts entry \"%s\" must end with .txt or .xdb",
+								name)));
+			}
+			if (strcmp(ext, TXT_EXT) == 0)
+				mode |= SCWS_XDICT_TXT;
+			else if (strcmp(ext, XDB_EXT) == 0)
+				mode |= SCWS_XDICT_XDB;
+			else
+			{
+				list_free(elemlist);
+				pfree(rawnames);
+				scws_free(newscws);
+				master_load_failed = true;
+				ereport(ERROR,
+						(errcode(ERRCODE_INTERNAL_ERROR),
+						 errmsg("zhparser.extra_dicts entry \"%s\" must end with .txt or .xdb",
+								name)));
+			}
+
+			snprintf(dict_path, MAXPGPATH, "%s/tsearch_data/%s",
+					 sharepath, name);
+			if (scws_add_dict(newscws, dict_path, mode) != 0)
+				ereport(LOG,
+						(errmsg("zhparser: failed to add extra dict \"%s\"",
+								dict_path)));
 		}
-	    }
-	    list_free(elemlist);
+
+		list_free(elemlist);
+		pfree(rawnames);
+	}
+
+	/* 4) Rules ----------------------------------------------------------- */
+	snprintf(rule_path, MAXPGPATH, "%s/tsearch_data/rules.utf8.ini", sharepath);
+	{
+		struct stat st;
+		if (stat(rule_path, &st) == 0)
+			scws_set_rule(newscws, rule_path);
+		else
+			ereport(LOG,
+					(errmsg("zhparser: rules file \"%s\" not found, continuing without rules",
+							rule_path)));
 	}
 
-	snprintf(rule_path, MAXPGPATH, "%s/tsearch_data/%s.%s",
-			sharepath, "rules.utf8", "ini");
-	scws_set_rule(scws ,rule_path);
+	/* Configure ignore/duality on the master so forks inherit them. */
+	scws_set_ignore(newscws, (int) punctuation_ignore);
+	scws_set_duality(newscws, (int) seg_with_duality);
+	scws_set_multi(newscws, current_multi_mode());
+
+	master_scws = newscws;
 }
 
-/*
- * functions
- */
+/* ----------------------------------------------------------------------- *
+ * Per-call resource cleanup
+ *
+ * MemoryContextRegisterResetCallback lets us guarantee that on any
+ * unwind (ERROR, transaction abort) we still free the forked SCWS
+ * instance and its result cursor.
+ * ----------------------------------------------------------------------- */
+
+static void
+parser_state_cleanup(void *arg)
+{
+	ParserState *state = (ParserState *) arg;
+
+	if (!state)
+		return;
+	/* Free the pending result list first and clear both cursors into it. */
+	if (state->head)
+	{
+		scws_free_result(state->head);
+		state->head = state->curr = NULL;
+	}
+	/* Then free the SCWS instance; NULL it so a repeat call is a no-op. */
+	if (state->scws)
+	{
+		scws_free(state->scws);
+		state->scws = NULL;
+	}
+}
+
+/* ----------------------------------------------------------------------- *
+ * SQL-callable functions
+ * ----------------------------------------------------------------------- */
 
 Datum
 zhprs_start(PG_FUNCTION_ARGS)
 {
-	ParserState *pst = &parser_state;
-	int multi_mode = 0x0;
-
-	if(scws == NULL)
-		init();
-	pst -> scws = scws;
-	pst -> buffer = (char *) PG_GETARG_POINTER(0);
-	pst -> len = PG_GETARG_INT32(1);
-	pst -> pos = 0;
+	ParserState			   *pst;
+	scws_t					forked;
+	MemoryContext			cxt;
+	MemoryContextCallback  *cb;
 
-	scws_set_ignore(scws, (int)punctuation_ignore);
-	scws_set_duality(scws,(int)seg_with_duality);
+	ensure_master_loaded();
+	if (master_scws == NULL)
+		ereport(ERROR,
+				(errcode(ERRCODE_INTERNAL_ERROR),
+				 errmsg("zhparser: SCWS not initialized")));
 
-	if(multi_short){
-	    multi_mode |= SCWS_MULTI_SHORT;
-	}
+	cxt = CurrentMemoryContext;
 
-	if(multi_duality){
-	    multi_mode |= SCWS_MULTI_DUALITY;
-	}
+	pst = (ParserState *) MemoryContextAllocZero(cxt, sizeof(ParserState));
 
-	if(multi_zmain){
-	    multi_mode |= SCWS_MULTI_ZMAIN;
+	forked = scws_fork(master_scws);
+	if (forked == NULL)
+	{
+		pfree(pst);
+		ereport(ERROR,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("zhparser: scws_fork() failed")));
 	}
 
-	if(multi_zall){
-	    multi_mode |= SCWS_MULTI_ZALL;
-	}
+	/*
+	 * Apply per-call user settings to the forked instance only.
+	 * The master is unaffected, so concurrent calls cannot collide.
+	 */
+	scws_set_ignore(forked, (int) punctuation_ignore);
+	scws_set_duality(forked, (int) seg_with_duality);
+	scws_set_multi(forked, current_multi_mode());
 
-	scws_set_multi(scws,multi_mode);
+	pst->scws   = forked;
+	pst->buffer = (char *) PG_GETARG_POINTER(0);
+	pst->len    = PG_GETARG_INT32(1);
+	pst->pos    = 0;
 
-	scws_send_text(pst -> scws, pst -> buffer, pst -> len);
+	/* Register cleanup before sending text, so any failure unwinds cleanly. */
+	cb = (MemoryContextCallback *) MemoryContextAllocZero(cxt, sizeof(*cb));
+	cb->func = parser_state_cleanup;
+	cb->arg  = pst;
+	MemoryContextRegisterResetCallback(cxt, cb);
 
-	(pst -> head) = (pst -> curr) = scws_get_result(pst -> scws);
+	scws_send_text(pst->scws, pst->buffer, pst->len);
+	pst->head = pst->curr = scws_get_result(pst->scws);
 
 	PG_RETURN_POINTER(pst);
 }
@@ -322,41 +627,52 @@ zhprs_start(PG_FUNCTION_ARGS)
 Datum
 zhprs_getlexeme(PG_FUNCTION_ARGS)
 {
-	ParserState *pst = (ParserState *) PG_GETARG_POINTER(0);
-	char	  **t = (char **) PG_GETARG_POINTER(1);
-	int		   *tlen = (int *) PG_GETARG_POINTER(2);
-	int			type = -1;
+	ParserState	   *pst = (ParserState *) PG_GETARG_POINTER(0);	/* parser state (as built by zhprs_start) */
+	char		  **t = (char **) PG_GETARG_POINTER(1);	/* out: start of the token */
+	int			   *tlen = (int *) PG_GETARG_POINTER(2);	/* out: token length */
+	int				type = -1;
 
-	if((pst -> head) == NULL ) /* already done the work,or no sentence */
+	if (pst == NULL || pst->head == NULL)	/* no state, or no result list pending */
 	{
+		*t = NULL;
 		*tlen = 0;
-		type = 0;
+		return Int32GetDatum(0);	/* type 0: nothing to emit */
 	}
-	/* have results */
-	else if(pst -> curr != NULL)
+
+	if (pst->curr != NULL)
 	{
-		scws_res_t  curr = pst -> curr;
+		scws_res_t curr = pst->curr;
+		unsigned char attr0 = (unsigned char) curr->attr[0];
 
 		/*
- 		* check the first char to determine the lextype
- 		* if out of [0,25],then set to 'x',mean unknown type 
- 		* so for Ag,Dg,Ng,Tg,Vg,the type will be unknown
- 		* for full attr explanation,visit http://www.xunsearch.com/scws/docs.php#attr
-		*/
-		type = (int)(curr -> attr)[0];
-		if(type > (int)'x' || type < (int)'a')
-		    type = (int)'x';
-		*tlen = curr -> len;
-		*t = pst -> buffer + curr -> off;
-
-		pst -> curr = curr -> next;
-
-		/* fetch the next sentence */
-		if(pst -> curr == NULL ){
-			scws_free_result(pst -> head);
-			(pst -> head) =	(pst -> curr) = scws_get_result(pst -> scws);
+		 * The first byte of the SCWS attribute is used as the lexeme type.
+		 * Any byte outside 'a'..'z' is reported as 'x' (unknown).
+		 *
+		 * NOTE: the previous code accepted only ['a','x'], so 'y' (modal)
+		 * and 'z' (status) were reported as 'x'. The range is now ['a','z'].
+		 */
+		if (attr0 < (unsigned char) 'a' || attr0 > (unsigned char) 'z')
+			type = (int) 'x';
+		else
+			type = (int) attr0;
+
+		*tlen = curr->len;
+		*t    = pst->buffer + curr->off;	/* points into the input buffer; nothing is copied */
+
+		pst->curr = curr->next;
+
+		if (pst->curr == NULL)	/* list exhausted: free it and fetch the next one */
+		{
+			scws_free_result(pst->head);
+			pst->head = pst->curr = scws_get_result(pst->scws);
 		}
 	}
+	else
+	{
+		*t = NULL;
+		*tlen = 0;
+		type = 0;
+	}
 
 	PG_RETURN_INT32(type);
 }
@@ -364,101 +680,56 @@ zhprs_getlexeme(PG_FUNCTION_ARGS)
 Datum
 zhprs_end(PG_FUNCTION_ARGS)
 {
+	ParserState *pst = (ParserState *) PG_GETARG_POINTER(0);
+
+	/*
+	 * Release the per-call resources eagerly instead of waiting for the
+	 * MemoryContext reset callback registered in zhprs_start: a caller
+	 * (e.g. a long-lived loop in to_tsvector_byid) may keep the context
+	 * alive across many parse cycles. Pointers are set to NULL after
+	 * freeing, presumably so the later callback finds nothing left to
+	 * release (confirm against parser_state_cleanup).
+	 */
+	if (pst != NULL)
+	{
+		if (pst->head != NULL)
+		{
+			scws_free_result(pst->head);
+			pst->head = NULL;
+			pst->curr = NULL;
+		}
+		if (pst->scws != NULL)
+		{
+			scws_free(pst->scws);
+			pst->scws = NULL;
+		}
+	}
 	PG_RETURN_VOID();
 }
 
 Datum
 zhprs_lextype(PG_FUNCTION_ARGS)
 {
-	LexDescr *descr = (LexDescr *) palloc(sizeof(LexDescr) * (26 + 1));
-	init_type(descr);
+	LexDescr   *descr;	/* LEX_TYPE_COUNT entries plus one terminator */
 
+	descr = (LexDescr *) palloc(sizeof(LexDescr) * (LEX_TYPE_COUNT + 1));
+	init_type(descr);	/* fills every entry, including the terminator */
 	PG_RETURN_POINTER(descr);
 }
 
-static void init_type(LexDescr descr[]){
-	/* 
-	* there are 26 types in this parser,alias from a to z
-	* for full attr explanation,visit http://www.xunsearch.com/scws/docs.php#attr
-	*/
-	descr[0].lexid = 97;
-	descr[0].alias = pstrdup("a");
-	descr[0].descr = pstrdup("adjective,形容词");
-	descr[1].lexid = 98;
-	descr[1].alias = pstrdup("b");
-	descr[1].descr = pstrdup("differentiation,区别词");
-	descr[2].lexid = 99;
-	descr[2].alias = pstrdup("c");
-	descr[2].descr = pstrdup("conjunction,连词");
-	descr[3].lexid = 100;
-	descr[3].alias = pstrdup("d");
-	descr[3].descr = pstrdup("adverb,副词");
-	descr[4].lexid = 101;
-	descr[4].alias = pstrdup("e");
-	descr[4].descr = pstrdup("exclamation,感叹词");
-	descr[5].lexid = 102;
-	descr[5].alias = pstrdup("f");
-	descr[5].descr = pstrdup("position,方位词");
-	descr[6].lexid = 103;
-	descr[6].alias = pstrdup("g");
-	descr[6].descr = pstrdup("root,词根");
-	descr[7].lexid = 104;
-	descr[7].alias = pstrdup("h");
-	descr[7].descr = pstrdup("head,前连接成分");
-	descr[8].lexid = 105;
-	descr[8].alias = pstrdup("i");
-	descr[8].descr = pstrdup("idiom,成语");
-	descr[9].lexid = 106;
-	descr[9].alias = pstrdup("j");
-	descr[9].descr = pstrdup("abbreviation,简称");
-	descr[10].lexid = 107;
-	descr[10].alias = pstrdup("k");
-	descr[10].descr = pstrdup("tail,后连接成分");
-	descr[11].lexid = 108;
-	descr[11].alias = pstrdup("l");
-	descr[11].descr = pstrdup("tmp,习用语");
-	descr[12].lexid = 109;
-	descr[12].alias = pstrdup("m");
-	descr[12].descr = pstrdup("numeral,数词");
-	descr[13].lexid = 110;
-	descr[13].alias = pstrdup("n");
-	descr[13].descr = pstrdup("noun,名词");
-	descr[14].lexid = 111;
-	descr[14].alias = pstrdup("o");
-	descr[14].descr = pstrdup("onomatopoeia,拟声词");
-	descr[15].lexid = 112;
-	descr[15].alias = pstrdup("p");
-	descr[15].descr = pstrdup("prepositional,介词");
-	descr[16].lexid = 113;
-	descr[16].alias = pstrdup("q");
-	descr[16].descr = pstrdup("quantity,量词");
-	descr[17].lexid = 114;
-	descr[17].alias = pstrdup("r");
-	descr[17].descr = pstrdup("pronoun,代词");
-	descr[18].lexid = 115;
-	descr[18].alias = pstrdup("s");
-	descr[18].descr = pstrdup("space,处所词");
-	descr[19].lexid = 116;
-	descr[19].alias = pstrdup("t");
-	descr[19].descr = pstrdup("time,时语素");
-	descr[20].lexid = 117;
-	descr[20].alias = pstrdup("u");
-	descr[20].descr = pstrdup("auxiliary,助词");
-	descr[21].lexid = 118;
-	descr[21].alias = pstrdup("v");
-	descr[21].descr = pstrdup("verb,动词");
-	descr[22].lexid = 119;
-	descr[22].alias = pstrdup("w");
-	descr[22].descr = pstrdup("punctuation,标点符号");
-	descr[23].lexid = 120;
-	descr[23].alias = pstrdup("x");
-	descr[23].descr = pstrdup("unknown,未知词");
-	descr[24].lexid = 121;
-	descr[24].alias = pstrdup("y");
-	descr[24].descr = pstrdup("modal,语气词");
-	descr[25].lexid = 122;
-	descr[25].alias = pstrdup("z");
-	descr[25].descr = pstrdup("status,状态词");
-	descr[26].lexid = 0;
+static void
+init_type(LexDescr descr[])	/* descr needs room for LEX_TYPE_COUNT + 1 entries */
+{
+	int			i;
+
+	for (i = 0; i < LEX_TYPE_COUNT; i++)	/* copy each lex_types entry */
+	{
+		descr[i].lexid = lex_types[i].lexid;
+		descr[i].alias = pstrdup(lex_types[i].alias);
+		descr[i].descr = pstrdup(lex_types[i].descr);
+	}
+	/* sentinel: the entry after the last type has lexid 0 and NULL strings */
+	descr[LEX_TYPE_COUNT].lexid = 0;
+	descr[LEX_TYPE_COUNT].alias = NULL;
+	descr[LEX_TYPE_COUNT].descr = NULL;
 }
-//TODO :headline function
diff --git a/zhparser.control b/zhparser.control
index 1e06790..70e78cc 100644
--- a/zhparser.control
+++ b/zhparser.control
@@ -1,4 +1,4 @@
 comment = 'a parser for full-text search of Chinese'
-default_version = '2.3'
+default_version = '2.4'
 module_pathname = '$libdir/zhparser'
 relocatable = true
diff --git a/zhparser.h b/zhparser.h
index ba9b0c9..46dd300 100644
--- a/zhparser.h
+++ b/zhparser.h
@@ -1,12 +1,32 @@
-#ifndef ZHPARSER_H 
+#ifndef ZHPARSER_H
 #define ZHPARSER_H
 
-#ifndef pstrdup
-#define pstrdup scws_pstrdup
+/*
+ * scws.h declares its own function named `pstrdup` (according to the
+ * original note, in SCWS releases prior to 1.2.3), which collides with
+ * PostgreSQL's pstrdup() function. While scws.h is being included, the
+ * SCWS symbol is therefore renamed to scws_pstrdup by a temporary macro.
+ *
+ * The preprocessor cannot copy one macro's definition into another, so a
+ * pstrdup macro that is already defined is saved and restored with
+ * #pragma push_macro / pop_macro (supported by GCC, Clang and MSVC). If
+ * pstrdup is not a macro, it is simply left undefined afterwards.
+ *
+ * Build systems linking against SCWS >= 1.2.3 can pass
+ * -DZHPARSER_SCWS_HAS_NO_PSTRDUP_CONFLICT to skip this entirely.
+ */
+
+#ifndef ZHPARSER_SCWS_HAS_NO_PSTRDUP_CONFLICT
+#  pragma push_macro("pstrdup")
+#  undef pstrdup
+#  define pstrdup scws_pstrdup
 #endif
 
 #include "scws.h"
 
-#undef pstrdup
-
+#ifndef ZHPARSER_SCWS_HAS_NO_PSTRDUP_CONFLICT
+#  undef pstrdup
+#  pragma pop_macro("pstrdup")
 #endif
+
+#endif /* ZHPARSER_H */