Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
@@ -1,3 +1,32 @@
2.4 (2026-06-06) -- security & robustness hardening
-- per-call SCWS instance via scws_fork(); removed global ParserState (fixes
token corruption with SRFs / nested parser calls within one backend)
-- _PG_init() now registers GUCs once at module load; init failures no longer
permanently break the backend
-- whitelist validation for zhparser.extra_dicts entries (only [A-Za-z0-9_.-],
.txt or .xdb); rejected by check_hook
-- refuse to write per-database custom dict files when current_database()
contains characters unsafe for filesystem paths
-- sync_zhprs_custom_word() now builds COPY via format() / %L and validates
the database name (mitigates dynamic-SQL pitfalls)
-- fix lexeme attr range: was ['a','x'] which silently dropped 'y' (modal)
and 'z' (status); now ['a','z']
-- safer pstrdup namespace handling in zhparser.h (no longer permanently
shadows the PG macro)
-- backup-custom-dict.sh: set -euo pipefail, dry-run, --yes confirmation,
safe globbing
-- Makefile: pkg-config detection for SCWS, -Wformat-security
-- new GitHub Actions matrix CI for PG 16/17/18
-- new regress/ container (Dockerfile + entrypoint + wrapper) and
sql/zhparser_hardening.sql; pg_regress green on PG 16/17/18

2.3 (2025-01-24)
-- add CI for linux and freebsd
-- do not create the custom txt file on a new install
-- add dockerfile for debian and alpine
-- fix linux CI, bump PG version to 16 for linux
-- fix client notice "NOTICE: zhparser add dict..."

2.2 (2021-11-08)
-- move custom word file from /base/${DATABASE_ID}/zhprs_dict_${DATABASE_NAME}.txt to /base/zhprs_dict_${DATABASE_NAME}.txt (the data directory does not have /base/${DATABASE_ID} when a tablespace is set)

Expand Down
13 changes: 5 additions & 8 deletions META.json
Original file line number Diff line number Diff line change
@@ -1,28 +1,25 @@
{
"name": "zhparser",
"abstract": "a parser for full-text search of Chinese",
"description": "Zhparser is a PostgreSQL extension for full-text search of Chinese.It implements a Chinese parser base on the Simple Chinese Word Segmentation(SCWS)",
"version": "0.2.0",
"description": "Zhparser is a PostgreSQL extension for full-text search of Chinese. It implements a Chinese parser based on Simple Chinese Word Segmentation (SCWS).",
"version": "2.4.0",
"maintainer": [
"Jov <amutu@amutu.com>"
],
"license": "postgresql",
"prereqs": {
"runtime": {
"requires": {
"PostgreSQL": "9.2.0"
},
"recommends": {
"PostgreSQL": "9.6.0"
"PostgreSQL": "16.0.0"
}
}
},
"provides": {
"zhparser": {
"abstract": "a parser for full-text search of Chinese",
"file": "zhparser--1.0.sql",
"file": "zhparser--2.4.sql",
"docfile": "README.md",
"version": "0.2.0"
"version": "2.4.0"
}
},
"resources": {
Expand Down
37 changes: 30 additions & 7 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,39 @@ OBJS = zhparser.o

EXTENSION = zhparser
DATA = zhparser--1.0.sql zhparser--unpackaged--1.0.sql \
zhparser--1.0--2.0.sql zhparser--2.0.sql \
zhparser--2.0--2.1.sql zhparser--2.1.sql zhparser--2.1--2.2.sql \
zhparser--2.2.sql zhparser--2.3.sql
zhparser--1.0--2.0.sql zhparser--2.0.sql \
zhparser--2.0--2.1.sql zhparser--2.1.sql \
zhparser--2.1--2.2.sql zhparser--2.2.sql \
zhparser--2.3.sql \
zhparser--2.3--2.4.sql zhparser--2.4.sql
DATA_TSEARCH = dict.utf8.xdb rules.utf8.ini

REGRESS = zhparser
REGRESS = zhparser zhparser_hardening

SCWS_HOME ?= /usr/local
PG_CPPFLAGS = -I$(SCWS_HOME)/include/scws
SHLIB_LINK = -lscws -L$(SCWS_HOME)/lib -Wl,-rpath -Wl,$(SCWS_HOME)/lib
# ----------------------------------------------------------------------------
# SCWS detection
#
# Order of precedence:
# 1. SCWS_HOME explicitly set (legacy behavior; kept for back-compat).
# 2. pkg-config --exists scws -> use pkg-config flags.
# 3. fall back to /usr/local.
# ----------------------------------------------------------------------------
ifeq ($(origin SCWS_HOME), undefined)
ifeq ($(shell pkg-config --exists scws && echo yes),yes)
SCWS_CFLAGS := $(shell pkg-config --cflags scws)
SCWS_LIBS := $(shell pkg-config --libs scws)
else
SCWS_HOME ?= /usr/local
SCWS_CFLAGS := -I$(SCWS_HOME)/include/scws
SCWS_LIBS := -lscws -L$(SCWS_HOME)/lib -Wl,-rpath -Wl,$(SCWS_HOME)/lib
endif
else
SCWS_CFLAGS := -I$(SCWS_HOME)/include/scws
SCWS_LIBS := -lscws -L$(SCWS_HOME)/lib -Wl,-rpath -Wl,$(SCWS_HOME)/lib
endif

PG_CPPFLAGS = $(SCWS_CFLAGS) -Wformat -Wformat-security
SHLIB_LINK = $(SCWS_LIBS)

PG_CONFIG ?= pg_config
PGXS := $(shell $(PG_CONFIG) --pgxs)
Expand Down
33 changes: 26 additions & 7 deletions check-alpine.sh
Original file line number Diff line number Diff line change
@@ -1,13 +1,32 @@
#!/usr/bin/env bash
set -euo pipefail

pid=$$
docker run --rm --name testpgzhparser-$pid -p 5432:5432 -d -e POSTGRES_PASSWORD=somepassword@alpine zhparser/zhparser:alpine-16
sleep 5
export PGPASSWORD=somepassword@alpine
psql -h 127.0.0.1 -X -a -q postgres postgres -f sql/zhparser.sql | diff expected/zhparser-alpine.out -
container="testpgzhparser-$pid"

cleanup() {
docker stop "$container" >/dev/null 2>&1 || true
}
trap cleanup EXIT

if [ $? -eq 0 ]
then
docker run --rm --name "$container" -p 5432:5432 -d \
-e POSTGRES_PASSWORD=somepassword@alpine \
zhparser/zhparser:alpine-16

# Wait for Postgres to accept connections instead of fixed sleep.
for _ in $(seq 1 30); do
if PGPASSWORD=somepassword@alpine psql -h 127.0.0.1 -U postgres \
-tAc 'select 1' postgres >/dev/null 2>&1; then
break
fi
sleep 1
done

export PGPASSWORD=somepassword@alpine
if psql -h 127.0.0.1 -X -a -q postgres postgres -f sql/zhparser.sql \
| diff expected/zhparser-alpine.out -; then
echo "pass!"
else
echo "do not pass!"
exit 1
fi
docker stop testpgzhparser-$pid
32 changes: 25 additions & 7 deletions check-debian.sh
Original file line number Diff line number Diff line change
@@ -1,13 +1,31 @@
#!/usr/bin/env bash
set -euo pipefail

pid=$$
docker run --rm --name testpgzhparser-$pid -p 5432:5432 -d -e POSTGRES_PASSWORD=somepassword@debian-16 zhparser/zhparser:bookworm-16
sleep 5
export PGPASSWORD=somepassword@debian-16
psql -h 127.0.0.1 -X -a -q postgres postgres -f sql/zhparser.sql | diff expected/zhparser-debian.out -
container="testpgzhparser-$pid"

cleanup() {
docker stop "$container" >/dev/null 2>&1 || true
}
trap cleanup EXIT

if [ $? -eq 0 ]
then
docker run --rm --name "$container" -p 5432:5432 -d \
-e POSTGRES_PASSWORD=somepassword@debian-16 \
zhparser/zhparser:bookworm-16

for _ in $(seq 1 30); do
if PGPASSWORD=somepassword@debian-16 psql -h 127.0.0.1 -U postgres \
-tAc 'select 1' postgres >/dev/null 2>&1; then
break
fi
sleep 1
done

export PGPASSWORD=somepassword@debian-16
if psql -h 127.0.0.1 -X -a -q postgres postgres -f sql/zhparser.sql \
| diff expected/zhparser-debian.out -; then
echo "pass!"
else
echo "do not pass!"
exit 1
fi
docker stop testpgzhparser-$pid
98 changes: 98 additions & 0 deletions expected/zhparser.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
CREATE EXTENSION zhparser;
-- make test configuration using parser
CREATE TEXT SEARCH CONFIGURATION testzhcfg (PARSER = zhparser);
ALTER TEXT SEARCH CONFIGURATION testzhcfg ADD MAPPING FOR n,v,a,i,e,l WITH simple;
-- ts_parse
SELECT * FROM ts_parse('zhparser', 'hello world! 2010年保障房建设在全国范围内获全面启动,从中央到地方纷纷加大 了 保 障 房 的 建 设 和 投 入 力 度 。2011年,保障房进入了更大规模的建设阶段。住房城乡建设部党组书记、部长姜伟新去年底在全国住房城乡建设工作会议上表示,要继续推进保障性安居工程建设。');
tokid | token
-------+----------
101 | hello
101 | world
117 | !
101 | 2010
113 | 年
118 | 保障
110 | 房建
118 | 设在
110 | 全国
110 | 范围
102 | 内
118 | 获
97 | 全面
118 | 启动
117 | ,
110 | 从中
118 | 央
118 | 到
110 | 地方
100 | 纷纷
118 | 加大
118 | 了
118 | 保
110 | 障
110 | 房
117 | 的
118 | 建
118 | 设
99 | 和
118 | 投
118 | 入
110 | 力
107 | 度
117 | 。
101 | 2011
113 | 年
117 | ,
118 | 保障
110 | 房
118 | 进入
118 | 了
100 | 更
110 | 大规模
117 | 的
118 | 建设
110 | 阶段
117 | 。
110 | 住房
110 | 城乡建设
110 | 部党组
110 | 书记
117 | 、
110 | 部长
110 | 姜
110 | 伟
97 | 新
116 | 去年底
112 | 在
110 | 全国
110 | 住房
110 | 城乡建设
118 | 工作
110 | 会议
110 | 上表
118 | 示
117 | ,
118 | 要
118 | 继续
118 | 推进
110 | 保障性
118 | 安居
110 | 工程建设
117 | 。
(73 rows)

SELECT to_tsvector('testzhcfg','“今年保障房新开工数量虽然有所下调,但实际的年度在建规模以及竣工规模会超以往年份,相对应的对资金的需求也会创历史纪录。”陈国强说。在他看来,与2011年相比,2012年的保障房建设在资金配套上的压力将更为严峻。');
to_tsvector
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
'2011':27 '2012':29 '上':35 '下调':7 '严峻':37 '会':14 '会创':20 '保障':1,30 '历史':21 '压力':36 '国强':24 '在建':10 '实际':8 '对应':17 '年份':16 '年度':9 '开工':4 '房':2 '房建':31 '数量':5 '新':3 '有所':6 '相比':28 '看来':26 '竣工':12 '纪录':22 '规模':11,13 '设在':32 '说':25 '资金':18,33 '超':15 '配套':34 '陈':23 '需求':19
(1 row)

SELECT to_tsquery('testzhcfg', '保障房资金压力');
to_tsquery
---------------------------------------
'保障' <-> '房' <-> '资金' <-> '压力'
(1 row)

-- clean extension
DROP EXTENSION zhparser CASCADE;
NOTICE: drop cascades to text search configuration testzhcfg
81 changes: 81 additions & 0 deletions expected/zhparser_hardening.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
-- ===========================================================================
-- zhparser hardening regression tests
--
-- Tests are independent of dictionary tokenization output; they verify
-- the structural / behavioural fixes shipped in 2.4.
--
-- NOTE on GUC tests: zhparser.{extra_dicts,dict_in_memory} are PGC_BACKEND,
-- which means PostgreSQL itself rejects SET inside a session ("cannot be
-- set after connection start"). The path-traversal validation at the C
-- level is exercised at startup time; pg_regress cannot easily test it
-- without restarting backends. We instead verify the GUCs are registered
-- with the correct context.
-- ===========================================================================
CREATE EXTENSION IF NOT EXISTS zhparser;
-- ----- 1. lex types: y (modal) and z (status) must be present -----
-- Regression for the [a,x] truncation bug.
SELECT count(*) AS lex_type_count FROM ts_token_type('zhparser');
lex_type_count
----------------
26
(1 row)

SELECT alias FROM ts_token_type('zhparser') WHERE alias IN ('y','z') ORDER BY alias;
alias
-------
y
z
(2 rows)

-- ----- 2. GUC registration: 8 zhparser.* GUCs exist with expected contexts -
SELECT name, context, vartype
FROM pg_settings
WHERE name LIKE 'zhparser.%'
ORDER BY name;
name | context | vartype
-----------------------------+---------+---------
zhparser.dict_in_memory | backend | bool
zhparser.extra_dicts | backend | string
zhparser.multi_duality | user | bool
zhparser.multi_short | user | bool
zhparser.multi_zall | user | bool
zhparser.multi_zmain | user | bool
zhparser.punctuation_ignore | user | bool
zhparser.seg_with_duality | user | bool
(8 rows)

-- ----- 3. Per-call state isolation -----
-- Two parser invocations side-by-side must not corrupt each other's
-- token streams. If the global-state bug from <2.4 were back, one of
-- these subqueries would observe the other's input.
WITH
a AS (SELECT string_agg(token, ',') AS s FROM ts_parse('zhparser', 'hello')),
b AS (SELECT string_agg(token, ',') AS s FROM ts_parse('zhparser', 'world'))
SELECT
(a.s LIKE '%hello%') AS a_has_hello,
(a.s LIKE '%world%') AS a_has_world,
(b.s LIKE '%hello%') AS b_has_hello,
(b.s LIKE '%world%') AS b_has_world
FROM a, b;
a_has_hello | a_has_world | b_has_hello | b_has_world
-------------+-------------+-------------+-------------
t | f | f | t
(1 row)

-- ----- 4. sync_zhprs_custom_word: regex guard must be active -----
SELECT
(pg_get_functiondef(p.oid) LIKE '%format(%') AS uses_format_func,
(pg_get_functiondef(p.oid) LIKE '%[A-Za-z0-9_]%') AS has_dbname_regex
FROM pg_proc p
JOIN pg_namespace n ON n.oid = p.pronamespace
WHERE n.nspname = 'public' AND p.proname = 'sync_zhprs_custom_word';
uses_format_func | has_dbname_regex
------------------+------------------
t | t
(1 row)

-- ----- 5. Session-scoped GUCs are still mutable -----
SET zhparser.punctuation_ignore = on;
SET zhparser.multi_short = on;
SET zhparser.multi_zall = on;
RESET ALL;
Loading
Loading