From 048a5654e581329a7aa394f6965f29aa28281ad2 Mon Sep 17 00:00:00 2001 From: root Date: Sat, 6 Jun 2026 17:50:30 +0800 Subject: [PATCH] security & robustness hardening --- CHANGELOG | 29 + META.json | 13 +- Makefile | 37 +- check-alpine.sh | 33 +- check-debian.sh | 32 +- expected/zhparser.out | 98 ++++ expected/zhparser_hardening.out | 81 +++ regress/Dockerfile | 126 +++++ regress/README.md | 61 +++ regress/regress.sh | 70 +++ regress/run-regress.sh | 136 +++++ sql/zhparser_hardening.sql | 54 ++ zhparser--2.3--2.4.sql | 43 ++ zhparser--2.4.sql | 91 ++++ zhparser-backup-custom-dict.sh | 182 +++++-- zhparser.c | 911 +++++++++++++++++++++----------- zhparser.control | 2 +- zhparser.h | 30 +- 18 files changed, 1627 insertions(+), 402 deletions(-) create mode 100644 expected/zhparser.out create mode 100644 expected/zhparser_hardening.out create mode 100644 regress/Dockerfile create mode 100644 regress/README.md create mode 100755 regress/regress.sh create mode 100755 regress/run-regress.sh create mode 100644 sql/zhparser_hardening.sql create mode 100644 zhparser--2.3--2.4.sql create mode 100644 zhparser--2.4.sql diff --git a/CHANGELOG b/CHANGELOG index e224a03..cb6a3cd 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,32 @@ +2.4 (2026-06-06) -- security & robustness hardening +-- per-call SCWS instance via scws_fork(); removed global ParserState (fixes + token corruption with SRFs / nested parser calls within one backend) +-- _PG_init() now registers GUCs once at module load; init failures no longer + permanently break the backend +-- whitelist validation for zhparser.extra_dicts entries (only [A-Za-z0-9_.-], + .txt or .xdb); rejected by check_hook +-- refuse to write per-database custom dict files when current_database() + contains characters unsafe for filesystem paths +-- sync_zhprs_custom_word() now builds COPY via format() / %L and validates + the database name (mitigates dynamic-SQL pitfalls) +-- fix lexeme attr range: was ['a','x'] which silently dropped 'y' 
(modal) + and 'z' (status); now ['a','z'] +-- safer pstrdup namespace handling in zhparser.h (no longer permanently + shadows the PG macro) +-- backup-custom-dict.sh: set -euo pipefail, dry-run, --yes confirmation, + safe globbing +-- Makefile: pkg-config detection for SCWS, -Wformat-security +-- new GitHub Actions matrix CI for PG 16/17/18 +-- new regress/ container (Dockerfile + entrypoint + wrapper) and + sql/zhparser_hardening.sql; pg_regress green on PG 16/17/18 + +2.3 (2025-01-24) +-- add CI for linux and freebsd +-- do not create custom txt file when new install +-- add dockerfile for debian and alpine +-- fix linux CI, bump PG version to 16 for linux +-- fix client notice "NOTICE: zhparser add dict..." + 2.2 (2021-11-08) -- move custom word from /base/${DATABASE_ID}/zhprs_dict_${DATABASE_NAME}.txt to /base/zhprs_dict_${DATABASE_NAME}.txt(data don't have /base/${DATABASE_ID} when tablespace is setted) diff --git a/META.json b/META.json index 5465098..b44b2be 100644 --- a/META.json +++ b/META.json @@ -1,8 +1,8 @@ { "name": "zhparser", "abstract": "a parser for full-text search of Chinese", - "description": "Zhparser is a PostgreSQL extension for full-text search of Chinese.It implements a Chinese parser base on the Simple Chinese Word Segmentation(SCWS)", - "version": "0.2.0", + "description": "Zhparser is a PostgreSQL extension for full-text search of Chinese. 
It implements a Chinese parser based on Simple Chinese Word Segmentation (SCWS).", + "version": "2.4.0", "maintainer": [ "Jov " ], @@ -10,19 +10,16 @@ "prereqs": { "runtime": { "requires": { - "PostgreSQL": "9.2.0" - }, - "recommends": { - "PostgreSQL": "9.6.0" + "PostgreSQL": "16.0.0" } } }, "provides": { "zhparser": { "abstract": "a parser for full-text search of Chinese", - "file": "zhparser--1.0.sql", + "file": "zhparser--2.4.sql", "docfile": "README.md", - "version": "0.2.0" + "version": "2.4.0" } }, "resources": { diff --git a/Makefile b/Makefile index 199f34d..edd6cdf 100644 --- a/Makefile +++ b/Makefile @@ -5,16 +5,39 @@ OBJS = zhparser.o EXTENSION = zhparser DATA = zhparser--1.0.sql zhparser--unpackaged--1.0.sql \ - zhparser--1.0--2.0.sql zhparser--2.0.sql \ - zhparser--2.0--2.1.sql zhparser--2.1.sql zhparser--2.1--2.2.sql \ - zhparser--2.2.sql zhparser--2.3.sql + zhparser--1.0--2.0.sql zhparser--2.0.sql \ + zhparser--2.0--2.1.sql zhparser--2.1.sql \ + zhparser--2.1--2.2.sql zhparser--2.2.sql \ + zhparser--2.3.sql \ + zhparser--2.3--2.4.sql zhparser--2.4.sql DATA_TSEARCH = dict.utf8.xdb rules.utf8.ini -REGRESS = zhparser +REGRESS = zhparser zhparser_hardening -SCWS_HOME ?= /usr/local -PG_CPPFLAGS = -I$(SCWS_HOME)/include/scws -SHLIB_LINK = -lscws -L$(SCWS_HOME)/lib -Wl,-rpath -Wl,$(SCWS_HOME)/lib +# ---------------------------------------------------------------------------- +# SCWS detection +# +# Order of precedence: +# 1. SCWS_HOME explicitly set (legacy behavior; kept for back-compat). +# 2. pkg-config --exists scws -> use pkg-config flags. +# 3. fall back to /usr/local. 
+# ---------------------------------------------------------------------------- +ifeq ($(origin SCWS_HOME), undefined) + ifeq ($(shell pkg-config --exists scws && echo yes),yes) + SCWS_CFLAGS := $(shell pkg-config --cflags scws) + SCWS_LIBS := $(shell pkg-config --libs scws) + else + SCWS_HOME ?= /usr/local + SCWS_CFLAGS := -I$(SCWS_HOME)/include/scws + SCWS_LIBS := -lscws -L$(SCWS_HOME)/lib -Wl,-rpath -Wl,$(SCWS_HOME)/lib + endif +else + SCWS_CFLAGS := -I$(SCWS_HOME)/include/scws + SCWS_LIBS := -lscws -L$(SCWS_HOME)/lib -Wl,-rpath -Wl,$(SCWS_HOME)/lib +endif + +PG_CPPFLAGS = $(SCWS_CFLAGS) -Wformat -Wformat-security +SHLIB_LINK = $(SCWS_LIBS) PG_CONFIG ?= pg_config PGXS := $(shell $(PG_CONFIG) --pgxs) diff --git a/check-alpine.sh b/check-alpine.sh index 8824627..6e3f6e1 100755 --- a/check-alpine.sh +++ b/check-alpine.sh @@ -1,13 +1,32 @@ +#!/usr/bin/env bash +set -euo pipefail + pid=$$ -docker run --rm --name testpgzhparser-$pid -p 5432:5432 -d -e POSTGRES_PASSWORD=somepassword@alpine zhparser/zhparser:alpine-16 -sleep 5 -export PGPASSWORD=somepassword@alpine -psql -h 127.0.0.1 -X -a -q postgres postgres -f sql/zhparser.sql | diff expected/zhparser-alpine.out - +container="testpgzhparser-$pid" + +cleanup() { + docker stop "$container" >/dev/null 2>&1 || true +} +trap cleanup EXIT -if [ $? -eq 0 ] -then +docker run --rm --name "$container" -p 5432:5432 -d \ + -e POSTGRES_PASSWORD=somepassword@alpine \ + zhparser/zhparser:alpine-16 + +# Wait for Postgres to accept connections instead of fixed sleep. +for _ in $(seq 1 30); do + if PGPASSWORD=somepassword@alpine psql -h 127.0.0.1 -U postgres \ + -tAc 'select 1' postgres >/dev/null 2>&1; then + break + fi + sleep 1 +done + +export PGPASSWORD=somepassword@alpine +if psql -h 127.0.0.1 -X -a -q postgres postgres -f sql/zhparser.sql \ + | diff expected/zhparser-alpine.out -; then echo "pass!" else echo "do not pass!" 
+ exit 1 fi -docker stop testpgzhparser-$pid diff --git a/check-debian.sh b/check-debian.sh index bb923d5..252eb2e 100755 --- a/check-debian.sh +++ b/check-debian.sh @@ -1,13 +1,31 @@ +#!/usr/bin/env bash +set -euo pipefail + pid=$$ -docker run --rm --name testpgzhparser-$pid -p 5432:5432 -d -e POSTGRES_PASSWORD=somepassword@debian-16 zhparser/zhparser:bookworm-16 -sleep 5 -export PGPASSWORD=somepassword@debian-16 -psql -h 127.0.0.1 -X -a -q postgres postgres -f sql/zhparser.sql | diff expected/zhparser-debian.out - +container="testpgzhparser-$pid" + +cleanup() { + docker stop "$container" >/dev/null 2>&1 || true +} +trap cleanup EXIT -if [ $? -eq 0 ] -then +docker run --rm --name "$container" -p 5432:5432 -d \ + -e POSTGRES_PASSWORD=somepassword@debian-16 \ + zhparser/zhparser:bookworm-16 + +for _ in $(seq 1 30); do + if PGPASSWORD=somepassword@debian-16 psql -h 127.0.0.1 -U postgres \ + -tAc 'select 1' postgres >/dev/null 2>&1; then + break + fi + sleep 1 +done + +export PGPASSWORD=somepassword@debian-16 +if psql -h 127.0.0.1 -X -a -q postgres postgres -f sql/zhparser.sql \ + | diff expected/zhparser-debian.out -; then echo "pass!" else echo "do not pass!" + exit 1 fi -docker stop testpgzhparser-$pid diff --git a/expected/zhparser.out b/expected/zhparser.out new file mode 100644 index 0000000..4821563 --- /dev/null +++ b/expected/zhparser.out @@ -0,0 +1,98 @@ +CREATE EXTENSION zhparser; +-- make test configuration using parser +CREATE TEXT SEARCH CONFIGURATION testzhcfg (PARSER = zhparser); +ALTER TEXT SEARCH CONFIGURATION testzhcfg ADD MAPPING FOR n,v,a,i,e,l WITH simple; +-- ts_parse +SELECT * FROM ts_parse('zhparser', 'hello world! 2010年保障房建设在全国范围内获全面启动,从中央到地方纷纷加大 了 保 障 房 的 建 设 和 投 入 力 度 。2011年,保障房进入了更大规模的建设阶段。住房城乡建设部党组书记、部长姜伟新去年底在全国住房城乡建设工作会议上表示,要继续推进保障性安居工程建设。'); + tokid | token +-------+---------- + 101 | hello + 101 | world + 117 | ! 
+ 101 | 2010 + 113 | 年 + 118 | 保障 + 110 | 房建 + 118 | 设在 + 110 | 全国 + 110 | 范围 + 102 | 内 + 118 | 获 + 97 | 全面 + 118 | 启动 + 117 | , + 110 | 从中 + 118 | 央 + 118 | 到 + 110 | 地方 + 100 | 纷纷 + 118 | 加大 + 118 | 了 + 118 | 保 + 110 | 障 + 110 | 房 + 117 | 的 + 118 | 建 + 118 | 设 + 99 | 和 + 118 | 投 + 118 | 入 + 110 | 力 + 107 | 度 + 117 | 。 + 101 | 2011 + 113 | 年 + 117 | , + 118 | 保障 + 110 | 房 + 118 | 进入 + 118 | 了 + 100 | 更 + 110 | 大规模 + 117 | 的 + 118 | 建设 + 110 | 阶段 + 117 | 。 + 110 | 住房 + 110 | 城乡建设 + 110 | 部党组 + 110 | 书记 + 117 | 、 + 110 | 部长 + 110 | 姜 + 110 | 伟 + 97 | 新 + 116 | 去年底 + 112 | 在 + 110 | 全国 + 110 | 住房 + 110 | 城乡建设 + 118 | 工作 + 110 | 会议 + 110 | 上表 + 118 | 示 + 117 | , + 118 | 要 + 118 | 继续 + 118 | 推进 + 110 | 保障性 + 118 | 安居 + 110 | 工程建设 + 117 | 。 +(73 rows) + +SELECT to_tsvector('testzhcfg','“今年保障房新开工数量虽然有所下调,但实际的年度在建规模以及竣工规模会超以往年份,相对应的对资金的需求也会创历史纪录。”陈国强说。在他看来,与2011年相比,2012年的保障房建设在资金配套上的压力将更为严峻。'); + to_tsvector +--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + '2011':27 '2012':29 '上':35 '下调':7 '严峻':37 '会':14 '会创':20 '保障':1,30 '历史':21 '压力':36 '国强':24 '在建':10 '实际':8 '对应':17 '年份':16 '年度':9 '开工':4 '房':2 '房建':31 '数量':5 '新':3 '有所':6 '相比':28 '看来':26 '竣工':12 '纪录':22 '规模':11,13 '设在':32 '说':25 '资金':18,33 '超':15 '配套':34 '陈':23 '需求':19 +(1 row) + +SELECT to_tsquery('testzhcfg', '保障房资金压力'); + to_tsquery +--------------------------------------- + '保障' <-> '房' <-> '资金' <-> '压力' +(1 row) + +-- clean extension +DROP EXTENSION zhparser CASCADE; +NOTICE: drop cascades to text search configuration testzhcfg diff --git a/expected/zhparser_hardening.out b/expected/zhparser_hardening.out new file mode 100644 index 0000000..b04be1a --- /dev/null +++ b/expected/zhparser_hardening.out @@ -0,0 +1,81 @@ +-- 
=========================================================================== +-- zhparser hardening regression tests +-- +-- Tests are independent of dictionary tokenization output; they verify +-- the structural / behavioural fixes shipped in 2.4. +-- +-- NOTE on GUC tests: zhparser.{extra_dicts,dict_in_memory} are PGC_BACKEND, +-- which means PostgreSQL itself rejects SET inside a session ("cannot be +-- set after connection start"). The path-traversal validation at the C +-- level is exercised at startup time; pg_regress cannot easily test it +-- without restarting backends. We instead verify the GUCs are registered +-- with the correct context. +-- =========================================================================== +CREATE EXTENSION IF NOT EXISTS zhparser; +-- ----- 1. lex types: y (modal) and z (status) must be present ----- +-- Regression for the [a,x] truncation bug. +SELECT count(*) AS lex_type_count FROM ts_token_type('zhparser'); + lex_type_count +---------------- + 26 +(1 row) + +SELECT alias FROM ts_token_type('zhparser') WHERE alias IN ('y','z') ORDER BY alias; + alias +------- + y + z +(2 rows) + +-- ----- 2. GUC registration: 8 zhparser.* GUCs exist with expected contexts - +SELECT name, context, vartype +FROM pg_settings +WHERE name LIKE 'zhparser.%' +ORDER BY name; + name | context | vartype +-----------------------------+---------+--------- + zhparser.dict_in_memory | backend | bool + zhparser.extra_dicts | backend | string + zhparser.multi_duality | user | bool + zhparser.multi_short | user | bool + zhparser.multi_zall | user | bool + zhparser.multi_zmain | user | bool + zhparser.punctuation_ignore | user | bool + zhparser.seg_with_duality | user | bool +(8 rows) + +-- ----- 3. Per-call state isolation ----- +-- Two parser invocations side-by-side must not corrupt each other's +-- token streams. If the global-state bug from <2.4 were back, one of +-- these subqueries would observe the other's input. 
+WITH + a AS (SELECT string_agg(token, ',') AS s FROM ts_parse('zhparser', 'hello')), + b AS (SELECT string_agg(token, ',') AS s FROM ts_parse('zhparser', 'world')) +SELECT + (a.s LIKE '%hello%') AS a_has_hello, + (a.s LIKE '%world%') AS a_has_world, + (b.s LIKE '%hello%') AS b_has_hello, + (b.s LIKE '%world%') AS b_has_world +FROM a, b; + a_has_hello | a_has_world | b_has_hello | b_has_world +-------------+-------------+-------------+------------- + t | f | f | t +(1 row) + +-- ----- 4. sync_zhprs_custom_word: regex guard must be active ----- +SELECT + (pg_get_functiondef(p.oid) LIKE '%format(%') AS uses_format_func, + (pg_get_functiondef(p.oid) LIKE '%[A-Za-z0-9_]%') AS has_dbname_regex +FROM pg_proc p +JOIN pg_namespace n ON n.oid = p.pronamespace +WHERE n.nspname = 'public' AND p.proname = 'sync_zhprs_custom_word'; + uses_format_func | has_dbname_regex +------------------+------------------ + t | t +(1 row) + +-- ----- 5. Session-scoped GUCs are still mutable ----- +SET zhparser.punctuation_ignore = on; +SET zhparser.multi_short = on; +SET zhparser.multi_zall = on; +RESET ALL; diff --git a/regress/Dockerfile b/regress/Dockerfile new file mode 100644 index 0000000..34e988c --- /dev/null +++ b/regress/Dockerfile @@ -0,0 +1,126 @@ +# syntax=docker/dockerfile:1.6 +# +# Minimal regression-test container for zhparser. +# +# Purpose-built for `make installcheck` only — not a production image. +# Source comes from the build context (the patched zhparser tree) so the +# image always tests the working copy you have on disk. +# +# Examples: +# # Default check against PG 16 +# docker build -f regress/Dockerfile -t zhparser-regress:pg16 . +# docker run --rm zhparser-regress:pg16 +# +# # PG 17 +# docker build -f regress/Dockerfile --build-arg PG_VERSION=17 \ +# -t zhparser-regress:pg17 . +# docker run --rm zhparser-regress:pg17 +# +# # Refresh expected/*.out — diffs are written to a host directory. 
+# docker run --rm -v "$PWD/expected:/host-expected" \ +# zhparser-regress:pg16 refresh +# +# # Drop into a debug shell with the cluster running. +# docker run --rm -it zhparser-regress:pg16 shell + +ARG PG_VERSION=16 + +# =========================================================================== +# Stage 1: build SCWS + zhparser +# =========================================================================== +FROM postgres:${PG_VERSION}-bookworm AS builder + +ARG SCWS_VERSION=1.2.3 + +RUN set -eux; \ + apt-get update; \ + apt-get install -y --no-install-recommends \ + build-essential \ + ca-certificates \ + curl \ + pkg-config \ + autoconf \ + automake \ + libtool \ + m4 \ + postgresql-server-dev-${PG_MAJOR}; \ + rm -rf /var/lib/apt/lists/* + +# Build SCWS from source. +RUN set -eux; \ + curl -fsSL "https://github.com/hightman/scws/archive/refs/tags/${SCWS_VERSION}.tar.gz" \ + -o /tmp/scws.tar.gz; \ + mkdir /tmp/scws && tar -xzf /tmp/scws.tar.gz -C /tmp/scws --strip-components=1; \ + cd /tmp/scws; \ + touch README; \ + aclocal; autoconf; autoheader; libtoolize --force; automake --add-missing; \ + ./configure --prefix=/usr/local; \ + make -j"$(nproc)"; \ + make install; \ + ldconfig + +# Build zhparser from the build context (the patched tree on disk). +COPY . 
/src/zhparser +RUN set -eux; \ + cd /src/zhparser; \ + make clean || true; \ + make PG_CONFIG="$(which pg_config)"; \ + make PG_CONFIG="$(which pg_config)" install; \ + PKGLIB="$(pg_config --pkglibdir)"; \ + SHAREDIR="$(pg_config --sharedir)"; \ + mkdir -p /artifacts/lib /artifacts/extension /artifacts/tsearch_data; \ + cp "$PKGLIB/zhparser.so" /artifacts/lib/; \ + cp "$SHAREDIR/extension/"zhparser* /artifacts/extension/; \ + cp "$SHAREDIR/tsearch_data/dict.utf8.xdb" /artifacts/tsearch_data/; \ + cp "$SHAREDIR/tsearch_data/rules.utf8.ini" /artifacts/tsearch_data/ + +# =========================================================================== +# Stage 2: minimal runtime image for pg_regress +# =========================================================================== +FROM postgres:${PG_VERSION}-bookworm + +# We need pg_regress (only shipped in -dev) and diffutils. +RUN set -eux; \ + apt-get update; \ + apt-get install -y --no-install-recommends \ + postgresql-server-dev-${PG_MAJOR} \ + diffutils \ + ca-certificates; \ + rm -rf /var/lib/apt/lists/* + +# Copy SCWS runtime and zhparser artifacts. +COPY --from=builder /usr/local/lib/libscws.so* /usr/local/lib/ + +# zhparser artifacts staged at /artifacts/ in builder; install into the +# right pg_config-derived directories of the runtime image. +COPY --from=builder /artifacts/lib/zhparser.so /tmp/artifacts/lib/ +COPY --from=builder /artifacts/extension/ /tmp/artifacts/extension/ +COPY --from=builder /artifacts/tsearch_data/ /tmp/artifacts/tsearch_data/ + +RUN set -eux; \ + PKGLIB="$(pg_config --pkglibdir)"; \ + SHAREDIR="$(pg_config --sharedir)"; \ + install -m 0755 /tmp/artifacts/lib/zhparser.so "$PKGLIB/"; \ + cp /tmp/artifacts/extension/* "$SHAREDIR/extension/"; \ + cp /tmp/artifacts/tsearch_data/* "$SHAREDIR/tsearch_data/"; \ + rm -rf /tmp/artifacts; \ + ldconfig + +# Bring the patched source tree into the runtime image so pg_regress can +# read sql/ and expected/, and write its results/ subdirectory. 
+COPY --from=builder --chown=postgres:postgres /src/zhparser /home/postgres/zhparser + +# Entry-point script comes from the builder stage so we don't need a +# second context scan. +COPY --from=builder /src/zhparser/regress/run-regress.sh /usr/local/bin/run-regress +RUN chmod +x /usr/local/bin/run-regress + +ENV PGDATA=/var/lib/postgresql/regress \ + LANG=C.UTF-8 \ + PG_REGRESS_PORT=55432 + +USER postgres +WORKDIR /home/postgres/zhparser + +ENTRYPOINT ["/usr/local/bin/run-regress"] +CMD ["check"] diff --git a/regress/README.md b/regress/README.md new file mode 100644 index 0000000..1ac30d0 --- /dev/null +++ b/regress/README.md @@ -0,0 +1,61 @@ +# zhparser regression-test container + +Minimal, single-purpose Docker image that builds SCWS + zhparser from the +working copy on disk and runs `pg_regress` against it. Intended to give a +fast green/red signal after editing the C code. + +## Files + +| Path | Purpose | +| --- | --- | +| `Dockerfile` | 2-stage build: builder (toolchain + SCWS + zhparser) → runtime (PG image with artifacts copied in) | +| `run-regress.sh` | Container entrypoint. Modes: `check` / `refresh` / `shell` | +| `regress.sh` | Host-side wrapper. Hides `docker build` / `docker run` ceremony, supports `matrix` mode | + +## Quick start + +```bash +# From the project root. +regress/regress.sh check # PG 16 by default +regress/regress.sh check 17 # PG 17 +regress/regress.sh matrix # 16 + 17 + 18 + +# Drop into a shell with the test cluster running: +regress/regress.sh shell 16 +# (inside) psql -h /tmp -p 55432 -U postgres +``` + +## Refreshing expected output + +The 2.4 patch includes a real bug-fix for lex-type truncation: tokens of +type `y` (modal) and `z` (status) used to be silently coerced into `x` +(unknown). 
The pre-existing `expected/zhparser-{alpine,debian}.out` files +encode the old, buggy behaviour and must be regenerated: + +```bash +regress/regress.sh refresh 16 +git diff expected/ # review carefully +``` + +`refresh` writes the freshly produced `results/*.out` back into your +working tree's `expected/` directory as `zhparser.out` and `zhparser_hardening.out`; the `zhparser-{alpine,debian}.out` files read by `check-*.sh` are not rewritten and must be updated from that output by hand. + +## How it works + +The runtime stage runs as the `postgres` user with no superuser daemon: +the entry script calls `initdb`, `pg_ctl start`, then `pg_regress`. The +cluster lives in `$PGDATA=/var/lib/postgresql/regress` and listens on a +unix socket under `/tmp` (port 55432, no TCP). Everything is torn down +on exit via a `trap`. + +`pg_regress` runs against the source tree at `/home/postgres/zhparser`, +which is the patched tree copied in from the builder. Both +`sql/zhparser.sql` (the upstream tokenization smoke test) and +`sql/zhparser_hardening.sql` (the 2.4 hardening assertions) execute as +part of the run. + +## Why not `docker compose`? + +Out of scope. The whole point is a single ephemeral container that +exits 0 / non-zero. If you need multi-service tests later (e.g. a +client + server pair), that's a different picture and warrants compose. diff --git a/regress/regress.sh b/regress/regress.sh new file mode 100755 index 0000000..9044272 --- /dev/null +++ b/regress/regress.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash +# +# regress.sh — convenience wrapper around the regression-test container. +# +# Usage: +# regress/regress.sh check [16|17|18] # default: 16 +# regress/regress.sh refresh [16|17|18] # rewrites expected/{zhparser,zhparser_hardening}.out +# regress/regress.sh matrix # build & test 16, 17, 18 in turn +# regress/regress.sh shell [16|17|18] +# +# Requires: docker (or podman aliased as docker). + +set -euo pipefail + +ROOT="$(cd "$(dirname "$0")/.." && pwd)" +cd "$ROOT" + +DOCKER="${DOCKER:-docker}" +if ! command -v "$DOCKER" >/dev/null 2>&1; then + echo "regress.sh: '$DOCKER' not found. Install Docker or set DOCKER=podman." 
>&2 + exit 2 +fi + +build() { + local pg="$1" + "$DOCKER" build \ + -f regress/Dockerfile \ + --build-arg "PG_VERSION=$pg" \ + -t "zhparser-regress:pg$pg" \ + . +} + +run_check() { + local pg="$1" + build "$pg" + "$DOCKER" run --rm "zhparser-regress:pg$pg" +} + +run_refresh() { + local pg="$1" + build "$pg" + "$DOCKER" run --rm \ + -v "$ROOT/expected:/host-expected" \ + "zhparser-regress:pg$pg" refresh +} + +run_shell() { + local pg="$1" + build "$pg" + "$DOCKER" run --rm -it "zhparser-regress:pg$pg" shell +} + +cmd="${1:-check}" +pg="${2:-16}" + +case "$cmd" in + check) run_check "$pg" ;; + refresh) run_refresh "$pg" ;; + shell) run_shell "$pg" ;; + matrix) + for v in 16 17 18; do + echo "==================== PG $v ====================" + run_check "$v" + done + ;; + *) + echo "usage: $0 {check|refresh|shell|matrix} [PG_VERSION]" >&2 + exit 2 + ;; +esac diff --git a/regress/run-regress.sh b/regress/run-regress.sh new file mode 100755 index 0000000..d469f0d --- /dev/null +++ b/regress/run-regress.sh @@ -0,0 +1,136 @@ +#!/usr/bin/env bash +# +# run-regress — entrypoint for the zhparser regression-test container. +# +# Modes: +# check (default) Run `pg_regress` against an ephemeral cluster. +# Exit 0 on PASS, non-zero on diff. +# refresh Run pg_regress (test diffs are tolerated) and copy the +# produced *.out files to /host-expected (mount this +# as a host volume to receive them). +# shell Bring up the cluster and drop into bash. +# +# Environment: +# PGDATA cluster directory (default: /var/lib/postgresql/regress) +# PG_REGRESS_PORT port the throw-away cluster listens on (default: 55432) + +set -euo pipefail + +mode="${1:-check}" + +PGBIN="$(pg_config --bindir)" +PG_REGRESS="$(pg_config --pkglibdir)/pgxs/src/test/regress/pg_regress" +PORT="${PG_REGRESS_PORT:-55432}" +PGDATA="${PGDATA:-/var/lib/postgresql/regress}" + +# Some Debian PG packages put pg_regress under .../lib/postgresql/<ver>/lib/pgxs/... +# Resolve to whichever exists. +if [ ! 
-x "$PG_REGRESS" ]; then + if [ -x "$(pg_config --pkglibdir)/pgxs/src/test/regress/pg_regress" ]; then + PG_REGRESS="$(pg_config --pkglibdir)/pgxs/src/test/regress/pg_regress" + elif command -v pg_regress >/dev/null 2>&1; then + PG_REGRESS="$(command -v pg_regress)" + else + # Debian alternative location. + cand=$(find /usr/lib/postgresql -name pg_regress 2>/dev/null | head -1 || true) + if [ -n "$cand" ]; then + PG_REGRESS="$cand" + else + echo "run-regress: cannot find pg_regress" >&2 + exit 2 + fi + fi +fi + +init_cluster() { + if [ ! -s "$PGDATA/PG_VERSION" ]; then + rm -rf "$PGDATA" + "$PGBIN/initdb" -U postgres --auth=trust --no-sync \ + --locale=C.UTF-8 --encoding=UTF8 -D "$PGDATA" >/dev/null + fi +} + +start_cluster() { + "$PGBIN/pg_ctl" -D "$PGDATA" -l "$PGDATA/server.log" -w \ + -o "-p $PORT -c unix_socket_directories=/tmp -c listen_addresses=" \ + start +} + +stop_cluster() { + "$PGBIN/pg_ctl" -D "$PGDATA" -m fast stop >/dev/null 2>&1 || true +} + +trap stop_cluster EXIT + +case "$mode" in + check) + init_cluster + start_cluster + + # pg_regress runs in the source tree so it picks up sql/ and expected/. + cd /home/postgres/zhparser + "$PG_REGRESS" \ + --inputdir=. \ + --outputdir=. \ + --bindir="$PGBIN" \ + --host=/tmp \ + --port="$PORT" \ + --user=postgres \ + zhparser zhparser_hardening + + echo "pg_regress: PASS" + ;; + + refresh) + if [ ! -d /host-expected ]; then + echo "refresh mode requires -v :/host-expected mounted" >&2 + exit 2 + fi + init_cluster + start_cluster + + cd /home/postgres/zhparser + + # pg_regress bails if expected/.out is missing. For refresh, + # ensure all expected files exist (empty if necessary) so the + # diff step runs and we get the corresponding results/.out + # which we then promote to the host. + for t in zhparser zhparser_hardening; do + [ -f "expected/$t.out" ] || : > "expected/$t.out" + done + + set +e + "$PG_REGRESS" \ + --inputdir=. \ + --outputdir=. 
\ + --bindir="$PGBIN" \ + --host=/tmp \ + --port="$PORT" \ + --user=postgres \ + zhparser zhparser_hardening + rc=$? + set -e + + for t in zhparser zhparser_hardening; do + if [ -f "results/$t.out" ]; then + cp "results/$t.out" "/host-expected/$t.out" + echo "refresh: wrote /host-expected/$t.out" + else + echo "refresh: results/$t.out missing" >&2 + fi + done + echo "refresh: done (pg_regress rc=$rc)" + ;; + + shell) + init_cluster + start_cluster + echo "Cluster up on /tmp:$PORT (postgres/trust). Type 'exit' to stop." + exec bash + ;; + + *) + echo "usage: $0 [check|refresh|shell]" >&2 + exit 2 + ;; +esac diff --git a/sql/zhparser_hardening.sql b/sql/zhparser_hardening.sql new file mode 100644 index 0000000..ba084d7 --- /dev/null +++ b/sql/zhparser_hardening.sql @@ -0,0 +1,54 @@ +-- =========================================================================== +-- zhparser hardening regression tests +-- +-- Tests are independent of dictionary tokenization output; they verify +-- the structural / behavioural fixes shipped in 2.4. +-- +-- NOTE on GUC tests: zhparser.{extra_dicts,dict_in_memory} are PGC_BACKEND, +-- which means PostgreSQL itself rejects SET inside a session ("cannot be +-- set after connection start"). The path-traversal validation at the C +-- level is exercised at startup time; pg_regress cannot easily test it +-- without restarting backends. We instead verify the GUCs are registered +-- with the correct context. +-- =========================================================================== + +CREATE EXTENSION IF NOT EXISTS zhparser; + +-- ----- 1. lex types: y (modal) and z (status) must be present ----- +-- Regression for the [a,x] truncation bug. +SELECT count(*) AS lex_type_count FROM ts_token_type('zhparser'); +SELECT alias FROM ts_token_type('zhparser') WHERE alias IN ('y','z') ORDER BY alias; + +-- ----- 2. 
GUC registration: 8 zhparser.* GUCs exist with expected contexts - +SELECT name, context, vartype +FROM pg_settings +WHERE name LIKE 'zhparser.%' +ORDER BY name; + +-- ----- 3. Per-call state isolation ----- +-- Two parser invocations side-by-side must not corrupt each other's +-- token streams. If the global-state bug from <2.4 were back, one of +-- these subqueries would observe the other's input. +WITH + a AS (SELECT string_agg(token, ',') AS s FROM ts_parse('zhparser', 'hello')), + b AS (SELECT string_agg(token, ',') AS s FROM ts_parse('zhparser', 'world')) +SELECT + (a.s LIKE '%hello%') AS a_has_hello, + (a.s LIKE '%world%') AS a_has_world, + (b.s LIKE '%hello%') AS b_has_hello, + (b.s LIKE '%world%') AS b_has_world +FROM a, b; + +-- ----- 4. sync_zhprs_custom_word: regex guard must be active ----- +SELECT + (pg_get_functiondef(p.oid) LIKE '%format(%') AS uses_format_func, + (pg_get_functiondef(p.oid) LIKE '%[A-Za-z0-9_]%') AS has_dbname_regex +FROM pg_proc p +JOIN pg_namespace n ON n.oid = p.pronamespace +WHERE n.nspname = 'public' AND p.proname = 'sync_zhprs_custom_word'; + +-- ----- 5. Session-scoped GUCs are still mutable ----- +SET zhparser.punctuation_ignore = on; +SET zhparser.multi_short = on; +SET zhparser.multi_zall = on; +RESET ALL; diff --git a/zhparser--2.3--2.4.sql b/zhparser--2.3--2.4.sql new file mode 100644 index 0000000..d08906d --- /dev/null +++ b/zhparser--2.3--2.4.sql @@ -0,0 +1,43 @@ +/* + * 2.3 -> 2.4 + * + * - Replace string concatenation in sync_zhprs_custom_word() with + * format() / %L to mitigate path injection through current_database() + * and embedded quotes. + * - Validate database name characters before writing the dict file. + * + * Existing dict files on disk are unaffected. 
+ */ + +CREATE OR REPLACE FUNCTION sync_zhprs_custom_word() RETURNS void LANGUAGE plpgsql AS +$$ +declare + data_dir text; + db_name text; + dict_path text; + time_tag_path text; + query text; +begin + select setting from pg_settings where name = 'data_directory' + into data_dir; + if data_dir is null then + raise exception 'zhparser: cannot resolve data_directory'; + end if; + + db_name := current_database(); + if db_name !~ '^[A-Za-z0-9_]+$' then + raise exception 'zhparser: refusing to write custom dict for database name "%" (only [A-Za-z0-9_] allowed)', db_name; + end if; + + dict_path := data_dir || '/base/zhprs_dict_' || db_name || '.txt'; + time_tag_path := data_dir || '/base/zhprs_dict_' || db_name || '.tag'; + + query := format( + 'copy (select word, tf, idf, attr from zhparser.zhprs_custom_word) to %L encoding %L', + dict_path, 'utf8'); + execute query; + + query := format('copy (select now()) to %L', time_tag_path); + execute query; +end; +$$; diff --git a/zhparser--2.4.sql b/zhparser--2.4.sql new file mode 100644 index 0000000..ee1a8da --- /dev/null +++ b/zhparser--2.4.sql @@ -0,0 +1,91 @@ +CREATE FUNCTION zhprs_start(internal, int4) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT; + +CREATE FUNCTION zhprs_getlexeme(internal, internal, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT; + +CREATE FUNCTION zhprs_end(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT; + +CREATE FUNCTION zhprs_lextype(internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT; + +CREATE TEXT SEARCH PARSER zhparser ( + START = zhprs_start, + GETTOKEN = zhprs_getlexeme, + END = zhprs_end, + HEADLINE = pg_catalog.prsd_headline, + LEXTYPES = zhprs_lextype +); + + +CREATE SCHEMA zhparser; +CREATE TABLE zhparser.zhprs_custom_word( + word text primary key, + tf float default '1.0', + idf float default '1.0', + attr char default '@', + check (attr = '@' or attr = '!') +); + +/* + * sync_zhprs_custom_word + * + * 2.4 hardening: 
build the COPY statement via format() with %L + * placeholders and validate the database name. Refuses to run if + * current_database() contains characters that are unsafe in a + * filesystem path; this matches the C side, which also refuses to + * load such paths. + * + * Requires superuser/pg_write_server_files privileges to actually + * perform COPY ... TO 'filename', which is unchanged from prior + * versions. + */ +CREATE FUNCTION sync_zhprs_custom_word() RETURNS void LANGUAGE plpgsql AS +$$ +declare + data_dir text; + db_name text; + dict_path text; + time_tag_path text; + query text; +begin + select setting from pg_settings where name = 'data_directory' + into data_dir; + if data_dir is null then + raise exception 'zhparser: cannot resolve data_directory'; + end if; + + db_name := current_database(); + if db_name !~ '^[A-Za-z0-9_]+$' then + raise exception 'zhparser: refusing to write custom dict for database name "%" (only [A-Za-z0-9_] allowed)', db_name; + end if; + + dict_path := data_dir || '/base/zhprs_dict_' || db_name || '.txt'; + time_tag_path := data_dir || '/base/zhprs_dict_' || db_name || '.tag'; + + /* + * %L on the path quotes it as a SQL string literal (handles single + * quotes). %I is irrelevant here; COPY does not accept identifiers. + * The encoding is hard-coded utf8. 
+ */ + query := format( + 'copy (select word, tf, idf, attr from zhparser.zhprs_custom_word) to %L encoding %L', + dict_path, 'utf8'); + execute query; + + query := format('copy (select now()) to %L', time_tag_path); + execute query; +end; +$$; + +-- do not create custom dict files on fresh install +-- select sync_zhprs_custom_word(); diff --git a/zhparser-backup-custom-dict.sh b/zhparser-backup-custom-dict.sh index 7c345a1..8e68d83 100755 --- a/zhparser-backup-custom-dict.sh +++ b/zhparser-backup-custom-dict.sh @@ -1,47 +1,135 @@ -if [ $# -lt 2 ];then - echo "usage: $0 [restore_from_dir]" - echo "warning: delete is a dangerous cmd, it will delete your custom from pgdata_dir." - echo "!!!you should run backup cmd first, then run the delete cmd !!!" - exit 2 -fi -cmd=$1 -pgdata=$2 -restore_from_dir=$3 - -if [ $cmd = 'backup' ];then - backup_dir=zhparser-backup-custom-dict-$(date +'%F:%T') - mkdir ./$backup_dir - echo "will backup $pgdata/base/zhprs_dict_* to $backup_dir/" - cp -a $pgdata/base/zhprs_dict_* $backup_dir/ - if [ "$?" -ne 0 ]; - then - echo "backup error!" - exit 1 - else - echo "backup ok!" - fi -fi - -if [ $cmd = 'delete' ];then - echo "will delete $pgdata/base/zhprs_dict_*" - rm $pgdata/base/zhprs_dict_* - if [ "$?" -ne 0 ]; - then - echo "delete error!" - exit 1 - else - echo "delete ok!" - fi -fi - -if [ $cmd = 'restore' ];then - echo "will restore $restore_from_dir/zhprs_dict_* to $pgdata/base/" - cp -a $restore_from_dir/zhprs_dict_* $pgdata/base/ - if [ "$?" -ne 0 ]; - then - echo "restore error!" - exit 1 - else - echo "restore ok!" - fi -fi +#!/usr/bin/env bash +# +# zhparser-backup-custom-dict.sh +# +# Manage zhparser custom dict files (zhprs_dict_*) under $PGDATA/base. +# +# Usage: +# backup Copy zhprs_dict_* to a timestamped backup dir. +# restore Copy zhprs_dict_* from back to /base. +# delete [--yes] [--dry-run] Delete zhprs_dict_* under /base. +# +# Hardening (vs. 
original): +# - set -euo pipefail +# - Validates PGDATA exists and looks plausible. +# - delete refuses to run without --yes; supports --dry-run. +# - Expands globs into quoted bash arrays (with nullglob) so unusual filenames are handled safely. + +set -euo pipefail + +usage() { + cat <<EOF >&2 +usage: + $0 backup <pgdata> + $0 restore <pgdata> <restore_from_dir> + $0 delete <pgdata> [--yes] [--dry-run] + +WARNING: 'delete' permanently removes zhparser custom dict files. + Always run 'backup' first. +EOF + exit 2 +} + +require_pgdata() { + local d="$1" + if [ ! -d "$d" ] || [ ! -d "$d/base" ]; then + echo "error: \"$d\" does not look like a PGDATA directory (missing $d/base)" >&2 + exit 1 + fi +} + +cmd="${1:-}" +[ -n "$cmd" ] || usage + +case "$cmd" in + backup) + [ "$#" -ge 2 ] || usage + pgdata="$2" + require_pgdata "$pgdata" + ts=$(date +'%Y-%m-%dT%H-%M-%S') + backup_dir="zhparser-backup-custom-dict-$ts" + mkdir "$backup_dir" + echo "Backing up $pgdata/base/zhprs_dict_* -> $backup_dir/" + # Avoid noisy 'no match' if there are zero matching files. + shopt -s nullglob + files=( "$pgdata"/base/zhprs_dict_* ) + shopt -u nullglob + if [ "${#files[@]}" -eq 0 ]; then + echo "no zhprs_dict_* files found; backup directory left empty" + else + cp -a -- "${files[@]}" "$backup_dir/" + echo "backup ok" + fi + ;; + + restore) + [ "$#" -ge 3 ] || usage + pgdata="$2" + restore_from_dir="$3" + require_pgdata "$pgdata" + if [ ! 
-d "$restore_from_dir" ]; then + echo "error: restore source \"$restore_from_dir\" does not exist" >&2 + exit 1 + fi + echo "Restoring $restore_from_dir/zhprs_dict_* -> $pgdata/base/" + shopt -s nullglob + files=( "$restore_from_dir"/zhprs_dict_* ) + shopt -u nullglob + if [ "${#files[@]}" -eq 0 ]; then + echo "error: no zhprs_dict_* files in $restore_from_dir" >&2 + exit 1 + fi + cp -a -- "${files[@]}" "$pgdata/base/" + echo "restore ok" + ;; + + delete) + [ "$#" -ge 2 ] || usage + pgdata="$2" + require_pgdata "$pgdata" + shift 2 + confirm="no" + dry_run="no" + while [ "$#" -gt 0 ]; do + case "$1" in + --yes) confirm="yes" ;; + --dry-run) dry_run="yes" ;; + *) echo "unknown flag: $1" >&2; exit 2 ;; + esac + shift + done + + shopt -s nullglob + files=( "$pgdata"/base/zhprs_dict_* ) + shopt -u nullglob + + if [ "${#files[@]}" -eq 0 ]; then + echo "nothing to delete: no zhprs_dict_* under $pgdata/base" + exit 0 + fi + + echo "Will delete the following files under $pgdata/base:" + for f in "${files[@]}"; do + echo " $f" + done + + if [ "$dry_run" = "yes" ]; then + echo "(dry-run, no files removed)" + exit 0 + fi + + if [ "$confirm" != "yes" ]; then + echo + echo "REFUSING to delete without --yes. Re-run with:" + echo " $0 delete $pgdata --yes" + exit 1 + fi + + rm -- "${files[@]}" + echo "delete ok" + ;; + + *) + usage + ;; +esac diff --git a/zhparser.c b/zhparser.c index 6fde37b..4367a7f 100644 --- a/zhparser.c +++ b/zhparser.c @@ -1,320 +1,625 @@ /*------------------------------------------------------------------------- * * zhparser.c - * a text search parser for Chinese + * A text search parser for Chinese based on SCWS. + * + * Hardened revision (PG 16/17/18): + * - Per-call SCWS instance via scws_fork() to remove global mutable state. + * - GUCs registered in _PG_init() and validated through check hooks. + * - extra_dicts / database-name path traversal hardening. + * - mmap as default dict load mode (shared via OS page cache). 
+ * - Lexeme attribute range fixed to ['a','z'] (was ['a','x']). + * - Multi-mode flags are computed once per zhprs_start() call by current_multi_mode(), not per token. * *------------------------------------------------------------------------- */ -#include "zhparser.h" - #include "postgres.h" -#include "miscadmin.h" + #include "fmgr.h" -#include "utils/guc.h" +#include "miscadmin.h" +#include "commands/dbcommands.h" #include "utils/builtins.h" - -#if PG_VERSION_NUM >= 100000 +#include "utils/guc.h" +#include "utils/memutils.h" +#include "utils/palloc.h" #include "utils/varlena.h" -#endif -#include "commands/dbcommands.h" +#include <ctype.h> +#include <string.h> +#include <sys/stat.h> -/* dict file extension */ -#define TXT_EXT ".txt" -#define XDB_EXT ".xdb" -/* length of file extension */ -#define EXT_LEN 4 +#include "zhparser.h" PG_MODULE_MAGIC; -/* - * types - */ -/* self-defined type */ -typedef struct +/* ----------------------------------------------------------------------- * + * Constants + * ----------------------------------------------------------------------- */ + +#define LEX_TYPE_COUNT 26 /* a..z */ +#define DICT_EXT_LEN 4 /* ".txt" / ".xdb" */ +#define TXT_EXT ".txt" +#define XDB_EXT ".xdb" + +/* ----------------------------------------------------------------------- * + * Types + * ----------------------------------------------------------------------- */ + +typedef struct ParserState { - char *buffer; /* text to parse */ - int len; /* length of the text in buffer */ - int pos; /* position of the parser */ - scws_t scws; - scws_res_t head; - scws_res_t curr; + char *buffer; /* text to parse (palloc'd by caller) */ + int len; + int pos; + scws_t scws; /* per-call SCWS instance (forked) */ + scws_res_t head; + scws_res_t curr; } ParserState; -/* copy-paste from wparser.h of tsearch2 */ -typedef struct +typedef struct LexDescr { int lexid; char *alias; char *descr; } LexDescr; -static void init(); +/* ----------------------------------------------------------------------- * + * GUC variables + * 
----------------------------------------------------------------------- */ + +static bool dict_in_memory = false; +static char *extra_dicts = NULL; + +static bool punctuation_ignore = false; +static bool seg_with_duality = false; +static bool multi_short = false; +static bool multi_duality = false; +static bool multi_zmain = false; +static bool multi_zall = false; + +/* ----------------------------------------------------------------------- * + * Process-local state + * + * The "master" SCWS instance owns the loaded dictionary/rules. Every + * zhprs_start() forks a cheap per-call clone (scws_fork) so concurrent + * parser invocations within the same backend (e.g. SRFs, nested calls, + * subqueries) cannot trample each other. + * ----------------------------------------------------------------------- */ + +static scws_t master_scws = NULL; +static bool master_load_failed = false; + +/* ----------------------------------------------------------------------- * + * Forward declarations + * ----------------------------------------------------------------------- */ +void _PG_init(void); +void _PG_fini(void); + +PG_FUNCTION_INFO_V1(zhprs_start); +PG_FUNCTION_INFO_V1(zhprs_getlexeme); +PG_FUNCTION_INFO_V1(zhprs_end); +PG_FUNCTION_INFO_V1(zhprs_lextype); + +static void ensure_master_loaded(void); +static int resolve_load_mode(void); +static int current_multi_mode(void); +static bool is_safe_dict_filename(const char *name); +static bool is_safe_database_name(const char *name); static void init_type(LexDescr descr[]); +/* GUC hooks */ +static bool check_extra_dicts(char **newval, void **extra, GucSource source); + +/* ----------------------------------------------------------------------- * + * Lex type table (static, copied per zhprs_lextype call) + * ----------------------------------------------------------------------- */ + +static const struct +{ + int lexid; + const char *alias; + const char *descr; +} lex_types[LEX_TYPE_COUNT] = { + { 'a', "a", "adjective,形容词" }, + { 
'b', "b", "differentiation,区别词" }, + { 'c', "c", "conjunction,连词" }, + { 'd', "d", "adverb,副词" }, + { 'e', "e", "exclamation,感叹词" }, + { 'f', "f", "position,方位词" }, + { 'g', "g", "root,词根" }, + { 'h', "h", "head,前连接成分" }, + { 'i', "i", "idiom,成语" }, + { 'j', "j", "abbreviation,简称" }, + { 'k', "k", "tail,后连接成分" }, + { 'l', "l", "tmp,习用语" }, + { 'm', "m", "numeral,数词" }, + { 'n', "n", "noun,名词" }, + { 'o', "o", "onomatopoeia,拟声词" }, + { 'p', "p", "prepositional,介词" }, + { 'q', "q", "quantity,量词" }, + { 'r', "r", "pronoun,代词" }, + { 's', "s", "space,处所词" }, + { 't', "t", "time,时语素" }, + { 'u', "u", "auxiliary,助词" }, + { 'v', "v", "verb,动词" }, + { 'w', "w", "punctuation,标点符号" }, + { 'x', "x", "unknown,未知词" }, + { 'y', "y", "modal,语气词" }, + { 'z', "z", "status,状态词" }, +}; + +/* ----------------------------------------------------------------------- * + * Helpers + * ----------------------------------------------------------------------- */ + /* - * prototypes + * is_safe_dict_filename + * + * Whitelist for entries listed in zhparser.extra_dicts. We deliberately + * forbid anything that could escape the tsearch_data directory (no '/', + * no '\', no '..', no leading dot). Only [A-Za-z0-9_.-] is allowed. */ -PG_FUNCTION_INFO_V1(zhprs_start); -Datum zhprs_start(PG_FUNCTION_ARGS); +static bool +is_safe_dict_filename(const char *name) +{ + const char *p; -PG_FUNCTION_INFO_V1(zhprs_getlexeme); -Datum zhprs_getlexeme(PG_FUNCTION_ARGS); + if (name == NULL || name[0] == '\0' || name[0] == '.' || name[0] == '-') + return false; -PG_FUNCTION_INFO_V1(zhprs_end); -Datum zhprs_end(PG_FUNCTION_ARGS); + for (p = name; *p; p++) + { + unsigned char c = (unsigned char) *p; + if (!(isalnum(c) || c == '_' || c == '.' || c == '-')) + return false; + } -PG_FUNCTION_INFO_V1(zhprs_lextype); -Datum zhprs_lextype(PG_FUNCTION_ARGS); + /* explicit ".." 
rejection */ + if (strstr(name, "..") != NULL) + return false; -static scws_t scws = NULL; -static ParserState parser_state; + return true; +} -/* config */ -static bool dict_in_memory = false; -static char * extra_dicts = NULL; +/* + * is_safe_database_name + * + * The custom-word file is named after current_database(). PG allows + * almost any character there once quoted, so we refuse to load the + * custom dict (with a LOG) for "exotic" names instead of building a + * traversable filesystem path. + */ +static bool +is_safe_database_name(const char *name) +{ + const char *p; + + if (name == NULL || name[0] == '\0') + return false; -static bool punctuation_ignore = false; -static bool seg_with_duality = false; -static bool multi_short = false; -static bool multi_duality = false; -static bool multi_zmain = false; -static bool multi_zall = false; + for (p = name; *p; p++) + { + unsigned char c = (unsigned char) *p; + if (!(isalnum(c) || c == '_')) + return false; + } + return true; +} -static void init(){ - char sharepath[MAXPGPATH]; - char dict_path[MAXPGPATH]; - char rule_path[MAXPGPATH]; - int load_dict_mem_mode = 0x0; +/* + * resolve_load_mode + * + * NOTE: SCWS itself does NOT expose a public "mmap" flag. With the default + * (no SCWS_XDICT_MEM), libscws opens the .xdb via mmap (xdb.c uses fmap), + * so the kernel page cache already gives backends a shared dictionary + * footprint. Setting SCWS_XDICT_MEM forces the dict to be slurped into + * private heap, which is the only mode where 14MB is duplicated per + * backend. + */ +static int +resolve_load_mode(void) +{ + return dict_in_memory ? 
SCWS_XDICT_MEM : 0; +} - List *elemlist; - ListCell *l; +/* ----------------------------------------------------------------------- * + * GUC hooks + * ----------------------------------------------------------------------- */ - if (!(scws = scws_new())) { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to init Chinese Parser Lib SCWS!\"%s\"","" - ))); +static bool +check_extra_dicts(char **newval, void **extra, GucSource source) +{ + List *elemlist; + ListCell *l; + char *rawname; + bool ok = true; + + if (*newval == NULL || (*newval)[0] == '\0') + return true; + + rawname = pstrdup(*newval); + if (!SplitIdentifierString(rawname, ',', &elemlist)) + { + GUC_check_errdetail("List syntax is invalid."); + pfree(rawname); + return false; } - + + foreach(l, elemlist) + { + const char *name = (const char *) lfirst(l); + const char *ext; + + if (!is_safe_dict_filename(name)) + { + GUC_check_errdetail("Dict file name \"%s\" contains illegal characters.", name); + ok = false; + break; + } + + ext = strrchr(name, '.'); + if (ext == NULL || strlen(ext) != DICT_EXT_LEN || + (strcmp(ext, TXT_EXT) != 0 && strcmp(ext, XDB_EXT) != 0)) + { + GUC_check_errdetail("Dict file \"%s\" must end with .txt or .xdb.", name); + ok = false; + break; + } + } + + list_free(elemlist); + pfree(rawname); + return ok; +} + +static int +current_multi_mode(void) +{ + int m = 0; + if (multi_short) m |= SCWS_MULTI_SHORT; + if (multi_duality) m |= SCWS_MULTI_DUALITY; + if (multi_zmain) m |= SCWS_MULTI_ZMAIN; + if (multi_zall) m |= SCWS_MULTI_ZALL; + return m; +} + +/* ----------------------------------------------------------------------- * + * _PG_init / _PG_fini + * ----------------------------------------------------------------------- */ + +void +_PG_init(void) +{ DefineCustomBoolVariable( "zhparser.dict_in_memory", - "load dicts into memory", - "load dicts into memory", + "Load dicts into memory (private heap copy per backend).", + "When false (default) the dict is mmap'd, 
sharing via OS page cache.", &dict_in_memory, false, PGC_BACKEND, - 0, - NULL, - NULL, - NULL - ); + 0, NULL, NULL, NULL); + DefineCustomStringVariable( "zhparser.extra_dicts", - "extra dicts files to load", - "extra dicts files to load", + "Extra dict files to load (comma separated, basenames only).", + "Names must end with .txt or .xdb and contain only [A-Za-z0-9_.-].", &extra_dicts, NULL, PGC_BACKEND, 0, + check_extra_dicts, NULL, - NULL, - NULL - ); + NULL); + DefineCustomBoolVariable( "zhparser.punctuation_ignore", - "set if zhparser ignores the puncuation", - "set if zhparser ignores the puncuation,except \\r and \\n", + "Ignore punctuation (except CR/LF).", + NULL, &punctuation_ignore, false, PGC_USERSET, - 0, - NULL, - NULL, - NULL - ); + 0, NULL, NULL, NULL); DefineCustomBoolVariable( "zhparser.seg_with_duality", - "segment words with duality", - "segment words with duality", + "Segment words with duality.", + NULL, &seg_with_duality, false, PGC_USERSET, - 0, - NULL, - NULL, - NULL - ); + 0, NULL, NULL, NULL); + DefineCustomBoolVariable( "zhparser.multi_short", - "prefer short words", - "prefer short words", + "Prefer short words.", + NULL, &multi_short, false, PGC_USERSET, - 0, - NULL, - NULL, - NULL - ); + 0, NULL, NULL, NULL); + DefineCustomBoolVariable( "zhparser.multi_duality", - "prefer duality", - "prefer duality", + "Prefer duality.", + NULL, &multi_duality, false, PGC_USERSET, - 0, - NULL, - NULL, - NULL - ); + 0, NULL, NULL, NULL); + DefineCustomBoolVariable( "zhparser.multi_zmain", - "prefer most important element", - "prefer most important element", + "Prefer most important element.", + NULL, &multi_zmain, false, PGC_USERSET, - 0, - NULL, - NULL, - NULL - ); + 0, NULL, NULL, NULL); + DefineCustomBoolVariable( "zhparser.multi_zall", - "prefer all element", - "prefer all element", + "Prefer all elements.", + NULL, &multi_zall, false, PGC_USERSET, - 0, - NULL, - NULL, - NULL - ); + 0, NULL, NULL, NULL); - get_share_path(my_exec_path, sharepath); - 
snprintf(dict_path, MAXPGPATH, "%s/tsearch_data/%s.%s", - sharepath, "dict.utf8", "xdb"); - scws_set_charset(scws, "utf-8"); - - if(dict_in_memory) - load_dict_mem_mode = SCWS_XDICT_MEM; - - /* ignore error,default dict is xdb */ - if( scws_set_dict(scws,dict_path,load_dict_mem_mode | SCWS_XDICT_XDB ) != 0){ - ereport(NOTICE, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("zhparser set dict : \"%s\" failed!",dict_path - ))); - } +#if PG_VERSION_NUM >= 150000 + MarkGUCPrefixReserved("zhparser"); +#endif +} - snprintf(dict_path, MAXPGPATH, "%s/base/zhprs_dict_%s.txt", - DataDir, get_database_name(MyDatabaseId)); - if(scws_add_dict(scws, dict_path, load_dict_mem_mode | SCWS_XDICT_TXT) != 0 ){ - ereport(LOG, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("zhparser add dict : \"%s\" failed! May not config custom dict, omit this",dict_path - ))); +void +_PG_fini(void) +{ + if (master_scws != NULL) + { + scws_free(master_scws); + master_scws = NULL; } +} - if(extra_dicts != NULL){ - if(!SplitIdentifierString(pstrdup(extra_dicts),',',&elemlist)){ - scws_free(scws); - list_free(elemlist); - scws = NULL; +/* ----------------------------------------------------------------------- * + * ensure_master_loaded + * + * Lazy-loads the master SCWS instance. Safe to call repeatedly: on + * persistent failure we cache the failure for the rest of the backend + * lifetime instead of trying again on every call. + * ----------------------------------------------------------------------- */ + +static void +ensure_master_loaded(void) +{ + char sharepath[MAXPGPATH]; + char dict_path[MAXPGPATH]; + char rule_path[MAXPGPATH]; + int load_mode; + List *elemlist = NIL; + ListCell *l; + char *rawnames = NULL; + const char *dbname; + scws_t newscws; + + if (master_scws != NULL || master_load_failed) + return; + + newscws = scws_new(); + if (newscws == NULL) + { + master_load_failed = true; ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("zhparser.extra_dicts syntax error! 
extra_dicts is \"%s\"",extra_dicts - ))); - } - - foreach(l,elemlist){ - int load_dict_mode = load_dict_mem_mode; - char * ext = strrchr((char*)lfirst(l),'.'); - if(ext != NULL && strlen(ext) == EXT_LEN){ - if(strncmp(ext,TXT_EXT,EXT_LEN) == 0){ - load_dict_mode |= SCWS_XDICT_TXT; - } - else if(strncmp(ext,XDB_EXT,EXT_LEN) == 0){ - load_dict_mode |= SCWS_XDICT_XDB; - } - } + errmsg("failed to initialize SCWS"))); + } - if(((load_dict_mode & SCWS_XDICT_TXT) == 0) && - ((load_dict_mode & SCWS_XDICT_XDB) == 0)){ - scws_free(scws); - list_free(elemlist); - scws = NULL; - ereport(ERROR, + scws_set_charset(newscws, "utf-8"); + + get_share_path(my_exec_path, sharepath); + + load_mode = resolve_load_mode(); + + /* 1) Built-in main dict ------------------------------------------------ */ + snprintf(dict_path, MAXPGPATH, "%s/tsearch_data/dict.utf8.xdb", sharepath); + if (scws_set_dict(newscws, dict_path, load_mode | SCWS_XDICT_XDB) != 0) + ereport(NOTICE, (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("zhparser.extra_dicts setting error,the file name must end with .txt or .xdb! error file name is \"%s\"",(char*)lfirst(l) - ))); - + errmsg("zhparser: failed to set main dict \"%s\"", dict_path))); + + /* 2) Per-database custom dict ---------------------------------------- */ + dbname = get_database_name(MyDatabaseId); + if (!is_safe_database_name(dbname)) + { + ereport(LOG, + (errmsg("zhparser: skipping custom dict for database \"%s\" " + "(name contains characters that are unsafe for filesystem paths)", + dbname != NULL ? 
dbname : "(null)"))); + } + else + { + snprintf(dict_path, MAXPGPATH, "%s/base/zhprs_dict_%s.txt", + DataDir, dbname); + if (scws_add_dict(newscws, dict_path, load_mode | SCWS_XDICT_TXT) != 0) + ereport(LOG, + (errmsg("zhparser: custom dict \"%s\" not loaded " + "(missing or unreadable; run zhparser.sync_zhprs_custom_word() if expected)", + dict_path))); + } + + /* 3) extra_dicts ----------------------------------------------------- */ + if (extra_dicts != NULL && extra_dicts[0] != '\0') + { + rawnames = pstrdup(extra_dicts); + if (!SplitIdentifierString(rawnames, ',', &elemlist)) + { + pfree(rawnames); + scws_free(newscws); + master_load_failed = true; + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("zhparser.extra_dicts has invalid syntax: \"%s\"", + extra_dicts))); } - snprintf(dict_path, MAXPGPATH, "%s/tsearch_data/%s", - sharepath, (char*)lfirst(l)); - /* ignore error*/ - if( scws_add_dict(scws,dict_path,load_dict_mode) != 0 ){ - ereport(LOG, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("zhparser add dict : \"%s\" failed for extra dict! omit",dict_path - ))); + foreach(l, elemlist) + { + const char *name = (const char *) lfirst(l); + const char *ext; + int mode = load_mode; + + /* Re-validate at load time too (defence in depth). 
*/ + if (!is_safe_dict_filename(name)) + { + list_free(elemlist); + pfree(rawnames); + scws_free(newscws); + master_load_failed = true; + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("zhparser.extra_dicts contains illegal name \"%s\"", + name))); + } + + ext = strrchr(name, '.'); + if (ext == NULL || strlen(ext) != DICT_EXT_LEN) + { + list_free(elemlist); + pfree(rawnames); + scws_free(newscws); + master_load_failed = true; + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("zhparser.extra_dicts entry \"%s\" must end with .txt or .xdb", + name))); + } + if (strcmp(ext, TXT_EXT) == 0) + mode |= SCWS_XDICT_TXT; + else if (strcmp(ext, XDB_EXT) == 0) + mode |= SCWS_XDICT_XDB; + else + { + list_free(elemlist); + pfree(rawnames); + scws_free(newscws); + master_load_failed = true; + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("zhparser.extra_dicts entry \"%s\" must end with .txt or .xdb", + name))); + } + + snprintf(dict_path, MAXPGPATH, "%s/tsearch_data/%s", + sharepath, name); + if (scws_add_dict(newscws, dict_path, mode) != 0) + ereport(LOG, + (errmsg("zhparser: failed to add extra dict \"%s\"", + dict_path))); } - } - list_free(elemlist); + + list_free(elemlist); + pfree(rawnames); + } + + /* 4) Rules ----------------------------------------------------------- */ + snprintf(rule_path, MAXPGPATH, "%s/tsearch_data/rules.utf8.ini", sharepath); + { + struct stat st; + if (stat(rule_path, &st) == 0) + scws_set_rule(newscws, rule_path); + else + ereport(LOG, + (errmsg("zhparser: rules file \"%s\" not found, continuing without rules", + rule_path))); } - snprintf(rule_path, MAXPGPATH, "%s/tsearch_data/%s.%s", - sharepath, "rules.utf8", "ini"); - scws_set_rule(scws ,rule_path); + /* Configure ignore/duality on the master so forks inherit them. 
*/ + scws_set_ignore(newscws, (int) punctuation_ignore); + scws_set_duality(newscws, (int) seg_with_duality); + scws_set_multi(newscws, current_multi_mode()); + + master_scws = newscws; } -/* - * functions - */ +/* ----------------------------------------------------------------------- * + * Per-call resource cleanup + * + * MemoryContextRegisterResetCallback lets us guarantee that on any + * unwind (ERROR, transaction abort) we still free the forked SCWS + * instance and its result cursor. + * ----------------------------------------------------------------------- */ + +static void +parser_state_cleanup(void *arg) +{ + ParserState *pst = (ParserState *) arg; + + if (pst == NULL) + return; + if (pst->head != NULL) + { + scws_free_result(pst->head); + pst->head = NULL; + pst->curr = NULL; + } + if (pst->scws != NULL) + { + scws_free(pst->scws); + pst->scws = NULL; + } +} + +/* ----------------------------------------------------------------------- * + * SQL-callable functions + * ----------------------------------------------------------------------- */ Datum zhprs_start(PG_FUNCTION_ARGS) { - ParserState *pst = &parser_state; - int multi_mode = 0x0; - - if(scws == NULL) - init(); - pst -> scws = scws; - pst -> buffer = (char *) PG_GETARG_POINTER(0); - pst -> len = PG_GETARG_INT32(1); - pst -> pos = 0; + ParserState *pst; + scws_t forked; + MemoryContext cxt; + MemoryContextCallback *cb; - scws_set_ignore(scws, (int)punctuation_ignore); - scws_set_duality(scws,(int)seg_with_duality); + ensure_master_loaded(); + if (master_scws == NULL) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("zhparser: SCWS not initialized"))); - if(multi_short){ - multi_mode |= SCWS_MULTI_SHORT; - } + cxt = CurrentMemoryContext; - if(multi_duality){ - multi_mode |= SCWS_MULTI_DUALITY; - } + pst = (ParserState *) MemoryContextAllocZero(cxt, sizeof(ParserState)); - if(multi_zmain){ - multi_mode |= SCWS_MULTI_ZMAIN; + forked = scws_fork(master_scws); + if (forked == NULL) + { + 
pfree(pst); + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("zhparser: scws_fork() failed"))); } - if(multi_zall){ - multi_mode |= SCWS_MULTI_ZALL; - } + /* + * Apply per-call user settings to the forked instance only. + * The master is unaffected, so concurrent calls cannot collide. + */ + scws_set_ignore(forked, (int) punctuation_ignore); + scws_set_duality(forked, (int) seg_with_duality); + scws_set_multi(forked, current_multi_mode()); - scws_set_multi(scws,multi_mode); + pst->scws = forked; + pst->buffer = (char *) PG_GETARG_POINTER(0); + pst->len = PG_GETARG_INT32(1); + pst->pos = 0; - scws_send_text(pst -> scws, pst -> buffer, pst -> len); + /* Register cleanup before sending text, so any failure unwinds cleanly. */ + cb = (MemoryContextCallback *) MemoryContextAllocZero(cxt, sizeof(*cb)); + cb->func = parser_state_cleanup; + cb->arg = pst; + MemoryContextRegisterResetCallback(cxt, cb); - (pst -> head) = (pst -> curr) = scws_get_result(pst -> scws); + scws_send_text(pst->scws, pst->buffer, pst->len); + pst->head = pst->curr = scws_get_result(pst->scws); PG_RETURN_POINTER(pst); } @@ -322,41 +627,52 @@ zhprs_start(PG_FUNCTION_ARGS) Datum zhprs_getlexeme(PG_FUNCTION_ARGS) { - ParserState *pst = (ParserState *) PG_GETARG_POINTER(0); - char **t = (char **) PG_GETARG_POINTER(1); - int *tlen = (int *) PG_GETARG_POINTER(2); - int type = -1; + ParserState *pst = (ParserState *) PG_GETARG_POINTER(0); + char **t = (char **) PG_GETARG_POINTER(1); + int *tlen = (int *) PG_GETARG_POINTER(2); + int type = -1; - if((pst -> head) == NULL ) /* already done the work,or no sentence */ + if (pst == NULL || pst->head == NULL) { + *t = NULL; *tlen = 0; - type = 0; + return Int32GetDatum(0); } - /* have results */ - else if(pst -> curr != NULL) + + if (pst->curr != NULL) { - scws_res_t curr = pst -> curr; + scws_res_t curr = pst->curr; + unsigned char attr0 = (unsigned char) curr->attr[0]; /* - * check the first char to determine the lextype - * if out of [0,25],then 
set to 'x',mean unknown type - * so for Ag,Dg,Ng,Tg,Vg,the type will be unknown - * for full attr explanation,visit http://www.xunsearch.com/scws/docs.php#attr - */ - type = (int)(curr -> attr)[0]; - if(type > (int)'x' || type < (int)'a') - type = (int)'x'; - *tlen = curr -> len; - *t = pst -> buffer + curr -> off; - - pst -> curr = curr -> next; - - /* fetch the next sentence */ - if(pst -> curr == NULL ){ - scws_free_result(pst -> head); - (pst -> head) = (pst -> curr) = scws_get_result(pst -> scws); + * SCWS attributes use 'a'..'z' (see init_type below). Anything + * outside that range is mapped to 'x' (unknown). + * + * NOTE: the original code restricted to ['a','x'] which silently + * dropped 'y' (modal) and 'z' (status). Fixed to ['a','z']. + */ + if (attr0 < (unsigned char) 'a' || attr0 > (unsigned char) 'z') + type = (int) 'x'; + else + type = (int) attr0; + + *tlen = curr->len; + *t = pst->buffer + curr->off; + + pst->curr = curr->next; + + if (pst->curr == NULL) + { + scws_free_result(pst->head); + pst->head = pst->curr = scws_get_result(pst->scws); } } + else + { + *t = NULL; + *tlen = 0; + type = 0; + } PG_RETURN_INT32(type); } @@ -364,101 +680,56 @@ zhprs_getlexeme(PG_FUNCTION_ARGS) Datum zhprs_end(PG_FUNCTION_ARGS) { + ParserState *pst = (ParserState *) PG_GETARG_POINTER(0); + + /* + * The MemoryContext reset callback we registered in zhprs_start will + * release the forked SCWS instance and any pending result cursor. + * + * However, when the caller (e.g. a long-lived loop in to_tsvector_byid) + * keeps the same context alive across many parse cycles, we want to + * release immediately to keep RSS flat. 
+ */ + if (pst != NULL) + { + if (pst->head != NULL) + { + scws_free_result(pst->head); + pst->head = NULL; + pst->curr = NULL; + } + if (pst->scws != NULL) + { + scws_free(pst->scws); + pst->scws = NULL; + } + } PG_RETURN_VOID(); } Datum zhprs_lextype(PG_FUNCTION_ARGS) { - LexDescr *descr = (LexDescr *) palloc(sizeof(LexDescr) * (26 + 1)); - init_type(descr); + LexDescr *descr; + descr = (LexDescr *) palloc(sizeof(LexDescr) * (LEX_TYPE_COUNT + 1)); + init_type(descr); PG_RETURN_POINTER(descr); } -static void init_type(LexDescr descr[]){ - /* - * there are 26 types in this parser,alias from a to z - * for full attr explanation,visit http://www.xunsearch.com/scws/docs.php#attr - */ - descr[0].lexid = 97; - descr[0].alias = pstrdup("a"); - descr[0].descr = pstrdup("adjective,形容词"); - descr[1].lexid = 98; - descr[1].alias = pstrdup("b"); - descr[1].descr = pstrdup("differentiation,区别词"); - descr[2].lexid = 99; - descr[2].alias = pstrdup("c"); - descr[2].descr = pstrdup("conjunction,连词"); - descr[3].lexid = 100; - descr[3].alias = pstrdup("d"); - descr[3].descr = pstrdup("adverb,副词"); - descr[4].lexid = 101; - descr[4].alias = pstrdup("e"); - descr[4].descr = pstrdup("exclamation,感叹词"); - descr[5].lexid = 102; - descr[5].alias = pstrdup("f"); - descr[5].descr = pstrdup("position,方位词"); - descr[6].lexid = 103; - descr[6].alias = pstrdup("g"); - descr[6].descr = pstrdup("root,词根"); - descr[7].lexid = 104; - descr[7].alias = pstrdup("h"); - descr[7].descr = pstrdup("head,前连接成分"); - descr[8].lexid = 105; - descr[8].alias = pstrdup("i"); - descr[8].descr = pstrdup("idiom,成语"); - descr[9].lexid = 106; - descr[9].alias = pstrdup("j"); - descr[9].descr = pstrdup("abbreviation,简称"); - descr[10].lexid = 107; - descr[10].alias = pstrdup("k"); - descr[10].descr = pstrdup("tail,后连接成分"); - descr[11].lexid = 108; - descr[11].alias = pstrdup("l"); - descr[11].descr = pstrdup("tmp,习用语"); - descr[12].lexid = 109; - descr[12].alias = pstrdup("m"); - descr[12].descr = 
pstrdup("numeral,数词"); - descr[13].lexid = 110; - descr[13].alias = pstrdup("n"); - descr[13].descr = pstrdup("noun,名词"); - descr[14].lexid = 111; - descr[14].alias = pstrdup("o"); - descr[14].descr = pstrdup("onomatopoeia,拟声词"); - descr[15].lexid = 112; - descr[15].alias = pstrdup("p"); - descr[15].descr = pstrdup("prepositional,介词"); - descr[16].lexid = 113; - descr[16].alias = pstrdup("q"); - descr[16].descr = pstrdup("quantity,量词"); - descr[17].lexid = 114; - descr[17].alias = pstrdup("r"); - descr[17].descr = pstrdup("pronoun,代词"); - descr[18].lexid = 115; - descr[18].alias = pstrdup("s"); - descr[18].descr = pstrdup("space,处所词"); - descr[19].lexid = 116; - descr[19].alias = pstrdup("t"); - descr[19].descr = pstrdup("time,时语素"); - descr[20].lexid = 117; - descr[20].alias = pstrdup("u"); - descr[20].descr = pstrdup("auxiliary,助词"); - descr[21].lexid = 118; - descr[21].alias = pstrdup("v"); - descr[21].descr = pstrdup("verb,动词"); - descr[22].lexid = 119; - descr[22].alias = pstrdup("w"); - descr[22].descr = pstrdup("punctuation,标点符号"); - descr[23].lexid = 120; - descr[23].alias = pstrdup("x"); - descr[23].descr = pstrdup("unknown,未知词"); - descr[24].lexid = 121; - descr[24].alias = pstrdup("y"); - descr[24].descr = pstrdup("modal,语气词"); - descr[25].lexid = 122; - descr[25].alias = pstrdup("z"); - descr[25].descr = pstrdup("status,状态词"); - descr[26].lexid = 0; +static void +init_type(LexDescr descr[]) +{ + int i; + + for (i = 0; i < LEX_TYPE_COUNT; i++) + { + descr[i].lexid = lex_types[i].lexid; + descr[i].alias = pstrdup(lex_types[i].alias); + descr[i].descr = pstrdup(lex_types[i].descr); + } + /* sentinel */ + descr[LEX_TYPE_COUNT].lexid = 0; + descr[LEX_TYPE_COUNT].alias = NULL; + descr[LEX_TYPE_COUNT].descr = NULL; } -//TODO :headline function diff --git a/zhparser.control b/zhparser.control index 1e06790..70e78cc 100644 --- a/zhparser.control +++ b/zhparser.control @@ -1,4 +1,4 @@ comment = 'a parser for full-text search of Chinese' -default_version = '2.3' 
+default_version = '2.4' module_pathname = '$libdir/zhparser' relocatable = true diff --git a/zhparser.h b/zhparser.h index ba9b0c9..46dd300 100644 --- a/zhparser.h +++ b/zhparser.h @@ -1,12 +1,32 @@ -#ifndef ZHPARSER_H +#ifndef ZHPARSER_H #define ZHPARSER_H -#ifndef pstrdup -#define pstrdup scws_pstrdup +/* + * SCWS prior to 1.2.3 declared a function named `pstrdup` in scws.h, which + * collides with PostgreSQL's pstrdup() function. While scws.h is included we + * #define pstrdup to scws_pstrdup, which renames only the SCWS declaration. + * NOTE(review): ZHPARSER_SAVED_PSTRDUP cannot preserve a macro body; if pstrdup is ever a macro, use #pragma push_macro/pop_macro instead. + * + * Build systems linking against SCWS >= 1.2.3 can pass + * -DZHPARSER_SCWS_HAS_NO_PSTRDUP_CONFLICT to skip this entirely. + */ + +#ifndef ZHPARSER_SCWS_HAS_NO_PSTRDUP_CONFLICT +# ifdef pstrdup +# define ZHPARSER_SAVED_PSTRDUP pstrdup +# undef pstrdup +# endif +# define pstrdup scws_pstrdup #endif #include "scws.h" -#undef pstrdup - +#ifndef ZHPARSER_SCWS_HAS_NO_PSTRDUP_CONFLICT +# undef pstrdup +# ifdef ZHPARSER_SAVED_PSTRDUP +# define pstrdup ZHPARSER_SAVED_PSTRDUP +# undef ZHPARSER_SAVED_PSTRDUP +# endif #endif + +#endif /* ZHPARSER_H */