From e0d74976e7d21fca55cde44c36f734bbb59fdeef Mon Sep 17 00:00:00 2001 From: Xavier Roche Date: Sun, 28 Jun 2026 12:08:41 +0200 Subject: [PATCH 1/2] filters: decode escaped chars correctly inside *[...] classes The escape branch in strjoker probed joker[i+2] instead of the current char, so a backslash escape only worked as the first class member: '*[\[\]]' (documented as "the [ or ] character") matched only ']', and '*[a,\[]' dropped the 'a'. The loop also treated any ']' as the class terminator, so an escaped ']' could never be a member. Decode the escape first in the loop body: a backslash takes the next char as the literal member (only that char, not also the backslash the old code added), and an escaped ']' is consumed before the terminator check. So '*[\[\]]' now matches both brackets, and escape precedes the range/size checks ('\-' '\,' '\<' become literal members). The self-test previously pinned the buggy output as expected; it now asserts the documented behavior and fails against the old matcher. Closes #148 Co-Authored-By: Claude Opus 4.8 Signed-off-by: Xavier Roche --- html/filters.html | 2 +- src/htsfilters.c | 14 ++++++---- tests/01_engine-filter.test | 56 +++++++++++++++++++++++++------------ 3 files changed, 47 insertions(+), 25 deletions(-) diff --git a/html/filters.html b/html/filters.html index 18d6ddb8..1ec7b520 100644 --- a/html/filters.html +++ b/html/filters.html @@ -247,7 +247,7 @@

1.a. Scan rules based on URL or extension

the \ character - *[\[\]] + *[\[,\]] the [ or ] character diff --git a/src/htsfilters.c b/src/htsfilters.c index 53d7f327..927932e4 100644 --- a/src/htsfilters.c +++ b/src/htsfilters.c @@ -193,7 +193,12 @@ HTS_INLINE const char *strjoker(const char *chaine, const char *joker, LLint * s int len = (int) strlen(joker); while((joker[i] != RIGHT) && (joker[i]) && (i < len)) { - if ((joker[i] == '<') || (joker[i] == '>')) { // *[<10] + // '\' escapes the next char as a literal member, e.g. *[\[\]] + if (joker[i] == '\\' && joker[i + 1] != '\0') { + i++; + pass[(int) (unsigned char) joker[i]] = 1; + i++; + } else if ((joker[i] == '<') || (joker[i] == '>')) { // *[<10] int lsize = 0; int lverdict; @@ -221,7 +226,7 @@ HTS_INLINE const char *strjoker(const char *chaine, const char *joker, LLint * s while(isdigit((unsigned char) joker[i])) i++; } - } else if (joker[i + 1] == '-') { // 2 car, ex: *[A-Z] + } else if (joker[i + 1] == '-') { // 2 car, ex: *[A-Z] if ((int) (unsigned char) joker[i + 2] > (int) (unsigned char) joker[i]) { int j; @@ -233,10 +238,7 @@ HTS_INLINE const char *strjoker(const char *chaine, const char *joker, LLint * s } // else err=1; i += 3; - } else { // 1 car, ex: *[ ] - if (joker[i + 2] == '\\' && joker[i + 3] != 0) { // escaped char, such as *[\[] or *[\]] - i++; - } + } else { // 1 car, ex: *[ ] pass[(int) (unsigned char) joker[i]] = 1; i++; } diff --git a/tests/01_engine-filter.test b/tests/01_engine-filter.test index 528592bb..c09df6d8 100755 --- a/tests/01_engine-filter.test +++ b/tests/01_engine-filter.test @@ -50,27 +50,47 @@ match '*foo*bar' 'foozbar' # '?' is the query-string marker, not a single-char wildcard nomatch 'a?c' 'abc' -# backslash escapes a metacharacter inside a class so it is matched literally. -# Quirk: the decoder also adds the backslash itself to the set, so '\X' matches -# both X and '\'. These assertions pin that behavior. +# Inside a class, backslash escapes the next char as a literal member (#148): +# '\X' matches X only (not '\'), and an escaped ']' is a member, not the terminator. match '*[\*]' '*' -match '*[\*]' "\\" -nomatch '*[\*]' 'a' +nomatch '*[\*]' "\\" match '*[\\]' "\\" -nomatch '*[\\]' 'a' +nomatch '*[\\]' '*' match '*[\[]' '[' -match '*[\[]' "\\" -nomatch '*[\[]' 'a' - -# A literal ']' cannot be a class member: the class parser stops at the first -# ']', escaped or not. So '*[\[\]]' does NOT mean "the [ or ] character" as the -# filter guide claims (GitHub #148); it parses as the class {'[','\'} followed -# by a trailing literal ']'. These assertions document the current (buggy) -# behavior so any future matcher fix is a deliberate, visible change. -nomatch '*[\[\]]' '[' # not matched, despite the docs -match '*[\[\]]' ']' # only via the empty class-match + trailing ']' -match '*[\[\]]' '[]' # one of {'[','\'} then the trailing ']' -nomatch '*[\[\]]' '[]x' +nomatch '*[\[]' "\\" +match '*[\]]' ']' +nomatch '*[\]]' "\\" + +# '*[\[\]]' is "the [ or ] character", as the filter guide documents. +match '*[\[\]]' '[' +match '*[\[\]]' ']' +nomatch '*[\[\]]' 'a' +match '*[\[,\]]' '[' # comma between members is optional +match '*[\[,\]]' ']' +match '*[a,\[]' 'a' # an escaped member no longer eats the preceding one +match '*[a,\[]' '[' + +# Escape is decoded before the range/separator/size checks, so '\-' '\,' '\<' +# are literal members, not operators. +match '*[a\-z]' 'a' +match '*[a\-z]' 'z' +nomatch '*[a\-z]' 'b' # not the a..z range +match '*[\,]' ',' +match '*[\<]' '<' +match '*[\[,\],a]' '[' +match '*[\[,\],a]' ']' +match '*[\[,\],a]' 'a' + +# *(...) matches exactly one char from the class; *[...] matches a run. +match '*(a,b)' 'a' +nomatch '*(a,b)' 'aa' +nomatch '*(a,b)' 'c' + +# documented composite filters (filters.html) +match 'www.*[path].com/*[path].zip' 'www.foo.com/a/b.zip' +nomatch 'www.*[path].com/*[path].zip' 'www.foo.com/a/b.tar' +match '*.html*[]' 'page.html' +nomatch '*.html*[]' 'page.html?x=1' # *[] forbids the trailing query # Size-based rules (-#test=filtersize ): a negative size # means the size is still unknown (scan time). A size exclusion must stay neutral From c292454271f2ac9f9d7865d76c4fdb3e4c42e4f3 Mon Sep 17 00:00:00 2001 From: Xavier Roche Date: Sun, 28 Jun 2026 12:24:09 +0200 Subject: [PATCH 2/2] filters: fix a 1-byte over-read on a truncated range *[a- The *[...] class parser's range arm does i += 3 unconditionally, so a pattern ending in a dangling '-' (e.g. *[a-) read one byte past the NUL: joker[i+2] is the NUL, i jumps to len+1, and the separator skip and loop guard then read joker[len+1]. Guard the range arm on joker[i+2] != '\0' so a truncated range falls through to the literal-member path instead of overshooting. The filter self-test now copies the pattern and string into exact-size heap buffers so a sanitizer traps such over-reads; the pattern previously came straight from argv (no redzone), which is why this stayed invisible. A *[a- test case exercises it. Co-Authored-By: Claude Opus 4.8 Signed-off-by: Xavier Roche --- src/htsfilters.c | 4 +++- src/htsselftest.c | 14 ++++++++++---- tests/01_engine-filter.test | 15 +++++++++++---- 3 files changed, 24 insertions(+), 9 deletions(-) diff --git a/src/htsfilters.c b/src/htsfilters.c index 927932e4..e07a5d82 100644 --- a/src/htsfilters.c +++ b/src/htsfilters.c @@ -226,7 +226,9 @@ HTS_INLINE const char *strjoker(const char *chaine, const char *joker, LLint * s while(isdigit((unsigned char) joker[i])) i++; } - } else if (joker[i + 1] == '-') { // 2 car, ex: *[A-Z] + } else if (joker[i + 1] == '-' && joker[i + 2] != '\0') { + // range *[A-Z]; the '\0' guard rejects a truncated *[a- (else + // i+=3 overshoots the NUL) if ((int) (unsigned char) joker[i + 2] > (int) (unsigned char) joker[i]) { int j; diff --git a/src/htsselftest.c b/src/htsselftest.c index aca1c96b..1ab68a8b 100644 --- a/src/htsselftest.c +++ b/src/htsselftest.c @@ -512,15 +512,21 @@ static int string_safety_selftests(void) { /* ------------------------------------------------------------ */ static int st_filter(httrackp *opt, int argc, char **argv) { + char *str, *pat; + int matched; + (void) opt; if (argc < 2) { fprintf(stderr, "filter: needs a filter pattern and a string\n"); return 1; } - if (strjoker(argv[1], argv[0], NULL, NULL)) - printf("%s does match %s\n", argv[1], argv[0]); - else - printf("%s does NOT match %s\n", argv[1], argv[0]); + /* exact-size heap copies so a sanitizer traps any over-read of the pattern */ + str = strdupt(argv[1]); + pat = strdupt(argv[0]); + matched = strjoker(str, pat, NULL, NULL) != NULL; + printf("%s does %s %s\n", argv[1], matched ? "match" : "NOT match", argv[0]); + freet(str); + freet(pat); return 0; } diff --git a/tests/01_engine-filter.test b/tests/01_engine-filter.test index c09df6d8..ddddf5dc 100755 --- a/tests/01_engine-filter.test +++ b/tests/01_engine-filter.test @@ -65,22 +65,29 @@ nomatch '*[\]]' "\\" match '*[\[\]]' '[' match '*[\[\]]' ']' nomatch '*[\[\]]' 'a' -match '*[\[,\]]' '[' # comma between members is optional +match '*[\[,\]]' '[' # comma between members is optional match '*[\[,\]]' ']' -match '*[a,\[]' 'a' # an escaped member no longer eats the preceding one +match '*[a,\[]' 'a' # an escaped member no longer eats the preceding one match '*[a,\[]' '[' # Escape is decoded before the range/separator/size checks, so '\-' '\,' '\<' # are literal members, not operators. match '*[a\-z]' 'a' match '*[a\-z]' 'z' -nomatch '*[a\-z]' 'b' # not the a..z range +nomatch '*[a\-z]' 'b' # not the a..z range match '*[\,]' ',' +nomatch '*[\,]' "\\" # the escape must not leak '\' into the class match '*[\<]' '<' +nomatch '*[\<]' "\\" match '*[\[,\],a]' '[' match '*[\[,\],a]' ']' match '*[\[,\],a]' 'a' +# A truncated range '*[a-' is the literal members {a,-}; the parser must not +# read past the end decoding it (was a 1-byte heap over-read in the range arm). +match '*[a-' 'a' +nomatch '*[a-' 'b' + # *(...) matches exactly one char from the class; *[...] matches a run. match '*(a,b)' 'a' nomatch '*(a,b)' 'aa' @@ -90,7 +97,7 @@ nomatch '*(a,b)' 'c' match 'www.*[path].com/*[path].zip' 'www.foo.com/a/b.zip' nomatch 'www.*[path].com/*[path].zip' 'www.foo.com/a/b.tar' match '*.html*[]' 'page.html' -nomatch '*.html*[]' 'page.html?x=1' # *[] forbids the trailing query +nomatch '*.html*[]' 'page.html?x=1' # *[] forbids the trailing query # Size-based rules (-#test=filtersize ): a negative size # means the size is still unknown (scan time). A size exclusion must stay neutral