From e0d74976e7d21fca55cde44c36f734bbb59fdeef Mon Sep 17 00:00:00 2001
From: Xavier Roche <roche@httrack.com>
Date: Sun, 28 Jun 2026 12:08:41 +0200
Subject: [PATCH 1/2] filters: decode escaped chars correctly inside *[...]
 classes

The escape branch in strjoker probed joker[i+2] instead of the current
char, so a backslash escape only worked as the first class member:
'*[\[\]]' (documented as "the [ or ] character") matched only ']', and
'*[a,\[]' dropped the 'a'. The loop also treated any ']' as the class
terminator, so an escaped ']' could never be a member.

Decode the escape first in the loop body: a backslash takes the next char
as the literal member (only that char, not also the backslash the old code
added), and an escaped ']' is consumed before the terminator check. So
'*[\[\]]' now matches both brackets, and escape precedes the range/size
checks ('\-' '\,' '\<' become literal members). The self-test previously
pinned the buggy output as expected; it now asserts the documented
behavior and fails against the old matcher.

Closes #148

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Signed-off-by: Xavier Roche <roche@httrack.com>
---
 html/filters.html           |  2 +-
 src/htsfilters.c            | 14 ++++++----
 tests/01_engine-filter.test | 56 +++++++++++++++++++++++++------------
 3 files changed, 47 insertions(+), 25 deletions(-)
diff --git a/html/filters.html b/html/filters.html
index 18d6ddb8..1ec7b520 100644
--- a/html/filters.html
+++ b/html/filters.html
@@ -247,7 +247,7 @@ <h4>1.a. Scan rules based on URL or extension</h4>
         <td>the \ character</td>
       </tr>
       <tr>
-        <td nowrap><tt>*[\[\]]</tt></td>
+        <td nowrap><tt>*[\[,\]]</tt></td>
         <td>the [ or ] character</td>
       </tr>
       <tr>
diff --git a/src/htsfilters.c b/src/htsfilters.c
index 53d7f327..927932e4 100644
--- a/src/htsfilters.c
+++ b/src/htsfilters.c
@@ -193,7 +193,12 @@ HTS_INLINE const char *strjoker(const char *chaine, const char *joker, LLint * s
           int len = (int) strlen(joker);
 
           while((joker[i] != RIGHT) && (joker[i]) && (i < len)) {
-            if ((joker[i] == '<') || (joker[i] == '>')) {       // *[<10]
+            // '\' escapes the next char as a literal member, e.g. *[\[\]]
+            if (joker[i] == '\\' && joker[i + 1] != '\0') {
+              i++;
+              pass[(int) (unsigned char) joker[i]] = 1;
+              i++;
+            } else if ((joker[i] == '<') || (joker[i] == '>')) { // *[<10]
               int lsize = 0;
               int lverdict;
 
@@ -221,7 +226,7 @@ HTS_INLINE const char *strjoker(const char *chaine, const char *joker, LLint * s
                 while(isdigit((unsigned char) joker[i]))
                   i++;
               }
-            } else if (joker[i + 1] == '-') {   // 2 car, ex: *[A-Z]
+            } else if (joker[i + 1] == '-') { // 2 car, ex: *[A-Z]
               if ((int) (unsigned char) joker[i + 2] >
                   (int) (unsigned char) joker[i]) {
                 int j;
@@ -233,10 +238,7 @@ HTS_INLINE const char *strjoker(const char *chaine, const char *joker, LLint * s
               }
               // else err=1;
               i += 3;
-            } else {            // 1 car, ex: *[ ]
-              if (joker[i + 2] == '\\' && joker[i + 3] != 0) {  // escaped char, such as *[\[] or *[\]]
-                i++;
-              }
+            } else { // 1 car, ex: *[ ]
               pass[(int) (unsigned char) joker[i]] = 1;
               i++;
             }
diff --git a/tests/01_engine-filter.test b/tests/01_engine-filter.test
index 528592bb..c09df6d8 100755
--- a/tests/01_engine-filter.test
+++ b/tests/01_engine-filter.test
@@ -50,27 +50,47 @@ match '*foo*bar' 'foozbar'
 # '?' is the query-string marker, not a single-char wildcard
 nomatch 'a?c' 'abc'
 
-# backslash escapes a metacharacter inside a class so it is matched literally.
-# Quirk: the decoder also adds the backslash itself to the set, so '\X' matches
-# both X and '\'. These assertions pin that behavior.
+# Inside a class, backslash escapes the next char as a literal member (#148):
+# '\X' matches X only (not '\'), and an escaped ']' is a member, not the terminator.
 match '*[\*]' '*'
-match '*[\*]' "\\"
-nomatch '*[\*]' 'a'
+nomatch '*[\*]' "\\"
 match '*[\\]' "\\"
-nomatch '*[\\]' 'a'
+nomatch '*[\\]' '*'
 match '*[\[]' '['
-match '*[\[]' "\\"
-nomatch '*[\[]' 'a'
-
-# A literal ']' cannot be a class member: the class parser stops at the first
-# ']', escaped or not. So '*[\[\]]' does NOT mean "the [ or ] character" as the
-# filter guide claims (GitHub #148); it parses as the class {'[','\'} followed
-# by a trailing literal ']'. These assertions document the current (buggy)
-# behavior so any future matcher fix is a deliberate, visible change.
-nomatch '*[\[\]]' '[' # not matched, despite the docs
-match '*[\[\]]' ']'   # only via the empty class-match + trailing ']'
-match '*[\[\]]' '[]'  # one of {'[','\'} then the trailing ']'
-nomatch '*[\[\]]' '[]x'
+nomatch '*[\[]' "\\"
+match '*[\]]' ']'
+nomatch '*[\]]' "\\"
+
+# '*[\[\]]' is "the [ or ] character", as the filter guide documents.
+match '*[\[\]]' '['
+match '*[\[\]]' ']'
+nomatch '*[\[\]]' 'a'
+match '*[\[,\]]' '['   # comma between members is optional
+match '*[\[,\]]' ']'
+match '*[a,\[]' 'a'    # an escaped member no longer eats the preceding one
+match '*[a,\[]' '['
+
+# Escape is decoded before the range/separator/size checks, so '\-' '\,' '\<'
+# are literal members, not operators.
+match '*[a\-z]' 'a'
+match '*[a\-z]' 'z'
+nomatch '*[a\-z]' 'b'  # not the a..z range
+match '*[\,]' ','
+match '*[\<]' '<'
+match '*[\[,\],a]' '['
+match '*[\[,\],a]' ']'
+match '*[\[,\],a]' 'a'
+
+# *(...) matches exactly one char from the class; *[...] matches a run.
+match '*(a,b)' 'a'
+nomatch '*(a,b)' 'aa'
+nomatch '*(a,b)' 'c'
+
+# documented composite filters (filters.html)
+match 'www.*[path].com/*[path].zip' 'www.foo.com/a/b.zip'
+nomatch 'www.*[path].com/*[path].zip' 'www.foo.com/a/b.tar'
+match '*.html*[]' 'page.html'
+nomatch '*.html*[]' 'page.html?x=1'  # *[] forbids the trailing query
 
 # Size-based rules (-#test=filtersize <size> <string> <filter...>): a negative size
 # means the size is still unknown (scan time). A size exclusion must stay neutral

From c292454271f2ac9f9d7865d76c4fdb3e4c42e4f3 Mon Sep 17 00:00:00 2001
From: Xavier Roche <roche@httrack.com>
Date: Sun, 28 Jun 2026 12:24:09 +0200
Subject: [PATCH 2/2] filters: fix a 1-byte over-read on a truncated range *[a-

The *[...] class parser's range arm does i += 3 unconditionally, so a
pattern ending in a dangling '-' (e.g. *[a-) read one byte past the NUL:
joker[i+2] is the NUL, i jumps to len+1, and the separator skip and loop
guard then read joker[len+1]. Guard the range arm on joker[i+2] != '\0'
so a truncated range falls through to the literal-member path instead of
overshooting.

The filter self-test now copies the pattern and string into exact-size
heap buffers so a sanitizer traps such over-reads; the pattern previously
came straight from argv (no redzone), which is why this stayed invisible.
A *[a- test case exercises it.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Signed-off-by: Xavier Roche <roche@httrack.com>
---
 src/htsfilters.c            |  4 +++-
 src/htsselftest.c           | 14 ++++++++++----
 tests/01_engine-filter.test | 15 +++++++++++----
 3 files changed, 24 insertions(+), 9 deletions(-)

diff --git a/src/htsfilters.c b/src/htsfilters.c
index 927932e4..e07a5d82 100644
--- a/src/htsfilters.c
+++ b/src/htsfilters.c
@@ -226,7 +226,9 @@ HTS_INLINE const char *strjoker(const char *chaine, const char *joker, LLint * s
                 while(isdigit((unsigned char) joker[i]))
                   i++;
               }
-            } else if (joker[i + 1] == '-') { // 2 car, ex: *[A-Z]
+            } else if (joker[i + 1] == '-' && joker[i + 2] != '\0') {
+              // range *[A-Z]; the '\0' guard rejects a truncated *[a- (else
+              // i+=3 overshoots the NUL)
               if ((int) (unsigned char) joker[i + 2] >
                   (int) (unsigned char) joker[i]) {
                 int j;
diff --git a/src/htsselftest.c b/src/htsselftest.c
index aca1c96b..1ab68a8b 100644
--- a/src/htsselftest.c
+++ b/src/htsselftest.c
@@ -512,15 +512,21 @@ static int string_safety_selftests(void) {
 /* ------------------------------------------------------------ */
 
 static int st_filter(httrackp *opt, int argc, char **argv) {
+  char *str, *pat;
+  int matched;
+
   (void) opt;
   if (argc < 2) {
     fprintf(stderr, "filter: needs a filter pattern and a string\n");
     return 1;
   }
-  if (strjoker(argv[1], argv[0], NULL, NULL))
-    printf("%s does match %s\n", argv[1], argv[0]);
-  else
-    printf("%s does NOT match %s\n", argv[1], argv[0]);
+  /* exact-size heap copies so a sanitizer traps any over-read of the pattern */
+  str = strdupt(argv[1]);
+  pat = strdupt(argv[0]);
+  matched = strjoker(str, pat, NULL, NULL) != NULL;
+  printf("%s does %s %s\n", argv[1], matched ? "match" : "NOT match", argv[0]);
+  freet(str);
+  freet(pat);
   return 0;
 }
 
diff --git a/tests/01_engine-filter.test b/tests/01_engine-filter.test
index c09df6d8..ddddf5dc 100755
--- a/tests/01_engine-filter.test
+++ b/tests/01_engine-filter.test
@@ -65,22 +65,29 @@ nomatch '*[\]]' "\\"
 match '*[\[\]]' '['
 match '*[\[\]]' ']'
 nomatch '*[\[\]]' 'a'
-match '*[\[,\]]' '['   # comma between members is optional
+match '*[\[,\]]' '[' # comma between members is optional
 match '*[\[,\]]' ']'
-match '*[a,\[]' 'a'    # an escaped member no longer eats the preceding one
+match '*[a,\[]' 'a' # an escaped member no longer eats the preceding one
 match '*[a,\[]' '['
 
 # Escape is decoded before the range/separator/size checks, so '\-' '\,' '\<'
 # are literal members, not operators.
 match '*[a\-z]' 'a'
 match '*[a\-z]' 'z'
-nomatch '*[a\-z]' 'b'  # not the a..z range
+nomatch '*[a\-z]' 'b' # not the a..z range
 match '*[\,]' ','
+nomatch '*[\,]' "\\" # the escape must not leak '\' into the class
 match '*[\<]' '<'
+nomatch '*[\<]' "\\"
 match '*[\[,\],a]' '['
 match '*[\[,\],a]' ']'
 match '*[\[,\],a]' 'a'
 
+# A truncated range '*[a-' is the literal members {a,-}; the parser must not
+# read past the end decoding it (was a 1-byte heap over-read in the range arm).
+match '*[a-' 'a'
+nomatch '*[a-' 'b'
+
 # *(...) matches exactly one char from the class; *[...] matches a run.
 match '*(a,b)' 'a'
 nomatch '*(a,b)' 'aa'
@@ -90,7 +97,7 @@ nomatch '*(a,b)' 'c'
 match 'www.*[path].com/*[path].zip' 'www.foo.com/a/b.zip'
 nomatch 'www.*[path].com/*[path].zip' 'www.foo.com/a/b.tar'
 match '*.html*[]' 'page.html'
-nomatch '*.html*[]' 'page.html?x=1'  # *[] forbids the trailing query
+nomatch '*.html*[]' 'page.html?x=1' # *[] forbids the trailing query
 
 # Size-based rules (-#test=filtersize <size> <string> <filter...>): a negative size
 # means the size is still unknown (scan time). A size exclusion must stay neutral