Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion html/filters.html
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,7 @@ <h4>1.a. Scan rules based on URL or extension</h4>
<td>the \ character</td>
</tr>
<tr>
<td nowrap><tt>*[\[\]]</tt></td>
<td nowrap><tt>*[\[,\]]</tt></td>
<td>the [ or ] character</td>
</tr>
<tr>
Expand Down
16 changes: 10 additions & 6 deletions src/htsfilters.c
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,12 @@ HTS_INLINE const char *strjoker(const char *chaine, const char *joker, LLint * s
int len = (int) strlen(joker);

while((joker[i] != RIGHT) && (joker[i]) && (i < len)) {
if ((joker[i] == '<') || (joker[i] == '>')) { // *[<10]
// '\' escapes the next char as a literal member, e.g. *[\[\]]
if (joker[i] == '\\' && joker[i + 1] != '\0') {
i++;
pass[(int) (unsigned char) joker[i]] = 1;
i++;
} else if ((joker[i] == '<') || (joker[i] == '>')) { // *[<10]
int lsize = 0;
int lverdict;

Expand Down Expand Up @@ -221,7 +226,9 @@ HTS_INLINE const char *strjoker(const char *chaine, const char *joker, LLint * s
while(isdigit((unsigned char) joker[i]))
i++;
}
} else if (joker[i + 1] == '-') { // 2 car, ex: *[A-Z]
} else if (joker[i + 1] == '-' && joker[i + 2] != '\0') {
// range *[A-Z]; the '\0' guard rejects a truncated *[a- (else
// i+=3 overshoots the NUL)
if ((int) (unsigned char) joker[i + 2] >
(int) (unsigned char) joker[i]) {
int j;
Expand All @@ -233,10 +240,7 @@ HTS_INLINE const char *strjoker(const char *chaine, const char *joker, LLint * s
}
// else err=1;
i += 3;
} else { // 1 car, ex: *[ ]
if (joker[i + 2] == '\\' && joker[i + 3] != 0) { // escaped char, such as *[\[] or *[\]]
i++;
}
} else { // 1 car, ex: *[ ]
pass[(int) (unsigned char) joker[i]] = 1;
i++;
}
Expand Down
14 changes: 10 additions & 4 deletions src/htsselftest.c
Original file line number Diff line number Diff line change
Expand Up @@ -512,15 +512,21 @@ static int string_safety_selftests(void) {
/* ------------------------------------------------------------ */

/*
 * Self-test command: report whether a string matches a filter pattern.
 *
 * opt  - unused.
 * argc - number of entries in argv; at least 2 are required.
 * argv - argv[0] is the filter pattern, argv[1] is the string to test.
 *
 * Prints exactly one verdict line on stdout. Returns 0 once the verdict has
 * been printed (match or not), or 1 with a message on stderr when fewer than
 * two arguments are supplied.
 */
static int st_filter(httrackp *opt, int argc, char **argv) {
  char *str, *pat;
  int matched;

  (void) opt;
  if (argc < 2) {
    fprintf(stderr, "filter: needs a filter pattern and a string\n");
    return 1;
  }
  /* exact-size heap copies so a sanitizer traps any over-read of the pattern */
  str = strdupt(argv[1]);
  pat = strdupt(argv[0]);
  /* match once, on the copies only: a second match on raw argv would bypass
     the sanitizer check and emit a duplicate verdict line */
  matched = strjoker(str, pat, NULL, NULL) != NULL;
  /* the original argv strings are used for display only */
  printf("%s does %s %s\n", argv[1], matched ? "match" : "NOT match", argv[0]);
  freet(str);
  freet(pat);
  return 0;
}

Expand Down
63 changes: 45 additions & 18 deletions tests/01_engine-filter.test
Original file line number Diff line number Diff line change
Expand Up @@ -50,27 +50,54 @@ match '*foo*bar' 'foozbar'
# '?' is the query-string marker, not a single-char wildcard
nomatch 'a?c' 'abc'

# backslash escapes a metacharacter inside a class so it is matched literally.
# Quirk: the decoder also adds the backslash itself to the set, so '\X' matches
# both X and '\'. These assertions pin that behavior.
# Inside a class, backslash escapes the next char as a literal member (#148):
# '\X' matches X only (not '\'), and an escaped ']' is a member, not the terminator.
match '*[\*]' '*'
match '*[\*]' "\\"
nomatch '*[\*]' 'a'
nomatch '*[\*]' "\\"
match '*[\\]' "\\"
nomatch '*[\\]' 'a'
nomatch '*[\\]' '*'
match '*[\[]' '['
match '*[\[]' "\\"
nomatch '*[\[]' 'a'

# A literal ']' cannot be a class member: the class parser stops at the first
# ']', escaped or not. So '*[\[\]]' does NOT mean "the [ or ] character" as the
# filter guide claims (GitHub #148); it parses as the class {'[','\'} followed
# by a trailing literal ']'. These assertions document the current (buggy)
# behavior so any future matcher fix is a deliberate, visible change.
nomatch '*[\[\]]' '[' # not matched, despite the docs
match '*[\[\]]' ']' # only via the empty class-match + trailing ']'
match '*[\[\]]' '[]' # one of {'[','\'} then the trailing ']'
nomatch '*[\[\]]' '[]x'
nomatch '*[\[]' "\\"
match '*[\]]' ']'
nomatch '*[\]]' "\\"

# '*[\[\]]' is "the [ or ] character", as the filter guide documents.
match '*[\[\]]' '['
match '*[\[\]]' ']'
nomatch '*[\[\]]' 'a'
match '*[\[,\]]' '[' # comma between members is optional
match '*[\[,\]]' ']'
match '*[a,\[]' 'a' # an escaped member no longer eats the preceding one
match '*[a,\[]' '['

# Escape is decoded before the range/separator/size checks, so '\-' '\,' '\<'
# are literal members, not operators.
match '*[a\-z]' 'a'
match '*[a\-z]' 'z'
nomatch '*[a\-z]' 'b' # not the a..z range
match '*[\,]' ','
nomatch '*[\,]' "\\" # the escape must not leak '\' into the class
match '*[\<]' '<'
nomatch '*[\<]' "\\"
match '*[\[,\],a]' '['
match '*[\[,\],a]' ']'
match '*[\[,\],a]' 'a'

# A truncated range '*[a-' is the literal members {a,-}; the parser must not
# read past the end decoding it (was a 1-byte heap over-read in the range arm).
match '*[a-' 'a'
nomatch '*[a-' 'b'

# *(...) matches exactly one char from the class; *[...] matches a run.
match '*(a,b)' 'a'
nomatch '*(a,b)' 'aa'
nomatch '*(a,b)' 'c'

# documented composite filters (filters.html)
match 'www.*[path].com/*[path].zip' 'www.foo.com/a/b.zip'
nomatch 'www.*[path].com/*[path].zip' 'www.foo.com/a/b.tar'
match '*.html*[]' 'page.html'
nomatch '*.html*[]' 'page.html?x=1' # *[] forbids the trailing query

# Size-based rules (-#test=filtersize <size> <string> <filter...>): a negative size
# means the size is still unknown (scan time). A size exclusion must stay neutral
Expand Down
Loading