From 6d1b67792eaf69b3caa8e6b083dc40417743497e Mon Sep 17 00:00:00 2001 From: Xavier Roche Date: Sat, 27 Jun 2026 09:52:13 +0200 Subject: [PATCH] Add --strip-query to drop query keys from dedup naming (#112) Two URLs that differ only in tracking or session query parameters (?utm_source=x versus ?utm_source=y) were saved as separate files, and a single CGI could fan out into thousands of near-duplicate pages. fil_normalized already sorted query args, so reordered parameters dedup, but there was no way to drop a named key. --strip-query "[host/pattern=]key1,key2,..." (repeatable) removes the listed keys when computing the dedup key and the saved name. The fetched URL is untouched, so a required sid= is still sent on the wire; only the local namespace collapses. Patterns match the normalized host/path with the +/- filter glob (strjoker), last match wins as in the filter list, and stripping is decoupled from urlhack (-%u) so it never silently no-ops with -%u0. It all funnels through one chokepoint, fil_normalized: an internal fil_normalized_filtered() strips then delegates, and hts_query_strip_keys resolves the per-URL key list. The strip pass walks every query field, including empty and trailing ones, so its output is a fixpoint under the read path's second normalization (otherwise dedup silently misses). Exported ABI is unchanged; the strip_query field is appended at the tail of httrackp. Covered by a -#test=stripquery self-test (degenerate queries like a=&b&c== and a 78-case idempotency fixpoint: 13 queries x 6 strip lists) and an end-to-end dedup crawl test. 
Closes #112 Co-Authored-By: Claude Opus 4.8 Signed-off-by: Xavier Roche --- man/httrack.1 | 5 +- src/htsalias.c | 6 ++ src/htscore.c | 3 + src/htscore.h | 13 +++ src/htscoremain.c | 15 +++ src/htshash.c | 38 +++++-- src/htshelp.c | 1 + src/htslib.c | 138 ++++++++++++++++++++++++ src/htsname.c | 12 ++- src/htsopt.h | 2 + src/htsselftest.c | 122 +++++++++++++++++++++ tests/01_engine-stripquery.test | 8 ++ tests/26_local-strip-query.test | 18 ++++ tests/Makefile.am | 5 +- tests/server-root/stripquery/a.html | 1 + tests/server-root/stripquery/index.html | 5 + 16 files changed, 379 insertions(+), 13 deletions(-) create mode 100755 tests/01_engine-stripquery.test create mode 100755 tests/26_local-strip-query.test create mode 100644 tests/server-root/stripquery/a.html create mode 100644 tests/server-root/stripquery/index.html diff --git a/man/httrack.1 b/man/httrack.1 index cd6b71cc..0e193aeb 100644 --- a/man/httrack.1 +++ b/man/httrack.1 @@ -3,7 +3,7 @@ .\" .\" This file is generated by man/makeman.sh; do not edit by hand. 
.\" SPDX-License-Identifier: GPL-3.0-or-later -.TH httrack 1 "26 June 2026" "httrack website copier" +.TH httrack 1 "27 June 2026" "httrack website copier" .SH NAME httrack \- offline browser : copy websites to a local directory .SH SYNOPSIS @@ -43,6 +43,7 @@ httrack \- offline browser : copy websites to a local directory [ \fB\-x, \-\-replace\-external\fR ] [ \fB\-%x, \-\-disable\-passwords\fR ] [ \fB\-%q, \-\-include\-query\-string\fR ] +[ \fB\-%g, \-\-strip\-query\fR ] [ \fB\-o, \-\-generate\-errors\fR ] [ \fB\-X, \-\-purge\-old[=N]\fR ] [ \fB\-%p, \-\-preserve\fR ] @@ -198,6 +199,8 @@ replace external html links by error pages (\-\-replace\-external) do not include any password for external password protected websites (%x0 include) (\-\-disable\-passwords) .IP \-%q *include query string for local files (useless, for information purpose only) (%q0 don't include) (\-\-include\-query\-string) +.IP \-%g +strip query keys for dedup ([host/pattern=]key1,key2,...) (\-\-strip\-query ) .IP \-o *generate output html file in case of error (404..) (o0 don't generate) (\-\-generate\-errors) .IP \-X diff --git a/src/htsalias.c b/src/htsalias.c index 04918d3b..69400089 100644 --- a/src/htsalias.c +++ b/src/htsalias.c @@ -60,6 +60,9 @@ Please visit our Website: http://www.httrack.com param1 : this option must be alone, and needs one distinct parameter (-P ) param0 : this option must be alone, but the parameter should be put together (+*.gif) */ +/* clang-format off: hand-aligned table; clang-format reflows the whole + initializer (2->4 space) on any edit, churning every untouched row. */ +/* clang-format off */ const char *hts_optalias[][4] = { /* {"","","",""}, */ {"path", "-O", "param1", "output path"}, @@ -107,6 +110,8 @@ const char *hts_optalias[][4] = { {"disable-passwords", "-%x", "single", ""}, {"disable-password", "-%x", "single", ""}, {"include-query-string", "-%q", "single", ""}, + {"strip-query", "-%g", "param1", + "strip [host/pattern=]key1,key2,... 
from URLs"}, {"generate-errors", "-o", "single", ""}, {"do-not-generate-errors", "-o0", "single", ""}, {"purge-old", "-X", "param", ""}, @@ -241,6 +246,7 @@ const char *hts_optalias[][4] = { {"", "", "", ""} }; +/* clang-format on */ /* Check for alias in command-line diff --git a/src/htscore.c b/src/htscore.c index 2c79d70c..4152ebda 100644 --- a/src/htscore.c +++ b/src/htscore.c @@ -3739,6 +3739,9 @@ HTSEXT_API int copy_htsopt(const httrackp * from, httrackp * to) { if (StringNotEmpty(from->user_agent)) StringCopyS(to->user_agent, from->user_agent); + if (StringNotEmpty(from->strip_query)) + StringCopyS(to->strip_query, from->strip_query); + if (from->retry > -1) to->retry = from->retry; diff --git a/src/htscore.h b/src/htscore.h index 1f7b3c1c..31600727 100644 --- a/src/htscore.h +++ b/src/htscore.h @@ -236,6 +236,8 @@ struct hash_struct { coucal former_adrfil; /* scratch buffers reused across lookups (not reentrant) */ int normalized; + /* query-strip keys (not owned); set from opt->strip_query at hash_init */ + const char *strip_query; char normfil[HTS_URLMAXSIZE * 2]; char normfil2[HTS_URLMAXSIZE * 2]; char catbuff[CATBUFF_SIZE]; @@ -364,6 +366,17 @@ int fspc(httrackp * opt, FILE * fp, const char *type); char *next_token(char *p, int flag); +/* Like fil_normalized(), but first drops query keys in STRIP (comma-separated, + "*" = all); STRIP NULL/empty behaves exactly like fil_normalized(). */ +char *fil_normalized_filtered(const char *source, char *dest, + const char *strip); + +/* For URL ADR/FIL, return (in DEST) the comma keylist to strip from the + '\n'-separated "[pattern=]keys" RULES (patterns matched on host/path via + strjoker, last wins); NULL if none match. Feeds fil_normalized_filtered(). */ +const char *hts_query_strip_keys(const char *rules, const char *adr, + const char *fil, char *dest, size_t destsize); + /* Read a whole file into a freshly malloc'd, NUL-terminated buffer; the caller owns it and must release it with freet(). 
Return NULL on missing/unreadable file (readfile_or substitutes defaultdata instead). The byte content is NOT diff --git a/src/htscoremain.c b/src/htscoremain.c index 287aa560..62153238 100644 --- a/src/htscoremain.c +++ b/src/htscoremain.c @@ -1937,6 +1937,21 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) { } break; + case 'g': // strip-query: accumulate "[pattern=]keys" entries + if ((na + 1 >= argc) || (argv[na + 1][0] == '-')) { + HTS_PANIC_PRINTF("Option strip-query needs a blank space and " + "[host/pattern=]key1,key2,..."); + printf("Example: --strip-query " + "\"www.example.com/*=utm_source,sid\"\n"); + htsmain_free(); + return -1; + } else { + na++; + if (StringNotEmpty(opt->strip_query)) + StringCat(opt->strip_query, "\n"); + StringCat(opt->strip_query, argv[na]); + } + break; case 't': /* do not change type (ending) of filenames according to the MIME type */ opt->no_type_change = 1; if (*(com+1)=='0') { opt->no_type_change = 0; com++; } diff --git a/src/htshash.c b/src/htshash.c index 4aaf9d6b..96506c91 100644 --- a/src/htshash.c +++ b/src/htshash.c @@ -117,10 +117,17 @@ static coucal_hashkeys key_adrfil_hashes_generic(void *arg, // copy link assertf(fil != NULL); - if (hash->normalized) { - fil_normalized(fil, &hash->normfil[strlen(hash->normfil)]); - } else { - strcpy(&hash->normfil[strlen(hash->normfil)], fil); + { + /* resolve the per-URL strip keys; strip applies even when urlhack is off */ + char BIGSTK keybuf[HTS_URLMAXSIZE]; + const char *const keys = hts_query_strip_keys(hash->strip_query, adr, fil, + keybuf, sizeof(keybuf)); + + if (hash->normalized || keys != NULL) { + fil_normalized_filtered(fil, &hash->normfil[strlen(hash->normfil)], keys); + } else { + strcpy(&hash->normfil[strlen(hash->normfil)], fil); + } } // hash @@ -161,12 +168,20 @@ static int key_adrfil_equals_generic(void *arg, } // now compare pathes - if (normalized) { - fil_normalized(a_fil, hash->normfil); - fil_normalized(b_fil, hash->normfil2); - return 
strcmp(hash->normfil, hash->normfil2) == 0; - } else { - return strcmp(a_fil, b_fil) == 0; + { + char BIGSTK ka[HTS_URLMAXSIZE], kb[HTS_URLMAXSIZE]; + const char *const keysa = + hts_query_strip_keys(hash->strip_query, a_adr, a_fil, ka, sizeof(ka)); + const char *const keysb = + hts_query_strip_keys(hash->strip_query, b_adr, b_fil, kb, sizeof(kb)); + + if (normalized || keysa != NULL || keysb != NULL) { + fil_normalized_filtered(a_fil, hash->normfil, keysa); + fil_normalized_filtered(b_fil, hash->normfil2, keysb); + return strcmp(hash->normfil, hash->normfil2) == 0; + } else { + return strcmp(a_fil, b_fil) == 0; + } } } @@ -227,6 +242,9 @@ void hash_init(httrackp *opt, hash_struct * hash, int normalized) { hash->adrfil = coucal_new(0); hash->former_adrfil = coucal_new(0); hash->normalized = normalized; + /* snapshot the query-strip list (not owned; valid for the hash lifetime) */ + hash->strip_query = + StringNotEmpty(opt->strip_query) ? StringBuff(opt->strip_query) : NULL; hts_set_hash_handler(hash->sav, opt); hts_set_hash_handler(hash->adrfil, opt); diff --git a/src/htshelp.c b/src/htshelp.c index a4e5bc4c..e0532c4d 100644 --- a/src/htshelp.c +++ b/src/htshelp.c @@ -563,6 +563,7 @@ void help(const char *app, int more) { (" %x do not include any password for external password protected websites (%x0 include)"); infomsg (" %q *include query string for local files (useless, for information purpose only) (%q0 don't include)"); + infomsg(" %g strip query keys for dedup ([host/pattern=]key1,key2,...)"); infomsg (" o *generate output html file in case of error (404..) (o0 don't generate)"); infomsg(" X *purge old files after update (X0 keep delete)"); diff --git a/src/htslib.c b/src/htslib.c index 5f951555..1a88b62f 100644 --- a/src/htslib.c +++ b/src/htslib.c @@ -3681,6 +3681,142 @@ HTSEXT_API char *fil_normalized(const char *source, char *dest) { return dest; } +/* Is query key ARG[0..keylen) in the comma-separated STRIP list? 
"*" = all; + case-sensitive, space-trimmed tokens. */ +static int hts_query_key_stripped(const char *arg, size_t keylen, + const char *strip) { + const char *p = strip; + + while (*p != '\0') { + const char *start = p; + size_t toklen; + + while (*p != '\0' && *p != ',') + p++; + toklen = (size_t) (p - start); + while (toklen > 0 && *start == ' ') { + start++; + toklen--; + } + while (toklen > 0 && start[toklen - 1] == ' ') + toklen--; + if (toklen == 1 && start[0] == '*') + return 1; + if (toklen == keylen && strncmp(start, arg, keylen) == 0) + return 1; + if (*p == ',') + p++; + } + return 0; +} + +/* see htscore.h */ +char *fil_normalized_filtered(const char *source, char *dest, + const char *strip) { + const char *query; + char BIGSTK tmp[HTS_URLMAXSIZE * 2]; + htsbuff cb; + int wrote = 0; + + /* No strip list, or no query: plain normalization. */ + if (strip == NULL || *strip == '\0' || + (query = strchr(source, '?')) == NULL) { + return fil_normalized(source, dest); + } + + /* Copy the path, re-emit kept query args, let fil_normalized() sort. Walk + every field incl. empty/trailing ("a&","?&&") so the result is a fixpoint + (the read re-normalizes it; a dropped empty arg would miss dedup). */ + cb = htsbuff_ptr(tmp, sizeof(tmp)); + htsbuff_catn(&cb, source, (size_t) (query - source)); + for (query++;;) { + const char *const arg = query; + const char *eq = NULL; + size_t keylen, arglen; + + while (*query != '\0' && *query != '&') { + if (eq == NULL && *query == '=') + eq = query; + query++; + } + arglen = (size_t) (query - arg); + keylen = eq != NULL ? (size_t) (eq - arg) : arglen; + if (!hts_query_key_stripped(arg, keylen, strip)) { + htsbuff_catc(&cb, wrote ? 
'&' : '?'); + htsbuff_catn(&cb, arg, arglen); + wrote = 1; + } + if (*query == '\0') + break; + query++; + } + return fil_normalized(tmp, dest); +} + +/* see htscore.h */ +const char *hts_query_strip_keys(const char *rules, const char *adr, + const char *fil, char *dest, size_t destsize) { + const char *p, *q; + const char *result = NULL; + char BIGSTK url[HTS_URLMAXSIZE * 2]; + + if (rules == NULL || *rules == '\0' || destsize == 0) + return NULL; + + /* Match string = normalized host/path, query removed. jump_normalized_const + collapses www+scheme/auth so read and write (double-normalized) agree; + query excluded keeps the decision on host/path only. */ + url[0] = '\0'; + strcatbuff(url, jump_normalized_const(adr)); + if (fil[0] != '/') + strcatbuff(url, "/"); + q = strchr(fil, '?'); + if (q != NULL) + strncatbuff(url, fil, (int) (q - fil)); + else + strcatbuff(url, fil); + + /* Walk the '\n' entries; last match wins (like the +/- filter eval). Each is + "pattern=keys"; no '=' is the bare form, pattern "*". */ + for (p = rules; *p != '\0';) { + const char *const line = p; + const char *eol, *eq, *keys; + char BIGSTK pat[HTS_URLMAXSIZE * 2]; + + while (*p != '\0' && *p != '\n') + p++; + eol = p; + if (*p == '\n') + p++; + if (eol == line) + continue; + eq = memchr(line, '=', (size_t) (eol - line)); + if (eq != NULL) { + size_t patlen = (size_t) (eq - line); + + if (patlen >= sizeof(pat)) + patlen = sizeof(pat) - 1; + memcpy(pat, line, patlen); + pat[patlen] = '\0'; + keys = eq + 1; + } else { + pat[0] = '*'; + pat[1] = '\0'; + keys = line; + } + if (strjoker(url, pat, NULL, NULL) != NULL) { + size_t klen = (size_t) (eol - keys); + + if (klen >= destsize) + klen = destsize - 1; + memcpy(dest, keys, klen); + dest[klen] = '\0'; + result = dest; + } + } + return result; +} + #define endwith(a) ( (len >= (sizeof(a)-1)) ? 
( strncmp(dest, a+len-(sizeof(a)-1), sizeof(a)-1) == 0 ) : 0 ); HTSEXT_API char *adr_normalized_sized(const char *source, char *dest, size_t destsize) { @@ -5891,6 +6027,7 @@ HTSEXT_API httrackp *hts_create_opt(void) { opt->sizehack = HTS_FALSE; opt->urlhack = HTS_TRUE; StringCopy(opt->footer, HTS_DEFAULT_FOOTER); + StringCopy(opt->strip_query, ""); opt->ftp_proxy = HTS_TRUE; opt->convert_utf8 = HTS_TRUE; StringCopy(opt->filelist, ""); @@ -6035,6 +6172,7 @@ HTSEXT_API void hts_free_opt(httrackp * opt) { StringFree(opt->urllist); StringFree(opt->footer); StringFree(opt->mod_blacklist); + StringFree(opt->strip_query); StringFree(opt->path_html); StringFree(opt->path_html_utf8); diff --git a/src/htsname.c b/src/htsname.c index 4ec2ae82..c6ee007e 100644 --- a/src/htsname.c +++ b/src/htsname.c @@ -198,6 +198,13 @@ int url_savename(lien_adrfilsave *const afs, // copy of fil, used for lookups (see urlhack) const char *normadr = adr; const char *normfil = fil_complete; + /* query keys to strip for this URL (NULL = none); decoupled from urlhack */ + char BIGSTK stripkeys[HTS_URLMAXSIZE]; + const char *const strip = + StringNotEmpty(opt->strip_query) + ? 
hts_query_strip_keys(StringBuff(opt->strip_query), adr, + fil_complete, stripkeys, sizeof(stripkeys)) + : NULL; const char *const print_adr = jump_protocol_const(adr); const char *start_pos = NULL, *nom_pos = NULL, *dot_pos = NULL; // Position nom et point @@ -232,7 +239,7 @@ int url_savename(lien_adrfilsave *const afs, if (opt->urlhack) { // copy of adr (without protocol), used for lookups (see urlhack) normadr = adr_normalized_sized(adr, normadr_, sizeof(normadr_)); - normfil = fil_normalized(fil_complete, normfil_); + normfil = fil_normalized_filtered(fil_complete, normfil_, strip); } else { if (link_has_authority(adr_complete)) { // https or other protocols : in "http/" subfolder char *pos = strchr(adr_complete, ':'); @@ -245,6 +252,9 @@ int url_savename(lien_adrfilsave *const afs, normadr = normadr_; } } + // strip still applies with urlhack off (host left untouched) + if (strip != NULL) + normfil = fil_normalized_filtered(fil_complete, normfil_, strip); } // à afficher sans ftp:// diff --git a/src/htsopt.h b/src/htsopt.h index 4b646111..eabd070a 100644 --- a/src/htsopt.h +++ b/src/htsopt.h @@ -529,6 +529,8 @@ struct httrackp { htslibhandles libHandles; /**< loaded external module handles */ // htsoptstate state; /**< embedded live engine state */ + String strip_query; /**< query keys to drop when deduping URLs (-strip-query); + appended at the tail to keep field offsets stable */ }; /* Running statistics for a mirror. */ diff --git a/src/htsselftest.c b/src/htsselftest.c index 7e8c17ed..8c194c9e 100644 --- a/src/htsselftest.c +++ b/src/htsselftest.c @@ -1052,6 +1052,126 @@ static int st_cookies(httrackp *opt, int argc, char **argv) { return err; } +/* --strip-query: resolver + fil_normalized_filtered, end to end. 
*/ +static int st_stripquery(httrackp *opt, int argc, char **argv) { + char dest[1024], keys[256], ref[1024]; + const char *k; + + (void) opt; + (void) argc; + (void) argv; + + /* empty rules == plain fil_normalized */ + assertf(hts_query_strip_keys(NULL, "h.com", "/p?a=1", keys, sizeof(keys)) == + NULL); + assertf(hts_query_strip_keys("", "h.com", "/p?a=1", keys, sizeof(keys)) == + NULL); + assertf(strcmp(fil_normalized_filtered("/p?b=2&a=1", dest, NULL), + fil_normalized("/p?b=2&a=1", ref)) == 0); + + /* bare form (*=keys): strip the key everywhere, keep+sort the rest */ + k = hts_query_strip_keys("sid", "any.com", "/p?b=2&sid=x&a=1", keys, + sizeof(keys)); + assertf(k != NULL && strcmp(k, "sid") == 0); + assertf(strcmp(fil_normalized_filtered("/p?b=2&sid=x&a=1", dest, k), + "/p?a=1&b=2") == 0); + + /* reordered variant + an extra stripped key == the clean URL */ + assertf(strcmp(fil_normalized_filtered("/p?sid=y&a=1&b=2", dest, "sid"), + fil_normalized("/p?a=1&b=2", ref)) == 0); + + /* host pattern matches only that host, incl. 
its www-normalized forms */ + assertf(hts_query_strip_keys("ex.com/*=utm", "other.com", "/p?utm=1", keys, + sizeof(keys)) == NULL); + assertf(hts_query_strip_keys("ex.com/*=utm", "ex.com", "/p?utm=1", keys, + sizeof(keys)) != NULL); + assertf(hts_query_strip_keys("ex.com/*=utm", "www.ex.com", "/p?utm=1", keys, + sizeof(keys)) != NULL); + assertf(hts_query_strip_keys("ex.com/*=utm", "http://www-3.ex.com", + "/p?utm=1", keys, sizeof(keys)) != NULL); + + /* last match wins, wholesale: host rule overrides global, no union */ + k = hts_query_strip_keys("*=sid\nex.com/*=utm", "ex.com", + "/p?sid=1&utm=2&a=3", keys, sizeof(keys)); + assertf(k != NULL && strcmp(k, "utm") == 0); + assertf(strcmp(fil_normalized_filtered("/p?sid=1&utm=2&a=3", dest, k), + "/p?a=3&sid=1") == 0); + k = hts_query_strip_keys("*=sid\nex.com/*=utm", "z.com", "/p?sid=1&a=3", keys, + sizeof(keys)); + assertf(k != NULL && strcmp(k, "sid") == 0); + + /* whole-key match, not prefix: "utm" must not strip utm_source */ + assertf(strcmp(fil_normalized_filtered("/p?utm_source=x&a=1", dest, "utm"), + "/p?a=1&utm_source=x") == 0); + + /* "*" drops every param; a fully-stripped single-arg query loses its '?' 
*/ + assertf(strcmp(fil_normalized_filtered("/p?a=1&b=2", dest, "*"), "/p") == 0); + assertf(strcmp(fil_normalized_filtered("/p?utm=1", dest, "utm"), "/p") == 0); + + /* degenerate forms a=, b, c== (key 'c'); strip c keeps a= and b */ + assertf(strcmp(fil_normalized_filtered("/p?a=&b&c==", dest, "c"), + "/p?a=&b") == 0); + /* short key must not strip a longer one: 'c' must not touch 'cc' */ + assertf(strcmp(fil_normalized_filtered("/p?cc=1&c=2", dest, "c"), + "/p?cc=1") == 0); + + /* repeated key: every occurrence is stripped, not just the first */ + assertf( + strcmp(fil_normalized_filtered("/p?foo=42&bar=13&foo=43", dest, "foo"), + "/p?bar=13") == 0); + /* repeated key mixing missing/empty values */ + assertf( + strcmp(fil_normalized_filtered("/p?foo&bar=13&foo=42&foo=", dest, "foo"), + "/p?bar=13") == 0); + /* repeated key kept (no match): all occurrences retained, then sorted */ + assertf(strcmp(fil_normalized_filtered("/p?foo=42&bar=13&foo=43", dest, "z"), + "/p?bar=13&foo=42&foo=43") == 0); + + /* value containing '=': the key is only the part before the first '='. Strip + 'foo' drops "foo=42=17" whole; the '=' in the value is not a delimiter. */ + assertf(strcmp(fil_normalized_filtered("/p?foo=42=17&bar=", dest, "foo"), + "/p?bar=") == 0); + /* keeping it preserves the embedded '=' verbatim */ + assertf(strcmp(fil_normalized_filtered("/p?foo=42=17&bar=", dest, "bar"), + "/p?foo=42=17") == 0); + /* a value segment is not a key: stripping "42" must not touch foo=42=17 */ + assertf(strcmp(fil_normalized_filtered("/p?foo=42=17", dest, "42"), + "/p?foo=42=17") == 0); + + /* Idempotency: the read path re-normalizes an already-normalized fil, so the + result must be a fixpoint or dedup misses (catches a dropped empty/trailing + arg like "?&&", "a&"). 
*/ + { + static const char *const qs[] = {"/p?a=&b&c==", + "/p?a&&b", + "/p?&a", + "/p?a&", + "/p?", + "/p?=v", + "/p?&&", + "/p?b=2&a=1", + "/p?utm=x&", + "/p?&utm=x", + "/p?foo=42&bar=13&foo=43", + "/p?foo&bar=13&foo=42&foo=", + "/p?foo=42=17&bar="}; + static const char *const strips[] = {NULL, "z", "utm", "*", "a", "foo"}; + char once[1024], twice[1024]; + size_t i, j; + + for (i = 0; i < sizeof(qs) / sizeof(qs[0]); i++) { + for (j = 0; j < sizeof(strips) / sizeof(strips[0]); j++) { + fil_normalized_filtered(qs[i], once, strips[j]); + fil_normalized_filtered(once, twice, strips[j]); + assertf(strcmp(once, twice) == 0); + } + } + } + + printf("strip-query self-test OK\n"); + return 0; +} + /* ------------------------------------------------------------ */ /* Registry: name -> handler, with a usage hint and a one-line description. */ /* ------------------------------------------------------------ */ @@ -1068,6 +1188,8 @@ static const struct selftest_entry { "size-aware filter verdict (negative size = unknown/scan time)", st_filtersize}, {"simplify", "", "collapse ./ and ../ in a path", st_simplify}, + {"stripquery", "", "--strip-query pattern/key stripping self-test", + st_stripquery}, {"mime", "", "MIME type for a filename", st_mime}, {"charset", " ", "convert a string to UTF-8 from a charset", st_charset}, diff --git a/tests/01_engine-stripquery.test b/tests/01_engine-stripquery.test new file mode 100755 index 00000000..040d239f --- /dev/null +++ b/tests/01_engine-stripquery.test @@ -0,0 +1,8 @@ +#!/bin/bash +# + +set -euo pipefail + +# --strip-query: pattern-scoped query-key stripping for dedup. All assertions +# live in the engine self-test (hts_query_strip_keys + fil_normalized_filtered). 
+httrack -O /dev/null -#test=stripquery | grep -q "strip-query self-test OK" diff --git a/tests/26_local-strip-query.test b/tests/26_local-strip-query.test new file mode 100755 index 00000000..02a8eb3e --- /dev/null +++ b/tests/26_local-strip-query.test @@ -0,0 +1,18 @@ +#!/bin/bash +# +# End-to-end --strip-query (#112): two links to one resource differing only by +# ?utm_source dedup to a single saved file (2 files written: index + resource); +# the control crawl without the option keeps both variants (3 files). Locks the +# CLI->opt->hash plumbing the engine self-test can't reach. + +set -e + +: "${top_srcdir:=..}" + +# stripped: the two ?utm_source variants collapse to one resource +bash "$top_srcdir/tests/local-crawl.sh" --errors 0 --files 2 \ + httrack 'BASEURL/stripquery/index.html' --strip-query 'utm_source' + +# control: no stripping -> both query-named variants are saved +bash "$top_srcdir/tests/local-crawl.sh" --errors 0 --files 3 \ + httrack 'BASEURL/stripquery/index.html' diff --git a/tests/Makefile.am b/tests/Makefile.am index 2ff00010..9d7e265c 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -5,6 +5,7 @@ EXTRA_DIST = $(TESTS) crawl-test.sh run-all-tests.sh check-network.sh \ proxy-https-server.py \ local-crawl.sh local-server.py server.crt server.key \ server-root/simple/basic.html server-root/simple/link.html \ + server-root/stripquery/index.html server-root/stripquery/a.html \ fixtures/cache-golden/hts-cache/new.zip TESTS_ENVIRONMENT = @@ -45,6 +46,7 @@ TESTS = \ 01_engine-savename.test \ 01_engine-selftest-dispatch.test \ 01_engine-simplify.test \ + 01_engine-stripquery.test \ 01_engine-strsafe.test \ 02_manpage-regen.test \ 02_update-cache.test \ @@ -68,6 +70,7 @@ TESTS = \ 22_local-broken-size.test \ 23_local-errpage.test \ 24_local-resume-overlap.test \ - 25_local-mime-exclude.test + 25_local-mime-exclude.test \ + 26_local-strip-query.test CLEANFILES = check-network_sh.cache diff --git a/tests/server-root/stripquery/a.html 
b/tests/server-root/stripquery/a.html new file mode 100644 index 00000000..457f673f --- /dev/null +++ b/tests/server-root/stripquery/a.html @@ -0,0 +1 @@ +resource A diff --git a/tests/server-root/stripquery/index.html b/tests/server-root/stripquery/index.html new file mode 100644 index 00000000..601da15a --- /dev/null +++ b/tests/server-root/stripquery/index.html @@ -0,0 +1,5 @@ + +Two links to one resource, differing only by a tracking parameter. +x +y +