diff --git a/man/httrack.1 b/man/httrack.1
index cd6b71cc..0e193aeb 100644
--- a/man/httrack.1
+++ b/man/httrack.1
@@ -3,7 +3,7 @@
.\"
.\" This file is generated by man/makeman.sh; do not edit by hand.
.\" SPDX-License-Identifier: GPL-3.0-or-later
-.TH httrack 1 "26 June 2026" "httrack website copier"
+.TH httrack 1 "27 June 2026" "httrack website copier"
.SH NAME
httrack \- offline browser : copy websites to a local directory
.SH SYNOPSIS
@@ -43,6 +43,7 @@ httrack \- offline browser : copy websites to a local directory
[ \fB\-x, \-\-replace\-external\fR ]
[ \fB\-%x, \-\-disable\-passwords\fR ]
[ \fB\-%q, \-\-include\-query\-string\fR ]
+[ \fB\-%g, \-\-strip\-query\fR ]
[ \fB\-o, \-\-generate\-errors\fR ]
[ \fB\-X, \-\-purge\-old[=N]\fR ]
[ \fB\-%p, \-\-preserve\fR ]
@@ -198,6 +199,8 @@ replace external html links by error pages (\-\-replace\-external)
do not include any password for external password protected websites (%x0 include) (\-\-disable\-passwords)
.IP \-%q
*include query string for local files (useless, for information purpose only) (%q0 don't include) (\-\-include\-query\-string)
+.IP \-%g
+strip query keys for dedup ([host/pattern=]key1,key2,...) (\-\-strip\-query <param>)
.IP \-o
*generate output html file in case of error (404..) (o0 don't generate) (\-\-generate\-errors)
.IP \-X
diff --git a/src/htsalias.c b/src/htsalias.c
index 04918d3b..69400089 100644
--- a/src/htsalias.c
+++ b/src/htsalias.c
@@ -60,6 +60,9 @@ Please visit our Website: http://www.httrack.com
param1 : this option must be alone, and needs one distinct parameter (-P )
param0 : this option must be alone, but the parameter should be put together (+*.gif)
*/
+/* Formatting disabled below (hand-aligned table): on any edit clang-format
+   reflows the whole initializer (2->4 space), churning every untouched row. */
+/* clang-format off */
const char *hts_optalias[][4] = {
/* {"","","",""}, */
{"path", "-O", "param1", "output path"},
@@ -107,6 +110,8 @@ const char *hts_optalias[][4] = {
{"disable-passwords", "-%x", "single", ""}, {"disable-password", "-%x",
"single", ""},
{"include-query-string", "-%q", "single", ""},
+ {"strip-query", "-%g", "param1",
+ "strip [host/pattern=]key1,key2,... from URLs"},
{"generate-errors", "-o", "single", ""},
{"do-not-generate-errors", "-o0", "single", ""},
{"purge-old", "-X", "param", ""},
@@ -241,6 +246,7 @@ const char *hts_optalias[][4] = {
{"", "", "", ""}
};
+/* clang-format on */
/*
Check for alias in command-line
diff --git a/src/htscore.c b/src/htscore.c
index 2c79d70c..4152ebda 100644
--- a/src/htscore.c
+++ b/src/htscore.c
@@ -3739,6 +3739,9 @@ HTSEXT_API int copy_htsopt(const httrackp * from, httrackp * to) {
if (StringNotEmpty(from->user_agent))
StringCopyS(to->user_agent, from->user_agent);
+ if (StringNotEmpty(from->strip_query))
+ StringCopyS(to->strip_query, from->strip_query);
+
if (from->retry > -1)
to->retry = from->retry;
diff --git a/src/htscore.h b/src/htscore.h
index 1f7b3c1c..31600727 100644
--- a/src/htscore.h
+++ b/src/htscore.h
@@ -236,6 +236,8 @@ struct hash_struct {
coucal former_adrfil;
/* scratch buffers reused across lookups (not reentrant) */
int normalized;
+  /* '\n'-separated strip rules, borrowed from opt->strip_query at hash_init */
+ const char *strip_query;
char normfil[HTS_URLMAXSIZE * 2];
char normfil2[HTS_URLMAXSIZE * 2];
char catbuff[CATBUFF_SIZE];
@@ -364,6 +366,17 @@ int fspc(httrackp * opt, FILE * fp, const char *type);
char *next_token(char *p, int flag);
+/* Like fil_normalized(), but first drops query keys in STRIP (comma-separated,
+ "*" = all); STRIP NULL/empty behaves exactly like fil_normalized(). */
+char *fil_normalized_filtered(const char *source, char *dest,
+ const char *strip);
+
+/* For URL ADR/FIL, return (in DEST) the comma keylist to strip from the
+ '\n'-separated "[pattern=]keys" RULES (patterns matched on host/path via
+ strjoker, last wins); NULL if none match. Feeds fil_normalized_filtered(). */
+const char *hts_query_strip_keys(const char *rules, const char *adr,
+ const char *fil, char *dest, size_t destsize);
+
/* Read a whole file into a freshly malloc'd, NUL-terminated buffer; the caller
owns it and must release it with freet(). Return NULL on missing/unreadable
file (readfile_or substitutes defaultdata instead). The byte content is NOT
diff --git a/src/htscoremain.c b/src/htscoremain.c
index 287aa560..62153238 100644
--- a/src/htscoremain.c
+++ b/src/htscoremain.c
@@ -1937,6 +1937,21 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
}
break;
+ case 'g': // strip-query: accumulate "[pattern=]keys" entries
+ if ((na + 1 >= argc) || (argv[na + 1][0] == '-')) {
+ HTS_PANIC_PRINTF("Option strip-query needs a blank space and "
+ "[host/pattern=]key1,key2,...");
+ printf("Example: --strip-query "
+ "\"www.example.com/*=utm_source,sid\"\n");
+ htsmain_free();
+ return -1;
+ } else {
+ na++;
+ if (StringNotEmpty(opt->strip_query))
+ StringCat(opt->strip_query, "\n");
+ StringCat(opt->strip_query, argv[na]);
+ }
+ break;
case 't': /* do not change type (ending) of filenames according to the MIME type */
opt->no_type_change = 1;
if (*(com+1)=='0') { opt->no_type_change = 0; com++; }
diff --git a/src/htshash.c b/src/htshash.c
index 4aaf9d6b..96506c91 100644
--- a/src/htshash.c
+++ b/src/htshash.c
@@ -117,10 +117,17 @@ static coucal_hashkeys key_adrfil_hashes_generic(void *arg,
// copy link
assertf(fil != NULL);
- if (hash->normalized) {
- fil_normalized(fil, &hash->normfil[strlen(hash->normfil)]);
- } else {
- strcpy(&hash->normfil[strlen(hash->normfil)], fil);
+ {
+ /* resolve the per-URL strip keys; strip applies even when urlhack is off */
+ char BIGSTK keybuf[HTS_URLMAXSIZE];
+ const char *const keys = hts_query_strip_keys(hash->strip_query, adr, fil,
+ keybuf, sizeof(keybuf));
+
+ if (hash->normalized || keys != NULL) {
+ fil_normalized_filtered(fil, &hash->normfil[strlen(hash->normfil)], keys);
+ } else {
+ strcpy(&hash->normfil[strlen(hash->normfil)], fil);
+ }
}
// hash
@@ -161,12 +168,20 @@ static int key_adrfil_equals_generic(void *arg,
}
// now compare pathes
- if (normalized) {
- fil_normalized(a_fil, hash->normfil);
- fil_normalized(b_fil, hash->normfil2);
- return strcmp(hash->normfil, hash->normfil2) == 0;
- } else {
- return strcmp(a_fil, b_fil) == 0;
+ {
+ char BIGSTK ka[HTS_URLMAXSIZE], kb[HTS_URLMAXSIZE];
+ const char *const keysa =
+ hts_query_strip_keys(hash->strip_query, a_adr, a_fil, ka, sizeof(ka));
+ const char *const keysb =
+ hts_query_strip_keys(hash->strip_query, b_adr, b_fil, kb, sizeof(kb));
+
+ if (normalized || keysa != NULL || keysb != NULL) {
+ fil_normalized_filtered(a_fil, hash->normfil, keysa);
+ fil_normalized_filtered(b_fil, hash->normfil2, keysb);
+ return strcmp(hash->normfil, hash->normfil2) == 0;
+ } else {
+ return strcmp(a_fil, b_fil) == 0;
+ }
}
}
@@ -227,6 +242,9 @@ void hash_init(httrackp *opt, hash_struct * hash, int normalized) {
hash->adrfil = coucal_new(0);
hash->former_adrfil = coucal_new(0);
hash->normalized = normalized;
+  /* borrowed, not copied: dangles if opt->strip_query is reallocated later */
+ hash->strip_query =
+ StringNotEmpty(opt->strip_query) ? StringBuff(opt->strip_query) : NULL;
hts_set_hash_handler(hash->sav, opt);
hts_set_hash_handler(hash->adrfil, opt);
diff --git a/src/htshelp.c b/src/htshelp.c
index a4e5bc4c..e0532c4d 100644
--- a/src/htshelp.c
+++ b/src/htshelp.c
@@ -563,6 +563,7 @@ void help(const char *app, int more) {
(" %x do not include any password for external password protected websites (%x0 include)");
infomsg
(" %q *include query string for local files (useless, for information purpose only) (%q0 don't include)");
+ infomsg(" %g strip query keys for dedup ([host/pattern=]key1,key2,...)");
infomsg
(" o *generate output html file in case of error (404..) (o0 don't generate)");
infomsg(" X *purge old files after update (X0 keep delete)");
diff --git a/src/htslib.c b/src/htslib.c
index 5f951555..1a88b62f 100644
--- a/src/htslib.c
+++ b/src/htslib.c
@@ -3681,6 +3681,142 @@ HTSEXT_API char *fil_normalized(const char *source, char *dest) {
return dest;
}
+/* Is query key ARG[0..keylen) in the comma-separated STRIP list? "*" = all;
+   case-sensitive, space-trimmed tokens; empty tokens (",a") never match. */
+static int hts_query_key_stripped(const char *arg, size_t keylen,
+                                  const char *strip) {
+  const char *p = strip;
+
+  while (*p != '\0') {
+    const char *start = p; /* first byte of the current token */
+    size_t toklen;
+
+    while (*p != '\0' && *p != ',')
+      p++;
+    toklen = (size_t) (p - start);
+    while (toklen > 0 && *start == ' ') { /* trim leading blanks */
+      start++;
+      toklen--;
+    }
+    while (toklen > 0 && start[toklen - 1] == ' ') /* trim trailing blanks */
+      toklen--;
+    if (toklen == 1 && start[0] == '*') /* wildcard: every key is stripped */
+      return 1;
+    if (toklen != 0 && toklen == keylen && strncmp(start, arg, keylen) == 0)
+      return 1;
+    if (*p == ',')
+      p++;
+  }
+  return 0;
+}
+
+/* see htscore.h: drop the STRIP-listed query keys, then fil_normalized() */
+char *fil_normalized_filtered(const char *source, char *dest,
+                              const char *strip) {
+  const char *query; /* cursor into SOURCE's query string */
+  char BIGSTK tmp[HTS_URLMAXSIZE * 2]; /* path + kept args, pre-normalization */
+  htsbuff cb;
+  int wrote = 0; /* becomes 1 once a kept arg has been emitted */
+
+  /* No strip list, or no query: plain normalization. */
+  if (strip == NULL || *strip == '\0' ||
+      (query = strchr(source, '?')) == NULL) {
+    return fil_normalized(source, dest);
+  }
+
+  /* Copy the path, re-emit kept query args, let fil_normalized() sort. Walk
+     every field incl. empty/trailing ("a&","?&&") so the result is a fixpoint
+     (the read re-normalizes it; a dropped empty arg would miss dedup). */
+  cb = htsbuff_ptr(tmp, sizeof(tmp));
+  htsbuff_catn(&cb, source, (size_t) (query - source)); /* path, without '?' */
+  for (query++;;) { /* skip '?', then one pass per '&'-separated field */
+    const char *const arg = query;
+    const char *eq = NULL; /* first '=' of this field, if any */
+    size_t keylen, arglen;
+
+    while (*query != '\0' && *query != '&') {
+      if (eq == NULL && *query == '=')
+        eq = query; /* later '=' signs belong to the value */
+      query++;
+    }
+    arglen = (size_t) (query - arg);
+    keylen = eq != NULL ? (size_t) (eq - arg) : arglen; /* no '=': all key */
+    if (!hts_query_key_stripped(arg, keylen, strip)) {
+      htsbuff_catc(&cb, wrote ? '&' : '?'); /* '?' before the first kept arg */
+      htsbuff_catn(&cb, arg, arglen);
+      wrote = 1;
+    }
+    if (*query == '\0')
+      break;
+    query++; /* step over the '&' separator */
+  }
+  return fil_normalized(tmp, dest);
+}
+
+/* see htscore.h; each PATTERN is normalized like the URL before matching */
+const char *hts_query_strip_keys(const char *rules, const char *adr,
+                                 const char *fil, char *dest, size_t destsize) {
+  const char *p, *q;
+  const char *result = NULL; /* stays NULL unless some rule matches */
+  char BIGSTK url[HTS_URLMAXSIZE * 2];
+
+  if (rules == NULL || *rules == '\0' || destsize == 0)
+    return NULL;
+
+  /* Match string = normalized host/path, query removed. jump_normalized_const
+     collapses www+scheme/auth so read and write (double-normalized) agree;
+     query excluded keeps the decision on host/path only. */
+  url[0] = '\0';
+  strcatbuff(url, jump_normalized_const(adr));
+  if (fil[0] != '/')
+    strcatbuff(url, "/");
+  q = strchr(fil, '?');
+  if (q != NULL)
+    strncatbuff(url, fil, (int) (q - fil));
+  else
+    strcatbuff(url, fil);
+
+  /* Walk the '\n' entries ("pattern=keys"; no '=' is bare keys, pattern "*");
+     last match wins. PAT is normalized like URL, else "www.x/*" never hits. */
+  for (p = rules; *p != '\0';) {
+    const char *const line = p;
+    const char *eol, *eq, *keys;
+    char BIGSTK pat[HTS_URLMAXSIZE * 2];
+
+    while (*p != '\0' && *p != '\n')
+      p++;
+    eol = p;
+    if (*p == '\n')
+      p++;
+    if (eol == line) /* empty entry: nothing to evaluate */
+      continue;
+    eq = memchr(line, '=', (size_t) (eol - line));
+    if (eq != NULL) {
+      size_t patlen = (size_t) (eq - line);
+
+      if (patlen >= sizeof(pat)) /* oversized pattern is truncated to fit */
+        patlen = sizeof(pat) - 1;
+      memcpy(pat, line, patlen);
+      pat[patlen] = '\0';
+      keys = eq + 1;
+    } else {
+      pat[0] = '*';
+      pat[1] = '\0';
+      keys = line;
+    }
+    if (strjoker(url, jump_normalized_const(pat), NULL, NULL) != NULL) {
+      size_t klen = (size_t) (eol - keys);
+
+      if (klen >= destsize) /* key list is truncated to fit DEST */
+        klen = destsize - 1;
+      memcpy(dest, keys, klen);
+      dest[klen] = '\0';
+      result = dest; /* keep scanning: a later matching rule overrides */
+    }
+  }
+  return result;
+}
+
#define endwith(a) ( (len >= (sizeof(a)-1)) ? ( strncmp(dest, a+len-(sizeof(a)-1), sizeof(a)-1) == 0 ) : 0 );
HTSEXT_API char *adr_normalized_sized(const char *source, char *dest,
size_t destsize) {
@@ -5891,6 +6027,7 @@ HTSEXT_API httrackp *hts_create_opt(void) {
opt->sizehack = HTS_FALSE;
opt->urlhack = HTS_TRUE;
StringCopy(opt->footer, HTS_DEFAULT_FOOTER);
+ StringCopy(opt->strip_query, "");
opt->ftp_proxy = HTS_TRUE;
opt->convert_utf8 = HTS_TRUE;
StringCopy(opt->filelist, "");
@@ -6035,6 +6172,7 @@ HTSEXT_API void hts_free_opt(httrackp * opt) {
StringFree(opt->urllist);
StringFree(opt->footer);
StringFree(opt->mod_blacklist);
+ StringFree(opt->strip_query);
StringFree(opt->path_html);
StringFree(opt->path_html_utf8);
diff --git a/src/htsname.c b/src/htsname.c
index 4ec2ae82..c6ee007e 100644
--- a/src/htsname.c
+++ b/src/htsname.c
@@ -198,6 +198,13 @@ int url_savename(lien_adrfilsave *const afs,
// copy of fil, used for lookups (see urlhack)
const char *normadr = adr;
const char *normfil = fil_complete;
+ /* query keys to strip for this URL (NULL = none); decoupled from urlhack */
+ char BIGSTK stripkeys[HTS_URLMAXSIZE];
+ const char *const strip =
+ StringNotEmpty(opt->strip_query)
+ ? hts_query_strip_keys(StringBuff(opt->strip_query), adr,
+ fil_complete, stripkeys, sizeof(stripkeys))
+ : NULL;
const char *const print_adr = jump_protocol_const(adr);
const char *start_pos = NULL, *nom_pos = NULL, *dot_pos = NULL; // Position nom et point
@@ -232,7 +239,7 @@ int url_savename(lien_adrfilsave *const afs,
if (opt->urlhack) {
// copy of adr (without protocol), used for lookups (see urlhack)
normadr = adr_normalized_sized(adr, normadr_, sizeof(normadr_));
- normfil = fil_normalized(fil_complete, normfil_);
+ normfil = fil_normalized_filtered(fil_complete, normfil_, strip);
} else {
if (link_has_authority(adr_complete)) { // https or other protocols : in "http/" subfolder
char *pos = strchr(adr_complete, ':');
@@ -245,6 +252,9 @@ int url_savename(lien_adrfilsave *const afs,
normadr = normadr_;
}
}
+ // strip still applies with urlhack off (host left untouched)
+ if (strip != NULL)
+ normfil = fil_normalized_filtered(fil_complete, normfil_, strip);
}
// à afficher sans ftp://
diff --git a/src/htsopt.h b/src/htsopt.h
index 4b646111..eabd070a 100644
--- a/src/htsopt.h
+++ b/src/htsopt.h
@@ -529,6 +529,8 @@ struct httrackp {
htslibhandles libHandles; /**< loaded external module handles */
//
htsoptstate state; /**< embedded live engine state */
+ String strip_query; /**< query keys to drop when deduping URLs (-strip-query);
+ appended at the tail to keep field offsets stable */
};
/* Running statistics for a mirror. */
diff --git a/src/htsselftest.c b/src/htsselftest.c
index 7e8c17ed..8c194c9e 100644
--- a/src/htsselftest.c
+++ b/src/htsselftest.c
@@ -1052,6 +1052,126 @@ static int st_cookies(httrackp *opt, int argc, char **argv) {
return err;
}
+/* --strip-query self-test (resolver + stripper); asserts, then returns 0. */
+static int st_stripquery(httrackp *opt, int argc, char **argv) {
+  char dest[1024], keys[256], ref[1024];
+  const char *k; /* key list returned by the resolver (points into keys[]) */
+
+  (void) opt; /* unused */
+  (void) argc; /* unused */
+  (void) argv; /* unused */
+
+  /* NULL/empty rules resolve to no keys; NULL keys == plain fil_normalized */
+  assertf(hts_query_strip_keys(NULL, "h.com", "/p?a=1", keys, sizeof(keys)) ==
+          NULL);
+  assertf(hts_query_strip_keys("", "h.com", "/p?a=1", keys, sizeof(keys)) ==
+          NULL);
+  assertf(strcmp(fil_normalized_filtered("/p?b=2&a=1", dest, NULL),
+                 fil_normalized("/p?b=2&a=1", ref)) == 0);
+
+  /* bare form (*=keys): strip the key everywhere, keep+sort the rest */
+  k = hts_query_strip_keys("sid", "any.com", "/p?b=2&sid=x&a=1", keys,
+                           sizeof(keys));
+  assertf(k != NULL && strcmp(k, "sid") == 0);
+  assertf(strcmp(fil_normalized_filtered("/p?b=2&sid=x&a=1", dest, k),
+                 "/p?a=1&b=2") == 0);
+
+  /* reordered args + a stripped key normalize to the same as the clean URL */
+  assertf(strcmp(fil_normalized_filtered("/p?sid=y&a=1&b=2", dest, "sid"),
+                 fil_normalized("/p?a=1&b=2", ref)) == 0);
+
+  /* host pattern matches only that host, incl. its www-normalized forms */
+  assertf(hts_query_strip_keys("ex.com/*=utm", "other.com", "/p?utm=1", keys,
+                               sizeof(keys)) == NULL);
+  assertf(hts_query_strip_keys("ex.com/*=utm", "ex.com", "/p?utm=1", keys,
+                               sizeof(keys)) != NULL);
+  assertf(hts_query_strip_keys("ex.com/*=utm", "www.ex.com", "/p?utm=1", keys,
+                               sizeof(keys)) != NULL);
+  assertf(hts_query_strip_keys("ex.com/*=utm", "http://www-3.ex.com",
+                               "/p?utm=1", keys, sizeof(keys)) != NULL);
+
+  /* last match wins, wholesale: host rule overrides global, no union */
+  k = hts_query_strip_keys("*=sid\nex.com/*=utm", "ex.com",
+                           "/p?sid=1&utm=2&a=3", keys, sizeof(keys));
+  assertf(k != NULL && strcmp(k, "utm") == 0);
+  assertf(strcmp(fil_normalized_filtered("/p?sid=1&utm=2&a=3", dest, k),
+                 "/p?a=3&sid=1") == 0);
+  k = hts_query_strip_keys("*=sid\nex.com/*=utm", "z.com", "/p?sid=1&a=3", keys,
+                           sizeof(keys));
+  assertf(k != NULL && strcmp(k, "sid") == 0);
+
+  /* whole-key match, not prefix: "utm" must not strip utm_source */
+  assertf(strcmp(fil_normalized_filtered("/p?utm_source=x&a=1", dest, "utm"),
+                 "/p?a=1&utm_source=x") == 0);
+
+  /* "*" drops every param; a fully-stripped single-arg query loses its '?' */
+  assertf(strcmp(fil_normalized_filtered("/p?a=1&b=2", dest, "*"), "/p") == 0);
+  assertf(strcmp(fil_normalized_filtered("/p?utm=1", dest, "utm"), "/p") == 0);
+
+  /* degenerate forms a=, b, c== (key 'c'); strip c keeps a= and b */
+  assertf(strcmp(fil_normalized_filtered("/p?a=&b&c==", dest, "c"),
+                 "/p?a=&b") == 0);
+  /* short key must not strip a longer one: 'c' must not touch 'cc' */
+  assertf(strcmp(fil_normalized_filtered("/p?cc=1&c=2", dest, "c"),
+                 "/p?cc=1") == 0);
+
+  /* repeated key: every occurrence is stripped, not just the first */
+  assertf(
+      strcmp(fil_normalized_filtered("/p?foo=42&bar=13&foo=43", dest, "foo"),
+             "/p?bar=13") == 0);
+  /* repeated key mixing missing/empty values */
+  assertf(
+      strcmp(fil_normalized_filtered("/p?foo&bar=13&foo=42&foo=", dest, "foo"),
+             "/p?bar=13") == 0);
+  /* repeated key kept (no match): all occurrences retained, then sorted */
+  assertf(strcmp(fil_normalized_filtered("/p?foo=42&bar=13&foo=43", dest, "z"),
+                 "/p?bar=13&foo=42&foo=43") == 0);
+
+  /* value containing '=': the key is only the part before the first '='. Strip
+     'foo' drops "foo=42=17" whole; the '=' in the value is not a delimiter. */
+  assertf(strcmp(fil_normalized_filtered("/p?foo=42=17&bar=", dest, "foo"),
+                 "/p?bar=") == 0);
+  /* keeping it preserves the embedded '=' verbatim */
+  assertf(strcmp(fil_normalized_filtered("/p?foo=42=17&bar=", dest, "bar"),
+                 "/p?foo=42=17") == 0);
+  /* a value segment is not a key: stripping "42" must not touch foo=42=17 */
+  assertf(strcmp(fil_normalized_filtered("/p?foo=42=17", dest, "42"),
+                 "/p?foo=42=17") == 0);
+
+  /* Idempotency: the read path re-normalizes an already-normalized fil, so the
+     result must be a fixpoint or dedup misses (catches a dropped empty/trailing
+     arg like "?&&", "a&"). */
+  {
+    static const char *const qs[] = {"/p?a=&b&c==",
+                                     "/p?a&&b",
+                                     "/p?&a",
+                                     "/p?a&",
+                                     "/p?",
+                                     "/p?=v",
+                                     "/p?&&",
+                                     "/p?b=2&a=1",
+                                     "/p?utm=x&",
+                                     "/p?&utm=x",
+                                     "/p?foo=42&bar=13&foo=43",
+                                     "/p?foo&bar=13&foo=42&foo=",
+                                     "/p?foo=42=17&bar="};
+    static const char *const strips[] = {NULL, "z", "utm", "*", "a", "foo"};
+    char once[1024], twice[1024];
+    size_t i, j;
+
+    for (i = 0; i < sizeof(qs) / sizeof(qs[0]); i++) { /* every query shape */
+      for (j = 0; j < sizeof(strips) / sizeof(strips[0]); j++) { /* x list */
+        fil_normalized_filtered(qs[i], once, strips[j]);
+        fil_normalized_filtered(once, twice, strips[j]); /* second pass */
+        assertf(strcmp(once, twice) == 0);
+      }
+    }
+  }
+
+  printf("strip-query self-test OK\n"); /* marker the .test script greps for */
+  return 0;
+}
+
/* ------------------------------------------------------------ */
/* Registry: name -> handler, with a usage hint and a one-line description. */
/* ------------------------------------------------------------ */
@@ -1068,6 +1188,8 @@ static const struct selftest_entry {
"size-aware filter verdict (negative size = unknown/scan time)",
st_filtersize},
{"simplify", "", "collapse ./ and ../ in a path", st_simplify},
+ {"stripquery", "", "--strip-query pattern/key stripping self-test",
+ st_stripquery},
{"mime", "", "MIME type for a filename", st_mime},
{"charset", " ",
"convert a string to UTF-8 from a charset", st_charset},
diff --git a/tests/01_engine-stripquery.test b/tests/01_engine-stripquery.test
new file mode 100755
index 00000000..040d239f
--- /dev/null
+++ b/tests/01_engine-stripquery.test
@@ -0,0 +1,8 @@
+#!/bin/bash
+#
+
+set -euo pipefail
+
+# --strip-query: pattern-scoped query-key stripping for dedup. All assertions
+# live in the engine self-test (hts_query_strip_keys + fil_normalized_filtered).
+httrack -O /dev/null -#test=stripquery | grep -q "strip-query self-test OK"
diff --git a/tests/26_local-strip-query.test b/tests/26_local-strip-query.test
new file mode 100755
index 00000000..02a8eb3e
--- /dev/null
+++ b/tests/26_local-strip-query.test
@@ -0,0 +1,18 @@
+#!/bin/bash
+#
+# End-to-end --strip-query (#112): two links to one resource differing only by
+# ?utm_source dedup to a single saved file (2 files written: index + resource);
+# the control crawl without the option keeps both variants (3 files). Locks the
+# CLI->opt->hash plumbing the engine self-test can't reach.
+
+set -e
+
+: "${top_srcdir:=..}"
+
+# stripped: the two ?utm_source variants collapse to one resource
+bash "$top_srcdir/tests/local-crawl.sh" --errors 0 --files 2 \
+ httrack 'BASEURL/stripquery/index.html' --strip-query 'utm_source'
+
+# control: no stripping -> both query-named variants are saved
+bash "$top_srcdir/tests/local-crawl.sh" --errors 0 --files 3 \
+ httrack 'BASEURL/stripquery/index.html'
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 2ff00010..9d7e265c 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -5,6 +5,7 @@ EXTRA_DIST = $(TESTS) crawl-test.sh run-all-tests.sh check-network.sh \
proxy-https-server.py \
local-crawl.sh local-server.py server.crt server.key \
server-root/simple/basic.html server-root/simple/link.html \
+ server-root/stripquery/index.html server-root/stripquery/a.html \
fixtures/cache-golden/hts-cache/new.zip
TESTS_ENVIRONMENT =
@@ -45,6 +46,7 @@ TESTS = \
01_engine-savename.test \
01_engine-selftest-dispatch.test \
01_engine-simplify.test \
+ 01_engine-stripquery.test \
01_engine-strsafe.test \
02_manpage-regen.test \
02_update-cache.test \
@@ -68,6 +70,7 @@ TESTS = \
22_local-broken-size.test \
23_local-errpage.test \
24_local-resume-overlap.test \
- 25_local-mime-exclude.test
+ 25_local-mime-exclude.test \
+ 26_local-strip-query.test
CLEANFILES = check-network_sh.cache
diff --git a/tests/server-root/stripquery/a.html b/tests/server-root/stripquery/a.html
new file mode 100644
index 00000000..457f673f
--- /dev/null
+++ b/tests/server-root/stripquery/a.html
@@ -0,0 +1 @@
+resource A
diff --git a/tests/server-root/stripquery/index.html b/tests/server-root/stripquery/index.html
new file mode 100644
index 00000000..601da15a
--- /dev/null
+++ b/tests/server-root/stripquery/index.html
@@ -0,0 +1,5 @@
+
+Two links to one resource, differing only by a tracking parameter.
+x
+y
+