From 600001b2825cdff9b624da7f4c8a19f66e8e10c6 Mon Sep 17 00:00:00 2001 From: Xavier Roche Date: Sat, 27 Jun 2026 20:03:19 +0200 Subject: [PATCH] Split -%u URL Hacks into independent www/slash/query toggles (#271) -%u (--urlhack) bundled three dedup normalizations under one switch: www.host == host, redundant // collapse, and query-argument reordering. A mirror that needed one but not another (e.g. keep www. distinct) had to turn the whole umbrella off. Add three opt-out sub-options, defaulting to the umbrella so existing -%u/-%u0 behavior is unchanged: --keep-www-prefix keep www.foo.com distinct from foo.com (-%j) --keep-double-slashes keep redundant // in the path (-%o) --keep-query-order keep query-argument order significant (-%y) The split is resolved once in hash_init() into norm_host/norm_slash/norm_query and threaded through the dedup hash (htshash.c), the savename lookup key (htsname.c) and the redirect-loop diagnostic (htsparse.c) so all three stay consistent. fil_normalized() gains an internal fil_normalized_ex(do_slash, do_query) core; the public fil_normalized()/fil_normalized_filtered() keep their signatures, and a new engine-level fil_normalized_filtered_ex() carries the two flags to those callers. Normalization (slash/query) now follows urlhack and its sub-flags uniformly, while --strip-query stays orthogonal. So with urlhack off, strip-query strips keys without sorting the remainder; the url_savename urlhack-off branch is moved to the same do_slash=0/do_query=0 normalizer the hash uses, so a URL is always looked up under the key it was stored with (without this, the change would have introduced a self-lookup mismatch). http/https are always merged in the dedup key (the scheme is stripped regardless of -%u), so that part of the request needs no toggle. The opt-outs are spelled positively (--keep-*) because httrack's generic --no prefix only appends the disabling "0" for parameterized options, not "single" booleans, so --nowww-dedup would silently no-op. 
opt grows three hts_boolean fields appended at the struct tail (offsets stable, no soname bump, matching the strip_query addition in #112). Tested by a -#test=urlhack engine self-test (hash_url_equals with each sub-flag toggled on its own, plus all opt-outs set and urlhack off) plus a -%u0 + --strip-query crawl case exercising the urlhack-off savename branch. Closes #271 Co-Authored-By: Claude Opus 4.8 Signed-off-by: Xavier Roche --- man/httrack.1 | 2 ++ src/htsalias.c | 3 ++ src/htscore.h | 11 ++++-- src/htscoremain.c | 24 +++++++++++++ src/htshash.c | 60 +++++++++++++++++++++++---------- src/htshash.h | 6 +++- src/htshelp.c | 3 ++ src/htslib.c | 33 +++++++++++++----- src/htsname.c | 16 ++++++--- src/htsopt.h | 4 +++ src/htsparse.c | 26 ++++++++++---- src/htsselftest.c | 49 +++++++++++++++++++++++++++ tests/01_engine-urlhack.test | 8 +++++ tests/26_local-strip-query.test | 5 +++ tests/Makefile.am | 1 + 15 files changed, 211 insertions(+), 40 deletions(-) create mode 100644 tests/01_engine-urlhack.test diff --git a/man/httrack.1 b/man/httrack.1 index 0e193aeb..9615ddb0 100644 --- a/man/httrack.1 +++ b/man/httrack.1 @@ -228,6 +228,8 @@ tolerant requests (accept bogus responses on some servers, but not standard!) (\ update hacks: various hacks to limit re\-transfers when updating (identical size, bogus response..) (\-\-updatehack) .IP \-%u url hacks: various hacks to limit duplicate URLs (strip //, www.foo.com==foo.com..) (\-\-urlhack) +.br +opt out of one url\-hack part: \-\-keep\-www\-prefix (www.foo.com<>foo.com), \-\-keep\-double\-slashes (//), \-\-keep\-query\-order (?b&a) .IP \-%A assume that a type (cgi,asp..) 
is always linked with a mime type (\-%A php3,cgi=text/html;dat,bin=application/x\-zip) (\-\-assume ) .br diff --git a/src/htsalias.c b/src/htsalias.c index 69400089..f5cdc306 100644 --- a/src/htsalias.c +++ b/src/htsalias.c @@ -128,6 +128,9 @@ const char *hts_optalias[][4] = { {"tolerant", "-%B", "single", ""}, {"updatehack", "-%s", "single", ""}, {"sizehack", "-%s", "single", ""}, {"urlhack", "-%u", "single", ""}, + {"keep-www-prefix", "-%j", "single", ""}, + {"keep-double-slashes", "-%o", "single", ""}, + {"keep-query-order", "-%y", "single", ""}, {"user-agent", "-F", "param1", "user-agent identity"}, {"referer", "-%R", "param1", "default referer URL"}, {"from", "-%E", "param1", "from email address"}, diff --git a/src/htscore.h b/src/htscore.h index 31600727..2d59f49f 100644 --- a/src/htscore.h +++ b/src/htscore.h @@ -234,8 +234,10 @@ struct hash_struct { coucal adrfil; /* former address+path -> link index (renamed/moved entries) */ coucal former_adrfil; - /* scratch buffers reused across lookups (not reentrant) */ - int normalized; + /* effective urlhack sub-flags: www.==host / // collapse / query-arg sort */ + hts_boolean norm_host; + hts_boolean norm_slash; + hts_boolean norm_query; /* query-strip keys (not owned); set from opt->strip_query at hash_init */ const char *strip_query; char normfil[HTS_URLMAXSIZE * 2]; @@ -371,6 +373,11 @@ char *next_token(char *p, int flag); char *fil_normalized_filtered(const char *source, char *dest, const char *strip); +/* As fil_normalized_filtered(), but DO_SLASH/DO_QUERY gate the // collapse and + the query-argument sort independently (the urlhack sub-flags). */ +char *fil_normalized_filtered_ex(const char *source, char *dest, + const char *strip, int do_slash, int do_query); + /* For URL ADR/FIL, return (in DEST) the comma keylist to strip from the '\n'-separated "[pattern=]keys" RULES (patterns matched on host/path via strjoker, last wins); NULL if none match. Feeds fil_normalized_filtered(). 
*/ diff --git a/src/htscoremain.c b/src/htscoremain.c index 62153238..eb1c3fe1 100644 --- a/src/htscoremain.c +++ b/src/htscoremain.c @@ -1570,6 +1570,30 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) { com++; } break; // url hack + case 'j': + opt->no_www_dedup = + HTS_TRUE; // --keep-www-prefix: keep www.X != X + if (*(com + 1) == '0') { + opt->no_www_dedup = HTS_FALSE; + com++; + } + break; + case 'o': + opt->no_slash_dedup = + HTS_TRUE; // --keep-double-slashes: keep // + if (*(com + 1) == '0') { + opt->no_slash_dedup = HTS_FALSE; + com++; + } + break; + case 'y': + opt->no_query_dedup = + HTS_TRUE; // --keep-query-order: keep ?b&a order + if (*(com + 1) == '0') { + opt->no_query_dedup = HTS_FALSE; + com++; + } + break; case 'v': opt->verbosedisplay = HTS_VERBOSE_FULL; if (isdigit((unsigned char) *(com + 1))) { diff --git a/src/htshash.c b/src/htshash.c index 96506c91..a9917161 100644 --- a/src/htshash.c +++ b/src/htshash.c @@ -106,10 +106,10 @@ static coucal_hashkeys key_adrfil_hashes_generic(void *arg, const lien_url*const lien = (const lien_url*) value; const char *const adr = !former ? lien->adr : lien->former_adr; const char *const fil = !former ? lien->fil : lien->former_fil; - const char *const adr_norm = adr != NULL ? - ( hash->normalized ? jump_normalized_const(adr) - : jump_identification_const(adr) ) - : NULL; + const char *const adr_norm = + adr != NULL ? (hash->norm_host ? 
jump_normalized_const(adr) + : jump_identification_const(adr)) + : NULL; // copy address assertf(adr_norm != NULL); @@ -123,8 +123,9 @@ static coucal_hashkeys key_adrfil_hashes_generic(void *arg, const char *const keys = hts_query_strip_keys(hash->strip_query, adr, fil, keybuf, sizeof(keybuf)); - if (hash->normalized || keys != NULL) { - fil_normalized_filtered(fil, &hash->normfil[strlen(hash->normfil)], keys); + if (hash->norm_slash || hash->norm_query || keys != NULL) { + fil_normalized_filtered_ex(fil, &hash->normfil[strlen(hash->normfil)], + keys, hash->norm_slash, hash->norm_query); } else { strcpy(&hash->normfil[strlen(hash->normfil)], fil); } @@ -139,8 +140,7 @@ static int key_adrfil_equals_generic(void *arg, coucal_key_const a_, coucal_key_const b_, const int former) { - hash_struct *const hash = (hash_struct*) arg; - const int normalized = hash->normalized; + hash_struct *const hash = (hash_struct *) arg; const lien_url*const a = (const lien_url*) a_; const lien_url*const b = (const lien_url*) b_; const char *const a_adr = !former ? a->adr : a->former_adr; @@ -157,10 +157,10 @@ static int key_adrfil_equals_generic(void *arg, assertf(b_fil != NULL); // skip scheme and authentication to the domain (possibly without www.) - ja = normalized - ? jump_normalized_const(a_adr) : jump_identification_const(a_adr); - jb = normalized - ? jump_normalized_const(b_adr) : jump_identification_const(b_adr); + ja = hash->norm_host ? jump_normalized_const(a_adr) + : jump_identification_const(a_adr); + jb = hash->norm_host ? 
jump_normalized_const(b_adr) + : jump_identification_const(b_adr); assertf(ja != NULL); assertf(jb != NULL); if (strcasecmp(ja, jb) != 0) { @@ -175,9 +175,12 @@ static int key_adrfil_equals_generic(void *arg, const char *const keysb = hts_query_strip_keys(hash->strip_query, b_adr, b_fil, kb, sizeof(kb)); - if (normalized || keysa != NULL || keysb != NULL) { - fil_normalized_filtered(a_fil, hash->normfil, keysa); - fil_normalized_filtered(b_fil, hash->normfil2, keysb); + if (hash->norm_slash || hash->norm_query || keysa != NULL || + keysb != NULL) { + fil_normalized_filtered_ex(a_fil, hash->normfil, keysa, hash->norm_slash, + hash->norm_query); + fil_normalized_filtered_ex(b_fil, hash->normfil2, keysb, hash->norm_slash, + hash->norm_query); return strcmp(hash->normfil, hash->normfil2) == 0; } else { return strcmp(a_fil, b_fil) == 0; @@ -237,11 +240,14 @@ static int key_former_adrfil_equals(void *arg, return key_adrfil_equals_generic(arg, a, b, 1); } -void hash_init(httrackp *opt, hash_struct * hash, int normalized) { +void hash_init(httrackp *opt, hash_struct *hash, hts_boolean normalized) { hash->sav = coucal_new(0); hash->adrfil = coucal_new(0); hash->former_adrfil = coucal_new(0); - hash->normalized = normalized; + /* urlhack is the umbrella; per-feature negatives opt out of each part */ + hash->norm_host = normalized && !opt->no_www_dedup; + hash->norm_slash = normalized && !opt->no_slash_dedup; + hash->norm_query = normalized && !opt->no_query_dedup; /* snapshot the query-strip list (not owned; valid for the hash lifetime) */ hash->strip_query = StringNotEmpty(opt->strip_query) ? StringBuff(opt->strip_query) : NULL; @@ -300,6 +306,26 @@ void hash_free(hash_struct *hash) { } } +/* Test helper: do the two URLs dedupe to the same key under opt's urlhack + flags? Exercises the live hash compare (norm_host/slash/query resolution). 
*/ +hts_boolean hash_url_equals(httrackp *opt, const char *adra, const char *fila, + const char *adrb, const char *filb) { + hash_struct hash; + lien_url la, lb; + hts_boolean eq; + + memset(&la, 0, sizeof(la)); + memset(&lb, 0, sizeof(lb)); + la.adr = key_duphandler(NULL, adra); + la.fil = key_duphandler(NULL, fila); + lb.adr = key_duphandler(NULL, adrb); + lb.fil = key_duphandler(NULL, filb); + hash_init(opt, &hash, opt->urlhack); + eq = key_adrfil_equals(&hash, &la, &lb); + hash_free(&hash); + return eq; +} + // retour: position ou -1 si non trouvé int hash_read(const hash_struct * hash, const char *nom1, const char *nom2, hash_struct_type type) { diff --git a/src/htshash.h b/src/htshash.h index e6cf0ac5..ffac47f8 100644 --- a/src/htshash.h +++ b/src/htshash.h @@ -51,8 +51,12 @@ typedef enum hash_struct_type { } hash_struct_type; // tables de hachage -void hash_init(httrackp *opt, hash_struct *hash, int normalized); +void hash_init(httrackp *opt, hash_struct *hash, hts_boolean normalized); void hash_free(hash_struct *hash); +/* Test helper: HTS_TRUE if the two URLs dedupe together under opt's urlhack + flags. */ +hts_boolean hash_url_equals(httrackp *opt, const char *adra, const char *fila, + const char *adrb, const char *filb); int hash_read(const hash_struct * hash, const char *nom1, const char *nom2, hash_struct_type type); void hash_write(hash_struct * hash, size_t lpos); diff --git a/src/htshelp.c b/src/htshelp.c index e0532c4d..103b53c8 100644 --- a/src/htshelp.c +++ b/src/htshelp.c @@ -588,6 +588,9 @@ void help(const char *app, int more) { (" %s update hacks: various hacks to limit re-transfers when updating (identical size, bogus response..)"); infomsg (" %u url hacks: various hacks to limit duplicate URLs (strip //, www.foo.com==foo.com..)"); + infomsg(" opt out of one url-hack part: --keep-www-prefix " + "(www.foo.com<>foo.com), --keep-double-slashes (//), " + "--keep-query-order (?b&a)"); infomsg (" %A assume that a type (cgi,asp..) 
is always linked with a mime type (-%A php3,cgi=text/html;dat,bin=application/x-zip)"); infomsg(" shortcut: '--assume standard' is equivalent to -%A " diff --git a/src/htslib.c b/src/htslib.c index 1a88b62f..d7663a0f 100644 --- a/src/htslib.c +++ b/src/htslib.c @@ -3610,7 +3610,10 @@ static int sortNormFnc(const void *a_, const void *b_) { return strcmp(*a + 1, *b + 1); } -HTSEXT_API char *fil_normalized(const char *source, char *dest) { +/* Path normalizer core: optionally collapse redundant '//' (DO_SLASH) and/or + sort query arguments (DO_QUERY) so equivalent URLs dedupe. */ +static char *fil_normalized_ex(const char *source, char *dest, int do_slash, + int do_query) { char lastc = 0; int gotquery = 0; int ampargs = 0; @@ -3620,8 +3623,8 @@ HTSEXT_API char *fil_normalized(const char *source, char *dest) { for(i = j = 0; source[i] != '\0'; i++) { if (!gotquery && source[i] == '?') gotquery = ampargs = 1; - if ((!gotquery && lastc == '/' && source[i] == '/') // foo//bar -> foo/bar - ) { + if (do_slash && !gotquery && lastc == '/' && source[i] == '/') { + // foo//bar -> foo/bar } else { if (gotquery && source[i] == '&') { ampargs++; @@ -3633,7 +3636,7 @@ HTSEXT_API char *fil_normalized(const char *source, char *dest) { dest[j++] = '\0'; /* Sort arguments (&foo=1&bar=2 == &bar=2&foo=1) */ - if (ampargs > 1) { + if (do_query && ampargs > 1) { char **amps = malloct(ampargs * sizeof(char *)); char *copyBuff = NULL; size_t qLen = 0; @@ -3681,6 +3684,10 @@ HTSEXT_API char *fil_normalized(const char *source, char *dest) { return dest; } +HTSEXT_API char *fil_normalized(const char *source, char *dest) { + return fil_normalized_ex(source, dest, 1, 1); +} + /* Is query key ARG[0..keylen) in the comma-separated STRIP list? "*" = all; case-sensitive, space-trimmed tokens. 
*/ static int hts_query_key_stripped(const char *arg, size_t keylen, @@ -3711,8 +3718,9 @@ static int hts_query_key_stripped(const char *arg, size_t keylen, } /* see htscore.h */ -char *fil_normalized_filtered(const char *source, char *dest, - const char *strip) { +char *fil_normalized_filtered_ex(const char *source, char *dest, + const char *strip, int do_slash, + int do_query) { const char *query; char BIGSTK tmp[HTS_URLMAXSIZE * 2]; htsbuff cb; @@ -3721,7 +3729,7 @@ char *fil_normalized_filtered(const char *source, char *dest, /* No strip list, or no query: plain normalization. */ if (strip == NULL || *strip == '\0' || (query = strchr(source, '?')) == NULL) { - return fil_normalized(source, dest); + return fil_normalized_ex(source, dest, do_slash, do_query); } /* Copy the path, re-emit kept query args, let fil_normalized() sort. Walk @@ -3750,7 +3758,13 @@ char *fil_normalized_filtered(const char *source, char *dest, break; query++; } - return fil_normalized(tmp, dest); + return fil_normalized_ex(tmp, dest, do_slash, do_query); +} + +/* see htscore.h */ +char *fil_normalized_filtered(const char *source, char *dest, + const char *strip) { + return fil_normalized_filtered_ex(source, dest, strip, 1, 1); } /* see htscore.h */ @@ -6026,6 +6040,9 @@ HTSEXT_API httrackp *hts_create_opt(void) { opt->verbosedisplay = HTS_VERBOSE_NONE; // no text animation opt->sizehack = HTS_FALSE; opt->urlhack = HTS_TRUE; + opt->no_www_dedup = HTS_FALSE; + opt->no_slash_dedup = HTS_FALSE; + opt->no_query_dedup = HTS_FALSE; StringCopy(opt->footer, HTS_DEFAULT_FOOTER); StringCopy(opt->strip_query, ""); opt->ftp_proxy = HTS_TRUE; diff --git a/src/htsname.c b/src/htsname.c index c6ee007e..c2e2bb5a 100644 --- a/src/htsname.c +++ b/src/htsname.c @@ -237,9 +237,13 @@ int url_savename(lien_adrfilsave *const afs, // www-42.foo.com -> foo.com // foo.com/bar//foobar -> foo.com/bar/foobar if (opt->urlhack) { - // copy of adr (without protocol), used for lookups (see urlhack) - normadr = 
adr_normalized_sized(adr, normadr_, sizeof(normadr_)); - normfil = fil_normalized_filtered(fil_complete, normfil_, strip); + // dedup-lookup key; honor the per-feature negatives like htshash.c so + // distinct URLs keep distinct savenames (else keep normadr = adr) + if (!opt->no_www_dedup) + normadr = adr_normalized_sized(adr, normadr_, sizeof(normadr_)); + normfil = + fil_normalized_filtered_ex(fil_complete, normfil_, strip, + !opt->no_slash_dedup, !opt->no_query_dedup); } else { if (link_has_authority(adr_complete)) { // https or other protocols : in "http/" subfolder char *pos = strchr(adr_complete, ':'); @@ -252,9 +256,11 @@ int url_savename(lien_adrfilsave *const afs, normadr = normadr_; } } - // strip still applies with urlhack off (host left untouched) + // strip still applies with urlhack off (host left untouched); no // or + // query-sort here, to match the hash key (norm_slash/norm_query are 0 when + // urlhack is off) so a URL is looked up under the key it was stored with if (strip != NULL) - normfil = fil_normalized_filtered(fil_complete, normfil_, strip); + normfil = fil_normalized_filtered_ex(fil_complete, normfil_, strip, 0, 0); } // à afficher sans ftp:// diff --git a/src/htsopt.h b/src/htsopt.h index eabd070a..4dccd17f 100644 --- a/src/htsopt.h +++ b/src/htsopt.h @@ -531,6 +531,10 @@ struct httrackp { htsoptstate state; /**< embedded live engine state */ String strip_query; /**< query keys to drop when deduping URLs (-strip-query); appended at the tail to keep field offsets stable */ + hts_boolean + no_www_dedup; /**< with urlhack, keep www.host distinct from host */ + hts_boolean no_slash_dedup; /**< with urlhack, keep redundant // in paths */ + hts_boolean no_query_dedup; /**< with urlhack, keep query-argument order */ }; /* Running statistics for a mirror. 
*/ diff --git a/src/htsparse.c b/src/htsparse.c index 36decc13..4ec35245 100644 --- a/src/htsparse.c +++ b/src/htsparse.c @@ -3602,16 +3602,28 @@ int hts_mirror_check_moved(htsmoduleStruct * str, ident_url_relatif(mov_url, urladr(), urlfil(), moved)) >= 0) { int set_prio_to = 0; // pas de priotité fixéd par wizard - // check whether URLHack is harmless or not - if (opt->urlhack) { + // check whether URLHack is harmless or not (per the effective + // sub-flags) + if (opt->urlhack && (!opt->no_www_dedup || !opt->no_slash_dedup || + !opt->no_query_dedup)) { + const int norm_host = !opt->no_www_dedup; + const int norm_slash = !opt->no_slash_dedup; + const int norm_query = !opt->no_query_dedup; char BIGSTK n_adr[HTS_URLMAXSIZE * 2], n_fil[HTS_URLMAXSIZE * 2]; char BIGSTK pn_adr[HTS_URLMAXSIZE * 2], pn_fil[HTS_URLMAXSIZE * 2]; - n_adr[0] = n_fil[0] = '\0'; - (void) adr_normalized_sized(moved->adr, n_adr, sizeof(n_adr)); - (void) fil_normalized(moved->fil, n_fil); - (void) adr_normalized_sized(urladr(), pn_adr, sizeof(pn_adr)); - (void) fil_normalized(urlfil(), pn_fil); + strlcpybuff(n_adr, + norm_host ? jump_normalized_const(moved->adr) + : jump_identification_const(moved->adr), + sizeof(n_adr)); + strlcpybuff(pn_adr, + norm_host ? jump_normalized_const(urladr()) + : jump_identification_const(urladr()), + sizeof(pn_adr)); + fil_normalized_filtered_ex(moved->fil, n_fil, NULL, norm_slash, + norm_query); + fil_normalized_filtered_ex(urlfil(), pn_fil, NULL, norm_slash, + norm_query); if (strcasecmp(n_adr, pn_adr) == 0 && strcasecmp(n_fil, pn_fil) == 0) { hts_log_print(opt, LOG_WARNING, diff --git a/src/htsselftest.c b/src/htsselftest.c index 8c194c9e..95bc723f 100644 --- a/src/htsselftest.c +++ b/src/htsselftest.c @@ -1172,6 +1172,53 @@ static int st_stripquery(httrackp *opt, int argc, char **argv) { return 0; } +/* -%u url-hack split (#271): each sub-flag must toggle independently. 
*/ +static int st_urlhack(httrackp *opt, int argc, char **argv) { + (void) argc; + (void) argv; +#define EQ(aa, fa, ab, fb) hash_url_equals(opt, aa, fa, ab, fb) + /* urlhack on, no opt-outs: www, // and query order all collapse */ + opt->urlhack = HTS_TRUE; + opt->no_www_dedup = opt->no_slash_dedup = opt->no_query_dedup = HTS_FALSE; + assertf(EQ("www.foo.com", "/a", "foo.com", "/a")); + assertf(EQ("foo.com", "/a//b", "foo.com", "/a/b")); + assertf(EQ("foo.com", "/p?b=2&a=1", "foo.com", "/p?a=1&b=2")); + + /* keep-www-prefix: host off; // and query still collapse */ + opt->no_www_dedup = HTS_TRUE; + assertf(!EQ("www.foo.com", "/a", "foo.com", "/a")); + assertf(EQ("foo.com", "/a//b", "foo.com", "/a/b")); + assertf(EQ("foo.com", "/p?b=2&a=1", "foo.com", "/p?a=1&b=2")); + opt->no_www_dedup = HTS_FALSE; + + /* keep-double-slashes: // significant; www, query order still collapse */ + opt->no_slash_dedup = HTS_TRUE; + assertf(!EQ("foo.com", "/a//b", "foo.com", "/a/b")); + assertf(EQ("www.foo.com", "/a", "foo.com", "/a")); + assertf(EQ("foo.com", "/p?b=2&a=1", "foo.com", "/p?a=1&b=2")); + opt->no_slash_dedup = HTS_FALSE; + + /* keep-query-order: query order significant; www and // still collapse */ + opt->no_query_dedup = HTS_TRUE; + assertf(!EQ("foo.com", "/p?b=2&a=1", "foo.com", "/p?a=1&b=2")); + assertf(EQ("www.foo.com", "/a", "foo.com", "/a")); + assertf(EQ("foo.com", "/a//b", "foo.com", "/a/b")); + opt->no_query_dedup = HTS_FALSE; + + /* all opt-outs == urlhack off entirely */ + opt->no_www_dedup = opt->no_slash_dedup = opt->no_query_dedup = HTS_TRUE; + assertf(!EQ("www.foo.com", "/a", "foo.com", "/a")); + assertf(!EQ("foo.com", "/a//b", "foo.com", "/a/b")); + assertf(!EQ("foo.com", "/p?b=2&a=1", "foo.com", "/p?a=1&b=2")); + opt->urlhack = HTS_FALSE; + opt->no_www_dedup = opt->no_slash_dedup = opt->no_query_dedup = HTS_FALSE; + assertf(!EQ("www.foo.com", "/a", "foo.com", "/a")); + assertf(!EQ("foo.com", "/a//b", "foo.com", "/a/b")); +#undef EQ + printf("urlhack 
self-test OK\n"); + return 0; +} + /* ------------------------------------------------------------ */ /* Registry: name -> handler, with a usage hint and a one-line description. */ /* ------------------------------------------------------------ */ @@ -1190,6 +1237,8 @@ static const struct selftest_entry { {"simplify", "", "collapse ./ and ../ in a path", st_simplify}, {"stripquery", "", "--strip-query pattern/key stripping self-test", st_stripquery}, + {"urlhack", "", "-%u url-hack sub-flag (www/slash/query) self-test", + st_urlhack}, {"mime", "", "MIME type for a filename", st_mime}, {"charset", " ", "convert a string to UTF-8 from a charset", st_charset}, diff --git a/tests/01_engine-urlhack.test b/tests/01_engine-urlhack.test new file mode 100644 index 00000000..3050f415 --- /dev/null +++ b/tests/01_engine-urlhack.test @@ -0,0 +1,8 @@ +#!/bin/bash +# + +set -euo pipefail + +# -%u url-hack split (#271): www / // / query-order dedup toggle independently. +# All assertions live in the engine self-test (hash compare flag resolution). 
+httrack -O /dev/null -#test=urlhack run | grep -q "urlhack self-test OK" diff --git a/tests/26_local-strip-query.test b/tests/26_local-strip-query.test index 02a8eb3e..7bdf4542 100755 --- a/tests/26_local-strip-query.test +++ b/tests/26_local-strip-query.test @@ -16,3 +16,8 @@ bash "$top_srcdir/tests/local-crawl.sh" --errors 0 --files 2 \ # control: no stripping -> both query-named variants are saved bash "$top_srcdir/tests/local-crawl.sh" --errors 0 --files 3 \ httrack 'BASEURL/stripquery/index.html' + +# strip still applies with url-hack off (-%u0): exercises the urlhack-off +# savename branch, which must normalize the dedup key the same way the hash does +bash "$top_srcdir/tests/local-crawl.sh" --errors 0 --files 2 \ + httrack 'BASEURL/stripquery/index.html' -%u0 --strip-query 'utm_source' diff --git a/tests/Makefile.am b/tests/Makefile.am index 9d7e265c..1fb61608 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -48,6 +48,7 @@ TESTS = \ 01_engine-simplify.test \ 01_engine-stripquery.test \ 01_engine-strsafe.test \ + 01_engine-urlhack.test \ 02_manpage-regen.test \ 02_update-cache.test \ 10_crawl-simple.test \