From c40cec16270027151f0bfa3c4062e0408466c5c4 Mon Sep 17 00:00:00 2001 From: Xavier Roche Date: Sun, 28 Jun 2026 13:19:53 +0200 Subject: [PATCH 1/2] Strip the #fragment from a redirect Location before fetching (#204) A 302/30x Location is dereferenced, not displayed, so its #fragment is a client-side anchor that must be dropped before the target is requested. httrack kept it: the redirect followers copied r.location verbatim, so the re-request carried `GET /page.html#frag` (strict servers answer 400) and the mirror was saved under a fragment-polluted name. HTML links were already stripped at parse time; only the two Location followers were not. Drop the fragment in a small helper called at both follow sites, covering the live and cached-redirect paths. Co-Authored-By: Claude Opus 4.8 Signed-off-by: Xavier Roche --- src/htsparse.c | 10 ++++++++++ tests/29_local-redirect-fragment.test | 12 ++++++++++++ tests/Makefile.am | 3 ++- tests/local-server.py | 17 +++++++++++++++++ 4 files changed, 41 insertions(+), 1 deletion(-) create mode 100755 tests/29_local-redirect-fragment.test diff --git a/src/htsparse.c b/src/htsparse.c index 4ec35245..2dfed2c2 100644 --- a/src/htsparse.c +++ b/src/htsparse.c @@ -302,6 +302,14 @@ static HTS_INLINE char html_prevc(const char *html, const char *start) { return html > start ? html[-1] : ' '; } +/* Drop a redirect Location's #fragment: a UA anchor, never part of the fetched + * resource (#204). */ +static void url_drop_fragment(char *const url) { + char *const frag = strchr(url, '#'); + if (frag != NULL) + *frag = '\0'; +} + /* True if [s, s+len) is exactly an HTTP method token (XHR.open's first argument is a method, not a URL: #218). Case-insensitive. 
*/ static int is_http_method(const char *s, size_t len) { @@ -3596,6 +3604,7 @@ int hts_mirror_check_moved(htsmoduleStruct * str, // strcpybuff(mov_url, r->location); + url_drop_fragment(mov_url); // url qque -> adresse+fichier if ((reponse = @@ -4803,6 +4812,7 @@ int hts_wait_delayed(htsmoduleStruct * str, lien_adrfilsave *afs, mov_url[0] = '\0'; strcpybuff(mov_url, back[b].r.location); // copier URL + url_drop_fragment(mov_url); /* Remove (temporarily created) file if it was created */ UNLINK(fconv(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), back[b].url_sav)); diff --git a/tests/29_local-redirect-fragment.test b/tests/29_local-redirect-fragment.test new file mode 100755 index 00000000..c0580573 --- /dev/null +++ b/tests/29_local-redirect-fragment.test @@ -0,0 +1,12 @@ +#!/bin/bash +# Issue #204: a 302 Location with a #fragment must drop the fragment before the +# target is fetched; else the request-target carries '#' (strict servers 400) +# and the mirror lands under a fragment-polluted name. +set -e + +: "${top_srcdir:=..}" + +bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \ + --found 'redir/target.html' \ + --not-found 'redir/target.html#section.html' \ + httrack 'BASEURL/redir/index.html' diff --git a/tests/Makefile.am b/tests/Makefile.am index 380c53e9..1c06ddbb 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -75,6 +75,7 @@ TESTS = \ 25_local-mime-exclude.test \ 26_local-strip-query.test \ 27_local-cookies-file.test \ - 28_local-pause.test + 28_local-pause.test \ + 29_local-redirect-fragment.test CLEANFILES = check-network_sh.cache diff --git a/tests/local-server.py b/tests/local-server.py index a5872140..de886c1b 100755 --- a/tests/local-server.py +++ b/tests/local-server.py @@ -354,6 +354,20 @@ def route_size_oversize(self): if self.command != "HEAD": self.wfile.write(body) + # 302 whose Location carries a #fragment (#204): the fragment is a UA anchor + # and must be dropped before the target is fetched/saved. 
+ def route_redir_index(self): + self.send_html('\t<a href="go.php">go</a>') + + def route_redir_go(self): + self.send_response(302, "Found") + self.send_header("Location", "target.html#section") + self.send_header("Content-Length", "0") + self.end_headers() + + def route_redir_target(self): + self.send_raw(b"redirect target\n", "text/html") + ROUTES = { "/cookies/entrance.php": route_entrance, "/cookies/second.php": route_second, @@ -391,6 +405,9 @@ def route_size_oversize(self): "/mimex/index.html": route_mimex_index, "/mimex/blob.pdf": route_mimex_blob, "/mimex/real.html": route_mimex_real, + "/redir/index.html": route_redir_index, + "/redir/go.php": route_redir_go, + "/redir/target.html": route_redir_target, } # --- dispatch ---------------------------------------------------------- From 084d2009edafc4033f8ee14c0671150b8ceda097 Mon Sep 17 00:00:00 2001 From: Xavier Roche Date: Sun, 28 Jun 2026 13:50:39 +0200 Subject: [PATCH 2/2] test(#204): strict-server guard so a leaked fragment is a wire-level failure The first cut of 29_local-redirect-fragment only checked the saved filename. Python's urlsplit() drops the fragment before routing, so a `#` leaked into the GET line still routed to the target and the crawl passed: the assertion was a proxy, not the wire behavior the fix targets. Make the server strict (400 on any `#` in the request-target, like the real servers in #204), so a leaked fragment now logs an error and the target is never saved. Neutering the fix makes the test fail with the exact "400 Bad Request" from the issue. 
Co-Authored-By: Claude Opus 4.8 Signed-off-by: Xavier Roche --- tests/29_local-redirect-fragment.test | 5 ++--- tests/local-server.py | 17 ++++++++++++++++- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/tests/29_local-redirect-fragment.test b/tests/29_local-redirect-fragment.test index c0580573..6b046167 100755 --- a/tests/29_local-redirect-fragment.test +++ b/tests/29_local-redirect-fragment.test @@ -1,12 +1,11 @@ #!/bin/bash # Issue #204: a 302 Location with a #fragment must drop the fragment before the -# target is fetched; else the request-target carries '#' (strict servers 400) -# and the mirror lands under a fragment-polluted name. +# target is fetched. The server is strict (400 on a '#' in the request-target), +# so a leaked fragment logs an error and the target is never saved. set -e : "${top_srcdir:=..}" bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \ --found 'redir/target.html' \ - --not-found 'redir/target.html#section.html' \ httrack 'BASEURL/redir/index.html' diff --git a/tests/local-server.py b/tests/local-server.py index de886c1b..92caf202 100755 --- a/tests/local-server.py +++ b/tests/local-server.py @@ -355,7 +355,8 @@ def route_size_oversize(self): self.wfile.write(body) # 302 whose Location carries a #fragment (#204): the fragment is a UA anchor - # and must be dropped before the target is fetched/saved. + # that must be dropped before the target is fetched. A leaked '#' reaches the + # strict-server guard below and 400s. def route_redir_index(self): self.send_html('\t<a href="go.php">go</a>') @@ -412,6 +413,16 @@ def route_redir_target(self): # --- dispatch ---------------------------------------------------------- + def reject_fragment(self): + # Strict server: a '#' in the request-target is the client failing to + # drop a fragment (#204). RFC 3986 forbids it on the wire; answer 400. 
+ if "#" in self.path: + self.send_response(400, "Bad Request") + self.send_header("Content-Length", "0") + self.end_headers() + return True + return False + def dispatch(self): self._set_cookies = [] path = urlsplit(self.path).path @@ -423,10 +434,14 @@ def dispatch(self): return False def do_GET(self): + if self.reject_fragment(): + return if not self.dispatch(): super().do_GET() def do_HEAD(self): + if self.reject_fragment(): + return if not self.dispatch(): super().do_HEAD()