diff --git a/src/htsparse.c b/src/htsparse.c
index 4ec35245..2dfed2c2 100644
--- a/src/htsparse.c
+++ b/src/htsparse.c
@@ -302,6 +302,14 @@ static HTS_INLINE char html_prevc(const char *html, const char *start) {
   return html > start ? html[-1] : ' ';
 }
 
+/* Truncate a redirect Location at its first '#'. The fragment is an anchor for
+ * the user agent and is never part of the resource that gets fetched (#204). */
+static void url_drop_fragment(char *const url) {
+  char *const hash = strchr(url, '#');
+  if (hash)
+    hash[0] = '\0';
+}
+
 /* True if [s, s+len) is exactly an HTTP method token (XHR.open's first
    argument is a method, not a URL: #218). Case-insensitive. */
 static int is_http_method(const char *s, size_t len) {
@@ -3596,6 +3604,7 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
 
         //
         strcpybuff(mov_url, r->location);
+        url_drop_fragment(mov_url);
 
         // url qque -> adresse+fichier
         if ((reponse =
@@ -4803,6 +4812,7 @@ int hts_wait_delayed(htsmoduleStruct * str, lien_adrfilsave *afs,
 
           mov_url[0] = '\0';
           strcpybuff(mov_url, back[b].r.location);  // copier URL
+          url_drop_fragment(mov_url);
 
           /* Remove (temporarily created) file if it was created */
           UNLINK(fconv(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), back[b].url_sav));
diff --git a/tests/29_local-redirect-fragment.test b/tests/29_local-redirect-fragment.test
new file mode 100755
index 00000000..6b046167
--- /dev/null
+++ b/tests/29_local-redirect-fragment.test
@@ -0,0 +1,11 @@
+#!/bin/bash
+# Issue #204: a 302 Location with a #fragment must drop the fragment before the
+# target is fetched. The server is strict (400 on a '#' in the request-target),
+# so a leaked fragment logs an error and the target is never saved.
+set -e
+
+: "${top_srcdir:=..}"
+
+bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
+    --found 'redir/target.html' \
+    httrack 'BASEURL/redir/index.html'
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 380c53e9..1c06ddbb 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -75,6 +75,7 @@ TESTS = \
 	25_local-mime-exclude.test \
 	26_local-strip-query.test \
 	27_local-cookies-file.test \
-	28_local-pause.test
+	28_local-pause.test \
+	29_local-redirect-fragment.test
 
 CLEANFILES = check-network_sh.cache
diff --git a/tests/local-server.py b/tests/local-server.py
index a5872140..92caf202 100755
--- a/tests/local-server.py
+++ b/tests/local-server.py
@@ -354,6 +354,21 @@ def route_size_oversize(self):
         if self.command != "HEAD":
             self.wfile.write(body)
 
+    # /redir/index.html links to go.php, which answers 302 with a Location that
+    # carries a #fragment (#204). The fragment is a UA anchor and must be dropped
+    # before the fetch; a leaked '#' is answered 400 by reject_fragment().
+    def route_redir_index(self):
+        self.send_html('\t<a href="go.php">go</a>')
+
+    def route_redir_go(self):
+        self.send_response(302, "Found")
+        self.send_header("Location", "target.html#section")
+        self.send_header("Content-Length", "0")
+        self.end_headers()
+
+    def route_redir_target(self):
+        self.send_raw(b"redirect target\n", "text/html")
+
     ROUTES = {
         "/cookies/entrance.php": route_entrance,
         "/cookies/second.php": route_second,
@@ -391,10 +406,23 @@ def route_size_oversize(self):
         "/mimex/index.html": route_mimex_index,
         "/mimex/blob.pdf": route_mimex_blob,
         "/mimex/real.html": route_mimex_real,
+        "/redir/index.html": route_redir_index,
+        "/redir/go.php": route_redir_go,
+        "/redir/target.html": route_redir_target,
     }
 
     # --- dispatch ----------------------------------------------------------
 
+    def reject_fragment(self):
+        # Strict server: a '#' in the request-target is the client failing to
+        # drop a fragment (#204). RFC 3986 forbids it on the wire; answer 400.
+        if "#" in self.path:
+            self.send_response(400, "Bad Request")
+            self.send_header("Content-Length", "0")
+            self.end_headers()
+            return True
+        return False
+
     def dispatch(self):
         self._set_cookies = []
         path = urlsplit(self.path).path
@@ -406,10 +434,14 @@ def dispatch(self):
         return False
 
     def do_GET(self):
+        if self.reject_fragment():
+            return
         if not self.dispatch():
             super().do_GET()
 
     def do_HEAD(self):
+        if self.reject_fragment():
+            return
         if not self.dispatch():
             super().do_HEAD()
 