From cc35193335e3afe09a7446f5f48383294f91686b Mon Sep 17 00:00:00 2001 From: Xavier Roche Date: Sat, 27 Jun 2026 21:45:01 +0200 Subject: [PATCH] Add --cookies-file to preload a Netscape cookies.txt (#215) Mirroring a site behind a login meant either re-implementing the auth flow or dropping a file literally named cookies.txt into the output or working directory, the only two places the engine looked. This adds a CLI option to point at an arbitrary Netscape/Mozilla cookies.txt, so a session exported from a browser (the "Get cookies.txt" extensions write exactly this format) is replayed on the crawl and authenticated pages come down. The plumbing already existed: cookie_load parses the format into the shared jar and the request path sends every matching cookie. The new opt->cookies_file is loaded last, after the mirror/CWD defaults, so a user-supplied value wins on a name/domain/path conflict. The field is appended at the tail of httrackp, so the exported ABI is unchanged. Cookies key on host[:port], so a bare-domain file matches a normal crawl of a default-port site; only an explicit-port URL needs the port in the cookie domain. Covered by 27_local-cookies-file.test: a gated page that 500s without a cookie no page ever sets, reachable only once the file preloads it (with -o0 so the absence of a 500 error page is meaningful), plus a no-cookie control. The local-crawl harness grows a --cookie helper that writes a port-scoped jar. The copyopt self-test also gains a String round-trip so the exported copy_htsopt path for the new field is covered. 
Closes #215 Co-Authored-By: Claude Opus 4.8 Signed-off-by: Xavier Roche --- man/httrack.1 | 3 +++ src/htsalias.c | 2 ++ src/htscore.c | 8 +++++++- src/htscoremain.c | 18 ++++++++++++++++++ src/htshelp.c | 1 + src/htslib.c | 2 ++ src/htsopt.h | 2 ++ src/htsselftest.c | 13 +++++++++++++ tests/27_local-cookies-file.test | 22 ++++++++++++++++++++++ tests/Makefile.am | 3 ++- tests/local-crawl.sh | 21 ++++++++++++++++++++- tests/local-server.py | 15 +++++++++++++++ 12 files changed, 107 insertions(+), 3 deletions(-) create mode 100644 tests/27_local-cookies-file.test diff --git a/man/httrack.1 b/man/httrack.1 index 9615ddb0..6e7cd1ad 100644 --- a/man/httrack.1 +++ b/man/httrack.1 @@ -49,6 +49,7 @@ httrack \- offline browser : copy websites to a local directory [ \fB\-%p, \-\-preserve\fR ] [ \fB\-%T, \-\-utf8\-conversion\fR ] [ \fB\-bN, \-\-cookies[=N]\fR ] +[ \fB\-%K, \-\-cookies\-file\fR ] [ \fB\-u, \-\-check\-type[=N]\fR ] [ \fB\-j, \-\-parse\-java[=N]\fR ] [ \fB\-sN, \-\-robots[=N]\fR ] @@ -212,6 +213,8 @@ links conversion to UTF\-8 (\-\-utf8\-conversion) .SS Spider options: .IP \-bN accept cookies in cookies.txt (0=do not accept,* 1=accept) (\-\-cookies[=N]) +.IP \-%K +load extra cookies from a Netscape cookies.txt (\-\-cookies\-file <param>) .IP \-u check document type if unknown (cgi,asp..) (u0 don't check, * u1 check but /, u2 check always) (\-\-check\-type[=N]) .IP \-j diff --git a/src/htsalias.c b/src/htsalias.c index f5cdc306..7359f2f7 100644 --- a/src/htsalias.c +++ b/src/htsalias.c @@ -112,6 +112,8 @@ const char *hts_optalias[][4] = { {"include-query-string", "-%q", "single", ""}, {"strip-query", "-%g", "param1", "strip [host/pattern=]key1,key2,... 
from URLs"}, + {"cookies-file", "-%K", "param1", + "load extra cookies from a Netscape cookies.txt"}, {"generate-errors", "-o", "single", ""}, {"do-not-generate-errors", "-o0", "single", ""}, {"purge-old", "-X", "param", ""}, diff --git a/src/htscore.c b/src/htscore.c index 4152ebda..0d25d327 100644 --- a/src/htscore.c +++ b/src/htscore.c @@ -523,9 +523,12 @@ int httpmirror(char *url1, httrackp * opt) { opt->cookie = &cookie; cookie.max_len = 30000; // max len strcpybuff(cookie.data, ""); - // Charger cookies.txt par défaut ou cookies.txt du miroir + // Load the mirror's cookies.txt, then the one in the current directory cookie_load(opt->cookie, StringBuff(opt->path_log), "cookies.txt"); cookie_load(opt->cookie, "", "cookies.txt"); + // A user-supplied cookie file is merged last so it wins on conflicts + if (strnotempty(StringBuff(opt->cookies_file))) + cookie_load(opt->cookie, "", StringBuff(opt->cookies_file)); } else opt->cookie = NULL; @@ -3742,6 +3745,9 @@ HTSEXT_API int copy_htsopt(const httrackp * from, httrackp * to) { if (StringNotEmpty(from->strip_query)) StringCopyS(to->strip_query, from->strip_query); + if (StringNotEmpty(from->cookies_file)) + StringCopyS(to->cookies_file, from->cookies_file); + if (from->retry > -1) to->retry = from->retry; diff --git a/src/htscoremain.c b/src/htscoremain.c index eb1c3fe1..5de6fde1 100644 --- a/src/htscoremain.c +++ b/src/htscoremain.c @@ -1976,6 +1976,24 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) { StringCat(opt->strip_query, argv[na]); } break; + case 'K': // cookies-file: extra Netscape cookies.txt to preload + if ((na + 1 >= argc) || (argv[na + 1][0] == '-')) { + HTS_PANIC_PRINTF( + "Option cookies-file needs a blank space and " + "a cookies.txt path"); + printf("Example: --cookies-file \"/home/me/cookies.txt\"\n"); + htsmain_free(); + return -1; + } else { + na++; + if (strlen(argv[na]) >= 1024) { + HTS_PANIC_PRINTF("Cookie file path too long"); + htsmain_free(); + return -1; + } + 
StringCopy(opt->cookies_file, argv[na]); + } + break; case 't': /* do not change type (ending) of filenames according to the MIME type */ opt->no_type_change = 1; if (*(com+1)=='0') { opt->no_type_change = 0; com++; } diff --git a/src/htshelp.c b/src/htshelp.c index 103b53c8..8e17a4b2 100644 --- a/src/htshelp.c +++ b/src/htshelp.c @@ -572,6 +572,7 @@ void help(const char *app, int more) { infomsg(""); infomsg("Spider options:"); infomsg(" bN accept cookies in cookies.txt (0=do not accept,* 1=accept)"); + infomsg(" %K load extra cookies from a Netscape cookies.txt"); infomsg (" u check document type if unknown (cgi,asp..) (u0 don't check, * u1 check but /, u2 check always)"); infomsg diff --git a/src/htslib.c b/src/htslib.c index d7663a0f..a0a01e76 100644 --- a/src/htslib.c +++ b/src/htslib.c @@ -6045,6 +6045,7 @@ HTSEXT_API httrackp *hts_create_opt(void) { opt->no_query_dedup = HTS_FALSE; StringCopy(opt->footer, HTS_DEFAULT_FOOTER); StringCopy(opt->strip_query, ""); + StringCopy(opt->cookies_file, ""); opt->ftp_proxy = HTS_TRUE; opt->convert_utf8 = HTS_TRUE; StringCopy(opt->filelist, ""); @@ -6190,6 +6191,7 @@ HTSEXT_API void hts_free_opt(httrackp * opt) { StringFree(opt->footer); StringFree(opt->mod_blacklist); StringFree(opt->strip_query); + StringFree(opt->cookies_file); StringFree(opt->path_html); StringFree(opt->path_html_utf8); diff --git a/src/htsopt.h b/src/htsopt.h index 4dccd17f..3007b0de 100644 --- a/src/htsopt.h +++ b/src/htsopt.h @@ -535,6 +535,8 @@ struct httrackp { no_www_dedup; /**< with urlhack, keep www.host distinct from host */ hts_boolean no_slash_dedup; /**< with urlhack, keep redundant // in paths */ hts_boolean no_query_dedup; /**< with urlhack, keep query-argument order */ + String cookies_file; /**< extra Netscape cookies.txt to preload + (--cookies-file) */ }; /* Running statistics for a mirror. 
*/ diff --git a/src/htsselftest.c b/src/htsselftest.c index 95bc723f..cf833fa6 100644 --- a/src/htsselftest.c +++ b/src/htsselftest.c @@ -899,6 +899,19 @@ static int st_copyopt(httrackp *opt, int argc, char **argv) { if (to->parseall != HTS_TRUE) err = 1; + /* String field: a non-empty source deep-copies across, an empty source + leaves the target intact (StringNotEmpty guard). Covers the exported + copy_htsopt String path that no crawl test reaches. */ + StringCopy(from->cookies_file, "/tmp/jar.txt"); + StringCopy(to->cookies_file, ""); + copy_htsopt(from, to); + if (strcmp(StringBuff(to->cookies_file), "/tmp/jar.txt") != 0) + err = 1; + StringCopy(from->cookies_file, ""); + copy_htsopt(from, to); + if (strcmp(StringBuff(to->cookies_file), "/tmp/jar.txt") != 0) + err = 1; + hts_free_opt(from); hts_free_opt(to); printf("copy-htsopt: %s\n", err ? "FAIL" : "OK"); diff --git a/tests/27_local-cookies-file.test b/tests/27_local-cookies-file.test new file mode 100644 index 00000000..9bc013b3 --- /dev/null +++ b/tests/27_local-cookies-file.test @@ -0,0 +1,22 @@ +#!/bin/bash +# +# End-to-end --cookies-file (#215): /gated/secret.php needs a cookie no page +# ever Set-Cookies, so it is reachable only when the option preloads it from a +# Netscape cookies.txt. Locks the CLI->opt->cookie_load->wire plumbing. + +set -e + +: "${top_srcdir:=..}" + +# preloaded cookie -> secret page is served. -o0 means a 500 leaves no file, so +# --found/--files only hold when the secret is genuinely fetched (200). 
+bash "$top_srcdir/tests/local-crawl.sh" --cookie 'session=opensesame' \ + --errors 0 --files 2 \ + --found 'gated/index.html' --found 'gated/secret.html' \ + httrack 'BASEURL/gated/index.php' -o0 + +# control: without the cookie the secret 500s; -o0 suppresses the error page so +# its absence is real (error + missing file) +bash "$top_srcdir/tests/local-crawl.sh" --errors 1 \ + --found 'gated/index.html' --not-found 'gated/secret.html' \ + httrack 'BASEURL/gated/index.php' -o0 diff --git a/tests/Makefile.am b/tests/Makefile.am index 1fb61608..20c2ca1e 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -72,6 +72,7 @@ TESTS = \ 23_local-errpage.test \ 24_local-resume-overlap.test \ 25_local-mime-exclude.test \ - 26_local-strip-query.test + 26_local-strip-query.test \ + 27_local-cookies-file.test CLEANFILES = check-network_sh.cache diff --git a/tests/local-crawl.sh b/tests/local-crawl.sh index 0fcb4d80..bb4fcc20 100755 --- a/tests/local-crawl.sh +++ b/tests/local-crawl.sh @@ -12,11 +12,14 @@ # the mirror directory name. # # Usage: -# bash local-crawl.sh [--tls] [--root DIR] \ +# bash local-crawl.sh [--tls] [--root DIR] [--cookie NAME=VALUE ...] \ # --errors N --files N --found PATH ... --directory PATH ... \ # --log-found REGEX ... --log-not-found REGEX ... \ # httrack BASEURL/some/path [httrack-args...] # --log-found/--log-not-found grep (ERE) the crawl's hts-log.txt. +# --cookie writes a Netscape cookies.txt (scoped to the discovered host:port, +# which the ephemeral port forces into the cookie domain) and passes it to +# httrack via --cookies-file, to exercise preloaded cookies. 
set -u @@ -85,6 +88,7 @@ tmpdir=$(mktemp -d "${tmptopdir}/httrack_local.XXXXXX") || die "could not create # --- parse leading control flags -------------------------------------------- declare -a audit=() +declare -a cookies=() scheme=http pos=0 args=("$@") @@ -105,6 +109,10 @@ while test "$pos" -lt "$nargs"; do pos=$((pos + 1)) root="${args[$pos]}" ;; + --cookie) + pos=$((pos + 1)) + cookies+=("${args[$pos]}") + ;; --errors | --files) audit+=("${args[$pos]}" "${args[$((pos + 1))]}") pos=$((pos + 1)) @@ -158,6 +166,17 @@ while test "$pos" -lt "$nargs"; do pos=$((pos + 1)) done +# --- materialize any --cookie entries into a cookies.txt --------------------- +if test "${#cookies[@]}" -gt 0; then + jar="${tmpdir}/cookies.txt" + : >"$jar" + for spec in "${cookies[@]}"; do + printf '127.0.0.1:%s\tTRUE\t/\tFALSE\t1999999999\t%s\t%s\n' \ + "$port" "${spec%%=*}" "${spec#*=}" >>"$jar" + done + hts+=(--cookies-file "$jar") +fi + # --- run httrack ------------------------------------------------------------- which httrack >/dev/null || die "could not find httrack" ver=$(httrack -O /dev/null --version | sed -e 's/HTTrack version //') diff --git a/tests/local-server.py b/tests/local-server.py index ae890ae1..a5872140 100755 --- a/tests/local-server.py +++ b/tests/local-server.py @@ -110,6 +110,19 @@ def route_third(self): return self.fail_cookie("badger") self.send_html("\tThis is a test.") + # --cookies-file (#215): the secret page needs a cookie no page ever sets, + # so it is reachable only when --cookies-file preloads it. 
+ GATE_COOKIE = ("session", "opensesame") + + def route_gated_index(self): + self.send_html('\tThis is a <a href="secret.php">link</a>') + + def route_gated_secret(self): + name, value = self.GATE_COOKIE + if self.request_cookies().get(name) != value: + return self.fail_cookie(name) + self.send_html("\tThis is the secret.") + def route_robots(self): body = b"User-agent: *\nDisallow:\n" self.send_response(200) @@ -345,6 +358,8 @@ def route_size_oversize(self): "/cookies/entrance.php": route_entrance, "/cookies/second.php": route_second, "/cookies/third.php": route_third, + "/gated/index.php": route_gated_index, + "/gated/secret.php": route_gated_secret, "/robots.txt": route_robots, "/types/index.html": route_types_index, "/types/control.php": route_types,