diff --git a/man/httrack.1 b/man/httrack.1
index 9615ddb0..6e7cd1ad 100644
--- a/man/httrack.1
+++ b/man/httrack.1
@@ -49,6 +49,7 @@ httrack \- offline browser : copy websites to a local directory
[ \fB\-%p, \-\-preserve\fR ]
[ \fB\-%T, \-\-utf8\-conversion\fR ]
[ \fB\-bN, \-\-cookies[=N]\fR ]
+[ \fB\-%K, \-\-cookies\-file\fR ]
[ \fB\-u, \-\-check\-type[=N]\fR ]
[ \fB\-j, \-\-parse\-java[=N]\fR ]
[ \fB\-sN, \-\-robots[=N]\fR ]
@@ -212,6 +213,8 @@ links conversion to UTF\-8 (\-\-utf8\-conversion)
.SS Spider options:
.IP \-bN
accept cookies in cookies.txt (0=do not accept,* 1=accept) (\-\-cookies[=N])
+.IP \-%K
+load extra cookies from a Netscape cookies.txt (\-\-cookies\-file <param>)
.IP \-u
check document type if unknown (cgi,asp..) (u0 don't check, * u1 check but /, u2 check always) (\-\-check\-type[=N])
.IP \-j
diff --git a/src/htsalias.c b/src/htsalias.c
index f5cdc306..7359f2f7 100644
--- a/src/htsalias.c
+++ b/src/htsalias.c
@@ -112,6 +112,8 @@ const char *hts_optalias[][4] = {
{"include-query-string", "-%q", "single", ""},
{"strip-query", "-%g", "param1",
"strip [host/pattern=]key1,key2,... from URLs"},
+ {"cookies-file", "-%K", "param1",
+ "load extra cookies from a Netscape cookies.txt"},
{"generate-errors", "-o", "single", ""},
{"do-not-generate-errors", "-o0", "single", ""},
{"purge-old", "-X", "param", ""},
diff --git a/src/htscore.c b/src/htscore.c
index 4152ebda..0d25d327 100644
--- a/src/htscore.c
+++ b/src/htscore.c
@@ -523,9 +523,12 @@ int httpmirror(char *url1, httrackp * opt) {
opt->cookie = &cookie;
cookie.max_len = 30000; // max len
strcpybuff(cookie.data, "");
- // Charger cookies.txt par défaut ou cookies.txt du miroir
+ // Load the mirror's cookies.txt, then the one in the current directory
cookie_load(opt->cookie, StringBuff(opt->path_log), "cookies.txt");
cookie_load(opt->cookie, "", "cookies.txt");
+ // A user-supplied cookie file is merged last so it wins on conflicts
+ if (strnotempty(StringBuff(opt->cookies_file)))
+ cookie_load(opt->cookie, "", StringBuff(opt->cookies_file));
} else
opt->cookie = NULL;
@@ -3742,6 +3745,9 @@ HTSEXT_API int copy_htsopt(const httrackp * from, httrackp * to) {
if (StringNotEmpty(from->strip_query))
StringCopyS(to->strip_query, from->strip_query);
+ if (StringNotEmpty(from->cookies_file))
+ StringCopyS(to->cookies_file, from->cookies_file);
+
if (from->retry > -1)
to->retry = from->retry;
diff --git a/src/htscoremain.c b/src/htscoremain.c
index eb1c3fe1..5de6fde1 100644
--- a/src/htscoremain.c
+++ b/src/htscoremain.c
@@ -1976,6 +1976,24 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
StringCat(opt->strip_query, argv[na]);
}
break;
+ case 'K': // cookies-file: extra Netscape cookies.txt to preload
+ if ((na + 1 >= argc) || (argv[na + 1][0] == '-')) {
+ HTS_PANIC_PRINTF(
+ "Option cookies-file needs a blank space and "
+ "a cookies.txt path");
+ printf("Example: --cookies-file \"/home/me/cookies.txt\"\n");
+ htsmain_free();
+ return -1;
+ } else {
+ na++;
+ if (strlen(argv[na]) >= 1024) {
+ HTS_PANIC_PRINTF("Cookie file path too long");
+ htsmain_free();
+ return -1;
+ }
+ StringCopy(opt->cookies_file, argv[na]);
+ }
+ break;
case 't': /* do not change type (ending) of filenames according to the MIME type */
opt->no_type_change = 1;
if (*(com+1)=='0') { opt->no_type_change = 0; com++; }
diff --git a/src/htshelp.c b/src/htshelp.c
index 103b53c8..8e17a4b2 100644
--- a/src/htshelp.c
+++ b/src/htshelp.c
@@ -572,6 +572,7 @@ void help(const char *app, int more) {
infomsg("");
infomsg("Spider options:");
infomsg(" bN accept cookies in cookies.txt (0=do not accept,* 1=accept)");
+ infomsg(" %K load extra cookies from a Netscape cookies.txt");
infomsg
(" u check document type if unknown (cgi,asp..) (u0 don't check, * u1 check but /, u2 check always)");
infomsg
diff --git a/src/htslib.c b/src/htslib.c
index d7663a0f..a0a01e76 100644
--- a/src/htslib.c
+++ b/src/htslib.c
@@ -6045,6 +6045,7 @@ HTSEXT_API httrackp *hts_create_opt(void) {
opt->no_query_dedup = HTS_FALSE;
StringCopy(opt->footer, HTS_DEFAULT_FOOTER);
StringCopy(opt->strip_query, "");
+ StringCopy(opt->cookies_file, "");
opt->ftp_proxy = HTS_TRUE;
opt->convert_utf8 = HTS_TRUE;
StringCopy(opt->filelist, "");
@@ -6190,6 +6191,7 @@ HTSEXT_API void hts_free_opt(httrackp * opt) {
StringFree(opt->footer);
StringFree(opt->mod_blacklist);
StringFree(opt->strip_query);
+ StringFree(opt->cookies_file);
StringFree(opt->path_html);
StringFree(opt->path_html_utf8);
diff --git a/src/htsopt.h b/src/htsopt.h
index 4dccd17f..3007b0de 100644
--- a/src/htsopt.h
+++ b/src/htsopt.h
@@ -535,6 +535,8 @@ struct httrackp {
no_www_dedup; /**< with urlhack, keep www.host distinct from host */
hts_boolean no_slash_dedup; /**< with urlhack, keep redundant // in paths */
hts_boolean no_query_dedup; /**< with urlhack, keep query-argument order */
+ String cookies_file; /**< extra Netscape cookies.txt to preload
+ (--cookies-file) */
};
/* Running statistics for a mirror. */
diff --git a/src/htsselftest.c b/src/htsselftest.c
index 95bc723f..cf833fa6 100644
--- a/src/htsselftest.c
+++ b/src/htsselftest.c
@@ -899,6 +899,19 @@ static int st_copyopt(httrackp *opt, int argc, char **argv) {
if (to->parseall != HTS_TRUE)
err = 1;
+ /* String field: a non-empty source deep-copies across, an empty source
+ leaves the target intact (StringNotEmpty guard). Covers the exported
+ copy_htsopt String path that no crawl test reaches. */
+ StringCopy(from->cookies_file, "/tmp/jar.txt");
+ StringCopy(to->cookies_file, "");
+ copy_htsopt(from, to);
+ if (strcmp(StringBuff(to->cookies_file), "/tmp/jar.txt") != 0)
+ err = 1;
+ StringCopy(from->cookies_file, "");
+ copy_htsopt(from, to);
+ if (strcmp(StringBuff(to->cookies_file), "/tmp/jar.txt") != 0)
+ err = 1;
+
hts_free_opt(from);
hts_free_opt(to);
printf("copy-htsopt: %s\n", err ? "FAIL" : "OK");
diff --git a/tests/27_local-cookies-file.test b/tests/27_local-cookies-file.test
new file mode 100644
index 00000000..9bc013b3
--- /dev/null
+++ b/tests/27_local-cookies-file.test
@@ -0,0 +1,22 @@
+#!/bin/bash
+#
+# End-to-end test of --cookies-file (#215): /gated/secret.php needs a cookie
+# that no page ever sets, so it is reachable only when the option preloads it
+# from a Netscape cookies.txt. Locks the CLI->opt->cookie_load->wire plumbing.
+
+set -e
+
+: "${top_srcdir:=..}" # default to the parent directory when unset
+
+# Preloaded cookie: the secret page is served. -o0 means a 500 leaves no file,
+# so --found/--files only hold when the secret is genuinely fetched (200).
+bash "$top_srcdir/tests/local-crawl.sh" --cookie 'session=opensesame' \
+ --errors 0 --files 2 \
+ --found 'gated/index.html' --found 'gated/secret.html' \
+ httrack 'BASEURL/gated/index.php' -o0
+
+# Control: without the cookie the secret 500s; -o0 suppresses the error page,
+# so its absence is real (one error plus a missing file).
+bash "$top_srcdir/tests/local-crawl.sh" --errors 1 \
+ --found 'gated/index.html' --not-found 'gated/secret.html' \
+ httrack 'BASEURL/gated/index.php' -o0
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 1fb61608..20c2ca1e 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -72,6 +72,7 @@ TESTS = \
23_local-errpage.test \
24_local-resume-overlap.test \
25_local-mime-exclude.test \
- 26_local-strip-query.test
+ 26_local-strip-query.test \
+ 27_local-cookies-file.test
CLEANFILES = check-network_sh.cache
diff --git a/tests/local-crawl.sh b/tests/local-crawl.sh
index 0fcb4d80..bb4fcc20 100755
--- a/tests/local-crawl.sh
+++ b/tests/local-crawl.sh
@@ -12,11 +12,14 @@
# the mirror directory name.
#
# Usage:
-# bash local-crawl.sh [--tls] [--root DIR] \
+# bash local-crawl.sh [--tls] [--root DIR] [--cookie NAME=VALUE ...] \
# --errors N --files N --found PATH ... --directory PATH ... \
# --log-found REGEX ... --log-not-found REGEX ... \
# httrack BASEURL/some/path [httrack-args...]
# --log-found/--log-not-found grep (ERE) the crawl's hts-log.txt.
+# --cookie writes a Netscape cookies.txt (scoped to the discovered host:port,
+# which the ephemeral port forces into the cookie domain) and passes it to
+# httrack via --cookies-file, to exercise preloaded cookies.
set -u
@@ -85,6 +88,7 @@ tmpdir=$(mktemp -d "${tmptopdir}/httrack_local.XXXXXX") || die "could not create
# --- parse leading control flags --------------------------------------------
declare -a audit=()
+declare -a cookies=()
scheme=http
pos=0
args=("$@")
@@ -105,6 +109,10 @@ while test "$pos" -lt "$nargs"; do
pos=$((pos + 1))
root="${args[$pos]}"
;;
+ --cookie)
+ pos=$((pos + 1))
+ cookies+=("${args[$pos]}")
+ ;;
--errors | --files)
audit+=("${args[$pos]}" "${args[$((pos + 1))]}")
pos=$((pos + 1))
@@ -158,6 +166,17 @@ while test "$pos" -lt "$nargs"; do
pos=$((pos + 1))
done
+# --- materialize any --cookie entries into a cookies.txt ---------------------
+if test "${#cookies[@]}" -gt 0; then
+ jar="${tmpdir}/cookies.txt"
+ : >"$jar"
+ for spec in "${cookies[@]}"; do
+ printf '127.0.0.1:%s\tTRUE\t/\tFALSE\t1999999999\t%s\t%s\n' \
+ "$port" "${spec%%=*}" "${spec#*=}" >>"$jar"
+ done
+ hts+=(--cookies-file "$jar")
+fi
+
# --- run httrack -------------------------------------------------------------
which httrack >/dev/null || die "could not find httrack"
ver=$(httrack -O /dev/null --version | sed -e 's/HTTrack version //')
diff --git a/tests/local-server.py b/tests/local-server.py
index ae890ae1..a5872140 100755
--- a/tests/local-server.py
+++ b/tests/local-server.py
@@ -110,6 +110,19 @@ def route_third(self):
return self.fail_cookie("badger")
self.send_html("\tThis is a test.")
+ # --cookies-file (#215): the secret page needs a cookie no page ever sets,
+ # so it is reachable only when --cookies-file preloads it.
+ GATE_COOKIE = ("session", "opensesame")
+
+    def route_gated_index(self):  # ungated entry page linking to the secret
+        self.send_html('\tThis is a <a href="secret.php">link</a>')
+
+    def route_gated_secret(self):  # served only when GATE_COOKIE is presented
+        name, value = self.GATE_COOKIE
+        if self.request_cookies().get(name) != value:
+            return self.fail_cookie(name)
+        self.send_html("\tThis is the secret.")
+
def route_robots(self):
body = b"User-agent: *\nDisallow:\n"
self.send_response(200)
@@ -345,6 +358,8 @@ def route_size_oversize(self):
"/cookies/entrance.php": route_entrance,
"/cookies/second.php": route_second,
"/cookies/third.php": route_third,
+ "/gated/index.php": route_gated_index,
+ "/gated/secret.php": route_gated_secret,
"/robots.txt": route_robots,
"/types/index.html": route_types_index,
"/types/control.php": route_types,