Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions man/httrack.1
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ httrack \- offline browser : copy websites to a local directory
[ \fB\-%p, \-\-preserve\fR ]
[ \fB\-%T, \-\-utf8\-conversion\fR ]
[ \fB\-bN, \-\-cookies[=N]\fR ]
[ \fB\-%K, \-\-cookies\-file\fR ]
[ \fB\-u, \-\-check\-type[=N]\fR ]
[ \fB\-j, \-\-parse\-java[=N]\fR ]
[ \fB\-sN, \-\-robots[=N]\fR ]
Expand Down Expand Up @@ -212,6 +213,8 @@ links conversion to UTF\-8 (\-\-utf8\-conversion)
.SS Spider options:
.IP \-bN
accept cookies in cookies.txt (0=do not accept,* 1=accept) (\-\-cookies[=N])
.IP \-%K
load extra cookies from a Netscape cookies.txt (\-\-cookies\-file <param>)
.IP \-u
check document type if unknown (cgi,asp..) (u0 don't check, * u1 check but /, u2 check always) (\-\-check\-type[=N])
.IP \-j
Expand Down
2 changes: 2 additions & 0 deletions src/htsalias.c
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,8 @@ const char *hts_optalias[][4] = {
{"include-query-string", "-%q", "single", ""},
{"strip-query", "-%g", "param1",
"strip [host/pattern=]key1,key2,... from URLs"},
{"cookies-file", "-%K", "param1",
"load extra cookies from a Netscape cookies.txt"},
{"generate-errors", "-o", "single", ""},
{"do-not-generate-errors", "-o0", "single", ""},
{"purge-old", "-X", "param", ""},
Expand Down
8 changes: 7 additions & 1 deletion src/htscore.c
Original file line number Diff line number Diff line change
Expand Up @@ -523,9 +523,12 @@ int httpmirror(char *url1, httrackp * opt) {
opt->cookie = &cookie;
cookie.max_len = 30000; // max len
strcpybuff(cookie.data, "");
// Charger cookies.txt par défaut ou cookies.txt du miroir
// Load the mirror's cookies.txt, then the one in the current directory
cookie_load(opt->cookie, StringBuff(opt->path_log), "cookies.txt");
cookie_load(opt->cookie, "", "cookies.txt");
// A user-supplied cookie file is merged last so it wins on conflicts
if (strnotempty(StringBuff(opt->cookies_file)))
cookie_load(opt->cookie, "", StringBuff(opt->cookies_file));
} else
opt->cookie = NULL;

Expand Down Expand Up @@ -3742,6 +3745,9 @@ HTSEXT_API int copy_htsopt(const httrackp * from, httrackp * to) {
if (StringNotEmpty(from->strip_query))
StringCopyS(to->strip_query, from->strip_query);

if (StringNotEmpty(from->cookies_file))
StringCopyS(to->cookies_file, from->cookies_file);

if (from->retry > -1)
to->retry = from->retry;

Expand Down
18 changes: 18 additions & 0 deletions src/htscoremain.c
Original file line number Diff line number Diff line change
Expand Up @@ -1976,6 +1976,24 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
StringCat(opt->strip_query, argv[na]);
}
break;
case 'K': // cookies-file: extra Netscape cookies.txt to preload
if ((na + 1 >= argc) || (argv[na + 1][0] == '-')) {
HTS_PANIC_PRINTF(
"Option cookies-file needs a blank space and "
"a cookies.txt path");
printf("Example: --cookies-file \"/home/me/cookies.txt\"\n");
htsmain_free();
return -1;
} else {
na++;
if (strlen(argv[na]) >= 1024) {
HTS_PANIC_PRINTF("Cookie file path too long");
htsmain_free();
return -1;
}
StringCopy(opt->cookies_file, argv[na]);
}
break;
case 't': /* do not change type (ending) of filenames according to the MIME type */
opt->no_type_change = 1;
if (*(com+1)=='0') { opt->no_type_change = 0; com++; }
Expand Down
1 change: 1 addition & 0 deletions src/htshelp.c
Original file line number Diff line number Diff line change
Expand Up @@ -572,6 +572,7 @@ void help(const char *app, int more) {
infomsg("");
infomsg("Spider options:");
infomsg(" bN accept cookies in cookies.txt (0=do not accept,* 1=accept)");
infomsg(" %K load extra cookies from a Netscape cookies.txt");
infomsg
(" u check document type if unknown (cgi,asp..) (u0 don't check, * u1 check but /, u2 check always)");
infomsg
Expand Down
2 changes: 2 additions & 0 deletions src/htslib.c
Original file line number Diff line number Diff line change
Expand Up @@ -6045,6 +6045,7 @@ HTSEXT_API httrackp *hts_create_opt(void) {
opt->no_query_dedup = HTS_FALSE;
StringCopy(opt->footer, HTS_DEFAULT_FOOTER);
StringCopy(opt->strip_query, "");
StringCopy(opt->cookies_file, "");
opt->ftp_proxy = HTS_TRUE;
opt->convert_utf8 = HTS_TRUE;
StringCopy(opt->filelist, "");
Expand Down Expand Up @@ -6190,6 +6191,7 @@ HTSEXT_API void hts_free_opt(httrackp * opt) {
StringFree(opt->footer);
StringFree(opt->mod_blacklist);
StringFree(opt->strip_query);
StringFree(opt->cookies_file);

StringFree(opt->path_html);
StringFree(opt->path_html_utf8);
Expand Down
2 changes: 2 additions & 0 deletions src/htsopt.h
Original file line number Diff line number Diff line change
Expand Up @@ -535,6 +535,8 @@ struct httrackp {
no_www_dedup; /**< with urlhack, keep www.host distinct from host */
hts_boolean no_slash_dedup; /**< with urlhack, keep redundant // in paths */
hts_boolean no_query_dedup; /**< with urlhack, keep query-argument order */
String cookies_file; /**< extra Netscape cookies.txt to preload
(--cookies-file) */
};

/* Running statistics for a mirror. */
Expand Down
13 changes: 13 additions & 0 deletions src/htsselftest.c
Original file line number Diff line number Diff line change
Expand Up @@ -899,6 +899,19 @@ static int st_copyopt(httrackp *opt, int argc, char **argv) {
if (to->parseall != HTS_TRUE)
err = 1;

/* String field: a non-empty source deep-copies across, an empty source
leaves the target intact (StringNotEmpty guard). Covers the exported
copy_htsopt String path that no crawl test reaches. */
StringCopy(from->cookies_file, "/tmp/jar.txt");
StringCopy(to->cookies_file, "");
copy_htsopt(from, to);
if (strcmp(StringBuff(to->cookies_file), "/tmp/jar.txt") != 0)
err = 1;
StringCopy(from->cookies_file, "");
copy_htsopt(from, to);
if (strcmp(StringBuff(to->cookies_file), "/tmp/jar.txt") != 0)
err = 1;

hts_free_opt(from);
hts_free_opt(to);
printf("copy-htsopt: %s\n", err ? "FAIL" : "OK");
Expand Down
22 changes: 22 additions & 0 deletions tests/27_local-cookies-file.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/bin/bash
#
# End-to-end check of --cookies-file (#215).
#
# /gated/secret.php is only served when the request carries a cookie that no
# page of the test site ever sets, so the crawler can reach it only if the
# option preloaded that cookie from a Netscape cookies.txt. This pins the
# whole CLI -> opt -> cookie_load -> request path.

set -e

: "${top_srcdir:=..}"

# Shared pieces of both crawls.
crawl="$top_srcdir/tests/local-crawl.sh"
entry='BASEURL/gated/index.php'

# Positive case: the cookie is preloaded, so the secret page is served.
# With -o0 a 500 response leaves no file behind, hence --files/--found can
# only be satisfied when the secret was really fetched (200).
bash "$crawl" --cookie 'session=opensesame' \
  --errors 0 --files 2 \
  --found 'gated/index.html' --found 'gated/secret.html' \
  httrack "$entry" -o0

# Control case: no cookie, so the secret page answers 500. -o0 suppresses the
# generated error page, which makes the missing file meaningful (one error
# logged and no gated/secret.html on disk).
bash "$crawl" --errors 1 \
  --found 'gated/index.html' --not-found 'gated/secret.html' \
  httrack "$entry" -o0
3 changes: 2 additions & 1 deletion tests/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ TESTS = \
23_local-errpage.test \
24_local-resume-overlap.test \
25_local-mime-exclude.test \
26_local-strip-query.test
26_local-strip-query.test \
27_local-cookies-file.test

CLEANFILES = check-network_sh.cache
21 changes: 20 additions & 1 deletion tests/local-crawl.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,14 @@
# the mirror directory name.
#
# Usage:
# bash local-crawl.sh [--tls] [--root DIR] \
# bash local-crawl.sh [--tls] [--root DIR] [--cookie NAME=VALUE ...] \
# --errors N --files N --found PATH ... --directory PATH ... \
# --log-found REGEX ... --log-not-found REGEX ... \
# httrack BASEURL/some/path [httrack-args...]
# --log-found/--log-not-found grep (ERE) the crawl's hts-log.txt.
# --cookie writes a Netscape cookies.txt (scoped to the discovered host:port,
# which the ephemeral port forces into the cookie domain) and passes it to
# httrack via --cookies-file, to exercise preloaded cookies.

set -u

Expand Down Expand Up @@ -85,6 +88,7 @@ tmpdir=$(mktemp -d "${tmptopdir}/httrack_local.XXXXXX") || die "could not create

# --- parse leading control flags --------------------------------------------
declare -a audit=()
declare -a cookies=()
scheme=http
pos=0
args=("$@")
Expand All @@ -105,6 +109,10 @@ while test "$pos" -lt "$nargs"; do
pos=$((pos + 1))
root="${args[$pos]}"
;;
--cookie)
pos=$((pos + 1))
cookies+=("${args[$pos]}")
;;
--errors | --files)
audit+=("${args[$pos]}" "${args[$((pos + 1))]}")
pos=$((pos + 1))
Expand Down Expand Up @@ -158,6 +166,17 @@ while test "$pos" -lt "$nargs"; do
pos=$((pos + 1))
done

# --- materialize any --cookie entries into a Netscape cookie jar -------------
# The jar is deliberately NOT named cookies.txt: httrack loads a file of that
# name by default, so a jar carrying the default name could satisfy the crawl
# even if --cookies-file were ignored.
# NOTE(review): this assumes $tmpdir may be one of the directories httrack
# searches by default; the -O/cwd setup is not visible here -- confirm.
if test "${#cookies[@]}" -gt 0; then
  jar="${tmpdir}/preload-cookies.txt"
  : >"$jar"
  for spec in "${cookies[@]}"; do
    # Require NAME=VALUE with a non-empty NAME. Without an "=" both parameter
    # expansions below would return the whole spec, silently writing a cookie
    # whose name and value are identical.
    case "$spec" in
    ?*=*) ;;
    *) die "--cookie expects NAME=VALUE, got '${spec}'" ;;
    esac
    # Tab-separated Netscape fields:
    # domain, include-subdomains, path, secure, expiry, name, value.
    printf '127.0.0.1:%s\tTRUE\t/\tFALSE\t1999999999\t%s\t%s\n' \
      "$port" "${spec%%=*}" "${spec#*=}" >>"$jar"
  done
  hts+=(--cookies-file "$jar")
fi

# --- run httrack -------------------------------------------------------------
which httrack >/dev/null || die "could not find httrack"
ver=$(httrack -O /dev/null --version | sed -e 's/HTTrack version //')
Expand Down
15 changes: 15 additions & 0 deletions tests/local-server.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,19 @@ def route_third(self):
return self.fail_cookie("badger")
self.send_html("\tThis is a test.")

# --cookies-file (#215): the secret page needs a cookie no page ever sets,
# so it is reachable only when --cookies-file preloads it.
GATE_COOKIE = ("session", "opensesame")

def route_gated_index(self):
    """Serve the gated entry page, whose only link points at secret.php."""
    page = '\tThis is a <a href="secret.php">link</a>'
    self.send_html(page)

def route_gated_secret(self):
    """Serve the secret page only to requests carrying the gate cookie.

    The expected (name, value) pair comes from GATE_COOKIE. Any request whose
    cookie is absent or different is handed to fail_cookie() and that call's
    result is returned; a matching request gets the secret page.
    """
    cookie_name, expected = self.GATE_COOKIE
    presented = self.request_cookies().get(cookie_name)
    if presented == expected:
        self.send_html("\tThis is the secret.")
        return None
    return self.fail_cookie(cookie_name)

def route_robots(self):
body = b"User-agent: *\nDisallow:\n"
self.send_response(200)
Expand Down Expand Up @@ -345,6 +358,8 @@ def route_size_oversize(self):
"/cookies/entrance.php": route_entrance,
"/cookies/second.php": route_second,
"/cookies/third.php": route_third,
"/gated/index.php": route_gated_index,
"/gated/secret.php": route_gated_secret,
"/robots.txt": route_robots,
"/types/index.html": route_types_index,
"/types/control.php": route_types,
Expand Down
Loading