diff --git a/man/httrack.1 b/man/httrack.1
index 6e7cd1ad..3ba9b835 100644
--- a/man/httrack.1
+++ b/man/httrack.1
@@ -24,6 +24,7 @@ httrack \- offline browser : copy websites to a local directory
 [ \fB\-EN, \-\-max\-time[=N]\fR ]
 [ \fB\-AN, \-\-max\-rate[=N]\fR ]
 [ \fB\-%cN, \-\-connection\-per\-second[=N]\fR ]
+[ \fB\-%G, \-\-pause\fR ]
 [ \fB\-GN, \-\-max\-pause[=N]\fR ]
 [ \fB\-cN, \-\-sockets[=N]\fR ]
 [ \fB\-TN, \-\-timeout[=N]\fR ]
@@ -155,6 +156,8 @@ maximum mirror time in seconds (60=1 minute, 3600=1 hour) (\-\-max\-time[=N])
 maximum transfer rate in bytes/seconds (1000=1KB/s max) (\-\-max\-rate[=N])
 .IP \-%cN
 maximum number of connections/seconds (*%c10) (\-\-connection\-per\-second[=N])
+.IP \-%G
+random pause of MIN[:MAX] seconds between files (e.g. %G 5:10) (\-\-pause <param>)
 .IP \-GN
 pause transfer if N bytes reached, and wait until lock file is deleted (\-\-max\-pause[=N])
 .SS Flow control:
diff --git a/src/htsalias.c b/src/htsalias.c
index 7359f2f7..7b4811a3 100644
--- a/src/htsalias.c
+++ b/src/htsalias.c
@@ -114,6 +114,8 @@ const char *hts_optalias[][4] = {
    "strip [host/pattern=]key1,key2,... from URLs"},
   {"cookies-file", "-%K", "param1",
    "load extra cookies from a Netscape cookies.txt"},
+  {"pause", "-%G", "param1",
+   "random pause of MIN[:MAX] seconds between files"},
   {"generate-errors", "-o", "single", ""},
   {"do-not-generate-errors", "-o0", "single", ""},
   {"purge-old", "-X", "param", ""},
diff --git a/src/htscore.c b/src/htscore.c
index 0d25d327..c8736363 100644
--- a/src/htscore.c
+++ b/src/htscore.c
@@ -35,6 +35,7 @@ Please visit our Website: http://www.httrack.com
 
 #include
 #include
+#include <stdint.h> /* uint64_t for the pause mixer (already a hard dep via md5.h) */
 
 /* File defs */
 #include "htscore.h"
@@ -3314,6 +3315,21 @@ HTS_INLINE int back_fillmax(struct_back * sback, httrackp * opt,
   return -1;                    /* plus de place */
 }
 
+/* Seed-derived: stable within a gap, rerolls per launch; a per-call rand()
+   would bias the delay toward min_ms (see header). Jitter, not crypto. */
+int hts_pause_target_ms(TStamp seed, int min_ms, int max_ms) {
+  uint64_t z = (uint64_t) seed;
+
+  if (max_ms <= min_ms)
+    return min_ms;
+  /* SplitMix64 finalizer: scrambles the low-entropy ms timestamp. */
+  z += 0x9E3779B97F4A7C15ULL;
+  z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ULL;
+  z = (z ^ (z >> 27)) * 0x94D049BB133111EBULL;
+  z ^= z >> 31;
+  return min_ms + (int) (z % (uint64_t) (max_ms - min_ms + 1));
+}
+
 int back_pluggable_sockets_strict(struct_back * sback, httrackp * opt) {
   int n = opt->maxsoc - back_nsoc(sback);
 
@@ -3334,6 +3350,18 @@ int back_pluggable_sockets_strict(struct_back * sback, httrackp * opt) {
     }
   }
 
+  // #185 randomized inter-file pause: non-blocking, one launch per gap
+  if (n > 0 && opt->pause_max_ms > 0 && HTS_STAT.last_connect > 0) {
+    TStamp opTime =
+      HTS_STAT.last_request ? HTS_STAT.last_request : HTS_STAT.last_connect;
+    TStamp lap = mtime_local() - opTime;
+
+    if (lap < hts_pause_target_ms(opTime, opt->pause_min_ms, opt->pause_max_ms))
+      n = 0;
+    else
+      n = 1;
+  }
+
   return n;
 }
 
@@ -3748,6 +3776,11 @@ HTSEXT_API int copy_htsopt(const httrackp * from, httrackp * to) {
   if (StringNotEmpty(from->cookies_file))
     StringCopyS(to->cookies_file, from->cookies_file);
 
+  if (from->pause_max_ms > 0) {
+    to->pause_min_ms = from->pause_min_ms;
+    to->pause_max_ms = from->pause_max_ms;
+  }
+
   if (from->retry > -1)
     to->retry = from->retry;
 
diff --git a/src/htscore.h b/src/htscore.h
index 2d59f49f..d6caed43 100644
--- a/src/htscore.h
+++ b/src/htscore.h
@@ -418,6 +418,10 @@ int back_pluggable_sockets(struct_back * sback, httrackp * opt);
 
 int back_pluggable_sockets_strict(struct_back * sback, httrackp * opt);
 
+/* Randomized inter-file pause target in [min_ms,max_ms] (#185), derived from a
+   timestamp seed so it is stable within one gap and rerolls per launch. */
+int hts_pause_target_ms(TStamp seed, int min_ms, int max_ms);
+
 /* Schedule more links from the heap into free slots. Returns the number queued,
    or <=0 if none could be added (no free slot / paused / stopped). */
 int back_fill(struct_back * sback, httrackp * opt, cache_back * cache,
diff --git a/src/htscoremain.c b/src/htscoremain.c
index 5de6fde1..c4c2d416 100644
--- a/src/htscoremain.c
+++ b/src/htscoremain.c
@@ -1994,6 +1994,37 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
             StringCopy(opt->cookies_file, argv[na]);
           }
           break;
+        case 'G':              // pause: randomized inter-file delay MIN[:MAX] seconds
+          if ((na + 1 >= argc) || (argv[na + 1][0] == '-')) {
+            HTS_PANIC_PRINTF("Option pause needs a blank space and a "
+                             "delay in seconds (MIN[:MAX])");
+            printf("Example: --pause 5:10\n");
+            htsmain_free();
+            return -1;
+          } else {
+            double pmin = 0, pmax = 0;
+            int nf, used = 0;
+
+            na++;
+            /* %n records how many characters were consumed, so trailing junk
+               ("5:", "5:10x") is refused, and a mistyped separator ("5-10")
+               is no longer silently read as a fixed 5-second delay */
+            nf = sscanf(argv[na], "%lf%n:%lf%n", &pmin, &used, &pmax, &used);
+            if (nf < 2)
+              pmax = pmin;      /* a single value means a fixed delay */
+            /* positive-form bounds: NaN fails every comparison, so this
+               rejects it before the undefined (int)(NaN*1000) cast */
+            if (nf < 1 || argv[na][used] != '\0'
+                || !(pmin >= 0 && pmax >= pmin && pmax <= 86400)) {
+              HTS_PANIC_PRINTF("Invalid --pause range (expected "
+                               "MIN[:MAX] seconds, 0<=MIN<=MAX<=86400)");
+              htsmain_free();
+              return -1;
+            }
+            opt->pause_min_ms = (int) (pmin * 1000.0);
+            opt->pause_max_ms = (int) (pmax * 1000.0);
+          }
+          break;
         case 't':              /* do not change type (ending) of filenames according to the MIME type */
           opt->no_type_change = 1;
           if (*(com+1)=='0') { opt->no_type_change = 0; com++; }
diff --git a/src/htshelp.c b/src/htshelp.c
index 8e17a4b2..6a7ca5e6 100644
--- a/src/htshelp.c
+++ b/src/htshelp.c
@@ -521,6 +521,7 @@ void help(const char *app, int more) {
   infomsg(" EN maximum mirror time in seconds (60=1 minute, 3600=1 hour)");
   infomsg(" AN maximum transfer rate in bytes/seconds (1000=1KB/s max)");
   infomsg(" %cN maximum number of connections/seconds (*%c10)");
+  infomsg(" %G random pause of MIN[:MAX] seconds between files (e.g. %G 5:10)");
   infomsg
     (" GN pause transfer if N bytes reached, and wait until lock file is deleted");
   infomsg("");
diff --git a/src/htslib.c b/src/htslib.c
index a0a01e76..4bea4c43 100644
--- a/src/htslib.c
+++ b/src/htslib.c
@@ -6046,6 +6046,8 @@ HTSEXT_API httrackp *hts_create_opt(void) {
   StringCopy(opt->footer, HTS_DEFAULT_FOOTER);
   StringCopy(opt->strip_query, "");
   StringCopy(opt->cookies_file, "");
+  opt->pause_min_ms = 0;
+  opt->pause_max_ms = 0;
   opt->ftp_proxy = HTS_TRUE;
   opt->convert_utf8 = HTS_TRUE;
   StringCopy(opt->filelist, "");
diff --git a/src/htsopt.h b/src/htsopt.h
index 3007b0de..6d39bb63 100644
--- a/src/htsopt.h
+++ b/src/htsopt.h
@@ -537,6 +537,8 @@ struct httrackp {
   hts_boolean no_query_dedup;   /**< with urlhack, keep query-argument order */
   String cookies_file;          /**< extra Netscape cookies.txt to preload
                                    (--cookies-file) */
+  int pause_min_ms;             /**< inter-file pause lower bound, ms (0=off, #185) */
+  int pause_max_ms;             /**< inter-file pause upper bound, ms */
 };
 
 /* Running statistics for a mirror. */
diff --git a/src/htsselftest.c b/src/htsselftest.c
index cf833fa6..aca1c96b 100644
--- a/src/htsselftest.c
+++ b/src/htsselftest.c
@@ -912,12 +912,58 @@ static int st_copyopt(httrackp *opt, int argc, char **argv) {
   if (strcmp(StringBuff(to->cookies_file), "/tmp/jar.txt") != 0)
     err = 1;
 
+  /* #185 pause pair: copied when enabled (max>0), the 0 sentinel skips */
+  from->pause_min_ms = 5000;
+  from->pause_max_ms = 10000;
+  to->pause_min_ms = to->pause_max_ms = 0;
+  copy_htsopt(from, to);
+  if (to->pause_min_ms != 5000 || to->pause_max_ms != 10000)
+    err = 1;
+  from->pause_min_ms = from->pause_max_ms = 0;
+  copy_htsopt(from, to);
+  if (to->pause_min_ms != 5000 || to->pause_max_ms != 10000)
+    err = 1;
+
   hts_free_opt(from);
   hts_free_opt(to);
   printf("copy-htsopt: %s\n", err ? "FAIL" : "OK");
   return err;
 }
 
+static int st_pause(httrackp *opt, int argc, char **argv) {
+  int err = 0, i, seen_low = 0, seen_high = 0;
+
+  (void) opt;
+  (void) argc;
+  (void) argv;
+  /* Consecutive-ms seeds (production shape: launch timestamps a few ms apart)
+     must stay in range and spread, not collapse to a bound -- worst case for a
+     weak low-bit mixer. */
+  for (i = 0; i < 10000; i++) {
+    int t = hts_pause_target_ms((TStamp) (1719500000000LL + i), 5000, 10000);
+
+    if (t < 5000 || t > 10000)
+      err = 1;
+    seen_low |= (t < 6000);
+    seen_high |= (t > 9000);
+  }
+  if (!seen_low || !seen_high)
+    err = 1;
+  if (hts_pause_target_ms(12345, 8000, 8000) != 8000) /* equal bounds = fixed */
+    err = 1;
+  /* deterministic: a seed yields the same target even after an intervening call
+     with another seed (no global PRNG state to perturb it) */
+  {
+    int a = hts_pause_target_ms(99, 5000, 10000);
+
+    (void) hts_pause_target_ms(54321, 5000, 10000);
+    if (hts_pause_target_ms(99, 5000, 10000) != a)
+      err = 1;
+  }
+  printf("pause: %s\n", err ? "FAIL" : "OK");
+  return err;
+}
+
 static int st_relative(httrackp *opt, int argc, char **argv) {
   char s[HTS_URLMAXSIZE * 2];
 
@@ -1264,6 +1310,7 @@ static const struct selftest_entry {
   {"strsafe", "[overflow|overflow-buff [str]]", "bounded string-op self-test",
    st_strsafe},
   {"copyopt", "", "copy_htsopt option-copy self-test", st_copyopt},
+  {"pause", "", "randomized inter-file pause target self-test", st_pause},
   {"relative", " ", "relative link between two paths",
    st_relative},
   {"resolve", " ", "resolve a link against an origin",
diff --git a/tests/01_engine-cmdline.test b/tests/01_engine-cmdline.test
index 0a507618..549bd97a 100755
--- a/tests/01_engine-cmdline.test
+++ b/tests/01_engine-cmdline.test
@@ -90,4 +90,16 @@ refused "dangling-quote argument not refused cleanly"
 run_only "$tmp/q-lone" '"'
 refused "lone-quote argument not refused cleanly"
 
+# --pause (#185): valid MIN[:MAX] accepted; malformed (incl. trailing junk or a
+# wrong separator), reversed, over-range and non-finite values refused cleanly.
+# NaN compares false to everything, so it must not slip through to the int cast.
+run "$tmp/pause-ok" --pause 0.2:0.4
+accepted "$tmp/pause-ok" "#185: valid --pause range rejected"
+run "$tmp/pause-fix" --pause 0.2
+accepted "$tmp/pause-fix" "#185: valid fixed --pause rejected"
+for bad in nan nan:5 5:nan inf 10:5 99999 5-10 5: 5:10x; do
+  run "$tmp/pause-bad" --pause "$bad"
+  refused "#185: invalid --pause '$bad' not refused cleanly"
+done
+
 exit 0
diff --git a/tests/01_engine-pause.test b/tests/01_engine-pause.test
new file mode 100755
index 00000000..4fb8e9be
--- /dev/null
+++ b/tests/01_engine-pause.test
@@ -0,0 +1,15 @@
+#!/bin/bash
+#
+# --pause (#185): the inter-file pause target must stay in [min,max] and spread
+# across it (a per-call rand() would collapse it toward min). Driven by the
+# in-process 'httrack -#test=pause' test. POSIX-portable ($(BASH) is /bin/sh on macOS).
+
+set -eu
+
+# 'run' is an ignored placeholder argument.
+out=$(httrack -#test=pause run)
+
+test "$out" = "pause: OK" || {
+  echo "expected 'pause: OK', got: $out" >&2
+  exit 1
+}
diff --git a/tests/28_local-pause.test b/tests/28_local-pause.test
new file mode 100755
index 00000000..8505a750
--- /dev/null
+++ b/tests/28_local-pause.test
@@ -0,0 +1,29 @@
+#!/bin/bash
+#
+# --pause (#185): a fixed inter-file delay must slow a multi-file crawl. Measure
+# the same crawl with and without --pause and compare: the harness overhead
+# cancels, leaving only the pause. Integer seconds keep it portable (BSD date
+# has no %N); a lower bound is not timing-flaky since a pause only adds time.
+
+set -e
+
+: "${top_srcdir:=..}"
+
+run() { # echoes the wall-clock seconds of one crawl
+  local t0 t1
+  t0=$(date +%s)
+  bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
+    httrack 'BASEURL/types/index.html' -c1 "$@" >/dev/null 2>&1
+  t1=$(date +%s)
+  echo $((t1 - t0))
+}
+
+base=$(run)
+paused=$(run --pause 0.5)
+delta=$((paused - base))
+
+echo "crawl: ${base}s, with --pause 0.5: ${paused}s (delta ${delta}s)"
+if [ "$delta" -lt 2 ]; then
+  echo "FAIL: --pause did not delay the crawl (delta ${delta}s)" >&2
+  exit 1
+fi
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 20c2ca1e..380c53e9 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -41,6 +41,7 @@ TESTS = \
 	01_engine-idna.test \
 	01_engine-mime.test \
 	01_engine-parse.test \
+	01_engine-pause.test \
 	01_engine-rcfile.test \
 	01_engine-relative.test \
 	01_engine-savename.test \
@@ -73,6 +74,7 @@ TESTS = \
 	24_local-resume-overlap.test \
 	25_local-mime-exclude.test \
 	26_local-strip-query.test \
-	27_local-cookies-file.test
+	27_local-cookies-file.test \
+	28_local-pause.test
 
 CLEANFILES = check-network_sh.cache