Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions man/httrack.1
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,8 @@ tolerant requests (accept bogus responses on some servers, but not standard!) (\
update hacks: various hacks to limit re\-transfers when updating (identical size, bogus response..) (\-\-updatehack)
.IP \-%u
url hacks: various hacks to limit duplicate URLs (strip //, www.foo.com==foo.com..) (\-\-urlhack)
.br
opt out of one url\-hack part: \-\-keep\-www\-prefix (www.foo.com<>foo.com), \-\-keep\-double\-slashes (//), \-\-keep\-query\-order (?b&a)
.IP \-%A
assume that a type (cgi,asp..) is always linked with a mime type (\-%A php3,cgi=text/html;dat,bin=application/x\-zip) (\-\-assume <param>)
.br
Expand Down
3 changes: 3 additions & 0 deletions src/htsalias.c
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,9 @@ const char *hts_optalias[][4] = {
{"tolerant", "-%B", "single", ""},
{"updatehack", "-%s", "single", ""}, {"sizehack", "-%s", "single", ""},
{"urlhack", "-%u", "single", ""},
{"keep-www-prefix", "-%j", "single", ""},
{"keep-double-slashes", "-%o", "single", ""},
{"keep-query-order", "-%y", "single", ""},
{"user-agent", "-F", "param1", "user-agent identity"},
{"referer", "-%R", "param1", "default referer URL"},
{"from", "-%E", "param1", "from email address"},
Expand Down
11 changes: 9 additions & 2 deletions src/htscore.h
Original file line number Diff line number Diff line change
Expand Up @@ -234,8 +234,10 @@ struct hash_struct {
coucal adrfil;
/* former address+path -> link index (renamed/moved entries) */
coucal former_adrfil;
/* scratch buffers reused across lookups (not reentrant) */
int normalized;
/* effective urlhack sub-flags: www.==host / // collapse / query-arg sort */
hts_boolean norm_host;
hts_boolean norm_slash;
hts_boolean norm_query;
/* query-strip keys (not owned); set from opt->strip_query at hash_init */
const char *strip_query;
char normfil[HTS_URLMAXSIZE * 2];
Expand Down Expand Up @@ -371,6 +373,11 @@ char *next_token(char *p, int flag);
char *fil_normalized_filtered(const char *source, char *dest,
const char *strip);

/* As fil_normalized_filtered(), but DO_SLASH/DO_QUERY gate the // collapse and
the query-argument sort independently (the urlhack sub-flags). */
char *fil_normalized_filtered_ex(const char *source, char *dest,
const char *strip, int do_slash, int do_query);

/* For URL ADR/FIL, return (in DEST) the comma keylist to strip from the
'\n'-separated "[pattern=]keys" RULES (patterns matched on host/path via
strjoker, last wins); NULL if none match. Feeds fil_normalized_filtered(). */
Expand Down
24 changes: 24 additions & 0 deletions src/htscoremain.c
Original file line number Diff line number Diff line change
Expand Up @@ -1570,6 +1570,30 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
com++;
}
break; // url hack
case 'j':
opt->no_www_dedup =
HTS_TRUE; // --keep-www-prefix: keep www.X != X
if (*(com + 1) == '0') {
opt->no_www_dedup = HTS_FALSE;
com++;
}
break;
case 'o':
opt->no_slash_dedup =
HTS_TRUE; // --keep-double-slashes: keep //
if (*(com + 1) == '0') {
opt->no_slash_dedup = HTS_FALSE;
com++;
}
break;
case 'y':
opt->no_query_dedup =
HTS_TRUE; // --keep-query-order: keep ?b&a order
if (*(com + 1) == '0') {
opt->no_query_dedup = HTS_FALSE;
com++;
}
break;
case 'v':
opt->verbosedisplay = HTS_VERBOSE_FULL;
if (isdigit((unsigned char) *(com + 1))) {
Expand Down
60 changes: 43 additions & 17 deletions src/htshash.c
Original file line number Diff line number Diff line change
Expand Up @@ -106,10 +106,10 @@ static coucal_hashkeys key_adrfil_hashes_generic(void *arg,
const lien_url*const lien = (const lien_url*) value;
const char *const adr = !former ? lien->adr : lien->former_adr;
const char *const fil = !former ? lien->fil : lien->former_fil;
const char *const adr_norm = adr != NULL ?
( hash->normalized ? jump_normalized_const(adr)
: jump_identification_const(adr) )
: NULL;
const char *const adr_norm =
adr != NULL ? (hash->norm_host ? jump_normalized_const(adr)
: jump_identification_const(adr))
: NULL;

// copy address
assertf(adr_norm != NULL);
Expand All @@ -123,8 +123,9 @@ static coucal_hashkeys key_adrfil_hashes_generic(void *arg,
const char *const keys = hts_query_strip_keys(hash->strip_query, adr, fil,
keybuf, sizeof(keybuf));

if (hash->normalized || keys != NULL) {
fil_normalized_filtered(fil, &hash->normfil[strlen(hash->normfil)], keys);
if (hash->norm_slash || hash->norm_query || keys != NULL) {
fil_normalized_filtered_ex(fil, &hash->normfil[strlen(hash->normfil)],
keys, hash->norm_slash, hash->norm_query);
} else {
strcpy(&hash->normfil[strlen(hash->normfil)], fil);
}
Expand All @@ -139,8 +140,7 @@ static int key_adrfil_equals_generic(void *arg,
coucal_key_const a_,
coucal_key_const b_,
const int former) {
hash_struct *const hash = (hash_struct*) arg;
const int normalized = hash->normalized;
hash_struct *const hash = (hash_struct *) arg;
const lien_url*const a = (const lien_url*) a_;
const lien_url*const b = (const lien_url*) b_;
const char *const a_adr = !former ? a->adr : a->former_adr;
Expand All @@ -157,10 +157,10 @@ static int key_adrfil_equals_generic(void *arg,
assertf(b_fil != NULL);

// skip scheme and authentication to the domain (possibly without www.)
ja = normalized
? jump_normalized_const(a_adr) : jump_identification_const(a_adr);
jb = normalized
? jump_normalized_const(b_adr) : jump_identification_const(b_adr);
ja = hash->norm_host ? jump_normalized_const(a_adr)
: jump_identification_const(a_adr);
jb = hash->norm_host ? jump_normalized_const(b_adr)
: jump_identification_const(b_adr);
assertf(ja != NULL);
assertf(jb != NULL);
if (strcasecmp(ja, jb) != 0) {
Expand All @@ -175,9 +175,12 @@ static int key_adrfil_equals_generic(void *arg,
const char *const keysb =
hts_query_strip_keys(hash->strip_query, b_adr, b_fil, kb, sizeof(kb));

if (normalized || keysa != NULL || keysb != NULL) {
fil_normalized_filtered(a_fil, hash->normfil, keysa);
fil_normalized_filtered(b_fil, hash->normfil2, keysb);
if (hash->norm_slash || hash->norm_query || keysa != NULL ||
keysb != NULL) {
fil_normalized_filtered_ex(a_fil, hash->normfil, keysa, hash->norm_slash,
hash->norm_query);
fil_normalized_filtered_ex(b_fil, hash->normfil2, keysb, hash->norm_slash,
hash->norm_query);
return strcmp(hash->normfil, hash->normfil2) == 0;
} else {
return strcmp(a_fil, b_fil) == 0;
Expand Down Expand Up @@ -237,11 +240,14 @@ static int key_former_adrfil_equals(void *arg,
return key_adrfil_equals_generic(arg, a, b, 1);
}

void hash_init(httrackp *opt, hash_struct * hash, int normalized) {
void hash_init(httrackp *opt, hash_struct *hash, hts_boolean normalized) {
hash->sav = coucal_new(0);
hash->adrfil = coucal_new(0);
hash->former_adrfil = coucal_new(0);
hash->normalized = normalized;
/* urlhack is the umbrella; per-feature negatives opt out of each part */
hash->norm_host = normalized && !opt->no_www_dedup;
hash->norm_slash = normalized && !opt->no_slash_dedup;
hash->norm_query = normalized && !opt->no_query_dedup;
/* snapshot the query-strip list (not owned; valid for the hash lifetime) */
hash->strip_query =
StringNotEmpty(opt->strip_query) ? StringBuff(opt->strip_query) : NULL;
Expand Down Expand Up @@ -300,6 +306,26 @@ void hash_free(hash_struct *hash) {
}
}

/* Test helper: tell whether URL A (ADRA + FILA) and URL B (ADRB + FILB) are
   treated as the same link by the dedup hash under OPT's urlhack settings.
   A throwaway hash is set up with hash_init() so the comparison uses the same
   norm_host/norm_slash/norm_query flags that hash_init() derives from OPT.
   Returns the result of key_adrfil_equals() on the two URLs. */
hts_boolean hash_url_equals(httrackp *opt, const char *adra, const char *fila,
                            const char *adrb, const char *filb) {
  hash_struct scratch;
  lien_url first, second;
  hts_boolean same;

  /* first link: everything zeroed except address and path */
  memset(&first, 0, sizeof(first));
  first.adr = key_duphandler(NULL, adra);
  first.fil = key_duphandler(NULL, fila);

  /* second link: same layout */
  memset(&second, 0, sizeof(second));
  second.adr = key_duphandler(NULL, adrb);
  second.fil = key_duphandler(NULL, filb);

  /* compare through a temporary hash configured from opt, then release it */
  hash_init(opt, &scratch, opt->urlhack);
  same = key_adrfil_equals(&scratch, &first, &second);
  hash_free(&scratch);

  return same;
}

// returns: position, or -1 if not found
int hash_read(const hash_struct * hash, const char *nom1, const char *nom2,
hash_struct_type type) {
Expand Down
6 changes: 5 additions & 1 deletion src/htshash.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,12 @@ typedef enum hash_struct_type {
} hash_struct_type;

// hash tables
void hash_init(httrackp *opt, hash_struct *hash, int normalized);
void hash_init(httrackp *opt, hash_struct *hash, hts_boolean normalized);
void hash_free(hash_struct *hash);
/* Test helper: HTS_TRUE if the two URLs dedupe together under opt's urlhack
flags. */
hts_boolean hash_url_equals(httrackp *opt, const char *adra, const char *fila,
const char *adrb, const char *filb);
int hash_read(const hash_struct * hash, const char *nom1, const char *nom2,
hash_struct_type type);
void hash_write(hash_struct * hash, size_t lpos);
Expand Down
3 changes: 3 additions & 0 deletions src/htshelp.c
Original file line number Diff line number Diff line change
Expand Up @@ -588,6 +588,9 @@ void help(const char *app, int more) {
(" %s update hacks: various hacks to limit re-transfers when updating (identical size, bogus response..)");
infomsg
(" %u url hacks: various hacks to limit duplicate URLs (strip //, www.foo.com==foo.com..)");
infomsg(" opt out of one url-hack part: --keep-www-prefix "
"(www.foo.com<>foo.com), --keep-double-slashes (//), "
"--keep-query-order (?b&a)");
infomsg
(" %A assume that a type (cgi,asp..) is always linked with a mime type (-%A php3,cgi=text/html;dat,bin=application/x-zip)");
infomsg(" shortcut: '--assume standard' is equivalent to -%A "
Expand Down
33 changes: 25 additions & 8 deletions src/htslib.c
Original file line number Diff line number Diff line change
Expand Up @@ -3610,7 +3610,10 @@ static int sortNormFnc(const void *a_, const void *b_) {
return strcmp(*a + 1, *b + 1);
}

HTSEXT_API char *fil_normalized(const char *source, char *dest) {
/* Path normalizer core: optionally collapse redundant '//' (DO_SLASH) and/or
sort query arguments (DO_QUERY) so equivalent URLs dedupe. */
static char *fil_normalized_ex(const char *source, char *dest, int do_slash,
int do_query) {
char lastc = 0;
int gotquery = 0;
int ampargs = 0;
Expand All @@ -3620,8 +3623,8 @@ HTSEXT_API char *fil_normalized(const char *source, char *dest) {
for(i = j = 0; source[i] != '\0'; i++) {
if (!gotquery && source[i] == '?')
gotquery = ampargs = 1;
if ((!gotquery && lastc == '/' && source[i] == '/') // foo//bar -> foo/bar
) {
if (do_slash && !gotquery && lastc == '/' && source[i] == '/') {
// foo//bar -> foo/bar
} else {
if (gotquery && source[i] == '&') {
ampargs++;
Expand All @@ -3633,7 +3636,7 @@ HTSEXT_API char *fil_normalized(const char *source, char *dest) {
dest[j++] = '\0';

/* Sort arguments (&foo=1&bar=2 == &bar=2&foo=1) */
if (ampargs > 1) {
if (do_query && ampargs > 1) {
char **amps = malloct(ampargs * sizeof(char *));
char *copyBuff = NULL;
size_t qLen = 0;
Expand Down Expand Up @@ -3681,6 +3684,10 @@ HTSEXT_API char *fil_normalized(const char *source, char *dest) {
return dest;
}

/* Public path normalizer: runs fil_normalized_ex() on SOURCE into DEST with
   both the '//' collapse and the query-argument sort enabled. Returns whatever
   fil_normalized_ex() returns. */
HTSEXT_API char *fil_normalized(const char *source, char *dest) {
  const int collapse_slashes = 1;
  const int sort_query_args = 1;

  return fil_normalized_ex(source, dest, collapse_slashes, sort_query_args);
}

/* Is query key ARG[0..keylen) in the comma-separated STRIP list? "*" = all;
case-sensitive, space-trimmed tokens. */
static int hts_query_key_stripped(const char *arg, size_t keylen,
Expand Down Expand Up @@ -3711,8 +3718,9 @@ static int hts_query_key_stripped(const char *arg, size_t keylen,
}

/* see htscore.h */
char *fil_normalized_filtered(const char *source, char *dest,
const char *strip) {
char *fil_normalized_filtered_ex(const char *source, char *dest,
const char *strip, int do_slash,
int do_query) {
const char *query;
char BIGSTK tmp[HTS_URLMAXSIZE * 2];
htsbuff cb;
Expand All @@ -3721,7 +3729,7 @@ char *fil_normalized_filtered(const char *source, char *dest,
/* No strip list, or no query: plain normalization. */
if (strip == NULL || *strip == '\0' ||
(query = strchr(source, '?')) == NULL) {
return fil_normalized(source, dest);
return fil_normalized_ex(source, dest, do_slash, do_query);
}

/* Copy the path, re-emit kept query args, let fil_normalized() sort. Walk
Expand Down Expand Up @@ -3750,7 +3758,13 @@ char *fil_normalized_filtered(const char *source, char *dest,
break;
query++;
}
return fil_normalized(tmp, dest);
return fil_normalized_ex(tmp, dest, do_slash, do_query);
}

/* see htscore.h */
char *fil_normalized_filtered(const char *source, char *dest,
                              const char *strip) {
  /* same as the _ex variant with both the // collapse and the query-argument
     sort switched on */
  const int do_slash = 1;
  const int do_query = 1;

  return fil_normalized_filtered_ex(source, dest, strip, do_slash, do_query);
}

/* see htscore.h */
Expand Down Expand Up @@ -6026,6 +6040,9 @@ HTSEXT_API httrackp *hts_create_opt(void) {
opt->verbosedisplay = HTS_VERBOSE_NONE; // no text animation
opt->sizehack = HTS_FALSE;
opt->urlhack = HTS_TRUE;
opt->no_www_dedup = HTS_FALSE;
opt->no_slash_dedup = HTS_FALSE;
opt->no_query_dedup = HTS_FALSE;
StringCopy(opt->footer, HTS_DEFAULT_FOOTER);
StringCopy(opt->strip_query, "");
opt->ftp_proxy = HTS_TRUE;
Expand Down
16 changes: 11 additions & 5 deletions src/htsname.c
Original file line number Diff line number Diff line change
Expand Up @@ -237,9 +237,13 @@ int url_savename(lien_adrfilsave *const afs,
// www-42.foo.com -> foo.com
// foo.com/bar//foobar -> foo.com/bar/foobar
if (opt->urlhack) {
// copy of adr (without protocol), used for lookups (see urlhack)
normadr = adr_normalized_sized(adr, normadr_, sizeof(normadr_));
normfil = fil_normalized_filtered(fil_complete, normfil_, strip);
// dedup-lookup key; honor the per-feature negatives like htshash.c so
// distinct URLs keep distinct savenames (else keep normadr = adr)
if (!opt->no_www_dedup)
normadr = adr_normalized_sized(adr, normadr_, sizeof(normadr_));
normfil =
fil_normalized_filtered_ex(fil_complete, normfil_, strip,
!opt->no_slash_dedup, !opt->no_query_dedup);
} else {
if (link_has_authority(adr_complete)) { // https or other protocols : in "http/" subfolder
char *pos = strchr(adr_complete, ':');
Expand All @@ -252,9 +256,11 @@ int url_savename(lien_adrfilsave *const afs,
normadr = normadr_;
}
}
// strip still applies with urlhack off (host left untouched)
// strip still applies with urlhack off (host left untouched); no // or
// query-sort here, to match the hash key (norm_slash/norm_query are 0 when
// urlhack is off) so a URL is looked up under the key it was stored with
if (strip != NULL)
normfil = fil_normalized_filtered(fil_complete, normfil_, strip);
normfil = fil_normalized_filtered_ex(fil_complete, normfil_, strip, 0, 0);
}

// à afficher sans ftp://
Expand Down
4 changes: 4 additions & 0 deletions src/htsopt.h
Original file line number Diff line number Diff line change
Expand Up @@ -531,6 +531,10 @@ struct httrackp {
htsoptstate state; /**< embedded live engine state */
String strip_query; /**< query keys to drop when deduping URLs (-strip-query);
appended at the tail to keep field offsets stable */
hts_boolean
no_www_dedup; /**< with urlhack, keep www.host distinct from host */
hts_boolean no_slash_dedup; /**< with urlhack, keep redundant // in paths */
hts_boolean no_query_dedup; /**< with urlhack, keep query-argument order */
};

/* Running statistics for a mirror. */
Expand Down
26 changes: 19 additions & 7 deletions src/htsparse.c
Original file line number Diff line number Diff line change
Expand Up @@ -3602,16 +3602,28 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
ident_url_relatif(mov_url, urladr(), urlfil(), moved)) >= 0) {
int set_prio_to = 0; // pas de priotité fixéd par wizard

// check whether URLHack is harmless or not
if (opt->urlhack) {
// check whether URLHack is harmless or not (per the effective
// sub-flags)
if (opt->urlhack && (!opt->no_www_dedup || !opt->no_slash_dedup ||
!opt->no_query_dedup)) {
const int norm_host = !opt->no_www_dedup;
const int norm_slash = !opt->no_slash_dedup;
const int norm_query = !opt->no_query_dedup;
char BIGSTK n_adr[HTS_URLMAXSIZE * 2], n_fil[HTS_URLMAXSIZE * 2];
char BIGSTK pn_adr[HTS_URLMAXSIZE * 2], pn_fil[HTS_URLMAXSIZE * 2];

n_adr[0] = n_fil[0] = '\0';
(void) adr_normalized_sized(moved->adr, n_adr, sizeof(n_adr));
(void) fil_normalized(moved->fil, n_fil);
(void) adr_normalized_sized(urladr(), pn_adr, sizeof(pn_adr));
(void) fil_normalized(urlfil(), pn_fil);
strlcpybuff(n_adr,
norm_host ? jump_normalized_const(moved->adr)
: jump_identification_const(moved->adr),
sizeof(n_adr));
strlcpybuff(pn_adr,
norm_host ? jump_normalized_const(urladr())
: jump_identification_const(urladr()),
sizeof(pn_adr));
fil_normalized_filtered_ex(moved->fil, n_fil, NULL, norm_slash,
norm_query);
fil_normalized_filtered_ex(urlfil(), pn_fil, NULL, norm_slash,
norm_query);
if (strcasecmp(n_adr, pn_adr) == 0
&& strcasecmp(n_fil, pn_fil) == 0) {
hts_log_print(opt, LOG_WARNING,
Expand Down
Loading
Loading