From 0a40f1e4f4948f8671def3f2c42fc42bb6c36249 Mon Sep 17 00:00:00 2001 From: dracpet Date: Thu, 21 May 2026 23:38:55 +0200 Subject: [PATCH] fix: resolve relative URLs against parent dir when base_fork_url is an HTML file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When crawling a site where pages link to index.html, the Fork instance gets base_fork_url ending in .html. The reset_url function then appends /field_value/ and calls urljoin(..., '.'), which treats index.html/ as a directory. Every relative link gets index.html/ injected into its path (e.g. .../en/index.html/about_dolphindb.html), causing a cascade of 404 errors. Fix: in reset_url, resolve relative links against the parent directory when base_fork_url ends in .html/.htm. In get_child_link_list, use crawl_prefix (parent dir for HTML files) for the link filter so correctly-resolved URLs aren't filtered out. Verified: scraped docs.dolphindb.com/en/ with selector 'main' → 800+ documents, 0 broken URLs. --- apps/common/utils/fork.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/apps/common/utils/fork.py b/apps/common/utils/fork.py index 8964dc25759..b4c47ab1114 100644 --- a/apps/common/utils/fork.py +++ b/apps/common/utils/fork.py @@ -92,11 +92,15 @@ def __init__(self, base_fork_url: str, selector_list: List[str]): fragment='').geturl() def get_child_link_list(self, bf: BeautifulSoup): - pattern = "^((?!(http:|https:|tel:/|#|mailto:|javascript:))|" + self.base_fork_url + "|/).*" + # Compute the crawl prefix: parent directory when base_fork_url is an HTML file + crawl_prefix = self.base_fork_url + if crawl_prefix.endswith(('.html', '.htm')): + crawl_prefix = crawl_prefix.rsplit('/', 1)[0] + pattern = "^((?!(http:|https:|tel:/|#|mailto:|javascript:))|" + crawl_prefix + "|/).*" link_list = bf.find_all(name='a', href=re.compile(pattern)) result = [ChildLink(link.get('href'), link) if link.get('href').startswith(self.base_url) else ChildLink( self.base_url + link.get('href'), 
link) for link in link_list] - result = [row for row in result if row.url.startswith(self.base_fork_url)] + result = [row for row in result if row.url.startswith(crawl_prefix)] return result def get_content_html(self, bf: BeautifulSoup): @@ -118,9 +122,14 @@ def reset_url(tag, field, base_fork_url): result_url = ParseResult(scheme=result.scheme, netloc=result.netloc, path=field_value, params='', query='', fragment='').geturl() else: - result_url = urljoin( - base_fork_url + '/' + (field_value if field_value.endswith('/') else field_value + '/'), - ".") + # When base_fork_url is an HTML file (not a directory), resolve relative + # links against its parent directory to avoid broken paths like + # /en/index.html/about_dolphindb.html + if base_fork_url.endswith(('.html', '.htm')): + base = base_fork_url.rsplit('/', 1)[0] + '/' + else: + base = base_fork_url + '/' + result_url = urljoin(base, field_value) result_url = result_url[:-1] if result_url.endswith('/') else result_url tag[field] = result_url