From 0a40f1e4f4948f8671def3f2c42fc42bb6c36249 Mon Sep 17 00:00:00 2001
From: dracpet <dracpet27@gmail.com>
Date: Thu, 21 May 2026 23:38:55 +0200
Subject: [PATCH] fix: resolve relative URLs against parent dir when
 base_fork_url is an HTML file
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When crawling a site whose pages link to index.html, the Fork instance
gets a base_fork_url ending in .html. The reset_url function then
appends /field_value/ and calls urljoin(..., '.'), which treats
index.html/ as a directory. Every relative link gets index.html/
injected into its path (e.g. .../en/index.html/about_dolphindb.html),
causing a cascade of 404 errors.

Fix: in reset_url, resolve relative links against the parent directory
when base_fork_url ends in .html/.htm. In get_child_link_list, use
crawl_prefix (the parent directory for HTML files) in both the href
pattern and the link filter so correctly resolved URLs aren't filtered
out.

Verified: scraped docs.dolphindb.com/en/ with selector 'main' → 800+
documents, 0 broken URLs.
---
 apps/common/utils/fork.py | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/apps/common/utils/fork.py b/apps/common/utils/fork.py
index 8964dc25759..b4c47ab1114 100644
--- a/apps/common/utils/fork.py
+++ b/apps/common/utils/fork.py
@@ -92,11 +92,15 @@ def __init__(self, base_fork_url: str, selector_list: List[str]):
                                     fragment='').geturl()
 
     def get_child_link_list(self, bf: BeautifulSoup):
-        pattern = "^((?!(http:|https:|tel:/|#|mailto:|javascript:))|" + self.base_fork_url + "|/).*"
+        # Compute the crawl prefix: parent directory when base_fork_url is an HTML file
+        crawl_prefix = self.base_fork_url
+        if crawl_prefix.endswith(('.html', '.htm')):
+            crawl_prefix = crawl_prefix.rsplit('/', 1)[0]
+        pattern = "^((?!(http:|https:|tel:/|#|mailto:|javascript:))|" + crawl_prefix + "|/).*"
         link_list = bf.find_all(name='a', href=re.compile(pattern))
         result = [ChildLink(link.get('href'), link) if link.get('href').startswith(self.base_url) else ChildLink(
             self.base_url + link.get('href'), link) for link in link_list]
-        result = [row for row in result if row.url.startswith(self.base_fork_url)]
+        result = [row for row in result if row.url.startswith(crawl_prefix)]
         return result
 
     def get_content_html(self, bf: BeautifulSoup):
@@ -118,9 +122,14 @@ def reset_url(tag, field, base_fork_url):
             result_url = ParseResult(scheme=result.scheme, netloc=result.netloc, path=field_value, params='', query='',
                                      fragment='').geturl()
         else:
-            result_url = urljoin(
-                base_fork_url + '/' + (field_value if field_value.endswith('/') else field_value + '/'),
-                ".")
+            # When base_fork_url is an HTML file (not a directory), resolve relative
+            # links against its parent directory to avoid broken paths like
+            # /en/index.html/about_dolphindb.html
+            if base_fork_url.endswith(('.html', '.htm')):
+                base = base_fork_url.rsplit('/', 1)[0] + '/'
+            else:
+                base = base_fork_url + '/'
+            result_url = urljoin(base, field_value)
         result_url = result_url[:-1] if result_url.endswith('/') else result_url
         tag[field] = result_url