From cef74c347a2a00af61fea2d2687ba2572a7fc007 Mon Sep 17 00:00:00 2001 From: Matt Van Horn Date: Wed, 13 May 2026 12:49:32 -0700 Subject: [PATCH] fix: preserve full path in repo cache key for GitLab subgroup URLs The previous _extract_repo_name_from_url collapsed https://gitlab.com/group/subgroup/repo down to a key of subgroup_repo, which collides when two different parent groups each contain a subgroup/repo pair with the same last two path segments. The clone URL was passed through intact, but the cache lookup at ~/.adalflow/databases/{key}.pkl could return the wrong project. Switch to urlparse and join every path segment with underscores, so https://gitlab.com/group/subgroup/repo becomes group_subgroup_repo. Plain 2-segment URLs (github.com/owner/repo, bitbucket.org/owner/repo, gitlab.com/owner/repo) keep the existing owner_repo shape. The .git suffix on the final segment is still stripped. Update the existing GitLab subgroup test case to reflect the new shape. Relates to #438 and #439. The reporter's specific failure mode may be downstream of this (the clone itself, not the cache key); this commit fixes the cache-key half. The remaining downstream investigation is noted in the PR body. 
--- api/data_pipeline.py | 34 ++++++++++++++++++++++++---------- test/test_extract_repo_name.py | 2 +- 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/api/data_pipeline.py b/api/data_pipeline.py index f98068651..b072e2c70 100644 --- a/api/data_pipeline.py +++ b/api/data_pipeline.py @@ -769,17 +769,31 @@ def reset_database(self): def _extract_repo_name_from_url(self, repo_url_or_path: str, repo_type: str) -> str: # Extract owner and repo name to create unique identifier - url_parts = repo_url_or_path.rstrip('/').split('/') - - if repo_type in ["github", "gitlab", "bitbucket"] and len(url_parts) >= 5: - # GitHub URL format: https://github.com/owner/repo - # GitLab URL format: https://gitlab.com/owner/repo or https://gitlab.com/group/subgroup/repo - # Bitbucket URL format: https://bitbucket.org/owner/repo - owner = url_parts[-2] - repo = url_parts[-1].replace(".git", "") - repo_name = f"{owner}_{repo}" + repo_url_or_path = repo_url_or_path.strip().rstrip('/') + + def strip_git_suffix(name: str) -> str: + return name[:-4] if name.endswith(".git") else name + + if repo_type in ["github", "gitlab", "bitbucket"]: + parsed_url = urlparse(repo_url_or_path) + if parsed_url.scheme and parsed_url.netloc: + path_parts = [part for part in parsed_url.path.strip('/').split('/') if part] + else: + path = repo_url_or_path.split('?', 1)[0].split('#', 1)[0].strip('/') + path_parts = [part for part in path.split('/') if part] + if len(path_parts) >= 3: + host = path_parts[0].lower() + if "." 
in host or host == "localhost": + path_parts = path_parts[1:] + + if len(path_parts) >= 2: + path_parts[-1] = strip_git_suffix(path_parts[-1]) + repo_name = "_".join(path_parts) + else: + repo_name = strip_git_suffix(path_parts[-1]) if path_parts else "" else: - repo_name = url_parts[-1].replace(".git", "") + url_parts = repo_url_or_path.split('/') + repo_name = strip_git_suffix(url_parts[-1]) return repo_name def _create_repo(self, repo_url_or_path: str, repo_type: str = None, access_token: str = None) -> None: diff --git a/test/test_extract_repo_name.py b/test/test_extract_repo_name.py index 65a15da09..ddaff4a62 100644 --- a/test/test_extract_repo_name.py +++ b/test/test_extract_repo_name.py @@ -55,7 +55,7 @@ def test_extract_repo_name_gitlab_urls(self): # Test GitLab URL with subgroups gitlab_subgroup = "https://gitlab.com/group/subgroup/repo" result = self.db_manager._extract_repo_name_from_url(gitlab_subgroup, "gitlab") - assert result == "subgroup_repo" + assert result == "group_subgroup_repo" print("✓ GitLab URL tests passed")