Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 24 additions & 10 deletions api/data_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -769,17 +769,31 @@ def reset_database(self):

def _extract_repo_name_from_url(self, repo_url_or_path: str, repo_type: str) -> str:
# Extract owner and repo name to create unique identifier
url_parts = repo_url_or_path.rstrip('/').split('/')

if repo_type in ["github", "gitlab", "bitbucket"] and len(url_parts) >= 5:
# GitHub URL format: https://github.com/owner/repo
# GitLab URL format: https://gitlab.com/owner/repo or https://gitlab.com/group/subgroup/repo
# Bitbucket URL format: https://bitbucket.org/owner/repo
owner = url_parts[-2]
repo = url_parts[-1].replace(".git", "")
repo_name = f"{owner}_{repo}"
repo_url_or_path = repo_url_or_path.strip().rstrip('/')

def strip_git_suffix(name: str) -> str:
return name[:-4] if name.endswith(".git") else name

if repo_type in ["github", "gitlab", "bitbucket"]:
parsed_url = urlparse(repo_url_or_path)
if parsed_url.scheme and parsed_url.netloc:
path_parts = [part for part in parsed_url.path.strip('/').split('/') if part]
else:
path = repo_url_or_path.split('?', 1)[0].split('#', 1)[0].strip('/')
path_parts = [part for part in path.split('/') if part]
if len(path_parts) >= 3:
host = path_parts[0].lower()
if "." in host or host == "localhost":
path_parts = path_parts[1:]

if len(path_parts) >= 2:
path_parts[-1] = strip_git_suffix(path_parts[-1])
repo_name = "_".join(path_parts)
else:
repo_name = strip_git_suffix(path_parts[-1]) if path_parts else ""
Comment on lines +789 to +793
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The logic for handling `path_parts` can be simplified. Since `"_".join(path_parts)` on a single-element list returns that element unchanged, the two branches can be merged into one that only checks whether the list is non-empty.

Suggested change
```diff
-if len(path_parts) >= 2:
-    path_parts[-1] = strip_git_suffix(path_parts[-1])
-    repo_name = "_".join(path_parts)
-else:
-    repo_name = strip_git_suffix(path_parts[-1]) if path_parts else ""
+if path_parts:
+    path_parts[-1] = strip_git_suffix(path_parts[-1])
+    repo_name = "_".join(path_parts)
+else:
+    repo_name = ""
```

else:
repo_name = url_parts[-1].replace(".git", "")
url_parts = repo_url_or_path.split('/')
repo_name = strip_git_suffix(url_parts[-1])
return repo_name

def _create_repo(self, repo_url_or_path: str, repo_type: str = None, access_token: str = None) -> None:
Expand Down
2 changes: 1 addition & 1 deletion test/test_extract_repo_name.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def test_extract_repo_name_gitlab_urls(self):
# Test GitLab URL with subgroups
gitlab_subgroup = "https://gitlab.com/group/subgroup/repo"
result = self.db_manager._extract_repo_name_from_url(gitlab_subgroup, "gitlab")
assert result == "subgroup_repo"
assert result == "group_subgroup_repo"

print("✓ GitLab URL tests passed")

Expand Down