From 34a1a4b0318db62149c9934665cc65f73b299137 Mon Sep 17 00:00:00 2001 From: Mateusz Sterczewski Date: Wed, 7 Jan 2026 13:11:45 +0100 Subject: [PATCH 1/5] CM-55207-Fix Secrets documents collecting for diff scan --- cycode/cli/apps/scan/commit_range_scanner.py | 4 +- .../files_collector/commit_range_documents.py | 51 ++++++++++++++--- .../test_commit_range_documents.py | 55 +++++++++++++++++++ 3 files changed, 100 insertions(+), 10 deletions(-) diff --git a/cycode/cli/apps/scan/commit_range_scanner.py b/cycode/cli/apps/scan/commit_range_scanner.py index eb0296c8..85497d5f 100644 --- a/cycode/cli/apps/scan/commit_range_scanner.py +++ b/cycode/cli/apps/scan/commit_range_scanner.py @@ -186,7 +186,7 @@ def _scan_commit_range_documents( def _scan_sca_commit_range(ctx: typer.Context, repo_path: str, commit_range: str, **_) -> None: scan_parameters = get_scan_parameters(ctx, (repo_path,)) - from_commit_rev, to_commit_rev = parse_commit_range(commit_range, repo_path) + from_commit_rev, to_commit_rev, _ = parse_commit_range(commit_range, repo_path) from_commit_documents, to_commit_documents, _ = get_commit_range_modified_documents( ctx.obj['progress_bar'], ScanProgressBarSection.PREPARE_LOCAL_FILES, repo_path, from_commit_rev, to_commit_rev ) @@ -227,7 +227,7 @@ def _scan_secret_commit_range( def _scan_sast_commit_range(ctx: typer.Context, repo_path: str, commit_range: str, **_) -> None: scan_parameters = get_scan_parameters(ctx, (repo_path,)) - from_commit_rev, to_commit_rev = parse_commit_range(commit_range, repo_path) + from_commit_rev, to_commit_rev, _ = parse_commit_range(commit_range, repo_path) _, commit_documents, diff_documents = get_commit_range_modified_documents( ctx.obj['progress_bar'], ScanProgressBarSection.PREPARE_LOCAL_FILES, diff --git a/cycode/cli/files_collector/commit_range_documents.py b/cycode/cli/files_collector/commit_range_documents.py index 2ca54dd5..5884b6a6 100644 --- a/cycode/cli/files_collector/commit_range_documents.py +++ b/cycode/cli/files_collector/commit_range_documents.py @@ -67,12 +67,15 @@ def collect_commit_range_diff_documents( commit_documents_to_scan = [] repo = git_proxy.get_repo(path) - total_commits_count = int(repo.git.rev_list('--count', commit_range)) - logger.debug('Calculating diffs for %s commits in the commit range %s', total_commits_count, commit_range) + + normalized_commit_range = normalize_commit_range(commit_range, path) + + total_commits_count = int(repo.git.rev_list('--count', normalized_commit_range)) + logger.debug('Calculating diffs for %s commits in the commit range %s', total_commits_count, normalized_commit_range) progress_bar.set_section_length(ScanProgressBarSection.PREPARE_LOCAL_FILES, total_commits_count) - for scanned_commits_count, commit in enumerate(repo.iter_commits(rev=commit_range)): + for scanned_commits_count, commit in enumerate(repo.iter_commits(rev=normalized_commit_range)): if _does_reach_to_max_commits_to_scan_limit(commit_ids_to_scan, max_commits_count): logger.debug('Reached to max commits to scan count. Going to scan only %s last commits', max_commits_count) progress_bar.update(ScanProgressBarSection.PREPARE_LOCAL_FILES, total_commits_count - scanned_commits_count) @@ -96,7 +99,12 @@ def collect_commit_range_diff_documents( logger.debug( 'Found all relevant files in commit %s', - {'path': path, 'commit_range': commit_range, 'commit_id': commit_id}, + { + 'path': path, + 'commit_range': commit_range, + 'normalized_commit_range': normalized_commit_range, + 'commit_id': commit_id, + }, ) logger.debug('List of commit ids to scan, %s', {'commit_ids': commit_ids_to_scan}) @@ -428,8 +436,8 @@ def get_pre_commit_modified_documents( return git_head_documents, pre_committed_documents, diff_documents -def parse_commit_range(commit_range: str, path: str) -> tuple[Optional[str], Optional[str]]: - """Parses a git commit range string and returns the full SHAs for the 'from' and 'to' commits. +def parse_commit_range(commit_range: str, path: str) -> tuple[Optional[str], Optional[str], Optional[str]]: + """Parses a git commit range string and returns the full SHAs for the 'from' and 'to' commits as well as the separator. Supports: - 'from..to' @@ -442,8 +450,10 @@ def parse_commit_range(commit_range: str, path: str) -> tuple[Optional[str], Opt if '...' in commit_range: from_spec, to_spec = commit_range.split('...', 1) + separator = '...' elif '..' in commit_range: from_spec, to_spec = commit_range.split('..', 1) + separator = '..' else: # Git commands like 'git diff ' compare against HEAD. from_spec = commit_range @@ -459,7 +469,32 @@ def parse_commit_range(commit_range: str, path: str) -> tuple[Optional[str], Opt # Use rev_parse to resolve each specifier to its full commit SHA from_commit_rev = repo.rev_parse(from_spec).hexsha to_commit_rev = repo.rev_parse(to_spec).hexsha - return from_commit_rev, to_commit_rev + return from_commit_rev, to_commit_rev, separator except git_proxy.get_git_command_error() as e: logger.warning("Failed to parse commit range '%s'", commit_range, exc_info=e) - return None, None + return None, None, None + +def normalize_commit_range(commit_range: str, path: str) -> str: + """Normalize a commit range string to handle various formats consistently with all scan types. + + Returns: + A normalized commit range string suitable for Git operations (e.g., 'full_sha1..full_sha2') + """ + from_commit_rev, to_commit_rev, separator = parse_commit_range(commit_range, path) + if from_commit_rev is None or to_commit_rev is None: + logger.warning( + 'Failed to parse commit range "%s", falling back to raw string. This may cause unexpected behavior.', + commit_range + ) + # Fall back to using the raw commit_range string + return commit_range + + # Construct a normalized range string using the original separator for iter_commits + # This preserves the semantics of two-dot vs three-dot syntax + normalized_commit_range = f'{from_commit_rev}{separator}{to_commit_rev}' + logger.debug( + 'Normalized commit range "%s" to "%s"', + commit_range, + normalized_commit_range, + ) + return normalized_commit_range \ No newline at end of file diff --git a/tests/cli/files_collector/test_commit_range_documents.py b/tests/cli/files_collector/test_commit_range_documents.py index d27d24a3..8a215bbc 100644 --- a/tests/cli/files_collector/test_commit_range_documents.py +++ b/tests/cli/files_collector/test_commit_range_documents.py @@ -13,6 +13,7 @@ _get_default_branches_for_merge_base, calculate_pre_push_commit_range, calculate_pre_receive_commit_range, + collect_commit_range_diff_documents, get_diff_file_path, get_safe_head_reference_for_diff, parse_commit_range, @@ -1047,3 +1048,57 @@ def test_initial_oldest_commit_without_parent_with_two_commits_returns_single_co work_repo.close() finally: server_repo.close() + + +class TestCollectCommitRangeDiffDocuments: + """Test the collect_commit_range_diff_documents function with various commit range formats.""" + + def test_collect_with_various_commit_range_formats(self) -> None: + """Test that different commit range formats are normalized and work correctly.""" + with temporary_git_repository() as (temp_dir, repo): + # Create three commits + a_file = os.path.join(temp_dir, 'a.txt') + with open(a_file, 'w') as f: + f.write('A') + repo.index.add(['a.txt']) + a_commit = repo.index.commit('A') + + with open(a_file, 'a') as f: + f.write('B') + repo.index.add(['a.txt']) + b_commit = repo.index.commit('B') + + with open(a_file, 'a') as f: + f.write('C') + repo.index.add(['a.txt']) + c_commit = repo.index.commit('C') + + # Create mock context + mock_ctx = Mock() + mock_progress_bar = Mock() + mock_progress_bar.set_section_length = Mock() + mock_progress_bar.update = Mock() + mock_ctx.obj = {'progress_bar': mock_progress_bar} + + # Test two-dot range - should collect documents from commits B and C (2 commits, 2 documents) + commit_range = f'{a_commit.hexsha}..{c_commit.hexsha}' + documents = collect_commit_range_diff_documents(mock_ctx, temp_dir, commit_range) + assert len(documents) == 2, f'Expected 2 documents from range A..C, got {len(documents)}' + commit_ids_in_documents = {doc.unique_id for doc in documents if doc.unique_id} + assert b_commit.hexsha in commit_ids_in_documents and c_commit.hexsha in commit_ids_in_documents + + # Test three-dot range - should collect documents from commits B and C (2 commits, 2 documents) + commit_range = f'{a_commit.hexsha}...{c_commit.hexsha}' + documents = collect_commit_range_diff_documents(mock_ctx, temp_dir, commit_range) + assert len(documents) == 2, f'Expected 2 documents from range A...C, got {len(documents)}' + + # Test parent notation with three-dot - should collect document from commit C (1 commit, 1 document) + commit_range = f'{c_commit.hexsha}~1...{c_commit.hexsha}' + documents = collect_commit_range_diff_documents(mock_ctx, temp_dir, commit_range) + assert len(documents) == 1, f'Expected 1 document from range C~1...C, got {len(documents)}' + assert documents[0].unique_id == c_commit.hexsha + + # Test single commit spec - should be interpreted as A..HEAD (commits B and C, 2 documents) + commit_range = a_commit.hexsha + documents = collect_commit_range_diff_documents(mock_ctx, temp_dir, commit_range) + assert len(documents) == 2, f'Expected 2 documents from single commit A (interpreted as A..HEAD), got {len(documents)}' From 7da3f983eec554040291eadd4ee74932ba843246 Mon Sep 17 00:00:00 2001 From: Mateusz Sterczewski Date: Wed, 7 Jan 2026 13:24:56 +0100 Subject: [PATCH 2/5] CM-55207-Fix tests and lint --- .../files_collector/commit_range_documents.py | 24 ++++++++++-------- .../test_commit_range_documents.py | 25 ++++++++++--------- 2 files changed, 27 insertions(+), 22 deletions(-) diff --git a/cycode/cli/files_collector/commit_range_documents.py b/cycode/cli/files_collector/commit_range_documents.py index 5884b6a6..9735ca71 100644 --- a/cycode/cli/files_collector/commit_range_documents.py +++ b/cycode/cli/files_collector/commit_range_documents.py @@ -67,11 +67,12 @@ def collect_commit_range_diff_documents( commit_documents_to_scan = [] repo = git_proxy.get_repo(path) - + normalized_commit_range = normalize_commit_range(commit_range, path) - + total_commits_count = int(repo.git.rev_list('--count', normalized_commit_range)) - logger.debug('Calculating diffs for %s commits in the commit range %s', total_commits_count, normalized_commit_range) + logger.debug('Calculating diffs for %s commits in the commit range %s', + total_commits_count, normalized_commit_range) progress_bar.set_section_length(ScanProgressBarSection.PREPARE_LOCAL_FILES, total_commits_count) @@ -100,9 +101,9 @@ def collect_commit_range_diff_documents( logger.debug( 'Found all relevant files in commit %s', { - 'path': path, - 'commit_range': commit_range, - 'normalized_commit_range': normalized_commit_range, + 'path': path, + 'commit_range': commit_range, + 'normalized_commit_range': normalized_commit_range, 'commit_id': commit_id, }, ) @@ -437,7 +438,8 @@ def get_pre_commit_modified_documents( def parse_commit_range(commit_range: str, path: str) -> tuple[Optional[str], Optional[str], Optional[str]]: - """Parses a git commit range string and returns the full SHAs for the 'from' and 'to' commits as well as the separator. + """Parses a git commit range string and returns the full SHAs for the 'from' and 'to' commits. + Also, it returns the separator in the commit range. Supports: - 'from..to' @@ -462,8 +464,10 @@ def parse_commit_range(commit_range: str, path: str) -> tuple[Optional[str], Opt # If a spec is empty (e.g., from '..master'), default it to 'HEAD' if not from_spec: from_spec = consts.GIT_HEAD_COMMIT_REV + separator = '..' if not to_spec: to_spec = consts.GIT_HEAD_COMMIT_REV + separator = '..' try: # Use rev_parse to resolve each specifier to its full commit SHA @@ -476,7 +480,7 @@ def parse_commit_range(commit_range: str, path: str) -> tuple[Optional[str], Opt def normalize_commit_range(commit_range: str, path: str) -> str: """Normalize a commit range string to handle various formats consistently with all scan types. - + Returns: A normalized commit range string suitable for Git operations (e.g., 'full_sha1..full_sha2') """ @@ -488,7 +492,7 @@ def normalize_commit_range(commit_range: str, path: str) -> str: ) # Fall back to using the raw commit_range string return commit_range - + # Construct a normalized range string using the original separator for iter_commits # This preserves the semantics of two-dot vs three-dot syntax normalized_commit_range = f'{from_commit_rev}{separator}{to_commit_rev}' @@ -497,4 +501,4 @@ def normalize_commit_range(commit_range: str, path: str) -> str: commit_range, normalized_commit_range, ) - return normalized_commit_range \ No newline at end of file + return normalized_commit_range diff --git a/tests/cli/files_collector/test_commit_range_documents.py b/tests/cli/files_collector/test_commit_range_documents.py index 8a215bbc..0779f678 100644 --- a/tests/cli/files_collector/test_commit_range_documents.py +++ b/tests/cli/files_collector/test_commit_range_documents.py @@ -847,40 +847,40 @@ def test_two_dot_linear_history(self) -> None: with temporary_git_repository() as (temp_dir, repo): a, b, c = self._make_linear_history(repo, temp_dir) - parsed_from, parsed_to = parse_commit_range(f'{a}..{c}', temp_dir) - assert (parsed_from, parsed_to) == (a, c) + parsed_from, parsed_to, separator = parse_commit_range(f'{a}..{c}', temp_dir) + assert (parsed_from, parsed_to, separator) == (a, c, '..') def test_three_dot_linear_history(self) -> None: """For 'A...C' in linear history, expect (A,C).""" with temporary_git_repository() as (temp_dir, repo): a, b, c = self._make_linear_history(repo, temp_dir) - parsed_from, parsed_to = parse_commit_range(f'{a}...{c}', temp_dir) - assert (parsed_from, parsed_to) == (a, c) + parsed_from, parsed_to, separator = parse_commit_range(f'{a}...{c}', temp_dir) + assert (parsed_from, parsed_to, separator) == (a, c, '...') def test_open_right_linear_history(self) -> None: """For 'A..', expect (A,HEAD=C).""" with temporary_git_repository() as (temp_dir, repo): a, b, c = self._make_linear_history(repo, temp_dir) - parsed_from, parsed_to = parse_commit_range(f'{a}..', temp_dir) - assert (parsed_from, parsed_to) == (a, c) + parsed_from, parsed_to, separator = parse_commit_range(f'{a}..', temp_dir) + assert (parsed_from, parsed_to, separator) == (a, c, '..') def test_open_left_linear_history(self) -> None: """For '..C' where HEAD==C, expect (HEAD=C,C).""" with temporary_git_repository() as (temp_dir, repo): a, b, c = self._make_linear_history(repo, temp_dir) - parsed_from, parsed_to = parse_commit_range(f'..{c}', temp_dir) - assert (parsed_from, parsed_to) == (c, c) + parsed_from, parsed_to, separator = parse_commit_range(f'..{c}', temp_dir) + assert (parsed_from, parsed_to, separator) == (c, c, '..') def test_single_commit_spec(self) -> None: """For 'A', expect (A,HEAD=C).""" with temporary_git_repository() as (temp_dir, repo): a, b, c = self._make_linear_history(repo, temp_dir) - parsed_from, parsed_to = parse_commit_range(a, temp_dir) - assert (parsed_from, parsed_to) == (a, c) + parsed_from, parsed_to, separator = parse_commit_range(a, temp_dir) + assert (parsed_from, parsed_to, separator) == (a, c, '..') class TestParsePreReceiveInput: @@ -1085,7 +1085,8 @@ def test_collect_with_various_commit_range_formats(self) -> None: documents = collect_commit_range_diff_documents(mock_ctx, temp_dir, commit_range) assert len(documents) == 2, f'Expected 2 documents from range A..C, got {len(documents)}' commit_ids_in_documents = {doc.unique_id for doc in documents if doc.unique_id} - assert b_commit.hexsha in commit_ids_in_documents and c_commit.hexsha in commit_ids_in_documents + assert b_commit.hexsha in commit_ids_in_documents + assert c_commit.hexsha in commit_ids_in_documents # Test three-dot range - should collect documents from commits B and C (2 commits, 2 documents) commit_range = f'{a_commit.hexsha}...{c_commit.hexsha}' @@ -1101,4 +1102,4 @@ def test_collect_with_various_commit_range_formats(self) -> None: # Test single commit spec - should be interpreted as A..HEAD (commits B and C, 2 documents) commit_range = a_commit.hexsha documents = collect_commit_range_diff_documents(mock_ctx, temp_dir, commit_range) - assert len(documents) == 2, f'Expected 2 documents from single commit A (interpreted as A..HEAD), got {len(documents)}' + assert len(documents) == 2, f'Expected 2 documents from single commit A, got {len(documents)}' From 107e69b1499c8c5a441c745f07110a8ae88fa31b Mon Sep 17 00:00:00 2001 From: Mateusz Sterczewski Date: Wed, 7 Jan 2026 13:28:15 +0100 Subject: [PATCH 3/5] CM-55207-Fix ruff --- cycode/cli/files_collector/commit_range_documents.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cycode/cli/files_collector/commit_range_documents.py b/cycode/cli/files_collector/commit_range_documents.py index 9735ca71..902a2919 100644 --- a/cycode/cli/files_collector/commit_range_documents.py +++ b/cycode/cli/files_collector/commit_range_documents.py @@ -487,14 +487,12 @@ def normalize_commit_range(commit_range: str, path: str) -> str: from_commit_rev, to_commit_rev, separator = parse_commit_range(commit_range, path) if from_commit_rev is None or to_commit_rev is None: logger.warning( - 'Failed to parse commit range "%s", falling back to raw string. This may cause unexpected behavior.', + 'Failed to parse commit range "%s", falling back to raw string.', commit_range ) - # Fall back to using the raw commit_range string return commit_range # Construct a normalized range string using the original separator for iter_commits - # This preserves the semantics of two-dot vs three-dot syntax normalized_commit_range = f'{from_commit_rev}{separator}{to_commit_rev}' logger.debug( 'Normalized commit range "%s" to "%s"', From 8bd3e661d38a7b7fb1799d0308011fa9e176131f Mon Sep 17 00:00:00 2001 From: Mateusz Sterczewski Date: Wed, 7 Jan 2026 13:34:16 +0100 Subject: [PATCH 4/5] CM-55207-Fix tests further --- cycode/cli/files_collector/commit_range_documents.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/cycode/cli/files_collector/commit_range_documents.py b/cycode/cli/files_collector/commit_range_documents.py index 902a2919..2feb6934 100644 --- a/cycode/cli/files_collector/commit_range_documents.py +++ b/cycode/cli/files_collector/commit_range_documents.py @@ -100,12 +100,7 @@ def collect_commit_range_diff_documents( logger.debug( 'Found all relevant files in commit %s', - { - 'path': path, - 'commit_range': commit_range, - 'normalized_commit_range': normalized_commit_range, - 'commit_id': commit_id, - }, + {'path': path, 'commit_range': normalized_commit_range, 'commit_id': commit_id}, ) logger.debug('List of commit ids to scan, %s', {'commit_ids': commit_ids_to_scan}) @@ -450,12 +445,12 @@ def parse_commit_range(commit_range: str, path: str) -> tuple[Optional[str], Opt """ repo = git_proxy.get_repo(path) + separator = '..' if '...' in commit_range: from_spec, to_spec = commit_range.split('...', 1) separator = '...' elif '..' in commit_range: from_spec, to_spec = commit_range.split('..', 1) - separator = '..' else: # Git commands like 'git diff ' compare against HEAD. from_spec = commit_range @@ -464,10 +459,8 @@ def parse_commit_range(commit_range: str, path: str) -> tuple[Optional[str], Opt # If a spec is empty (e.g., from '..master'), default it to 'HEAD' if not from_spec: from_spec = consts.GIT_HEAD_COMMIT_REV - separator = '..' if not to_spec: to_spec = consts.GIT_HEAD_COMMIT_REV - separator = '..' try: # Use rev_parse to resolve each specifier to its full commit SHA From bc835e03c007d2881edb139f68c8eacd4658de23 Mon Sep 17 00:00:00 2001 From: Mateusz Sterczewski Date: Wed, 7 Jan 2026 13:38:35 +0100 Subject: [PATCH 5/5] CM-55207-Ruff format --- .../files_collector/commit_range_documents.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/cycode/cli/files_collector/commit_range_documents.py b/cycode/cli/files_collector/commit_range_documents.py index 2feb6934..d92aea81 100644 --- a/cycode/cli/files_collector/commit_range_documents.py +++ b/cycode/cli/files_collector/commit_range_documents.py @@ -71,8 +71,9 @@ def collect_commit_range_diff_documents( normalized_commit_range = normalize_commit_range(commit_range, path) total_commits_count = int(repo.git.rev_list('--count', normalized_commit_range)) - logger.debug('Calculating diffs for %s commits in the commit range %s', - total_commits_count, normalized_commit_range) + logger.debug( + 'Calculating diffs for %s commits in the commit range %s', total_commits_count, normalized_commit_range + ) progress_bar.set_section_length(ScanProgressBarSection.PREPARE_LOCAL_FILES, total_commits_count) @@ -100,7 +101,12 @@ def collect_commit_range_diff_documents( logger.debug( 'Found all relevant files in commit %s', - {'path': path, 'commit_range': normalized_commit_range, 'commit_id': commit_id}, + { + 'path': path, + 'commit_range': commit_range, + 'normalized_commit_range': normalized_commit_range, + 'commit_id': commit_id, + }, ) logger.debug('List of commit ids to scan, %s', {'commit_ids': commit_ids_to_scan}) @@ -471,6 +477,7 @@ def parse_commit_range(commit_range: str, path: str) -> tuple[Optional[str], Opt logger.warning("Failed to parse commit range '%s'", commit_range, exc_info=e) return None, None, None + def normalize_commit_range(commit_range: str, path: str) -> str: """Normalize a commit range string to handle various formats consistently with all scan types. @@ -479,10 +486,7 @@ def normalize_commit_range(commit_range: str, path: str) -> str: """ from_commit_rev, to_commit_rev, separator = parse_commit_range(commit_range, path) if from_commit_rev is None or to_commit_rev is None: - logger.warning( - 'Failed to parse commit range "%s", falling back to raw string.', - commit_range - ) + logger.warning('Failed to parse commit range "%s", falling back to raw string.', commit_range) return commit_range # Construct a normalized range string using the original separator for iter_commits