-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathfind-duplicate-files.py
More file actions
executable file
·78 lines (60 loc) · 2.19 KB
/
find-duplicate-files.py
File metadata and controls
executable file
·78 lines (60 loc) · 2.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#!/usr/bin/env python3
"""Given a file, find all duplicate files under a given directory.
Usage:
find-duplicate-files.py <FILE_PATH> [<SEARCH_DIR>] # recursively print duplicate file paths
# default search dir: current working directory
Examples:
find-duplicate-files.py photo.jpg # Search for duplicates of photo.jpg in the CWD
find-duplicate-files.py movie.mkv /mnt/archive # Search for movie.mkv duplicates in /mnt/archive
author: andreasl
"""
import argparse
import hashlib
import sys
from pathlib import Path
from typing import Generator
def hash_file(file_path: Path, algo: str = "sha256", *, chunk_size: int = 8192) -> str:
    """Compute hash of the file at given path.

    The file is read in binary mode and fed to the hasher chunk by chunk, so
    files larger than the available memory can be hashed.

    Args:
        file_path: Path of the file to hash.
        algo: Name of a hash algorithm accepted by `hashlib.new`.
        chunk_size: Number of bytes to read per iteration; must be positive.

    Returns:
        The hexadecimal digest of the file's content.

    Raises:
        ValueError: If `chunk_size` is not positive or `algo` is not supported.
        OSError: If the file cannot be opened or read.
    """
    # read(0) returns b"" immediately (the loop would hash nothing) and a
    # negative size reads the whole file at once; reject both explicitly.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    hasher = hashlib.new(algo)
    with file_path.open("rb") as f:
        while chunk := f.read(chunk_size):
            hasher.update(chunk)
    return hasher.hexdigest()
def find_duplicates(file_path: Path, search_dir: Path) -> Generator[Path, None, None]:
    """Recursively find all duplicates of the file at given path in `search_dir`.

    A candidate counts as a duplicate when it is a regular file of the same
    size whose content hash equals that of `file_path`. The reference file
    itself is never reported, no matter how its path is spelled (relative,
    absolute, containing `..`); symlinks that resolve to it are skipped too.

    Args:
        file_path: The reference file whose duplicates are searched.
        search_dir: Directory that is walked recursively.

    Yields:
        The path of each duplicate, as produced by `search_dir.rglob`.

    Raises:
        OSError: If `file_path` or a candidate cannot be stat'ed or read.
    """
    file_size = file_path.stat().st_size
    # Compare canonical paths: a lexical comparison would miss that e.g. the
    # relative `photo.jpg` and the absolute path yielded while walking the CWD
    # are the same file, and would report the file as its own duplicate.
    resolved_file_path = file_path.resolve()
    # Hashed lazily: pointless to read a large reference file when no candidate
    # of the same size exists.
    file_hash = None
    for candidate_path in search_dir.rglob("*"):
        if not candidate_path.is_file():
            continue
        # Cheap size check first; only same-size files can be duplicates.
        if candidate_path.stat().st_size != file_size:
            continue
        if candidate_path.resolve() == resolved_file_path:
            continue
        if file_hash is None:
            file_hash = hash_file(file_path)
        if hash_file(candidate_path) == file_hash:
            yield candidate_path
def main() -> int:
    """Program main entry point.

    Parses the command line, prints the path of every duplicate of the given
    file found below the search directory (one per line), or a notice when
    there is none.

    Returns:
        0 if at least one duplicate was found, 1 otherwise. Invalid arguments
        terminate the program via argparse's usage error (exit status 2).
    """
    parser = argparse.ArgumentParser(
        description="Find all duplicates of a given file in a given directory."
    )
    parser.add_argument(
        "file",
        type=Path,
        help="Path to the file",
    )
    parser.add_argument(
        "search_dir",
        type=Path,
        nargs="?",  # ?: a single argument, 0 or 1
        default=Path.cwd(),
        help="Directory to search for duplicates",
    )
    args = parser.parse_args()

    # Validate up front: a missing file would otherwise surface as a raw
    # traceback, and a missing search directory would just produce no matches
    # and be misreported as "No duplicates found.".
    if not args.file.is_file():
        parser.error(f"not a file: {args.file}")
    if not args.search_dir.is_dir():
        parser.error(f"not a directory: {args.search_dir}")

    found_any = False
    for duplicate in find_duplicates(args.file, args.search_dir):
        print(duplicate)
        found_any = True

    if not found_any:
        print("No duplicates found.")
        return 1
    return 0
# Run only when executed as a script; main()'s return value becomes the
# process exit status.
if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)