-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathrun.py
More file actions
162 lines (146 loc) · 7.05 KB
/
run.py
File metadata and controls
162 lines (146 loc) · 7.05 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
#!/usr/bin/env python3
"""
run.py — Full literature discovery pipeline in one command.
Runs all three steps in sequence:
1. literature_explorer — resolve corpus, build co-citation graph, score candidates
2. enrich_abstracts — fill missing abstracts (SS batch → OpenAlex → CrossRef)
3. visualize — interactive co-citation network + timeline + field chart
(search is interactive — use cache_search.py separately)
Usage:
python run.py --bib references.bib
python run.py --bib references.bib --keywords "open science" "data sharing"
python run.py --bib references.bib --skip-enrich
python run.py --bib references.bib --skip-viz
python run.py --bib references.bib --top 80 --out results
"""
import argparse
import sys

from pathlib import Path
from types import SimpleNamespace
def _build_parser():
    """Build and return the CLI argument parser for the pipeline.

    The module docstring is reused as the epilog so ``--help`` shows the
    usage examples verbatim.
    """
    parser = argparse.ArgumentParser(
        description="Full literature discovery pipeline",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    # ── Input ──
    parser.add_argument("--bib", help="Path to .bib file")
    parser.add_argument("--pdf-dir", help="Path to folder of PDFs (alternative to --bib)")
    parser.add_argument("--cache", default=None,
    help="Cache file path (default: derived from bib name)")
    # ── Explorer ──
    parser.add_argument("--keywords", nargs="+", metavar="KW",
    help="Focus keywords to re-weight scoring "
    "(e.g. --keywords 'open data' 'data sharing')")
    parser.add_argument("--preset",
    choices=["balanced","highly-cited","recent","interdisciplinary"],
    default="balanced",
    help="Scoring weight preset (default: balanced)")
    parser.add_argument("--cluster", type=int, default=None, metavar="N",
    help="Cluster candidates into N topics (0=auto-detect k)")
    parser.add_argument("--out", default="outputs",
    help="Root output directory (default: outputs/)")
    # ── Enrich ──
    parser.add_argument("--skip-enrich", action="store_true",
    help="Skip abstract enrichment step")
    parser.add_argument("--enrich-limit", type=int, default=0,
    help="Cap OA/CR API calls during enrichment (0 = no limit)")
    # ── Visualize ──
    parser.add_argument("--skip-viz", action="store_true",
    help="Skip visualization step")
    parser.add_argument("--top", type=int, default=50,
    help="Top N candidates to show in network (default: 50)")
    return parser
def _derive_cache_path(args):
    """Return the cache file path shared by all pipeline steps.

    Priority: an explicit ``--cache`` wins; otherwise the cache sits next
    to the bib file as ``<stem>_cache.json``; with ``--pdf-dir`` it sits
    next to the PDF folder, named after the folder.
    """
    if args.cache:
        return Path(args.cache)
    if args.bib:
        return Path(args.bib).parent / f"{Path(args.bib).stem}_cache.json"
    # Strip any trailing slash so .stem yields the folder name, not "".
    proj = Path(args.pdf_dir.rstrip("/\\")).stem
    return Path(args.pdf_dir).parent / f"{proj}_cache.json"
def main():
    """Run the full discovery pipeline: explore → enrich → visualize.

    Steps 2 and 3 are optional (``--skip-enrich`` / ``--skip-viz``).
    The heavy ``core.*`` modules are imported lazily, just before each
    step, so ``--help`` and argument errors stay fast.
    """
    parser = _build_parser()
    args = parser.parse_args()
    if not args.bib and not args.pdf_dir:
        parser.error("Provide --bib or --pdf-dir")
    # ── Derive cache path (shared across all steps) ──
    cache_path = _derive_cache_path(args)
    print("\n" + "═" * 60)
    print(" 🔭 Literature Discovery Pipeline")
    print("═" * 60)
    print(f" Bib : {args.bib or args.pdf_dir}")
    print(f" Cache : {cache_path}")
    print(f" Outputs : {args.out}/")
    print("═" * 60 + "\n")
    # ─────────────────────────────────────────────────────────
    # Step 1: Explore
    # ─────────────────────────────────────────────────────────
    print("\n" + "─" * 60)
    print(" Step 1/3 — Resolving corpus & building co-citation graph")
    print("─" * 60)
    from core.explorer import run as run_explorer
    # SimpleNamespace mimics an argparse namespace so each step's run()
    # keeps its own CLI-compatible interface.
    explorer_args = SimpleNamespace(
        bib=args.bib,
        pdf_dir=args.pdf_dir,
        out=args.out,
        cache=str(cache_path),
        keywords=args.keywords,
        preset=args.preset,
        cluster=args.cluster,
    )
    result = run_explorer(explorer_args)
    # ─────────────────────────────────────────────────────────
    # Step 2: Enrich abstracts
    # ─────────────────────────────────────────────────────────
    if not args.skip_enrich:
        print("\n" + "─" * 60)
        print(" Step 2/3 — Enriching abstracts")
        print("─" * 60)
        from core.enrich import run as run_enrich
        enrich_args = SimpleNamespace(
            cache=str(cache_path),
            limit=args.enrich_limit,
            dry_run=False,
            sources=["ss", "oa", "cr"],  # SS batch → OpenAlex → CrossRef order
        )
        run_enrich(enrich_args)
    else:
        print("\n ⏭ Skipping abstract enrichment (--skip-enrich)")
    # ─────────────────────────────────────────────────────────
    # Step 3: Visualize
    # ─────────────────────────────────────────────────────────
    if not args.skip_viz:
        print("\n" + "─" * 60)
        print(" Step 3/3 — Generating visualizations")
        print("─" * 60)
        from core.viz import run as run_viz
        viz_args = SimpleNamespace(
            cache=str(cache_path),
            top=args.top,
            out=args.out,
            no_network=False,
            no_timeline=False,
            no_fields=False,
            no_table=False,
        )
        out_paths = run_viz(viz_args)
    else:
        print("\n ⏭ Skipping visualizations (--skip-viz)")
        out_paths = {}  # keeps the summary loop below a no-op
    # ─────────────────────────────────────────────────────────
    # Summary
    # ─────────────────────────────────────────────────────────
    proj_name = result["proj_name"]
    out_dir = Path(args.out) / proj_name
    print("\n" + "═" * 60)
    print(" ✅ Pipeline complete!")
    print("═" * 60)
    print(f" Project : {proj_name}")
    print(f" Cache : {cache_path}")
    print(f" Outputs : {out_dir}/")
    print(f" Core : {result['core']} papers")
    print(f" Periphery : {result['periphery']} papers")
    if out_paths:
        for key, path in out_paths.items():
            print(f" {key.capitalize():<10}: {path}")
    print()
    print(" Next: search your cache interactively:")
    print(f' python cache_search.py --cache {cache_path} "your keywords"')
    print("═" * 60 + "\n")
if __name__ == "__main__":
    main()