samantics/split.py at main · broomhead/samantics · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#!/usr/bin/env python3
"""Split a JSONL labels file into train/val[/test] splits by percentage."""
from __future__ import annotations

import argparse
import json
import random
from pathlib import Path
from typing import Iterable


def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options for the split script.

    Args:
        argv: Argument strings to parse instead of the process command line.
            When ``None`` (the default) argparse falls back to
            ``sys.argv[1:]``, so existing ``parse_args()`` calls behave
            exactly as before. Passing a list makes the parser usable from
            other code and from tests.

    Returns:
        Namespace with ``input``, ``train_output``, ``val_output`` and
        ``test_output`` (``None`` when the optional positional is omitted),
        plus ``train_percent``, ``val_percent``, ``test_percent`` and ``seed``.
    """
    parser = argparse.ArgumentParser(
        description="Split a JSONL labels file into train/val[/test] by percentage."
    )
    parser.add_argument("input", type=Path, help="Path to the input JSONL.")
    parser.add_argument("train_output", type=Path, help="Where to write the train JSONL.")
    parser.add_argument("val_output", type=Path, help="Where to write the val JSONL.")
    # The presence of this optional positional is what selects a 3-way split.
    parser.add_argument(
        "test_output",
        type=Path,
        nargs="?",
        default=None,
        help="Optional path for test JSONL output. If omitted, 2-way split only.",
    )
    parser.add_argument(
        "--train", "--train-percent",
        dest="train_percent",
        type=float,
        default=85.0,
        help="Percent of samples for train (0-100). Default: 85.",
    )
    # val/test default to None so the caller can tell "not given" apart from
    # an explicit value and derive the default from the split mode.
    parser.add_argument(
        "--val", "--val-percent",
        dest="val_percent",
        type=float,
        default=None,
        help=(
            "Percent of samples for val (0-100). "
            "Defaults to 100-train for 2-way split, 10 for 3-way split."
        ),
    )
    parser.add_argument(
        "--test", "--test-percent",
        dest="test_percent",
        type=float,
        default=None,
        help="Percent of samples for test (0-100). Only used when test_output is given.",
    )
    parser.add_argument("--seed", type=int, default=42, help="Random seed. Default: 42.")
    return parser.parse_args(argv)


def iter_jsonl(path: Path) -> Iterable[dict]:
    """Yield every JSON object found in a JSONL file, one per line.

    Reading is best-effort: blank lines, lines that are not valid JSON and
    lines whose top-level value is not an object are skipped without error.

    Args:
        path: Location of the JSONL file to read.

    Yields:
        Each line's decoded JSON object as a ``dict``, in file order.
    """
    # "utf-8-sig" decodes plain UTF-8 unchanged but also strips a leading
    # byte-order mark. With plain "utf-8" the BOM stays glued to the first
    # line, json.loads rejects it, and the first record is silently lost.
    with path.open("r", encoding="utf-8-sig") as fh:
        for line in fh:
            line = line.strip()
            if not line:
                continue
            try:
                rec = json.loads(line)
            except json.JSONDecodeError:
                # Deliberately tolerant: a malformed line is dropped rather
                # than aborting the whole read.
                continue
            if isinstance(rec, dict):
                yield rec

def write_jsonl(path: Path, records: list[dict]) -> None:
    """Serialize ``records`` to ``path`` as JSONL, one object per line.

    Missing parent directories are created first, and any existing file at
    ``path`` is overwritten.

    Args:
        path: Destination file.
        records: Objects to serialize, written in the given order.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as sink:
        sink.writelines(json.dumps(item) + "\n" for item in records)


def main() -> None:
    """Shuffle the input records and write the train/val[/test] JSONL files.

    The split mode is chosen by whether the optional ``test_output``
    positional was supplied:

    * 2-way: val defaults to ``100 - train``; train and val must sum to 100.
    * 3-way: val defaults to 10 and test defaults to ``100 - train - val``;
      train, val and test must sum to 100.

    Split sizes are ``int(n * pct / 100)`` for the leading splits; the last
    split takes every remaining record, so no record is dropped.

    Raises:
        SystemExit: If the input yields no records, or if the requested
            percentages are out of range or do not sum to 100.
    """
    args = parse_args()
    records = list(iter_jsonl(args.input))
    if not records:
        raise SystemExit(f"No valid records found in {args.input}")

    # Seeding before the shuffle makes the split reproducible for a given
    # seed and input file.
    random.seed(args.seed)
    random.shuffle(records)
    n = len(records)
    train_pct = args.train_percent

    if args.test_output is None:
        # 2-way split
        val_percent = args.val_percent if args.val_percent is not None else (100.0 - train_pct)
        if not (0 < train_pct < 100 and 0 < val_percent < 100):
            raise SystemExit("Train/val percents must be between 0 and 100.")
        if abs((train_pct + val_percent) - 100.0) > 1e-6:
            raise SystemExit("Train/val percents must sum to 100 for 2-way split.")
        train_count = int(n * (train_pct / 100.0))
        train_recs = records[:train_count]
        val_recs = records[train_count:]

        write_jsonl(args.train_output, train_recs)
        write_jsonl(args.val_output, val_recs)
        print(
            f"Split {n} records -> "
            f"train {len(train_recs)} ({train_pct:.1f}%), "
            f"val {len(val_recs)} ({val_percent:.1f}%)"
        )
        print(f"  {args.train_output}")
        print(f"  {args.val_output}")
    else:
        # 3-way split
        val_pct = args.val_percent if args.val_percent is not None else 10.0
        test_pct = args.test_percent if args.test_percent is not None else (100.0 - train_pct - val_pct)
        if not (0 < train_pct < 100 and 0 < val_pct < 100 and test_pct > 0):
            raise SystemExit("Train/val percents must be between 0 and 100 and leave room for test.")
        # Test is always "whatever is left", so an explicit --test that does
        # not match 100 - train - val would otherwise be ignored silently,
        # and the summary below would misreport it. Reject it, as the 2-way
        # branch already does.
        if abs((train_pct + val_pct + test_pct) - 100.0) > 1e-6:
            raise SystemExit("Train/val/test percents must sum to 100 for 3-way split.")
        train_count = int(n * (train_pct / 100.0))
        val_count = int(n * (val_pct / 100.0))
        val_end = train_count + val_count
        train_recs = records[:train_count]
        val_recs = records[train_count:val_end]
        test_recs = records[val_end:]

        write_jsonl(args.train_output, train_recs)
        write_jsonl(args.val_output, val_recs)
        write_jsonl(args.test_output, test_recs)
        print(
            f"Split {n} records -> "
            f"train {len(train_recs)} ({train_pct:.1f}%), "
            f"val {len(val_recs)} ({val_pct:.1f}%), "
            f"test {len(test_recs)} ({test_pct:.1f}%)"
        )
        print(f"  {args.train_output}")
        print(f"  {args.val_output}")
        print(f"  {args.test_output}")


# Run the split only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()