Skip to content

Commit 271b9f2

Browse files
authored
Merge pull request #6 from FlacSy/develop
Add ruff: Makefile targets (lint, format, format-fix, lint-fix), dev …
2 parents 4aadebb + a2564ec commit 271b9f2

8 files changed

Lines changed: 163 additions & 52 deletions

File tree

Makefile

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
.PHONY: develop build test test-rust test-python test-wasm bench bench-rust bench-python bench-compare wasm wasm-nodejs npm-publish lang-packages npm-publish-languages
1+
.PHONY: develop build test test-rust test-python test-wasm bench bench-rust bench-python bench-compare lint lint-fix format format-fix wasm wasm-nodejs npm-publish lang-packages npm-publish-languages
22

33
develop:
44
cd python && maturin develop
@@ -38,6 +38,22 @@ bench-python:
3838
@if [ -d .venv ]; then .venv/bin/python -m pytest tests/bench_filter.py -v --benchmark-only; \
3939
else python3 -m pytest tests/bench_filter.py -v --benchmark-only; fi
4040

41+
# Ruff: lint (check only)
42+
lint:
43+
@if [ -d .venv ]; then .venv/bin/ruff check .; else ruff check .; fi
44+
45+
# Ruff: format check (CI)
46+
format:
47+
@if [ -d .venv ]; then .venv/bin/ruff format --check .; else ruff format --check .; fi
48+
49+
# Ruff: format fix (apply formatting)
50+
format-fix:
51+
@if [ -d .venv ]; then .venv/bin/ruff format .; else ruff format .; fi
52+
53+
# Ruff: lint with auto-fix
54+
lint-fix:
55+
@if [ -d .venv ]; then .venv/bin/ruff check . --fix; else ruff check . --fix; fi
56+
4157
# WebAssembly build for browser
4258
wasm:
4359
cd rust/badwords-wasm && wasm-pack build --target web --out-dir pkg

ml/prepare_data.py

Lines changed: 53 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,14 @@
1010
import pandas as pd
1111
from datasets import load_dataset
1212

13-
TOXIC_COLUMNS = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
13+
TOXIC_COLUMNS = [
14+
"toxic",
15+
"severe_toxic",
16+
"obscene",
17+
"threat",
18+
"insult",
19+
"identity_hate",
20+
]
1421
TEXT_COLUMN = "comment_text"
1522
OUTPUT_DIR = Path(__file__).parent / "data" / "processed"
1623

@@ -46,29 +53,51 @@ def load_single(
4653
if label_source == "paradetox":
4754
# toxic = 1, neutral/detox = 0
4855
input_col = next(
49-
(c for c in [
50-
"input", "source", "toxic",
51-
"en_toxic_comment", "ru_toxic_comment", "toxic_sentence",
52-
] if c in df.columns),
56+
(
57+
c
58+
for c in [
59+
"input",
60+
"source",
61+
"toxic",
62+
"en_toxic_comment",
63+
"ru_toxic_comment",
64+
"toxic_sentence",
65+
]
66+
if c in df.columns
67+
),
5368
None,
5469
)
5570
output_col = next(
56-
(c for c in [
57-
"output", "target", "detox",
58-
"en_neutral_comment", "ru_neutral_comment", "neutral_sentence",
59-
] if c in df.columns),
71+
(
72+
c
73+
for c in [
74+
"output",
75+
"target",
76+
"detox",
77+
"en_neutral_comment",
78+
"ru_neutral_comment",
79+
"neutral_sentence",
80+
]
81+
if c in df.columns
82+
),
6083
None,
6184
)
6285
if not input_col or not output_col:
63-
raise ValueError(f"ParaDetox format needs toxic/neutral columns. Columns: {list(df.columns)}")
86+
raise ValueError(
87+
f"ParaDetox format needs toxic/neutral columns. Columns: {list(df.columns)}"
88+
)
6489
toxic_df = df[[input_col]].rename(columns={input_col: TEXT_COLUMN})
6590
toxic_df["label"] = 1
6691
clean_df = df[[output_col]].rename(columns={output_col: TEXT_COLUMN})
6792
clean_df["label"] = 0
6893
df = pd.concat([toxic_df, clean_df], ignore_index=True)
6994
else:
7095
text_col = text_col or next(
71-
(c for c in ["comment_text", "text", "comment", "sentence", "content"] if c in df.columns),
96+
(
97+
c
98+
for c in ["comment_text", "text", "comment", "sentence", "content"]
99+
if c in df.columns
100+
),
72101
None,
73102
)
74103
if not text_col:
@@ -81,7 +110,9 @@ def load_single(
81110
# civil_comments: toxicity 0-1, threshold 0.5
82111
tox_col = next((c for c in ["toxicity", "toxic"] if c in df.columns), None)
83112
if not tox_col:
84-
raise ValueError(f"Toxicity column not found. Columns: {list(df.columns)}")
113+
raise ValueError(
114+
f"Toxicity column not found. Columns: {list(df.columns)}"
115+
)
85116
df["label"] = (df[tox_col].fillna(0) >= 0.5).astype(int)
86117
elif label_source.startswith("toxic"):
87118
toxic_cols = [c for c in TOXIC_COLUMNS if c in df.columns]
@@ -132,7 +163,10 @@ def load_multilingual(max_samples_per_dataset: int | None = None) -> pd.DataFram
132163

133164
# English + Russian + multilingual paradetox
134165
for name, (ds, _, src) in DATASET_PRESETS.items():
135-
if name in ("paradetox", "ru_paradetox", "multilingual_paradetox") and src == "paradetox":
166+
if (
167+
name in ("paradetox", "ru_paradetox", "multilingual_paradetox")
168+
and src == "paradetox"
169+
):
136170
try:
137171
df = load_single(ds, src, None, max_samples_per_dataset, 3, 512)
138172
dfs.append(df)
@@ -144,7 +178,9 @@ def load_multilingual(max_samples_per_dataset: int | None = None) -> pd.DataFram
144178
return pd.concat(dfs, ignore_index=True).drop_duplicates(subset=[TEXT_COLUMN])
145179

146180

147-
def balance(df: pd.DataFrame, ratio: float = 0.3, max_total: int | None = None) -> pd.DataFrame:
181+
def balance(
182+
df: pd.DataFrame, ratio: float = 0.3, max_total: int | None = None
183+
) -> pd.DataFrame:
148184
"""Balance classes. ratio = fraction of positive samples. max_total caps result size."""
149185
pos = df[df["label"] == 1]
150186
neg = df[df["label"] == 0]
@@ -213,7 +249,9 @@ def main() -> None:
213249
ds_name, text_col, label_src = DATASET_PRESETS[args.preset]
214250
df = load_single(ds_name, label_src, text_col, args.max_samples, 3, 512)
215251

216-
print(f"Total: {len(df)} samples, {df['label'].sum()} positive ({df['label'].mean():.2%})")
252+
print(
253+
f"Total: {len(df)} samples, {df['label'].sum()} positive ({df['label'].mean():.2%})"
254+
)
217255

218256
df_balanced = balance(df, ratio=args.positive_ratio, max_total=args.max_total)
219257
print(f"Balanced: {len(df_balanced)} samples")

ml/quantize_model.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,9 @@ def main() -> None:
5858
target.unlink()
5959
shutil.copy(quant_path, target)
6060
new_size = target.stat().st_size
61-
print(f"Done: {orig_size / 1e6:.1f} MB -> {new_size / 1e6:.1f} MB ({100 * new_size / orig_size:.0f}%)")
61+
print(
62+
f"Done: {orig_size / 1e6:.1f} MB -> {new_size / 1e6:.1f} MB ({100 * new_size / orig_size:.0f}%)"
63+
)
6264

6365

6466
if __name__ == "__main__":

ml/test_inference.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313

1414
# Expected: 1=toxic, 0=clean
1515
TEST_CASES = [
16-
1716
("Поздравзяю теперь ты не тупой", 1),
1817
]
1918

@@ -29,9 +28,7 @@ def predict(model, tokenizer, text: str) -> float:
2928
def main() -> None:
3029
print("Loading model...")
3130
model = ORTModelForSequenceClassification.from_pretrained(str(MODELS_DIR))
32-
tokenizer = AutoTokenizer.from_pretrained(
33-
str(MODELS_DIR), fix_mistral_regex=True
34-
)
31+
tokenizer = AutoTokenizer.from_pretrained(str(MODELS_DIR), fix_mistral_regex=True)
3532

3633
print("\n" + "=" * 70)
3734
print("Toxicity scores (1.0 = toxic, 0.5 threshold)")
@@ -49,7 +46,9 @@ def main() -> None:
4946
print(f" {prob:.3f} [{label:5}] {ok} (exp: {exp_str}) {text!r}")
5047

5148
print("=" * 70)
52-
print(f"Accuracy: {correct}/{len(TEST_CASES)} ({100 * correct / len(TEST_CASES):.0f}%)")
49+
print(
50+
f"Accuracy: {correct}/{len(TEST_CASES)} ({100 * correct / len(TEST_CASES):.0f}%)"
51+
)
5352
print("Note: evasion (leetspeak, spacing), indirect RU insults often missed.")
5453
print("=" * 70)
5554

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ classifiers = [
3232
]
3333

3434
[project.optional-dependencies]
35-
dev = ["pytest>=7.0", "pytest-benchmark>=4.0"]
35+
dev = ["pytest>=7.0", "pytest-benchmark>=4.0", "ruff>=0.4"]
3636
ml = ["onnxruntime>=1.16", "optimum[onnxruntime]>=1.14", "transformers>=4.36"]
3737

3838
[project.urls]

python/badwords/ml/_paths.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -62,13 +62,19 @@ def _download_model(cache_dir: Path) -> None:
6262
zip_path = cache_dir / ASSET_NAME
6363

6464
# Get latest release
65-
api_url = f"https://api.github.com/repos/{GITHUB_OWNER}/{GITHUB_REPO}/releases/latest"
66-
req = urllib.request.Request(api_url, headers={"Accept": "application/vnd.github+json"})
65+
api_url = (
66+
f"https://api.github.com/repos/{GITHUB_OWNER}/{GITHUB_REPO}/releases/latest"
67+
)
68+
req = urllib.request.Request(
69+
api_url, headers={"Accept": "application/vnd.github+json"}
70+
)
6771
with urllib.request.urlopen(req, timeout=30) as r:
6872
release = json.loads(r.read().decode())
6973

7074
# Find asset
71-
asset = next((a for a in release.get("assets", []) if a["name"] == ASSET_NAME), None)
75+
asset = next(
76+
(a for a in release.get("assets", []) if a["name"] == ASSET_NAME), None
77+
)
7278
if not asset:
7379
raise FileNotFoundError(
7480
f"Asset {ASSET_NAME} not found in release {release.get('tag_name', '?')}. "

python/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ classifiers = [
3232
]
3333

3434
[project.optional-dependencies]
35-
dev = ["pytest>=7.0", "pytest-benchmark>=4.0"]
35+
dev = ["pytest>=7.0", "pytest-benchmark>=4.0", "ruff>=0.4"]
3636
ml = ["onnxruntime>=1.16", "optimum[onnxruntime]>=1.14", "transformers>=4.36"]
3737

3838
[project.urls]

0 commit comments

Comments (0)