
Commit 10f517b

feat: ML module, benchmarks, GitHub Releases model storage

- Add badwords.ml with ToxicityPredictor (badwords-py[ml])
- Model path: BADWORDS_ML_PATH, ml/models (dev), cache, GitHub Releases
- Benchmarks: rule-based + ML comparison vs glin-profanity
- Throughput in benchmark output
- make ml-package for GitHub Release asset
- make bench-compare for BadWords vs glin

Made-with: Cursor

1 parent e98ab2c

16 files changed: 1158 additions & 1 deletion

.gitignore

Lines changed: 4 additions & 0 deletions
@@ -5,6 +5,10 @@ __pycache__/
 .ruff_cache
 .mypy_cache
 dist
+release/
+ml/data/
+ml/models/
+badwords-ml-model.zip
 *.egg-info
 .idea
 t.py

Makefile

Lines changed: 34 additions & 1 deletion
@@ -1,4 +1,4 @@
-.PHONY: develop build test test-rust test-python test-wasm bench bench-rust bench-python wasm wasm-nodejs npm-publish lang-packages npm-publish-languages
+.PHONY: develop build test test-rust test-python test-wasm bench bench-rust bench-python bench-compare wasm wasm-nodejs npm-publish lang-packages npm-publish-languages
 
 develop:
 	cd python && maturin develop
@@ -26,6 +26,11 @@ test-wasm:
 
 bench: bench-rust bench-python
 
+bench-compare:
+	@echo "BadWords vs glin-profanity (requires: pip install glin-profanity)"
+	@if [ -d .venv ]; then .venv/bin/python scripts/bench_compare.py; \
+	else python3 scripts/bench_compare.py; fi
+
 bench-rust:
 	cargo bench -p badwords-core
 
@@ -49,3 +54,31 @@ lang-packages:
 
 npm-publish-languages:
 	cd js/languages && npm publish --access public
+
+# ML training (requires: pip install -r ml/requirements.txt)
+ml-prepare:
+	cd ml && python prepare_data.py --preset multilingual
+
+# Full dataset (~600k samples, ~8-10h training with xlm-roberta)
+ml-prepare-full:
+	cd ml && python prepare_data.py --preset multilingual --max-total 600000
+
+# Max dataset (no cap, ~1M+ samples, ~15-20h)
+ml-prepare-max:
+	cd ml && python prepare_data.py --preset multilingual
+
+ml-train:
+	cd ml && python train.py
+
+ml-test:
+	cd ml && python test_inference.py
+
+# Quantize model: 500MB -> ~135MB
+ml-quantize:
+	cd ml && python quantize_model.py
+
+# Package ML model for GitHub Release (upload as badwords-ml-model.zip)
+ml-package:
+	@if [ ! -f ml/models/model.onnx ]; then echo "Run ml-train and ml-quantize first"; exit 1; fi
+	(cd ml/models && zip -r ../../badwords-ml-model.zip . -x "checkpoints/*" -x "checkpoints/*/*")
+	@echo "Created badwords-ml-model.zip — upload to GitHub Release"
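The new bench-compare target delegates to scripts/bench_compare.py, which is not shown in this commit view. A minimal timing harness of the same shape, with a stand-in checker (`bench`, `report`, and `contains_bad` are illustrative; the real script would time badwords against glin-profanity):

```python
import time

def bench(fn, *args, iters=1000):
    """Time fn over iters calls; return mean seconds per call."""
    start = time.perf_counter()
    for _ in range(iters):
        fn(*args)
    return (time.perf_counter() - start) / iters

def report(name, per_call):
    """Print latency and implied throughput, like the benchmark output."""
    print(f"{name}: {per_call * 1e6:.1f} µs/call ({1 / per_call:,.0f}/s)")

# Stand-in for a profanity check; the real script would call
# badwords.filter_text and glin-profanity's checker on the same inputs.
contains_bad = lambda text: "badword" in text.lower()

report("clean text", bench(contains_bad, "a perfectly clean sentence"))
report("bad word", bench(contains_bad, "contains a BadWord here"))
```

Reporting both µs/call and calls/s matches the "throughput in benchmark output" item from the commit message.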

README.md

Lines changed: 46 additions & 0 deletions
@@ -29,6 +29,7 @@ with multilingual support and evasion detection.**
 
 [Installation](#-installation)
 [Quick Start](#-quick-start)
+[Benchmarks](#-benchmarks)
 [Supported Languages](#-supported-languages)
 [Evasion Detection](#-advanced-evasion-detection)
 [Documentation](https://badwords.flacsy.dev)
@@ -96,6 +97,51 @@ print(clean_text)  # "Some very *** text here"
 
 ---
 
+## ⏱ Benchmarks
+
+| CPU | GPU | RAM | OS |
+|-----|-----|-----|----|
+| Intel® Core™ i7-10700KF × 16 (x86_64) | NVIDIA GeForce RTX™ 3070 | 64 GB DDR4 3200 MHz | Ubuntu 24.04.2 LTS |
+
+Rule-based matching (en+ru, `match_threshold=1.0`). Run: `make bench`
+
+| Scenario | Rust (badwords-core) | Python (badwords-py) |
+|----------|----------------------|----------------------|
+| Clean text (no match) | ~7.6 µs (~130 K/s) | ~7.7 µs (~130 K/s) |
+| Bad word (match) | ~3.1 µs (~320 K/s) | ~2.7 µs (~370 K/s) |
+| Censor (replace) | ~2.8 µs (~360 K/s) | ~2.5 µs (~400 K/s) |
+| 5 texts batch | ~15 µs (~330 K/s) | ~16 µs (~310 K/s) |
+
+*Python calls the Rust core via PyO3, so the overhead is minimal.*
+
+### vs glin-profanity
+
+Rule-based mode, en+ru. Run: `make bench-compare` (requires `pip install glin-profanity`).
+
+| Scenario | BadWords | glin-profanity |
+|----------|----------|----------------|
+| Clean text | ~7 µs (~140 K/s) | ~4.4 ms (~230/s) |
+| Bad word | ~1.3 µs (~770 K/s) | ~0.2 ms (~5 K/s) |
+| Censor | ~1.8 µs (~560 K/s) | ~1.4 ms (~700/s) |
+| 5 texts batch | ~16 µs (~310 K/s) | ~10 ms (~500/s) |
+
+*BadWords is ~100–600× faster (Rust core vs pure Python).*
+
+### ML mode
+
+`pip install glin-profanity[ml]`, then `make bench-compare`. 100 iterations each.
+
+| Scenario | BadWords ML (ONNX) | glin transformer |
+|----------|--------------------|------------------|
+| Clean text (43 chars) | ~6.5 ms (~150/s) | ~27 ms (~37/s) |
+| Bad word (8 chars) | ~4.6 ms (~220/s) | ~21 ms (~47/s) |
+| 5 texts batch (82 chars) | ~24 ms (~210/s) | ~107 ms (~47/s) |
+
+*BadWords ML (XLM-RoBERTa) is ~3–4× faster than the glin transformer.*
+
+---
+
 ## 🛠 Methods & API
 
 ### `filter_text(text, match_threshold=1.0, replace_character=None)`
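The throughput columns in these benchmark tables are just the reciprocal of the per-call latency. A quick sanity check against two entries from the rule-based table:

```python
def throughput_per_s(latency_us: float) -> float:
    """Calls per second implied by a mean per-call latency in microseconds."""
    return 1e6 / latency_us

# Entries from the rule-based table: 7.6 µs -> ~130 K/s, 3.1 µs -> ~320 K/s.
assert round(throughput_per_s(7.6) / 1000) == 132  # ~130 K/s, clean text
assert round(throughput_per_s(3.1) / 1000) == 323  # ~320 K/s, bad word match
```

The same conversion explains the ML rows, where millisecond latencies drop throughput to the hundreds per second.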

ml/README.md

Lines changed: 81 additions & 0 deletions
@@ -0,0 +1,81 @@
+# ML Training Pipeline
+
+Data preparation and model training for `badwords-py[ml]`.
+
+## Setup
+
+```bash
+cd ml
+pip install -r requirements.txt
+```
+
+### CUDA (GPU)
+
+Install PyTorch with CUDA **before** the other deps:
+
+```bash
+# CUDA 12.4 (or cu121 for older drivers)
+pip install torch --index-url https://download.pytorch.org/whl/cu124
+```
+
+Check: `python -c "import torch; print(torch.cuda.is_available())"` should print `True`.
+
+The trainer uses the GPU automatically when available.
+
+## Usage
+
+### 1. Prepare data
+
+```bash
+# Quick (~30k): EN + RU + 9 languages
+python prepare_data.py --preset multilingual
+
+# Full (~600k): SetFit + civil_comments + paradetox
+python prepare_data.py --preset multilingual --max-total 600000
+
+# Max (~1M+): all data, no cap
+python prepare_data.py --preset multilingual
+
+# English only
+python prepare_data.py --preset toxic_conversations --max-samples 200000
+
+# Single dataset
+python prepare_data.py --preset single --dataset SetFit/toxic_conversations
+```
+
+### 2. Train model
+
+```bash
+# Default: xlm-roberta (best quality), then quantize
+python train.py --epochs 2 --batch-size 8
+
+# Lighter: distilbert (faster training)
+python train.py --model distilbert-base-multilingual-cased --epochs 2 --batch-size 32
+```
+
+Output: `models/` (ONNX + tokenizer)
+
+### 3. Quantize (~4x smaller, recommended)
+
+```bash
+python quantize_model.py
+# xlm-roberta: 500MB -> ~135MB
+# distilbert: 250MB -> ~65MB
+```
+
+## Datasets
+
+| Preset | Sources | Languages | Size |
+|--------|---------|-----------|------|
+| `multilingual` | SetFit + paradetox + ru_paradetox + multilingual_paradetox | EN, RU, UK, DE, ES, AR, ZH, HI, AM | 30k+ |
+| `toxic_conversations` | SetFit/toxic_conversations | EN | 1.8M |
+| `civil_comments` | google/civil_comments | EN | 2M |
+| `paradetox` | s-nlp/paradetox | EN | 20k |
+| `ru_paradetox` | s-nlp/ru_paradetox | RU | 12k |
+
+## Model
+
+- **Default:** `xlm-roberta-base` (best quality, ~135MB after quantize)
+- **Lighter:** `distilbert-base-multilingual-cased` (~65MB after quantize, faster training)
+- Task: binary classification (offensive probability)
+- Output: ONNX for inference