diff --git a/CONTEXT.md b/CONTEXT.md index 398bbf09..ad7aa4e5 100644 --- a/CONTEXT.md +++ b/CONTEXT.md @@ -57,6 +57,13 @@ _Avoid_: отдельная роль, модератор _Code aliases_: `low_sent_pool` _Avoid_: обычная лента, топовая лента +**Дедупликация мемов**: +Шаг пайплайна, который распознаёт повторки мемов до попадания в рекомендации, +помечает дубль как `duplicate`, переносит неконфликтующие реакции на оригинал +и обновляет счётчики оригинала. +_Code aliases_: `storage.deduplication`, `meme.duplicate_of`, `MemeStatus.DUPLICATE` +_Avoid_: ручной cleanup, постфактум чистка очереди + **Кандидат в источники**: Публичный источник мемов, найденный или предложенный для возможного добавления в бота. _Code aliases_: `meme_source_candidate`, Source Candidate @@ -108,6 +115,7 @@ _Avoid_: router ## Relationships - **Модератор** может участвовать в **модераторском чате** и получать **разбор новых мемов** в своей ленте. +- **Дедупликация мемов** должна выполняться до того, как мем становится доступен обычной рекомендательной системе. - **Модераторский чат** используется для комьюнити-циклов; **чат проверки загрузок** используется для решений по пользовательским загрузкам. - **Проверяющий загрузки** определяется доступом к **чату проверки загрузок**, а не `UserType`. - Только **русскоязычный кандидат в источники** может попасть в автоматическое **голосование за источник**. 
diff --git a/scripts/agent_doctor.py b/scripts/agent_doctor.py index d4ce19e1..76e30eb7 100755 --- a/scripts/agent_doctor.py +++ b/scripts/agent_doctor.py @@ -73,11 +73,21 @@ def check_command_available(command: str) -> CheckResult: def check_describe_memes_models(root: Path = ROOT) -> CheckResult: - path = root / "src" / "flows" / "storage" / "describe_memes.py" - try: - model_ids = extract_vision_models(path) - except Exception as exc: - return CheckResult("describe_memes:free_models", False, str(exc)) + paths = ( + root / "src" / "flows" / "storage" / "describe_memes.py", + root / "src" / "flows" / "storage" / "openrouter_vision.py", + ) + errors: list[str] = [] + for path in paths: + try: + model_ids = extract_vision_models(path) + source_path = path + break + except Exception as exc: + errors.append(str(exc)) + else: + return CheckResult("describe_memes:free_models", False, "; ".join(errors)) + paid = non_free_openrouter_models(model_ids) if paid: return CheckResult( @@ -85,7 +95,11 @@ def check_describe_memes_models(root: Path = ROOT) -> CheckResult: False, "non-free model ids: " + ", ".join(paid), ) - return CheckResult("describe_memes:free_models", True, f"{len(model_ids)} free model(s)") + return CheckResult( + "describe_memes:free_models", + True, + f"{len(model_ids)} free model(s) in {source_path.relative_to(root)}", + ) def check_paperclip_access_adapter( diff --git a/scripts/deep_parse.py b/scripts/deep_parse.py index ae0c0b4c..b650e09f 100644 --- a/scripts/deep_parse.py +++ b/scripts/deep_parse.py @@ -39,7 +39,6 @@ async def main(): # Find source in DB from sqlalchemy import text - from src.database import fetch_one source = await fetch_one( text("SELECT id, url, status FROM meme_source WHERE url = :url"), @@ -72,7 +71,7 @@ async def main(): if posts: await insert_parsed_posts_from_telegram(source["id"], posts) - print(f"Inserted into meme_raw_telegram") + print("Inserted into meme_raw_telegram") await update_meme_source( meme_source_id=source["id"], 
parsed_at=datetime.now(timezone.utc) diff --git a/scripts/e2e_smoke.py b/scripts/e2e_smoke.py index 80304673..f8cbb458 100644 --- a/scripts/e2e_smoke.py +++ b/scripts/e2e_smoke.py @@ -26,7 +26,6 @@ from telethon import TelegramClient from telethon.sessions import StringSession - # --- Config --- API_ID = os.environ.get("TELEGRAM_API_ID") API_HASH = os.environ.get("TELEGRAM_API_HASH") @@ -150,9 +149,11 @@ async def test_delete(client, bot): if btn.data and b"delete" in btn.data.lower(): await msg.click(data=btn.data) confirm_msg = await wait_for_response(client, bot, msg.id) - if confirm_msg and ("ciao" in (confirm_msg.text or "").lower() or "start" in (confirm_msg.text or "").lower()): + confirm_text = (confirm_msg.text or "").lower() if confirm_msg else "" + if confirm_msg and ("ciao" in confirm_text or "start" in confirm_text): return "PASS", f"State deleted: {(confirm_msg.text or '')[:80]}" - return "WARN", f"Delete clicked but unexpected response: {(confirm_msg.text if confirm_msg else 'no response')[:80]}" + response = (confirm_msg.text or "") if confirm_msg else "no response" + return "WARN", f"Delete clicked but unexpected response: {response[:80]}" if "sure" in (msg.text or "").lower() or "delete" in (msg.text or "").lower(): return "WARN", f"Got confirmation prompt but no button found: {(msg.text or '')[:80]}" diff --git a/scripts/eval_crossposting_ml.py b/scripts/eval_crossposting_ml.py new file mode 100644 index 00000000..f980ef5d --- /dev/null +++ b/scripts/eval_crossposting_ml.py @@ -0,0 +1,390 @@ +"""Offline evaluator for simple crossposting virality models. + +The goal is deliberately narrow: predict whether a posted image meme lands +above the channel median 24h forward rate. This is not a production ranker. +It is a read-only gate for deciding whether simple linear ML features are worth +promoting into shadow scoring. + +Usage: + ANALYST_DATABASE_URL=... python scripts/eval_crossposting_ml.py + ANALYST_DATABASE_URL=... 
python scripts/eval_crossposting_ml.py --days 120 +""" + +from __future__ import annotations + +import argparse +import asyncio +import math +import os +import sys +from dataclasses import dataclass +from typing import Iterable + +import asyncpg + +FEATURE_NAMES = [ + "log_source_signal", + "log_source_posts", + "log_pre_likes", + "pre_like_rate", + "log_pre_reactions", + "log_pre_share_users", + "caption_present", + "hour_sin", + "hour_cos", +] + + +@dataclass +class Example: + channel: str + posted_at: object + fwd_per_1k_24h: float + features: list[float] + + +async def get_connection() -> asyncpg.Connection: + url = os.environ.get("ANALYST_DATABASE_URL") or os.environ.get("DATABASE_URL") + if not url: + print("ERROR: set ANALYST_DATABASE_URL or DATABASE_URL", file=sys.stderr) + sys.exit(1) + return await asyncpg.connect(url, statement_cache_size=0) + + +async def fetch_examples(conn: asyncpg.Connection, days: int) -> list[Example]: + rows = await conn.fetch( + """ + WITH labels AS ( + SELECT + cp.channel, + cp.meme_id, + cp.created_at AS posted_at, + m.meme_source_id, + (m.caption IS NOT NULL)::int AS caption_present, + s24.views AS views_24h, + s24.forwards AS forwards_24h, + 1000.0 * s24.forwards / NULLIF(s24.views, 0) AS fwd_per_1k_24h + FROM crossposting cp + JOIN meme m ON m.id = cp.meme_id + JOIN LATERAL ( + SELECT cps.snapshot_at, cps.views, cps.forwards + FROM crossposting_snapshots cps + WHERE cps.channel = cp.channel + AND cps.meme_id = cp.meme_id + AND cps.snapshot_at BETWEEN cp.created_at + interval '20 hours' + AND cp.created_at + interval '36 hours' + AND cps.views > 0 + AND cps.forwards IS NOT NULL + ORDER BY abs( + extract(epoch FROM cps.snapshot_at - (cp.created_at + interval '24 hours')) + ) + LIMIT 1 + ) s24 ON true + WHERE cp.channel IN ('tgchannelru', 'tgchannelen') + AND cp.created_at < now() - interval '36 hours' + AND cp.created_at >= now() - ($1 || ' days')::interval + AND m.type = 'image' + ), + reaction_features AS ( + SELECT + 
l.channel, + l.meme_id, + count(*) FILTER (WHERE r.reaction_id = 1) AS pre_likes, + count(*) FILTER (WHERE r.reaction_id = 2) AS pre_skips, + count(*) FILTER (WHERE r.reaction_id IN (1, 2)) AS pre_reactions + FROM labels l + LEFT JOIN user_meme_reaction r + ON r.meme_id = l.meme_id + AND r.reacted_at IS NOT NULL + AND r.reacted_at < l.posted_at + AND r.reaction_id IN (1, 2) + GROUP BY l.channel, l.meme_id + ), + share_clicks AS ( + SELECT + share_match.parts[2]::bigint AS meme_id, + udll.user_id, + udll.created_at + FROM user_deep_link_log udll + CROSS JOIN LATERAL regexp_matches( + udll.deep_link, + '^s_([1-9][0-9]{0,18})_([1-9][0-9]{0,18})$' + ) AS share_match(parts) + WHERE udll.created_at >= now() - ($1 || ' days')::interval + AND CASE + WHEN length(share_match.parts[1]) = 19 + AND share_match.parts[1] > '9223372036854775807' THEN false + WHEN length(share_match.parts[2]) = 19 + AND share_match.parts[2] > '9223372036854775807' THEN false + ELSE udll.user_id <> share_match.parts[1]::bigint + END + ), + share_features AS ( + SELECT + l.channel, + l.meme_id, + count(*) AS pre_share_clicks, + count(DISTINCT sc.user_id) AS pre_share_users + FROM labels l + LEFT JOIN share_clicks sc + ON sc.meme_id = l.meme_id + AND sc.created_at < l.posted_at + GROUP BY l.channel, l.meme_id + ) + SELECT + l.channel, + l.posted_at, + l.fwd_per_1k_24h, + l.caption_present, + COALESCE(rf.pre_likes, 0) AS pre_likes, + COALESCE(rf.pre_skips, 0) AS pre_skips, + COALESCE(rf.pre_reactions, 0) AS pre_reactions, + COALESCE(sf.pre_share_users, 0) AS pre_share_users, + extract(hour FROM l.posted_at + interval '3 hours')::int AS hour_msk, + COALESCE(sq.source_signal, 0) AS source_signal, + COALESCE(sq.source_posts, 0) AS source_posts + FROM labels l + JOIN reaction_features rf ON rf.channel = l.channel AND rf.meme_id = l.meme_id + JOIN share_features sf ON sf.channel = l.channel AND sf.meme_id = l.meme_id + LEFT JOIN LATERAL ( + SELECT + AVG(cp2.forwards * SQRT(GREATEST(cp2.views, 1) / 100.0)) 
AS source_signal, + COUNT(*) AS source_posts + FROM crossposting cp2 + JOIN meme m2 ON m2.id = cp2.meme_id + WHERE cp2.channel = l.channel + AND cp2.created_at > l.posted_at - interval '30 days' + AND cp2.created_at < l.posted_at - interval '48 hours' + AND cp2.views IS NOT NULL + AND cp2.views > 0 + AND cp2.forwards IS NOT NULL + AND m2.type = 'image' + AND m2.meme_source_id = l.meme_source_id + ) sq ON true + ORDER BY l.channel, l.posted_at + """, + str(days), + ) + + examples: list[Example] = [] + for row in rows: + pre_reactions = row["pre_reactions"] or 0 + pre_likes = row["pre_likes"] or 0 + pre_like_rate = pre_likes / pre_reactions if pre_reactions else 0.5 + hour_angle = 2 * math.pi * (row["hour_msk"] or 0) / 24 + features = [ + math.log1p(float(row["source_signal"] or 0)), + math.log1p(float(row["source_posts"] or 0)), + math.log1p(float(pre_likes)), + pre_like_rate, + math.log1p(float(pre_reactions)), + math.log1p(float(row["pre_share_users"] or 0)), + float(row["caption_present"] or 0), + math.sin(hour_angle), + math.cos(hour_angle), + ] + examples.append( + Example( + channel=row["channel"], + posted_at=row["posted_at"], + fwd_per_1k_24h=float(row["fwd_per_1k_24h"]), + features=features, + ) + ) + return examples + + +def median(values: Iterable[float]) -> float: + ordered = sorted(values) + n = len(ordered) + if n == 0: + raise ValueError("median of empty list") + midpoint = n // 2 + if n % 2: + return ordered[midpoint] + return (ordered[midpoint - 1] + ordered[midpoint]) / 2 + + +def standardize( + train_x: list[list[float]], + test_x: list[list[float]], +) -> tuple[list[list[float]], list[list[float]]]: + n_features = len(train_x[0]) + means = [sum(x[j] for x in train_x) / len(train_x) for j in range(n_features)] + stds = [] + for j in range(n_features): + variance = sum((x[j] - means[j]) ** 2 for x in train_x) / len(train_x) + stds.append(math.sqrt(variance) or 1.0) + + def transform(rows: list[list[float]]) -> list[list[float]]: + return [[(x[j] - 
means[j]) / stds[j] for j in range(n_features)] for x in rows] + + return transform(train_x), transform(test_x) + + +def sigmoid(value: float) -> float: + if value >= 0: + z = math.exp(-value) + return 1 / (1 + z) + z = math.exp(value) + return z / (1 + z) + + +def train_logistic_regression( + train_x: list[list[float]], + train_y: list[int], + *, + iterations: int, + lr: float, + l2: float, +) -> list[float]: + n_features = len(train_x[0]) + weights = [0.0] * (n_features + 1) + + for _ in range(iterations): + gradients = [0.0] * (n_features + 1) + for x, y in zip(train_x, train_y): + z = weights[0] + sum(w * v for w, v in zip(weights[1:], x)) + error = sigmoid(z) - y + gradients[0] += error + for j, value in enumerate(x, start=1): + gradients[j] += error * value + + n = len(train_x) + weights[0] -= lr * gradients[0] / n + for j in range(1, len(weights)): + gradients[j] = gradients[j] / n + l2 * weights[j] + weights[j] -= lr * gradients[j] + + return weights + + +def predict(weights: list[float], rows: list[list[float]]) -> list[float]: + return [sigmoid(weights[0] + sum(w * v for w, v in zip(weights[1:], x))) for x in rows] + + +def auc(scores: list[float], labels: list[int]) -> float: + positives = [(s, y) for s, y in zip(scores, labels) if y == 1] + negatives = [(s, y) for s, y in zip(scores, labels) if y == 0] + if not positives or not negatives: + return 0.5 + + wins = 0.0 + total = 0 + for pos_score, _ in positives: + for neg_score, _ in negatives: + total += 1 + if pos_score > neg_score: + wins += 1 + elif pos_score == neg_score: + wins += 0.5 + return wins / total + + +def top_quintile_lift(scores: list[float], labels: list[int]) -> float: + if not labels or sum(labels) == 0: + return 0.0 + paired = sorted(zip(scores, labels), key=lambda pair: pair[0], reverse=True) + top_n = max(1, math.ceil(len(paired) * 0.2)) + selected_count = 0.0 + selected_positives = 0.0 + index = 0 + while selected_count < top_n and index < len(paired): + score = paired[index][0] + 
group_labels: list[int] = [] + while index < len(paired) and paired[index][0] == score: + group_labels.append(paired[index][1]) + index += 1 + + remaining = top_n - selected_count + if len(group_labels) <= remaining: + selected_count += len(group_labels) + selected_positives += sum(group_labels) + else: + selected_count += remaining + selected_positives += sum(group_labels) * (remaining / len(group_labels)) + + top_rate = selected_positives / top_n + base_rate = sum(labels) / len(labels) + return top_rate / base_rate if base_rate else 0.0 + + +def evaluate_channel(channel: str, examples: list[Example], train_fraction: float) -> None: + channel_examples = [e for e in examples if e.channel == channel] + channel_examples.sort(key=lambda e: e.posted_at) + if len(channel_examples) < 30: + print(f"\n{channel}: not enough labeled posts ({len(channel_examples)})") + return + + split = max(1, min(len(channel_examples) - 1, int(len(channel_examples) * train_fraction))) + train = channel_examples[:split] + test = channel_examples[split:] + threshold = median(e.fwd_per_1k_24h for e in train) + + train_x = [e.features for e in train] + test_x = [e.features for e in test] + train_y = [int(e.fwd_per_1k_24h >= threshold) for e in train] + test_y = [int(e.fwd_per_1k_24h >= threshold) for e in test] + + if len(set(train_y)) < 2 or len(set(test_y)) < 2: + print(f"\n{channel}: split has one target class, cannot evaluate") + return + + train_x_std, test_x_std = standardize(train_x, test_x) + weights = train_logistic_regression( + train_x_std, + train_y, + iterations=2500, + lr=0.05, + l2=0.05, + ) + scores = predict(weights, test_x_std) + + baselines = { + "source_signal": [x[0] for x in test_x], + "pre_likes": [x[2] for x in test_x], + "pre_share_users": [x[5] for x in test_x], + } + + print(f"\n{channel}") + print(f" labeled posts: {len(channel_examples)}") + print(f" train/test: {len(train)}/{len(test)}") + print(f" train median target: {threshold:.2f} fwd/1k") + print(f" 
logistic_auc: {auc(scores, test_y):.3f}") + print(f" logistic_top20_lift: {top_quintile_lift(scores, test_y):.2f}x") + print(f" pre_share_users_coverage: {sum(1 for x in test_x if x[5] > 0)}/{len(test_x)}") + for name, baseline_scores in baselines.items(): + print(f" {name}_auc: {auc(baseline_scores, test_y):.3f}") + print(f" {name}_top20_lift: {top_quintile_lift(baseline_scores, test_y):.2f}x") + + coef_pairs = sorted( + zip(FEATURE_NAMES, weights[1:]), + key=lambda item: abs(item[1]), + reverse=True, + ) + print(" strongest coefficients:") + for name, value in coef_pairs[:5]: + print(f" {name}: {value:+.3f}") + + +async def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("--days", type=int, default=120) + parser.add_argument("--train-fraction", type=float, default=0.7) + args = parser.parse_args() + + conn = await get_connection() + try: + await conn.execute("SET statement_timeout = '30s'") + examples = await fetch_examples(conn, args.days) + finally: + await conn.close() + + print("Crossposting ML offline eval") + print(f"Examples: {len(examples)} image posts over {args.days} days") + for channel in ("tgchannelru", "tgchannelen"): + evaluate_channel(channel, examples, args.train_fraction) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/scripts/eval_ranking.py b/scripts/eval_ranking.py index b9720766..193b2ff9 100644 --- a/scripts/eval_ranking.py +++ b/scripts/eval_ranking.py @@ -47,7 +47,7 @@ async def eval_lr_smoothed(conn: asyncpg.Connection, hours: int, min_reactions: """ print(f"\n{'='*60}") - print(f" OFFLINE EVAL: lr_smoothed scoring") + print(" OFFLINE EVAL: lr_smoothed scoring") print(f" Test window: last {hours} hours") print(f" Min reactions per user: {min_reactions}") print(f"{'='*60}\n") @@ -171,7 +171,7 @@ async def eval_lr_smoothed(conn: asyncpg.Connection, hours: int, min_reactions: print(f" Pairwise accuracy: {pairwise_acc:.1%}") print(f" Avg per-user acc: {avg_user_acc:.1%}") print(f" Tie rate: 
{tie_rate:.1%}") - print(f" Random baseline: 50.0%") + print(" Random baseline: 50.0%") print() # Breakdown by user engagement level @@ -207,7 +207,7 @@ async def eval_engagement_score(conn: asyncpg.Connection, hours: int, min_reacti """Evaluate engagement_score as a ranking signal.""" print(f"\n{'='*60}") - print(f" OFFLINE EVAL: engagement_score") + print(" OFFLINE EVAL: engagement_score") print(f"{'='*60}\n") t0 = time.time() diff --git a/scripts/generate_session_string.py b/scripts/generate_session_string.py index 304e5874..662aea35 100644 --- a/scripts/generate_session_string.py +++ b/scripts/generate_session_string.py @@ -13,8 +13,8 @@ The output is a session string — store it as TELEGRAM_SESSION_STRING env var. """ -from telethon.sync import TelegramClient from telethon.sessions import StringSession +from telethon.sync import TelegramClient api_id = int(input("API ID: ")) api_hash = input("API Hash: ") diff --git a/scripts/serve_flows.py b/scripts/serve_flows.py index 1cd7ec77..17ae2f41 100644 --- a/scripts/serve_flows.py +++ b/scripts/serve_flows.py @@ -150,10 +150,12 @@ # ── Crossposting (Moscow timezone) ── post_meme_to_tgchannelru.to_deployment( name="Post to TG Channel RU", - # Experiment: dropped 18:00 MSK (21:00 UTC, 10.2 fwd/1k — worst slot). - # Added 11:00 MSK. Data: 10:00 MSK = 25.6 fwd/1k (best), 18:00 MSK = 15.6 fwd/1k. - # Baseline (pre-2026-04-13): 8,10,12,14,16,18 MSK (6x/day) - schedules=[CronSchedule(cron="20 8,10,11,12,14,16 * * *", timezone=MSK)], + # May 21 readout: v2 ranker is stable, but the 10/11/12 MSK cluster + # makes the channel feel hourly. Go from 6/day to 5/day: drop the 11:00 + # and 12:00 slots and add 21:00 in the evening reactivation window. + # Prior experiment: dropped weak 18:00 MSK slot. + # Baseline (pre-2026-04-13): 8,10,12,14,16,18 MSK (6x/day). 
+ schedules=[CronSchedule(cron="20 8,10,14,16,21 * * *", timezone=MSK)], ), post_meme_to_tgchannelen.to_deployment( name="Post to TG Channel EN", diff --git a/specs/crossposting-share-optimization-2026-05-18.md b/specs/crossposting-share-optimization-2026-05-18.md index c912e852..452be0c0 100644 --- a/specs/crossposting-share-optimization-2026-05-18.md +++ b/specs/crossposting-share-optimization-2026-05-18.md @@ -161,3 +161,88 @@ CREATE INDEX CONCURRENTLY IF NOT EXISTS ON user_meme_reaction (meme_id, reacted_at) INCLUDE (user_id, reaction_id, sent_at); ``` + +## May 21 recheck + +Prod snapshots were fresh through 2026-05-21 11:00 UTC. The v2 ranker did not +show a clear forward-rate regression: + +| Channel | Recent mature image posts | Recent agg fwd/1k | v2 agg fwd/1k | +| --- | ---: | ---: | ---: | +| RU | 43 | 23.57 | 24.59 | +| EN | 39 | 18.65 | 18.42 | + +Subscriber growth is the unsolved part: RU was 2165 -> 2155 over the last +30 days, while EN was 623 -> 653. Better meme selection alone is still not +enough to grow RU. + +Operational finding: normal scheduled posts were not the only channel volume. +Weekly uploaded-meme reward albums add 5 media posts at once and were logged as +`score_version=1`, which mixed non-ranker posts into old-ranker readouts. The +May 21 cleanup sets reward album logs to `score_version=0` and keeps their +caption on the first media item for analysis. + +Frequency adjustment: RU scheduled posts move from `8,10,11,12,14,16` MSK to +`8,10,14,16,21` MSK (6 -> 5/day): 11:00 and 12:00 are dropped, breaking up the +10/11/12 hourly cluster, and 21:00 is added in the evening reactivation window. +Bot activity in the last 30 days: 21:00 MSK had 30.5k reactions / 367 active +users; 22:00 MSK had 32.0k reactions / 349 active users. Use 21:00 first because +the active-user base is slightly wider and the slot is less late. + +ML status: `scripts/eval_crossposting_ml.py` now runs a read-only logistic +baseline against 24h channel labels. 
Initial 90-day run: + +| Channel | Labeled images | Logistic AUC | Source-signal AUC | Pre-share top20 lift | +| --- | ---: | ---: | ---: | ---: | +| RU | 164 | 0.491 | 0.568 | 1.96x | +| EN | 162 | 0.548 | 0.410 | 2.45x | + +Conclusion: this is not yet strong enough to ship an ML ranker. The next useful +step is richer candidate-level offline evaluation, not turning on `score_version=3`. +Keep ML work timestamp-safe: labels from 24h snapshots, features only from data +available before the simulated decision. + +May 22 correction: the first `pre_share_users_top20_lift` readout was inflated +by evaluator tie-bias. `top_quintile_lift` sorted `(score, label)` tuples, so +equal scores placed positive labels before negative labels. After making ties +label-neutral, `pre_share_users` is not shippable: the 120-day split has only +1/52 RU test posts and 0/51 EN test posts with positive pre-share coverage; +the corrected pre-share top20 lift is 0.93x for RU and 1.00x for EN. Keep prior +share clicks as a logged feature until coverage improves. + +### Segment-first ML plan + +The flat meme-level model is not the right abstraction. The next evaluator +should model `meme x user_segment` evidence first, then aggregate segment +responses into a channel-success prediction. + +User segments to test before any production ranker: + +- Engagement depth: new, casual, regular, heavy, based on recent reaction count + and active days. +- Taste/source affinity: top source clusters per user from historical likes and + skips; start with `meme_source_id` families, add OCR/description embeddings + only after the tabular baseline is sane. +- Reaction behavior: fast liker, slow reader, fast skipper, high-share clicker. +- Language/context: selected languages, observed liked meme language, local + active-hour bucket. + +Candidate segment features: + +- Segment impressions before channel post. +- Segment like rate and Wilson-smoothed like rate. +- Segment median reaction time and fast-skip rate. 
+- Segment in-bot share click users. +- Coverage: number of distinct segments with enough evidence. + +Targets should stay channel-specific: + +- Primary target: `forwards_24h / views_24h` above channel rolling median or + top quartile. This captures shareability without over-rewarding high reach. +- Secondary target: reaction rate above rolling median. +- Reach target: `views_24h` above expected views for that channel/hour/day. + Keep reach separate because post timing and subscriber base can dominate it. + +Do not train on all-time aggregates such as current `meme_stats.invited_count` +for historical examples. Every feature must be reconstructed as of the simulated +decision time. diff --git a/specs/dedup.md b/specs/dedup.md index b9a4e84c..a316d8e7 100644 --- a/specs/dedup.md +++ b/specs/dedup.md @@ -6,9 +6,10 @@ 1. **ETL single-media filter** (~80% of the 17%) — carousel posts removed. Not true dedup. 2. **Telegram forwarded_url** — same-source repost detection at ETL time. -3. **OCR text trigram similarity** — PostgreSQL `pg_trgm` operator `%` on extracted text. Min 12 chars. Works on memes with `ocr_result` populated by [Describe Memes](describe-memes.md) (OpenRouter vision). +3. **Telegram file_id exact match** — storage/final pipeline check before recommendation eligibility. +4. **OCR text trigram similarity** — PostgreSQL `pg_trgm` operator `%` on extracted text. Min 12 chars. Works on memes with `ocr_result` populated by [Describe Memes](describe-memes.md) (OpenRouter vision). -The text-based dedup (`find_meme_duplicate()` in `src/storage/service.py`) uses: +The text-based dedup (`find_duplicate_by_ocr_text()` in `src/storage/deduplication.py`) uses: ```sql AND (M.ocr_result ->> 'text') % '{imagetext}' -- trigram similarity > 0.3 ``` diff --git a/specs/parsing-etl.md b/specs/parsing-etl.md index 64fc883f..7a5f9429 100644 --- a/specs/parsing-etl.md +++ b/specs/parsing-etl.md @@ -83,7 +83,8 @@ See [dedup.md](dedup.md) for improvement plan. 
| `src/storage/parsers/vk.py` | VK API parser | | `src/storage/parsers/ig.py` | Instagram HikerAPI parser | | `src/storage/etl.py` | Raw -> processed meme transformation | -| `src/storage/service.py` | DB queries, find_meme_duplicate() | +| `src/storage/service.py` | Shared DB queries and meme status updates | +| `src/storage/deduplication.py` | File ID/OCR duplicate detection and resolution | | `src/storage/watermark.py` | Image watermarking (Pillow) | | `src/storage/ads.py` | Ad keyword detection | | `src/flows/storage/memes.py` | Pipeline orchestration (tg/vk/ig_meme_pipeline) | diff --git a/specs/testing.md b/specs/testing.md index ff13ebb9..4c71edb3 100644 --- a/specs/testing.md +++ b/specs/testing.md @@ -68,7 +68,7 @@ ### Phase 3: ETL and parsing **3a. Dedup tests**: -- find_meme_duplicate() with matching OCR text +- find_duplicate_by_ocr_text() with matching OCR text - Perceptual hash dedup (after implementation) **3b. ETL integration tests**: diff --git a/src/crossposting/service.py b/src/crossposting/service.py index 53201a4c..fb658b94 100644 --- a/src/crossposting/service.py +++ b/src/crossposting/service.py @@ -41,8 +41,8 @@ async def log_meme_sent( # Per-channel ranker constants (mirror the SQL ORDER BY). 
_CHANNEL_PARAMS: dict[str, dict[str, Any]] = { - "tgchannelru": {"impr_penalty": 0.8, "age_threshold": 7}, - "tgchannelen": {"impr_penalty": 0.5, "age_threshold": 90}, + "tgchannelru": {"impr_penalty": 0.8, "age_threshold": 7, "language_code": "ru"}, + "tgchannelen": {"impr_penalty": 0.5, "age_threshold": 90, "language_code": "en"}, } @@ -185,7 +185,7 @@ def _picked_meme_dict(candidate: dict[str, Any]) -> dict[str, Any]: return {k: candidate[k] for k in _PICKED_FIELDS} -_RU_QUERY = """ +_STANDARD_RANKER_QUERY = """ WITH selected_at AS ( SELECT NOW() AS decided_at ), @@ -196,124 +196,7 @@ def _picked_meme_dict(candidate: dict[str, Any]) -> dict[str, Any]: COUNT(*) AS n_posts FROM crossposting cp JOIN meme m ON m.id = cp.meme_id - WHERE cp.channel = 'tgchannelru' - AND cp.created_at > NOW() - INTERVAL '30 days' - AND cp.created_at < NOW() - INTERVAL '48 hours' - AND cp.views IS NOT NULL - AND cp.views > 0 - AND m.type = 'image' - GROUP BY m.meme_source_id - HAVING COUNT(*) >= 5 - ), - src_median AS ( - SELECT PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY signal) AS m_signal - FROM src_quality - ), - recent_src AS ( - SELECT DISTINCT m2.meme_source_id - FROM crossposting cp2 - JOIN meme m2 ON m2.id = cp2.meme_id - WHERE cp2.channel = 'tgchannelru' - AND cp2.created_at > NOW() - INTERVAL '24 hours' - AND cp2.telegram_message_id IS NOT NULL - ), - ranked AS ( - SELECT - M.id, M.type, M.telegram_file_id, M.caption, - M.meme_source_id, - MS.nlikes, MS.ndislikes, MS.raw_impr_rank, - MS.age_days, MS.nmemes_sent, MS.invited_count, - SQ.signal AS src_signal, - (SELECT m_signal FROM src_median) AS median_signal, - COUNT(*) OVER () AS candidate_pool_size, - ROW_NUMBER() OVER ( - ORDER BY -1 - * COALESCE((MS.nlikes + 1.) 
/ (MS.nlikes + MS.ndislikes + 1), 0.5) - * CASE WHEN MS.raw_impr_rank <= 1 THEN 1 ELSE 0.8 END - * CASE WHEN MS.age_days < 7 THEN 1 ELSE 0.8 END - * CASE WHEN M.caption IS NULL THEN 1 ELSE 0.8 END - * CASE - WHEN MS.nmemes_sent <= 1 THEN 1 - ELSE (MS.nlikes + MS.ndislikes) * 1. / MS.nmemes_sent - END - * COALESCE( - LEAST(2.0, GREATEST(0.5, - SQ.signal / NULLIF((SELECT m_signal FROM src_median), 0) - )), - 1.0 - ) - * (1.0 + LEAST(MS.invited_count, 10) * 0.1), - M.id - ) AS candidate_rank - FROM meme M - INNER JOIN meme_stats MS ON MS.meme_id = M.id - LEFT JOIN crossposting CP ON CP.meme_id = M.id AND CP.channel = 'tgchannelru' - LEFT JOIN src_quality SQ ON SQ.meme_source_id = M.meme_source_id - WHERE 1=1 - AND CP.meme_id IS NULL - AND M.status = 'ok' - AND M.language_code = 'ru' - AND M.type = 'image' - AND MS.nlikes >= 5 - AND M.meme_source_id NOT IN (SELECT meme_source_id FROM recent_src) - ORDER BY -1 - * COALESCE((MS.nlikes + 1.) / (MS.nlikes + MS.ndislikes + 1), 0.5) - * CASE WHEN MS.raw_impr_rank <= 1 THEN 1 ELSE 0.8 END - * CASE WHEN MS.age_days < 7 THEN 1 ELSE 0.8 END - * CASE WHEN M.caption IS NULL THEN 1 ELSE 0.8 END - * CASE - WHEN MS.nmemes_sent <= 1 THEN 1 - ELSE (MS.nlikes + MS.ndislikes) * 1. 
/ MS.nmemes_sent - END - * COALESCE( - LEAST(2.0, GREATEST(0.5, - SQ.signal / NULLIF((SELECT m_signal FROM src_median), 0) - )), - 1.0 - ) - * (1.0 + LEAST(MS.invited_count, 10) * 0.1), - M.id - LIMIT :limit - ) - SELECT - ranked.*, - COALESCE(share_clicks.pre_inbot_share_clicks, 0) AS pre_inbot_share_clicks, - COALESCE(share_clicks.pre_inbot_share_click_users, 0) AS pre_inbot_share_click_users - FROM ranked - CROSS JOIN selected_at - LEFT JOIN LATERAL ( - SELECT - COUNT(*) AS pre_inbot_share_clicks, - COUNT(DISTINCT user_id) AS pre_inbot_share_click_users - FROM user_deep_link_log udll - CROSS JOIN LATERAL ( - SELECT substring( - udll.deep_link FROM ('^s_([1-9][0-9]{0,18})_' || ranked.id || '$') - ) AS sharer_id - ) share_link - WHERE udll.created_at < selected_at.decided_at - AND CASE - WHEN share_link.sharer_id IS NULL THEN false - WHEN length(share_link.sharer_id) = 19 - AND share_link.sharer_id > '9223372036854775807' THEN false - ELSE udll.user_id <> share_link.sharer_id::bigint - END - ) share_clicks ON true - ORDER BY ranked.candidate_rank -""" - -_EN_QUERY = """ - WITH selected_at AS ( - SELECT NOW() AS decided_at - ), - src_quality AS ( - SELECT - m.meme_source_id, - AVG(cp.forwards * SQRT(GREATEST(cp.views, 1) / 100.0)) AS signal, - COUNT(*) AS n_posts - FROM crossposting cp - JOIN meme m ON m.id = cp.meme_id - WHERE cp.channel = 'tgchannelen' + WHERE cp.channel = :channel AND cp.created_at > NOW() - INTERVAL '30 days' AND cp.created_at < NOW() - INTERVAL '48 hours' AND cp.views IS NOT NULL @@ -330,7 +213,7 @@ def _picked_meme_dict(candidate: dict[str, Any]) -> dict[str, Any]: SELECT DISTINCT m2.meme_source_id FROM crossposting cp2 JOIN meme m2 ON m2.id = cp2.meme_id - WHERE cp2.channel = 'tgchannelen' + WHERE cp2.channel = :channel AND cp2.created_at > NOW() - INTERVAL '24 hours' AND cp2.telegram_message_id IS NOT NULL ), @@ -346,8 +229,8 @@ def _picked_meme_dict(candidate: dict[str, Any]) -> dict[str, Any]: ROW_NUMBER() OVER ( ORDER BY -1 * 
COALESCE((MS.nlikes + 1.) / (MS.nlikes + MS.ndislikes + 1), 0.5) - * CASE WHEN MS.raw_impr_rank <= 1 THEN 1 ELSE 0.5 END - * CASE WHEN MS.age_days < 90 THEN 1 ELSE 0.8 END + * CASE WHEN MS.raw_impr_rank <= 1 THEN 1 ELSE :impr_penalty END + * CASE WHEN MS.age_days < :age_threshold THEN 1 ELSE 0.8 END * CASE WHEN M.caption IS NULL THEN 1 ELSE 0.8 END * CASE WHEN MS.nmemes_sent <= 1 THEN 1 @@ -364,19 +247,19 @@ def _picked_meme_dict(candidate: dict[str, Any]) -> dict[str, Any]: ) AS candidate_rank FROM meme M INNER JOIN meme_stats MS ON MS.meme_id = M.id - LEFT JOIN crossposting CP ON CP.meme_id = M.id AND CP.channel = 'tgchannelen' + LEFT JOIN crossposting CP ON CP.meme_id = M.id AND CP.channel = :channel LEFT JOIN src_quality SQ ON SQ.meme_source_id = M.meme_source_id WHERE 1=1 AND CP.meme_id IS NULL AND M.status = 'ok' - AND M.language_code = 'en' + AND M.language_code = :language_code AND M.type = 'image' AND MS.nlikes >= 5 AND M.meme_source_id NOT IN (SELECT meme_source_id FROM recent_src) ORDER BY -1 * COALESCE((MS.nlikes + 1.) 
/ (MS.nlikes + MS.ndislikes + 1), 0.5) - * CASE WHEN MS.raw_impr_rank <= 1 THEN 1 ELSE 0.5 END - * CASE WHEN MS.age_days < 90 THEN 1 ELSE 0.8 END + * CASE WHEN MS.raw_impr_rank <= 1 THEN 1 ELSE :impr_penalty END + * CASE WHEN MS.age_days < :age_threshold THEN 1 ELSE 0.8 END * CASE WHEN M.caption IS NULL THEN 1 ELSE 0.8 END * CASE WHEN MS.nmemes_sent <= 1 THEN 1 @@ -477,6 +360,7 @@ def _picked_meme_dict(candidate: dict[str, Any]) -> dict[str, Any]: (MS.nlikes + MS.ndislikes) * 1.0 / MS.nmemes_sent )) END DESC, + MS.invited_count DESC, M.id ) AS base_source_rank FROM meme M @@ -490,7 +374,6 @@ def _picked_meme_dict(candidate: dict[str, Any]) -> dict[str, Any]: AND M.type = 'image' AND M.telegram_file_id IS NOT NULL AND MS.nlikes >= 5 - AND SQ.signal IS NOT NULL AND ( :respect_recent_source_cap = false OR M.meme_source_id NOT IN (SELECT meme_source_id FROM recent_src) @@ -513,6 +396,7 @@ def _picked_meme_dict(candidate: dict[str, Any]) -> dict[str, Any]: (nlikes + ndislikes) * 1.0 / nmemes_sent )) END DESC, + invited_count DESC, id ) AS base_rank FROM base @@ -546,7 +430,7 @@ def _picked_meme_dict(candidate: dict[str, Any]) -> dict[str, Any]: with_shares.*, ROW_NUMBER() OVER ( PARTITION BY meme_source_id - ORDER BY share_max_score DESC, id + ORDER BY share_max_score DESC, invited_count DESC, id ) AS source_rank FROM ( SELECT @@ -593,7 +477,7 @@ def _picked_meme_dict(candidate: dict[str, Any]) -> dict[str, Any]: SELECT * FROM scored WHERE source_rank = 1 - ORDER BY share_max_score DESC, id + ORDER BY share_max_score DESC, invited_count DESC, id LIMIT :limit """ @@ -616,6 +500,23 @@ def _picked_meme_dict(candidate: dict[str, Any]) -> dict[str, Any]: } +async def _get_next_meme_for_channel( + channel: str, + *, + log_top_n: int, + score_version: int, +) -> tuple[dict[str, Any] | None, dict[str, Any] | None]: + params = { + **_CHANNEL_PARAMS[channel], + "channel": channel, + "limit": log_top_n, + } + rows = await fetch_all(text(_STANDARD_RANKER_QUERY), params) + if not 
rows: + return None, None + return _picked_meme_dict(rows[0]), _build_decision_log(channel, score_version, rows) + + async def get_next_meme_for_tgchannelru( log_top_n: int = 5, ) -> tuple[dict[str, Any] | None, dict[str, Any] | None]: @@ -629,20 +530,22 @@ async def get_next_meme_for_tgchannelru( - ``decision_log`` — kwargs dict for ``log_ranker_decision``, with the top-N candidates and per-candidate score breakdown. ``None`` when no candidates. """ - rows = await fetch_all(text(_RU_QUERY), {"limit": log_top_n}) - if not rows: - return None, None - return _picked_meme_dict(rows[0]), _build_decision_log("tgchannelru", 2, rows) + return await _get_next_meme_for_channel( + "tgchannelru", + log_top_n=log_top_n, + score_version=2, + ) async def get_next_meme_for_tgchannelen( log_top_n: int = 5, ) -> tuple[dict[str, Any] | None, dict[str, Any] | None]: """Same as :func:`get_next_meme_for_tgchannelru` but for @fast_food_memes (EN).""" - rows = await fetch_all(text(_EN_QUERY), {"limit": log_top_n}) - if not rows: - return None, None - return _picked_meme_dict(rows[0]), _build_decision_log("tgchannelen", 2, rows) + return await _get_next_meme_for_channel( + "tgchannelen", + log_top_n=log_top_n, + score_version=2, + ) async def get_next_share_max_meme_for_tgchannelru( diff --git a/src/database.py b/src/database.py index 365a505d..c59fc79c 100644 --- a/src/database.py +++ b/src/database.py @@ -1,6 +1,6 @@ import asyncio import uuid -from typing import Any +from typing import Any, Awaitable, Callable, TypeVar from sqlalchemy import ( BigInteger, @@ -27,7 +27,7 @@ text, ) from sqlalchemy.dialects.postgresql import JSONB -from sqlalchemy.ext.asyncio import create_async_engine +from sqlalchemy.ext.asyncio import AsyncConnection, create_async_engine from sqlalchemy.pool import NullPool from src.config import settings @@ -41,6 +41,8 @@ DATABASE_URL = str(settings.DATABASE_URL) +T = TypeVar("T") + _engine_kwargs: dict = dict( connect_args={ "prepared_statement_name_func": lambda: 
f"__asyncpg_{uuid.uuid4()}__", @@ -882,3 +884,25 @@ async def execute( raise # Unreachable — loop always returns or raises raise RuntimeError("execute retry loop exhausted without returning") # pragma: no cover + + +async def run_in_transaction(fn: Callable[[AsyncConnection], Awaitable[T]]) -> T: + """Run several DB statements in one transaction with the standard retry policy.""" + _DEADLOCK_MAX_RETRIES = 2 + _max_attempts = max(_TRANSIENT_MAX_RETRIES, _DEADLOCK_MAX_RETRIES) + 1 + _transient_attempts = 0 + + for attempt in range(_max_attempts): + try: + async with engine.begin() as conn: + return await fn(conn) + except Exception as exc: + if _is_transient_connection_error(exc) and _transient_attempts < _TRANSIENT_MAX_RETRIES: + await asyncio.sleep(0.025 * (2**_transient_attempts)) + _transient_attempts += 1 + continue + if _is_deadlock_error(exc) and attempt < _DEADLOCK_MAX_RETRIES: + await asyncio.sleep(0.1 * (2**attempt)) + continue + raise + raise RuntimeError("transaction retry loop exhausted without returning") # pragma: no cover diff --git a/src/flows/rewards/uploaded_memes.py b/src/flows/rewards/uploaded_memes.py index 81c8edbc..71c46557 100644 --- a/src/flows/rewards/uploaded_memes.py +++ b/src/flows/rewards/uploaded_memes.py @@ -26,6 +26,8 @@ from src.tgbot.handlers.treasury.payments import pay_if_not_paid_with_alert from src.tgbot.logs import log +REWARD_ALBUM_SCORE_VERSION = 0 + """ 1. Get all uploaded memes this week. 2. 
Calculate some stats: @@ -58,72 +60,39 @@ def _meme_dict_to_input_media(m: dict): raise Exception(f"Can't get meme type from: {m}") -@flow( - name="Reward RU users for weekly top uploaded memes", - retries=1, - retry_delay_seconds=60, - timeout_seconds=300, - on_failure=[notify_telegram_on_failure], +REWARD_TRX_TYPES = ( + TrxType.UPLOADER_TOP_WEEKLY_1, + TrxType.UPLOADER_TOP_WEEKLY_2, + TrxType.UPLOADER_TOP_WEEKLY_3, + TrxType.UPLOADER_TOP_WEEKLY_4, + TrxType.UPLOADER_TOP_WEEKLY_5, ) -async def reward_ru_users_for_weekly_top_uploaded_memes(): - logger = get_run_logger() - logger.info("Going to reward users for weekly top uploaded memes") - - uploaded_memes = await get_all_uploaded_memes_weekly_ru() - logger.info(f"Received {len(uploaded_memes)} uploaded memes") - - if len(uploaded_memes) < 5: - await log(f"Not enough memes to reward users: only {len(uploaded_memes)}") - return - - nuploaded = len(uploaded_memes) - nusers = len(set(m["author_id"] for m in uploaded_memes)) - views = sum(m["nmemes_sent"] for m in uploaded_memes) - likes = sum(m["nlikes"] for m in uploaded_memes) - dislikes = sum(m["ndislikes"] for m in uploaded_memes) - avg_like = likes / (likes + dislikes) if likes + dislikes > 0 else 0 - logger.info(f"Uploaded: {nuploaded} by {nusers}, views: {views}, like%: {avg_like}") - today = datetime.today().date().strftime("%Y-%m-%d") - ########################### - # reward top authors +def _like_rate(meme: dict) -> float: + reactions = meme["nlikes"] + meme["ndislikes"] + return meme["nlikes"] / reactions if reactions > 0 else 0 - top_memes = sorted( - uploaded_memes, - key=lambda m: ( - m["nlikes"] / (m["nlikes"] + m["ndislikes"]) if m["nlikes"] + m["ndislikes"] > 0 else 0 - ), - reverse=True, - )[:5] - for i, top_meme in enumerate(top_memes): - if i == 0: - type = TrxType.UPLOADER_TOP_WEEKLY_1 - elif i == 1: - type = TrxType.UPLOADER_TOP_WEEKLY_2 - elif i == 2: - type = TrxType.UPLOADER_TOP_WEEKLY_3 - elif i == 3: - type = TrxType.UPLOADER_TOP_WEEKLY_4 - 
elif i == 4: - type = TrxType.UPLOADER_TOP_WEEKLY_5 - else: - continue +def _like_percent(likes: int, dislikes: int) -> int: + total = likes + dislikes + return round(likes * 100.0 / total) if total else 0 - await pay_if_not_paid_with_alert( - bot, - top_meme["author_id"], - type, - external_id=today, - ) - if top_meme["status"] != MemeStatus.PUBLISHED: - await update_meme(top_meme["meme_id"], status=MemeStatus.PUBLISHED) +def _top_uploaded_memes(uploaded_memes: list[dict], limit: int = 5) -> list[dict]: + return sorted(uploaded_memes, key=_like_rate, reverse=True)[:limit] - # send message to tgchannelru - channel_text = f""" +def _ru_channel_text( + top_memes: list[dict], + *, + uploaded_count: int, + user_count: int, + views: int, + likes: int, + dislikes: int, +) -> str: + return f""" 🏆 ТОП-5 загруженных мемов недели 🥇 - {top_memes[0]["nickname"] or "???"} @@ -132,187 +101,206 @@ async def reward_ru_users_for_weekly_top_uploaded_memes(): 🏅 - {top_memes[3]["nickname"] or "???"} 🏅 - {top_memes[4]["nickname"] or "???"} -📥 Загружено мемов: {nuploaded} -👤 Пользователями: {nusers} +📥 Загружено мемов: {uploaded_count} +👤 Пользователями: {user_count} 👁️ Просмотры: {views} -👍 Доля лайков: {round(likes * 100.0 / (likes + dislikes))}% +👍 Доля лайков: {_like_percent(likes, dislikes)}% Перешли топ мем в бота → выиграй до 500 🍔 """ # noqa - ms = await bot.send_media_group( - TELEGRAM_CHANNEL_RU_CHAT_ID, - [_meme_dict_to_input_media(m) for m in top_memes], - caption=channel_text, - parse_mode="HTML", - ) - # log_meme_sent failures must NOT propagate — Prefect would retry the flow - # and re-publish the album publicly. Missing one diversity-cap row is the - # smaller harm; the safe block below mirrors the author-notify pattern. 
- for i, top_meme in enumerate(top_memes): - try: - await log_meme_sent( - top_meme["meme_id"], - channel=Channel.TG_CHANNEL_RU, - telegram_message_id=ms[i].id, - ) - except Exception as e: - logger.error(f"Failed to log meme_sent for {top_meme['meme_id']}: {e}") +def _en_channel_text( + top_memes: list[dict], + *, + uploaded_count: int, + user_count: int, + views: int, + likes: int, + dislikes: int, +) -> str: + return f""" +🏆 Best uploaded memes of a week - message_link = f"{TELEGRAM_CHANNEL_RU_LINK}/{ms[0].id}" +🥇 - {top_memes[0]["nickname"] or "???"} +🥈 - {top_memes[1]["nickname"] or "???"} +🥉 - {top_memes[2]["nickname"] or "???"} +🏅 - {top_memes[3]["nickname"] or "???"} +🏅 - {top_memes[4]["nickname"] or "???"} - # send message to authors +📥 uploaded memes: {uploaded_count} +👤 by users: {user_count} +👁️ views: {views} +👍 like %: {_like_percent(likes, dislikes)}% - author_ids = set(m["author_id"] for m in top_memes) - logger.info(f"Going to notify {len(author_ids)} authors about rewards") - for author_id in author_ids: - user_uploaded_memes = [m for m in uploaded_memes if m["author_id"] == author_id] - likes = sum(m["nlikes"] for m in user_uploaded_memes) - dislikes = sum(m["ndislikes"] for m in user_uploaded_memes) - like_prc = round(likes * 100.0 / (likes + dislikes)) if likes + dislikes else 0 - views = sum(m["nmemes_sent"] for m in uploaded_memes) +Forward top meme to our bot → win up to 500 🍔 + """ # noqa - user_text = f""" + +def _ru_user_text( + user_uploaded_memes: list[dict], + *, + views: int, + like_percent: int, + message_link: str, +) -> str: + return f""" Стата по загруженным тобой мемам: 📥 Загружено мемов: {len(user_uploaded_memes)} 👁️ Просмотры: {views} -👍 Доля лайков: {like_prc}% +👍 Доля лайков: {like_percent}% Смотри топ-5 мемов недели в нашем канале: {message_link} """ - try: - await bot.send_message(author_id, user_text) - except Exception as e: - logger.error(f"Failed to send message to {author_id}: {e}") - await asyncio.sleep(2) +def 
_en_user_text( + user_uploaded_memes: list[dict], + *, + views: int, + like_percent: int, + message_link: str, +) -> str: + return f""" +Your stats for uploaded memes: +📥 Uploaded memes: {len(user_uploaded_memes)} +👁️ Views: {views} +👍 Like %: {like_percent}% -@flow( - name="Reward EN users for weekly top uploaded memes", - retries=1, - retry_delay_seconds=60, - timeout_seconds=300, - on_failure=[notify_telegram_on_failure], -) -async def reward_en_users_for_weekly_top_uploaded_memes(): - logger = get_run_logger() - logger.info("Going to reward users for weekly top uploaded memes") +Check out top-5 uploaded memes of the week in our channel: {message_link} + """ - uploaded_memes = await get_all_uploaded_memes_weekly_en() - logger.info(f"Received {len(uploaded_memes)} uploaded memes") + +async def _reward_users_for_weekly_top_uploaded_memes( + *, + uploaded_memes: list[dict], + channel: Channel, + channel_chat_id: int, + channel_link: str, + channel_text_builder, + user_text_builder, +) -> None: + logger = get_run_logger() + logger.info("Received %d uploaded memes", len(uploaded_memes)) if len(uploaded_memes) < 5: await log(f"Not enough memes to reward users: only {len(uploaded_memes)}") return - nuploaded = len(uploaded_memes) - nusers = len(set(m["author_id"] for m in uploaded_memes)) + uploaded_count = len(uploaded_memes) + user_count = len({m["author_id"] for m in uploaded_memes}) views = sum(m["nmemes_sent"] for m in uploaded_memes) likes = sum(m["nlikes"] for m in uploaded_memes) dislikes = sum(m["ndislikes"] for m in uploaded_memes) - avg_like = likes / (likes + dislikes) if likes + dislikes > 0 else 0 + logger.info( + "Uploaded: %d by %d, views: %d, like%%: %.3f", + uploaded_count, + user_count, + views, + _like_percent(likes, dislikes) / 100, + ) - logger.info(f"Uploaded: {nuploaded} by {nusers}, views: {views}, like%: {avg_like}") today = datetime.today().date().strftime("%Y-%m-%d") - - ########################### - # reward top authors - - top_memes = 
sorted( - uploaded_memes, - key=lambda m: ( - m["nlikes"] / (m["nlikes"] + m["ndislikes"]) if m["nlikes"] + m["ndislikes"] > 0 else 0 - ), - reverse=True, - )[:5] + top_memes = _top_uploaded_memes(uploaded_memes) for i, top_meme in enumerate(top_memes): - if i == 0: - type = TrxType.UPLOADER_TOP_WEEKLY_1 - elif i == 1: - type = TrxType.UPLOADER_TOP_WEEKLY_2 - elif i == 2: - type = TrxType.UPLOADER_TOP_WEEKLY_3 - elif i == 3: - type = TrxType.UPLOADER_TOP_WEEKLY_4 - elif i == 4: - type = TrxType.UPLOADER_TOP_WEEKLY_5 - else: - continue - await pay_if_not_paid_with_alert( bot, top_meme["author_id"], - type, + REWARD_TRX_TYPES[i], external_id=today, ) if top_meme["status"] != MemeStatus.PUBLISHED: await update_meme(top_meme["meme_id"], status=MemeStatus.PUBLISHED) - # send message to tgchannelen - - channel_text = f""" -🏆 Best uploaded memes of a week - -🥇 - {top_memes[0]["nickname"] or "???"} -🥈 - {top_memes[1]["nickname"] or "???"} -🥉 - {top_memes[2]["nickname"] or "???"} -🏅 - {top_memes[3]["nickname"] or "???"} -🏅 - {top_memes[4]["nickname"] or "???"} - -📥 uploaded memes: {nuploaded} -👤 by users: {nusers} -👁️ views: {views} -👍 like %: {round(likes * 100.0 / (likes + dislikes))}% - -Forward top meme to our bot → win up to 500 🍔 - """ # noqa - - ms = await bot.send_media_group( - TELEGRAM_CHANNEL_EN_CHAT_ID, + channel_text = channel_text_builder( + top_memes, + uploaded_count=uploaded_count, + user_count=user_count, + views=views, + likes=likes, + dislikes=dislikes, + ) + messages = await bot.send_media_group( + channel_chat_id, [_meme_dict_to_input_media(m) for m in top_memes], caption=channel_text, parse_mode="HTML", ) - # log_meme_sent failures must NOT propagate (see RU flow above for context). + # log_meme_sent failures must NOT propagate — Prefect would retry the flow + # and re-publish the album publicly. Missing one diversity-cap row is the + # smaller harm; the safe block below mirrors the author-notify pattern. 
for i, top_meme in enumerate(top_memes): try: await log_meme_sent( top_meme["meme_id"], - channel=Channel.TG_CHANNEL_EN, - telegram_message_id=ms[i].id, + channel=channel, + telegram_message_id=messages[i].id, + caption_text=channel_text if i == 0 else None, + score_version=REWARD_ALBUM_SCORE_VERSION, ) except Exception as e: logger.error(f"Failed to log meme_sent for {top_meme['meme_id']}: {e}") - message_link = f"{TELEGRAM_CHANNEL_EN_LINK}/{ms[0].id}" - - # send message to authors - - author_ids = set(m["author_id"] for m in top_memes) - logger.info(f"Going to notify {len(author_ids)} authors about rewards") + message_link = f"{channel_link}/{messages[0].id}" + author_ids = {m["author_id"] for m in top_memes} + logger.info("Going to notify %d authors about rewards", len(author_ids)) for author_id in author_ids: user_uploaded_memes = [m for m in uploaded_memes if m["author_id"] == author_id] - likes = sum(m["nlikes"] for m in user_uploaded_memes) - dislikes = sum(m["ndislikes"] for m in user_uploaded_memes) - like_prc = round(likes * 100.0 / (likes + dislikes)) if likes + dislikes else 0 - views = sum(m["nmemes_sent"] for m in uploaded_memes) - - user_text = f""" -Your stats for uploaded memes: -📥 Uploaded memes: {len(user_uploaded_memes)} -👁️ Views: {views} -👍 Like %: {like_prc}% - -Check out top-5 uploaded memes of the week in our channel: {message_link} - """ + user_likes = sum(m["nlikes"] for m in user_uploaded_memes) + user_dislikes = sum(m["ndislikes"] for m in user_uploaded_memes) + user_views = sum(m["nmemes_sent"] for m in user_uploaded_memes) + user_text = user_text_builder( + user_uploaded_memes, + views=user_views, + like_percent=_like_percent(user_likes, user_dislikes), + message_link=message_link, + ) try: await bot.send_message(author_id, user_text) except Exception as e: logger.error(f"Failed to send message to {author_id}: {e}") await asyncio.sleep(2) + + +@flow( + name="Reward RU users for weekly top uploaded memes", + retries=1, + 
retry_delay_seconds=60, + timeout_seconds=300, + on_failure=[notify_telegram_on_failure], +) +async def reward_ru_users_for_weekly_top_uploaded_memes(): + logger = get_run_logger() + logger.info("Going to reward users for weekly top uploaded memes") + await _reward_users_for_weekly_top_uploaded_memes( + uploaded_memes=await get_all_uploaded_memes_weekly_ru(), + channel=Channel.TG_CHANNEL_RU, + channel_chat_id=TELEGRAM_CHANNEL_RU_CHAT_ID, + channel_link=TELEGRAM_CHANNEL_RU_LINK, + channel_text_builder=_ru_channel_text, + user_text_builder=_ru_user_text, + ) + + +@flow( + name="Reward EN users for weekly top uploaded memes", + retries=1, + retry_delay_seconds=60, + timeout_seconds=300, + on_failure=[notify_telegram_on_failure], +) +async def reward_en_users_for_weekly_top_uploaded_memes(): + logger = get_run_logger() + logger.info("Going to reward users for weekly top uploaded memes") + await _reward_users_for_weekly_top_uploaded_memes( + uploaded_memes=await get_all_uploaded_memes_weekly_en(), + channel=Channel.TG_CHANNEL_EN, + channel_chat_id=TELEGRAM_CHANNEL_EN_CHAT_ID, + channel_link=TELEGRAM_CHANNEL_EN_LINK, + channel_text_builder=_en_channel_text, + user_text_builder=_en_user_text, + ) diff --git a/src/flows/storage/describe_memes.py b/src/flows/storage/describe_memes.py index 02c91e58..16674c49 100644 --- a/src/flows/storage/describe_memes.py +++ b/src/flows/storage/describe_memes.py @@ -26,532 +26,30 @@ import asyncio import base64 -import json -import re import time -from datetime import datetime, timezone -import httpx from prefect import flow, get_run_logger from src.config import settings -from src.database import execute, fetch_all, fetch_one, meme from src.flows.events import safe_emit from src.flows.hooks import notify_telegram_on_failure -from src.redis import redis_client -from src.storage.upload import download_meme_content_from_tg - -OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1" -OPENROUTER_FREE_DAILY_REQUEST_LIMIT = 1000 
-OPENROUTER_FREE_DAILY_REQUEST_BUDGET = 900 -OPENROUTER_FREE_REQUEST_COUNTER_TTL_SECONDS = 60 * 60 * 48 -OPENROUTER_FREE_STATS_TTL_SECONDS = 60 * 60 * 24 * 14 -OPENROUTER_DEFAULT_RATE_LIMIT_COOLDOWN_SECONDS = 60 * 15 -OPENROUTER_MAX_RATE_LIMIT_COOLDOWN_SECONDS = 60 * 60 -OPENROUTER_TRANSIENT_MODEL_COOLDOWN_SECONDS = 60 * 15 -OPENROUTER_FORBIDDEN_MODEL_COOLDOWN_SECONDS = 60 * 60 * 6 - -# FREE models only. Never add paid models here — spending balance below $0 -# blocks ALL models (even free ones) with HTTP 402. Free tier requires $10+ -# lifetime purchases for 1,000 req/day (vs 50/day without). -# See specs/describe-memes.md for full OpenRouter constraints. -# -# Verified available on OpenRouter API as of 2026-05-17. -# Ordered by preference. Falls back to next model on 429/403/timeout/bad response. -# Transient failures set Redis cooldowns so later memes/runs try other free models. -VISION_MODELS = [ - "google/gemma-4-31b-it:free", # 262k context, primary - "google/gemma-4-26b-a4b-it:free", # 262k context, MoE variant - # Gemma 3 free vision fallbacks are no longer listed by OpenRouter. - # nvidia/nemotron-nano-12b-v2-vl:free removed — returns 504s and invalid - # JSON/empty content (see specs/describe-memes.md). -] - -DESCRIBE_PROMPT = ( - "You are analyzing a meme image. Extract the following:\n\n" - "1. OCR_TEXT: ALL text visible in the image, exactly as written. " - "Preserve original language and line breaks. " - "If no text, return empty string.\n\n" - "2. DESCRIPTION: Describe the meme in 1-3 sentences in English. " - "What's happening visually? What's the joke? " - "Be specific (panels, characters, reactions, meme format).\n\n" - "3. LANGUAGE: Primary language of the meme text as ISO 639-1 code " - '(e.g. "ru", "en"). If no text, return "en". 
' - "If mixed, return dominant language.\n\n" - "Respond with ONLY valid JSON, no markdown fences:\n" - '{"ocr_text": "...", "description": "...", "language": "..."}' +from src.flows.storage.describe_memes_repository import ( + get_memes_to_describe, + increment_describe_failures, + save_meme_description, ) +from src.flows.storage.openrouter_vision import ( + ALL_FAILED, + DAILY_BUDGET_EXHAUSTED, + QUOTA_EXHAUSTED, + RATE_LIMITED, + VISION_MODELS, + call_openrouter_vision, +) +from src.storage.deduplication import deduplicate_described_meme +from src.storage.upload import download_meme_content_from_tg -# Sentinel return values from call_openrouter_vision -RATE_LIMITED = "__rate_limited" -ALL_FAILED = "__all_failed" -QUOTA_EXHAUSTED = "__quota_exhausted" -DAILY_BUDGET_EXHAUSTED = "__daily_budget_exhausted" - - -class UnsafeOpenRouterModelError(ValueError): - """Raised when a non-free OpenRouter model is configured.""" - - -def _validate_free_vision_models(model_ids: list[str]) -> None: - paid_model_ids = [model_id for model_id in model_ids if not model_id.endswith(":free")] - if paid_model_ids: - raise UnsafeOpenRouterModelError( - "OpenRouter paid models are forbidden in VISION_MODELS: " + ", ".join(paid_model_ids) - ) - - -def _validate_openrouter_free_budget() -> None: - if OPENROUTER_FREE_DAILY_REQUEST_BUDGET >= OPENROUTER_FREE_DAILY_REQUEST_LIMIT: - raise ValueError( - "OpenRouter local safety budget must stay below the documented " - f"{OPENROUTER_FREE_DAILY_REQUEST_LIMIT}/day free-model cap" - ) - - -def _openrouter_free_request_counter_key(now: datetime | None = None) -> str: - now = now or datetime.now(timezone.utc) - return f"openrouter:free_requests:{now.date().isoformat()}" - - -def _openrouter_stats_key(now: datetime | None = None) -> str: - now = now or datetime.now(timezone.utc) - return f"openrouter:free_ocr_stats:{now.strftime('%Y-%m-%d:%H')}" - - -def _openrouter_model_cooldown_key(model_id: str) -> str: - return 
f"openrouter:free_model_cooldown:{model_id}" - - -def _normalize_retry_after(raw_retry_after: float | None) -> float | None: - if raw_retry_after is None: - return None - if raw_retry_after > 60 * 60 * 24: - return max(0.0, raw_retry_after - time.time()) - return max(0.0, raw_retry_after) - - -def _rate_limit_cooldown_seconds(raw_retry_after: float | None) -> int: - retry_after = _normalize_retry_after(raw_retry_after) - if retry_after is None: - return OPENROUTER_DEFAULT_RATE_LIMIT_COOLDOWN_SECONDS - return int( - min( - max(retry_after, 60.0), - OPENROUTER_MAX_RATE_LIMIT_COOLDOWN_SECONDS, - ) - ) - - -_RESERVE_OPENROUTER_FREE_REQUEST_LUA = """ -local current = tonumber(redis.call("GET", KEYS[1]) or "0") -local budget = tonumber(ARGV[1]) -if current >= budget then - return {0, current} -end - -current = redis.call("INCR", KEYS[1]) -if current == 1 then - redis.call("EXPIRE", KEYS[1], tonumber(ARGV[2])) -end - -return {1, current} -""" - - -async def _reserve_openrouter_free_request(log) -> tuple[bool, int]: - """Reserve one daily free-model request attempt. - - OpenRouter counts failed attempts toward the daily free quota, so we reserve - before every model attempt, including fallbacks. If Redis is unavailable, fail - closed and do not call OpenRouter. 
- """ - key = _openrouter_free_request_counter_key() - try: - reserved, used_today = await redis_client.eval( - _RESERVE_OPENROUTER_FREE_REQUEST_LUA, - 1, - key, - OPENROUTER_FREE_DAILY_REQUEST_BUDGET, - OPENROUTER_FREE_REQUEST_COUNTER_TTL_SECONDS, - ) - return bool(int(reserved)), int(used_today) - except Exception as e: - log.error("OpenRouter quota guard failed via Redis; refusing request: %s", e) - return False, -1 - - -async def _record_openrouter_metric(model_id: str, outcome: str) -> None: - key = _openrouter_stats_key() - field = f"{model_id}:{outcome}" - try: - async with redis_client.pipeline(transaction=True) as pipe: - await pipe.hincrby(key, field, 1) - await pipe.expire(key, OPENROUTER_FREE_STATS_TTL_SECONDS) - await pipe.execute() - except Exception: - pass - - -async def _get_openrouter_model_cooldown(model_id: str) -> int: - try: - ttl = await redis_client.ttl(_openrouter_model_cooldown_key(model_id)) - except Exception: - return 0 - return int(ttl) if ttl and ttl > 0 else 0 - - -async def _cool_down_openrouter_model(model_id: str, seconds: int, reason: str) -> None: - try: - await redis_client.set( - _openrouter_model_cooldown_key(model_id), - reason, - ex=max(1, int(seconds)), - ) - except Exception: - pass - - -async def _cool_down_transient_openrouter_model(model_id: str, reason: str) -> float: - await _cool_down_openrouter_model( - model_id, - OPENROUTER_TRANSIENT_MODEL_COOLDOWN_SECONDS, - reason, - ) - return float(OPENROUTER_TRANSIENT_MODEL_COOLDOWN_SECONDS) - - -_validate_free_vision_models(VISION_MODELS) -_validate_openrouter_free_budget() - - -async def get_memes_to_describe(limit: int = 30) -> list[dict]: - """Get image memes without descriptions. - - Priority order: - 1. Recently uploaded memes (last 24h) — enables dedup for user uploads - 2. Most liked memes — improves Wrapped coverage - - Skips memes that have failed 3+ times (tracked in ocr_result.describe_failures). 
- """ - from sqlalchemy import text - - query = text( - """ - SELECT - M.id, - M.telegram_file_id, - M.ocr_result, - M.language_code - FROM meme M - LEFT JOIN meme_stats MS ON MS.meme_id = M.id - LEFT JOIN meme_source SRC ON SRC.id = M.meme_source_id - WHERE M.type = 'image' - AND M.status = 'ok' - AND M.telegram_file_id IS NOT NULL - AND ( - M.ocr_result IS NULL - OR M.ocr_result->>'description' IS NULL - ) - AND COALESCE((M.ocr_result->>'describe_failures')::int, 0) < 3 - ORDER BY - CASE WHEN SRC.type = 'user upload' - AND M.created_at > now() - interval '24 hours' - THEN 0 ELSE 1 END, - COALESCE(MS.nlikes, 0) DESC, - M.id DESC - LIMIT :limit - """ - ).bindparams(limit=limit) - - return await fetch_all(query) - - -def _parse_vision_response(raw_content: str) -> dict: - """Parse JSON from model response, stripping markdown fences if present. - - Falls back to escape-fixing and regex extraction to handle common LLM JSON issues - (invalid escape sequences, unterminated strings from lower-quality models). - """ - content = raw_content.strip() - if content.startswith("```"): - content = content.split("\n", 1)[1] if "\n" in content else content[3:] - if content.endswith("```"): - content = content[:-3] - content = content.strip() - if content.startswith("json"): - content = content[4:].strip() - - # 1. Standard parse - try: - return json.loads(content) - except json.JSONDecodeError: - pass - - # 2. Fix invalid escape sequences (e.g. \' or \k not valid in JSON) - try: - fixed = re.sub(r'\\(?!["\\/bfnrtu])', r"\\\\", content) - return json.loads(fixed) - except (json.JSONDecodeError, Exception): - pass - - # 3. 
Regex extraction — last resort for severely malformed output - result = {} - for key in ("ocr_text", "description", "language"): - match = re.search(rf'"{key}"\s*:\s*"((?:[^"\\]|\\.)*)"', content, re.DOTALL) - if match: - try: - result[key] = json.loads(f'"{match.group(1)}"') - except json.JSONDecodeError: - result[key] = match.group(1) - if result: - return result - - raise json.JSONDecodeError("Could not parse model response", content, 0) - - -def _parse_retry_after(response: httpx.Response) -> float | None: - """Extract retry delay from Retry-After header or response body.""" - header = response.headers.get("retry-after") or response.headers.get("x-ratelimit-reset") - if header: - try: - return float(header) - except ValueError: - pass - try: - body = response.json() - if "error" in body and "metadata" in body["error"]: - reset = body["error"]["metadata"].get("ratelimit_reset") - if reset: - return float(reset) - except Exception: - pass - return None - - -async def call_openrouter_vision(image_b64: str, log, *, deadline: float | None = None) -> dict: - """Call OpenRouter vision model with fallback chain. - - Args: - deadline: monotonic timestamp after which we stop trying models. 
- - Returns: - dict with result on success, or {RATE_LIMITED: True} / {ALL_FAILED: True} - """ - headers = { - "Authorization": f"Bearer {settings.OPENROUTER_API_KEY}", - "Content-Type": "application/json", - } - - next_retry_after: float | None = None - tried_models = 0 - - async with httpx.AsyncClient(timeout=30.0) as client: - for model_id in VISION_MODELS: - if not model_id.endswith(":free"): - raise UnsafeOpenRouterModelError(f"Refusing non-free OpenRouter model: {model_id}") - - cooldown_ttl = await _get_openrouter_model_cooldown(model_id) - if cooldown_ttl > 0: - log.info( - "Skipping %s — free-model cooldown has %ss remaining.", - model_id, - cooldown_ttl, - ) - if next_retry_after is None or cooldown_ttl < next_retry_after: - next_retry_after = float(cooldown_ttl) - continue - - # Stop trying more models if we're running out of time - if deadline is not None and time.monotonic() > deadline - 35: - log.warning("Skipping remaining models — approaching deadline") - break - - payload = { - "model": model_id, - "messages": [ - { - "role": "user", - "content": [ - {"type": "text", "text": DESCRIBE_PROMPT}, - { - "type": "image_url", - "image_url": { - "url": f"data:image/jpeg;base64,{image_b64}", - }, - }, - ], - } - ], - "max_tokens": 500, - "temperature": 0.2, - } - - try: - tried_models += 1 - reserved, used_today = await _reserve_openrouter_free_request(log) - if not reserved: - log.warning( - "OpenRouter free-model daily safety budget exhausted " - "(%s/%s attempts). Refusing request.", - used_today if used_today >= 0 else "unknown", - OPENROUTER_FREE_DAILY_REQUEST_BUDGET, - ) - return {DAILY_BUDGET_EXHAUSTED: True, "__used_today": used_today} - await _record_openrouter_metric(model_id, "attempt") - - response = await client.post( - f"{OPENROUTER_BASE_URL}/chat/completions", - headers=headers, - json=payload, - ) - - if response.status_code == 402: - log.warning( - "OpenRouter quota exhausted (HTTP 402). " - "Balance likely below $0 — all models blocked. 
" - "Check https://openrouter.ai/settings/credits" - ) - await _record_openrouter_metric(model_id, "quota_exhausted") - return {QUOTA_EXHAUSTED: True} - - if response.status_code == 429: - raw_retry_after = _parse_retry_after(response) - retry_after = _normalize_retry_after(raw_retry_after) - cooldown = _rate_limit_cooldown_seconds(raw_retry_after) - await _record_openrouter_metric(model_id, "rate_limited") - await _cool_down_openrouter_model(model_id, cooldown, "rate_limited") - if next_retry_after is None or cooldown < next_retry_after: - next_retry_after = float(cooldown) - log.info( - "Rate-limited (429) on %s (retry-after: %ss, cooldown: %ss)", - model_id, - retry_after or "unknown", - cooldown, - ) - continue - - if response.status_code == 403: - await _record_openrouter_metric(model_id, "forbidden") - await _cool_down_openrouter_model( - model_id, - OPENROUTER_FORBIDDEN_MODEL_COOLDOWN_SECONDS, - "forbidden", - ) - log.warning("Model %s HTTP 403 (access denied), trying next...", model_id) - continue - - response.raise_for_status() - - body = response.text.strip() - json_start = body.find("{") - if json_start < 0: - await _record_openrouter_metric(model_id, "bad_response") - retry_after = await _cool_down_transient_openrouter_model( - model_id, - "bad_response", - ) - if next_retry_after is None or retry_after < next_retry_after: - next_retry_after = retry_after - log.warning("Model %s returned no JSON: %s", model_id, body[:100]) - continue - data = json.loads(body[json_start:]) - - if "choices" not in data: - await _record_openrouter_metric(model_id, "bad_response") - retry_after = await _cool_down_transient_openrouter_model( - model_id, - "bad_response", - ) - if next_retry_after is None or retry_after < next_retry_after: - next_retry_after = retry_after - log.warning("Model %s no choices: %s", model_id, str(data)[:200]) - continue - - content = data["choices"][0]["message"]["content"] - if not content: - await _record_openrouter_metric(model_id, 
"empty_content") - retry_after = await _cool_down_transient_openrouter_model( - model_id, - "empty_content", - ) - if next_retry_after is None or retry_after < next_retry_after: - next_retry_after = retry_after - log.warning("Model %s empty content", model_id) - continue - result = _parse_vision_response(content) - - if "description" not in result and "ocr_text" not in result: - await _record_openrouter_metric(model_id, "bad_json") - retry_after = await _cool_down_transient_openrouter_model( - model_id, - "bad_json", - ) - if next_retry_after is None or retry_after < next_retry_after: - next_retry_after = retry_after - log.warning("Model %s bad JSON: %s", model_id, str(result)[:200]) - continue - - result["__model"] = model_id - await _record_openrouter_metric(model_id, "success") - return result - - except json.JSONDecodeError as e: - await _record_openrouter_metric(model_id, "invalid_json") - retry_after = await _cool_down_transient_openrouter_model( - model_id, - "invalid_json", - ) - if next_retry_after is None or retry_after < next_retry_after: - next_retry_after = retry_after - log.warning("Model %s invalid JSON: %s", model_id, e) - continue - except httpx.HTTPStatusError as e: - await _record_openrouter_metric(model_id, f"http_{e.response.status_code}") - if e.response.status_code >= 500: - retry_after = await _cool_down_transient_openrouter_model( - model_id, - f"http_{e.response.status_code}", - ) - if next_retry_after is None or retry_after < next_retry_after: - next_retry_after = retry_after - log.warning("Model %s HTTP %s", model_id, e.response.status_code) - continue - except (httpx.ReadTimeout, httpx.ConnectTimeout) as e: - await _record_openrouter_metric(model_id, "timeout") - retry_after = await _cool_down_transient_openrouter_model(model_id, "timeout") - if next_retry_after is None or retry_after < next_retry_after: - next_retry_after = retry_after - log.warning("Model %s timeout: %s", model_id, type(e).__name__) - continue - except 
httpx.RequestError as e: - await _record_openrouter_metric(model_id, "request_error") - retry_after = await _cool_down_transient_openrouter_model( - model_id, - "request_error", - ) - if next_retry_after is None or retry_after < next_retry_after: - next_retry_after = retry_after - log.warning("Model %s request error: %s", model_id, type(e).__name__) - continue - except Exception as e: - await _record_openrouter_metric(model_id, "error") - log.warning("Model %s error: %s", model_id, e) - continue - - if tried_models == 0 or next_retry_after is not None: - return {RATE_LIMITED: True, "__retry_after": next_retry_after} - - # All models exhausted on non-retryable responses. - return {ALL_FAILED: True} - - -async def _increment_describe_failures(meme_id: int, existing_ocr: dict, reason: str): - """Track describe failures in ocr_result so permanently broken memes get skipped.""" - failures = int(existing_ocr.get("describe_failures", 0)) + 1 - merged = {**existing_ocr, "describe_failures": failures, "last_failure_reason": reason} - update_query = meme.update().where(meme.c.id == meme_id).values(ocr_result=merged) - await execute(update_query) +__all__ = ["VISION_MODELS", "describe_memes_flow", "describe_single_meme"] async def describe_single_meme(meme_row: dict, log, *, deadline: float | None = None) -> str: @@ -568,7 +66,7 @@ async def describe_single_meme(meme_row: dict, log, *, deadline: float | None = image_bytes = await download_meme_content_from_tg(file_id) except Exception as e: log.warning("Meme %s: download failed: %s", meme_id, e) - await _increment_describe_failures(meme_id, existing_ocr, str(e)) + await increment_describe_failures(meme_id, existing_ocr, str(e)) return "failed" image_b64 = base64.b64encode(image_bytes).decode("utf-8") @@ -578,11 +76,11 @@ async def describe_single_meme(meme_row: dict, log, *, deadline: float | None = result = await call_openrouter_vision(image_b64, log, deadline=deadline) except Exception as e: log.warning("Meme %s: OpenRouter 
error: %s", meme_id, e) - await _increment_describe_failures(meme_id, existing_ocr, str(e)) + await increment_describe_failures(meme_id, existing_ocr, str(e)) return "failed" if result is None: - await _increment_describe_failures(meme_id, existing_ocr, "no result") + await increment_describe_failures(meme_id, existing_ocr, "no result") return "failed" if result.get(RATE_LIMITED): @@ -598,54 +96,22 @@ async def describe_single_meme(meme_row: dict, log, *, deadline: float | None = return "daily_budget_exhausted" if result.get(ALL_FAILED): - await _increment_describe_failures(meme_id, existing_ocr, "all models failed") + await increment_describe_failures(meme_id, existing_ocr, "all models failed") return "failed" - # Merge with existing ocr_result - ocr_text = result.get("ocr_text", "") - description = result.get("description", "") - language = result.get("language", "") - model_used = result.get("__model", VISION_MODELS[0]) - - merged = { - **existing_ocr, - "model": model_used, - "calculated_at": datetime.now(timezone.utc).isoformat(), - "raw_result": { - "ocr_text": ocr_text, - "description": description, - "language": language, - }, - "description": description, - } - - if not existing_ocr.get("text"): - merged["text"] = ocr_text - - update_kwargs = {"ocr_result": merged} - - # Only update language_code if the detected language is one we already use - # This ensures inner joins with user_language work correctly - KNOWN_LANGUAGES = { - "ru", - "en", - "uk", - "es", - "fa", - "pl", - "hi", - "am", - "de", - "fr", - "pt-br", - "ar", - "uz", - } - if language and language.lower() in KNOWN_LANGUAGES: - update_kwargs["language_code"] = language.lower() - - update_query = meme.update().where(meme.c.id == meme_id).values(**update_kwargs).returning(meme) - await fetch_one(update_query) + merged = await save_meme_description(meme_id, existing_ocr, result) + dedup_result = await deduplicate_described_meme( + meme_id, + merged.get("text", ""), + 
status=meme_row.get("status"), + ) + if dedup_result.duplicate_found: + log.info( + "Meme %s resolved as OCR duplicate of %s after describe: %s", + meme_id, + dedup_result.duplicate_of, + dedup_result.resolution, + ) return "ok" @@ -719,7 +185,7 @@ async def describe_memes_flow(batch_size: int = 20) -> None: i + 1, len(memes), ) - await _increment_describe_failures( + await increment_describe_failures( meme_row["id"], meme_row["ocr_result"] or {}, f"per-meme timeout ({effective_timeout:.0f}s)", diff --git a/src/flows/storage/describe_memes_repository.py b/src/flows/storage/describe_memes_repository.py new file mode 100644 index 00000000..b1b924de --- /dev/null +++ b/src/flows/storage/describe_memes_repository.py @@ -0,0 +1,111 @@ +from datetime import datetime, timezone +from typing import Any + +from sqlalchemy import text + +from src.database import execute, fetch_all, fetch_one, meme +from src.flows.storage.openrouter_vision import VISION_MODELS + +KNOWN_LANGUAGES = { + "ru", + "en", + "uk", + "es", + "fa", + "pl", + "hi", + "am", + "de", + "fr", + "pt-br", + "ar", + "uz", +} + + +async def get_memes_to_describe(limit: int = 30) -> list[dict[str, Any]]: + """Get image memes without descriptions. + + Priority order: + 1. Recently uploaded memes (last 24h) — enables dedup for user uploads + 2. Most liked memes — improves Wrapped coverage + + Skips memes that have failed 3+ times (tracked in ocr_result.describe_failures). 
+ """ + query = text( + """ + SELECT + M.id, + M.telegram_file_id, + M.ocr_result, + M.status, + M.language_code + FROM meme M + LEFT JOIN meme_stats MS ON MS.meme_id = M.id + LEFT JOIN meme_source SRC ON SRC.id = M.meme_source_id + WHERE M.type = 'image' + AND M.status = 'ok' + AND M.telegram_file_id IS NOT NULL + AND ( + M.ocr_result IS NULL + OR M.ocr_result->>'description' IS NULL + ) + AND COALESCE((M.ocr_result->>'describe_failures')::int, 0) < 3 + ORDER BY + CASE WHEN SRC.type = 'user upload' + AND M.created_at > now() - interval '24 hours' + THEN 0 ELSE 1 END, + COALESCE(MS.nlikes, 0) DESC, + M.id DESC + LIMIT :limit + """ + ).bindparams(limit=limit) + + return await fetch_all(query) + + +async def increment_describe_failures( + meme_id: int, + existing_ocr: dict[str, Any], + reason: str, +) -> None: + """Track describe failures in ocr_result so permanently broken memes get skipped.""" + failures = int(existing_ocr.get("describe_failures", 0)) + 1 + merged = {**existing_ocr, "describe_failures": failures, "last_failure_reason": reason} + update_query = meme.update().where(meme.c.id == meme_id).values(ocr_result=merged) + await execute(update_query) + + +async def save_meme_description( + meme_id: int, + existing_ocr: dict[str, Any], + result: dict[str, Any], +) -> dict[str, Any]: + ocr_text = result.get("ocr_text", "") + description = result.get("description", "") + language = result.get("language", "") + model_used = result.get("__model", VISION_MODELS[0]) + + merged = { + **existing_ocr, + "model": model_used, + "calculated_at": datetime.now(timezone.utc).isoformat(), + "raw_result": { + "ocr_text": ocr_text, + "description": description, + "language": language, + }, + "description": description, + } + + if not existing_ocr.get("text"): + merged["text"] = ocr_text + + update_kwargs: dict[str, Any] = {"ocr_result": merged} + language_code = language.lower() + if language_code in KNOWN_LANGUAGES: + update_kwargs["language_code"] = language_code + + 
update_query = meme.update().where(meme.c.id == meme_id).values(**update_kwargs).returning(meme) + await fetch_one(update_query) + return merged diff --git a/src/flows/storage/memes.py b/src/flows/storage/memes.py index 66b9c680..efd619fb 100644 --- a/src/flows/storage/memes.py +++ b/src/flows/storage/memes.py @@ -6,17 +6,18 @@ from src.flows.hooks import notify_telegram_on_failure from src.storage import ads from src.storage.constants import MemeStatus, MemeType +from src.storage.deduplication import ( + deduplicate_pending_meme, + sweep_file_id_duplicates, +) from src.storage.etl import ( etl_memes_from_raw_telegram_posts, etl_memes_from_raw_vk_posts, ) from src.storage.service import ( - find_meme_duplicate, - find_meme_duplicate_by_file_id, get_pending_memes, get_unloaded_tg_memes, get_unloaded_vk_memes, - resolve_meme_duplicate, update_meme, update_meme_status_of_ready_memes, ) @@ -200,28 +201,31 @@ async def final_meme_pipeline() -> None: memes = await get_pending_memes() logger.info(f"Final meme pipeline has {len(memes)} pending memes.") + processed_meme_ids = [] for meme in memes: + processed_meme_ids.append(meme["id"]) await analyse_meme_caption(meme) - # exact file_id dedup: catches cross-source reposts of identical files - if meme["telegram_file_id"]: - dup_id = await find_meme_duplicate_by_file_id(meme["id"], meme["telegram_file_id"]) - if dup_id: - await resolve_meme_duplicate(meme["id"], dup_id) - continue - - # it's ok if there is no OCR result for videos - if meme["ocr_result"]: - duplicate_meme_id = await find_meme_duplicate(meme["id"], meme["ocr_result"]["text"]) - if duplicate_meme_id: - await resolve_meme_duplicate(meme["id"], duplicate_meme_id) - continue + result = await deduplicate_pending_meme(meme) + if result.duplicate_found: + logger.info( + "Meme %s resolved as %s duplicate of %s before ok promotion.", + result.meme_id, + result.reason, + result.duplicate_of, + ) - # next step of a pipeline - await update_meme_status_of_ready_memes() + 
promoted_memes = await update_meme_status_of_ready_memes(processed_meme_ids) + file_id_duplicates = await sweep_file_id_duplicates() + if file_id_duplicates["resolved"]: + logger.info("Resolved file_id duplicates: %s", file_id_duplicates) safe_emit( "ff.pipeline.final.completed", "ff.pipeline.final", - {"memes_processed": len(memes)}, + { + "memes_processed": len(memes), + "memes_promoted": len(promoted_memes), + "file_id_duplicates_resolved": file_id_duplicates["resolved"], + }, ) diff --git a/src/flows/storage/openrouter_vision.py b/src/flows/storage/openrouter_vision.py new file mode 100644 index 00000000..8b07f73b --- /dev/null +++ b/src/flows/storage/openrouter_vision.py @@ -0,0 +1,470 @@ +import json +import re +import time +from datetime import datetime, timezone + +import httpx + +from src.config import settings +from src.redis import redis_client + +OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1" +OPENROUTER_FREE_DAILY_REQUEST_LIMIT = 1000 +OPENROUTER_FREE_DAILY_REQUEST_BUDGET = 900 +OPENROUTER_FREE_REQUEST_COUNTER_TTL_SECONDS = 60 * 60 * 48 +OPENROUTER_FREE_STATS_TTL_SECONDS = 60 * 60 * 24 * 14 +OPENROUTER_DEFAULT_RATE_LIMIT_COOLDOWN_SECONDS = 60 * 15 +OPENROUTER_MAX_RATE_LIMIT_COOLDOWN_SECONDS = 60 * 60 +OPENROUTER_TRANSIENT_MODEL_COOLDOWN_SECONDS = 60 * 15 +OPENROUTER_FORBIDDEN_MODEL_COOLDOWN_SECONDS = 60 * 60 * 6 + +# FREE models only. Never add paid models here — spending balance below $0 +# blocks ALL models (even free ones) with HTTP 402. Free tier requires $10+ +# lifetime purchases for 1,000 req/day (vs 50/day without). +# See specs/describe-memes.md for full OpenRouter constraints. +# +# Verified available on OpenRouter API as of 2026-05-17. +# Ordered by preference. Falls back to next model on 429/403/timeout/bad response. +# Transient failures set Redis cooldowns so later memes/runs try other free models. 
+VISION_MODELS = [ + "google/gemma-4-31b-it:free", # 262k context, primary + "google/gemma-4-26b-a4b-it:free", # 262k context, MoE variant + # Gemma 3 free vision fallbacks are no longer listed by OpenRouter. + # nvidia/nemotron-nano-12b-v2-vl:free removed — returns 504s and invalid + # JSON/empty content (see specs/describe-memes.md). +] + +DESCRIBE_PROMPT = ( + "You are analyzing a meme image. Extract the following:\n\n" + "1. OCR_TEXT: ALL text visible in the image, exactly as written. " + "Preserve original language and line breaks. " + "If no text, return empty string.\n\n" + "2. DESCRIPTION: Describe the meme in 1-3 sentences in English. " + "What's happening visually? What's the joke? " + "Be specific (panels, characters, reactions, meme format).\n\n" + "3. LANGUAGE: Primary language of the meme text as ISO 639-1 code " + '(e.g. "ru", "en"). If no text, return "en". ' + "If mixed, return dominant language.\n\n" + "Respond with ONLY valid JSON, no markdown fences:\n" + '{"ocr_text": "...", "description": "...", "language": "..."}' +) + +RATE_LIMITED = "__rate_limited" +ALL_FAILED = "__all_failed" +QUOTA_EXHAUSTED = "__quota_exhausted" +DAILY_BUDGET_EXHAUSTED = "__daily_budget_exhausted" +TRY_NEXT_MODEL = "__try_next_model" + + +class UnsafeOpenRouterModelError(ValueError): + """Raised when a non-free OpenRouter model is configured.""" + + +def _validate_free_vision_models(model_ids: list[str]) -> None: + paid_model_ids = [model_id for model_id in model_ids if not model_id.endswith(":free")] + if paid_model_ids: + raise UnsafeOpenRouterModelError( + "OpenRouter paid models are forbidden in VISION_MODELS: " + ", ".join(paid_model_ids) + ) + + +def _validate_openrouter_free_budget() -> None: + if OPENROUTER_FREE_DAILY_REQUEST_BUDGET >= OPENROUTER_FREE_DAILY_REQUEST_LIMIT: + raise ValueError( + "OpenRouter local safety budget must stay below the documented " + f"{OPENROUTER_FREE_DAILY_REQUEST_LIMIT}/day free-model cap" + ) + + +def 
_openrouter_free_request_counter_key(now: datetime | None = None) -> str: + now = now or datetime.now(timezone.utc) + return f"openrouter:free_requests:{now.date().isoformat()}" + + +def _openrouter_stats_key(now: datetime | None = None) -> str: + now = now or datetime.now(timezone.utc) + return f"openrouter:free_ocr_stats:{now.strftime('%Y-%m-%d:%H')}" + + +def _openrouter_model_cooldown_key(model_id: str) -> str: + return f"openrouter:free_model_cooldown:{model_id}" + + +def _normalize_retry_after(raw_retry_after: float | None) -> float | None: + if raw_retry_after is None: + return None + if raw_retry_after > 60 * 60 * 24: + return max(0.0, raw_retry_after - time.time()) + return max(0.0, raw_retry_after) + + +def _rate_limit_cooldown_seconds(raw_retry_after: float | None) -> int: + retry_after = _normalize_retry_after(raw_retry_after) + if retry_after is None: + return OPENROUTER_DEFAULT_RATE_LIMIT_COOLDOWN_SECONDS + return int( + min( + max(retry_after, 60.0), + OPENROUTER_MAX_RATE_LIMIT_COOLDOWN_SECONDS, + ) + ) + + +_RESERVE_OPENROUTER_FREE_REQUEST_LUA = """ +local current = tonumber(redis.call("GET", KEYS[1]) or "0") +local budget = tonumber(ARGV[1]) +if current >= budget then + return {0, current} +end + +current = redis.call("INCR", KEYS[1]) +if current == 1 then + redis.call("EXPIRE", KEYS[1], tonumber(ARGV[2])) +end + +return {1, current} +""" + + +async def _reserve_openrouter_free_request(log) -> tuple[bool, int]: + """Reserve one daily free-model request attempt. + + OpenRouter counts failed attempts toward the daily free quota, so we reserve + before every model attempt, including fallbacks. If Redis is unavailable, fail + closed and do not call OpenRouter. 
+ """ + key = _openrouter_free_request_counter_key() + try: + reserved, used_today = await redis_client.eval( + _RESERVE_OPENROUTER_FREE_REQUEST_LUA, + 1, + key, + OPENROUTER_FREE_DAILY_REQUEST_BUDGET, + OPENROUTER_FREE_REQUEST_COUNTER_TTL_SECONDS, + ) + return bool(int(reserved)), int(used_today) + except Exception as e: + log.error("OpenRouter quota guard failed via Redis; refusing request: %s", e) + return False, -1 + + +async def _record_openrouter_metric(model_id: str, outcome: str) -> None: + key = _openrouter_stats_key() + field = f"{model_id}:{outcome}" + try: + async with redis_client.pipeline(transaction=True) as pipe: + await pipe.hincrby(key, field, 1) + await pipe.expire(key, OPENROUTER_FREE_STATS_TTL_SECONDS) + await pipe.execute() + except Exception: + pass + + +async def _get_openrouter_model_cooldown(model_id: str) -> int: + try: + ttl = await redis_client.ttl(_openrouter_model_cooldown_key(model_id)) + except Exception: + return 0 + return int(ttl) if ttl and ttl > 0 else 0 + + +async def _cool_down_openrouter_model(model_id: str, seconds: int, reason: str) -> None: + try: + await redis_client.set( + _openrouter_model_cooldown_key(model_id), + reason, + ex=max(1, int(seconds)), + ) + except Exception: + pass + + +async def _cool_down_transient_openrouter_model(model_id: str, reason: str) -> float: + await _cool_down_openrouter_model( + model_id, + OPENROUTER_TRANSIENT_MODEL_COOLDOWN_SECONDS, + reason, + ) + return float(OPENROUTER_TRANSIENT_MODEL_COOLDOWN_SECONDS) + + +def _parse_vision_response(raw_content: str) -> dict: + """Parse JSON from model response, stripping markdown fences if present.""" + content = raw_content.strip() + if content.startswith("```"): + content = content.split("\n", 1)[1] if "\n" in content else content[3:] + if content.endswith("```"): + content = content[:-3] + content = content.strip() + if content.startswith("json"): + content = content[4:].strip() + + try: + return json.loads(content) + except json.JSONDecodeError: 
+ pass + + try: + fixed = re.sub(r'\\(?!["\\/bfnrtu])', r"\\\\", content) + return json.loads(fixed) + except (json.JSONDecodeError, Exception): + pass + + result = {} + for key in ("ocr_text", "description", "language"): + match = re.search(rf'"{key}"\s*:\s*"((?:[^"\\]|\\.)*)"', content, re.DOTALL) + if match: + try: + result[key] = json.loads(f'"{match.group(1)}"') + except json.JSONDecodeError: + result[key] = match.group(1) + if result: + return result + + raise json.JSONDecodeError("Could not parse model response", content, 0) + + +def _parse_retry_after(response: httpx.Response) -> float | None: + """Extract retry delay from Retry-After header or response body.""" + header = response.headers.get("retry-after") or response.headers.get("x-ratelimit-reset") + if header: + try: + return float(header) + except ValueError: + pass + try: + body = response.json() + if "error" in body and "metadata" in body["error"]: + reset = body["error"]["metadata"].get("ratelimit_reset") + if reset: + return float(reset) + except Exception: + pass + return None + + +async def call_openrouter_vision(image_b64: str, log, *, deadline: float | None = None) -> dict: + """Call OpenRouter vision model with fallback chain. + + Args: + deadline: monotonic timestamp after which we stop trying models. 
+ + Returns: + dict with result on success, or {RATE_LIMITED: True} / {ALL_FAILED: True} + """ + headers = { + "Authorization": f"Bearer {settings.OPENROUTER_API_KEY}", + "Content-Type": "application/json", + } + + next_retry_after: float | None = None + tried_models = 0 + + async with httpx.AsyncClient(timeout=30.0) as client: + for model_id in VISION_MODELS: + if not model_id.endswith(":free"): + raise UnsafeOpenRouterModelError(f"Refusing non-free OpenRouter model: {model_id}") + + cooldown_ttl = await _get_openrouter_model_cooldown(model_id) + if cooldown_ttl > 0: + log.info( + "Skipping %s — free-model cooldown has %ss remaining.", + model_id, + cooldown_ttl, + ) + if next_retry_after is None or cooldown_ttl < next_retry_after: + next_retry_after = float(cooldown_ttl) + continue + + if deadline is not None and time.monotonic() > deadline - 35: + log.warning("Skipping remaining models — approaching deadline") + break + + payload = { + "model": model_id, + "messages": [ + { + "role": "user", + "content": [ + {"type": "text", "text": DESCRIBE_PROMPT}, + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{image_b64}", + }, + }, + ], + } + ], + "max_tokens": 500, + "temperature": 0.2, + } + + try: + tried_models += 1 + reserved, used_today = await _reserve_openrouter_free_request(log) + if not reserved: + log.warning( + "OpenRouter free-model daily safety budget exhausted " + "(%s/%s attempts). 
Refusing request.", + used_today if used_today >= 0 else "unknown", + OPENROUTER_FREE_DAILY_REQUEST_BUDGET, + ) + return {DAILY_BUDGET_EXHAUSTED: True, "__used_today": used_today} + await _record_openrouter_metric(model_id, "attempt") + + response = await client.post( + f"{OPENROUTER_BASE_URL}/chat/completions", + headers=headers, + json=payload, + ) + + status_result = await _handle_status_response(response, model_id, log) + if status_result is not None: + if status_result.get(TRY_NEXT_MODEL): + continue + if status_result.get(RATE_LIMITED): + cooldown = status_result["__retry_after"] + if next_retry_after is None or cooldown < next_retry_after: + next_retry_after = float(cooldown) + continue + return status_result + + response.raise_for_status() + result = await _parse_success_response(response, model_id, log) + if result is not None: + return result + + if next_retry_after is None: + next_retry_after = float(OPENROUTER_TRANSIENT_MODEL_COOLDOWN_SECONDS) + continue + + except json.JSONDecodeError as e: + await _record_openrouter_metric(model_id, "invalid_json") + retry_after = await _cool_down_transient_openrouter_model(model_id, "invalid_json") + if next_retry_after is None or retry_after < next_retry_after: + next_retry_after = retry_after + log.warning("Model %s invalid JSON: %s", model_id, e) + continue + except httpx.HTTPStatusError as e: + await _record_openrouter_metric(model_id, f"http_{e.response.status_code}") + if e.response.status_code >= 500: + retry_after = await _cool_down_transient_openrouter_model( + model_id, + f"http_{e.response.status_code}", + ) + if next_retry_after is None or retry_after < next_retry_after: + next_retry_after = retry_after + log.warning("Model %s HTTP %s", model_id, e.response.status_code) + continue + except (httpx.ReadTimeout, httpx.ConnectTimeout) as e: + await _record_openrouter_metric(model_id, "timeout") + retry_after = await _cool_down_transient_openrouter_model(model_id, "timeout") + if next_retry_after is None or 
retry_after < next_retry_after: + next_retry_after = retry_after + log.warning("Model %s timeout: %s", model_id, type(e).__name__) + continue + except httpx.RequestError as e: + await _record_openrouter_metric(model_id, "request_error") + retry_after = await _cool_down_transient_openrouter_model(model_id, "request_error") + if next_retry_after is None or retry_after < next_retry_after: + next_retry_after = retry_after + log.warning("Model %s request error: %s", model_id, type(e).__name__) + continue + except Exception as e: + await _record_openrouter_metric(model_id, "error") + log.warning("Model %s error: %s", model_id, e) + continue + + if tried_models == 0 or next_retry_after is not None: + return {RATE_LIMITED: True, "__retry_after": next_retry_after} + + return {ALL_FAILED: True} + + +async def _handle_status_response( + response: httpx.Response, + model_id: str, + log, +) -> dict | None: + if response.status_code == 402: + log.warning( + "OpenRouter quota exhausted (HTTP 402). " + "Balance likely below $0 — all models blocked. 
" + "Check https://openrouter.ai/settings/credits" + ) + await _record_openrouter_metric(model_id, "quota_exhausted") + return {QUOTA_EXHAUSTED: True} + + if response.status_code == 429: + raw_retry_after = _parse_retry_after(response) + retry_after = _normalize_retry_after(raw_retry_after) + cooldown = _rate_limit_cooldown_seconds(raw_retry_after) + await _record_openrouter_metric(model_id, "rate_limited") + await _cool_down_openrouter_model(model_id, cooldown, "rate_limited") + log.info( + "Rate-limited (429) on %s (retry-after: %ss, cooldown: %ss)", + model_id, + retry_after or "unknown", + cooldown, + ) + return {RATE_LIMITED: True, "__retry_after": cooldown} + + if response.status_code == 403: + await _record_openrouter_metric(model_id, "forbidden") + await _cool_down_openrouter_model( + model_id, + OPENROUTER_FORBIDDEN_MODEL_COOLDOWN_SECONDS, + "forbidden", + ) + log.warning("Model %s HTTP 403 (access denied), trying next...", model_id) + return {TRY_NEXT_MODEL: True} + + return None + + +async def _parse_success_response( + response: httpx.Response, + model_id: str, + log, +) -> dict | None: + body = response.text.strip() + json_start = body.find("{") + if json_start < 0: + await _record_bad_response(model_id, log, "bad_response", "returned no JSON", body[:100]) + return None + + data = json.loads(body[json_start:]) + if "choices" not in data: + await _record_bad_response(model_id, log, "bad_response", "no choices", str(data)[:200]) + return None + + content = data["choices"][0]["message"]["content"] + if not content: + await _record_bad_response(model_id, log, "empty_content", "empty content", "") + return None + + result = _parse_vision_response(content) + if "description" not in result and "ocr_text" not in result: + await _record_bad_response(model_id, log, "bad_json", "bad JSON", str(result)[:200]) + return None + + result["__model"] = model_id + await _record_openrouter_metric(model_id, "success") + return result + + +async def _record_bad_response( + 
model_id: str, + log, + metric: str, + message: str, + detail: str, +) -> None: + await _record_openrouter_metric(model_id, metric) + await _cool_down_transient_openrouter_model(model_id, metric) + log.warning("Model %s %s: %s", model_id, message, detail) + + +_validate_free_vision_models(VISION_MODELS) +_validate_openrouter_free_budget() diff --git a/src/recommendations/meme_queue.py b/src/recommendations/meme_queue.py index a1dc9e76..b151874b 100644 --- a/src/recommendations/meme_queue.py +++ b/src/recommendations/meme_queue.py @@ -2,9 +2,11 @@ import uuid from typing import Any, Optional +from sqlalchemy import text + from src import redis from src.config import settings -from src.database import fetch_all +from src.database import fetch_all, fetch_one from src.recommendations.blender import blend from src.recommendations.blender_experiments import ( MATURE_BLENDER_CONTROL_WEIGHTS, @@ -26,12 +28,49 @@ async def get_next_meme_for_user(user_id: int) -> MemeData | None: queue_key = redis.get_meme_queue_key(user_id) - meme_data = await redis.pop_meme_from_queue_by_key(queue_key) - if not meme_data: - return None + while True: + meme_data = await redis.pop_meme_from_queue_by_key(queue_key) + if not meme_data: + return None + + try: + meme_id = int(meme_data["id"]) + except (KeyError, TypeError, ValueError): + logging.warning( + "discarding malformed queued meme payload for user_id=%s payload=%s", + user_id, + meme_data, + ) + continue - return MemeData(**meme_data) + if await _queued_meme_is_sendable(user_id, meme_id): + return MemeData(**meme_data) + + logging.info( + "discarding stale queued meme payload for user_id=%s meme_id=%s", + user_id, + meme_id, + ) + + +async def _queued_meme_is_sendable(user_id: int, meme_id: int) -> bool: + row = await fetch_one( + text( + """ + SELECT M.id + FROM meme M + LEFT JOIN user_meme_reaction R + ON R.meme_id = M.id + AND R.user_id = :user_id + WHERE M.id = :meme_id + AND M.status = 'ok' + AND R.meme_id IS NULL + """ + ), + 
{"user_id": user_id, "meme_id": meme_id}, + ) + return row is not None async def has_memes_in_queue(user_id: int) -> bool: diff --git a/src/recommendations/pipeline.py b/src/recommendations/pipeline.py index 219de931..83c7bb7e 100644 --- a/src/recommendations/pipeline.py +++ b/src/recommendations/pipeline.py @@ -37,6 +37,9 @@ logger = logging.getLogger(__name__) +LOW_SENT_POOL_MIN_REACTIONS_FOR_QUALITY_GATE = 10 +LOW_SENT_POOL_MIN_LIKE_RATE = 0.15 + Candidate = dict[str, Any] BlendFunc = Callable[ [ @@ -743,8 +746,19 @@ def _low_sent_query(exclude_ids: list[int]) -> str: WHERE 1=1 AND M.status = 'ok' AND R.meme_id IS NULL + AND ( + COALESCE(MS.nlikes, 0) + COALESCE(MS.ndislikes, 0) + < {LOW_SENT_POOL_MIN_REACTIONS_FOR_QUALITY_GATE} + OR ( + COALESCE(MS.nlikes, 0)::float + / NULLIF(COALESCE(MS.nlikes, 0) + COALESCE(MS.ndislikes, 0), 0) + ) >= {LOW_SENT_POOL_MIN_LIKE_RATE} + ) {exclude_meme_ids_sql_filter(exclude_ids)} - ORDER BY COALESCE(MS.nmemes_sent, 0), M.id + ORDER BY + COALESCE(MS.nlikes, 0) + COALESCE(MS.ndislikes, 0), + COALESCE(MS.nmemes_sent, 0), + M.id LIMIT :limit """ diff --git a/src/stats/meme.py b/src/stats/meme.py index 0ac64975..820ca828 100644 --- a/src/stats/meme.py +++ b/src/stats/meme.py @@ -1,6 +1,9 @@ import logging +from collections.abc import Awaitable, Callable +from typing import Any from sqlalchemy import text +from sqlalchemy.ext.asyncio import AsyncConnection from src.database import execute, fetch_all @@ -16,12 +19,55 @@ async def calculate_meme_reactions_and_engagement( min_user_reactions: int = 10, min_meme_reactions: int = 3, lookback_hours: int = 3, + meme_ids: list[int] | None = None, + include_user_history: bool = False, +) -> None: + await _execute_meme_reactions_and_engagement( + execute, + min_user_reactions=min_user_reactions, + min_meme_reactions=min_meme_reactions, + lookback_hours=lookback_hours, + meme_ids=meme_ids, + include_user_history=include_user_history, + ) + + +async def 
calculate_meme_reactions_and_engagement_on_connection( + conn: AsyncConnection, + *, + min_user_reactions: int = 10, + min_meme_reactions: int = 3, + lookback_hours: int = 3, + meme_ids: list[int] | None = None, + include_user_history: bool = False, +) -> None: + await _execute_meme_reactions_and_engagement( + conn.execute, + min_user_reactions=min_user_reactions, + min_meme_reactions=min_meme_reactions, + lookback_hours=lookback_hours, + meme_ids=meme_ids, + include_user_history=include_user_history, + ) + + +async def _execute_meme_reactions_and_engagement( + execute_query: Callable[[Any, dict[str, Any]], Awaitable[Any]], + *, + min_user_reactions: int, + min_meme_reactions: int, + lookback_hours: int, + meme_ids: list[int] | None, + include_user_history: bool, ) -> None: """Combined lr_smoothed + engagement_score + basic counts — incremental mode. Only recomputes stats for memes that received reactions in the last `lookback_hours` hours. Memes with no recent activity keep their existing - meme_stats rows unchanged (ON CONFLICT DO UPDATE only fires for included rows). + meme_stats rows unchanged unless explicitly included in `meme_ids`. + When `include_user_history` is true, user baselines are built from all + reactions by users who touched the target memes; this is used after moving + historical reactions during deduplication. lr_smoothed algorithm: 1. 
like_symmetrical: reaction_id=1 → +1, else → -1 @@ -50,6 +96,25 @@ async def calculate_meme_reactions_and_engagement( WHERE COALESCE(reacted_at, sent_at) > NOW() - :lookback_hours * INTERVAL '1 hour' ), + FORCED_MEME_IDS AS ( + SELECT M.id AS meme_id + FROM meme M + WHERE :has_forced_meme_ids + AND M.id = ANY(:meme_ids) + ), + + TARGET_MEME_IDS AS ( + SELECT meme_id FROM RECENT_MEME_IDS + UNION + SELECT meme_id FROM FORCED_MEME_IDS + ), + + AFFECTED_USERS AS ( + SELECT DISTINCT user_id + FROM user_meme_reaction + WHERE meme_id IN (SELECT meme_id FROM TARGET_MEME_IDS) + ), + BASE_REACTIONS AS ( SELECT R.user_id, R.meme_id, R.reaction_id, @@ -62,7 +127,13 @@ async def calculate_meme_reactions_and_engagement( OVER (PARTITION BY R.user_id) AS user_last_reaction_sent_at FROM user_meme_reaction R JOIN meme ON R.meme_id = meme.id - WHERE R.meme_id IN (SELECT meme_id FROM RECENT_MEME_IDS) + WHERE ( + (:include_user_history AND R.user_id IN (SELECT user_id FROM AFFECTED_USERS)) + OR ( + NOT :include_user_history + AND R.meme_id IN (SELECT meme_id FROM TARGET_MEME_IDS) + ) + ) ), WITH_USER_AVGS AS ( @@ -118,30 +189,31 @@ async def calculate_meme_reactions_and_engagement( COUNT(lr_smoothed_val) AS n_lr_reactions, COUNT(es_smoothed_val) AS n_es_reactions FROM SMOOTHED + WHERE meme_id IN (SELECT meme_id FROM TARGET_MEME_IDS) GROUP BY meme_id ), BASIC_COUNTS AS ( SELECT - meme_id - , COUNT(*) FILTER (WHERE reaction_id = 1) AS nlikes - , COUNT(*) FILTER (WHERE reaction_id = 2) AS ndislikes - , COUNT(*) AS nmemes_sent + M.id AS meme_id + , COUNT(*) FILTER (WHERE E.reaction_id = 1) AS nlikes + , COUNT(*) FILTER (WHERE E.reaction_id = 2) AS ndislikes + , COUNT(E.*) AS nmemes_sent , MAX(EXTRACT('DAYS' FROM NOW() - M.published_at)) AS age_days , COALESCE(EXTRACT( EPOCH FROM percentile_cont(0.5) - WITHIN GROUP (ORDER BY reacted_at - sent_at) + WITHIN GROUP (ORDER BY E.reacted_at - E.sent_at) FILTER ( - WHERE reacted_at - sent_at + WHERE E.reacted_at - E.sent_at BETWEEN '0.5 second' AND 
'1 minute' ) ), 99999) AS sec_to_react , NOW() AS updated_at - FROM user_meme_reaction E - INNER JOIN meme M ON M.id = E.meme_id - WHERE E.meme_id IN (SELECT meme_id FROM RECENT_MEME_IDS) + FROM meme M + LEFT JOIN user_meme_reaction E ON E.meme_id = M.id + WHERE M.id IN (SELECT meme_id FROM TARGET_MEME_IDS) GROUP BY 1 ) @@ -173,12 +245,16 @@ async def calculate_meme_reactions_and_engagement( lr_smoothed = EXCLUDED.lr_smoothed, engagement_score = EXCLUDED.engagement_score """ - await execute( + forced_meme_ids = meme_ids or [0] + await execute_query( text(query), { "min_user_reactions": min_user_reactions, "min_meme_reactions": min_meme_reactions, "lookback_hours": lookback_hours, + "has_forced_meme_ids": bool(meme_ids), + "meme_ids": forced_meme_ids, + "include_user_history": include_user_history, }, ) diff --git a/src/storage/deduplication/__init__.py b/src/storage/deduplication/__init__.py new file mode 100644 index 00000000..34560119 --- /dev/null +++ b/src/storage/deduplication/__init__.py @@ -0,0 +1,30 @@ +from src.storage.deduplication.finder import ( + find_duplicate_by_file_id, + find_duplicate_by_ocr_text, + ocr_text_from_meme, +) +from src.storage.deduplication.models import ( + MIN_OCR_DUPLICATE_TEXT_LENGTH, + DeduplicationResult, + DuplicateResolution, +) +from src.storage.deduplication.policies import ( + deduplicate_described_meme, + deduplicate_pending_meme, +) +from src.storage.deduplication.resolver import refresh_original_stats, resolve_duplicate +from src.storage.deduplication.sweep import sweep_file_id_duplicates + +__all__ = [ + "MIN_OCR_DUPLICATE_TEXT_LENGTH", + "DeduplicationResult", + "DuplicateResolution", + "deduplicate_described_meme", + "deduplicate_pending_meme", + "find_duplicate_by_file_id", + "find_duplicate_by_ocr_text", + "ocr_text_from_meme", + "refresh_original_stats", + "resolve_duplicate", + "sweep_file_id_duplicates", +] diff --git a/src/storage/deduplication/finder.py b/src/storage/deduplication/finder.py new file mode 100644 
index 00000000..a5703320 --- /dev/null +++ b/src/storage/deduplication/finder.py @@ -0,0 +1,54 @@ +from typing import Any + +from sqlalchemy import text + +from src.database import fetch_one +from src.storage.deduplication.models import MIN_OCR_DUPLICATE_TEXT_LENGTH + + +def ocr_text_from_meme(meme_row: dict[str, Any]) -> str: + ocr_result = meme_row.get("ocr_result") or {} + return ocr_result.get("text") or ocr_result.get("raw_result", {}).get("ocr_text") or "" + + +async def find_duplicate_by_file_id(meme_id: int, telegram_file_id: str) -> int | None: + """Find an earlier meme that stores the same Telegram file_id.""" + query = text( + """ + SELECT id FROM meme + WHERE telegram_file_id = :file_id + AND status IN ('ok', 'published', 'created') + AND id < :meme_id + ORDER BY + CASE WHEN status = 'published' THEN 0 ELSE 1 END, + id ASC + LIMIT 1 + """ + ) + res = await fetch_one(query, {"file_id": telegram_file_id, "meme_id": meme_id}) + return res["id"] if res else None + + +async def find_duplicate_by_ocr_text(meme_id: int, image_text: str) -> int | None: + if len(image_text) < MIN_OCR_DUPLICATE_TEXT_LENGTH: + return None + + select_query = text( + """ + SELECT + M.id + FROM meme M + WHERE M.id < :meme_id + AND M.status IN ('ok', 'published') + AND M.type = 'image' + AND M.ocr_result IS NOT NULL + AND (M.ocr_result ->> 'text') % :image_text + ORDER BY + CASE WHEN M.status = 'published' THEN 0 ELSE 1 END, + M.id ASC + LIMIT 1 + """ + ) + + res = await fetch_one(select_query, {"meme_id": meme_id, "image_text": image_text}) + return res["id"] if res else None diff --git a/src/storage/deduplication/models.py b/src/storage/deduplication/models.py new file mode 100644 index 00000000..ad4dab67 --- /dev/null +++ b/src/storage/deduplication/models.py @@ -0,0 +1,26 @@ +from dataclasses import dataclass + +MIN_OCR_DUPLICATE_TEXT_LENGTH = 12 + + +@dataclass(frozen=True) +class DuplicateResolution: + dupe_id: int + original_id: int + reason: str + reactions_moved: int + 
reactions_dropped: int + chat_reactions_moved: int + chat_reactions_dropped: int + + +@dataclass(frozen=True) +class DeduplicationResult: + meme_id: int + duplicate_of: int | None = None + reason: str | None = None + resolution: DuplicateResolution | None = None + + @property + def duplicate_found(self) -> bool: + return self.duplicate_of is not None diff --git a/src/storage/deduplication/policies.py b/src/storage/deduplication/policies.py new file mode 100644 index 00000000..8dbdf98e --- /dev/null +++ b/src/storage/deduplication/policies.py @@ -0,0 +1,50 @@ +from typing import Any + +from src.storage.constants import MemeStatus +from src.storage.deduplication.finder import ( + find_duplicate_by_file_id, + find_duplicate_by_ocr_text, + ocr_text_from_meme, +) +from src.storage.deduplication.models import DeduplicationResult +from src.storage.deduplication.resolver import resolve_duplicate + + +async def deduplicate_pending_meme(meme_row: dict[str, Any]) -> DeduplicationResult: + """Run cheap dedup checks before a created meme can be promoted to ok.""" + meme_id = meme_row["id"] + telegram_file_id = meme_row.get("telegram_file_id") + if telegram_file_id: + duplicate_of = await find_duplicate_by_file_id(meme_id, telegram_file_id) + if duplicate_of: + resolution = await resolve_duplicate( + meme_id, + duplicate_of, + reason="telegram_file_id", + ) + return DeduplicationResult(meme_id, duplicate_of, "telegram_file_id", resolution) + + duplicate_of = await find_duplicate_by_ocr_text(meme_id, ocr_text_from_meme(meme_row)) + if duplicate_of: + resolution = await resolve_duplicate(meme_id, duplicate_of, reason="ocr_text") + return DeduplicationResult(meme_id, duplicate_of, "ocr_text", resolution) + + return DeduplicationResult(meme_id) + + +async def deduplicate_described_meme( + meme_id: int, + ocr_text: str, + *, + status: str | None, +) -> DeduplicationResult: + """Run OCR dedup after Describe Memes enriches an already-ok image.""" + if status != MemeStatus.OK.value: + 
return DeduplicationResult(meme_id) + + duplicate_of = await find_duplicate_by_ocr_text(meme_id, ocr_text) + if not duplicate_of: + return DeduplicationResult(meme_id) + + resolution = await resolve_duplicate(meme_id, duplicate_of, reason="ocr_text") + return DeduplicationResult(meme_id, duplicate_of, "ocr_text", resolution) diff --git a/src/storage/deduplication/resolver.py b/src/storage/deduplication/resolver.py new file mode 100644 index 00000000..e43843ac --- /dev/null +++ b/src/storage/deduplication/resolver.py @@ -0,0 +1,213 @@ +from typing import Any + +from sqlalchemy import text +from sqlalchemy.ext.asyncio import AsyncConnection + +from src.database import run_in_transaction +from src.stats.meme import calculate_meme_reactions_and_engagement_on_connection +from src.storage.deduplication.models import DuplicateResolution + + +async def _fetch_one( + conn: AsyncConnection, + query, + params: dict[str, Any] | None = None, +) -> dict[str, Any] | None: + result = await conn.execute(query, params or {}) + row = result.first() + return row._asdict() if row is not None else None + + +async def _count( + conn: AsyncConnection, + query, + params: dict[str, Any], + field: str, +) -> int: + row = await _fetch_one(conn, query, params) + return int(row[field]) if row else 0 + + +async def _canonical_original_id(conn: AsyncConnection, meme_id: int) -> int: + """Follow duplicate_of links so new duplicates point at a real original.""" + current_id = meme_id + seen = {meme_id} + + while True: + row = await _fetch_one( + conn, + text("SELECT id, duplicate_of FROM meme WHERE id = :meme_id"), + {"meme_id": current_id}, + ) + if not row or row["duplicate_of"] is None: + return current_id + + current_id = row["duplicate_of"] + if current_id in seen: + return current_id + seen.add(current_id) + + +async def resolve_duplicate( + dupe_id: int, + original_id: int, + *, + reason: str, +) -> DuplicateResolution: + """Mark a meme as duplicate and move all safe reaction history to the 
original.""" + + async def _resolve(conn: AsyncConnection) -> DuplicateResolution: + canonical_original_id = await _canonical_original_id(conn, original_id) + + reactions_moved = await _move_user_reactions(conn, dupe_id, canonical_original_id) + chat_reactions_moved = await _move_chat_reactions(conn, dupe_id, canonical_original_id) + reactions_dropped = await _delete_user_reactions(conn, dupe_id) + chat_reactions_dropped = await _delete_chat_reactions(conn, dupe_id) + + await conn.execute( + text("DELETE FROM meme_stats WHERE meme_id = :dupe_id"), + {"dupe_id": dupe_id}, + ) + await conn.execute( + text( + """ + UPDATE meme + SET status = 'duplicate', duplicate_of = :original_id + WHERE id = :dupe_id + """ + ), + {"dupe_id": dupe_id, "original_id": canonical_original_id}, + ) + await conn.execute( + text( + """ + UPDATE meme + SET duplicate_of = :original_id + WHERE duplicate_of = :dupe_id + """ + ), + {"dupe_id": dupe_id, "original_id": canonical_original_id}, + ) + await _refresh_original_stats(conn, canonical_original_id) + + return DuplicateResolution( + dupe_id=dupe_id, + original_id=canonical_original_id, + reason=reason, + reactions_moved=reactions_moved, + reactions_dropped=reactions_dropped, + chat_reactions_moved=chat_reactions_moved, + chat_reactions_dropped=chat_reactions_dropped, + ) + + return await run_in_transaction(_resolve) + + +async def _move_user_reactions( + conn: AsyncConnection, + dupe_id: int, + original_id: int, +) -> int: + return await _count( + conn, + text( + """ + WITH moved AS ( + INSERT INTO user_meme_reaction + (user_id, meme_id, recommended_by, sent_at, reaction_id, reacted_at) + SELECT user_id, :original_id, recommended_by, sent_at, reaction_id, reacted_at + FROM user_meme_reaction source + WHERE source.meme_id = :dupe_id + AND NOT EXISTS ( + SELECT 1 FROM user_meme_reaction existing + WHERE existing.user_id = source.user_id + AND existing.meme_id = :original_id + ) + ON CONFLICT (user_id, meme_id) DO NOTHING + RETURNING 1 + ) + 
SELECT count(*) AS moved FROM moved + """ + ), + {"dupe_id": dupe_id, "original_id": original_id}, + "moved", + ) + + +async def _move_chat_reactions( + conn: AsyncConnection, + dupe_id: int, + original_id: int, +) -> int: + return await _count( + conn, + text( + """ + WITH moved AS ( + INSERT INTO chat_meme_reaction + (chat_id, meme_id, user_id, reaction, reacted_at) + SELECT chat_id, :original_id, user_id, reaction, reacted_at + FROM chat_meme_reaction source + WHERE source.meme_id = :dupe_id + AND NOT EXISTS ( + SELECT 1 FROM chat_meme_reaction existing + WHERE existing.chat_id = source.chat_id + AND existing.user_id = source.user_id + AND existing.meme_id = :original_id + ) + ON CONFLICT (chat_id, meme_id, user_id) DO NOTHING + RETURNING 1 + ) + SELECT count(*) AS moved FROM moved + """ + ), + {"dupe_id": dupe_id, "original_id": original_id}, + "moved", + ) + + +async def _delete_user_reactions(conn: AsyncConnection, dupe_id: int) -> int: + return await _count( + conn, + text( + """ + WITH deleted AS ( + DELETE FROM user_meme_reaction WHERE meme_id = :dupe_id RETURNING 1 + ) + SELECT count(*) AS deleted FROM deleted + """ + ), + {"dupe_id": dupe_id}, + "deleted", + ) + + +async def _delete_chat_reactions(conn: AsyncConnection, dupe_id: int) -> int: + return await _count( + conn, + text( + """ + WITH deleted AS ( + DELETE FROM chat_meme_reaction WHERE meme_id = :dupe_id RETURNING 1 + ) + SELECT count(*) AS deleted FROM deleted + """ + ), + {"dupe_id": dupe_id}, + "deleted", + ) + + +async def refresh_original_stats(original_id: int) -> None: + async def _refresh(conn: AsyncConnection) -> None: + await _refresh_original_stats(conn, original_id) + + await run_in_transaction(_refresh) + + +async def _refresh_original_stats(conn: AsyncConnection, original_id: int) -> None: + await calculate_meme_reactions_and_engagement_on_connection( + conn, + meme_ids=[original_id], + include_user_history=True, + ) diff --git a/src/storage/deduplication/sweep.py 
b/src/storage/deduplication/sweep.py new file mode 100644 index 00000000..9fba8a96 --- /dev/null +++ b/src/storage/deduplication/sweep.py @@ -0,0 +1,63 @@ +from sqlalchemy import text + +from src.database import fetch_all +from src.storage.deduplication.resolver import resolve_duplicate + + +async def sweep_file_id_duplicates() -> dict[str, int]: + """Resolve any exact Telegram file_id duplicates that slipped past batch processing.""" + rows = await fetch_all( + text( + """ + WITH duplicate_groups AS ( + SELECT telegram_file_id + FROM meme + WHERE status IN ('ok', 'published') + AND telegram_file_id IS NOT NULL + GROUP BY telegram_file_id + HAVING count(*) > 1 + ), + canonical AS ( + SELECT DISTINCT ON (m.telegram_file_id) + m.telegram_file_id, + m.id AS original_id + FROM meme m + INNER JOIN duplicate_groups g + ON g.telegram_file_id = m.telegram_file_id + WHERE m.status IN ('ok', 'published') + ORDER BY + m.telegram_file_id, + CASE WHEN m.status = 'published' THEN 0 ELSE 1 END, + m.id ASC + ) + SELECT m.id, m.telegram_file_id, canonical.original_id + FROM meme m + INNER JOIN canonical + ON canonical.telegram_file_id = m.telegram_file_id + WHERE m.status = 'ok' + AND m.id != canonical.original_id + """ + ) + ) + + total_moved = 0 + total_dropped = 0 + total_resolved = 0 + + for row in rows: + if row["id"] == row["original_id"]: + continue + result = await resolve_duplicate( + row["id"], + row["original_id"], + reason="telegram_file_id_sweep", + ) + total_moved += result.reactions_moved + total_dropped += result.reactions_dropped + total_resolved += 1 + + return { + "resolved": total_resolved, + "reactions_moved": total_moved, + "reactions_dropped": total_dropped, + } diff --git a/src/storage/service.py b/src/storage/service.py index d899e44a..f2677977 100644 --- a/src/storage/service.py +++ b/src/storage/service.py @@ -4,7 +4,6 @@ from sqlalchemy import nulls_first, select, text from src.database import ( - execute, fetch_all, fetch_one, meme, @@ -375,8 +374,13 @@ 
async def get_unloaded_vk_memes(limit: int) -> list[dict[str, Any]]: return await fetch_all(text(select_query)) -async def update_meme_status_of_ready_memes() -> list[dict[str, Any]]: +async def update_meme_status_of_ready_memes( + meme_ids: list[int] | None = None, +) -> list[dict[str, Any]]: """Changes the status of memes to 'ok' if they are ready to be published.""" + if meme_ids is not None and len(meme_ids) == 0: + return [] + update_query = ( meme.update() .where(meme.c.status == MemeStatus.CREATED) @@ -385,159 +389,6 @@ async def update_meme_status_of_ready_memes() -> list[dict[str, Any]]: .values(status=MemeStatus.OK) .returning(meme) ) + if meme_ids is not None: + update_query = update_query.where(meme.c.id.in_(meme_ids)) return await fetch_all(update_query) - - -async def find_meme_duplicate_by_file_id(meme_id: int, telegram_file_id: str) -> int | None: - """Find an existing meme with the same telegram_file_id.""" - query = text( - """ - SELECT id FROM meme - WHERE telegram_file_id = :file_id - AND status IN ('ok', 'created') - AND id < :meme_id - ORDER BY id ASC - LIMIT 1 - """ - ) - res = await fetch_one(query, {"file_id": telegram_file_id, "meme_id": meme_id}) - if res: - return res["id"] - return None - - -async def find_meme_duplicate(meme_id: int, imagetext: str) -> int | None: - if len(imagetext) <= 11: # skip all memes with less than 11 letters - return None - - select_query = text( - """ - SELECT - M.id - FROM meme M - WHERE M.id < :meme_id - AND M.status = 'ok' - AND M.type = 'image' - AND M.ocr_result IS NOT NULL - AND (M.ocr_result ->> 'text') % :imagetext - ORDER BY M.id ASC - LIMIT 1 - """ - ).bindparams(meme_id=meme_id, imagetext=imagetext) - - res = await fetch_one(select_query) - if res: - return res["id"] - return None - - -async def resolve_meme_duplicate(dupe_id: int, original_id: int) -> dict[str, int]: - """Mark a meme as duplicate with full cleanup. - - 1. Move reactions from dupe → original (skip conflicts) - 2. 
Delete remaining reactions on dupe - 3. Delete meme_stats for dupe - 4. Set meme status='duplicate', duplicate_of=original_id - - Stats for original will auto-recalculate on next 5-15 min cycle. - Returns counts: {moved, conflicts, deleted_stats}. - """ - # 1. Move non-conflicting reactions to original - move_query = text( - """ - WITH moved AS ( - INSERT INTO user_meme_reaction - (user_id, meme_id, recommended_by, sent_at, reaction_id, reacted_at) - SELECT user_id, :original_id, recommended_by, sent_at, reaction_id, reacted_at - FROM user_meme_reaction - WHERE meme_id = :dupe_id - AND NOT EXISTS ( - SELECT 1 FROM user_meme_reaction existing - WHERE existing.user_id = user_meme_reaction.user_id - AND existing.meme_id = :original_id - ) - ON CONFLICT (user_id, meme_id) DO NOTHING - RETURNING 1 - ) - SELECT count(*) AS moved FROM moved - """ - ) - res = await fetch_one(move_query, {"dupe_id": dupe_id, "original_id": original_id}) - moved = res["moved"] if res else 0 - - # 2. Delete all reactions remaining on dupe (conflicts + already moved) - delete_reactions = text( - """ - WITH deleted AS ( - DELETE FROM user_meme_reaction WHERE meme_id = :dupe_id RETURNING 1 - ) - SELECT count(*) AS conflicts FROM deleted - """ - ) - res = await fetch_one(delete_reactions, {"dupe_id": dupe_id}) - conflicts = res["conflicts"] if res else 0 - - # 3. Delete meme_stats for dupe (stale, will not regenerate since no reactions) - await execute( - text("DELETE FROM meme_stats WHERE meme_id = :dupe_id"), - {"dupe_id": dupe_id}, - ) - - # 4. Mark meme as duplicate - await execute( - text( - """ - UPDATE meme - SET status = 'duplicate', duplicate_of = :original_id - WHERE id = :dupe_id - """ - ), - {"dupe_id": dupe_id, "original_id": original_id}, - ) - - return {"moved": moved, "conflicts": conflicts} - - -async def resolve_all_file_id_duplicates() -> dict[str, int]: - """Find and resolve all memes with duplicate telegram_file_id. 
- - For each group of memes sharing a file_id with status='ok': - keeps the oldest (smallest id), resolves the rest as duplicates. - Returns total counts. - """ - # Find all file_id duplicate groups - dupes_query = text( - """ - SELECT id, telegram_file_id, - FIRST_VALUE(id) OVER ( - PARTITION BY telegram_file_id ORDER BY id ASC - ) AS original_id - FROM meme - WHERE status = 'ok' - AND telegram_file_id IS NOT NULL - AND telegram_file_id IN ( - SELECT telegram_file_id FROM meme - WHERE status = 'ok' AND telegram_file_id IS NOT NULL - GROUP BY telegram_file_id HAVING count(*) > 1 - ) - """ - ) - rows = await fetch_all(dupes_query) - - total_moved = 0 - total_conflicts = 0 - total_resolved = 0 - - for row in rows: - if row["id"] == row["original_id"]: - continue # skip the keeper - result = await resolve_meme_duplicate(row["id"], row["original_id"]) - total_moved += result["moved"] - total_conflicts += result["conflicts"] - total_resolved += 1 - - return { - "resolved": total_resolved, - "reactions_moved": total_moved, - "reactions_dropped": total_conflicts, - } diff --git a/src/storage/upload.py b/src/storage/upload.py index e933f45e..2ff26abc 100644 --- a/src/storage/upload.py +++ b/src/storage/upload.py @@ -14,9 +14,9 @@ sentry_log_extra, ) from src.storage.constants import MemeStatus, MemeType +from src.storage.deduplication import find_duplicate_by_file_id from src.storage.parsers.constants import USER_AGENT from src.storage.service import ( - find_meme_duplicate_by_file_id, update_meme, ) from src.tgbot.bot import bot @@ -107,8 +107,8 @@ async def _upload_meme_content_to_tg( if not file_id: return None - # Check if this file_id already exists on another ok meme (cross-source dupe) - duplicate_of = await find_meme_duplicate_by_file_id(meme_id, file_id) + # Check if this file_id already exists before the meme reaches recommendations. 
+ duplicate_of = await find_duplicate_by_file_id(meme_id, file_id) if duplicate_of: logging.info( "Meme %s is a file_id duplicate of meme %s, marking as duplicate.", diff --git a/src/tgbot/handlers/stats/wrapped.py b/src/tgbot/handlers/stats/wrapped.py index 87e8e0c7..f265b59b 100644 --- a/src/tgbot/handlers/stats/wrapped.py +++ b/src/tgbot/handlers/stats/wrapped.py @@ -1,27 +1,26 @@ import asyncio import datetime -import json import logging import random import sys from html import escape as html_escape from urllib.parse import quote -from openai import AsyncOpenAI from telegram import InlineKeyboardButton, InlineKeyboardMarkup, Update from telegram.constants import ChatAction from telegram.ext import ContextTypes -from src.config import settings from src.localizer import ALMOST_CIS_LANGUAGES from src.redis import get_user_wrapped, set_user_wrapped from src.stats.service import ( get_meme_descriptions_for_wrapped, - get_most_liked_meme_source_urls, - get_top_meme_source_urls, get_user_stats, ) from src.storage.schemas import MemeData +from src.tgbot.handlers.stats.wrapped_generation import ( + generate_wrapped_data, + get_bot_usage_report, +) from src.tgbot.senders.meme import send_new_message_with_meme from src.tgbot.service import ( create_or_update_user, @@ -74,18 +73,6 @@ "Try again →", ] -ABSURD_CATEGORIES = [ - "бытовая техника", - "животное", - "блюдо/еда", - "музыкальный жанр", - "вид транспорта", - "напиток", - "предмет мебели", - "персонаж мультфильма", - "погода", -] - def _log(msg: str) -> None: """Force-log to stderr (bypasses gunicorn log config).""" @@ -125,230 +112,6 @@ def _next_label(is_ru: bool) -> str: return "Дальше →" if is_ru else "Next →" -# ── LLM ────────────────────────────────────────────────── - - -async def call_deepseek(prompt: str) -> str: - client = AsyncOpenAI( - api_key=settings.DEEPSEEK_API_KEY, - base_url=settings.DEEPSEEK_BASE_URL, - ) - resp = await client.chat.completions.create( - model="deepseek-chat", - messages=[{"role": 
"user", "content": prompt}], - max_tokens=2000, - temperature=0.9, - ) - return resp.choices[0].message.content - - -def parse_json_from_llm(raw: str) -> dict | None: - c = raw.strip() - if c.startswith("```"): - c = c.split("\n", 1)[1] if "\n" in c else c[3:] - if c.endswith("```"): - c = c[:-3] - c = c.strip() - if c.startswith("json"): - c = c[4:].strip() - try: - return json.loads(c) - except Exception: - return None - - -# ── SQL INSIGHTS ───────────────────────────────────────── - - -async def get_reaction_speed_insight(user_id: int) -> dict: - """Median reaction time, split by like/dislike. Pure SQL.""" - from sqlalchemy import text - - from src.database import fetch_one - - row = await fetch_one( - text( - """ - WITH reactions AS ( - SELECT - EXTRACT(EPOCH FROM (reacted_at - sent_at)) AS sec, - reaction_id - FROM user_meme_reaction - WHERE user_id = :user_id - AND reacted_at IS NOT NULL AND sent_at IS NOT NULL - AND EXTRACT(EPOCH FROM (reacted_at - sent_at)) - BETWEEN 0.5 AND 120 - ) - SELECT - PERCENTILE_CONT(0.5) WITHIN GROUP ( - ORDER BY sec - ) AS median_sec, - PERCENTILE_CONT(0.5) WITHIN GROUP ( - ORDER BY sec - ) FILTER (WHERE reaction_id = 1) AS median_like, - PERCENTILE_CONT(0.5) WITHIN GROUP ( - ORDER BY sec - ) FILTER (WHERE reaction_id = 2) AS median_dislike - FROM reactions - """ - ), - {"user_id": user_id}, - ) - - if not row or row["median_sec"] is None: - return {} - return { - "median_sec": round(float(row["median_sec"]), 1), - "median_like": round(float(row["median_like"] or 0), 1), - "median_dislike": round(float(row["median_dislike"] or 0), 1), - } - - -async def get_peak_hour_insight(user_id: int, is_ru: bool = True) -> dict: - """Peak activity hour. 
Moscow time for RU, UTC for EN.""" - from sqlalchemy import text - - from src.database import fetch_one - - # UTC+3 for Russian users - tz_offset = 3 if is_ru else 0 - row = await fetch_one( - text( - f""" - SELECT - EXTRACT(HOUR FROM reacted_at + interval '{tz_offset} hours') - AS peak_hour, - COUNT(*) AS cnt - FROM user_meme_reaction - WHERE user_id = :user_id AND reacted_at IS NOT NULL - GROUP BY 1 ORDER BY 2 DESC LIMIT 1 - """ - ), - {"user_id": user_id}, - ) - - if not row: - return {} - hour = int(row["peak_hour"]) - if is_ru: - labels = { - (0, 6): "ночной скроллер 🌙", - (6, 10): "утренний мемолюб ☀️", - (10, 14): "дневной прокрастинатор 💼", - (14, 18): "послеобеденный залипатель 🍕", - (18, 22): "вечерний мемоман 🌆", - (22, 24): "полуночный скроллер 🦉", - } - default_label = "мемоман" - else: - labels = { - (0, 6): "night scroller 🌙", - (6, 10): "morning meme lover ☀️", - (10, 14): "daytime procrastinator 💼", - (14, 18): "afternoon meme addict 🍕", - (18, 22): "evening meme connoisseur 🌆", - (22, 24): "midnight scroller 🦉", - } - default_label = "meme lover" - label = next( - (v for (lo, hi), v in labels.items() if lo <= hour < hi), - default_label, - ) - tz_label = "МСК" if is_ru else "UTC" - return {"hour": hour, "label": label, "tz": tz_label} - - -async def get_surprise_meme(user_id: int) -> dict | None: - """Meme user liked but most others didn't.""" - from sqlalchemy import text - - from src.database import fetch_one - - row = await fetch_one( - text( - """ - SELECT m.id AS meme_id, m.type, m.telegram_file_id, - ROUND(COALESCE(ms.lr_smoothed, 0.5) * 100) - AS global_lr_pct - FROM user_meme_reaction umr - JOIN meme m ON m.id = umr.meme_id - LEFT JOIN meme_stats ms ON ms.meme_id = m.id - WHERE umr.user_id = :user_id - AND umr.reaction_id = 1 - AND m.telegram_file_id IS NOT NULL - AND COALESCE(ms.lr_smoothed, 0.5) < 0.35 - AND COALESCE(ms.nmemes_sent, 0) >= 10 - ORDER BY ms.lr_smoothed ASC LIMIT 1 - """ - ), - {"user_id": user_id}, - ) - if not row: - 
return None - return dict(row) - - -async def get_most_popular_liked_meme(user_id: int) -> dict | None: - """Meme user liked with highest global like rate.""" - from sqlalchemy import text - - from src.database import fetch_one - - row = await fetch_one( - text( - """ - SELECT m.id AS meme_id, m.type, m.telegram_file_id, - ROUND(COALESCE(ms.lr_smoothed, 0.5) * 100) - AS global_lr_pct, - COALESCE(ms.nlikes, 0) AS nlikes - FROM user_meme_reaction umr - JOIN meme m ON m.id = umr.meme_id - LEFT JOIN meme_stats ms ON ms.meme_id = m.id - WHERE umr.user_id = :user_id - AND umr.reaction_id = 1 - AND m.telegram_file_id IS NOT NULL - AND COALESCE(ms.nmemes_sent, 0) >= 10 - ORDER BY ms.lr_smoothed DESC LIMIT 1 - """ - ), - {"user_id": user_id}, - ) - if not row: - return None - return dict(row) - - -async def get_unpopular_opinion_meme(user_id: int) -> dict | None: - """Meme user disliked but was very popular globally.""" - from sqlalchemy import text - - from src.database import fetch_one - - row = await fetch_one( - text( - """ - SELECT m.id AS meme_id, m.type, m.telegram_file_id, - ROUND(COALESCE(ms.lr_smoothed, 0.5) * 100) - AS global_lr_pct, - COALESCE(ms.nlikes, 0) AS nlikes - FROM user_meme_reaction umr - JOIN meme m ON m.id = umr.meme_id - LEFT JOIN meme_stats ms ON ms.meme_id = m.id - WHERE umr.user_id = :user_id - AND umr.reaction_id = 2 - AND m.telegram_file_id IS NOT NULL - AND COALESCE(ms.lr_smoothed, 0.5) > 0.65 - AND COALESCE(ms.nmemes_sent, 0) >= 10 - ORDER BY ms.lr_smoothed DESC LIMIT 1 - """ - ), - {"user_id": user_id}, - ) - if not row: - return None - return dict(row) - - # ── MAIN HANDLER ───────────────────────────────────────── @@ -952,475 +715,6 @@ async def handle_wrapped_clear( await update.message.reply_text("Cache cleared ✓ /wrapped") -# ── GENERATION ─────────────────────────────────────────── - - -async def generate_wrapped_data( - user_id: int, - descriptions: list, - lang: str, - stats_report: str, -) -> dict | None: - # Lock is already set by 
_generate_and_cache (with is_ru), don't overwrite it - - try: - liked = [d for d in descriptions if d.get("reaction_id") == 1] - disliked = [d for d in descriptions if d.get("reaction_id") == 2] - - liked_texts = "\n".join( - f"[{i}] ✅ {d.get('description') or d.get('ocr_text', '')}" - for i, d in enumerate(liked[:25]) - ) - disliked_texts = "\n".join( - f"❌ {d.get('description') or d.get('ocr_text', '')}" for d in disliked[:15] - ) - - # DeepSeek + SQL in parallel - is_ru = _is_ru(lang) - prompt = _build_mega_prompt(liked_texts, disliked_texts, lang) - - deepseek_task = asyncio.create_task(call_deepseek(prompt)) - sql_tasks = asyncio.gather( - _safe(get_reaction_speed_insight(user_id)), - _safe(get_peak_hour_insight(user_id, is_ru)), - _safe(get_surprise_meme(user_id)), - _safe(_build_sources_report(user_id, is_ru)), - _safe(get_most_popular_liked_meme(user_id)), - _safe(get_unpopular_opinion_meme(user_id)), - ) - - raw, (speed, peak, surprise, sources, popular_meme, unpopular_meme) = await asyncio.gather( - deepseek_task, sql_tasks - ) - - p = parse_json_from_llm(raw) - if not p: - logger.warning( - "DeepSeek JSON failed user %d: %s", - user_id, - raw[:300], - ) - p = {} - - your_meme = _pick_meme(p, liked) - - # Use surprise meme if LLM didn't pick one - if not your_meme and surprise: - lr = surprise.get("global_lr_pct", "?") - if is_ru: - cap = f"🎲 Этот мем лайкнул только ты\n(глобальный лайк-рейт: {lr}%)" - else: - cap = f"🎲 Only you liked this meme\n(global like rate: {lr}%)" - your_meme = {"meme_id": surprise["meme_id"], "caption": cap} - if not your_meme and liked: - pick = random.choice(liked[:10]) - cap = "🎲 А вот мем, который тебе зашёл:" if is_ru else "🎲 Here's a meme you liked:" - your_meme = {"meme_id": pick["meme_id"], "caption": cap} - - # Build slides - # Stats report gets vibe from DeepSeek — replace placeholder vibe - vibe = p.get("vibe", "") - if vibe and stats_report: - if "\n" in stats_report: - idx = stats_report.rfind("\n") - stats_report = 
stats_report[:idx] - stats_report += f"\n\n{html_escape(vibe)}" - - # Track used meme IDs globally to avoid showing the same meme twice - global_used_memes = set() - if your_meme and your_meme.get("meme_id"): - global_used_memes.add(your_meme["meme_id"]) - - # Pick oneliner meme (avoid your_meme) - oneliner_meme_id = None - if liked: - oneliner_candidates = [m for m in liked[:10] if m["meme_id"] not in global_used_memes] - if oneliner_candidates: - oneliner_meme_id = random.choice(oneliner_candidates)["meme_id"] - else: - oneliner_meme_id = random.choice(liked[:10])["meme_id"] - global_used_memes.add(oneliner_meme_id) - - # Pick memes for absurd comparisons (avoid already used) - absurd_memes = _attach_memes_to_absurd(p, liked, global_used_memes) - - default_prediction = ( - "Летом ты будешь листать мемы вместо работы 🔥" - if is_ru - else "This summer you'll scroll memes instead of working 🔥" - ) - return { - "stats_report": stats_report, - "zodiac": _build_zodiac_slide(p, is_ru), - "your_meme": your_meme, - "humor_dna": _build_humor_dna_slide(p, is_ru), - "humor_oneliner": p.get("humor_oneliner", ""), - "oneliner_meme_id": oneliner_meme_id, - "absurd_items": absurd_memes, - "anti_profile": _build_anti_slide(p, is_ru), - "popular_meme": _build_meme_data(popular_meme, is_popular=True, is_ru=is_ru), - "unpopular_meme": _build_meme_data(unpopular_meme, is_popular=False, is_ru=is_ru), - "stats_extra": _build_extra_slide(sources, speed, peak, is_ru), - "prediction": p.get("prediction", default_prediction), - } - except Exception as e: - logger.error("Wrapped failed user %d: %s", user_id, e, exc_info=True) - default_prediction = ( - "Летом ты будешь листать мемы вместо работы 🔥" - if is_ru - else "This summer you'll scroll memes instead of working 🔥" - ) - return { - "stats_report": stats_report, - "zodiac": "", - "your_meme": None, - "humor_dna": "", - "humor_oneliner": "", - "oneliner_meme_id": None, - "absurd_items": [], - "anti_profile": "", - "popular_meme": None, - 
"unpopular_meme": None, - "stats_extra": "", - "prediction": default_prediction, - } - - -async def _safe(coro): - try: - return await coro - except Exception as e: - logger.warning("Wrapped SQL insight failed: %s", e) - return {} if not isinstance(e, TypeError) else None - - -def _build_mega_prompt(liked_texts: str, disliked_texts: str, lang: str = "ru") -> str: - categories = random.sample(ABSURD_CATEGORIES, 3) - - lang_instruction = "" - if lang != "ru": - lang_name = "English" if lang == "en" else lang - lang_instruction = f"\n- ЯЗЫК: пиши ВЕСЬ JSON на {lang_name}" - - return f"""Ты мем-психолог. Проанализируй чувство юмора. - -ЛАЙКНУТЫЕ МЕМЫ: -{liked_texts} - -СКИПНУТЫЕ МЕМЫ: -{disliked_texts} - -Сначала молча найди: -1) 2-3 самые частые мотивы в лайках (офис, животные, кринж, токсичная мотивация, low-res chaos, семейная драма, etc.) -2) 1-2 мотива, которые человек стабильно скипает -3) 1 противоречие между лайками и скипами -Рассуждения НЕ выводи. Только JSON. - -Верни ТОЛЬКО JSON: -{{ - "vibe": "подкол от друга по мемам, 10-15 слов", - "meme_index": число (индекс лайкнутого мема [N], который олицетворяет), - "meme_caption": "почему этот мем — это ты (2 предложения, подкол)", - "zodiac_sign": "знак зодиака + эмодзи (♈♉♊♋♌♍♎♏♐♑♒♓)", - "zodiac_why": "1-2 предложения. Выбирай знак НЕ по характеру, \ -а по ЛОГИКЕ мемов. Упомяни конкретный мотив.", - "humor_dna": [ - {{"name": "категория", "pct": число}}, - {{"name": "категория", "pct": число}}, - {{"name": "категория", "pct": число}}, - {{"name": "категория", "pct": число}}, - {{"name": "категория", "pct": число}} - ], - "humor_oneliner": "4-8 слов. Ярлык мем-вкуса, не комплимент. \ -Как кличка от друга, не описание из гороскопа.", - "anti_profile": "2-3 коротких абзаца через \\n\\n. \ -На ТЫ: 'ты терпеть не можешь...'. Конкретно. 
\ -Последний абзац ОБЯЗАТЕЛЬНО позитивный — что в этом крутого, \ -почему такой вкус в мемах это кайф.", - "absurd_comparisons": [ - {{"category": "{categories[0]}", "thing": "конкретный предмет", \ -"why": "потому что ты лайкаешь X и Y — 1 предложение", \ -"meme_ref": число}}, - {{"category": "{categories[1]}", "thing": "конкретный предмет", \ -"why": "1 предложение", "meme_ref": число}}, - {{"category": "{categories[2]}", "thing": "конкретный предмет", \ -"why": "1 предложение", "meme_ref": число}} - ], - "prediction": "конкретное абсурдное событие на лето 2026. 1-2 предложения." -}} - -Правила: -- humor_dna: 5 конкретных прикольных категорий по 2-3 слова, проценты ~100 -- zodiac: знак как метафора мемного поведения, не "кто он по жизни". \ -ВАЖНО: НЕ БЛИЗНЕЦЫ. Близнецы — запрещённый знак. Выбирай из остальных 11 знаков. \ -Привязывай знак к КОНКРЕТНЫМ паттернам в мемах (например: Овен если агрессивный юмор, \ -Рыбы если меланхолия, Лев если самоирония, Козерог если сухой юмор, и т.д.) -- absurd_comparisons: thing = конкретный предмет (не "хаос-машина"). \ -Каждый comparison на ДРУГИХ мотивах, не повторяй шутку. \ -meme_ref ДОЛЖЕН быть РАЗНЫМ для каждого comparison (три разных числа!) -- meme_ref: индекс [N] из ЛАЙКНУТЫХ мемов. Каждый meme_ref уникален! -- meme_index: ДОЛЖЕН отличаться от всех meme_ref в absurd_comparisons - -АНТИСЛОП: -- ЗАПРЕЩЕНЫ слова: уникальный, особенный, тонкий, изысканный, многогранный, хаотичный, вайб, ирония, абсурд (без конкретики) -- ЗАПРЕЩЕНЫ шаблоны: "ты из тех, кто...", "генерал постиронии", "ценитель абсурда" -- Подкалывай дружески, но ВСЕГДА заканчивай на позитивной ноте. \ -Человек должен улыбнуться, а не расстроиться. \ -Формула: подкол + комплимент ("ты залипаешь на X — но это потому что у тебя Y"). \ -Если мемы пользователя про грусть, депрессию, одиночество — будь мягче и теплее. 
\ -Не подчёркивай негатив, а покажи что юмор помогает справляться -- Каждое утверждение ДОЛЖНО опираться на конкретный мем -- Если шутка подошла бы любому — перепиши -- Лучший юмор = противоречия: "лайкаешь X, но скипаешь Y"{lang_instruction}""" - - -def _pick_meme(p: dict, liked: list) -> dict | None: - idx = p.get("meme_index") - cap = p.get("meme_caption", "🎯 Этот мем олицетворяет тебя") - if idx is not None and 0 <= idx < len(liked): - return { - "meme_id": liked[idx]["meme_id"], - "caption": f"🎯 Этот мем олицетворяет тебя:\n\n{html_escape(cap)}", - } - return None - - -def _build_humor_dna_slide(p: dict, is_ru: bool = True) -> str: - """Humor DNA bars only — no roast text.""" - dna = p.get("humor_dna", []) - - def bar(pct): - f = round(pct / 10) - return "█" * f + "░" * (10 - f) - - header = "🧬 Твоя ДНК юмора:" if is_ru else "🧬 Your Humor DNA:" - lines = [header + "\n"] - for c in dna[:5]: - pct = min(100, max(0, c.get("pct", 33))) - lines.append(f"{bar(pct)} {pct}%\n{html_escape(c.get('name', '???'))}\n") - - return "\n".join(lines) if len(lines) > 1 else "" - - -def _build_zodiac_slide(p: dict, is_ru: bool = True) -> str: - sign = p.get("zodiac_sign", "") - why = p.get("zodiac_why", "") - if not sign: - return "" - header = "🔮 Твой мем-зодиак:" if is_ru else "🔮 Your Meme Zodiac:" - return f"{header}\n\n{html_escape(sign)}\n\n{html_escape(why)}" - - -def _attach_memes_to_absurd(p: dict, liked: list, used_ids: set | None = None) -> list: - """Attach meme IDs to each absurd comparison, ensuring no duplicates.""" - comparisons = p.get("absurd_comparisons", []) - result = [] - if used_ids is None: - used_ids = set() - else: - used_ids = set(used_ids) # don't mutate caller's set - for c in comparisons[:3]: - meme_id = None - # Try LLM-suggested meme_ref (but skip if already used) - ref = c.get("meme_ref") - if ref is not None and isinstance(ref, int) and 0 <= ref < len(liked): - candidate = liked[ref]["meme_id"] - if candidate not in used_ids: - meme_id = 
candidate - # Fallback: random liked meme not yet used - if not meme_id and liked: - available = [m for m in liked[:15] if m["meme_id"] not in used_ids] - if available: - pick = random.choice(available) - meme_id = pick["meme_id"] - if meme_id: - used_ids.add(meme_id) - result.append( - { - "category": c.get("category", "?"), - "thing": c.get("thing", "?"), - "why": c.get("why", ""), - "meme_id": meme_id, - } - ) - return result - - -def _build_meme_data(meme: dict | None, is_popular: bool, is_ru: bool = True) -> dict | None: - if not meme: - return None - lr = meme.get("global_lr_pct", "?") - nlikes = meme.get("nlikes") - if is_popular: - if is_ru: - extra = f" ({nlikes} чел.)" if nlikes else "" - caption = f"🏆 Самый залайканный мем из твоих лайков!\n\nЕго лайкнули {lr}%{extra}" - else: - extra = f" ({nlikes} people)" if nlikes else "" - caption = f"🏆 The most liked meme from your likes!\n\nLiked by {lr}%{extra}" - else: - if is_ru: - extra = f" ({nlikes} чел.)" if nlikes else "" - caption = f"🤔 А этот мем ты скипнул...\n\nХотя его лайкнули {lr}%{extra}!" - else: - extra = f" ({nlikes} people)" if nlikes else "" - caption = f"🤔 You skipped this one...\n\nBut {lr}%{extra} liked it!" 
- return {"meme_id": meme["meme_id"], "caption": caption} - - -def _build_anti_slide(p: dict, is_ru: bool = True) -> str: - anti = p.get("anti_profile", "") - if not anti: - return "" - header = ( - "🚫 Что говорят твои скипы:" if is_ru else "🚫 What your skips say about you:" - ) - return f"{header}\n\n{html_escape(anti)}" - - -def _build_extra_slide( - sources: str, - speed: dict, - peak: dict, - is_ru: bool = True, -) -> str: - parts = [] - if sources: - parts.append(sources) - - if speed: - med = speed.get("median_sec", 0) - ml = speed.get("median_like", 0) - md = speed.get("median_dislike", 0) - if is_ru: - parts.append( - f"⚡ Скорость реакции: {med} сек\n(до лайка: {ml} сек, до скипа: {md} сек)" - ) - else: - parts.append(f"⚡ Reaction speed: {med}s\n(to like: {ml}s, to skip: {md}s)") - - if peak: - h = peak.get("hour", 0) - label = peak.get("label", "") - tz = peak.get("tz", "") - if is_ru: - parts.append(f"🕐 Пик активности: {h}:00 {tz}\nТы — {label}") - else: - parts.append(f"🕐 Peak activity: {h}:00 {tz}\nYou're a {label}") - - return "\n\n".join(parts) if parts else "" - - -async def _build_sources_report(user_id: int, is_ru: bool = True) -> str: - sources = await get_most_liked_meme_source_urls(user_id, limit=10) - real = [ - s - for s in (sources or []) - if s.get("url") - and not s["url"].startswith("tg://user") - and ("t.me/" in s["url"] or "vk.com/" in s["url"]) - ] - if len(real) < 3: - try: - top = await get_top_meme_source_urls(limit=5) - for t in top or []: - if ( - t.get("url") - and not t["url"].startswith("tg://user") - and t["url"] not in [s["url"] for s in real] - ): - real.append(t) - if len(real) >= 3: - break - except Exception: - pass - if not real: - return "" - src_list = "\n".join(f"▪️ {s['url']}" for s in real[:3]) - header = "📡 Твои топ мем-паблики:" if is_ru else "📡 Your top meme channels:" - return f"{header}\n\n{src_list}" - - -# ── STATS SLIDE ────────────────────────────────────────── - - -async def get_bot_usage_report( - user_id: 
int, - user_stats: dict, - user: dict, - is_ru: bool = True, -) -> str | None: - if user_stats is None: - return None - - days = (datetime.datetime.utcnow() - user["created_at"]).days + 1 - sessions = user_stats.get("nsessions", 0) - memes_sent = user_stats.get("nmemes_sent", 0) - likes = user_stats.get("nlikes", 0) - time_sec = user_stats.get("time_spent_sec", 0) - - if likes < 10: - return None - - like_rate = round(100 * likes / max(memes_sent, 1)) - - if is_ru: - report = ( - "📊 Meme Wrapped 2026\n\n" - "Начнём с цифр.\n\n" - f"Ты с нами уже {days} дней.\n\n" - f"🤝 Посмотрел {memes_sent} мемов\n" - f"👍 Лайкнул {likes} из них " - f"({like_rate}%)\n" - f"👋 Заходил {sessions} раз\n" - ) - if time_sec > 0: - if time_sec < 60: - t = f"{time_sec} сек" - elif time_sec < 3600: - t = f"{time_sec // 60} мин {time_sec % 60} сек" - else: - t = f"больше {time_sec // 3600} часов 😳" - report += f"🕒 В боте {t}\n" - if like_rate > 50: - vibe = "Лайкаешь больше половины — тебе всё смешно 😄" - elif like_rate > 30: - vibe = "Лайкаешь каждый третий — у тебя есть вкус 👌" - elif like_rate > 15: - vibe = "Лайкаешь каждый пятый — избирательный 🧐" - else: - vibe = "Менее 15% мемов достойны — мем-сноб 🎩" - else: - report = ( - "📊 Meme Wrapped 2026\n\n" - "Let's start with the numbers.\n\n" - f"You've been with us for {days} days.\n\n" - f"🤝 Seen {memes_sent} memes\n" - f"👍 Liked {likes} of them " - f"({like_rate}%)\n" - f"👋 Visited {sessions} times\n" - ) - if time_sec > 0: - if time_sec < 60: - t = f"{time_sec}s" - elif time_sec < 3600: - t = f"{time_sec // 60}m {time_sec % 60}s" - else: - t = f"over {time_sec // 3600} hours 😳" - report += f"🕒 Time in bot: {t}\n" - if like_rate > 50: - vibe = "You like more than half — everything's funny to you 😄" - elif like_rate > 30: - vibe = "You like every third one — you've got taste 👌" - elif like_rate > 15: - vibe = "You like every fifth one — picky 🧐" - else: - vibe = "Less than 15% are worthy — meme snob 🎩" - - report += f"\n{vibe}" - return 
report - - def get_user_interface_language(user) -> str: lang = user.get("language_code") if user else None return lang if lang else "ru" diff --git a/src/tgbot/handlers/stats/wrapped_generation.py b/src/tgbot/handlers/stats/wrapped_generation.py new file mode 100644 index 00000000..ad313d72 --- /dev/null +++ b/src/tgbot/handlers/stats/wrapped_generation.py @@ -0,0 +1,728 @@ +import asyncio +import datetime +import json +import logging +import random +from html import escape as html_escape + +from openai import AsyncOpenAI + +from src.config import settings +from src.localizer import ALMOST_CIS_LANGUAGES +from src.stats.service import ( + get_most_liked_meme_source_urls, + get_top_meme_source_urls, +) + +logger = logging.getLogger(__name__) + + +def _is_ru(lang_code: str | None) -> bool: + return (lang_code or "ru") in ALMOST_CIS_LANGUAGES + + +ABSURD_CATEGORIES = [ + "бытовая техника", + "животное", + "блюдо/еда", + "музыкальный жанр", + "вид транспорта", + "напиток", + "предмет мебели", + "персонаж мультфильма", + "погода", +] + +# ── LLM ────────────────────────────────────────────────── + + +async def call_deepseek(prompt: str) -> str: + client = AsyncOpenAI( + api_key=settings.DEEPSEEK_API_KEY, + base_url=settings.DEEPSEEK_BASE_URL, + ) + resp = await client.chat.completions.create( + model="deepseek-chat", + messages=[{"role": "user", "content": prompt}], + max_tokens=2000, + temperature=0.9, + ) + return resp.choices[0].message.content + + +def parse_json_from_llm(raw: str) -> dict | None: + c = raw.strip() + if c.startswith("```"): + c = c.split("\n", 1)[1] if "\n" in c else c[3:] + if c.endswith("```"): + c = c[:-3] + c = c.strip() + if c.startswith("json"): + c = c[4:].strip() + try: + return json.loads(c) + except Exception: + return None + + +# ── SQL INSIGHTS ───────────────────────────────────────── + + +async def get_reaction_speed_insight(user_id: int) -> dict: + """Median reaction time, split by like/dislike. 
Pure SQL.""" + from sqlalchemy import text + + from src.database import fetch_one + + row = await fetch_one( + text( + """ + WITH reactions AS ( + SELECT + EXTRACT(EPOCH FROM (reacted_at - sent_at)) AS sec, + reaction_id + FROM user_meme_reaction + WHERE user_id = :user_id + AND reacted_at IS NOT NULL AND sent_at IS NOT NULL + AND EXTRACT(EPOCH FROM (reacted_at - sent_at)) + BETWEEN 0.5 AND 120 + ) + SELECT + PERCENTILE_CONT(0.5) WITHIN GROUP ( + ORDER BY sec + ) AS median_sec, + PERCENTILE_CONT(0.5) WITHIN GROUP ( + ORDER BY sec + ) FILTER (WHERE reaction_id = 1) AS median_like, + PERCENTILE_CONT(0.5) WITHIN GROUP ( + ORDER BY sec + ) FILTER (WHERE reaction_id = 2) AS median_dislike + FROM reactions + """ + ), + {"user_id": user_id}, + ) + + if not row or row["median_sec"] is None: + return {} + return { + "median_sec": round(float(row["median_sec"]), 1), + "median_like": round(float(row["median_like"] or 0), 1), + "median_dislike": round(float(row["median_dislike"] or 0), 1), + } + + +async def get_peak_hour_insight(user_id: int, is_ru: bool = True) -> dict: + """Peak activity hour. 
Moscow time for RU, UTC for EN.""" + from sqlalchemy import text + + from src.database import fetch_one + + # UTC+3 for Russian users + tz_offset = 3 if is_ru else 0 + row = await fetch_one( + text( + f""" + SELECT + EXTRACT(HOUR FROM reacted_at + interval '{tz_offset} hours') + AS peak_hour, + COUNT(*) AS cnt + FROM user_meme_reaction + WHERE user_id = :user_id AND reacted_at IS NOT NULL + GROUP BY 1 ORDER BY 2 DESC LIMIT 1 + """ + ), + {"user_id": user_id}, + ) + + if not row: + return {} + hour = int(row["peak_hour"]) + if is_ru: + labels = { + (0, 6): "ночной скроллер 🌙", + (6, 10): "утренний мемолюб ☀️", + (10, 14): "дневной прокрастинатор 💼", + (14, 18): "послеобеденный залипатель 🍕", + (18, 22): "вечерний мемоман 🌆", + (22, 24): "полуночный скроллер 🦉", + } + default_label = "мемоман" + else: + labels = { + (0, 6): "night scroller 🌙", + (6, 10): "morning meme lover ☀️", + (10, 14): "daytime procrastinator 💼", + (14, 18): "afternoon meme addict 🍕", + (18, 22): "evening meme connoisseur 🌆", + (22, 24): "midnight scroller 🦉", + } + default_label = "meme lover" + label = next( + (v for (lo, hi), v in labels.items() if lo <= hour < hi), + default_label, + ) + tz_label = "МСК" if is_ru else "UTC" + return {"hour": hour, "label": label, "tz": tz_label} + + +async def get_surprise_meme(user_id: int) -> dict | None: + """Meme user liked but most others didn't.""" + from sqlalchemy import text + + from src.database import fetch_one + + row = await fetch_one( + text( + """ + SELECT m.id AS meme_id, m.type, m.telegram_file_id, + ROUND(COALESCE(ms.lr_smoothed, 0.5) * 100) + AS global_lr_pct + FROM user_meme_reaction umr + JOIN meme m ON m.id = umr.meme_id + LEFT JOIN meme_stats ms ON ms.meme_id = m.id + WHERE umr.user_id = :user_id + AND umr.reaction_id = 1 + AND m.telegram_file_id IS NOT NULL + AND COALESCE(ms.lr_smoothed, 0.5) < 0.35 + AND COALESCE(ms.nmemes_sent, 0) >= 10 + ORDER BY ms.lr_smoothed ASC LIMIT 1 + """ + ), + {"user_id": user_id}, + ) + if not row: + 
return None + return dict(row) + + +async def get_most_popular_liked_meme(user_id: int) -> dict | None: + """Meme user liked with highest global like rate.""" + from sqlalchemy import text + + from src.database import fetch_one + + row = await fetch_one( + text( + """ + SELECT m.id AS meme_id, m.type, m.telegram_file_id, + ROUND(COALESCE(ms.lr_smoothed, 0.5) * 100) + AS global_lr_pct, + COALESCE(ms.nlikes, 0) AS nlikes + FROM user_meme_reaction umr + JOIN meme m ON m.id = umr.meme_id + LEFT JOIN meme_stats ms ON ms.meme_id = m.id + WHERE umr.user_id = :user_id + AND umr.reaction_id = 1 + AND m.telegram_file_id IS NOT NULL + AND COALESCE(ms.nmemes_sent, 0) >= 10 + ORDER BY ms.lr_smoothed DESC LIMIT 1 + """ + ), + {"user_id": user_id}, + ) + if not row: + return None + return dict(row) + + +async def get_unpopular_opinion_meme(user_id: int) -> dict | None: + """Meme user disliked but was very popular globally.""" + from sqlalchemy import text + + from src.database import fetch_one + + row = await fetch_one( + text( + """ + SELECT m.id AS meme_id, m.type, m.telegram_file_id, + ROUND(COALESCE(ms.lr_smoothed, 0.5) * 100) + AS global_lr_pct, + COALESCE(ms.nlikes, 0) AS nlikes + FROM user_meme_reaction umr + JOIN meme m ON m.id = umr.meme_id + LEFT JOIN meme_stats ms ON ms.meme_id = m.id + WHERE umr.user_id = :user_id + AND umr.reaction_id = 2 + AND m.telegram_file_id IS NOT NULL + AND COALESCE(ms.lr_smoothed, 0.5) > 0.65 + AND COALESCE(ms.nmemes_sent, 0) >= 10 + ORDER BY ms.lr_smoothed DESC LIMIT 1 + """ + ), + {"user_id": user_id}, + ) + if not row: + return None + return dict(row) + + +# ── GENERATION ─────────────────────────────────────────── + + +async def generate_wrapped_data( + user_id: int, + descriptions: list, + lang: str, + stats_report: str, +) -> dict | None: + # Lock is already set by _generate_and_cache (with is_ru), don't overwrite it + + try: + liked = [d for d in descriptions if d.get("reaction_id") == 1] + disliked = [d for d in descriptions if 
d.get("reaction_id") == 2] + + liked_texts = "\n".join( + f"[{i}] ✅ {d.get('description') or d.get('ocr_text', '')}" + for i, d in enumerate(liked[:25]) + ) + disliked_texts = "\n".join( + f"❌ {d.get('description') or d.get('ocr_text', '')}" for d in disliked[:15] + ) + + # DeepSeek + SQL in parallel + is_ru = _is_ru(lang) + prompt = _build_mega_prompt(liked_texts, disliked_texts, lang) + + deepseek_task = asyncio.create_task(call_deepseek(prompt)) + sql_tasks = asyncio.gather( + _safe(get_reaction_speed_insight(user_id)), + _safe(get_peak_hour_insight(user_id, is_ru)), + _safe(get_surprise_meme(user_id)), + _safe(_build_sources_report(user_id, is_ru)), + _safe(get_most_popular_liked_meme(user_id)), + _safe(get_unpopular_opinion_meme(user_id)), + ) + + raw, (speed, peak, surprise, sources, popular_meme, unpopular_meme) = await asyncio.gather( + deepseek_task, sql_tasks + ) + + p = parse_json_from_llm(raw) + if not p: + logger.warning( + "DeepSeek JSON failed user %d: %s", + user_id, + raw[:300], + ) + p = {} + + your_meme = _pick_meme(p, liked) + + # Use surprise meme if LLM didn't pick one + if not your_meme and surprise: + lr = surprise.get("global_lr_pct", "?") + if is_ru: + cap = f"🎲 Этот мем лайкнул только ты\n(глобальный лайк-рейт: {lr}%)" + else: + cap = f"🎲 Only you liked this meme\n(global like rate: {lr}%)" + your_meme = {"meme_id": surprise["meme_id"], "caption": cap} + if not your_meme and liked: + pick = random.choice(liked[:10]) + cap = "🎲 А вот мем, который тебе зашёл:" if is_ru else "🎲 Here's a meme you liked:" + your_meme = {"meme_id": pick["meme_id"], "caption": cap} + + # Build slides + # Stats report gets vibe from DeepSeek — replace placeholder vibe + vibe = p.get("vibe", "") + if vibe and stats_report: + if "\n" in stats_report: + idx = stats_report.rfind("\n") + stats_report = stats_report[:idx] + stats_report += f"\n\n{html_escape(vibe)}" + + # Track used meme IDs globally to avoid showing the same meme twice + global_used_memes = set() + if 
your_meme and your_meme.get("meme_id"): + global_used_memes.add(your_meme["meme_id"]) + + # Pick oneliner meme (avoid your_meme) + oneliner_meme_id = None + if liked: + oneliner_candidates = [m for m in liked[:10] if m["meme_id"] not in global_used_memes] + if oneliner_candidates: + oneliner_meme_id = random.choice(oneliner_candidates)["meme_id"] + else: + oneliner_meme_id = random.choice(liked[:10])["meme_id"] + global_used_memes.add(oneliner_meme_id) + + # Pick memes for absurd comparisons (avoid already used) + absurd_memes = _attach_memes_to_absurd(p, liked, global_used_memes) + + default_prediction = ( + "Летом ты будешь листать мемы вместо работы 🔥" + if is_ru + else "This summer you'll scroll memes instead of working 🔥" + ) + return { + "stats_report": stats_report, + "zodiac": _build_zodiac_slide(p, is_ru), + "your_meme": your_meme, + "humor_dna": _build_humor_dna_slide(p, is_ru), + "humor_oneliner": p.get("humor_oneliner", ""), + "oneliner_meme_id": oneliner_meme_id, + "absurd_items": absurd_memes, + "anti_profile": _build_anti_slide(p, is_ru), + "popular_meme": _build_meme_data(popular_meme, is_popular=True, is_ru=is_ru), + "unpopular_meme": _build_meme_data(unpopular_meme, is_popular=False, is_ru=is_ru), + "stats_extra": _build_extra_slide(sources, speed, peak, is_ru), + "prediction": p.get("prediction", default_prediction), + } + except Exception as e: + logger.error("Wrapped failed user %d: %s", user_id, e, exc_info=True) + default_prediction = ( + "Летом ты будешь листать мемы вместо работы 🔥" + if is_ru + else "This summer you'll scroll memes instead of working 🔥" + ) + return { + "stats_report": stats_report, + "zodiac": "", + "your_meme": None, + "humor_dna": "", + "humor_oneliner": "", + "oneliner_meme_id": None, + "absurd_items": [], + "anti_profile": "", + "popular_meme": None, + "unpopular_meme": None, + "stats_extra": "", + "prediction": default_prediction, + } + + +async def _safe(coro): + try: + return await coro + except Exception as e: + 
logger.warning("Wrapped SQL insight failed: %s", e) + return {} if not isinstance(e, TypeError) else None + + +def _build_mega_prompt(liked_texts: str, disliked_texts: str, lang: str = "ru") -> str: + categories = random.sample(ABSURD_CATEGORIES, 3) + + lang_instruction = "" + if lang != "ru": + lang_name = "English" if lang == "en" else lang + lang_instruction = f"\n- ЯЗЫК: пиши ВЕСЬ JSON на {lang_name}" + + return f"""Ты мем-психолог. Проанализируй чувство юмора. + +ЛАЙКНУТЫЕ МЕМЫ: +{liked_texts} + +СКИПНУТЫЕ МЕМЫ: +{disliked_texts} + +Сначала молча найди: +1) 2-3 самые частые мотивы в лайках (офис, животные, кринж, +токсичная мотивация, low-res chaos, семейная драма, etc.) +2) 1-2 мотива, которые человек стабильно скипает +3) 1 противоречие между лайками и скипами +Рассуждения НЕ выводи. Только JSON. + +Верни ТОЛЬКО JSON: +{{ + "vibe": "подкол от друга по мемам, 10-15 слов", + "meme_index": число (индекс лайкнутого мема [N], который олицетворяет), + "meme_caption": "почему этот мем — это ты (2 предложения, подкол)", + "zodiac_sign": "знак зодиака + эмодзи (♈♉♊♋♌♍♎♏♐♑♒♓)", + "zodiac_why": "1-2 предложения. Выбирай знак НЕ по характеру, \ +а по ЛОГИКЕ мемов. Упомяни конкретный мотив.", + "humor_dna": [ + {{"name": "категория", "pct": число}}, + {{"name": "категория", "pct": число}}, + {{"name": "категория", "pct": число}}, + {{"name": "категория", "pct": число}}, + {{"name": "категория", "pct": число}} + ], + "humor_oneliner": "4-8 слов. Ярлык мем-вкуса, не комплимент. \ +Как кличка от друга, не описание из гороскопа.", + "anti_profile": "2-3 коротких абзаца через \\n\\n. \ +На ТЫ: 'ты терпеть не можешь...'. Конкретно. 
\ +Последний абзац ОБЯЗАТЕЛЬНО позитивный — что в этом крутого, \ +почему такой вкус в мемах это кайф.", + "absurd_comparisons": [ + {{"category": "{categories[0]}", "thing": "конкретный предмет", \ +"why": "потому что ты лайкаешь X и Y — 1 предложение", \ +"meme_ref": число}}, + {{"category": "{categories[1]}", "thing": "конкретный предмет", \ +"why": "1 предложение", "meme_ref": число}}, + {{"category": "{categories[2]}", "thing": "конкретный предмет", \ +"why": "1 предложение", "meme_ref": число}} + ], + "prediction": "конкретное абсурдное событие на лето 2026. 1-2 предложения." +}} + +Правила: +- humor_dna: 5 конкретных прикольных категорий по 2-3 слова, проценты ~100 +- zodiac: знак как метафора мемного поведения, не "кто он по жизни". \ +ВАЖНО: НЕ БЛИЗНЕЦЫ. Близнецы — запрещённый знак. Выбирай из остальных 11 знаков. \ +Привязывай знак к КОНКРЕТНЫМ паттернам в мемах (например: Овен если агрессивный юмор, \ +Рыбы если меланхолия, Лев если самоирония, Козерог если сухой юмор, и т.д.) +- absurd_comparisons: thing = конкретный предмет (не "хаос-машина"). \ +Каждый comparison на ДРУГИХ мотивах, не повторяй шутку. \ +meme_ref ДОЛЖЕН быть РАЗНЫМ для каждого comparison (три разных числа!) +- meme_ref: индекс [N] из ЛАЙКНУТЫХ мемов. Каждый meme_ref уникален! +- meme_index: ДОЛЖЕН отличаться от всех meme_ref в absurd_comparisons + +АНТИСЛОП: +- ЗАПРЕЩЕНЫ слова: уникальный, особенный, тонкий, изысканный, +многогранный, хаотичный, вайб, ирония, абсурд (без конкретики) +- ЗАПРЕЩЕНЫ шаблоны: "ты из тех, кто...", "генерал постиронии", "ценитель абсурда" +- Подкалывай дружески, но ВСЕГДА заканчивай на позитивной ноте. \ +Человек должен улыбнуться, а не расстроиться. \ +Формула: подкол + комплимент ("ты залипаешь на X — но это потому что у тебя Y"). \ +Если мемы пользователя про грусть, депрессию, одиночество — будь мягче и теплее. 
\ +Не подчёркивай негатив, а покажи что юмор помогает справляться +- Каждое утверждение ДОЛЖНО опираться на конкретный мем +- Если шутка подошла бы любому — перепиши +- Лучший юмор = противоречия: "лайкаешь X, но скипаешь Y"{lang_instruction}""" + + +def _pick_meme(p: dict, liked: list) -> dict | None: + idx = p.get("meme_index") + cap = p.get("meme_caption", "🎯 Этот мем олицетворяет тебя") + if idx is not None and 0 <= idx < len(liked): + return { + "meme_id": liked[idx]["meme_id"], + "caption": f"🎯 Этот мем олицетворяет тебя:\n\n{html_escape(cap)}", + } + return None + + +def _build_humor_dna_slide(p: dict, is_ru: bool = True) -> str: + """Humor DNA bars only — no roast text.""" + dna = p.get("humor_dna", []) + + def bar(pct): + f = round(pct / 10) + return "█" * f + "░" * (10 - f) + + header = "🧬 Твоя ДНК юмора:" if is_ru else "🧬 Your Humor DNA:" + lines = [header + "\n"] + for c in dna[:5]: + pct = min(100, max(0, c.get("pct", 33))) + lines.append(f"{bar(pct)} {pct}%\n{html_escape(c.get('name', '???'))}\n") + + return "\n".join(lines) if len(lines) > 1 else "" + + +def _build_zodiac_slide(p: dict, is_ru: bool = True) -> str: + sign = p.get("zodiac_sign", "") + why = p.get("zodiac_why", "") + if not sign: + return "" + header = "🔮 Твой мем-зодиак:" if is_ru else "🔮 Your Meme Zodiac:" + return f"{header}\n\n{html_escape(sign)}\n\n{html_escape(why)}" + + +def _attach_memes_to_absurd(p: dict, liked: list, used_ids: set | None = None) -> list: + """Attach meme IDs to each absurd comparison, ensuring no duplicates.""" + comparisons = p.get("absurd_comparisons", []) + result = [] + if used_ids is None: + used_ids = set() + else: + used_ids = set(used_ids) # don't mutate caller's set + for c in comparisons[:3]: + meme_id = None + # Try LLM-suggested meme_ref (but skip if already used) + ref = c.get("meme_ref") + if ref is not None and isinstance(ref, int) and 0 <= ref < len(liked): + candidate = liked[ref]["meme_id"] + if candidate not in used_ids: + meme_id = 
candidate + # Fallback: random liked meme not yet used + if not meme_id and liked: + available = [m for m in liked[:15] if m["meme_id"] not in used_ids] + if available: + pick = random.choice(available) + meme_id = pick["meme_id"] + if meme_id: + used_ids.add(meme_id) + result.append( + { + "category": c.get("category", "?"), + "thing": c.get("thing", "?"), + "why": c.get("why", ""), + "meme_id": meme_id, + } + ) + return result + + +def _build_meme_data(meme: dict | None, is_popular: bool, is_ru: bool = True) -> dict | None: + if not meme: + return None + lr = meme.get("global_lr_pct", "?") + nlikes = meme.get("nlikes") + if is_popular: + if is_ru: + extra = f" ({nlikes} чел.)" if nlikes else "" + caption = f"🏆 Самый залайканный мем из твоих лайков!\n\nЕго лайкнули {lr}%{extra}" + else: + extra = f" ({nlikes} people)" if nlikes else "" + caption = f"🏆 The most liked meme from your likes!\n\nLiked by {lr}%{extra}" + else: + if is_ru: + extra = f" ({nlikes} чел.)" if nlikes else "" + caption = f"🤔 А этот мем ты скипнул...\n\nХотя его лайкнули {lr}%{extra}!" + else: + extra = f" ({nlikes} people)" if nlikes else "" + caption = f"🤔 You skipped this one...\n\nBut {lr}%{extra} liked it!" 
+ return {"meme_id": meme["meme_id"], "caption": caption} + + +def _build_anti_slide(p: dict, is_ru: bool = True) -> str: + anti = p.get("anti_profile", "") + if not anti: + return "" + header = ( + "🚫 Что говорят твои скипы:" if is_ru else "🚫 What your skips say about you:" + ) + return f"{header}\n\n{html_escape(anti)}" + + +def _build_extra_slide( + sources: str, + speed: dict, + peak: dict, + is_ru: bool = True, +) -> str: + parts = [] + if sources: + parts.append(sources) + + if speed: + med = speed.get("median_sec", 0) + ml = speed.get("median_like", 0) + md = speed.get("median_dislike", 0) + if is_ru: + parts.append( + f"⚡ Скорость реакции: {med} сек\n(до лайка: {ml} сек, до скипа: {md} сек)" + ) + else: + parts.append(f"⚡ Reaction speed: {med}s\n(to like: {ml}s, to skip: {md}s)") + + if peak: + h = peak.get("hour", 0) + label = peak.get("label", "") + tz = peak.get("tz", "") + if is_ru: + parts.append(f"🕐 Пик активности: {h}:00 {tz}\nТы — {label}") + else: + parts.append(f"🕐 Peak activity: {h}:00 {tz}\nYou're a {label}") + + return "\n\n".join(parts) if parts else "" + + +async def _build_sources_report(user_id: int, is_ru: bool = True) -> str: + sources = await get_most_liked_meme_source_urls(user_id, limit=10) + real = [ + s + for s in (sources or []) + if s.get("url") + and not s["url"].startswith("tg://user") + and ("t.me/" in s["url"] or "vk.com/" in s["url"]) + ] + if len(real) < 3: + try: + top = await get_top_meme_source_urls(limit=5) + for t in top or []: + if ( + t.get("url") + and not t["url"].startswith("tg://user") + and t["url"] not in [s["url"] for s in real] + ): + real.append(t) + if len(real) >= 3: + break + except Exception: + pass + if not real: + return "" + src_list = "\n".join(f"▪️ {s['url']}" for s in real[:3]) + header = "📡 Твои топ мем-паблики:" if is_ru else "📡 Your top meme channels:" + return f"{header}\n\n{src_list}" + + +# ── STATS SLIDE ────────────────────────────────────────── + + +async def get_bot_usage_report( + user_id: 
int, + user_stats: dict, + user: dict, + is_ru: bool = True, +) -> str | None: + if user_stats is None: + return None + + days = (datetime.datetime.utcnow() - user["created_at"]).days + 1 + sessions = user_stats.get("nsessions", 0) + memes_sent = user_stats.get("nmemes_sent", 0) + likes = user_stats.get("nlikes", 0) + time_sec = user_stats.get("time_spent_sec", 0) + + if likes < 10: + return None + + like_rate = round(100 * likes / max(memes_sent, 1)) + + if is_ru: + report = ( + "📊 Meme Wrapped 2026\n\n" + "Начнём с цифр.\n\n" + f"Ты с нами уже {days} дней.\n\n" + f"🤝 Посмотрел {memes_sent} мемов\n" + f"👍 Лайкнул {likes} из них " + f"({like_rate}%)\n" + f"👋 Заходил {sessions} раз\n" + ) + if time_sec > 0: + if time_sec < 60: + t = f"{time_sec} сек" + elif time_sec < 3600: + t = f"{time_sec // 60} мин {time_sec % 60} сек" + else: + t = f"больше {time_sec // 3600} часов 😳" + report += f"🕒 В боте {t}\n" + if like_rate > 50: + vibe = "Лайкаешь больше половины — тебе всё смешно 😄" + elif like_rate > 30: + vibe = "Лайкаешь каждый третий — у тебя есть вкус 👌" + elif like_rate > 15: + vibe = "Лайкаешь каждый пятый — избирательный 🧐" + else: + vibe = "Менее 15% мемов достойны — мем-сноб 🎩" + else: + report = ( + "📊 Meme Wrapped 2026\n\n" + "Let's start with the numbers.\n\n" + f"You've been with us for {days} days.\n\n" + f"🤝 Seen {memes_sent} memes\n" + f"👍 Liked {likes} of them " + f"({like_rate}%)\n" + f"👋 Visited {sessions} times\n" + ) + if time_sec > 0: + if time_sec < 60: + t = f"{time_sec}s" + elif time_sec < 3600: + t = f"{time_sec // 60}m {time_sec % 60}s" + else: + t = f"over {time_sec // 3600} hours 😳" + report += f"🕒 Time in bot: {t}\n" + if like_rate > 50: + vibe = "You like more than half — everything's funny to you 😄" + elif like_rate > 30: + vibe = "You like every third one — you've got taste 👌" + elif like_rate > 15: + vibe = "You like every fifth one — picky 🧐" + else: + vibe = "Less than 15% are worthy — meme snob 🎩" + + report += f"\n{vibe}" + return 
report diff --git a/src/tgbot/handlers/upload/moderation.py b/src/tgbot/handlers/upload/moderation.py index 6b805aca..725d1034 100644 --- a/src/tgbot/handlers/upload/moderation.py +++ b/src/tgbot/handlers/upload/moderation.py @@ -1,6 +1,7 @@ import asyncio import logging import re +from dataclasses import dataclass from datetime import datetime from typing import Any @@ -26,7 +27,11 @@ from src.stats.meme import calculate_meme_reactions_and_engagement from src.stats.meme_source import calculate_meme_source_stats from src.storage.constants import MemeStatus, MemeType -from src.storage.service import find_meme_duplicate, update_meme +from src.storage.deduplication import ( + find_duplicate_by_ocr_text, + resolve_duplicate, +) +from src.storage.service import update_meme from src.storage.upload import download_meme_content_from_tg from src.tgbot.handlers.treasury.constants import TrxType from src.tgbot.handlers.treasury.payments import pay_if_not_paid_with_alert @@ -66,6 +71,13 @@ def _telegram_download_failure_kind(exc: BadRequest) -> str: return "telegram_download_bad_request" +@dataclass(frozen=True) +class UploadAutoReviewDuplicate: + meme_id: int + duplicate_of: int + reason: str + + async def _notify_uploader( bot: Bot, meme_upload: dict[str, Any], @@ -98,7 +110,45 @@ async def _get_uploader_lang(user_id: int) -> str | None: return user["interface_lang"] if user else None -async def _check_duplicate_via_ocr(meme: dict[str, Any]) -> tuple[dict[str, Any], int | None]: +def _stored_duplicate_result(meme: dict[str, Any]) -> UploadAutoReviewDuplicate | None: + if meme["status"] != MemeStatus.DUPLICATE.value or meme.get("duplicate_of") is None: + return None + return UploadAutoReviewDuplicate( + meme_id=meme["id"], + duplicate_of=meme["duplicate_of"], + reason="telegram_file_id", + ) + + +async def _reject_duplicate_upload( + bot: Bot, + meme_upload: dict[str, Any], + duplicate: UploadAutoReviewDuplicate, + uploader_lang: str | None, +) -> None: + logging.info( + 
"Uploaded meme %s is a %s duplicate of %s, auto-rejecting", + duplicate.meme_id, + duplicate.reason, + duplicate.duplicate_of, + ) + await create_user_meme_reaction( + meme_upload["user_id"], + duplicate.duplicate_of, + "uploaded_meme", + reaction_id=1, + reacted_at=datetime.utcnow(), + ) + await _notify_uploader( + bot, + meme_upload, + localizer.t("upload.rejected_duplicate", uploader_lang), + ) + + +async def _deduplicate_upload_via_ocr( + meme: dict[str, Any], +) -> tuple[dict[str, Any], UploadAutoReviewDuplicate | None]: """Describe the meme inline via OpenRouter vision and check for OCR-text duplicates. Why: describe_memes cron is intentionally slow; for uploads we can't wait — run it synchronously @@ -106,7 +156,7 @@ async def _check_duplicate_via_ocr(meme: dict[str, Any]) -> tuple[dict[str, Any] Non-images skip describe (OCR is image-only). Failures (rate limit, model errors, short text) fall through silently — manual review kicks in. - Returns: (refreshed_meme, duplicate_of_id or None). + Returns: (refreshed_meme, duplicate details or None). 
""" if meme["type"] != MemeType.IMAGE: return meme, None @@ -133,8 +183,16 @@ async def _check_duplicate_via_ocr(meme: dict[str, Any]) -> tuple[dict[str, Any] if len(ocr_text) < 12: return refreshed, None - dup_id = await find_meme_duplicate(refreshed["id"], ocr_text) - return refreshed, dup_id + dup_id = await find_duplicate_by_ocr_text(refreshed["id"], ocr_text) + if dup_id is None: + return refreshed, None + + resolution = await resolve_duplicate(refreshed["id"], dup_id, reason="upload_ocr_text") + return refreshed, UploadAutoReviewDuplicate( + meme_id=refreshed["id"], + duplicate_of=resolution.original_id, + reason=resolution.reason, + ) async def uploaded_meme_auto_review( @@ -251,6 +309,10 @@ async def _uploaded_meme_auto_review( bot, meme_upload, localizer.t("upload.tg_upload_failed", uploader_lang) ) + stored_duplicate = _stored_duplicate_result(meme) + if stored_duplicate is not None: + return await _reject_duplicate_upload(bot, meme_upload, stored_duplicate, uploader_lang) + logging.info(f"Updating meme {meme['id']} status to WAITING_REVIEW") meme = await update_meme( meme["id"], @@ -258,25 +320,9 @@ async def _uploaded_meme_auto_review( ) # Inline OCR + trigram dedup. Auto-reject on duplicate, else fall through to manual review. 
- meme, duplicate_of = await _check_duplicate_via_ocr(meme) - if duplicate_of is not None: - logging.info(f"Meme {meme['id']} is a duplicate of {duplicate_of}, auto-rejecting") - await update_meme( - meme["id"], - status=MemeStatus.DUPLICATE, - duplicate_of=duplicate_of, - ) - # Credit the uploader with a like on the original, so it counts as engagement - await create_user_meme_reaction( - meme_upload["user_id"], - duplicate_of, - "uploaded_meme", - reaction_id=1, - reacted_at=datetime.utcnow(), - ) - return await _notify_uploader( - bot, meme_upload, localizer.t("upload.rejected_duplicate", uploader_lang) - ) + meme, ocr_duplicate = await _deduplicate_upload_via_ocr(meme) + if ocr_duplicate is not None: + return await _reject_duplicate_upload(bot, meme_upload, ocr_duplicate, uploader_lang) return await send_uploaded_meme_to_manual_review(meme, meme_upload, bot) diff --git a/tests/factories.py b/tests/factories.py index e43e714c..f63f63f8 100644 --- a/tests/factories.py +++ b/tests/factories.py @@ -5,6 +5,7 @@ from sqlalchemy.ext.asyncio import AsyncConnection from src.database import ( + chat_meme_reaction, meme, meme_source, meme_source_candidate, @@ -13,6 +14,7 @@ meme_source_stats, meme_stats, user, + user_deep_link_log, user_language, user_meme_reaction, user_meme_source_stats, @@ -109,6 +111,7 @@ async def create_meme_stats( ndislikes: int = 5, nmemes_sent: int = 20, lr_smoothed: float = 0.5, + engagement_score: float = 0.0, age_days: int = 30, raw_impr_rank: int = 0, sec_to_react: float = 7.0, @@ -120,13 +123,21 @@ async def create_meme_stats( "ndislikes": ndislikes, "nmemes_sent": nmemes_sent, "lr_smoothed": lr_smoothed, + "engagement_score": engagement_score, "age_days": age_days, "raw_impr_rank": raw_impr_rank, "sec_to_react": sec_to_react, "invited_count": invited_count, "updated_at": FIXED_DT, } - await conn.execute(insert(meme_stats).values(row).on_conflict_do_nothing()) + await conn.execute( + insert(meme_stats) + .values(row) + 
.on_conflict_do_update( + index_elements=(meme_stats.c.meme_id,), + set_={key: value for key, value in row.items() if key != "meme_id"}, + ) + ) return row @@ -228,6 +239,9 @@ async def cleanup_test_data(conn: AsyncConnection) -> None: delete(meme_source_candidate).where(meme_source_candidate.c.id >= TEST_ID_START) ) await conn.execute(delete(meme_stats).where(meme_stats.c.meme_id >= TEST_ID_START)) + await conn.execute( + delete(user_deep_link_log).where(user_deep_link_log.c.user_id >= TEST_ID_START) + ) await conn.execute( delete(meme_source_stats).where(meme_source_stats.c.meme_source_id >= TEST_ID_START) ) @@ -237,6 +251,9 @@ async def cleanup_test_data(conn: AsyncConnection) -> None: await conn.execute( delete(user_meme_reaction).where(user_meme_reaction.c.user_id >= TEST_ID_START) ) + await conn.execute( + delete(chat_meme_reaction).where(chat_meme_reaction.c.user_id >= TEST_ID_START) + ) await conn.execute(delete(user_language).where(user_language.c.user_id >= TEST_ID_START)) await conn.execute(delete(user_stats).where(user_stats.c.user_id >= TEST_ID_START)) await conn.execute(delete(meme).where(meme.c.id >= TEST_ID_START)) diff --git a/tests/flows/storage/test_final_meme_pipeline.py b/tests/flows/storage/test_final_meme_pipeline.py new file mode 100644 index 00000000..9d2a2861 --- /dev/null +++ b/tests/flows/storage/test_final_meme_pipeline.py @@ -0,0 +1,49 @@ +from unittest.mock import AsyncMock + +import pytest + +from src.flows.storage import memes +from src.storage.deduplication import DeduplicationResult + + +@pytest.mark.asyncio +async def test_final_meme_pipeline_deduplicates_batch_before_ok_promotion(monkeypatch): + calls = [] + + class FakeLogger: + def info(self, *args): + calls.append(("log_info", args)) + + pending_memes = [ + {"id": 10001, "caption": None}, + {"id": 10002, "caption": None}, + ] + + async def fake_analyse(meme): + calls.append(("analyse", meme["id"])) + + async def fake_deduplicate(meme): + calls.append(("dedup", meme["id"])) + 
return DeduplicationResult(meme["id"]) + + async def fake_update_ready(meme_ids): + calls.append(("promote", meme_ids)) + return [{"id": meme_ids[0]}] + + async def fake_sweep(): + calls.append(("sweep",)) + return {"resolved": 0, "reactions_moved": 0, "reactions_dropped": 0} + + monkeypatch.setattr(memes, "get_run_logger", lambda: FakeLogger()) + monkeypatch.setattr(memes, "get_pending_memes", AsyncMock(return_value=pending_memes)) + monkeypatch.setattr(memes, "analyse_meme_caption", fake_analyse) + monkeypatch.setattr(memes, "deduplicate_pending_meme", fake_deduplicate) + monkeypatch.setattr(memes, "update_meme_status_of_ready_memes", fake_update_ready) + monkeypatch.setattr(memes, "sweep_file_id_duplicates", fake_sweep) + monkeypatch.setattr(memes, "safe_emit", lambda *args, **kwargs: calls.append(("emit", args))) + + await memes.final_meme_pipeline.fn() + + assert calls.index(("dedup", 10001)) < calls.index(("promote", [10001, 10002])) + assert calls.index(("dedup", 10002)) < calls.index(("promote", [10001, 10002])) + assert calls.index(("promote", [10001, 10002])) < calls.index(("sweep",)) diff --git a/tests/recommendations/test_low_sent_pool.py b/tests/recommendations/test_low_sent_pool.py new file mode 100644 index 00000000..4968c37c --- /dev/null +++ b/tests/recommendations/test_low_sent_pool.py @@ -0,0 +1,69 @@ +import pytest +import pytest_asyncio +from sqlalchemy import text +from sqlalchemy.ext.asyncio import AsyncConnection +from tests.factories import ( + cleanup_test_data, + create_meme, + create_meme_source, + create_meme_stats, + create_user, + create_user_language, +) + +from src.database import engine +from src.recommendations.pipeline import _low_sent_query + +LOW_SENT_USER_ID = 10020 +LOW_SENT_SOURCE_ID = 10020 + + +@pytest_asyncio.fixture() +async def conn(): + async with engine.connect() as conn: + await cleanup_test_data(conn) + yield conn + await cleanup_test_data(conn) + + +async def _create_low_sent_meme( + conn: AsyncConnection, + 
meme_id: int, + *, + nlikes: int, + ndislikes: int, + nmemes_sent: int, +) -> None: + await create_meme(conn, id=meme_id, meme_source_id=LOW_SENT_SOURCE_ID) + await create_meme_stats( + conn, + meme_id=meme_id, + nlikes=nlikes, + ndislikes=ndislikes, + nmemes_sent=nmemes_sent, + ) + + +@pytest.mark.asyncio +async def test_low_sent_pool_prioritizes_unreacted_memes_and_filters_failed_memes( + conn: AsyncConnection, +) -> None: + await create_user(conn, id=LOW_SENT_USER_ID, type="moderator") + await create_user_language(conn, user_id=LOW_SENT_USER_ID) + await create_meme_source(conn, id=LOW_SENT_SOURCE_ID) + await _create_low_sent_meme(conn, 10021, nlikes=0, ndislikes=0, nmemes_sent=0) + await _create_low_sent_meme(conn, 10022, nlikes=1, ndislikes=0, nmemes_sent=1) + await _create_low_sent_meme(conn, 10023, nlikes=0, ndislikes=9, nmemes_sent=9) + await _create_low_sent_meme(conn, 10024, nlikes=0, ndislikes=10, nmemes_sent=10) + await _create_low_sent_meme(conn, 10025, nlikes=2, ndislikes=8, nmemes_sent=10) + await conn.commit() + + rows = await conn.execute( + text(_low_sent_query([])), + {"user_id": LOW_SENT_USER_ID, "limit": 10}, + ) + ids = [row.id for row in rows] + + assert ids[:3] == [10021, 10022, 10023] + assert 10024 not in ids + assert 10025 in ids diff --git a/tests/recommendations/test_meme_queue.py b/tests/recommendations/test_meme_queue.py index 1c2a5deb..5c817bee 100644 --- a/tests/recommendations/test_meme_queue.py +++ b/tests/recommendations/test_meme_queue.py @@ -8,7 +8,7 @@ MATURE_BLENDER_TREATMENT_WEIGHTS, ) from src.recommendations.candidates import CandidatesRetriever -from src.recommendations.meme_queue import generate_recommendations +from src.recommendations.meme_queue import generate_recommendations, get_next_meme_for_user TEST_USER_ID = 99999 @@ -53,6 +53,47 @@ def mock_redis(): # ── Cold start Phase 1 (nmemes_sent < 6): cold_start_explore ── +@pytest.mark.asyncio +async def test_get_next_meme_for_user_skips_stale_queue_payloads(): + 
queued_payloads = [ + { + "id": 101, + "type": "image", + "telegram_file_id": "stale-file-id", + "caption": None, + }, + { + "id": 102, + "type": "image", + "telegram_file_id": "fresh-file-id", + "caption": None, + }, + ] + + async def pop_queue(_queue_key): + return queued_payloads.pop(0) if queued_payloads else None + + async def is_sendable(_user_id: int, meme_id: int) -> bool: + return meme_id == 102 + + with ( + patch( + "src.recommendations.meme_queue.redis.pop_meme_from_queue_by_key", + new_callable=AsyncMock, + side_effect=pop_queue, + ), + patch( + "src.recommendations.meme_queue._queued_meme_is_sendable", + new_callable=AsyncMock, + side_effect=is_sendable, + ), + ): + meme = await get_next_meme_for_user(TEST_USER_ID) + + assert meme is not None + assert meme.id == 102 + + @pytest.mark.asyncio async def test_cold_start_phase1_uses_explore(): """Phase 1 (<6 memes): uses cold_start_explore engine""" diff --git a/tests/recommendations/test_queue_correctness.py b/tests/recommendations/test_queue_correctness.py index f079f492..d5587a50 100644 --- a/tests/recommendations/test_queue_correctness.py +++ b/tests/recommendations/test_queue_correctness.py @@ -16,7 +16,8 @@ from src import redis from src.database import engine from src.recommendations.candidates import CandidatesRetriever -from src.recommendations.meme_queue import generate_recommendations +from src.recommendations.meme_queue import generate_recommendations, get_next_meme_for_user +from src.storage.constants import MemeStatus # IDs for queue tests QUEUE_USER = 10010 @@ -228,6 +229,34 @@ async def test_queue_memes_have_required_fields(queue_user): assert "nlikes" in c +@pytest.mark.asyncio +async def test_get_next_meme_skips_stale_duplicate_payload(queue_user): + async with engine.connect() as conn: + await create_meme( + conn, + id=10020, + meme_source_id=10010, + status=MemeStatus.DUPLICATE.value, + ) + await create_meme(conn, id=10021, meme_source_id=10010) + await conn.commit() + + queue_key = 
redis.get_meme_queue_key(QUEUE_USER) + await redis.add_memes_to_queue_by_key( + queue_key, + [ + _meme(10020, "lr_smoothed"), + _meme(10021, "lr_smoothed"), + ], + ) + + next_meme = await get_next_meme_for_user(QUEUE_USER) + + assert next_meme is not None + assert next_meme.id == 10021 + assert await redis.get_all_memes_in_queue_by_key(queue_key) == [] + + @pytest.mark.asyncio async def test_generate_excludes_already_queued(queue_user): """Second generate should not duplicate memes already in queue.""" diff --git a/tests/scripts/test_agent_doctor.py b/tests/scripts/test_agent_doctor.py index a1cda10f..2df4cf94 100644 --- a/tests/scripts/test_agent_doctor.py +++ b/tests/scripts/test_agent_doctor.py @@ -40,6 +40,21 @@ def test_real_describe_memes_models_are_free() -> None: assert result.name == "describe_memes:free_models" +def test_describe_memes_models_can_live_in_openrouter_client(tmp_path: Path) -> None: + storage = tmp_path / "src" / "flows" / "storage" + storage.mkdir(parents=True) + (storage / "describe_memes.py").write_text("# orchestration only\n", encoding="utf-8") + (storage / "openrouter_vision.py").write_text( + 'VISION_MODELS = ["qwen/qwen2.5-vl-72b-instruct:free"]\n', + encoding="utf-8", + ) + + result = doctor.check_describe_memes_models(tmp_path) + + assert result.ok is True + assert "openrouter_vision.py" in result.detail + + def test_paperclip_access_adapter_accepts_repo_local_wrapper(tmp_path: Path) -> None: skill = tmp_path / ".codex" / "skills" / "paperclip" tools = tmp_path / ".codex" / "paperclip-tools" diff --git a/tests/scripts/test_eval_crossposting_ml.py b/tests/scripts/test_eval_crossposting_ml.py new file mode 100644 index 00000000..cfec4ea0 --- /dev/null +++ b/tests/scripts/test_eval_crossposting_ml.py @@ -0,0 +1,14 @@ +import pytest +from scripts.eval_crossposting_ml import top_quintile_lift + + +def test_top_quintile_lift_is_neutral_when_scores_are_tied(): + labels = [1, 0, 1, 0, 0] + + assert top_quintile_lift([0, 0, 0, 0, 0], 
labels) == 1.0 + + +def test_top_quintile_lift_handles_boundary_ties_without_label_leakage(): + labels = [1, 0, 0, 1, 0, 0, 0, 0, 0, 0] + + assert top_quintile_lift([10, 9, 9, 9, 1, 0, 0, 0, 0, 0], labels) == pytest.approx(10 / 3) diff --git a/tests/storage/test_deduplication.py b/tests/storage/test_deduplication.py new file mode 100644 index 00000000..7bf5e2d0 --- /dev/null +++ b/tests/storage/test_deduplication.py @@ -0,0 +1,373 @@ +from datetime import datetime, timedelta + +import pytest +import pytest_asyncio +from sqlalchemy import insert, select + +from src.database import chat_meme_reaction, engine, meme, meme_stats, user_meme_reaction +from src.storage.constants import MemeStatus +from src.storage.deduplication import ( + deduplicate_described_meme, + deduplicate_pending_meme, + find_duplicate_by_file_id, + find_duplicate_by_ocr_text, + resolve_duplicate, + sweep_file_id_duplicates, +) +from tests.factories import ( + cleanup_test_data, + create_meme, + create_meme_source, + create_meme_stats, + create_reaction, + create_user, +) + + +@pytest_asyncio.fixture() +async def dedup_setup(): + async with engine.connect() as conn: + await create_meme_source(conn, id=10001) + for user_id in range(10001, 10008): + await create_user(conn, id=user_id) + await conn.commit() + + yield + + async with engine.connect() as conn: + await cleanup_test_data(conn) + + +async def _row(table, **where): + async with engine.connect() as conn: + query = select(table) + for column, value in where.items(): + query = query.where(getattr(table.c, column) == value) + result = await conn.execute(query) + row = result.first() + return row._asdict() if row else None + + +@pytest.mark.asyncio +async def test_find_duplicate_by_file_id_uses_older_ok_or_created_memes(dedup_setup): + async with engine.connect() as conn: + await create_meme( + conn, + id=10001, + meme_source_id=10001, + status=MemeStatus.OK.value, + telegram_file_id="same-file-id", + ) + await create_meme( + conn, + id=10002, + 
meme_source_id=10001, + status=MemeStatus.CREATED.value, + telegram_file_id="same-file-id", + ) + await conn.commit() + + assert await find_duplicate_by_file_id(10002, "same-file-id") == 10001 + assert await find_duplicate_by_file_id(10001, "same-file-id") is None + + +@pytest.mark.asyncio +async def test_find_duplicate_by_file_id_prefers_published_original(dedup_setup): + async with engine.connect() as conn: + await create_meme( + conn, + id=10001, + meme_source_id=10001, + status=MemeStatus.OK.value, + telegram_file_id="same-file-id", + ) + await create_meme( + conn, + id=10002, + meme_source_id=10001, + status=MemeStatus.PUBLISHED.value, + telegram_file_id="same-file-id", + ) + await conn.commit() + + assert await find_duplicate_by_file_id(10003, "same-file-id") == 10002 + + +@pytest.mark.asyncio +async def test_find_duplicate_by_ocr_text_skips_short_text(dedup_setup): + assert await find_duplicate_by_ocr_text(10001, "too short") is None + + +@pytest.mark.asyncio +async def test_resolve_duplicate_moves_reactions_and_refreshes_stats(dedup_setup): + async with engine.connect() as conn: + await create_meme(conn, id=10001, meme_source_id=10001) + await create_meme(conn, id=10002, meme_source_id=10001) + await create_meme_stats( + conn, + meme_id=10001, + nlikes=1, + ndislikes=1, + nmemes_sent=2, + lr_smoothed=99, + engagement_score=99, + ) + await create_meme_stats(conn, meme_id=10002, nlikes=2, ndislikes=1, nmemes_sent=3) + await create_reaction(conn, user_id=10001, meme_id=10001, reaction_id=1) + await create_reaction(conn, user_id=10002, meme_id=10001, reaction_id=2) + await create_reaction(conn, user_id=10002, meme_id=10002, reaction_id=1) + await create_reaction(conn, user_id=10003, meme_id=10002, reaction_id=1) + await create_reaction(conn, user_id=10004, meme_id=10002, reaction_id=2) + await conn.commit() + + result = await resolve_duplicate(10002, 10001, reason="test") + + assert result.reactions_moved == 2 + assert result.reactions_dropped == 3 + + 
original_stats = await _row(meme_stats, meme_id=10001) + assert original_stats["nlikes"] == 2 + assert original_stats["ndislikes"] == 2 + assert original_stats["nmemes_sent"] == 4 + assert original_stats["lr_smoothed"] == 0 + assert original_stats["engagement_score"] == 0 + + dupe = await _row(meme, id=10002) + assert dupe["status"] == MemeStatus.DUPLICATE.value + assert dupe["duplicate_of"] == 10001 + assert await _row(meme_stats, meme_id=10002) is None + + async with engine.connect() as conn: + reaction_rows = await conn.execute( + select(user_meme_reaction).where(user_meme_reaction.c.meme_id == 10002) + ) + assert reaction_rows.all() == [] + + +@pytest.mark.asyncio +async def test_resolve_duplicate_recomputes_derived_original_stats(dedup_setup): + base_sent_at = datetime(2024, 1, 1, 12, 0, 0) + affected_users = [10001, 10002, 10003] + + async with engine.connect() as conn: + await create_meme(conn, id=10001, meme_source_id=10001) + await create_meme(conn, id=10002, meme_source_id=10001) + await create_meme_stats(conn, meme_id=10001, lr_smoothed=-9.0) + await conn.execute( + meme_stats.update().where(meme_stats.c.meme_id == 10001).values(engagement_score=-9.0) + ) + + for index in range(9): + other_meme_id = 10100 + index + sent_at = base_sent_at + timedelta(minutes=index) + await create_meme(conn, id=other_meme_id, meme_source_id=10001) + for user_id in affected_users: + await create_reaction( + conn, + user_id=user_id, + meme_id=other_meme_id, + reaction_id=2, + sent_at=sent_at, + reacted_at=sent_at + timedelta(seconds=5), + ) + + target_sent_at = base_sent_at + timedelta(minutes=9) + for user_id in affected_users: + await create_reaction( + conn, + user_id=user_id, + meme_id=10002, + reaction_id=1, + sent_at=target_sent_at, + reacted_at=target_sent_at + timedelta(seconds=5), + ) + await conn.commit() + + await resolve_duplicate(10002, 10001, reason="test") + + original_stats = await _row(meme_stats, meme_id=10001) + assert original_stats["nlikes"] == 3 + 
assert original_stats["nmemes_sent"] == 3 + assert original_stats["lr_smoothed"] == pytest.approx(1.8) + assert original_stats["engagement_score"] == pytest.approx(1.8) + + +@pytest.mark.asyncio +async def test_resolve_duplicate_reparents_existing_duplicate_children(dedup_setup): + async with engine.connect() as conn: + await create_meme(conn, id=10001, meme_source_id=10001) + await create_meme(conn, id=10002, meme_source_id=10001) + await create_meme(conn, id=10003, meme_source_id=10001, status=MemeStatus.DUPLICATE.value) + await conn.execute(meme.update().where(meme.c.id == 10003).values(duplicate_of=10002)) + await create_reaction(conn, user_id=10001, meme_id=10002, reaction_id=1) + await conn.commit() + + await resolve_duplicate(10002, 10001, reason="test") + + dupe = await _row(meme, id=10002) + child = await _row(meme, id=10003) + assert dupe["duplicate_of"] == 10001 + assert child["duplicate_of"] == 10001 + + +@pytest.mark.asyncio +async def test_resolve_duplicate_moves_chat_reactions(dedup_setup): + async with engine.connect() as conn: + await create_meme(conn, id=10001, meme_source_id=10001) + await create_meme(conn, id=10002, meme_source_id=10001) + await conn.execute( + insert(chat_meme_reaction), + [ + {"chat_id": 1, "meme_id": 10001, "user_id": 10001, "reaction": 1}, + {"chat_id": 1, "meme_id": 10002, "user_id": 10001, "reaction": 2}, + {"chat_id": 1, "meme_id": 10002, "user_id": 10002, "reaction": 1}, + ], + ) + await conn.commit() + + result = await resolve_duplicate(10002, 10001, reason="test") + + assert result.chat_reactions_moved == 1 + assert result.chat_reactions_dropped == 2 + + async with engine.connect() as conn: + original_rows = await conn.execute( + select(chat_meme_reaction) + .where(chat_meme_reaction.c.meme_id == 10001) + .order_by(chat_meme_reaction.c.user_id) + ) + dupe_rows = await conn.execute( + select(chat_meme_reaction).where(chat_meme_reaction.c.meme_id == 10002) + ) + + assert [row._asdict()["user_id"] for row in 
original_rows.all()] == [10001, 10002] + assert dupe_rows.all() == [] + + +@pytest.mark.asyncio +async def test_deduplicate_pending_meme_resolves_file_id_before_ok_promotion(dedup_setup): + async with engine.connect() as conn: + await create_meme( + conn, + id=10001, + meme_source_id=10001, + status=MemeStatus.OK.value, + telegram_file_id="same-file-id", + ) + pending = await create_meme( + conn, + id=10002, + meme_source_id=10001, + status=MemeStatus.CREATED.value, + telegram_file_id="same-file-id", + ) + await conn.commit() + + result = await deduplicate_pending_meme(pending) + + assert result.duplicate_found is True + assert result.duplicate_of == 10001 + assert result.reason == "telegram_file_id" + dupe = await _row(meme, id=10002) + assert dupe["status"] == MemeStatus.DUPLICATE.value + + +@pytest.mark.asyncio +async def test_deduplicate_described_meme_resolves_only_ok_memes(dedup_setup): + async with engine.connect() as conn: + await create_meme( + conn, + id=10001, + meme_source_id=10001, + ocr_result={"text": "same visible meme text", "calculated_at": "2026-05-20T00:00:00Z"}, + ) + await create_meme(conn, id=10002, meme_source_id=10001, status=MemeStatus.OK.value) + await create_meme( + conn, + id=10003, + meme_source_id=10001, + status=MemeStatus.WAITING_REVIEW.value, + ) + await conn.commit() + + ok_result = await deduplicate_described_meme( + 10002, + "same visible meme text", + status=MemeStatus.OK.value, + ) + review_result = await deduplicate_described_meme( + 10003, + "same visible meme text", + status=MemeStatus.WAITING_REVIEW.value, + ) + + assert ok_result.duplicate_found is True + assert ok_result.duplicate_of == 10001 + assert review_result.duplicate_found is False + review_meme = await _row(meme, id=10003) + assert review_meme["status"] == MemeStatus.WAITING_REVIEW.value + + +@pytest.mark.asyncio +async def test_sweep_file_id_duplicates_resolves_ok_exact_duplicates(dedup_setup): + async with engine.connect() as conn: + await create_meme( + conn, 
+ id=10001, + meme_source_id=10001, + telegram_file_id="same-file-id", + ) + await create_meme( + conn, + id=10002, + meme_source_id=10001, + telegram_file_id="same-file-id", + ) + await create_reaction(conn, user_id=10001, meme_id=10001, reaction_id=1) + await create_reaction(conn, user_id=10002, meme_id=10002, reaction_id=2) + await conn.commit() + + result = await sweep_file_id_duplicates() + + assert result["resolved"] == 1 + assert result["reactions_moved"] == 1 + + original_stats = await _row(meme_stats, meme_id=10001) + assert original_stats["nlikes"] == 1 + assert original_stats["ndislikes"] == 1 + assert original_stats["nmemes_sent"] == 2 + + dupe = await _row(meme, id=10002) + assert dupe["status"] == MemeStatus.DUPLICATE.value + assert dupe["duplicate_of"] == 10001 + + +@pytest.mark.asyncio +async def test_sweep_file_id_duplicates_resolves_ok_meme_to_published_original(dedup_setup): + async with engine.connect() as conn: + await create_meme( + conn, + id=10001, + meme_source_id=10001, + telegram_file_id="same-file-id", + ) + await create_meme( + conn, + id=10002, + meme_source_id=10001, + status=MemeStatus.PUBLISHED.value, + telegram_file_id="same-file-id", + ) + await create_reaction(conn, user_id=10001, meme_id=10001, reaction_id=1) + await conn.commit() + + result = await sweep_file_id_duplicates() + + assert result["resolved"] == 1 + dupe = await _row(meme, id=10001) + assert dupe["status"] == MemeStatus.DUPLICATE.value + assert dupe["duplicate_of"] == 10002 + + published_stats = await _row(meme_stats, meme_id=10002) + assert published_stats["nlikes"] == 1 + assert published_stats["nmemes_sent"] == 1 diff --git a/tests/test_crossposting_meme.py b/tests/test_crossposting_meme.py index 9d3258b2..fbc11415 100644 --- a/tests/test_crossposting_meme.py +++ b/tests/test_crossposting_meme.py @@ -442,6 +442,31 @@ async def test_ru_share_max_picker_boosts_prior_inbot_shares(clean_xpost): assert top_candidate["share_max_score"] > 
top_candidate["share_max_base_score"] +@pytest.mark.asyncio +async def test_ru_share_max_picker_keeps_cold_sources_in_pool(clean_xpost): + async with engine.connect() as conn: + await create_meme_source(conn, id=10330, language_code="ru") + await create_meme_source(conn, id=10340, language_code="ru") + await create_meme( + conn, id=10331, meme_source_id=10330, language_code="ru", type="image", status="ok" + ) + await create_meme( + conn, id=10341, meme_source_id=10340, language_code="ru", type="image", status="ok" + ) + await create_meme_stats(conn, meme_id=10331, nlikes=10, ndislikes=2) + await create_meme_stats(conn, meme_id=10341, nlikes=10, ndislikes=2, invited_count=5) + await conn.commit() + + picked, decision = await get_next_share_max_meme_for_tgchannelru() + assert picked is not None + assert picked["id"] == 10341 + assert decision is not None + assert decision["pool_size"] == 2 + candidate_ids = {c["meme_id"] for c in decision["candidates"]} + assert candidate_ids == {10331, 10341} + assert all(c["share_source_base"] == 1.0 for c in decision["candidates"]) + + @pytest.mark.asyncio async def test_en_share_max_picker_logs_but_does_not_boost_prior_shares(clean_xpost): async with engine.connect() as conn: diff --git a/tests/tgbot/test_upload_moderation.py b/tests/tgbot/test_upload_moderation.py index a972bd2f..3c6e75be 100644 --- a/tests/tgbot/test_upload_moderation.py +++ b/tests/tgbot/test_upload_moderation.py @@ -1,5 +1,5 @@ from types import SimpleNamespace -from unittest.mock import AsyncMock, patch +from unittest.mock import ANY, AsyncMock, patch import pytest @@ -126,3 +126,99 @@ async def test_upload_review_chat_member_can_reject_without_moderator_user_type( notify.assert_awaited_once() assert "выбран не тот язык" in notify.await_args.args[2] get_user_info.assert_awaited_once_with(7) + + +@pytest.mark.asyncio +async def test_auto_review_does_not_revive_exact_file_id_duplicate(monkeypatch): + meme = { + "id": 10002, + "type": moderation.MemeType.IMAGE, 
+ "telegram_file_id": "uploaded-file-id", + } + meme_upload = {"id": 42, "user_id": 10001, "message_id": 777} + stored_duplicate = { + **meme, + "status": moderation.MemeStatus.DUPLICATE.value, + "duplicate_of": 10000, + } + bot = AsyncMock() + + with ( + patch.object(moderation, "_get_uploader_lang", new=AsyncMock(return_value="ru")), + patch.object( + moderation, + "download_meme_content_from_tg", + new=AsyncMock(return_value=b"image"), + ), + patch.object( + moderation, + "add_watermark_to_meme_content", + new=AsyncMock(return_value=b"watermarked"), + ), + patch.object( + moderation, + "upload_meme_content_to_tg", + new=AsyncMock(return_value=stored_duplicate), + ), + patch.object(moderation, "update_meme", new=AsyncMock()) as update_meme, + patch.object( + moderation, + "create_user_meme_reaction", + new=AsyncMock(), + ) as create_reaction, + patch.object(moderation, "_notify_uploader", new=AsyncMock()) as notify, + patch.object(moderation, "send_uploaded_meme_to_manual_review", new=AsyncMock()) as review, + ): + await moderation._uploaded_meme_auto_review(meme, meme_upload, bot, {}) + + update_meme.assert_not_awaited() + review.assert_not_awaited() + create_reaction.assert_awaited_once_with( + 10001, + 10000, + "uploaded_meme", + reaction_id=1, + reacted_at=ANY, + ) + notify.assert_awaited_once() + assert "повтор" in notify.await_args.args[2].lower() + + +@pytest.mark.asyncio +async def test_inline_ocr_duplicate_uses_dedup_resolver(monkeypatch): + meme = { + "id": 10002, + "type": moderation.MemeType.IMAGE, + "telegram_file_id": "uploaded-file-id", + } + refreshed = { + **meme, + "ocr_result": {"text": "same visible meme text"}, + } + + with ( + patch.object(moderation, "describe_single_meme", new=AsyncMock(return_value="ok")), + patch( + "src.tgbot.service.get_meme_by_id", + new=AsyncMock(return_value=refreshed), + ), + patch.object( + moderation, + "find_duplicate_by_ocr_text", + new=AsyncMock(return_value=10000), + ) as find_duplicate, + patch.object( + 
moderation, + "resolve_duplicate", + new=AsyncMock( + return_value=SimpleNamespace(original_id=10000, reason="upload_ocr_text") + ), + ) as resolve_duplicate, + ): + refreshed_result, duplicate = await moderation._deduplicate_upload_via_ocr(meme) + + assert refreshed_result == refreshed + assert duplicate is not None + assert duplicate.duplicate_of == 10000 + find_duplicate.assert_awaited_once_with(10002, "same visible meme text") + resolve_duplicate.assert_awaited_once_with(10002, 10000, reason="upload_ocr_text")