diff --git a/CONTEXT.md b/CONTEXT.md
index 398bbf09..ad7aa4e5 100644
--- a/CONTEXT.md
+++ b/CONTEXT.md
@@ -57,6 +57,13 @@ _Avoid_: отдельная роль, модератор
_Code aliases_: `low_sent_pool`
_Avoid_: обычная лента, топовая лента
+**Дедупликация мемов**:
+Шаг пайплайна, который распознаёт повторки мемов до попадания в рекомендации,
+помечает дубль как `duplicate`, переносит неконфликтующие реакции на оригинал
+и обновляет счётчики оригинала.
+_Code aliases_: `storage.deduplication`, `meme.duplicate_of`, `MemeStatus.DUPLICATE`
+_Avoid_: ручной cleanup, постфактум чистка очереди
+
**Кандидат в источники**:
Публичный источник мемов, найденный или предложенный для возможного добавления в бота.
_Code aliases_: `meme_source_candidate`, Source Candidate
@@ -108,6 +115,7 @@ _Avoid_: router
## Relationships
- **Модератор** может участвовать в **модераторском чате** и получать **разбор новых мемов** в своей ленте.
+- **Дедупликация мемов** должна выполняться до того, как мем становится доступен обычной рекомендательной системе.
- **Модераторский чат** используется для комьюнити-циклов; **чат проверки загрузок** используется для решений по пользовательским загрузкам.
- **Проверяющий загрузки** определяется доступом к **чату проверки загрузок**, а не `UserType`.
- Только **русскоязычный кандидат в источники** может попасть в автоматическое **голосование за источник**.
diff --git a/scripts/agent_doctor.py b/scripts/agent_doctor.py
index d4ce19e1..76e30eb7 100755
--- a/scripts/agent_doctor.py
+++ b/scripts/agent_doctor.py
@@ -73,11 +73,21 @@ def check_command_available(command: str) -> CheckResult:
def check_describe_memes_models(root: Path = ROOT) -> CheckResult:
- path = root / "src" / "flows" / "storage" / "describe_memes.py"
- try:
- model_ids = extract_vision_models(path)
- except Exception as exc:
- return CheckResult("describe_memes:free_models", False, str(exc))
+ paths = (
+ root / "src" / "flows" / "storage" / "describe_memes.py",
+ root / "src" / "flows" / "storage" / "openrouter_vision.py",
+ )
+ errors: list[str] = []
+ for path in paths:
+ try:
+ model_ids = extract_vision_models(path)
+ source_path = path
+ break
+ except Exception as exc:
+ errors.append(str(exc))
+ else:
+ return CheckResult("describe_memes:free_models", False, "; ".join(errors))
+
paid = non_free_openrouter_models(model_ids)
if paid:
return CheckResult(
@@ -85,7 +95,11 @@ def check_describe_memes_models(root: Path = ROOT) -> CheckResult:
False,
"non-free model ids: " + ", ".join(paid),
)
- return CheckResult("describe_memes:free_models", True, f"{len(model_ids)} free model(s)")
+ return CheckResult(
+ "describe_memes:free_models",
+ True,
+ f"{len(model_ids)} free model(s) in {source_path.relative_to(root)}",
+ )
def check_paperclip_access_adapter(
diff --git a/scripts/deep_parse.py b/scripts/deep_parse.py
index ae0c0b4c..b650e09f 100644
--- a/scripts/deep_parse.py
+++ b/scripts/deep_parse.py
@@ -39,7 +39,6 @@ async def main():
# Find source in DB
from sqlalchemy import text
- from src.database import fetch_one
source = await fetch_one(
text("SELECT id, url, status FROM meme_source WHERE url = :url"),
@@ -72,7 +71,7 @@ async def main():
if posts:
await insert_parsed_posts_from_telegram(source["id"], posts)
- print(f"Inserted into meme_raw_telegram")
+ print("Inserted into meme_raw_telegram")
await update_meme_source(
meme_source_id=source["id"], parsed_at=datetime.now(timezone.utc)
diff --git a/scripts/e2e_smoke.py b/scripts/e2e_smoke.py
index 80304673..f8cbb458 100644
--- a/scripts/e2e_smoke.py
+++ b/scripts/e2e_smoke.py
@@ -26,7 +26,6 @@
from telethon import TelegramClient
from telethon.sessions import StringSession
-
# --- Config ---
API_ID = os.environ.get("TELEGRAM_API_ID")
API_HASH = os.environ.get("TELEGRAM_API_HASH")
@@ -150,9 +149,11 @@ async def test_delete(client, bot):
if btn.data and b"delete" in btn.data.lower():
await msg.click(data=btn.data)
confirm_msg = await wait_for_response(client, bot, msg.id)
- if confirm_msg and ("ciao" in (confirm_msg.text or "").lower() or "start" in (confirm_msg.text or "").lower()):
+ confirm_text = (confirm_msg.text or "").lower() if confirm_msg else ""
+ if confirm_msg and ("ciao" in confirm_text or "start" in confirm_text):
return "PASS", f"State deleted: {(confirm_msg.text or '')[:80]}"
- return "WARN", f"Delete clicked but unexpected response: {(confirm_msg.text if confirm_msg else 'no response')[:80]}"
+ response = confirm_msg.text if confirm_msg else "no response"
+ return "WARN", f"Delete clicked but unexpected response: {response[:80]}"
if "sure" in (msg.text or "").lower() or "delete" in (msg.text or "").lower():
return "WARN", f"Got confirmation prompt but no button found: {(msg.text or '')[:80]}"
diff --git a/scripts/eval_crossposting_ml.py b/scripts/eval_crossposting_ml.py
new file mode 100644
index 00000000..f980ef5d
--- /dev/null
+++ b/scripts/eval_crossposting_ml.py
@@ -0,0 +1,390 @@
+"""Offline evaluator for simple crossposting virality models.
+
+The goal is deliberately narrow: predict whether a posted image meme lands
+above the channel median 24h forward rate. This is not a production ranker.
+It is a read-only gate for deciding whether simple linear ML features are worth
+promoting into shadow scoring.
+
+Usage:
+ ANALYST_DATABASE_URL=... python scripts/eval_crossposting_ml.py
+ ANALYST_DATABASE_URL=... python scripts/eval_crossposting_ml.py --days 120
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import math
+import os
+import sys
+from dataclasses import dataclass
+from typing import Iterable
+
+import asyncpg
+
+# Names of the model inputs, listed in the same order in which fetch_examples()
+# builds each Example.features vector. evaluate_channel() pairs this list with
+# the learned coefficients, so the order here must stay in sync with the
+# feature construction.
+FEATURE_NAMES = [
+    "log_source_signal",
+    "log_source_posts",
+    "log_pre_likes",
+    "pre_like_rate",
+    "log_pre_reactions",
+    "log_pre_share_users",
+    "caption_present",
+    "hour_sin",
+    "hour_cos",
+]
+
+
+@dataclass
+class Example:
+    """One labeled crossposted image meme used by the offline evaluator."""
+
+    # Channel the meme was posted to (the `cp.channel` column).
+    channel: str
+    # Posting time (`cp.created_at`); evaluate_channel() sorts examples by it.
+    posted_at: object
+    # Forwards per 1000 views from the snapshot closest to 24h after posting.
+    fwd_per_1k_24h: float
+    # Feature values in FEATURE_NAMES order.
+    features: list[float]
+
+
+async def get_connection() -> asyncpg.Connection:
+    """Connect to the database named by the environment.
+
+    ``ANALYST_DATABASE_URL`` is preferred; ``DATABASE_URL`` is the fallback.
+    When neither is set to a non-empty value an error is printed to stderr and
+    the process exits with status 1. The connection is opened with asyncpg's
+    statement cache disabled (``statement_cache_size=0``).
+    """
+    for variable in ("ANALYST_DATABASE_URL", "DATABASE_URL"):
+        dsn = os.environ.get(variable)
+        if dsn:
+            return await asyncpg.connect(dsn, statement_cache_size=0)
+
+    print("ERROR: set ANALYST_DATABASE_URL or DATABASE_URL", file=sys.stderr)
+    sys.exit(1)
+
+
+def _example_from_row(row: asyncpg.Record) -> Example:
+    """Convert one result row of the evaluation query into an Example."""
+    pre_reactions = row["pre_reactions"] or 0
+    pre_likes = row["pre_likes"] or 0
+    # With no prior reactions there is no evidence either way: use 0.5.
+    pre_like_rate = pre_likes / pre_reactions if pre_reactions else 0.5
+    # Encode the posting hour on a circle so 23:00 and 00:00 end up adjacent.
+    hour_angle = 2 * math.pi * (row["hour_msk"] or 0) / 24
+
+    # Order must match FEATURE_NAMES.
+    features = [
+        math.log1p(float(row["source_signal"] or 0)),
+        math.log1p(float(row["source_posts"] or 0)),
+        math.log1p(float(pre_likes)),
+        pre_like_rate,
+        math.log1p(float(pre_reactions)),
+        math.log1p(float(row["pre_share_users"] or 0)),
+        float(row["caption_present"] or 0),
+        math.sin(hour_angle),
+        math.cos(hour_angle),
+    ]
+    return Example(
+        channel=row["channel"],
+        posted_at=row["posted_at"],
+        fwd_per_1k_24h=float(row["fwd_per_1k_24h"]),
+        features=features,
+    )
+
+
+async def fetch_examples(conn: asyncpg.Connection, days: int) -> list[Example]:
+    """Load labeled image posts and their pre-post features.
+
+    The query works in stages:
+
+    * ``labels``: image posts in ``tgchannelru``/``tgchannelen`` created within
+      the last ``days`` days and at least 36 hours ago, each joined to the
+      snapshot (between +20h and +36h) closest to 24 hours after posting. The
+      target is forwards per 1000 views at that snapshot.
+    * ``reaction_features``: likes (reaction 1) and skips (reaction 2) recorded
+      before the post time.
+    * ``share_clicks`` / ``share_features``: deep links of the form
+      ``s_<sharer>_<meme>`` opened before the post time by a user other than
+      the sharer. Ids that would overflow ``bigint`` are rejected before the
+      cast.
+    * ``sq``: per-source signal over same-channel image posts from 30 days up
+      to 48 hours before the post.
+
+    Args:
+        conn: Open asyncpg connection.
+        days: Size of the look-back window in days.
+
+    Returns:
+        Examples ordered by channel and posting time.
+    """
+    rows = await conn.fetch(
+        """
+        WITH labels AS (
+            SELECT
+                cp.channel,
+                cp.meme_id,
+                cp.created_at AS posted_at,
+                m.meme_source_id,
+                (m.caption IS NOT NULL)::int AS caption_present,
+                s24.views AS views_24h,
+                s24.forwards AS forwards_24h,
+                1000.0 * s24.forwards / NULLIF(s24.views, 0) AS fwd_per_1k_24h
+            FROM crossposting cp
+            JOIN meme m ON m.id = cp.meme_id
+            JOIN LATERAL (
+                SELECT cps.snapshot_at, cps.views, cps.forwards
+                FROM crossposting_snapshots cps
+                WHERE cps.channel = cp.channel
+                  AND cps.meme_id = cp.meme_id
+                  AND cps.snapshot_at BETWEEN cp.created_at + interval '20 hours'
+                                          AND cp.created_at + interval '36 hours'
+                  AND cps.views > 0
+                  AND cps.forwards IS NOT NULL
+                ORDER BY abs(
+                    extract(epoch FROM cps.snapshot_at - (cp.created_at + interval '24 hours'))
+                )
+                LIMIT 1
+            ) s24 ON true
+            WHERE cp.channel IN ('tgchannelru', 'tgchannelen')
+              AND cp.created_at < now() - interval '36 hours'
+              AND cp.created_at >= now() - ($1 || ' days')::interval
+              AND m.type = 'image'
+        ),
+        reaction_features AS (
+            SELECT
+                l.channel,
+                l.meme_id,
+                count(*) FILTER (WHERE r.reaction_id = 1) AS pre_likes,
+                count(*) FILTER (WHERE r.reaction_id = 2) AS pre_skips,
+                count(*) FILTER (WHERE r.reaction_id IN (1, 2)) AS pre_reactions
+            FROM labels l
+            LEFT JOIN user_meme_reaction r
+                ON r.meme_id = l.meme_id
+               AND r.reacted_at IS NOT NULL
+               AND r.reacted_at < l.posted_at
+               AND r.reaction_id IN (1, 2)
+            GROUP BY l.channel, l.meme_id
+        ),
+        share_clicks AS (
+            SELECT
+                share_match.parts[2]::bigint AS meme_id,
+                udll.user_id,
+                udll.created_at
+            FROM user_deep_link_log udll
+            CROSS JOIN LATERAL regexp_matches(
+                udll.deep_link,
+                '^s_([1-9][0-9]{0,18})_([1-9][0-9]{0,18})$'
+            ) AS share_match(parts)
+            WHERE udll.created_at >= now() - ($1 || ' days')::interval
+              AND CASE
+                    WHEN length(share_match.parts[1]) = 19
+                         AND share_match.parts[1] > '9223372036854775807' THEN false
+                    WHEN length(share_match.parts[2]) = 19
+                         AND share_match.parts[2] > '9223372036854775807' THEN false
+                    ELSE udll.user_id <> share_match.parts[1]::bigint
+                  END
+        ),
+        share_features AS (
+            SELECT
+                l.channel,
+                l.meme_id,
+                -- count(sc.user_id), not count(*): the LEFT JOIN yields one
+                -- all-NULL row for memes without share clicks, which must
+                -- count as 0 rather than 1.
+                count(sc.user_id) AS pre_share_clicks,
+                count(DISTINCT sc.user_id) AS pre_share_users
+            FROM labels l
+            LEFT JOIN share_clicks sc
+                ON sc.meme_id = l.meme_id
+               AND sc.created_at < l.posted_at
+            GROUP BY l.channel, l.meme_id
+        )
+        SELECT
+            l.channel,
+            l.posted_at,
+            l.fwd_per_1k_24h,
+            l.caption_present,
+            COALESCE(rf.pre_likes, 0) AS pre_likes,
+            COALESCE(rf.pre_skips, 0) AS pre_skips,
+            COALESCE(rf.pre_reactions, 0) AS pre_reactions,
+            COALESCE(sf.pre_share_users, 0) AS pre_share_users,
+            extract(hour FROM l.posted_at + interval '3 hours')::int AS hour_msk,
+            COALESCE(sq.source_signal, 0) AS source_signal,
+            COALESCE(sq.source_posts, 0) AS source_posts
+        FROM labels l
+        JOIN reaction_features rf ON rf.channel = l.channel AND rf.meme_id = l.meme_id
+        JOIN share_features sf ON sf.channel = l.channel AND sf.meme_id = l.meme_id
+        LEFT JOIN LATERAL (
+            SELECT
+                AVG(cp2.forwards * SQRT(GREATEST(cp2.views, 1) / 100.0)) AS source_signal,
+                COUNT(*) AS source_posts
+            FROM crossposting cp2
+            JOIN meme m2 ON m2.id = cp2.meme_id
+            WHERE cp2.channel = l.channel
+              AND cp2.created_at > l.posted_at - interval '30 days'
+              AND cp2.created_at < l.posted_at - interval '48 hours'
+              AND cp2.views IS NOT NULL
+              AND cp2.views > 0
+              AND cp2.forwards IS NOT NULL
+              AND m2.type = 'image'
+              AND m2.meme_source_id = l.meme_source_id
+        ) sq ON true
+        ORDER BY l.channel, l.posted_at
+        """,
+        # Passed as text because the query concatenates it with ' days'.
+        str(days),
+    )
+
+    return [_example_from_row(row) for row in rows]
+
+
+def median(values: Iterable[float]) -> float:
+    """Return the median of ``values``.
+
+    The input is sorted first; for an even number of items the two middle
+    values are averaged.
+
+    Raises:
+        ValueError: If ``values`` yields no items.
+    """
+    ranked = sorted(values)
+    if not ranked:
+        raise ValueError("median of empty list")
+
+    half, is_odd = divmod(len(ranked), 2)
+    if is_odd:
+        return ranked[half]
+    return (ranked[half - 1] + ranked[half]) / 2
+
+
+def standardize(
+    train_x: list[list[float]],
+    test_x: list[list[float]],
+) -> tuple[list[list[float]], list[list[float]]]:
+    """Z-score both matrices using statistics computed on ``train_x`` only.
+
+    Each column is shifted by the training mean and divided by the training
+    population standard deviation. A column with zero deviation is divided by
+    1.0 instead, so constant features map to 0 rather than dividing by zero.
+
+    Returns:
+        ``(standardized_train, standardized_test)`` as new lists.
+    """
+    width = len(train_x[0])
+    count = len(train_x)
+
+    means: list[float] = []
+    for col in range(width):
+        means.append(sum(row[col] for row in train_x) / count)
+
+    scales: list[float] = []
+    for col in range(width):
+        spread = sum((row[col] - means[col]) ** 2 for row in train_x) / count
+        # `or 1.0` keeps constant columns from producing a zero divisor.
+        scales.append(math.sqrt(spread) or 1.0)
+
+    def rescale(matrix: list[list[float]]) -> list[list[float]]:
+        scaled = []
+        for row in matrix:
+            scaled.append(
+                [(row[col] - mean) / scale for col, (mean, scale) in enumerate(zip(means, scales))]
+            )
+        return scaled
+
+    return rescale(train_x), rescale(test_x)
+
+
+def sigmoid(value: float) -> float:
+    """Return the logistic function of ``value`` without overflowing.
+
+    Only ``exp`` of a non-positive number is ever evaluated, so large
+    magnitudes of either sign stay within float range.
+    """
+    decay = math.exp(-abs(value))
+    return 1 / (1 + decay) if value >= 0 else decay / (1 + decay)
+
+
+def train_logistic_regression(
+    train_x: list[list[float]],
+    train_y: list[int],
+    *,
+    iterations: int,
+    lr: float,
+    l2: float,
+) -> list[float]:
+    """Fit a logistic regression with full-batch gradient descent.
+
+    Args:
+        train_x: Feature rows; all rows share the width of the first one.
+        train_y: Binary targets aligned with ``train_x``.
+        iterations: Number of gradient steps.
+        lr: Learning rate.
+        l2: L2 penalty strength; applied to the coefficients, not the bias.
+
+    Returns:
+        ``[bias, coef_1, ..., coef_n]``, starting from all zeros.
+    """
+    width = len(train_x[0])
+    sample_count = len(train_x)
+    bias = 0.0
+    coefs = [0.0] * width
+
+    for _ in range(iterations):
+        bias_grad = 0.0
+        coef_grads = [0.0] * width
+        for row, target in zip(train_x, train_y):
+            logit = bias + sum(w * v for w, v in zip(coefs, row))
+            residual = sigmoid(logit) - target
+            bias_grad += residual
+            for col, value in enumerate(row):
+                coef_grads[col] += residual * value
+
+        # The bias is updated without regularization.
+        bias -= lr * bias_grad / sample_count
+        for col in range(width):
+            penalized = coef_grads[col] / sample_count + l2 * coefs[col]
+            coefs[col] -= lr * penalized
+
+    return [bias, *coefs]
+
+
+def predict(weights: list[float], rows: list[list[float]]) -> list[float]:
+    """Return the predicted probability for each row.
+
+    ``weights[0]`` is the bias and the remaining entries are the per-feature
+    coefficients, in the layout produced by train_logistic_regression().
+    """
+    coefs = weights[1:]
+    probabilities: list[float] = []
+    for row in rows:
+        logit = weights[0] + sum(w * v for w, v in zip(coefs, row))
+        probabilities.append(sigmoid(logit))
+    return probabilities
+
+
+def auc(scores: list[float], labels: list[int]) -> float:
+    """Return the ROC AUC by comparing every positive with every negative.
+
+    A pair counts 1 when the positive scores higher and 0.5 on a tie. When
+    either class is missing the result is 0.5.
+    """
+    positive_scores = [score for score, label in zip(scores, labels) if label == 1]
+    negative_scores = [score for score, label in zip(scores, labels) if label == 0]
+    if not (positive_scores and negative_scores):
+        return 0.5
+
+    credit = 0.0
+    for hit in positive_scores:
+        for miss in negative_scores:
+            if hit > miss:
+                credit += 1
+            elif hit == miss:
+                credit += 0.5
+    return credit / (len(positive_scores) * len(negative_scores))
+
+
+def top_quintile_lift(scores: list[float], labels: list[int]) -> float:
+    """Return the positive rate in the top 20% by score, relative to the base rate.
+
+    Items are ranked by score, highest first, and the top ``ceil(0.2 * n)``
+    (at least one) are selected. Ties are label-neutral: when a group of equal
+    scores straddles the cut-off, the group contributes its average positive
+    rate for the slots that remain instead of depending on sort order.
+
+    Every tie group consumes at least one item, so the loop also terminates
+    when scores contain NaN (which never compares equal to itself).
+
+    Returns:
+        ``top_rate / base_rate``, or 0.0 when there are no labels or no
+        positives.
+    """
+    if not labels or sum(labels) == 0:
+        return 0.0
+
+    ranked = sorted(zip(scores, labels), key=lambda pair: pair[0], reverse=True)
+    quota = max(1, math.ceil(len(ranked) * 0.2))
+    taken = 0.0
+    hits = 0.0
+    start = 0
+    while taken < quota and start < len(ranked):
+        tie_score = ranked[start][0]
+        # Start past the current item so the group is never empty.
+        stop = start + 1
+        while stop < len(ranked) and ranked[stop][0] == tie_score:
+            stop += 1
+
+        tie_size = stop - start
+        tie_hits = sum(label for _, label in ranked[start:stop])
+        room = quota - taken
+        if tie_size <= room:
+            taken += tie_size
+            hits += tie_hits
+        else:
+            # Only part of the tie group fits: credit it proportionally.
+            taken += room
+            hits += tie_hits * (room / tie_size)
+        start = stop
+
+    top_rate = hits / quota
+    base_rate = sum(labels) / len(labels)
+    return top_rate / base_rate if base_rate else 0.0
+
+
+def evaluate_channel(channel: str, examples: list[Example], train_fraction: float) -> None:
+    """Train a logistic baseline for one channel and print its report.
+
+    Examples of ``channel`` are sorted by posting time and split
+    chronologically. The binary target is "forward rate at or above the
+    training-set median". The report covers the logistic model, three
+    single-feature baselines, and the five largest coefficients by magnitude.
+    Nothing is returned; channels with fewer than 30 examples, or a split that
+    contains a single class, only get a short notice.
+    """
+    ordered = sorted(
+        (example for example in examples if example.channel == channel),
+        key=lambda example: example.posted_at,
+    )
+    total = len(ordered)
+    if total < 30:
+        print(f"\n{channel}: not enough labeled posts ({total})")
+        return
+
+    # Chronological split, clamped so both sides keep at least one example.
+    cut = max(1, min(total - 1, int(total * train_fraction)))
+    train, test = ordered[:cut], ordered[cut:]
+    threshold = median(example.fwd_per_1k_24h for example in train)
+
+    def label_of(example: Example) -> int:
+        return int(example.fwd_per_1k_24h >= threshold)
+
+    train_x = [example.features for example in train]
+    test_x = [example.features for example in test]
+    train_y = [label_of(example) for example in train]
+    test_y = [label_of(example) for example in test]
+
+    if len(set(train_y)) < 2 or len(set(test_y)) < 2:
+        print(f"\n{channel}: split has one target class, cannot evaluate")
+        return
+
+    train_x_std, test_x_std = standardize(train_x, test_x)
+    weights = train_logistic_regression(
+        train_x_std,
+        train_y,
+        iterations=2500,
+        lr=0.05,
+        l2=0.05,
+    )
+    scores = predict(weights, test_x_std)
+
+    # Single-feature baselines rank by the raw (unstandardized) feature value.
+    source_col = FEATURE_NAMES.index("log_source_signal")
+    likes_col = FEATURE_NAMES.index("log_pre_likes")
+    shares_col = FEATURE_NAMES.index("log_pre_share_users")
+    baselines = {
+        "source_signal": [row[source_col] for row in test_x],
+        "pre_likes": [row[likes_col] for row in test_x],
+        "pre_share_users": [row[shares_col] for row in test_x],
+    }
+
+    print(f"\n{channel}")
+    print(f" labeled posts: {total}")
+    print(f" train/test: {len(train)}/{len(test)}")
+    print(f" train median target: {threshold:.2f} fwd/1k")
+    print(f" logistic_auc: {auc(scores, test_y):.3f}")
+    print(f" logistic_top20_lift: {top_quintile_lift(scores, test_y):.2f}x")
+    covered = sum(1 for row in test_x if row[shares_col] > 0)
+    print(f" pre_share_users_coverage: {covered}/{len(test_x)}")
+    for name, baseline_scores in baselines.items():
+        print(f" {name}_auc: {auc(baseline_scores, test_y):.3f}")
+        print(f" {name}_top20_lift: {top_quintile_lift(baseline_scores, test_y):.2f}x")
+
+    by_magnitude = sorted(
+        zip(FEATURE_NAMES, weights[1:]),
+        key=lambda pair: abs(pair[1]),
+        reverse=True,
+    )
+    print(" strongest coefficients:")
+    for name, value in by_magnitude[:5]:
+        print(f" {name}: {value:+.3f}")
+
+
+async def main() -> None:
+    """Parse CLI options, load examples, and print the per-channel evaluation.
+
+    The database connection is used only for loading and is closed before any
+    evaluation runs. A 30-second statement timeout is set on the session
+    before the query is issued.
+    """
+    cli = argparse.ArgumentParser()
+    for flag, kind, fallback in (
+        ("--days", int, 120),
+        ("--train-fraction", float, 0.7),
+    ):
+        cli.add_argument(flag, type=kind, default=fallback)
+    options = cli.parse_args()
+
+    connection = await get_connection()
+    try:
+        await connection.execute("SET statement_timeout = '30s'")
+        examples = await fetch_examples(connection, options.days)
+    finally:
+        await connection.close()
+
+    print("Crossposting ML offline eval")
+    print(f"Examples: {len(examples)} image posts over {options.days} days")
+    for channel in ("tgchannelru", "tgchannelen"):
+        evaluate_channel(channel, examples, options.train_fraction)
+
+# Script entry point: run the async evaluator only when executed directly.
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/scripts/eval_ranking.py b/scripts/eval_ranking.py
index b9720766..193b2ff9 100644
--- a/scripts/eval_ranking.py
+++ b/scripts/eval_ranking.py
@@ -47,7 +47,7 @@ async def eval_lr_smoothed(conn: asyncpg.Connection, hours: int, min_reactions:
"""
print(f"\n{'='*60}")
- print(f" OFFLINE EVAL: lr_smoothed scoring")
+ print(" OFFLINE EVAL: lr_smoothed scoring")
print(f" Test window: last {hours} hours")
print(f" Min reactions per user: {min_reactions}")
print(f"{'='*60}\n")
@@ -171,7 +171,7 @@ async def eval_lr_smoothed(conn: asyncpg.Connection, hours: int, min_reactions:
print(f" Pairwise accuracy: {pairwise_acc:.1%}")
print(f" Avg per-user acc: {avg_user_acc:.1%}")
print(f" Tie rate: {tie_rate:.1%}")
- print(f" Random baseline: 50.0%")
+ print(" Random baseline: 50.0%")
print()
# Breakdown by user engagement level
@@ -207,7 +207,7 @@ async def eval_engagement_score(conn: asyncpg.Connection, hours: int, min_reacti
"""Evaluate engagement_score as a ranking signal."""
print(f"\n{'='*60}")
- print(f" OFFLINE EVAL: engagement_score")
+ print(" OFFLINE EVAL: engagement_score")
print(f"{'='*60}\n")
t0 = time.time()
diff --git a/scripts/generate_session_string.py b/scripts/generate_session_string.py
index 304e5874..662aea35 100644
--- a/scripts/generate_session_string.py
+++ b/scripts/generate_session_string.py
@@ -13,8 +13,8 @@
The output is a session string — store it as TELEGRAM_SESSION_STRING env var.
"""
-from telethon.sync import TelegramClient
from telethon.sessions import StringSession
+from telethon.sync import TelegramClient
api_id = int(input("API ID: "))
api_hash = input("API Hash: ")
diff --git a/scripts/serve_flows.py b/scripts/serve_flows.py
index 1cd7ec77..17ae2f41 100644
--- a/scripts/serve_flows.py
+++ b/scripts/serve_flows.py
@@ -150,10 +150,12 @@
# ── Crossposting (Moscow timezone) ──
post_meme_to_tgchannelru.to_deployment(
name="Post to TG Channel RU",
- # Experiment: dropped 18:00 MSK (21:00 UTC, 10.2 fwd/1k — worst slot).
- # Added 11:00 MSK. Data: 10:00 MSK = 25.6 fwd/1k (best), 18:00 MSK = 15.6 fwd/1k.
- # Baseline (pre-2026-04-13): 8,10,12,14,16,18 MSK (6x/day)
- schedules=[CronSchedule(cron="20 8,10,11,12,14,16 * * *", timezone=MSK)],
+ # May 21 readout: v2 ranker is stable, but the 10/11/12 MSK cluster
+ # makes the channel feel hourly. Keep 5/day and move one slot into
+ # the evening reactivation window.
+ # Prior experiment: dropped weak 18:00 MSK slot.
+ # Baseline (pre-2026-04-13): 8,10,12,14,16,18 MSK (6x/day).
+ schedules=[CronSchedule(cron="20 8,10,14,16,21 * * *", timezone=MSK)],
),
post_meme_to_tgchannelen.to_deployment(
name="Post to TG Channel EN",
diff --git a/specs/crossposting-share-optimization-2026-05-18.md b/specs/crossposting-share-optimization-2026-05-18.md
index c912e852..452be0c0 100644
--- a/specs/crossposting-share-optimization-2026-05-18.md
+++ b/specs/crossposting-share-optimization-2026-05-18.md
@@ -161,3 +161,88 @@ CREATE INDEX CONCURRENTLY IF NOT EXISTS
ON user_meme_reaction (meme_id, reacted_at)
INCLUDE (user_id, reaction_id, sent_at);
```
+
+## May 21 recheck
+
+Prod snapshots were fresh through 2026-05-21 11:00 UTC. The v2 ranker did not
+show a clear forward-rate regression:
+
+| Channel | Recent mature image posts | Recent agg fwd/1k | v2 agg fwd/1k |
+| --- | ---: | ---: | ---: |
+| RU | 43 | 23.57 | 24.59 |
+| EN | 39 | 18.65 | 18.42 |
+
+Subscriber growth is the unsolved part: over the last 30 days RU shrank from
+2165 to 2155 subscribers (-10), while EN grew from 623 to 653 (+30). Better
+meme selection alone is still not enough to grow RU.
+
+Operational finding: scheduled posts were not the only source of channel volume.
+Weekly uploaded-meme reward albums add 5 media posts at once and were logged as
+`score_version=1`, which mixed non-ranker posts into old-ranker readouts. The
+May 21 cleanup sets reward album logs to `score_version=0` and keeps their
+caption on the first media item for analysis.
+
+Frequency adjustment: RU scheduled posts move from `8,10,11,12,14,16` MSK to
+`8,10,14,16,21` MSK. This removes the 10/11/12 hourly cluster and moves one
+slot into the evening reactivation window. Bot activity in the last 30 days:
+21:00 MSK had 30.5k reactions / 367 active users; 22:00 MSK had 32.0k reactions
+/ 349 active users. Use 21:00 first because the active-user base is slightly
+wider and the slot is less late.
+
+ML status: `scripts/eval_crossposting_ml.py` now runs a read-only logistic
+baseline against 24h channel labels. Initial 90-day run:
+
+| Channel | Labeled images | Logistic AUC | Source-signal AUC | Pre-share top20 lift |
+| --- | ---: | ---: | ---: | ---: |
+| RU | 164 | 0.491 | 0.568 | 1.96x |
+| EN | 162 | 0.548 | 0.410 | 2.45x |
+
+Conclusion: this is not yet strong enough to ship an ML ranker. The next useful
+step is richer candidate-level offline evaluation, not turning on `score_version=3`.
+Keep ML work timestamp-safe: labels from 24h snapshots, features only from data
+available before the simulated decision.
+
+May 22 correction: the first `pre_share_users_top20_lift` readout was inflated
+by evaluator tie-bias. `top_quintile_lift` sorted `(score, label)` tuples, so
+equal scores placed positive labels before negative labels. After making ties
+label-neutral, `pre_share_users` is not shippable: the 120-day split has only
+1/52 RU test posts and 0/51 EN test posts with positive pre-share coverage;
+the corrected pre-share top20 lift is 0.93x for RU and 1.00x for EN. Keep prior
+share clicks as a logged feature until coverage improves.
+
+### Segment-first ML plan
+
+The flat meme-level model is not the right abstraction. The next evaluator
+should model `meme x user_segment` evidence first, then aggregate segment
+responses into a channel-success prediction.
+
+User segments to test before any production ranker:
+
+- Engagement depth: new, casual, regular, heavy, based on recent reaction count
+ and active days.
+- Taste/source affinity: top source clusters per user from historical likes and
+ skips; start with `meme_source_id` families, add OCR/description embeddings
+ only after the tabular baseline is sane.
+- Reaction behavior: fast liker, slow reader, fast skipper, high-share clicker.
+- Language/context: selected languages, observed liked meme language, local
+ active-hour bucket.
+
+Candidate segment features:
+
+- Segment impressions before channel post.
+- Segment like rate and Wilson-smoothed like rate.
+- Segment median reaction time and fast-skip rate.
+- Segment in-bot share click users.
+- Coverage: number of distinct segments with enough evidence.
+
+Targets should stay channel-specific:
+
+- Primary target: `forwards_24h / views_24h` above channel rolling median or
+ top quartile. This captures shareability without over-rewarding high reach.
+- Secondary target: reaction rate above rolling median.
+- Reach target: `views_24h` above expected views for that channel/hour/day.
+ Keep reach separate because post timing and subscriber base can dominate it.
+
+Do not train on all-time aggregates such as current `meme_stats.invited_count`
+for historical examples. Every feature must be reconstructed as of the simulated
+decision time.
diff --git a/specs/dedup.md b/specs/dedup.md
index b9a4e84c..a316d8e7 100644
--- a/specs/dedup.md
+++ b/specs/dedup.md
@@ -6,9 +6,10 @@
1. **ETL single-media filter** (~80% of the 17%) — carousel posts removed. Not true dedup.
2. **Telegram forwarded_url** — same-source repost detection at ETL time.
-3. **OCR text trigram similarity** — PostgreSQL `pg_trgm` operator `%` on extracted text. Min 12 chars. Works on memes with `ocr_result` populated by [Describe Memes](describe-memes.md) (OpenRouter vision).
+3. **Telegram file_id exact match** — storage/final pipeline check before recommendation eligibility.
+4. **OCR text trigram similarity** — PostgreSQL `pg_trgm` operator `%` on extracted text. Min 12 chars. Works on memes with `ocr_result` populated by [Describe Memes](describe-memes.md) (OpenRouter vision).
-The text-based dedup (`find_meme_duplicate()` in `src/storage/service.py`) uses:
+The text-based dedup (`find_duplicate_by_ocr_text()` in `src/storage/deduplication.py`) uses:
```sql
AND (M.ocr_result ->> 'text') % '{imagetext}' -- trigram similarity > 0.3
```
diff --git a/specs/parsing-etl.md b/specs/parsing-etl.md
index 64fc883f..7a5f9429 100644
--- a/specs/parsing-etl.md
+++ b/specs/parsing-etl.md
@@ -83,7 +83,8 @@ See [dedup.md](dedup.md) for improvement plan.
| `src/storage/parsers/vk.py` | VK API parser |
| `src/storage/parsers/ig.py` | Instagram HikerAPI parser |
| `src/storage/etl.py` | Raw -> processed meme transformation |
-| `src/storage/service.py` | DB queries, find_meme_duplicate() |
+| `src/storage/service.py` | Shared DB queries and meme status updates |
+| `src/storage/deduplication.py` | File ID/OCR duplicate detection and resolution |
| `src/storage/watermark.py` | Image watermarking (Pillow) |
| `src/storage/ads.py` | Ad keyword detection |
| `src/flows/storage/memes.py` | Pipeline orchestration (tg/vk/ig_meme_pipeline) |
diff --git a/specs/testing.md b/specs/testing.md
index ff13ebb9..4c71edb3 100644
--- a/specs/testing.md
+++ b/specs/testing.md
@@ -68,7 +68,7 @@
### Phase 3: ETL and parsing
**3a. Dedup tests**:
-- find_meme_duplicate() with matching OCR text
+- find_duplicate_by_ocr_text() with matching OCR text
- Perceptual hash dedup (after implementation)
**3b. ETL integration tests**:
diff --git a/src/crossposting/service.py b/src/crossposting/service.py
index 53201a4c..fb658b94 100644
--- a/src/crossposting/service.py
+++ b/src/crossposting/service.py
@@ -41,8 +41,8 @@ async def log_meme_sent(
# Per-channel ranker constants (mirror the SQL ORDER BY).
_CHANNEL_PARAMS: dict[str, dict[str, Any]] = {
- "tgchannelru": {"impr_penalty": 0.8, "age_threshold": 7},
- "tgchannelen": {"impr_penalty": 0.5, "age_threshold": 90},
+ "tgchannelru": {"impr_penalty": 0.8, "age_threshold": 7, "language_code": "ru"},
+ "tgchannelen": {"impr_penalty": 0.5, "age_threshold": 90, "language_code": "en"},
}
@@ -185,7 +185,7 @@ def _picked_meme_dict(candidate: dict[str, Any]) -> dict[str, Any]:
return {k: candidate[k] for k in _PICKED_FIELDS}
-_RU_QUERY = """
+_STANDARD_RANKER_QUERY = """
WITH selected_at AS (
SELECT NOW() AS decided_at
),
@@ -196,124 +196,7 @@ def _picked_meme_dict(candidate: dict[str, Any]) -> dict[str, Any]:
COUNT(*) AS n_posts
FROM crossposting cp
JOIN meme m ON m.id = cp.meme_id
- WHERE cp.channel = 'tgchannelru'
- AND cp.created_at > NOW() - INTERVAL '30 days'
- AND cp.created_at < NOW() - INTERVAL '48 hours'
- AND cp.views IS NOT NULL
- AND cp.views > 0
- AND m.type = 'image'
- GROUP BY m.meme_source_id
- HAVING COUNT(*) >= 5
- ),
- src_median AS (
- SELECT PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY signal) AS m_signal
- FROM src_quality
- ),
- recent_src AS (
- SELECT DISTINCT m2.meme_source_id
- FROM crossposting cp2
- JOIN meme m2 ON m2.id = cp2.meme_id
- WHERE cp2.channel = 'tgchannelru'
- AND cp2.created_at > NOW() - INTERVAL '24 hours'
- AND cp2.telegram_message_id IS NOT NULL
- ),
- ranked AS (
- SELECT
- M.id, M.type, M.telegram_file_id, M.caption,
- M.meme_source_id,
- MS.nlikes, MS.ndislikes, MS.raw_impr_rank,
- MS.age_days, MS.nmemes_sent, MS.invited_count,
- SQ.signal AS src_signal,
- (SELECT m_signal FROM src_median) AS median_signal,
- COUNT(*) OVER () AS candidate_pool_size,
- ROW_NUMBER() OVER (
- ORDER BY -1
- * COALESCE((MS.nlikes + 1.) / (MS.nlikes + MS.ndislikes + 1), 0.5)
- * CASE WHEN MS.raw_impr_rank <= 1 THEN 1 ELSE 0.8 END
- * CASE WHEN MS.age_days < 7 THEN 1 ELSE 0.8 END
- * CASE WHEN M.caption IS NULL THEN 1 ELSE 0.8 END
- * CASE
- WHEN MS.nmemes_sent <= 1 THEN 1
- ELSE (MS.nlikes + MS.ndislikes) * 1. / MS.nmemes_sent
- END
- * COALESCE(
- LEAST(2.0, GREATEST(0.5,
- SQ.signal / NULLIF((SELECT m_signal FROM src_median), 0)
- )),
- 1.0
- )
- * (1.0 + LEAST(MS.invited_count, 10) * 0.1),
- M.id
- ) AS candidate_rank
- FROM meme M
- INNER JOIN meme_stats MS ON MS.meme_id = M.id
- LEFT JOIN crossposting CP ON CP.meme_id = M.id AND CP.channel = 'tgchannelru'
- LEFT JOIN src_quality SQ ON SQ.meme_source_id = M.meme_source_id
- WHERE 1=1
- AND CP.meme_id IS NULL
- AND M.status = 'ok'
- AND M.language_code = 'ru'
- AND M.type = 'image'
- AND MS.nlikes >= 5
- AND M.meme_source_id NOT IN (SELECT meme_source_id FROM recent_src)
- ORDER BY -1
- * COALESCE((MS.nlikes + 1.) / (MS.nlikes + MS.ndislikes + 1), 0.5)
- * CASE WHEN MS.raw_impr_rank <= 1 THEN 1 ELSE 0.8 END
- * CASE WHEN MS.age_days < 7 THEN 1 ELSE 0.8 END
- * CASE WHEN M.caption IS NULL THEN 1 ELSE 0.8 END
- * CASE
- WHEN MS.nmemes_sent <= 1 THEN 1
- ELSE (MS.nlikes + MS.ndislikes) * 1. / MS.nmemes_sent
- END
- * COALESCE(
- LEAST(2.0, GREATEST(0.5,
- SQ.signal / NULLIF((SELECT m_signal FROM src_median), 0)
- )),
- 1.0
- )
- * (1.0 + LEAST(MS.invited_count, 10) * 0.1),
- M.id
- LIMIT :limit
- )
- SELECT
- ranked.*,
- COALESCE(share_clicks.pre_inbot_share_clicks, 0) AS pre_inbot_share_clicks,
- COALESCE(share_clicks.pre_inbot_share_click_users, 0) AS pre_inbot_share_click_users
- FROM ranked
- CROSS JOIN selected_at
- LEFT JOIN LATERAL (
- SELECT
- COUNT(*) AS pre_inbot_share_clicks,
- COUNT(DISTINCT user_id) AS pre_inbot_share_click_users
- FROM user_deep_link_log udll
- CROSS JOIN LATERAL (
- SELECT substring(
- udll.deep_link FROM ('^s_([1-9][0-9]{0,18})_' || ranked.id || '$')
- ) AS sharer_id
- ) share_link
- WHERE udll.created_at < selected_at.decided_at
- AND CASE
- WHEN share_link.sharer_id IS NULL THEN false
- WHEN length(share_link.sharer_id) = 19
- AND share_link.sharer_id > '9223372036854775807' THEN false
- ELSE udll.user_id <> share_link.sharer_id::bigint
- END
- ) share_clicks ON true
- ORDER BY ranked.candidate_rank
-"""
-
-_EN_QUERY = """
- WITH selected_at AS (
- SELECT NOW() AS decided_at
- ),
- src_quality AS (
- SELECT
- m.meme_source_id,
- AVG(cp.forwards * SQRT(GREATEST(cp.views, 1) / 100.0)) AS signal,
- COUNT(*) AS n_posts
- FROM crossposting cp
- JOIN meme m ON m.id = cp.meme_id
- WHERE cp.channel = 'tgchannelen'
+ WHERE cp.channel = :channel
AND cp.created_at > NOW() - INTERVAL '30 days'
AND cp.created_at < NOW() - INTERVAL '48 hours'
AND cp.views IS NOT NULL
@@ -330,7 +213,7 @@ def _picked_meme_dict(candidate: dict[str, Any]) -> dict[str, Any]:
SELECT DISTINCT m2.meme_source_id
FROM crossposting cp2
JOIN meme m2 ON m2.id = cp2.meme_id
- WHERE cp2.channel = 'tgchannelen'
+ WHERE cp2.channel = :channel
AND cp2.created_at > NOW() - INTERVAL '24 hours'
AND cp2.telegram_message_id IS NOT NULL
),
@@ -346,8 +229,8 @@ def _picked_meme_dict(candidate: dict[str, Any]) -> dict[str, Any]:
ROW_NUMBER() OVER (
ORDER BY -1
* COALESCE((MS.nlikes + 1.) / (MS.nlikes + MS.ndislikes + 1), 0.5)
- * CASE WHEN MS.raw_impr_rank <= 1 THEN 1 ELSE 0.5 END
- * CASE WHEN MS.age_days < 90 THEN 1 ELSE 0.8 END
+ * CASE WHEN MS.raw_impr_rank <= 1 THEN 1 ELSE :impr_penalty END
+ * CASE WHEN MS.age_days < :age_threshold THEN 1 ELSE 0.8 END
* CASE WHEN M.caption IS NULL THEN 1 ELSE 0.8 END
* CASE
WHEN MS.nmemes_sent <= 1 THEN 1
@@ -364,19 +247,19 @@ def _picked_meme_dict(candidate: dict[str, Any]) -> dict[str, Any]:
) AS candidate_rank
FROM meme M
INNER JOIN meme_stats MS ON MS.meme_id = M.id
- LEFT JOIN crossposting CP ON CP.meme_id = M.id AND CP.channel = 'tgchannelen'
+ LEFT JOIN crossposting CP ON CP.meme_id = M.id AND CP.channel = :channel
LEFT JOIN src_quality SQ ON SQ.meme_source_id = M.meme_source_id
WHERE 1=1
AND CP.meme_id IS NULL
AND M.status = 'ok'
- AND M.language_code = 'en'
+ AND M.language_code = :language_code
AND M.type = 'image'
AND MS.nlikes >= 5
AND M.meme_source_id NOT IN (SELECT meme_source_id FROM recent_src)
ORDER BY -1
* COALESCE((MS.nlikes + 1.) / (MS.nlikes + MS.ndislikes + 1), 0.5)
- * CASE WHEN MS.raw_impr_rank <= 1 THEN 1 ELSE 0.5 END
- * CASE WHEN MS.age_days < 90 THEN 1 ELSE 0.8 END
+ * CASE WHEN MS.raw_impr_rank <= 1 THEN 1 ELSE :impr_penalty END
+ * CASE WHEN MS.age_days < :age_threshold THEN 1 ELSE 0.8 END
* CASE WHEN M.caption IS NULL THEN 1 ELSE 0.8 END
* CASE
WHEN MS.nmemes_sent <= 1 THEN 1
@@ -477,6 +360,7 @@ def _picked_meme_dict(candidate: dict[str, Any]) -> dict[str, Any]:
(MS.nlikes + MS.ndislikes) * 1.0 / MS.nmemes_sent
))
END DESC,
+ MS.invited_count DESC,
M.id
) AS base_source_rank
FROM meme M
@@ -490,7 +374,6 @@ def _picked_meme_dict(candidate: dict[str, Any]) -> dict[str, Any]:
AND M.type = 'image'
AND M.telegram_file_id IS NOT NULL
AND MS.nlikes >= 5
- AND SQ.signal IS NOT NULL
AND (
:respect_recent_source_cap = false
OR M.meme_source_id NOT IN (SELECT meme_source_id FROM recent_src)
@@ -513,6 +396,7 @@ def _picked_meme_dict(candidate: dict[str, Any]) -> dict[str, Any]:
(nlikes + ndislikes) * 1.0 / nmemes_sent
))
END DESC,
+ invited_count DESC,
id
) AS base_rank
FROM base
@@ -546,7 +430,7 @@ def _picked_meme_dict(candidate: dict[str, Any]) -> dict[str, Any]:
with_shares.*,
ROW_NUMBER() OVER (
PARTITION BY meme_source_id
- ORDER BY share_max_score DESC, id
+ ORDER BY share_max_score DESC, invited_count DESC, id
) AS source_rank
FROM (
SELECT
@@ -593,7 +477,7 @@ def _picked_meme_dict(candidate: dict[str, Any]) -> dict[str, Any]:
SELECT *
FROM scored
WHERE source_rank = 1
- ORDER BY share_max_score DESC, id
+ ORDER BY share_max_score DESC, invited_count DESC, id
LIMIT :limit
"""
@@ -616,6 +500,23 @@ def _picked_meme_dict(candidate: dict[str, Any]) -> dict[str, Any]:
}
+async def _get_next_meme_for_channel(
+    channel: str,
+    *,
+    log_top_n: int,
+    score_version: int,
+) -> tuple[dict[str, Any] | None, dict[str, Any] | None]:
+    """Run the standard ranker query for ``channel`` and pick its top row.
+
+    Bind parameters are the channel's entry in ``_CHANNEL_PARAMS`` plus
+    ``channel`` and ``limit`` (= ``log_top_n``); the latter two win on a
+    key clash.
+
+    Returns ``(picked_meme, decision_log)`` built from the fetched rows, or
+    ``(None, None)`` when the query yields no rows.
+    """
+    query_params = dict(_CHANNEL_PARAMS[channel])
+    query_params.update(channel=channel, limit=log_top_n)
+    candidates = await fetch_all(text(_STANDARD_RANKER_QUERY), query_params)
+    if candidates:
+        picked = _picked_meme_dict(candidates[0])
+        decision_log = _build_decision_log(channel, score_version, candidates)
+        return picked, decision_log
+    return None, None
+
+
async def get_next_meme_for_tgchannelru(
log_top_n: int = 5,
) -> tuple[dict[str, Any] | None, dict[str, Any] | None]:
@@ -629,20 +530,22 @@ async def get_next_meme_for_tgchannelru(
- ``decision_log`` — kwargs dict for ``log_ranker_decision``, with the top-N
candidates and per-candidate score breakdown. ``None`` when no candidates.
"""
- rows = await fetch_all(text(_RU_QUERY), {"limit": log_top_n})
- if not rows:
- return None, None
- return _picked_meme_dict(rows[0]), _build_decision_log("tgchannelru", 2, rows)
+ return await _get_next_meme_for_channel(
+ "tgchannelru",
+ log_top_n=log_top_n,
+ score_version=2,
+ )
async def get_next_meme_for_tgchannelen(
log_top_n: int = 5,
) -> tuple[dict[str, Any] | None, dict[str, Any] | None]:
"""Same as :func:`get_next_meme_for_tgchannelru` but for @fast_food_memes (EN)."""
- rows = await fetch_all(text(_EN_QUERY), {"limit": log_top_n})
- if not rows:
- return None, None
- return _picked_meme_dict(rows[0]), _build_decision_log("tgchannelen", 2, rows)
+ return await _get_next_meme_for_channel(
+ "tgchannelen",
+ log_top_n=log_top_n,
+ score_version=2,
+ )
async def get_next_share_max_meme_for_tgchannelru(
diff --git a/src/database.py b/src/database.py
index 365a505d..c59fc79c 100644
--- a/src/database.py
+++ b/src/database.py
@@ -1,6 +1,6 @@
import asyncio
import uuid
-from typing import Any
+from typing import Any, Awaitable, Callable, TypeVar
from sqlalchemy import (
BigInteger,
@@ -27,7 +27,7 @@
text,
)
from sqlalchemy.dialects.postgresql import JSONB
-from sqlalchemy.ext.asyncio import create_async_engine
+from sqlalchemy.ext.asyncio import AsyncConnection, create_async_engine
from sqlalchemy.pool import NullPool
from src.config import settings
@@ -41,6 +41,8 @@
DATABASE_URL = str(settings.DATABASE_URL)
+T = TypeVar("T")
+
_engine_kwargs: dict = dict(
connect_args={
"prepared_statement_name_func": lambda: f"__asyncpg_{uuid.uuid4()}__",
@@ -882,3 +884,25 @@ async def execute(
raise
# Unreachable — loop always returns or raises
raise RuntimeError("execute retry loop exhausted without returning") # pragma: no cover
+
+
+async def run_in_transaction(fn: Callable[[AsyncConnection], Awaitable[T]]) -> T:
+    """Run several DB statements in one transaction with the standard retry policy.
+
+    ``fn`` receives the connection opened by ``engine.begin()``; per SQLAlchemy,
+    that block commits when ``fn`` returns and rolls back when it raises. On a
+    retry ``fn`` is called again from scratch on a new connection, so it must
+    be safe to re-run.
+
+    Transient connection errors are retried while fewer than
+    ``_TRANSIENT_MAX_RETRIES`` of them have been retried, and deadlocks while
+    ``attempt < _DEADLOCK_MAX_RETRIES``; both back off exponentially and share
+    one cap of ``_max_attempts`` attempts.
+
+    Returns:
+        Whatever ``fn`` returns.
+
+    Raises:
+        The exception of the last attempt, once it is not retryable or the
+        attempt cap has been reached.
+    """
+    _DEADLOCK_MAX_RETRIES = 2
+    _max_attempts = max(_TRANSIENT_MAX_RETRIES, _DEADLOCK_MAX_RETRIES) + 1
+    _transient_attempts = 0
+
+    for attempt in range(_max_attempts):
+        try:
+            async with engine.begin() as conn:
+                return await fn(conn)
+        except Exception as exc:
+            # Never sleep-and-continue on the final attempt. With a mix of
+            # deadlock and transient failures the transient budget can still
+            # have room here, and falling out of the loop would replace the
+            # real database error with the generic RuntimeError below.
+            if attempt == _max_attempts - 1:
+                raise
+            if _is_transient_connection_error(exc) and _transient_attempts < _TRANSIENT_MAX_RETRIES:
+                await asyncio.sleep(0.025 * (2**_transient_attempts))
+                _transient_attempts += 1
+                continue
+            if _is_deadlock_error(exc) and attempt < _DEADLOCK_MAX_RETRIES:
+                await asyncio.sleep(0.1 * (2**attempt))
+                continue
+            raise
+    # Unreachable: every iteration returns, retries, or raises, and the last one cannot retry.
+    raise RuntimeError("transaction retry loop exhausted without returning")  # pragma: no cover
diff --git a/src/flows/rewards/uploaded_memes.py b/src/flows/rewards/uploaded_memes.py
index 81c8edbc..71c46557 100644
--- a/src/flows/rewards/uploaded_memes.py
+++ b/src/flows/rewards/uploaded_memes.py
@@ -26,6 +26,8 @@
from src.tgbot.handlers.treasury.payments import pay_if_not_paid_with_alert
from src.tgbot.logs import log
+REWARD_ALBUM_SCORE_VERSION = 0
+
"""
1. Get all uploaded memes this week.
2. Calculate some stats:
@@ -58,72 +60,39 @@ def _meme_dict_to_input_media(m: dict):
raise Exception(f"Can't get meme type from: {m}")
-@flow(
- name="Reward RU users for weekly top uploaded memes",
- retries=1,
- retry_delay_seconds=60,
- timeout_seconds=300,
- on_failure=[notify_telegram_on_failure],
+REWARD_TRX_TYPES = (
+ TrxType.UPLOADER_TOP_WEEKLY_1,
+ TrxType.UPLOADER_TOP_WEEKLY_2,
+ TrxType.UPLOADER_TOP_WEEKLY_3,
+ TrxType.UPLOADER_TOP_WEEKLY_4,
+ TrxType.UPLOADER_TOP_WEEKLY_5,
)
-async def reward_ru_users_for_weekly_top_uploaded_memes():
- logger = get_run_logger()
- logger.info("Going to reward users for weekly top uploaded memes")
-
- uploaded_memes = await get_all_uploaded_memes_weekly_ru()
- logger.info(f"Received {len(uploaded_memes)} uploaded memes")
-
- if len(uploaded_memes) < 5:
- await log(f"Not enough memes to reward users: only {len(uploaded_memes)}")
- return
-
- nuploaded = len(uploaded_memes)
- nusers = len(set(m["author_id"] for m in uploaded_memes))
- views = sum(m["nmemes_sent"] for m in uploaded_memes)
- likes = sum(m["nlikes"] for m in uploaded_memes)
- dislikes = sum(m["ndislikes"] for m in uploaded_memes)
- avg_like = likes / (likes + dislikes) if likes + dislikes > 0 else 0
- logger.info(f"Uploaded: {nuploaded} by {nusers}, views: {views}, like%: {avg_like}")
- today = datetime.today().date().strftime("%Y-%m-%d")
- ###########################
- # reward top authors
+def _like_rate(meme: dict) -> float:
+    """Return likes / (likes + dislikes) for ``meme``, or 0 when it has no reactions."""
+    likes = meme["nlikes"]
+    total = likes + meme["ndislikes"]
+    if total > 0:
+        return likes / total
+    return 0
- top_memes = sorted(
- uploaded_memes,
- key=lambda m: (
- m["nlikes"] / (m["nlikes"] + m["ndislikes"]) if m["nlikes"] + m["ndislikes"] > 0 else 0
- ),
- reverse=True,
- )[:5]
- for i, top_meme in enumerate(top_memes):
- if i == 0:
- type = TrxType.UPLOADER_TOP_WEEKLY_1
- elif i == 1:
- type = TrxType.UPLOADER_TOP_WEEKLY_2
- elif i == 2:
- type = TrxType.UPLOADER_TOP_WEEKLY_3
- elif i == 3:
- type = TrxType.UPLOADER_TOP_WEEKLY_4
- elif i == 4:
- type = TrxType.UPLOADER_TOP_WEEKLY_5
- else:
- continue
+def _like_percent(likes: int, dislikes: int) -> int:
+    """Return the share of likes among all reactions as a rounded whole percent.
+
+    Yields 0 when there are no reactions at all, avoiding a division by zero.
+    """
+    reactions = likes + dislikes
+    if not reactions:
+        return 0
+    return round(likes * 100.0 / reactions)
- await pay_if_not_paid_with_alert(
- bot,
- top_meme["author_id"],
- type,
- external_id=today,
- )
- if top_meme["status"] != MemeStatus.PUBLISHED:
- await update_meme(top_meme["meme_id"], status=MemeStatus.PUBLISHED)
+def _top_uploaded_memes(uploaded_memes: list[dict], limit: int = 5) -> list[dict]:
+    """Return up to ``limit`` memes with the highest like rate, best first.
+
+    The input list is not modified; ties keep their original relative order.
+    """
+    ranked = list(uploaded_memes)
+    ranked.sort(key=_like_rate, reverse=True)
+    return ranked[:limit]
- # send message to tgchannelru
- channel_text = f"""
+def _ru_channel_text(
+ top_memes: list[dict],
+ *,
+ uploaded_count: int,
+ user_count: int,
+ views: int,
+ likes: int,
+ dislikes: int,
+) -> str:
+ return f"""
🏆 ТОП-5 загруженных мемов недели
🥇 - {top_memes[0]["nickname"] or "???"}
@@ -132,187 +101,206 @@ async def reward_ru_users_for_weekly_top_uploaded_memes():
🏅 - {top_memes[3]["nickname"] or "???"}
🏅 - {top_memes[4]["nickname"] or "???"}
-📥 Загружено мемов: {nuploaded}
-👤 Пользователями: {nusers}
+📥 Загружено мемов: {uploaded_count}
+👤 Пользователями: {user_count}
👁️ Просмотры: {views}
-👍 Доля лайков: {round(likes * 100.0 / (likes + dislikes))}%
+👍 Доля лайков: {_like_percent(likes, dislikes)}%
Перешли топ мем в бота → выиграй до 500 🍔
""" # noqa
- ms = await bot.send_media_group(
- TELEGRAM_CHANNEL_RU_CHAT_ID,
- [_meme_dict_to_input_media(m) for m in top_memes],
- caption=channel_text,
- parse_mode="HTML",
- )
- # log_meme_sent failures must NOT propagate — Prefect would retry the flow
- # and re-publish the album publicly. Missing one diversity-cap row is the
- # smaller harm; the safe block below mirrors the author-notify pattern.
- for i, top_meme in enumerate(top_memes):
- try:
- await log_meme_sent(
- top_meme["meme_id"],
- channel=Channel.TG_CHANNEL_RU,
- telegram_message_id=ms[i].id,
- )
- except Exception as e:
- logger.error(f"Failed to log meme_sent for {top_meme['meme_id']}: {e}")
+def _en_channel_text(
+ top_memes: list[dict],
+ *,
+ uploaded_count: int,
+ user_count: int,
+ views: int,
+ likes: int,
+ dislikes: int,
+) -> str:
+ return f"""
+🏆 Best uploaded memes of a week
- message_link = f"{TELEGRAM_CHANNEL_RU_LINK}/{ms[0].id}"
+🥇 - {top_memes[0]["nickname"] or "???"}
+🥈 - {top_memes[1]["nickname"] or "???"}
+🥉 - {top_memes[2]["nickname"] or "???"}
+🏅 - {top_memes[3]["nickname"] or "???"}
+🏅 - {top_memes[4]["nickname"] or "???"}
- # send message to authors
+📥 uploaded memes: {uploaded_count}
+👤 by users: {user_count}
+👁️ views: {views}
+👍 like %: {_like_percent(likes, dislikes)}%
- author_ids = set(m["author_id"] for m in top_memes)
- logger.info(f"Going to notify {len(author_ids)} authors about rewards")
- for author_id in author_ids:
- user_uploaded_memes = [m for m in uploaded_memes if m["author_id"] == author_id]
- likes = sum(m["nlikes"] for m in user_uploaded_memes)
- dislikes = sum(m["ndislikes"] for m in user_uploaded_memes)
- like_prc = round(likes * 100.0 / (likes + dislikes)) if likes + dislikes else 0
- views = sum(m["nmemes_sent"] for m in uploaded_memes)
+Forward top meme to our bot → win up to 500 🍔
+ """ # noqa
- user_text = f"""
+
+def _ru_user_text(
+ user_uploaded_memes: list[dict],
+ *,
+ views: int,
+ like_percent: int,
+ message_link: str,
+) -> str:
+ return f"""
Стата по загруженным тобой мемам:
📥 Загружено мемов: {len(user_uploaded_memes)}
👁️ Просмотры: {views}
-👍 Доля лайков: {like_prc}%
+👍 Доля лайков: {like_percent}%
Смотри топ-5 мемов недели в нашем канале: {message_link}
"""
- try:
- await bot.send_message(author_id, user_text)
- except Exception as e:
- logger.error(f"Failed to send message to {author_id}: {e}")
- await asyncio.sleep(2)
+def _en_user_text(
+ user_uploaded_memes: list[dict],
+ *,
+ views: int,
+ like_percent: int,
+ message_link: str,
+) -> str:
+ return f"""
+Your stats for uploaded memes:
+📥 Uploaded memes: {len(user_uploaded_memes)}
+👁️ Views: {views}
+👍 Like %: {like_percent}%
-@flow(
- name="Reward EN users for weekly top uploaded memes",
- retries=1,
- retry_delay_seconds=60,
- timeout_seconds=300,
- on_failure=[notify_telegram_on_failure],
-)
-async def reward_en_users_for_weekly_top_uploaded_memes():
- logger = get_run_logger()
- logger.info("Going to reward users for weekly top uploaded memes")
+Check out top-5 uploaded memes of the week in our channel: {message_link}
+ """
- uploaded_memes = await get_all_uploaded_memes_weekly_en()
- logger.info(f"Received {len(uploaded_memes)} uploaded memes")
+
+async def _reward_users_for_weekly_top_uploaded_memes(
+ *,
+ uploaded_memes: list[dict],
+ channel: Channel,
+ channel_chat_id: int,
+ channel_link: str,
+ channel_text_builder,
+ user_text_builder,
+) -> None:
+ logger = get_run_logger()
+ logger.info("Received %d uploaded memes", len(uploaded_memes))
if len(uploaded_memes) < 5:
await log(f"Not enough memes to reward users: only {len(uploaded_memes)}")
return
- nuploaded = len(uploaded_memes)
- nusers = len(set(m["author_id"] for m in uploaded_memes))
+ uploaded_count = len(uploaded_memes)
+ user_count = len({m["author_id"] for m in uploaded_memes})
views = sum(m["nmemes_sent"] for m in uploaded_memes)
likes = sum(m["nlikes"] for m in uploaded_memes)
dislikes = sum(m["ndislikes"] for m in uploaded_memes)
- avg_like = likes / (likes + dislikes) if likes + dislikes > 0 else 0
+ logger.info(
+ "Uploaded: %d by %d, views: %d, like%%: %.3f",
+ uploaded_count,
+ user_count,
+ views,
+ _like_percent(likes, dislikes) / 100,
+ )
- logger.info(f"Uploaded: {nuploaded} by {nusers}, views: {views}, like%: {avg_like}")
today = datetime.today().date().strftime("%Y-%m-%d")
-
- ###########################
- # reward top authors
-
- top_memes = sorted(
- uploaded_memes,
- key=lambda m: (
- m["nlikes"] / (m["nlikes"] + m["ndislikes"]) if m["nlikes"] + m["ndislikes"] > 0 else 0
- ),
- reverse=True,
- )[:5]
+ top_memes = _top_uploaded_memes(uploaded_memes)
for i, top_meme in enumerate(top_memes):
- if i == 0:
- type = TrxType.UPLOADER_TOP_WEEKLY_1
- elif i == 1:
- type = TrxType.UPLOADER_TOP_WEEKLY_2
- elif i == 2:
- type = TrxType.UPLOADER_TOP_WEEKLY_3
- elif i == 3:
- type = TrxType.UPLOADER_TOP_WEEKLY_4
- elif i == 4:
- type = TrxType.UPLOADER_TOP_WEEKLY_5
- else:
- continue
-
await pay_if_not_paid_with_alert(
bot,
top_meme["author_id"],
- type,
+ REWARD_TRX_TYPES[i],
external_id=today,
)
if top_meme["status"] != MemeStatus.PUBLISHED:
await update_meme(top_meme["meme_id"], status=MemeStatus.PUBLISHED)
- # send message to tgchannelen
-
- channel_text = f"""
-🏆 Best uploaded memes of a week
-
-🥇 - {top_memes[0]["nickname"] or "???"}
-🥈 - {top_memes[1]["nickname"] or "???"}
-🥉 - {top_memes[2]["nickname"] or "???"}
-🏅 - {top_memes[3]["nickname"] or "???"}
-🏅 - {top_memes[4]["nickname"] or "???"}
-
-📥 uploaded memes: {nuploaded}
-👤 by users: {nusers}
-👁️ views: {views}
-👍 like %: {round(likes * 100.0 / (likes + dislikes))}%
-
-Forward top meme to our bot → win up to 500 🍔
- """ # noqa
-
- ms = await bot.send_media_group(
- TELEGRAM_CHANNEL_EN_CHAT_ID,
+ channel_text = channel_text_builder(
+ top_memes,
+ uploaded_count=uploaded_count,
+ user_count=user_count,
+ views=views,
+ likes=likes,
+ dislikes=dislikes,
+ )
+ messages = await bot.send_media_group(
+ channel_chat_id,
[_meme_dict_to_input_media(m) for m in top_memes],
caption=channel_text,
parse_mode="HTML",
)
- # log_meme_sent failures must NOT propagate (see RU flow above for context).
+ # log_meme_sent failures must NOT propagate — Prefect would retry the flow
+ # and re-publish the album publicly. Missing one diversity-cap row is the
+ # smaller harm; the safe block below mirrors the author-notify pattern.
for i, top_meme in enumerate(top_memes):
try:
await log_meme_sent(
top_meme["meme_id"],
- channel=Channel.TG_CHANNEL_EN,
- telegram_message_id=ms[i].id,
+ channel=channel,
+ telegram_message_id=messages[i].id,
+ caption_text=channel_text if i == 0 else None,
+ score_version=REWARD_ALBUM_SCORE_VERSION,
)
except Exception as e:
logger.error(f"Failed to log meme_sent for {top_meme['meme_id']}: {e}")
- message_link = f"{TELEGRAM_CHANNEL_EN_LINK}/{ms[0].id}"
-
- # send message to authors
-
- author_ids = set(m["author_id"] for m in top_memes)
- logger.info(f"Going to notify {len(author_ids)} authors about rewards")
+ message_link = f"{channel_link}/{messages[0].id}"
+ author_ids = {m["author_id"] for m in top_memes}
+ logger.info("Going to notify %d authors about rewards", len(author_ids))
for author_id in author_ids:
user_uploaded_memes = [m for m in uploaded_memes if m["author_id"] == author_id]
- likes = sum(m["nlikes"] for m in user_uploaded_memes)
- dislikes = sum(m["ndislikes"] for m in user_uploaded_memes)
- like_prc = round(likes * 100.0 / (likes + dislikes)) if likes + dislikes else 0
- views = sum(m["nmemes_sent"] for m in uploaded_memes)
-
- user_text = f"""
-Your stats for uploaded memes:
-📥 Uploaded memes: {len(user_uploaded_memes)}
-👁️ Views: {views}
-👍 Like %: {like_prc}%
-
-Check out top-5 uploaded memes of the week in our channel: {message_link}
- """
+ user_likes = sum(m["nlikes"] for m in user_uploaded_memes)
+ user_dislikes = sum(m["ndislikes"] for m in user_uploaded_memes)
+ user_views = sum(m["nmemes_sent"] for m in user_uploaded_memes)
+ user_text = user_text_builder(
+ user_uploaded_memes,
+ views=user_views,
+ like_percent=_like_percent(user_likes, user_dislikes),
+ message_link=message_link,
+ )
try:
await bot.send_message(author_id, user_text)
except Exception as e:
logger.error(f"Failed to send message to {author_id}: {e}")
await asyncio.sleep(2)
+
+
+# Prefect flow for the RU channel: loads this week's RU uploads and hands them,
+# together with the RU channel constants and RU text builders, to the shared
+# reward routine. Documented with comments rather than a docstring so the
+# flow's Prefect description stays unchanged.
+@flow(
+    name="Reward RU users for weekly top uploaded memes",
+    retries=1,
+    retry_delay_seconds=60,
+    timeout_seconds=300,
+    on_failure=[notify_telegram_on_failure],
+)
+async def reward_ru_users_for_weekly_top_uploaded_memes():
+    get_run_logger().info("Going to reward users for weekly top uploaded memes")
+    weekly_uploads = await get_all_uploaded_memes_weekly_ru()
+    await _reward_users_for_weekly_top_uploaded_memes(
+        uploaded_memes=weekly_uploads,
+        channel=Channel.TG_CHANNEL_RU,
+        channel_chat_id=TELEGRAM_CHANNEL_RU_CHAT_ID,
+        channel_link=TELEGRAM_CHANNEL_RU_LINK,
+        channel_text_builder=_ru_channel_text,
+        user_text_builder=_ru_user_text,
+    )
+
+
+# Prefect flow for the EN channel: loads this week's EN uploads and hands them,
+# together with the EN channel constants and EN text builders, to the shared
+# reward routine. Documented with comments rather than a docstring so the
+# flow's Prefect description stays unchanged.
+@flow(
+    name="Reward EN users for weekly top uploaded memes",
+    retries=1,
+    retry_delay_seconds=60,
+    timeout_seconds=300,
+    on_failure=[notify_telegram_on_failure],
+)
+async def reward_en_users_for_weekly_top_uploaded_memes():
+    get_run_logger().info("Going to reward users for weekly top uploaded memes")
+    weekly_uploads = await get_all_uploaded_memes_weekly_en()
+    await _reward_users_for_weekly_top_uploaded_memes(
+        uploaded_memes=weekly_uploads,
+        channel=Channel.TG_CHANNEL_EN,
+        channel_chat_id=TELEGRAM_CHANNEL_EN_CHAT_ID,
+        channel_link=TELEGRAM_CHANNEL_EN_LINK,
+        channel_text_builder=_en_channel_text,
+        user_text_builder=_en_user_text,
+    )
diff --git a/src/flows/storage/describe_memes.py b/src/flows/storage/describe_memes.py
index 02c91e58..16674c49 100644
--- a/src/flows/storage/describe_memes.py
+++ b/src/flows/storage/describe_memes.py
@@ -26,532 +26,30 @@
import asyncio
import base64
-import json
-import re
import time
-from datetime import datetime, timezone
-import httpx
from prefect import flow, get_run_logger
from src.config import settings
-from src.database import execute, fetch_all, fetch_one, meme
from src.flows.events import safe_emit
from src.flows.hooks import notify_telegram_on_failure
-from src.redis import redis_client
-from src.storage.upload import download_meme_content_from_tg
-
-OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1"
-OPENROUTER_FREE_DAILY_REQUEST_LIMIT = 1000
-OPENROUTER_FREE_DAILY_REQUEST_BUDGET = 900
-OPENROUTER_FREE_REQUEST_COUNTER_TTL_SECONDS = 60 * 60 * 48
-OPENROUTER_FREE_STATS_TTL_SECONDS = 60 * 60 * 24 * 14
-OPENROUTER_DEFAULT_RATE_LIMIT_COOLDOWN_SECONDS = 60 * 15
-OPENROUTER_MAX_RATE_LIMIT_COOLDOWN_SECONDS = 60 * 60
-OPENROUTER_TRANSIENT_MODEL_COOLDOWN_SECONDS = 60 * 15
-OPENROUTER_FORBIDDEN_MODEL_COOLDOWN_SECONDS = 60 * 60 * 6
-
-# FREE models only. Never add paid models here — spending balance below $0
-# blocks ALL models (even free ones) with HTTP 402. Free tier requires $10+
-# lifetime purchases for 1,000 req/day (vs 50/day without).
-# See specs/describe-memes.md for full OpenRouter constraints.
-#
-# Verified available on OpenRouter API as of 2026-05-17.
-# Ordered by preference. Falls back to next model on 429/403/timeout/bad response.
-# Transient failures set Redis cooldowns so later memes/runs try other free models.
-VISION_MODELS = [
- "google/gemma-4-31b-it:free", # 262k context, primary
- "google/gemma-4-26b-a4b-it:free", # 262k context, MoE variant
- # Gemma 3 free vision fallbacks are no longer listed by OpenRouter.
- # nvidia/nemotron-nano-12b-v2-vl:free removed — returns 504s and invalid
- # JSON/empty content (see specs/describe-memes.md).
-]
-
-DESCRIBE_PROMPT = (
- "You are analyzing a meme image. Extract the following:\n\n"
- "1. OCR_TEXT: ALL text visible in the image, exactly as written. "
- "Preserve original language and line breaks. "
- "If no text, return empty string.\n\n"
- "2. DESCRIPTION: Describe the meme in 1-3 sentences in English. "
- "What's happening visually? What's the joke? "
- "Be specific (panels, characters, reactions, meme format).\n\n"
- "3. LANGUAGE: Primary language of the meme text as ISO 639-1 code "
- '(e.g. "ru", "en"). If no text, return "en". '
- "If mixed, return dominant language.\n\n"
- "Respond with ONLY valid JSON, no markdown fences:\n"
- '{"ocr_text": "...", "description": "...", "language": "..."}'
+from src.flows.storage.describe_memes_repository import (
+ get_memes_to_describe,
+ increment_describe_failures,
+ save_meme_description,
)
+from src.flows.storage.openrouter_vision import (
+ ALL_FAILED,
+ DAILY_BUDGET_EXHAUSTED,
+ QUOTA_EXHAUSTED,
+ RATE_LIMITED,
+ VISION_MODELS,
+ call_openrouter_vision,
+)
+from src.storage.deduplication import deduplicate_described_meme
+from src.storage.upload import download_meme_content_from_tg
-# Sentinel return values from call_openrouter_vision
-RATE_LIMITED = "__rate_limited"
-ALL_FAILED = "__all_failed"
-QUOTA_EXHAUSTED = "__quota_exhausted"
-DAILY_BUDGET_EXHAUSTED = "__daily_budget_exhausted"
-
-
-class UnsafeOpenRouterModelError(ValueError):
- """Raised when a non-free OpenRouter model is configured."""
-
-
-def _validate_free_vision_models(model_ids: list[str]) -> None:
- paid_model_ids = [model_id for model_id in model_ids if not model_id.endswith(":free")]
- if paid_model_ids:
- raise UnsafeOpenRouterModelError(
- "OpenRouter paid models are forbidden in VISION_MODELS: " + ", ".join(paid_model_ids)
- )
-
-
-def _validate_openrouter_free_budget() -> None:
- if OPENROUTER_FREE_DAILY_REQUEST_BUDGET >= OPENROUTER_FREE_DAILY_REQUEST_LIMIT:
- raise ValueError(
- "OpenRouter local safety budget must stay below the documented "
- f"{OPENROUTER_FREE_DAILY_REQUEST_LIMIT}/day free-model cap"
- )
-
-
-def _openrouter_free_request_counter_key(now: datetime | None = None) -> str:
- now = now or datetime.now(timezone.utc)
- return f"openrouter:free_requests:{now.date().isoformat()}"
-
-
-def _openrouter_stats_key(now: datetime | None = None) -> str:
- now = now or datetime.now(timezone.utc)
- return f"openrouter:free_ocr_stats:{now.strftime('%Y-%m-%d:%H')}"
-
-
-def _openrouter_model_cooldown_key(model_id: str) -> str:
- return f"openrouter:free_model_cooldown:{model_id}"
-
-
-def _normalize_retry_after(raw_retry_after: float | None) -> float | None:
- if raw_retry_after is None:
- return None
- if raw_retry_after > 60 * 60 * 24:
- return max(0.0, raw_retry_after - time.time())
- return max(0.0, raw_retry_after)
-
-
-def _rate_limit_cooldown_seconds(raw_retry_after: float | None) -> int:
- retry_after = _normalize_retry_after(raw_retry_after)
- if retry_after is None:
- return OPENROUTER_DEFAULT_RATE_LIMIT_COOLDOWN_SECONDS
- return int(
- min(
- max(retry_after, 60.0),
- OPENROUTER_MAX_RATE_LIMIT_COOLDOWN_SECONDS,
- )
- )
-
-
-_RESERVE_OPENROUTER_FREE_REQUEST_LUA = """
-local current = tonumber(redis.call("GET", KEYS[1]) or "0")
-local budget = tonumber(ARGV[1])
-if current >= budget then
- return {0, current}
-end
-
-current = redis.call("INCR", KEYS[1])
-if current == 1 then
- redis.call("EXPIRE", KEYS[1], tonumber(ARGV[2]))
-end
-
-return {1, current}
-"""
-
-
-async def _reserve_openrouter_free_request(log) -> tuple[bool, int]:
- """Reserve one daily free-model request attempt.
-
- OpenRouter counts failed attempts toward the daily free quota, so we reserve
- before every model attempt, including fallbacks. If Redis is unavailable, fail
- closed and do not call OpenRouter.
- """
- key = _openrouter_free_request_counter_key()
- try:
- reserved, used_today = await redis_client.eval(
- _RESERVE_OPENROUTER_FREE_REQUEST_LUA,
- 1,
- key,
- OPENROUTER_FREE_DAILY_REQUEST_BUDGET,
- OPENROUTER_FREE_REQUEST_COUNTER_TTL_SECONDS,
- )
- return bool(int(reserved)), int(used_today)
- except Exception as e:
- log.error("OpenRouter quota guard failed via Redis; refusing request: %s", e)
- return False, -1
-
-
-async def _record_openrouter_metric(model_id: str, outcome: str) -> None:
- key = _openrouter_stats_key()
- field = f"{model_id}:{outcome}"
- try:
- async with redis_client.pipeline(transaction=True) as pipe:
- await pipe.hincrby(key, field, 1)
- await pipe.expire(key, OPENROUTER_FREE_STATS_TTL_SECONDS)
- await pipe.execute()
- except Exception:
- pass
-
-
-async def _get_openrouter_model_cooldown(model_id: str) -> int:
- try:
- ttl = await redis_client.ttl(_openrouter_model_cooldown_key(model_id))
- except Exception:
- return 0
- return int(ttl) if ttl and ttl > 0 else 0
-
-
-async def _cool_down_openrouter_model(model_id: str, seconds: int, reason: str) -> None:
- try:
- await redis_client.set(
- _openrouter_model_cooldown_key(model_id),
- reason,
- ex=max(1, int(seconds)),
- )
- except Exception:
- pass
-
-
-async def _cool_down_transient_openrouter_model(model_id: str, reason: str) -> float:
- await _cool_down_openrouter_model(
- model_id,
- OPENROUTER_TRANSIENT_MODEL_COOLDOWN_SECONDS,
- reason,
- )
- return float(OPENROUTER_TRANSIENT_MODEL_COOLDOWN_SECONDS)
-
-
-_validate_free_vision_models(VISION_MODELS)
-_validate_openrouter_free_budget()
-
-
-async def get_memes_to_describe(limit: int = 30) -> list[dict]:
- """Get image memes without descriptions.
-
- Priority order:
- 1. Recently uploaded memes (last 24h) — enables dedup for user uploads
- 2. Most liked memes — improves Wrapped coverage
-
- Skips memes that have failed 3+ times (tracked in ocr_result.describe_failures).
- """
- from sqlalchemy import text
-
- query = text(
- """
- SELECT
- M.id,
- M.telegram_file_id,
- M.ocr_result,
- M.language_code
- FROM meme M
- LEFT JOIN meme_stats MS ON MS.meme_id = M.id
- LEFT JOIN meme_source SRC ON SRC.id = M.meme_source_id
- WHERE M.type = 'image'
- AND M.status = 'ok'
- AND M.telegram_file_id IS NOT NULL
- AND (
- M.ocr_result IS NULL
- OR M.ocr_result->>'description' IS NULL
- )
- AND COALESCE((M.ocr_result->>'describe_failures')::int, 0) < 3
- ORDER BY
- CASE WHEN SRC.type = 'user upload'
- AND M.created_at > now() - interval '24 hours'
- THEN 0 ELSE 1 END,
- COALESCE(MS.nlikes, 0) DESC,
- M.id DESC
- LIMIT :limit
- """
- ).bindparams(limit=limit)
-
- return await fetch_all(query)
-
-
-def _parse_vision_response(raw_content: str) -> dict:
- """Parse JSON from model response, stripping markdown fences if present.
-
- Falls back to escape-fixing and regex extraction to handle common LLM JSON issues
- (invalid escape sequences, unterminated strings from lower-quality models).
- """
- content = raw_content.strip()
- if content.startswith("```"):
- content = content.split("\n", 1)[1] if "\n" in content else content[3:]
- if content.endswith("```"):
- content = content[:-3]
- content = content.strip()
- if content.startswith("json"):
- content = content[4:].strip()
-
- # 1. Standard parse
- try:
- return json.loads(content)
- except json.JSONDecodeError:
- pass
-
- # 2. Fix invalid escape sequences (e.g. \' or \k not valid in JSON)
- try:
- fixed = re.sub(r'\\(?!["\\/bfnrtu])', r"\\\\", content)
- return json.loads(fixed)
- except (json.JSONDecodeError, Exception):
- pass
-
- # 3. Regex extraction — last resort for severely malformed output
- result = {}
- for key in ("ocr_text", "description", "language"):
- match = re.search(rf'"{key}"\s*:\s*"((?:[^"\\]|\\.)*)"', content, re.DOTALL)
- if match:
- try:
- result[key] = json.loads(f'"{match.group(1)}"')
- except json.JSONDecodeError:
- result[key] = match.group(1)
- if result:
- return result
-
- raise json.JSONDecodeError("Could not parse model response", content, 0)
-
-
-def _parse_retry_after(response: httpx.Response) -> float | None:
- """Extract retry delay from Retry-After header or response body."""
- header = response.headers.get("retry-after") or response.headers.get("x-ratelimit-reset")
- if header:
- try:
- return float(header)
- except ValueError:
- pass
- try:
- body = response.json()
- if "error" in body and "metadata" in body["error"]:
- reset = body["error"]["metadata"].get("ratelimit_reset")
- if reset:
- return float(reset)
- except Exception:
- pass
- return None
-
-
-async def call_openrouter_vision(image_b64: str, log, *, deadline: float | None = None) -> dict:
- """Call OpenRouter vision model with fallback chain.
-
- Args:
- deadline: monotonic timestamp after which we stop trying models.
-
- Returns:
- dict with result on success, or {RATE_LIMITED: True} / {ALL_FAILED: True}
- """
- headers = {
- "Authorization": f"Bearer {settings.OPENROUTER_API_KEY}",
- "Content-Type": "application/json",
- }
-
- next_retry_after: float | None = None
- tried_models = 0
-
- async with httpx.AsyncClient(timeout=30.0) as client:
- for model_id in VISION_MODELS:
- if not model_id.endswith(":free"):
- raise UnsafeOpenRouterModelError(f"Refusing non-free OpenRouter model: {model_id}")
-
- cooldown_ttl = await _get_openrouter_model_cooldown(model_id)
- if cooldown_ttl > 0:
- log.info(
- "Skipping %s — free-model cooldown has %ss remaining.",
- model_id,
- cooldown_ttl,
- )
- if next_retry_after is None or cooldown_ttl < next_retry_after:
- next_retry_after = float(cooldown_ttl)
- continue
-
- # Stop trying more models if we're running out of time
- if deadline is not None and time.monotonic() > deadline - 35:
- log.warning("Skipping remaining models — approaching deadline")
- break
-
- payload = {
- "model": model_id,
- "messages": [
- {
- "role": "user",
- "content": [
- {"type": "text", "text": DESCRIBE_PROMPT},
- {
- "type": "image_url",
- "image_url": {
- "url": f"data:image/jpeg;base64,{image_b64}",
- },
- },
- ],
- }
- ],
- "max_tokens": 500,
- "temperature": 0.2,
- }
-
- try:
- tried_models += 1
- reserved, used_today = await _reserve_openrouter_free_request(log)
- if not reserved:
- log.warning(
- "OpenRouter free-model daily safety budget exhausted "
- "(%s/%s attempts). Refusing request.",
- used_today if used_today >= 0 else "unknown",
- OPENROUTER_FREE_DAILY_REQUEST_BUDGET,
- )
- return {DAILY_BUDGET_EXHAUSTED: True, "__used_today": used_today}
- await _record_openrouter_metric(model_id, "attempt")
-
- response = await client.post(
- f"{OPENROUTER_BASE_URL}/chat/completions",
- headers=headers,
- json=payload,
- )
-
- if response.status_code == 402:
- log.warning(
- "OpenRouter quota exhausted (HTTP 402). "
- "Balance likely below $0 — all models blocked. "
- "Check https://openrouter.ai/settings/credits"
- )
- await _record_openrouter_metric(model_id, "quota_exhausted")
- return {QUOTA_EXHAUSTED: True}
-
- if response.status_code == 429:
- raw_retry_after = _parse_retry_after(response)
- retry_after = _normalize_retry_after(raw_retry_after)
- cooldown = _rate_limit_cooldown_seconds(raw_retry_after)
- await _record_openrouter_metric(model_id, "rate_limited")
- await _cool_down_openrouter_model(model_id, cooldown, "rate_limited")
- if next_retry_after is None or cooldown < next_retry_after:
- next_retry_after = float(cooldown)
- log.info(
- "Rate-limited (429) on %s (retry-after: %ss, cooldown: %ss)",
- model_id,
- retry_after or "unknown",
- cooldown,
- )
- continue
-
- if response.status_code == 403:
- await _record_openrouter_metric(model_id, "forbidden")
- await _cool_down_openrouter_model(
- model_id,
- OPENROUTER_FORBIDDEN_MODEL_COOLDOWN_SECONDS,
- "forbidden",
- )
- log.warning("Model %s HTTP 403 (access denied), trying next...", model_id)
- continue
-
- response.raise_for_status()
-
- body = response.text.strip()
- json_start = body.find("{")
- if json_start < 0:
- await _record_openrouter_metric(model_id, "bad_response")
- retry_after = await _cool_down_transient_openrouter_model(
- model_id,
- "bad_response",
- )
- if next_retry_after is None or retry_after < next_retry_after:
- next_retry_after = retry_after
- log.warning("Model %s returned no JSON: %s", model_id, body[:100])
- continue
- data = json.loads(body[json_start:])
-
- if "choices" not in data:
- await _record_openrouter_metric(model_id, "bad_response")
- retry_after = await _cool_down_transient_openrouter_model(
- model_id,
- "bad_response",
- )
- if next_retry_after is None or retry_after < next_retry_after:
- next_retry_after = retry_after
- log.warning("Model %s no choices: %s", model_id, str(data)[:200])
- continue
-
- content = data["choices"][0]["message"]["content"]
- if not content:
- await _record_openrouter_metric(model_id, "empty_content")
- retry_after = await _cool_down_transient_openrouter_model(
- model_id,
- "empty_content",
- )
- if next_retry_after is None or retry_after < next_retry_after:
- next_retry_after = retry_after
- log.warning("Model %s empty content", model_id)
- continue
- result = _parse_vision_response(content)
-
- if "description" not in result and "ocr_text" not in result:
- await _record_openrouter_metric(model_id, "bad_json")
- retry_after = await _cool_down_transient_openrouter_model(
- model_id,
- "bad_json",
- )
- if next_retry_after is None or retry_after < next_retry_after:
- next_retry_after = retry_after
- log.warning("Model %s bad JSON: %s", model_id, str(result)[:200])
- continue
-
- result["__model"] = model_id
- await _record_openrouter_metric(model_id, "success")
- return result
-
- except json.JSONDecodeError as e:
- await _record_openrouter_metric(model_id, "invalid_json")
- retry_after = await _cool_down_transient_openrouter_model(
- model_id,
- "invalid_json",
- )
- if next_retry_after is None or retry_after < next_retry_after:
- next_retry_after = retry_after
- log.warning("Model %s invalid JSON: %s", model_id, e)
- continue
- except httpx.HTTPStatusError as e:
- await _record_openrouter_metric(model_id, f"http_{e.response.status_code}")
- if e.response.status_code >= 500:
- retry_after = await _cool_down_transient_openrouter_model(
- model_id,
- f"http_{e.response.status_code}",
- )
- if next_retry_after is None or retry_after < next_retry_after:
- next_retry_after = retry_after
- log.warning("Model %s HTTP %s", model_id, e.response.status_code)
- continue
- except (httpx.ReadTimeout, httpx.ConnectTimeout) as e:
- await _record_openrouter_metric(model_id, "timeout")
- retry_after = await _cool_down_transient_openrouter_model(model_id, "timeout")
- if next_retry_after is None or retry_after < next_retry_after:
- next_retry_after = retry_after
- log.warning("Model %s timeout: %s", model_id, type(e).__name__)
- continue
- except httpx.RequestError as e:
- await _record_openrouter_metric(model_id, "request_error")
- retry_after = await _cool_down_transient_openrouter_model(
- model_id,
- "request_error",
- )
- if next_retry_after is None or retry_after < next_retry_after:
- next_retry_after = retry_after
- log.warning("Model %s request error: %s", model_id, type(e).__name__)
- continue
- except Exception as e:
- await _record_openrouter_metric(model_id, "error")
- log.warning("Model %s error: %s", model_id, e)
- continue
-
- if tried_models == 0 or next_retry_after is not None:
- return {RATE_LIMITED: True, "__retry_after": next_retry_after}
-
- # All models exhausted on non-retryable responses.
- return {ALL_FAILED: True}
-
-
-async def _increment_describe_failures(meme_id: int, existing_ocr: dict, reason: str):
- """Track describe failures in ocr_result so permanently broken memes get skipped."""
- failures = int(existing_ocr.get("describe_failures", 0)) + 1
- merged = {**existing_ocr, "describe_failures": failures, "last_failure_reason": reason}
- update_query = meme.update().where(meme.c.id == meme_id).values(ocr_result=merged)
- await execute(update_query)
+__all__ = ["VISION_MODELS", "describe_memes_flow", "describe_single_meme"]
async def describe_single_meme(meme_row: dict, log, *, deadline: float | None = None) -> str:
@@ -568,7 +66,7 @@ async def describe_single_meme(meme_row: dict, log, *, deadline: float | None =
image_bytes = await download_meme_content_from_tg(file_id)
except Exception as e:
log.warning("Meme %s: download failed: %s", meme_id, e)
- await _increment_describe_failures(meme_id, existing_ocr, str(e))
+ await increment_describe_failures(meme_id, existing_ocr, str(e))
return "failed"
image_b64 = base64.b64encode(image_bytes).decode("utf-8")
@@ -578,11 +76,11 @@ async def describe_single_meme(meme_row: dict, log, *, deadline: float | None =
result = await call_openrouter_vision(image_b64, log, deadline=deadline)
except Exception as e:
log.warning("Meme %s: OpenRouter error: %s", meme_id, e)
- await _increment_describe_failures(meme_id, existing_ocr, str(e))
+ await increment_describe_failures(meme_id, existing_ocr, str(e))
return "failed"
if result is None:
- await _increment_describe_failures(meme_id, existing_ocr, "no result")
+ await increment_describe_failures(meme_id, existing_ocr, "no result")
return "failed"
if result.get(RATE_LIMITED):
@@ -598,54 +96,22 @@ async def describe_single_meme(meme_row: dict, log, *, deadline: float | None =
return "daily_budget_exhausted"
if result.get(ALL_FAILED):
- await _increment_describe_failures(meme_id, existing_ocr, "all models failed")
+ await increment_describe_failures(meme_id, existing_ocr, "all models failed")
return "failed"
- # Merge with existing ocr_result
- ocr_text = result.get("ocr_text", "")
- description = result.get("description", "")
- language = result.get("language", "")
- model_used = result.get("__model", VISION_MODELS[0])
-
- merged = {
- **existing_ocr,
- "model": model_used,
- "calculated_at": datetime.now(timezone.utc).isoformat(),
- "raw_result": {
- "ocr_text": ocr_text,
- "description": description,
- "language": language,
- },
- "description": description,
- }
-
- if not existing_ocr.get("text"):
- merged["text"] = ocr_text
-
- update_kwargs = {"ocr_result": merged}
-
- # Only update language_code if the detected language is one we already use
- # This ensures inner joins with user_language work correctly
- KNOWN_LANGUAGES = {
- "ru",
- "en",
- "uk",
- "es",
- "fa",
- "pl",
- "hi",
- "am",
- "de",
- "fr",
- "pt-br",
- "ar",
- "uz",
- }
- if language and language.lower() in KNOWN_LANGUAGES:
- update_kwargs["language_code"] = language.lower()
-
- update_query = meme.update().where(meme.c.id == meme_id).values(**update_kwargs).returning(meme)
- await fetch_one(update_query)
+ merged = await save_meme_description(meme_id, existing_ocr, result)
+ dedup_result = await deduplicate_described_meme(
+ meme_id,
+ merged.get("text", ""),
+ status=meme_row.get("status"),
+ )
+ if dedup_result.duplicate_found:
+ log.info(
+ "Meme %s resolved as OCR duplicate of %s after describe: %s",
+ meme_id,
+ dedup_result.duplicate_of,
+ dedup_result.resolution,
+ )
return "ok"
@@ -719,7 +185,7 @@ async def describe_memes_flow(batch_size: int = 20) -> None:
i + 1,
len(memes),
)
- await _increment_describe_failures(
+ await increment_describe_failures(
meme_row["id"],
meme_row["ocr_result"] or {},
f"per-meme timeout ({effective_timeout:.0f}s)",
diff --git a/src/flows/storage/describe_memes_repository.py b/src/flows/storage/describe_memes_repository.py
new file mode 100644
index 00000000..b1b924de
--- /dev/null
+++ b/src/flows/storage/describe_memes_repository.py
@@ -0,0 +1,111 @@
+from datetime import datetime, timezone
+from typing import Any
+
+from sqlalchemy import text
+
+from src.database import execute, fetch_all, fetch_one, meme
+from src.flows.storage.openrouter_vision import VISION_MODELS
+
+KNOWN_LANGUAGES = {  # the only codes save_meme_description may write to meme.language_code
+    "ru",
+    "en",
+    "uk",
+    "es",
+    "fa",
+    "pl",
+    "hi",
+    "am",
+    "de",
+    "fr",
+    "pt-br",
+    "ar",
+    "uz",
+}
+
+
+async def get_memes_to_describe(limit: int = 30) -> list[dict[str, Any]]:
+    """Return up to `limit` image memes with status 'ok' that still lack a description.
+
+    Priority order:
+    1. User uploads created in the last 24h — enables dedup for user uploads
+    2. Most liked memes, then highest id — improves Wrapped coverage
+
+    Skips memes that have failed 3+ times (tracked in ocr_result.describe_failures).
+    """
+    query = text(
+        """
+        SELECT
+            M.id,
+            M.telegram_file_id,
+            M.ocr_result,
+            M.status,
+            M.language_code
+        FROM meme M
+        LEFT JOIN meme_stats MS ON MS.meme_id = M.id
+        LEFT JOIN meme_source SRC ON SRC.id = M.meme_source_id
+        WHERE M.type = 'image'
+            AND M.status = 'ok'
+            AND M.telegram_file_id IS NOT NULL
+            AND (
+                M.ocr_result IS NULL
+                OR M.ocr_result->>'description' IS NULL
+            )
+            AND COALESCE((M.ocr_result->>'describe_failures')::int, 0) < 3
+        ORDER BY
+            CASE WHEN SRC.type = 'user upload'
+                AND M.created_at > now() - interval '24 hours'
+                THEN 0 ELSE 1 END,
+            COALESCE(MS.nlikes, 0) DESC,
+            M.id DESC
+        LIMIT :limit
+    """
+    ).bindparams(limit=limit)
+
+    return await fetch_all(query)
+
+
+async def increment_describe_failures(
+    meme_id: int,
+    existing_ocr: dict[str, Any],
+    reason: str,
+) -> None:
+    """Bump ocr_result.describe_failures and store the reason for the latest failure."""
+    previous_failures = int(existing_ocr.get("describe_failures", 0))
+    updated_ocr = dict(existing_ocr, describe_failures=previous_failures + 1)
+    updated_ocr["last_failure_reason"] = reason
+    await execute(meme.update().where(meme.c.id == meme_id).values(ocr_result=updated_ocr))
+
+
+async def save_meme_description(
+    meme_id: int,
+    existing_ocr: dict[str, Any],
+    result: dict[str, Any],
+) -> dict[str, Any]:
+    """Merge a vision-model result into ocr_result, persist it, return the merge.
+
+    Existing OCR `text` is kept when non-empty; `language_code` is updated only
+    for KNOWN_LANGUAGES so inner joins with user_language keep working.
+    """
+    ocr_text = result.get("ocr_text", "")
+    description = result.get("description", "")
+    language = result.get("language", "")
+    raw_result = {"ocr_text": ocr_text, "description": description, "language": language}
+    merged = {
+        **existing_ocr,
+        "model": result.get("__model", VISION_MODELS[0]),
+        "calculated_at": datetime.now(timezone.utc).isoformat(),
+        "raw_result": raw_result,
+        "description": description,
+    }
+    if not existing_ocr.get("text"):
+        merged["text"] = ocr_text
+
+    update_kwargs: dict[str, Any] = {"ocr_result": merged}
+    # Models may emit `"language": null` or a non-string; never call .lower() on it.
+    language_code = language.lower() if isinstance(language, str) else ""
+    if language_code in KNOWN_LANGUAGES:
+        update_kwargs["language_code"] = language_code
+
+    update_query = meme.update().where(meme.c.id == meme_id).values(**update_kwargs).returning(meme)
+    await fetch_one(update_query)
+    return merged
diff --git a/src/flows/storage/memes.py b/src/flows/storage/memes.py
index 66b9c680..efd619fb 100644
--- a/src/flows/storage/memes.py
+++ b/src/flows/storage/memes.py
@@ -6,17 +6,18 @@
from src.flows.hooks import notify_telegram_on_failure
from src.storage import ads
from src.storage.constants import MemeStatus, MemeType
+from src.storage.deduplication import (
+ deduplicate_pending_meme,
+ sweep_file_id_duplicates,
+)
from src.storage.etl import (
etl_memes_from_raw_telegram_posts,
etl_memes_from_raw_vk_posts,
)
from src.storage.service import (
- find_meme_duplicate,
- find_meme_duplicate_by_file_id,
get_pending_memes,
get_unloaded_tg_memes,
get_unloaded_vk_memes,
- resolve_meme_duplicate,
update_meme,
update_meme_status_of_ready_memes,
)
@@ -200,28 +201,31 @@ async def final_meme_pipeline() -> None:
memes = await get_pending_memes()
logger.info(f"Final meme pipeline has {len(memes)} pending memes.")
+ processed_meme_ids = []
for meme in memes:
+ processed_meme_ids.append(meme["id"])
await analyse_meme_caption(meme)
- # exact file_id dedup: catches cross-source reposts of identical files
- if meme["telegram_file_id"]:
- dup_id = await find_meme_duplicate_by_file_id(meme["id"], meme["telegram_file_id"])
- if dup_id:
- await resolve_meme_duplicate(meme["id"], dup_id)
- continue
-
- # it's ok if there is no OCR result for videos
- if meme["ocr_result"]:
- duplicate_meme_id = await find_meme_duplicate(meme["id"], meme["ocr_result"]["text"])
- if duplicate_meme_id:
- await resolve_meme_duplicate(meme["id"], duplicate_meme_id)
- continue
+ result = await deduplicate_pending_meme(meme)
+ if result.duplicate_found:
+ logger.info(
+ "Meme %s resolved as %s duplicate of %s before ok promotion.",
+ result.meme_id,
+ result.reason,
+ result.duplicate_of,
+ )
- # next step of a pipeline
- await update_meme_status_of_ready_memes()
+ promoted_memes = await update_meme_status_of_ready_memes(processed_meme_ids)
+ file_id_duplicates = await sweep_file_id_duplicates()
+ if file_id_duplicates["resolved"]:
+ logger.info("Resolved file_id duplicates: %s", file_id_duplicates)
safe_emit(
"ff.pipeline.final.completed",
"ff.pipeline.final",
- {"memes_processed": len(memes)},
+ {
+ "memes_processed": len(memes),
+ "memes_promoted": len(promoted_memes),
+ "file_id_duplicates_resolved": file_id_duplicates["resolved"],
+ },
)
diff --git a/src/flows/storage/openrouter_vision.py b/src/flows/storage/openrouter_vision.py
new file mode 100644
index 00000000..8b07f73b
--- /dev/null
+++ b/src/flows/storage/openrouter_vision.py
@@ -0,0 +1,470 @@
+import json
+import re
+import time
+from datetime import datetime, timezone
+
+import httpx
+
+from src.config import settings
+from src.redis import redis_client
+
+OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1"
+OPENROUTER_FREE_DAILY_REQUEST_LIMIT = 1000  # documented free-model cap per day
+OPENROUTER_FREE_DAILY_REQUEST_BUDGET = 900  # local safety budget; must stay below the cap
+OPENROUTER_FREE_REQUEST_COUNTER_TTL_SECONDS = 60 * 60 * 48  # 48 hours
+OPENROUTER_FREE_STATS_TTL_SECONDS = 60 * 60 * 24 * 14  # 14 days
+OPENROUTER_DEFAULT_RATE_LIMIT_COOLDOWN_SECONDS = 60 * 15  # 15 min; used when a 429 has no retry hint
+OPENROUTER_MAX_RATE_LIMIT_COOLDOWN_SECONDS = 60 * 60  # 1 hour upper bound for 429 cooldowns
+OPENROUTER_TRANSIENT_MODEL_COOLDOWN_SECONDS = 60 * 15  # 15 min after timeouts, bad bodies, 5xx
+OPENROUTER_FORBIDDEN_MODEL_COOLDOWN_SECONDS = 60 * 60 * 6  # 6 hours after HTTP 403
+
+# FREE models only. Never add paid models here — spending balance below $0
+# blocks ALL models (even free ones) with HTTP 402. Free tier requires $10+
+# lifetime purchases for 1,000 req/day (vs 50/day without).
+# See specs/describe-memes.md for full OpenRouter constraints.
+#
+# Verified available on OpenRouter API as of 2026-05-17.
+# Ordered by preference. Falls back to next model on 429/403/timeout/bad response.
+# Transient failures set Redis cooldowns so later memes/runs try other free models.
+VISION_MODELS = [
+ "google/gemma-4-31b-it:free", # 262k context, primary
+ "google/gemma-4-26b-a4b-it:free", # 262k context, MoE variant
+ # Gemma 3 free vision fallbacks are no longer listed by OpenRouter.
+ # nvidia/nemotron-nano-12b-v2-vl:free removed — returns 504s and invalid
+ # JSON/empty content (see specs/describe-memes.md).
+]
+
+DESCRIBE_PROMPT = (
+ "You are analyzing a meme image. Extract the following:\n\n"
+ "1. OCR_TEXT: ALL text visible in the image, exactly as written. "
+ "Preserve original language and line breaks. "
+ "If no text, return empty string.\n\n"
+ "2. DESCRIPTION: Describe the meme in 1-3 sentences in English. "
+ "What's happening visually? What's the joke? "
+ "Be specific (panels, characters, reactions, meme format).\n\n"
+ "3. LANGUAGE: Primary language of the meme text as ISO 639-1 code "
+ '(e.g. "ru", "en"). If no text, return "en". '
+ "If mixed, return dominant language.\n\n"
+ "Respond with ONLY valid JSON, no markdown fences:\n"
+ '{"ocr_text": "...", "description": "...", "language": "..."}'
+)
+
+RATE_LIMITED = "__rate_limited"
+ALL_FAILED = "__all_failed"
+QUOTA_EXHAUSTED = "__quota_exhausted"
+DAILY_BUDGET_EXHAUSTED = "__daily_budget_exhausted"
+TRY_NEXT_MODEL = "__try_next_model"
+
+
+class UnsafeOpenRouterModelError(ValueError):
+    """Raised when a model id without the ":free" suffix is configured or about to be called."""
+
+
+def _validate_free_vision_models(model_ids: list[str]) -> None:
+    """Raise UnsafeOpenRouterModelError if any id lacks the ":free" suffix."""
+    offenders = [mid for mid in model_ids if not mid.endswith(":free")]
+    if offenders:
+        message = "OpenRouter paid models are forbidden in VISION_MODELS: " + ", ".join(offenders)
+        raise UnsafeOpenRouterModelError(message)
+
+
+def _validate_openrouter_free_budget() -> None:
+    """Raise ValueError unless the local daily budget is strictly below the free cap."""
+    if OPENROUTER_FREE_DAILY_REQUEST_BUDGET < OPENROUTER_FREE_DAILY_REQUEST_LIMIT:
+        return
+    cap = f"{OPENROUTER_FREE_DAILY_REQUEST_LIMIT}/day free-model cap"
+    raise ValueError("OpenRouter local safety budget must stay below the documented " + cap)
+
+
+def _openrouter_free_request_counter_key(now: datetime | None = None) -> str:
+    day = (now or datetime.now(timezone.utc)).date()  # `now` defaults to the current UTC time
+    return "openrouter:free_requests:" + day.isoformat()
+
+
+def _openrouter_stats_key(now: datetime | None = None) -> str:
+    hour_bucket = (now or datetime.now(timezone.utc)).strftime("%Y-%m-%d:%H")  # hourly bucket
+    return f"openrouter:free_ocr_stats:{hour_bucket}"
+
+
+def _openrouter_model_cooldown_key(model_id: str) -> str:
+    return f"openrouter:free_model_cooldown:{model_id}"  # Redis key whose TTL is the cooldown
+
+
+def _normalize_retry_after(raw_retry_after: float | None) -> float | None:
+    """Return a non-negative delay in seconds; values above one day are epoch stamps (s or ms)."""
+    if raw_retry_after is None:
+        return None
+    seconds = raw_retry_after / 1000.0 if raw_retry_after > 1e11 else raw_retry_after  # ms -> s
+    return max(0.0, seconds - time.time() if seconds > 60 * 60 * 24 else seconds)
+
+
+def _rate_limit_cooldown_seconds(raw_retry_after: float | None) -> int:
+    """Turn a 429 retry hint into a cooldown clamped to [60s, the configured maximum].
+
+    Falls back to the default rate-limit cooldown when the hint is missing.
+    """
+    delay = _normalize_retry_after(raw_retry_after)
+    if delay is None:
+        return OPENROUTER_DEFAULT_RATE_LIMIT_COOLDOWN_SECONDS
+    floored = max(delay, 60.0)
+    return int(min(floored, OPENROUTER_MAX_RATE_LIMIT_COOLDOWN_SECONDS))
+
+
+_RESERVE_OPENROUTER_FREE_REQUEST_LUA = """
+local current = tonumber(redis.call("GET", KEYS[1]) or "0")
+local budget = tonumber(ARGV[1])
+if current >= budget then
+ return {0, current}
+end
+
+current = redis.call("INCR", KEYS[1])
+if current == 1 then
+ redis.call("EXPIRE", KEYS[1], tonumber(ARGV[2]))
+end
+
+return {1, current}
+"""
+
+
+async def _reserve_openrouter_free_request(log) -> tuple[bool, int]:
+    """Reserve one daily free-model request attempt.
+
+    OpenRouter counts failed attempts toward the daily free quota, so we reserve
+    before every model attempt, including fallbacks. Returns (reserved, used_today);
+    if the Redis call fails we fail closed with (False, -1) and do not call OpenRouter.
+    """
+    key = _openrouter_free_request_counter_key()
+    try:
+        reserved, used_today = await redis_client.eval(
+            _RESERVE_OPENROUTER_FREE_REQUEST_LUA,
+            1,
+            key,
+            OPENROUTER_FREE_DAILY_REQUEST_BUDGET,
+            OPENROUTER_FREE_REQUEST_COUNTER_TTL_SECONDS,
+        )
+        return bool(int(reserved)), int(used_today)
+    except Exception as e:
+        log.error("OpenRouter quota guard failed via Redis; refusing request: %s", e)
+        return False, -1
+
+
+async def _record_openrouter_metric(model_id: str, outcome: str) -> None:
+    key = _openrouter_stats_key()  # one Redis hash per UTC hour
+    field = f"{model_id}:{outcome}"
+    try:
+        async with redis_client.pipeline(transaction=True) as pipe:
+            await pipe.hincrby(key, field, 1)
+            await pipe.expire(key, OPENROUTER_FREE_STATS_TTL_SECONDS)
+            await pipe.execute()
+    except Exception:
+        pass  # metrics are best-effort; a Redis failure must not break the caller
+
+
+async def _get_openrouter_model_cooldown(model_id: str) -> int:
+    try:
+        ttl = await redis_client.ttl(_openrouter_model_cooldown_key(model_id))
+    except Exception:
+        return 0  # Redis failure: treat the model as not cooling down
+    return int(ttl) if ttl and ttl > 0 else 0  # remaining seconds; non-positive TTL means none
+
+
+async def _cool_down_openrouter_model(model_id: str, seconds: int, reason: str) -> None:
+    try:
+        await redis_client.set(
+            _openrouter_model_cooldown_key(model_id),
+            reason,  # only the key's TTL is read back in this module
+            ex=max(1, int(seconds)),  # expiry of at least 1 second
+        )
+    except Exception:
+        pass  # best-effort: if the write fails, no cooldown is recorded
+
+
+async def _cool_down_transient_openrouter_model(model_id: str, reason: str) -> float:
+    """Apply the transient cooldown to `model_id` and return its length in seconds."""
+    cooldown = OPENROUTER_TRANSIENT_MODEL_COOLDOWN_SECONDS
+    await _cool_down_openrouter_model(model_id, cooldown, reason)
+
+    # Returned so callers can fold it into the retry delay they report.
+    return float(cooldown)
+
+
+def _parse_vision_response(raw_content: str) -> dict:
+    """Parse the model's JSON object, tolerating fences and invalid backslash escapes.
+
+    Tries strict JSON, then JSON with stray backslashes doubled, then a per-key regex
+    salvage. Raises json.JSONDecodeError when no JSON object or known key is found.
+    """
+    content = raw_content.strip()
+    if content.startswith("```"):
+        content = content.split("\n", 1)[1] if "\n" in content else content[3:]
+    if content.endswith("```"):
+        content = content[:-3]
+    content = content.strip()
+    if content.startswith("json"):
+        content = content[4:].strip()
+
+    for candidate in (content, re.sub(r'\\(?!["\\/bfnrtu])', r"\\\\", content)):
+        try:
+            parsed = json.loads(candidate)
+        except json.JSONDecodeError:
+            continue
+        if isinstance(parsed, dict):  # a bare string/list/number is not a usable result
+            return parsed
+
+    result = {}
+    for key in ("ocr_text", "description", "language"):
+        match = re.search(rf'"{key}"\s*:\s*"((?:[^"\\]|\\.)*)"', content, re.DOTALL)
+        if match:
+            try:
+                result[key] = json.loads(f'"{match.group(1)}"')
+            except json.JSONDecodeError:
+                result[key] = match.group(1)
+    if result:
+        return result
+    raise json.JSONDecodeError("Could not parse model response", content, 0)
+
+
+def _parse_retry_after(response: httpx.Response) -> float | None:
+    """Return the raw retry hint from the headers or the JSON error body, else None."""
+    header = response.headers.get("retry-after") or response.headers.get("x-ratelimit-reset")
+    if header:
+        try:
+            return float(header)
+        except ValueError:
+            pass  # non-numeric header value: fall through to the body
+    try:
+        body = response.json()
+        if "error" in body and "metadata" in body["error"]:
+            reset = body["error"]["metadata"].get("ratelimit_reset")
+            if reset:
+                return float(reset)
+    except Exception:
+        pass  # unparsable or unexpectedly shaped body: no hint available
+    return None
+
+
+async def call_openrouter_vision(image_b64: str, log, *, deadline: float | None = None) -> dict:
+    """Call OpenRouter vision models in preference order until one returns a usable result.
+
+    Args:
+        deadline: monotonic timestamp; no new attempt is started within 35s of it.
+
+    Returns:
+        Result dict, or a RATE_LIMITED/ALL_FAILED/QUOTA_EXHAUSTED/DAILY_BUDGET_EXHAUSTED marker.
+    """
+    headers = {
+        "Authorization": f"Bearer {settings.OPENROUTER_API_KEY}",
+        "Content-Type": "application/json",
+    }
+
+    next_retry_after: float | None = None
+    tried_models = 0
+
+    async with httpx.AsyncClient(timeout=30.0) as client:
+        for model_id in VISION_MODELS:
+            if not model_id.endswith(":free"):
+                raise UnsafeOpenRouterModelError(f"Refusing non-free OpenRouter model: {model_id}")
+
+            cooldown_ttl = await _get_openrouter_model_cooldown(model_id)
+            if cooldown_ttl > 0:
+                log.info(
+                    "Skipping %s — free-model cooldown has %ss remaining.",
+                    model_id,
+                    cooldown_ttl,
+                )
+                if next_retry_after is None or cooldown_ttl < next_retry_after:
+                    next_retry_after = float(cooldown_ttl)
+                continue
+
+            # Not enough time left for another attempt (35s guard; client timeout is 30s).
+            if deadline is not None and time.monotonic() > deadline - 35:
+                log.warning("Skipping remaining models — approaching deadline")
+                break
+
+            payload = {
+                "model": model_id,
+                "messages": [
+                    {
+                        "role": "user",
+                        "content": [
+                            {"type": "text", "text": DESCRIBE_PROMPT},
+                            {
+                                "type": "image_url",
+                                "image_url": {
+                                    "url": f"data:image/jpeg;base64,{image_b64}",
+                                },
+                            },
+                        ],
+                    }
+                ],
+                "max_tokens": 500,
+                "temperature": 0.2,
+            }
+
+            try:
+                tried_models += 1
+                reserved, used_today = await _reserve_openrouter_free_request(log)
+                if not reserved:
+                    log.warning(
+                        "OpenRouter free-model daily safety budget exhausted "
+                        "(%s/%s attempts). Refusing request.",
+                        used_today if used_today >= 0 else "unknown",
+                        OPENROUTER_FREE_DAILY_REQUEST_BUDGET,
+                    )
+                    return {DAILY_BUDGET_EXHAUSTED: True, "__used_today": used_today}
+                await _record_openrouter_metric(model_id, "attempt")
+
+                response = await client.post(
+                    f"{OPENROUTER_BASE_URL}/chat/completions",
+                    headers=headers,
+                    json=payload,
+                )
+
+                status_result = await _handle_status_response(response, model_id, log)
+                if status_result is not None:
+                    if status_result.get(TRY_NEXT_MODEL):
+                        continue
+                    if status_result.get(RATE_LIMITED):
+                        cooldown = status_result["__retry_after"]
+                        if next_retry_after is None or cooldown < next_retry_after:
+                            next_retry_after = float(cooldown)
+                        continue
+                    return status_result
+
+                response.raise_for_status()
+                result = await _parse_success_response(response, model_id, log)
+                if result is not None:
+                    return result
+                # Bad/empty body: the helper set the cooldown; keep the shortest retry hint.
+                transient = float(OPENROUTER_TRANSIENT_MODEL_COOLDOWN_SECONDS)
+                if next_retry_after is None or transient < next_retry_after:
+                    next_retry_after = transient
+                continue
+
+            except json.JSONDecodeError as e:
+                await _record_openrouter_metric(model_id, "invalid_json")
+                retry_after = await _cool_down_transient_openrouter_model(model_id, "invalid_json")
+                if next_retry_after is None or retry_after < next_retry_after:
+                    next_retry_after = retry_after
+                log.warning("Model %s invalid JSON: %s", model_id, e)
+                continue
+            except httpx.HTTPStatusError as e:
+                outcome = f"http_{e.response.status_code}"
+                await _record_openrouter_metric(model_id, outcome)
+                if e.response.status_code >= 500:
+                    retry_after = await _cool_down_transient_openrouter_model(model_id, outcome)
+                    if next_retry_after is None or retry_after < next_retry_after:
+                        next_retry_after = retry_after
+                log.warning("Model %s HTTP %s", model_id, e.response.status_code)
+                continue
+            except (httpx.ReadTimeout, httpx.ConnectTimeout) as e:
+                await _record_openrouter_metric(model_id, "timeout")
+                retry_after = await _cool_down_transient_openrouter_model(model_id, "timeout")
+                if next_retry_after is None or retry_after < next_retry_after:
+                    next_retry_after = retry_after
+                log.warning("Model %s timeout: %s", model_id, type(e).__name__)
+                continue
+            except httpx.RequestError as e:
+                await _record_openrouter_metric(model_id, "request_error")
+                retry_after = await _cool_down_transient_openrouter_model(model_id, "request_error")
+                if next_retry_after is None or retry_after < next_retry_after:
+                    next_retry_after = retry_after
+                log.warning("Model %s request error: %s", model_id, type(e).__name__)
+                continue
+            except Exception as e:
+                await _record_openrouter_metric(model_id, "error")
+                log.warning("Model %s error: %s", model_id, e)
+                continue
+
+    if tried_models == 0 or next_retry_after is not None:
+        return {RATE_LIMITED: True, "__retry_after": next_retry_after}
+    # All models exhausted on non-retryable responses.
+    return {ALL_FAILED: True}
+
+
+async def _handle_status_response(
+    response: httpx.Response,
+    model_id: str,
+    log,
+) -> dict | None:
+    """Map HTTP 402/429/403 to a sentinel dict; return None for every other status.
+
+    429 and 403 also put the model on a cooldown before returning.
+    """
+    if response.status_code == 402:
+        log.warning(
+            "OpenRouter quota exhausted (HTTP 402). "
+            "Balance likely below $0 — all models blocked. "
+            "Check https://openrouter.ai/settings/credits"
+        )
+        await _record_openrouter_metric(model_id, "quota_exhausted")
+        return {QUOTA_EXHAUSTED: True}
+
+    if response.status_code == 429:
+        hint = _parse_retry_after(response)
+        cooldown = _rate_limit_cooldown_seconds(hint)
+        await _record_openrouter_metric(model_id, "rate_limited")
+        await _cool_down_openrouter_model(model_id, cooldown, "rate_limited")
+        log.info(
+            "Rate-limited (429) on %s (retry-after: %ss, cooldown: %ss)",
+            model_id,
+            _normalize_retry_after(hint) or "unknown",
+            cooldown,
+        )
+        return {RATE_LIMITED: True, "__retry_after": cooldown}
+
+    if response.status_code == 403:
+        forbidden_cooldown = OPENROUTER_FORBIDDEN_MODEL_COOLDOWN_SECONDS
+        await _record_openrouter_metric(model_id, "forbidden")
+        await _cool_down_openrouter_model(model_id, forbidden_cooldown, "forbidden")
+        log.warning("Model %s HTTP 403 (access denied), trying next...", model_id)
+        return {TRY_NEXT_MODEL: True}
+
+    return None
+
+
+async def _parse_success_response(
+    response: httpx.Response,
+    model_id: str,
+    log,
+) -> dict | None:  # None = bad response already recorded (metric + cooldown)
+    body = response.text.strip()
+    json_start = body.find("{")  # skip anything that precedes the first "{"
+    if json_start < 0:
+        await _record_bad_response(model_id, log, "bad_response", "returned no JSON", body[:100])
+        return None
+
+    data = json.loads(body[json_start:])  # may raise json.JSONDecodeError
+    if "choices" not in data:
+        await _record_bad_response(model_id, log, "bad_response", "no choices", str(data)[:200])
+        return None
+
+    content = data["choices"][0]["message"]["content"]
+    if not content:
+        await _record_bad_response(model_id, log, "empty_content", "empty content", "")
+        return None
+
+    result = _parse_vision_response(content)
+    if "description" not in result and "ocr_text" not in result:
+        await _record_bad_response(model_id, log, "bad_json", "bad JSON", str(result)[:200])
+        return None
+
+    result["__model"] = model_id  # records which model produced the result
+    await _record_openrouter_metric(model_id, "success")
+    return result
+
+
+async def _record_bad_response(
+    model_id: str,
+    log,
+    metric: str,  # metric outcome name; also used as the cooldown reason
+    message: str,
+    detail: str,
+) -> None:
+    await _record_openrouter_metric(model_id, metric)
+    await _cool_down_transient_openrouter_model(model_id, metric)
+    log.warning("Model %s %s: %s", model_id, message, detail)
+
+
+_validate_free_vision_models(VISION_MODELS)  # fail at import time on a non-free model id
+_validate_openrouter_free_budget()  # fail at import time if the budget is not below the cap
diff --git a/src/recommendations/meme_queue.py b/src/recommendations/meme_queue.py
index a1dc9e76..b151874b 100644
--- a/src/recommendations/meme_queue.py
+++ b/src/recommendations/meme_queue.py
@@ -2,9 +2,11 @@
import uuid
from typing import Any, Optional
+from sqlalchemy import text
+
from src import redis
from src.config import settings
-from src.database import fetch_all
+from src.database import fetch_all, fetch_one
from src.recommendations.blender import blend
from src.recommendations.blender_experiments import (
MATURE_BLENDER_CONTROL_WEIGHTS,
@@ -26,12 +28,49 @@
async def get_next_meme_for_user(user_id: int) -> MemeData | None:
queue_key = redis.get_meme_queue_key(user_id)
- meme_data = await redis.pop_meme_from_queue_by_key(queue_key)
- if not meme_data:
- return None
+ while True:
+ meme_data = await redis.pop_meme_from_queue_by_key(queue_key)
+ if not meme_data:
+ return None
+
+ try:
+ meme_id = int(meme_data["id"])
+ except (KeyError, TypeError, ValueError):
+ logging.warning(
+ "discarding malformed queued meme payload for user_id=%s payload=%s",
+ user_id,
+ meme_data,
+ )
+ continue
- return MemeData(**meme_data)
+ if await _queued_meme_is_sendable(user_id, meme_id):
+ return MemeData(**meme_data)
+
+ logging.info(
+ "discarding stale queued meme payload for user_id=%s meme_id=%s",
+ user_id,
+ meme_id,
+ )
+
+
+async def _queued_meme_is_sendable(user_id: int, meme_id: int) -> bool:
+    """Return True if the meme is still 'ok' and this user has no reaction row for it."""
+    # LEFT JOIN + "R.meme_id IS NULL" is an anti-join: no reaction row for this user.
+    query = text(
+        """
+        SELECT M.id
+        FROM meme M
+        LEFT JOIN user_meme_reaction R
+            ON R.meme_id = M.id
+            AND R.user_id = :user_id
+        WHERE M.id = :meme_id
+            AND M.status = 'ok'
+            AND R.meme_id IS NULL
+        """
+    )
+    row = await fetch_one(query, {"user_id": user_id, "meme_id": meme_id})
+    return row is not None
async def has_memes_in_queue(user_id: int) -> bool:
diff --git a/src/recommendations/pipeline.py b/src/recommendations/pipeline.py
index 219de931..83c7bb7e 100644
--- a/src/recommendations/pipeline.py
+++ b/src/recommendations/pipeline.py
@@ -37,6 +37,9 @@
logger = logging.getLogger(__name__)
+LOW_SENT_POOL_MIN_REACTIONS_FOR_QUALITY_GATE = 10
+LOW_SENT_POOL_MIN_LIKE_RATE = 0.15
+
Candidate = dict[str, Any]
BlendFunc = Callable[
[
@@ -743,8 +746,19 @@ def _low_sent_query(exclude_ids: list[int]) -> str:
WHERE 1=1
AND M.status = 'ok'
AND R.meme_id IS NULL
+ AND (
+ COALESCE(MS.nlikes, 0) + COALESCE(MS.ndislikes, 0)
+ < {LOW_SENT_POOL_MIN_REACTIONS_FOR_QUALITY_GATE}
+ OR (
+ COALESCE(MS.nlikes, 0)::float
+ / NULLIF(COALESCE(MS.nlikes, 0) + COALESCE(MS.ndislikes, 0), 0)
+ ) >= {LOW_SENT_POOL_MIN_LIKE_RATE}
+ )
{exclude_meme_ids_sql_filter(exclude_ids)}
- ORDER BY COALESCE(MS.nmemes_sent, 0), M.id
+ ORDER BY
+ COALESCE(MS.nlikes, 0) + COALESCE(MS.ndislikes, 0),
+ COALESCE(MS.nmemes_sent, 0),
+ M.id
LIMIT :limit
"""
diff --git a/src/stats/meme.py b/src/stats/meme.py
index 0ac64975..820ca828 100644
--- a/src/stats/meme.py
+++ b/src/stats/meme.py
@@ -1,6 +1,9 @@
import logging
+from collections.abc import Awaitable, Callable
+from typing import Any
from sqlalchemy import text
+from sqlalchemy.ext.asyncio import AsyncConnection
from src.database import execute, fetch_all
@@ -16,12 +19,55 @@ async def calculate_meme_reactions_and_engagement(
min_user_reactions: int = 10,
min_meme_reactions: int = 3,
lookback_hours: int = 3,
+ meme_ids: list[int] | None = None,
+ include_user_history: bool = False,
+) -> None:
+ await _execute_meme_reactions_and_engagement(
+ execute,
+ min_user_reactions=min_user_reactions,
+ min_meme_reactions=min_meme_reactions,
+ lookback_hours=lookback_hours,
+ meme_ids=meme_ids,
+ include_user_history=include_user_history,
+ )
+
+
+async def calculate_meme_reactions_and_engagement_on_connection(
+    conn: AsyncConnection,
+    *,
+    min_user_reactions: int = 10,
+    min_meme_reactions: int = 3,
+    lookback_hours: int = 3,
+    meme_ids: list[int] | None = None,
+    include_user_history: bool = False,
+) -> None:
+    """Run the reaction/engagement recalculation through a caller-owned connection.
+
+    Takes the same options as ``calculate_meme_reactions_and_engagement`` but
+    issues the statement via ``conn.execute`` instead of the module-level
+    ``execute`` helper.
+
+    Args:
+        conn: Connection whose ``execute`` method runs the statement.
+        min_user_reactions: Forwarded unchanged to the shared implementation.
+        min_meme_reactions: Forwarded unchanged to the shared implementation.
+        lookback_hours: Forwarded unchanged to the shared implementation.
+        meme_ids: Optional meme ids to recompute regardless of recent activity.
+        include_user_history: Forwarded unchanged to the shared implementation.
+    """
+    options = {
+        "min_user_reactions": min_user_reactions,
+        "min_meme_reactions": min_meme_reactions,
+        "lookback_hours": lookback_hours,
+        "meme_ids": meme_ids,
+        "include_user_history": include_user_history,
+    }
+    run_statement = conn.execute
+    await _execute_meme_reactions_and_engagement(run_statement, **options)
+
+
+async def _execute_meme_reactions_and_engagement(
+ execute_query: Callable[[Any, dict[str, Any]], Awaitable[Any]],
+ *,
+ min_user_reactions: int,
+ min_meme_reactions: int,
+ lookback_hours: int,
+ meme_ids: list[int] | None,
+ include_user_history: bool,
) -> None:
"""Combined lr_smoothed + engagement_score + basic counts — incremental mode.
Only recomputes stats for memes that received reactions in the last
`lookback_hours` hours. Memes with no recent activity keep their existing
- meme_stats rows unchanged (ON CONFLICT DO UPDATE only fires for included rows).
+ meme_stats rows unchanged unless explicitly included in `meme_ids`.
+ When `include_user_history` is true, user baselines are built from all
+ reactions by users who touched the target memes; this is used after moving
+ historical reactions during deduplication.
lr_smoothed algorithm:
1. like_symmetrical: reaction_id=1 → +1, else → -1
@@ -50,6 +96,25 @@ async def calculate_meme_reactions_and_engagement(
WHERE COALESCE(reacted_at, sent_at) > NOW() - :lookback_hours * INTERVAL '1 hour'
),
+ FORCED_MEME_IDS AS (
+ SELECT M.id AS meme_id
+ FROM meme M
+ WHERE :has_forced_meme_ids
+ AND M.id = ANY(:meme_ids)
+ ),
+
+ TARGET_MEME_IDS AS (
+ SELECT meme_id FROM RECENT_MEME_IDS
+ UNION
+ SELECT meme_id FROM FORCED_MEME_IDS
+ ),
+
+ AFFECTED_USERS AS (
+ SELECT DISTINCT user_id
+ FROM user_meme_reaction
+ WHERE meme_id IN (SELECT meme_id FROM TARGET_MEME_IDS)
+ ),
+
BASE_REACTIONS AS (
SELECT
R.user_id, R.meme_id, R.reaction_id,
@@ -62,7 +127,13 @@ async def calculate_meme_reactions_and_engagement(
OVER (PARTITION BY R.user_id) AS user_last_reaction_sent_at
FROM user_meme_reaction R
JOIN meme ON R.meme_id = meme.id
- WHERE R.meme_id IN (SELECT meme_id FROM RECENT_MEME_IDS)
+ WHERE (
+ (:include_user_history AND R.user_id IN (SELECT user_id FROM AFFECTED_USERS))
+ OR (
+ NOT :include_user_history
+ AND R.meme_id IN (SELECT meme_id FROM TARGET_MEME_IDS)
+ )
+ )
),
WITH_USER_AVGS AS (
@@ -118,30 +189,31 @@ async def calculate_meme_reactions_and_engagement(
COUNT(lr_smoothed_val) AS n_lr_reactions,
COUNT(es_smoothed_val) AS n_es_reactions
FROM SMOOTHED
+ WHERE meme_id IN (SELECT meme_id FROM TARGET_MEME_IDS)
GROUP BY meme_id
),
BASIC_COUNTS AS (
SELECT
- meme_id
- , COUNT(*) FILTER (WHERE reaction_id = 1) AS nlikes
- , COUNT(*) FILTER (WHERE reaction_id = 2) AS ndislikes
- , COUNT(*) AS nmemes_sent
+ M.id AS meme_id
+ , COUNT(*) FILTER (WHERE E.reaction_id = 1) AS nlikes
+ , COUNT(*) FILTER (WHERE E.reaction_id = 2) AS ndislikes
+ , COUNT(E.*) AS nmemes_sent
, MAX(EXTRACT('DAYS' FROM NOW() - M.published_at)) AS age_days
, COALESCE(EXTRACT(
EPOCH FROM
percentile_cont(0.5)
- WITHIN GROUP (ORDER BY reacted_at - sent_at)
+ WITHIN GROUP (ORDER BY E.reacted_at - E.sent_at)
FILTER (
- WHERE reacted_at - sent_at
+ WHERE E.reacted_at - E.sent_at
BETWEEN '0.5 second'
AND '1 minute'
)
), 99999) AS sec_to_react
, NOW() AS updated_at
- FROM user_meme_reaction E
- INNER JOIN meme M ON M.id = E.meme_id
- WHERE E.meme_id IN (SELECT meme_id FROM RECENT_MEME_IDS)
+ FROM meme M
+ LEFT JOIN user_meme_reaction E ON E.meme_id = M.id
+ WHERE M.id IN (SELECT meme_id FROM TARGET_MEME_IDS)
GROUP BY 1
)
@@ -173,12 +245,16 @@ async def calculate_meme_reactions_and_engagement(
lr_smoothed = EXCLUDED.lr_smoothed,
engagement_score = EXCLUDED.engagement_score
"""
- await execute(
+ forced_meme_ids = meme_ids or [0]
+ await execute_query(
text(query),
{
"min_user_reactions": min_user_reactions,
"min_meme_reactions": min_meme_reactions,
"lookback_hours": lookback_hours,
+ "has_forced_meme_ids": bool(meme_ids),
+ "meme_ids": forced_meme_ids,
+ "include_user_history": include_user_history,
},
)
diff --git a/src/storage/deduplication/__init__.py b/src/storage/deduplication/__init__.py
new file mode 100644
index 00000000..34560119
--- /dev/null
+++ b/src/storage/deduplication/__init__.py
@@ -0,0 +1,30 @@
+from src.storage.deduplication.finder import (
+ find_duplicate_by_file_id,
+ find_duplicate_by_ocr_text,
+ ocr_text_from_meme,
+)
+from src.storage.deduplication.models import (
+ MIN_OCR_DUPLICATE_TEXT_LENGTH,
+ DeduplicationResult,
+ DuplicateResolution,
+)
+from src.storage.deduplication.policies import (
+ deduplicate_described_meme,
+ deduplicate_pending_meme,
+)
+from src.storage.deduplication.resolver import refresh_original_stats, resolve_duplicate
+from src.storage.deduplication.sweep import sweep_file_id_duplicates
+
+__all__ = [
+ "MIN_OCR_DUPLICATE_TEXT_LENGTH",
+ "DeduplicationResult",
+ "DuplicateResolution",
+ "deduplicate_described_meme",
+ "deduplicate_pending_meme",
+ "find_duplicate_by_file_id",
+ "find_duplicate_by_ocr_text",
+ "ocr_text_from_meme",
+ "refresh_original_stats",
+ "resolve_duplicate",
+ "sweep_file_id_duplicates",
+]
diff --git a/src/storage/deduplication/finder.py b/src/storage/deduplication/finder.py
new file mode 100644
index 00000000..a5703320
--- /dev/null
+++ b/src/storage/deduplication/finder.py
@@ -0,0 +1,54 @@
+from typing import Any
+
+from sqlalchemy import text
+
+from src.database import fetch_one
+from src.storage.deduplication.models import MIN_OCR_DUPLICATE_TEXT_LENGTH
+
+
+def ocr_text_from_meme(meme_row: dict[str, Any]) -> str:
+    """Extract the OCR text stored on a meme row.
+
+    Prefers ``ocr_result["text"]`` and falls back to
+    ``ocr_result["raw_result"]["ocr_text"]``.
+
+    Args:
+        meme_row: Meme record; ``ocr_result`` may be missing or None.
+
+    Returns:
+        The first non-empty text found, or an empty string.
+    """
+    ocr_result = meme_row.get("ocr_result") or {}
+    # ``raw_result`` can be present but null; ``.get(key, {})`` would hand back
+    # None in that case and the chained ``.get`` would raise AttributeError.
+    raw_result = ocr_result.get("raw_result") or {}
+    return ocr_result.get("text") or raw_result.get("ocr_text") or ""
+
+
+async def find_duplicate_by_file_id(meme_id: int, telegram_file_id: str) -> int | None:
+ """Find an earlier meme that stores the same Telegram file_id.
+
+ Only memes with a smaller id and status 'ok', 'published' or 'created' are
+ considered. A 'published' match is preferred; otherwise the lowest id wins.
+
+ Returns the matching meme id, or None when there is no match.
+ """
+ query = text(
+ """
+ SELECT id FROM meme
+ WHERE telegram_file_id = :file_id
+ AND status IN ('ok', 'published', 'created')
+ AND id < :meme_id
+ ORDER BY
+ CASE WHEN status = 'published' THEN 0 ELSE 1 END,
+ id ASC
+ LIMIT 1
+ """
+ )
+ res = await fetch_one(query, {"file_id": telegram_file_id, "meme_id": meme_id})
+ return res["id"] if res else None
+
+
+async def find_duplicate_by_ocr_text(meme_id: int, image_text: str) -> int | None:
+ """Find an earlier image meme whose stored OCR text matches `image_text`.
+
+ Texts shorter than MIN_OCR_DUPLICATE_TEXT_LENGTH are never compared and
+ yield None. Candidates are image memes with a smaller id, status 'ok' or
+ 'published' and a non-null ocr_result; a 'published' match is preferred,
+ otherwise the lowest id wins.
+
+ Returns the matching meme id, or None when there is no match.
+ """
+ # Short texts are skipped entirely, before any query is issued.
+ if len(image_text) < MIN_OCR_DUPLICATE_TEXT_LENGTH:
+ return None
+
+ # NOTE(review): the `%` operator below is presumably the pg_trgm similarity
+ # operator (threshold set on the database side) — confirm.
+ select_query = text(
+ """
+ SELECT
+ M.id
+ FROM meme M
+ WHERE M.id < :meme_id
+ AND M.status IN ('ok', 'published')
+ AND M.type = 'image'
+ AND M.ocr_result IS NOT NULL
+ AND (M.ocr_result ->> 'text') % :image_text
+ ORDER BY
+ CASE WHEN M.status = 'published' THEN 0 ELSE 1 END,
+ M.id ASC
+ LIMIT 1
+ """
+ )
+
+ res = await fetch_one(select_query, {"meme_id": meme_id, "image_text": image_text})
+ return res["id"] if res else None
diff --git a/src/storage/deduplication/models.py b/src/storage/deduplication/models.py
new file mode 100644
index 00000000..ad4dab67
--- /dev/null
+++ b/src/storage/deduplication/models.py
@@ -0,0 +1,26 @@
+from dataclasses import dataclass
+
+# OCR texts shorter than this many characters are not used for duplicate lookup.
+MIN_OCR_DUPLICATE_TEXT_LENGTH = 12
+
+
+@dataclass(frozen=True)
+class DuplicateResolution:
+ """Immutable summary of one resolve_duplicate run."""
+ # Meme that was marked as the duplicate.
+ dupe_id: int
+ # Meme the duplicate now points at (the canonical id chosen by the resolver).
+ original_id: int
+ # Caller-supplied label of the detection method, e.g. "ocr_text".
+ reason: str
+ # user_meme_reaction rows copied onto the original.
+ reactions_moved: int
+ # user_meme_reaction rows deleted from the duplicate.
+ # NOTE(review): the delete runs after the copy, so this count appears to
+ # include the rows already counted in reactions_moved — confirm intended.
+ reactions_dropped: int
+ # chat_meme_reaction rows copied onto the original.
+ chat_reactions_moved: int
+ # chat_meme_reaction rows deleted from the duplicate (same caveat as above).
+ chat_reactions_dropped: int
+
+
+@dataclass(frozen=True)
+class DeduplicationResult:
+    """Outcome of one deduplication attempt for a single meme.
+
+    Only ``meme_id`` is required; the remaining fields stay None when no
+    duplicate was identified.
+    """
+
+    meme_id: int
+    duplicate_of: int | None = None
+    reason: str | None = None
+    resolution: DuplicateResolution | None = None
+
+    @property
+    def duplicate_found(self) -> bool:
+        """Whether an original meme id was recorded for this meme."""
+        has_original = self.duplicate_of is not None
+        return has_original
diff --git a/src/storage/deduplication/policies.py b/src/storage/deduplication/policies.py
new file mode 100644
index 00000000..8dbdf98e
--- /dev/null
+++ b/src/storage/deduplication/policies.py
@@ -0,0 +1,50 @@
+from typing import Any
+
+from src.storage.constants import MemeStatus
+from src.storage.deduplication.finder import (
+ find_duplicate_by_file_id,
+ find_duplicate_by_ocr_text,
+ ocr_text_from_meme,
+)
+from src.storage.deduplication.models import DeduplicationResult
+from src.storage.deduplication.resolver import resolve_duplicate
+
+
+async def deduplicate_pending_meme(meme_row: dict[str, Any]) -> DeduplicationResult:
+    """Run cheap dedup checks before a created meme can be promoted to ok.
+
+    The Telegram file_id lookup runs first (only when the row carries a
+    file_id); the OCR text lookup runs only if that found nothing. The first
+    hit is handed to ``resolve_duplicate`` with the matching reason label.
+
+    Args:
+        meme_row: Meme record; must contain ``id`` and may contain
+            ``telegram_file_id`` and ``ocr_result``.
+
+    Returns:
+        A result carrying the original id, reason and resolution when a
+        duplicate was resolved, otherwise a result holding only ``meme_id``.
+    """
+    meme_id = meme_row["id"]
+    file_id = meme_row.get("telegram_file_id")
+
+    reason = "telegram_file_id"
+    original_id = await find_duplicate_by_file_id(meme_id, file_id) if file_id else None
+    if not original_id:
+        # Fall back to the OCR comparison when the file_id check found nothing.
+        reason = "ocr_text"
+        original_id = await find_duplicate_by_ocr_text(meme_id, ocr_text_from_meme(meme_row))
+
+    if not original_id:
+        return DeduplicationResult(meme_id)
+
+    resolution = await resolve_duplicate(meme_id, original_id, reason=reason)
+    return DeduplicationResult(meme_id, original_id, reason, resolution)
+
+
+async def deduplicate_described_meme(
+    meme_id: int,
+    ocr_text: str,
+    *,
+    status: str | None,
+) -> DeduplicationResult:
+    """Run OCR dedup after Describe Memes enriches an already-ok image.
+
+    Nothing is looked up unless ``status`` equals ``MemeStatus.OK.value``.
+
+    Args:
+        meme_id: Meme whose OCR text was just produced.
+        ocr_text: Text to compare against earlier memes.
+        status: Current status of the meme.
+
+    Returns:
+        A result describing the resolved duplicate, or one holding only
+        ``meme_id`` when the status check fails or no match is found.
+    """
+    if status == MemeStatus.OK.value:
+        original_id = await find_duplicate_by_ocr_text(meme_id, ocr_text)
+        if original_id:
+            outcome = await resolve_duplicate(meme_id, original_id, reason="ocr_text")
+            return DeduplicationResult(meme_id, original_id, "ocr_text", outcome)
+
+    return DeduplicationResult(meme_id)
diff --git a/src/storage/deduplication/resolver.py b/src/storage/deduplication/resolver.py
new file mode 100644
index 00000000..e43843ac
--- /dev/null
+++ b/src/storage/deduplication/resolver.py
@@ -0,0 +1,213 @@
+from typing import Any
+
+from sqlalchemy import text
+from sqlalchemy.ext.asyncio import AsyncConnection
+
+from src.database import run_in_transaction
+from src.stats.meme import calculate_meme_reactions_and_engagement_on_connection
+from src.storage.deduplication.models import DuplicateResolution
+
+
+async def _fetch_one(
+    conn: AsyncConnection,
+    query,
+    params: dict[str, Any] | None = None,
+) -> dict[str, Any] | None:
+    """Execute ``query`` on ``conn`` and return its first row as a dict.
+
+    Returns None when the statement yields no row. A missing or empty
+    ``params`` is sent as an empty dict.
+    """
+    bound = params if params else {}
+    cursor = await conn.execute(query, bound)
+    first_row = cursor.first()
+    if first_row is None:
+        return None
+    return first_row._asdict()
+
+
+async def _count(
+    conn: AsyncConnection,
+    query,
+    params: dict[str, Any],
+    field: str,
+) -> int:
+    """Run ``query`` and return column ``field`` of its first row as an int.
+
+    Returns 0 when the statement produces no row.
+    """
+    record = await _fetch_one(conn, query, params)
+    if not record:
+        return 0
+    return int(record[field])
+
+
+async def _canonical_original_id(conn: AsyncConnection, meme_id: int) -> int:
+    """Follow duplicate_of links so new duplicates point at a real original.
+
+    Starting at ``meme_id``, walks ``meme.duplicate_of`` until reaching a row
+    with no link (or a missing row) and returns that id. If a link leads to an
+    id that was already visited, that id is returned so the walk always ends.
+    """
+    visited = {meme_id}
+    node = meme_id
+
+    while True:
+        record = await _fetch_one(
+            conn,
+            text("SELECT id, duplicate_of FROM meme WHERE id = :meme_id"),
+            {"meme_id": node},
+        )
+        parent = record["duplicate_of"] if record else None
+        if parent is None:
+            return node
+        if parent in visited:
+            # Cycle in the duplicate_of chain: stop instead of looping forever.
+            return parent
+        visited.add(parent)
+        node = parent
+
+
+async def resolve_duplicate(
+    dupe_id: int,
+    original_id: int,
+    *,
+    reason: str,
+) -> DuplicateResolution:
+    """Mark a meme as duplicate and move all safe reaction history to the original.
+
+    Every step runs inside a single ``run_in_transaction`` callback:
+
+    1. Resolve ``original_id`` to its canonical original via ``duplicate_of``.
+    2. Copy user and chat reactions onto the original, skipping users who
+       already reacted to it.
+    3. Delete all reactions and the ``meme_stats`` row of the duplicate.
+    4. Set the duplicate's status to 'duplicate' and point it, plus any memes
+       that referenced it, at the canonical original.
+    5. Recompute stats for the canonical original.
+
+    Args:
+        dupe_id: Meme to mark as a duplicate.
+        original_id: Meme it duplicates; may itself be a duplicate.
+        reason: Label of the detection method, stored on the result.
+
+    Returns:
+        Counters describing what was moved and deleted.
+
+    Raises:
+        ValueError: If the canonical original turns out to be ``dupe_id``
+            itself. Raised before any row is modified.
+    """
+
+    async def _resolve(conn: AsyncConnection) -> DuplicateResolution:
+        canonical_original_id = await _canonical_original_id(conn, original_id)
+        if canonical_original_id == dupe_id:
+            # Merging a meme into itself would copy nothing, then delete every
+            # reaction and the stats row, and leave the meme pointing at itself.
+            raise ValueError(
+                f"meme {dupe_id} cannot be resolved as a duplicate of itself "
+                f"(requested original {original_id})"
+            )
+
+        # Copies must happen before the deletes: the delete helpers remove
+        # every remaining row of the duplicate, including the copied ones.
+        reactions_moved = await _move_user_reactions(conn, dupe_id, canonical_original_id)
+        chat_reactions_moved = await _move_chat_reactions(conn, dupe_id, canonical_original_id)
+        reactions_dropped = await _delete_user_reactions(conn, dupe_id)
+        chat_reactions_dropped = await _delete_chat_reactions(conn, dupe_id)
+
+        await conn.execute(
+            text("DELETE FROM meme_stats WHERE meme_id = :dupe_id"),
+            {"dupe_id": dupe_id},
+        )
+        await conn.execute(
+            text(
+                """
+                UPDATE meme
+                SET status = 'duplicate', duplicate_of = :original_id
+                WHERE id = :dupe_id
+                """
+            ),
+            {"dupe_id": dupe_id, "original_id": canonical_original_id},
+        )
+        # Re-point memes that were duplicates of the meme just demoted, so no
+        # duplicate_of chain is left behind.
+        await conn.execute(
+            text(
+                """
+                UPDATE meme
+                SET duplicate_of = :original_id
+                WHERE duplicate_of = :dupe_id
+                """
+            ),
+            {"dupe_id": dupe_id, "original_id": canonical_original_id},
+        )
+        await _refresh_original_stats(conn, canonical_original_id)
+
+        return DuplicateResolution(
+            dupe_id=dupe_id,
+            original_id=canonical_original_id,
+            reason=reason,
+            reactions_moved=reactions_moved,
+            reactions_dropped=reactions_dropped,
+            chat_reactions_moved=chat_reactions_moved,
+            chat_reactions_dropped=chat_reactions_dropped,
+        )
+
+    return await run_in_transaction(_resolve)
+
+
+async def _move_user_reactions(
+ conn: AsyncConnection,
+ dupe_id: int,
+ original_id: int,
+) -> int:
+ """Copy the duplicate's user reactions onto the original.
+
+ Users who already have a user_meme_reaction row for the original are
+ skipped (NOT EXISTS plus ON CONFLICT DO NOTHING). The source rows of the
+ duplicate are left in place by this statement.
+
+ Returns the number of rows inserted.
+ """
+ return await _count(
+ conn,
+ text(
+ """
+ WITH moved AS (
+ INSERT INTO user_meme_reaction
+ (user_id, meme_id, recommended_by, sent_at, reaction_id, reacted_at)
+ SELECT user_id, :original_id, recommended_by, sent_at, reaction_id, reacted_at
+ FROM user_meme_reaction source
+ WHERE source.meme_id = :dupe_id
+ AND NOT EXISTS (
+ SELECT 1 FROM user_meme_reaction existing
+ WHERE existing.user_id = source.user_id
+ AND existing.meme_id = :original_id
+ )
+ ON CONFLICT (user_id, meme_id) DO NOTHING
+ RETURNING 1
+ )
+ SELECT count(*) AS moved FROM moved
+ """
+ ),
+ {"dupe_id": dupe_id, "original_id": original_id},
+ "moved",
+ )
+
+
+async def _move_chat_reactions(
+ conn: AsyncConnection,
+ dupe_id: int,
+ original_id: int,
+) -> int:
+ """Copy the duplicate's chat reactions onto the original.
+
+ A (chat_id, user_id) pair that already has a chat_meme_reaction row for the
+ original is skipped (NOT EXISTS plus ON CONFLICT DO NOTHING). The source
+ rows of the duplicate are left in place by this statement.
+
+ Returns the number of rows inserted.
+ """
+ return await _count(
+ conn,
+ text(
+ """
+ WITH moved AS (
+ INSERT INTO chat_meme_reaction
+ (chat_id, meme_id, user_id, reaction, reacted_at)
+ SELECT chat_id, :original_id, user_id, reaction, reacted_at
+ FROM chat_meme_reaction source
+ WHERE source.meme_id = :dupe_id
+ AND NOT EXISTS (
+ SELECT 1 FROM chat_meme_reaction existing
+ WHERE existing.chat_id = source.chat_id
+ AND existing.user_id = source.user_id
+ AND existing.meme_id = :original_id
+ )
+ ON CONFLICT (chat_id, meme_id, user_id) DO NOTHING
+ RETURNING 1
+ )
+ SELECT count(*) AS moved FROM moved
+ """
+ ),
+ {"dupe_id": dupe_id, "original_id": original_id},
+ "moved",
+ )
+
+
+async def _delete_user_reactions(conn: AsyncConnection, dupe_id: int) -> int:
+ """Delete every user_meme_reaction row of `dupe_id`; return how many were removed."""
+ return await _count(
+ conn,
+ text(
+ """
+ WITH deleted AS (
+ DELETE FROM user_meme_reaction WHERE meme_id = :dupe_id RETURNING 1
+ )
+ SELECT count(*) AS deleted FROM deleted
+ """
+ ),
+ {"dupe_id": dupe_id},
+ "deleted",
+ )
+
+
+async def _delete_chat_reactions(conn: AsyncConnection, dupe_id: int) -> int:
+ """Delete every chat_meme_reaction row of `dupe_id`; return how many were removed."""
+ return await _count(
+ conn,
+ text(
+ """
+ WITH deleted AS (
+ DELETE FROM chat_meme_reaction WHERE meme_id = :dupe_id RETURNING 1
+ )
+ SELECT count(*) AS deleted FROM deleted
+ """
+ ),
+ {"dupe_id": dupe_id},
+ "deleted",
+ )
+
+
+async def refresh_original_stats(original_id: int) -> None:
+ """Re-run the reaction/engagement stats calculation for one meme.
+
+ The work is submitted as its own run_in_transaction callback.
+ """
+ async def _refresh(conn: AsyncConnection) -> None:
+ await _refresh_original_stats(conn, original_id)
+
+ await run_in_transaction(_refresh)
+
+
+async def _refresh_original_stats(conn: AsyncConnection, original_id: int) -> None:
+ """Recalculate stats for `original_id` on the given connection.
+
+ The meme is passed as a forced id and user history is included, which the
+ stats module describes as the mode used after moving historical reactions.
+ """
+ await calculate_meme_reactions_and_engagement_on_connection(
+ conn,
+ meme_ids=[original_id],
+ include_user_history=True,
+ )
diff --git a/src/storage/deduplication/sweep.py b/src/storage/deduplication/sweep.py
new file mode 100644
index 00000000..9fba8a96
--- /dev/null
+++ b/src/storage/deduplication/sweep.py
@@ -0,0 +1,63 @@
+from sqlalchemy import text
+
+from src.database import fetch_all
+from src.storage.deduplication.resolver import resolve_duplicate
+
+
+async def sweep_file_id_duplicates() -> dict[str, int]:
+ """Resolve any exact Telegram file_id duplicates that slipped past batch processing.
+
+ Groups memes with status 'ok' or 'published' by telegram_file_id, picks one
+ canonical meme per group ('published' first, then lowest id) and resolves
+ every other 'ok' meme of the group against it, one resolve_duplicate call
+ per meme.
+
+ Returns a dict with keys "resolved", "reactions_moved" and
+ "reactions_dropped"; only the user-reaction counters of each resolution
+ are summed.
+ """
+ rows = await fetch_all(
+ text(
+ """
+ WITH duplicate_groups AS (
+ SELECT telegram_file_id
+ FROM meme
+ WHERE status IN ('ok', 'published')
+ AND telegram_file_id IS NOT NULL
+ GROUP BY telegram_file_id
+ HAVING count(*) > 1
+ ),
+ canonical AS (
+ SELECT DISTINCT ON (m.telegram_file_id)
+ m.telegram_file_id,
+ m.id AS original_id
+ FROM meme m
+ INNER JOIN duplicate_groups g
+ ON g.telegram_file_id = m.telegram_file_id
+ WHERE m.status IN ('ok', 'published')
+ ORDER BY
+ m.telegram_file_id,
+ CASE WHEN m.status = 'published' THEN 0 ELSE 1 END,
+ m.id ASC
+ )
+ SELECT m.id, m.telegram_file_id, canonical.original_id
+ FROM meme m
+ INNER JOIN canonical
+ ON canonical.telegram_file_id = m.telegram_file_id
+ WHERE m.status = 'ok'
+ AND m.id != canonical.original_id
+ """
+ )
+ )
+
+ total_moved = 0
+ total_dropped = 0
+ total_resolved = 0
+
+ for row in rows:
+ # Defensive check; the query already excludes the canonical meme itself.
+ if row["id"] == row["original_id"]:
+ continue
+ result = await resolve_duplicate(
+ row["id"],
+ row["original_id"],
+ reason="telegram_file_id_sweep",
+ )
+ total_moved += result.reactions_moved
+ total_dropped += result.reactions_dropped
+ total_resolved += 1
+
+ return {
+ "resolved": total_resolved,
+ "reactions_moved": total_moved,
+ "reactions_dropped": total_dropped,
+ }
diff --git a/src/storage/service.py b/src/storage/service.py
index d899e44a..f2677977 100644
--- a/src/storage/service.py
+++ b/src/storage/service.py
@@ -4,7 +4,6 @@
from sqlalchemy import nulls_first, select, text
from src.database import (
- execute,
fetch_all,
fetch_one,
meme,
@@ -375,8 +374,13 @@ async def get_unloaded_vk_memes(limit: int) -> list[dict[str, Any]]:
return await fetch_all(text(select_query))
-async def update_meme_status_of_ready_memes() -> list[dict[str, Any]]:
+async def update_meme_status_of_ready_memes(
+ meme_ids: list[int] | None = None,
+) -> list[dict[str, Any]]:
"""Changes the status of memes to 'ok' if they are ready to be published."""
+ if meme_ids is not None and len(meme_ids) == 0:
+ return []
+
update_query = (
meme.update()
.where(meme.c.status == MemeStatus.CREATED)
@@ -385,159 +389,6 @@ async def update_meme_status_of_ready_memes() -> list[dict[str, Any]]:
.values(status=MemeStatus.OK)
.returning(meme)
)
+ if meme_ids is not None:
+ update_query = update_query.where(meme.c.id.in_(meme_ids))
return await fetch_all(update_query)
-
-
-async def find_meme_duplicate_by_file_id(meme_id: int, telegram_file_id: str) -> int | None:
- """Find an existing meme with the same telegram_file_id."""
- query = text(
- """
- SELECT id FROM meme
- WHERE telegram_file_id = :file_id
- AND status IN ('ok', 'created')
- AND id < :meme_id
- ORDER BY id ASC
- LIMIT 1
- """
- )
- res = await fetch_one(query, {"file_id": telegram_file_id, "meme_id": meme_id})
- if res:
- return res["id"]
- return None
-
-
-async def find_meme_duplicate(meme_id: int, imagetext: str) -> int | None:
- if len(imagetext) <= 11: # skip all memes with less than 11 letters
- return None
-
- select_query = text(
- """
- SELECT
- M.id
- FROM meme M
- WHERE M.id < :meme_id
- AND M.status = 'ok'
- AND M.type = 'image'
- AND M.ocr_result IS NOT NULL
- AND (M.ocr_result ->> 'text') % :imagetext
- ORDER BY M.id ASC
- LIMIT 1
- """
- ).bindparams(meme_id=meme_id, imagetext=imagetext)
-
- res = await fetch_one(select_query)
- if res:
- return res["id"]
- return None
-
-
-async def resolve_meme_duplicate(dupe_id: int, original_id: int) -> dict[str, int]:
- """Mark a meme as duplicate with full cleanup.
-
- 1. Move reactions from dupe → original (skip conflicts)
- 2. Delete remaining reactions on dupe
- 3. Delete meme_stats for dupe
- 4. Set meme status='duplicate', duplicate_of=original_id
-
- Stats for original will auto-recalculate on next 5-15 min cycle.
- Returns counts: {moved, conflicts, deleted_stats}.
- """
- # 1. Move non-conflicting reactions to original
- move_query = text(
- """
- WITH moved AS (
- INSERT INTO user_meme_reaction
- (user_id, meme_id, recommended_by, sent_at, reaction_id, reacted_at)
- SELECT user_id, :original_id, recommended_by, sent_at, reaction_id, reacted_at
- FROM user_meme_reaction
- WHERE meme_id = :dupe_id
- AND NOT EXISTS (
- SELECT 1 FROM user_meme_reaction existing
- WHERE existing.user_id = user_meme_reaction.user_id
- AND existing.meme_id = :original_id
- )
- ON CONFLICT (user_id, meme_id) DO NOTHING
- RETURNING 1
- )
- SELECT count(*) AS moved FROM moved
- """
- )
- res = await fetch_one(move_query, {"dupe_id": dupe_id, "original_id": original_id})
- moved = res["moved"] if res else 0
-
- # 2. Delete all reactions remaining on dupe (conflicts + already moved)
- delete_reactions = text(
- """
- WITH deleted AS (
- DELETE FROM user_meme_reaction WHERE meme_id = :dupe_id RETURNING 1
- )
- SELECT count(*) AS conflicts FROM deleted
- """
- )
- res = await fetch_one(delete_reactions, {"dupe_id": dupe_id})
- conflicts = res["conflicts"] if res else 0
-
- # 3. Delete meme_stats for dupe (stale, will not regenerate since no reactions)
- await execute(
- text("DELETE FROM meme_stats WHERE meme_id = :dupe_id"),
- {"dupe_id": dupe_id},
- )
-
- # 4. Mark meme as duplicate
- await execute(
- text(
- """
- UPDATE meme
- SET status = 'duplicate', duplicate_of = :original_id
- WHERE id = :dupe_id
- """
- ),
- {"dupe_id": dupe_id, "original_id": original_id},
- )
-
- return {"moved": moved, "conflicts": conflicts}
-
-
-async def resolve_all_file_id_duplicates() -> dict[str, int]:
- """Find and resolve all memes with duplicate telegram_file_id.
-
- For each group of memes sharing a file_id with status='ok':
- keeps the oldest (smallest id), resolves the rest as duplicates.
- Returns total counts.
- """
- # Find all file_id duplicate groups
- dupes_query = text(
- """
- SELECT id, telegram_file_id,
- FIRST_VALUE(id) OVER (
- PARTITION BY telegram_file_id ORDER BY id ASC
- ) AS original_id
- FROM meme
- WHERE status = 'ok'
- AND telegram_file_id IS NOT NULL
- AND telegram_file_id IN (
- SELECT telegram_file_id FROM meme
- WHERE status = 'ok' AND telegram_file_id IS NOT NULL
- GROUP BY telegram_file_id HAVING count(*) > 1
- )
- """
- )
- rows = await fetch_all(dupes_query)
-
- total_moved = 0
- total_conflicts = 0
- total_resolved = 0
-
- for row in rows:
- if row["id"] == row["original_id"]:
- continue # skip the keeper
- result = await resolve_meme_duplicate(row["id"], row["original_id"])
- total_moved += result["moved"]
- total_conflicts += result["conflicts"]
- total_resolved += 1
-
- return {
- "resolved": total_resolved,
- "reactions_moved": total_moved,
- "reactions_dropped": total_conflicts,
- }
diff --git a/src/storage/upload.py b/src/storage/upload.py
index e933f45e..2ff26abc 100644
--- a/src/storage/upload.py
+++ b/src/storage/upload.py
@@ -14,9 +14,9 @@
sentry_log_extra,
)
from src.storage.constants import MemeStatus, MemeType
+from src.storage.deduplication import find_duplicate_by_file_id
from src.storage.parsers.constants import USER_AGENT
from src.storage.service import (
- find_meme_duplicate_by_file_id,
update_meme,
)
from src.tgbot.bot import bot
@@ -107,8 +107,8 @@ async def _upload_meme_content_to_tg(
if not file_id:
return None
- # Check if this file_id already exists on another ok meme (cross-source dupe)
- duplicate_of = await find_meme_duplicate_by_file_id(meme_id, file_id)
+ # Check if this file_id already exists before the meme reaches recommendations.
+ duplicate_of = await find_duplicate_by_file_id(meme_id, file_id)
if duplicate_of:
logging.info(
"Meme %s is a file_id duplicate of meme %s, marking as duplicate.",
diff --git a/src/tgbot/handlers/stats/wrapped.py b/src/tgbot/handlers/stats/wrapped.py
index 87e8e0c7..f265b59b 100644
--- a/src/tgbot/handlers/stats/wrapped.py
+++ b/src/tgbot/handlers/stats/wrapped.py
@@ -1,27 +1,26 @@
import asyncio
import datetime
-import json
import logging
import random
import sys
from html import escape as html_escape
from urllib.parse import quote
-from openai import AsyncOpenAI
from telegram import InlineKeyboardButton, InlineKeyboardMarkup, Update
from telegram.constants import ChatAction
from telegram.ext import ContextTypes
-from src.config import settings
from src.localizer import ALMOST_CIS_LANGUAGES
from src.redis import get_user_wrapped, set_user_wrapped
from src.stats.service import (
get_meme_descriptions_for_wrapped,
- get_most_liked_meme_source_urls,
- get_top_meme_source_urls,
get_user_stats,
)
from src.storage.schemas import MemeData
+from src.tgbot.handlers.stats.wrapped_generation import (
+ generate_wrapped_data,
+ get_bot_usage_report,
+)
from src.tgbot.senders.meme import send_new_message_with_meme
from src.tgbot.service import (
create_or_update_user,
@@ -74,18 +73,6 @@
"Try again →",
]
-ABSURD_CATEGORIES = [
- "бытовая техника",
- "животное",
- "блюдо/еда",
- "музыкальный жанр",
- "вид транспорта",
- "напиток",
- "предмет мебели",
- "персонаж мультфильма",
- "погода",
-]
-
def _log(msg: str) -> None:
"""Force-log to stderr (bypasses gunicorn log config)."""
@@ -125,230 +112,6 @@ def _next_label(is_ru: bool) -> str:
return "Дальше →" if is_ru else "Next →"
-# ── LLM ──────────────────────────────────────────────────
-
-
-async def call_deepseek(prompt: str) -> str:
- client = AsyncOpenAI(
- api_key=settings.DEEPSEEK_API_KEY,
- base_url=settings.DEEPSEEK_BASE_URL,
- )
- resp = await client.chat.completions.create(
- model="deepseek-chat",
- messages=[{"role": "user", "content": prompt}],
- max_tokens=2000,
- temperature=0.9,
- )
- return resp.choices[0].message.content
-
-
-def parse_json_from_llm(raw: str) -> dict | None:
- c = raw.strip()
- if c.startswith("```"):
- c = c.split("\n", 1)[1] if "\n" in c else c[3:]
- if c.endswith("```"):
- c = c[:-3]
- c = c.strip()
- if c.startswith("json"):
- c = c[4:].strip()
- try:
- return json.loads(c)
- except Exception:
- return None
-
-
-# ── SQL INSIGHTS ─────────────────────────────────────────
-
-
-async def get_reaction_speed_insight(user_id: int) -> dict:
- """Median reaction time, split by like/dislike. Pure SQL."""
- from sqlalchemy import text
-
- from src.database import fetch_one
-
- row = await fetch_one(
- text(
- """
- WITH reactions AS (
- SELECT
- EXTRACT(EPOCH FROM (reacted_at - sent_at)) AS sec,
- reaction_id
- FROM user_meme_reaction
- WHERE user_id = :user_id
- AND reacted_at IS NOT NULL AND sent_at IS NOT NULL
- AND EXTRACT(EPOCH FROM (reacted_at - sent_at))
- BETWEEN 0.5 AND 120
- )
- SELECT
- PERCENTILE_CONT(0.5) WITHIN GROUP (
- ORDER BY sec
- ) AS median_sec,
- PERCENTILE_CONT(0.5) WITHIN GROUP (
- ORDER BY sec
- ) FILTER (WHERE reaction_id = 1) AS median_like,
- PERCENTILE_CONT(0.5) WITHIN GROUP (
- ORDER BY sec
- ) FILTER (WHERE reaction_id = 2) AS median_dislike
- FROM reactions
- """
- ),
- {"user_id": user_id},
- )
-
- if not row or row["median_sec"] is None:
- return {}
- return {
- "median_sec": round(float(row["median_sec"]), 1),
- "median_like": round(float(row["median_like"] or 0), 1),
- "median_dislike": round(float(row["median_dislike"] or 0), 1),
- }
-
-
-async def get_peak_hour_insight(user_id: int, is_ru: bool = True) -> dict:
- """Peak activity hour. Moscow time for RU, UTC for EN."""
- from sqlalchemy import text
-
- from src.database import fetch_one
-
- # UTC+3 for Russian users
- tz_offset = 3 if is_ru else 0
- row = await fetch_one(
- text(
- f"""
- SELECT
- EXTRACT(HOUR FROM reacted_at + interval '{tz_offset} hours')
- AS peak_hour,
- COUNT(*) AS cnt
- FROM user_meme_reaction
- WHERE user_id = :user_id AND reacted_at IS NOT NULL
- GROUP BY 1 ORDER BY 2 DESC LIMIT 1
- """
- ),
- {"user_id": user_id},
- )
-
- if not row:
- return {}
- hour = int(row["peak_hour"])
- if is_ru:
- labels = {
- (0, 6): "ночной скроллер 🌙",
- (6, 10): "утренний мемолюб ☀️",
- (10, 14): "дневной прокрастинатор 💼",
- (14, 18): "послеобеденный залипатель 🍕",
- (18, 22): "вечерний мемоман 🌆",
- (22, 24): "полуночный скроллер 🦉",
- }
- default_label = "мемоман"
- else:
- labels = {
- (0, 6): "night scroller 🌙",
- (6, 10): "morning meme lover ☀️",
- (10, 14): "daytime procrastinator 💼",
- (14, 18): "afternoon meme addict 🍕",
- (18, 22): "evening meme connoisseur 🌆",
- (22, 24): "midnight scroller 🦉",
- }
- default_label = "meme lover"
- label = next(
- (v for (lo, hi), v in labels.items() if lo <= hour < hi),
- default_label,
- )
- tz_label = "МСК" if is_ru else "UTC"
- return {"hour": hour, "label": label, "tz": tz_label}
-
-
-async def get_surprise_meme(user_id: int) -> dict | None:
- """Meme user liked but most others didn't."""
- from sqlalchemy import text
-
- from src.database import fetch_one
-
- row = await fetch_one(
- text(
- """
- SELECT m.id AS meme_id, m.type, m.telegram_file_id,
- ROUND(COALESCE(ms.lr_smoothed, 0.5) * 100)
- AS global_lr_pct
- FROM user_meme_reaction umr
- JOIN meme m ON m.id = umr.meme_id
- LEFT JOIN meme_stats ms ON ms.meme_id = m.id
- WHERE umr.user_id = :user_id
- AND umr.reaction_id = 1
- AND m.telegram_file_id IS NOT NULL
- AND COALESCE(ms.lr_smoothed, 0.5) < 0.35
- AND COALESCE(ms.nmemes_sent, 0) >= 10
- ORDER BY ms.lr_smoothed ASC LIMIT 1
- """
- ),
- {"user_id": user_id},
- )
- if not row:
- return None
- return dict(row)
-
-
-async def get_most_popular_liked_meme(user_id: int) -> dict | None:
- """Meme user liked with highest global like rate."""
- from sqlalchemy import text
-
- from src.database import fetch_one
-
- row = await fetch_one(
- text(
- """
- SELECT m.id AS meme_id, m.type, m.telegram_file_id,
- ROUND(COALESCE(ms.lr_smoothed, 0.5) * 100)
- AS global_lr_pct,
- COALESCE(ms.nlikes, 0) AS nlikes
- FROM user_meme_reaction umr
- JOIN meme m ON m.id = umr.meme_id
- LEFT JOIN meme_stats ms ON ms.meme_id = m.id
- WHERE umr.user_id = :user_id
- AND umr.reaction_id = 1
- AND m.telegram_file_id IS NOT NULL
- AND COALESCE(ms.nmemes_sent, 0) >= 10
- ORDER BY ms.lr_smoothed DESC LIMIT 1
- """
- ),
- {"user_id": user_id},
- )
- if not row:
- return None
- return dict(row)
-
-
-async def get_unpopular_opinion_meme(user_id: int) -> dict | None:
- """Meme user disliked but was very popular globally."""
- from sqlalchemy import text
-
- from src.database import fetch_one
-
- row = await fetch_one(
- text(
- """
- SELECT m.id AS meme_id, m.type, m.telegram_file_id,
- ROUND(COALESCE(ms.lr_smoothed, 0.5) * 100)
- AS global_lr_pct,
- COALESCE(ms.nlikes, 0) AS nlikes
- FROM user_meme_reaction umr
- JOIN meme m ON m.id = umr.meme_id
- LEFT JOIN meme_stats ms ON ms.meme_id = m.id
- WHERE umr.user_id = :user_id
- AND umr.reaction_id = 2
- AND m.telegram_file_id IS NOT NULL
- AND COALESCE(ms.lr_smoothed, 0.5) > 0.65
- AND COALESCE(ms.nmemes_sent, 0) >= 10
- ORDER BY ms.lr_smoothed DESC LIMIT 1
- """
- ),
- {"user_id": user_id},
- )
- if not row:
- return None
- return dict(row)
-
-
# ── MAIN HANDLER ─────────────────────────────────────────
@@ -952,475 +715,6 @@ async def handle_wrapped_clear(
await update.message.reply_text("Cache cleared ✓ /wrapped")
-# ── GENERATION ───────────────────────────────────────────
-
-
-async def generate_wrapped_data(
- user_id: int,
- descriptions: list,
- lang: str,
- stats_report: str,
-) -> dict | None:
- # Lock is already set by _generate_and_cache (with is_ru), don't overwrite it
-
- try:
- liked = [d for d in descriptions if d.get("reaction_id") == 1]
- disliked = [d for d in descriptions if d.get("reaction_id") == 2]
-
- liked_texts = "\n".join(
- f"[{i}] ✅ {d.get('description') or d.get('ocr_text', '')}"
- for i, d in enumerate(liked[:25])
- )
- disliked_texts = "\n".join(
- f"❌ {d.get('description') or d.get('ocr_text', '')}" for d in disliked[:15]
- )
-
- # DeepSeek + SQL in parallel
- is_ru = _is_ru(lang)
- prompt = _build_mega_prompt(liked_texts, disliked_texts, lang)
-
- deepseek_task = asyncio.create_task(call_deepseek(prompt))
- sql_tasks = asyncio.gather(
- _safe(get_reaction_speed_insight(user_id)),
- _safe(get_peak_hour_insight(user_id, is_ru)),
- _safe(get_surprise_meme(user_id)),
- _safe(_build_sources_report(user_id, is_ru)),
- _safe(get_most_popular_liked_meme(user_id)),
- _safe(get_unpopular_opinion_meme(user_id)),
- )
-
- raw, (speed, peak, surprise, sources, popular_meme, unpopular_meme) = await asyncio.gather(
- deepseek_task, sql_tasks
- )
-
- p = parse_json_from_llm(raw)
- if not p:
- logger.warning(
- "DeepSeek JSON failed user %d: %s",
- user_id,
- raw[:300],
- )
- p = {}
-
- your_meme = _pick_meme(p, liked)
-
- # Use surprise meme if LLM didn't pick one
- if not your_meme and surprise:
- lr = surprise.get("global_lr_pct", "?")
- if is_ru:
- cap = f"🎲 Этот мем лайкнул только ты\n(глобальный лайк-рейт: {lr}%)"
- else:
- cap = f"🎲 Only you liked this meme\n(global like rate: {lr}%)"
- your_meme = {"meme_id": surprise["meme_id"], "caption": cap}
- if not your_meme and liked:
- pick = random.choice(liked[:10])
- cap = "🎲 А вот мем, который тебе зашёл:" if is_ru else "🎲 Here's a meme you liked:"
- your_meme = {"meme_id": pick["meme_id"], "caption": cap}
-
- # Build slides
- # Stats report gets vibe from DeepSeek — replace placeholder vibe
- vibe = p.get("vibe", "")
- if vibe and stats_report:
- if "\n" in stats_report:
- idx = stats_report.rfind("\n")
- stats_report = stats_report[:idx]
- stats_report += f"\n\n{html_escape(vibe)}"
-
- # Track used meme IDs globally to avoid showing the same meme twice
- global_used_memes = set()
- if your_meme and your_meme.get("meme_id"):
- global_used_memes.add(your_meme["meme_id"])
-
- # Pick oneliner meme (avoid your_meme)
- oneliner_meme_id = None
- if liked:
- oneliner_candidates = [m for m in liked[:10] if m["meme_id"] not in global_used_memes]
- if oneliner_candidates:
- oneliner_meme_id = random.choice(oneliner_candidates)["meme_id"]
- else:
- oneliner_meme_id = random.choice(liked[:10])["meme_id"]
- global_used_memes.add(oneliner_meme_id)
-
- # Pick memes for absurd comparisons (avoid already used)
- absurd_memes = _attach_memes_to_absurd(p, liked, global_used_memes)
-
- default_prediction = (
- "Летом ты будешь листать мемы вместо работы 🔥"
- if is_ru
- else "This summer you'll scroll memes instead of working 🔥"
- )
- return {
- "stats_report": stats_report,
- "zodiac": _build_zodiac_slide(p, is_ru),
- "your_meme": your_meme,
- "humor_dna": _build_humor_dna_slide(p, is_ru),
- "humor_oneliner": p.get("humor_oneliner", ""),
- "oneliner_meme_id": oneliner_meme_id,
- "absurd_items": absurd_memes,
- "anti_profile": _build_anti_slide(p, is_ru),
- "popular_meme": _build_meme_data(popular_meme, is_popular=True, is_ru=is_ru),
- "unpopular_meme": _build_meme_data(unpopular_meme, is_popular=False, is_ru=is_ru),
- "stats_extra": _build_extra_slide(sources, speed, peak, is_ru),
- "prediction": p.get("prediction", default_prediction),
- }
- except Exception as e:
- logger.error("Wrapped failed user %d: %s", user_id, e, exc_info=True)
- default_prediction = (
- "Летом ты будешь листать мемы вместо работы 🔥"
- if is_ru
- else "This summer you'll scroll memes instead of working 🔥"
- )
- return {
- "stats_report": stats_report,
- "zodiac": "",
- "your_meme": None,
- "humor_dna": "",
- "humor_oneliner": "",
- "oneliner_meme_id": None,
- "absurd_items": [],
- "anti_profile": "",
- "popular_meme": None,
- "unpopular_meme": None,
- "stats_extra": "",
- "prediction": default_prediction,
- }
-
-
-async def _safe(coro):
- try:
- return await coro
- except Exception as e:
- logger.warning("Wrapped SQL insight failed: %s", e)
- return {} if not isinstance(e, TypeError) else None
-
-
-def _build_mega_prompt(liked_texts: str, disliked_texts: str, lang: str = "ru") -> str:
- categories = random.sample(ABSURD_CATEGORIES, 3)
-
- lang_instruction = ""
- if lang != "ru":
- lang_name = "English" if lang == "en" else lang
- lang_instruction = f"\n- ЯЗЫК: пиши ВЕСЬ JSON на {lang_name}"
-
- return f"""Ты мем-психолог. Проанализируй чувство юмора.
-
-ЛАЙКНУТЫЕ МЕМЫ:
-{liked_texts}
-
-СКИПНУТЫЕ МЕМЫ:
-{disliked_texts}
-
-Сначала молча найди:
-1) 2-3 самые частые мотивы в лайках (офис, животные, кринж, токсичная мотивация, low-res chaos, семейная драма, etc.)
-2) 1-2 мотива, которые человек стабильно скипает
-3) 1 противоречие между лайками и скипами
-Рассуждения НЕ выводи. Только JSON.
-
-Верни ТОЛЬКО JSON:
-{{
- "vibe": "подкол от друга по мемам, 10-15 слов",
- "meme_index": число (индекс лайкнутого мема [N], который олицетворяет),
- "meme_caption": "почему этот мем — это ты (2 предложения, подкол)",
- "zodiac_sign": "знак зодиака + эмодзи (♈♉♊♋♌♍♎♏♐♑♒♓)",
- "zodiac_why": "1-2 предложения. Выбирай знак НЕ по характеру, \
-а по ЛОГИКЕ мемов. Упомяни конкретный мотив.",
- "humor_dna": [
- {{"name": "категория", "pct": число}},
- {{"name": "категория", "pct": число}},
- {{"name": "категория", "pct": число}},
- {{"name": "категория", "pct": число}},
- {{"name": "категория", "pct": число}}
- ],
- "humor_oneliner": "4-8 слов. Ярлык мем-вкуса, не комплимент. \
-Как кличка от друга, не описание из гороскопа.",
- "anti_profile": "2-3 коротких абзаца через \\n\\n. \
-На ТЫ: 'ты терпеть не можешь...'. Конкретно. \
-Последний абзац ОБЯЗАТЕЛЬНО позитивный — что в этом крутого, \
-почему такой вкус в мемах это кайф.",
- "absurd_comparisons": [
- {{"category": "{categories[0]}", "thing": "конкретный предмет", \
-"why": "потому что ты лайкаешь X и Y — 1 предложение", \
-"meme_ref": число}},
- {{"category": "{categories[1]}", "thing": "конкретный предмет", \
-"why": "1 предложение", "meme_ref": число}},
- {{"category": "{categories[2]}", "thing": "конкретный предмет", \
-"why": "1 предложение", "meme_ref": число}}
- ],
- "prediction": "конкретное абсурдное событие на лето 2026. 1-2 предложения."
-}}
-
-Правила:
-- humor_dna: 5 конкретных прикольных категорий по 2-3 слова, проценты ~100
-- zodiac: знак как метафора мемного поведения, не "кто он по жизни". \
-ВАЖНО: НЕ БЛИЗНЕЦЫ. Близнецы — запрещённый знак. Выбирай из остальных 11 знаков. \
-Привязывай знак к КОНКРЕТНЫМ паттернам в мемах (например: Овен если агрессивный юмор, \
-Рыбы если меланхолия, Лев если самоирония, Козерог если сухой юмор, и т.д.)
-- absurd_comparisons: thing = конкретный предмет (не "хаос-машина"). \
-Каждый comparison на ДРУГИХ мотивах, не повторяй шутку. \
-meme_ref ДОЛЖЕН быть РАЗНЫМ для каждого comparison (три разных числа!)
-- meme_ref: индекс [N] из ЛАЙКНУТЫХ мемов. Каждый meme_ref уникален!
-- meme_index: ДОЛЖЕН отличаться от всех meme_ref в absurd_comparisons
-
-АНТИСЛОП:
-- ЗАПРЕЩЕНЫ слова: уникальный, особенный, тонкий, изысканный, многогранный, хаотичный, вайб, ирония, абсурд (без конкретики)
-- ЗАПРЕЩЕНЫ шаблоны: "ты из тех, кто...", "генерал постиронии", "ценитель абсурда"
-- Подкалывай дружески, но ВСЕГДА заканчивай на позитивной ноте. \
-Человек должен улыбнуться, а не расстроиться. \
-Формула: подкол + комплимент ("ты залипаешь на X — но это потому что у тебя Y"). \
-Если мемы пользователя про грусть, депрессию, одиночество — будь мягче и теплее. \
-Не подчёркивай негатив, а покажи что юмор помогает справляться
-- Каждое утверждение ДОЛЖНО опираться на конкретный мем
-- Если шутка подошла бы любому — перепиши
-- Лучший юмор = противоречия: "лайкаешь X, но скипаешь Y"{lang_instruction}"""
-
-
-def _pick_meme(p: dict, liked: list) -> dict | None:
- idx = p.get("meme_index")
- cap = p.get("meme_caption", "🎯 Этот мем олицетворяет тебя")
- if idx is not None and 0 <= idx < len(liked):
- return {
- "meme_id": liked[idx]["meme_id"],
- "caption": f"🎯 Этот мем олицетворяет тебя:\n\n{html_escape(cap)}",
- }
- return None
-
-
-def _build_humor_dna_slide(p: dict, is_ru: bool = True) -> str:
- """Humor DNA bars only — no roast text."""
- dna = p.get("humor_dna", [])
-
- def bar(pct):
- f = round(pct / 10)
- return "█" * f + "░" * (10 - f)
-
- header = "🧬 Твоя ДНК юмора:" if is_ru else "🧬 Your Humor DNA:"
- lines = [header + "\n"]
- for c in dna[:5]:
- pct = min(100, max(0, c.get("pct", 33)))
- lines.append(f"{bar(pct)} {pct}%\n{html_escape(c.get('name', '???'))}\n")
-
- return "\n".join(lines) if len(lines) > 1 else ""
-
-
-def _build_zodiac_slide(p: dict, is_ru: bool = True) -> str:
- sign = p.get("zodiac_sign", "")
- why = p.get("zodiac_why", "")
- if not sign:
- return ""
- header = "🔮 Твой мем-зодиак:" if is_ru else "🔮 Your Meme Zodiac:"
- return f"{header}\n\n{html_escape(sign)}\n\n{html_escape(why)}"
-
-
-def _attach_memes_to_absurd(p: dict, liked: list, used_ids: set | None = None) -> list:
- """Attach meme IDs to each absurd comparison, ensuring no duplicates."""
- comparisons = p.get("absurd_comparisons", [])
- result = []
- if used_ids is None:
- used_ids = set()
- else:
- used_ids = set(used_ids) # don't mutate caller's set
- for c in comparisons[:3]:
- meme_id = None
- # Try LLM-suggested meme_ref (but skip if already used)
- ref = c.get("meme_ref")
- if ref is not None and isinstance(ref, int) and 0 <= ref < len(liked):
- candidate = liked[ref]["meme_id"]
- if candidate not in used_ids:
- meme_id = candidate
- # Fallback: random liked meme not yet used
- if not meme_id and liked:
- available = [m for m in liked[:15] if m["meme_id"] not in used_ids]
- if available:
- pick = random.choice(available)
- meme_id = pick["meme_id"]
- if meme_id:
- used_ids.add(meme_id)
- result.append(
- {
- "category": c.get("category", "?"),
- "thing": c.get("thing", "?"),
- "why": c.get("why", ""),
- "meme_id": meme_id,
- }
- )
- return result
-
-
-def _build_meme_data(meme: dict | None, is_popular: bool, is_ru: bool = True) -> dict | None:
- if not meme:
- return None
- lr = meme.get("global_lr_pct", "?")
- nlikes = meme.get("nlikes")
- if is_popular:
- if is_ru:
- extra = f" ({nlikes} чел.)" if nlikes else ""
- caption = f"🏆 Самый залайканный мем из твоих лайков!\n\nЕго лайкнули {lr}%{extra}"
- else:
- extra = f" ({nlikes} people)" if nlikes else ""
- caption = f"🏆 The most liked meme from your likes!\n\nLiked by {lr}%{extra}"
- else:
- if is_ru:
- extra = f" ({nlikes} чел.)" if nlikes else ""
- caption = f"🤔 А этот мем ты скипнул...\n\nХотя его лайкнули {lr}%{extra}!"
- else:
- extra = f" ({nlikes} people)" if nlikes else ""
- caption = f"🤔 You skipped this one...\n\nBut {lr}%{extra} liked it!"
- return {"meme_id": meme["meme_id"], "caption": caption}
-
-
-def _build_anti_slide(p: dict, is_ru: bool = True) -> str:
- anti = p.get("anti_profile", "")
- if not anti:
- return ""
- header = (
- "🚫 Что говорят твои скипы:" if is_ru else "🚫 What your skips say about you:"
- )
- return f"{header}\n\n{html_escape(anti)}"
-
-
-def _build_extra_slide(
- sources: str,
- speed: dict,
- peak: dict,
- is_ru: bool = True,
-) -> str:
- parts = []
- if sources:
- parts.append(sources)
-
- if speed:
- med = speed.get("median_sec", 0)
- ml = speed.get("median_like", 0)
- md = speed.get("median_dislike", 0)
- if is_ru:
- parts.append(
- f"⚡ Скорость реакции: {med} сек\n(до лайка: {ml} сек, до скипа: {md} сек)"
- )
- else:
- parts.append(f"⚡ Reaction speed: {med}s\n(to like: {ml}s, to skip: {md}s)")
-
- if peak:
- h = peak.get("hour", 0)
- label = peak.get("label", "")
- tz = peak.get("tz", "")
- if is_ru:
- parts.append(f"🕐 Пик активности: {h}:00 {tz}\nТы — {label}")
- else:
- parts.append(f"🕐 Peak activity: {h}:00 {tz}\nYou're a {label}")
-
- return "\n\n".join(parts) if parts else ""
-
-
-async def _build_sources_report(user_id: int, is_ru: bool = True) -> str:
- sources = await get_most_liked_meme_source_urls(user_id, limit=10)
- real = [
- s
- for s in (sources or [])
- if s.get("url")
- and not s["url"].startswith("tg://user")
- and ("t.me/" in s["url"] or "vk.com/" in s["url"])
- ]
- if len(real) < 3:
- try:
- top = await get_top_meme_source_urls(limit=5)
- for t in top or []:
- if (
- t.get("url")
- and not t["url"].startswith("tg://user")
- and t["url"] not in [s["url"] for s in real]
- ):
- real.append(t)
- if len(real) >= 3:
- break
- except Exception:
- pass
- if not real:
- return ""
- src_list = "\n".join(f"▪️ {s['url']}" for s in real[:3])
- header = "📡 Твои топ мем-паблики:" if is_ru else "📡 Your top meme channels:"
- return f"{header}\n\n{src_list}"
-
-
-# ── STATS SLIDE ──────────────────────────────────────────
-
-
-async def get_bot_usage_report(
- user_id: int,
- user_stats: dict,
- user: dict,
- is_ru: bool = True,
-) -> str | None:
- if user_stats is None:
- return None
-
- days = (datetime.datetime.utcnow() - user["created_at"]).days + 1
- sessions = user_stats.get("nsessions", 0)
- memes_sent = user_stats.get("nmemes_sent", 0)
- likes = user_stats.get("nlikes", 0)
- time_sec = user_stats.get("time_spent_sec", 0)
-
- if likes < 10:
- return None
-
- like_rate = round(100 * likes / max(memes_sent, 1))
-
- if is_ru:
- report = (
- "📊 Meme Wrapped 2026\n\n"
- "Начнём с цифр.\n\n"
- f"Ты с нами уже {days} дней.\n\n"
- f"🤝 Посмотрел {memes_sent} мемов\n"
- f"👍 Лайкнул {likes} из них "
- f"({like_rate}%)\n"
- f"👋 Заходил {sessions} раз\n"
- )
- if time_sec > 0:
- if time_sec < 60:
- t = f"{time_sec} сек"
- elif time_sec < 3600:
- t = f"{time_sec // 60} мин {time_sec % 60} сек"
- else:
- t = f"больше {time_sec // 3600} часов 😳"
- report += f"🕒 В боте {t}\n"
- if like_rate > 50:
- vibe = "Лайкаешь больше половины — тебе всё смешно 😄"
- elif like_rate > 30:
- vibe = "Лайкаешь каждый третий — у тебя есть вкус 👌"
- elif like_rate > 15:
- vibe = "Лайкаешь каждый пятый — избирательный 🧐"
- else:
- vibe = "Менее 15% мемов достойны — мем-сноб 🎩"
- else:
- report = (
- "📊 Meme Wrapped 2026\n\n"
- "Let's start with the numbers.\n\n"
- f"You've been with us for {days} days.\n\n"
- f"🤝 Seen {memes_sent} memes\n"
- f"👍 Liked {likes} of them "
- f"({like_rate}%)\n"
- f"👋 Visited {sessions} times\n"
- )
- if time_sec > 0:
- if time_sec < 60:
- t = f"{time_sec}s"
- elif time_sec < 3600:
- t = f"{time_sec // 60}m {time_sec % 60}s"
- else:
- t = f"over {time_sec // 3600} hours 😳"
- report += f"🕒 Time in bot: {t}\n"
- if like_rate > 50:
- vibe = "You like more than half — everything's funny to you 😄"
- elif like_rate > 30:
- vibe = "You like every third one — you've got taste 👌"
- elif like_rate > 15:
- vibe = "You like every fifth one — picky 🧐"
- else:
- vibe = "Less than 15% are worthy — meme snob 🎩"
-
- report += f"\n{vibe}"
- return report
-
-
def get_user_interface_language(user) -> str:
lang = user.get("language_code") if user else None
return lang if lang else "ru"
diff --git a/src/tgbot/handlers/stats/wrapped_generation.py b/src/tgbot/handlers/stats/wrapped_generation.py
new file mode 100644
index 00000000..ad313d72
--- /dev/null
+++ b/src/tgbot/handlers/stats/wrapped_generation.py
@@ -0,0 +1,728 @@
+import asyncio
+import datetime
+import json
+import logging
+import random
+from html import escape as html_escape
+
+from openai import AsyncOpenAI
+
+from src.config import settings
+from src.localizer import ALMOST_CIS_LANGUAGES
+from src.stats.service import (
+ get_most_liked_meme_source_urls,
+ get_top_meme_source_urls,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def _is_ru(lang_code: str | None) -> bool:  # None/"" is looked up as "ru"
+    return (lang_code or "ru") in ALMOST_CIS_LANGUAGES
+
+
+ABSURD_CATEGORIES = [  # _build_mega_prompt samples 3 of these per prompt
+    "бытовая техника",
+    "животное",
+    "блюдо/еда",
+    "музыкальный жанр",
+    "вид транспорта",
+    "напиток",
+    "предмет мебели",
+    "персонаж мультфильма",
+    "погода",
+]
+
+# ── LLM ──────────────────────────────────────────────────
+
+
+async def call_deepseek(prompt: str) -> str:
+    """Send one user prompt to DeepSeek chat; return the reply text ("" if it has none)."""
+    client = AsyncOpenAI(api_key=settings.DEEPSEEK_API_KEY, base_url=settings.DEEPSEEK_BASE_URL)
+    resp = await client.chat.completions.create(
+        model="deepseek-chat",
+        messages=[{"role": "user", "content": prompt}],
+        max_tokens=2000,
+        temperature=0.9,
+    )
+    # The SDK types message.content as optional; returning None here would break
+    # the declared ``-> str`` contract and every caller that strips or slices it.
+    return resp.choices[0].message.content or ""
+
+
+def parse_json_from_llm(raw: str | None) -> dict | None:
+    """Parse an LLM reply (optionally ```-fenced) into a dict; None if that fails."""
+    c = (raw or "").strip()
+    if c.startswith("```"):
+        c = c.split("\n", 1)[1] if "\n" in c else c[3:]
+    # Same cleanup as before: closing fence first, then a bare "json" language tag.
+    c = c.removesuffix("```").strip().removeprefix("json").strip()
+    try:
+        obj = json.loads(c)
+    except (ValueError, RecursionError):
+        return None
+    # Callers use .get(), so a JSON list/number/string counts as a parse failure.
+    return obj if isinstance(obj, dict) else None
+
+
+# ── SQL INSIGHTS ─────────────────────────────────────────
+
+
+async def get_reaction_speed_insight(user_id: int) -> dict:
+    """Median seconds from send to reaction (all/like/dislike) as a dict, or {} if none."""
+    from sqlalchemy import text
+
+    from src.database import fetch_one
+
+    row = await fetch_one(
+        text(
+            """
+        WITH reactions AS (
+            SELECT
+                EXTRACT(EPOCH FROM (reacted_at - sent_at)) AS sec,
+                reaction_id
+            FROM user_meme_reaction
+            WHERE user_id = :user_id
+              AND reacted_at IS NOT NULL AND sent_at IS NOT NULL
+              AND EXTRACT(EPOCH FROM (reacted_at - sent_at))
+                  BETWEEN 0.5 AND 120
+        )
+        SELECT
+            PERCENTILE_CONT(0.5) WITHIN GROUP (
+                ORDER BY sec
+            ) AS median_sec,
+            PERCENTILE_CONT(0.5) WITHIN GROUP (
+                ORDER BY sec
+            ) FILTER (WHERE reaction_id = 1) AS median_like,
+            PERCENTILE_CONT(0.5) WITHIN GROUP (
+                ORDER BY sec
+            ) FILTER (WHERE reaction_id = 2) AS median_dislike
+        FROM reactions
+    """
+        ),
+        {"user_id": user_id},
+    )
+    # No row or a NULL overall median -> {}; a NULL like/dislike median is reported as 0.
+    if not row or row["median_sec"] is None:
+        return {}
+    return {
+        "median_sec": round(float(row["median_sec"]), 1),
+        "median_like": round(float(row["median_like"] or 0), 1),
+        "median_dislike": round(float(row["median_dislike"] or 0), 1),
+    }
+
+
+async def get_peak_hour_insight(user_id: int, is_ru: bool = True) -> dict:
+    """Peak reaction hour as {"hour", "label", "tz"}; {} if no row. Shifted +3 h when is_ru."""
+    from sqlalchemy import text
+
+    from src.database import fetch_one
+
+    # UTC+3 for Russian users; an int literal, so the f-string SQL takes no user input
+    tz_offset = 3 if is_ru else 0
+    row = await fetch_one(
+        text(
+            f"""
+        SELECT
+            EXTRACT(HOUR FROM reacted_at + interval '{tz_offset} hours')
+                AS peak_hour,
+            COUNT(*) AS cnt
+        FROM user_meme_reaction
+        WHERE user_id = :user_id AND reacted_at IS NOT NULL
+        GROUP BY 1 ORDER BY 2 DESC LIMIT 1
+    """
+        ),
+        {"user_id": user_id},
+    )
+
+    if not row:
+        return {}
+    hour = int(row["peak_hour"])
+    if is_ru:
+        labels = {
+            (0, 6): "ночной скроллер 🌙",
+            (6, 10): "утренний мемолюб ☀️",
+            (10, 14): "дневной прокрастинатор 💼",
+            (14, 18): "послеобеденный залипатель 🍕",
+            (18, 22): "вечерний мемоман 🌆",
+            (22, 24): "полуночный скроллер 🦉",
+        }
+        default_label = "мемоман"
+    else:
+        labels = {
+            (0, 6): "night scroller 🌙",
+            (6, 10): "morning meme lover ☀️",
+            (10, 14): "daytime procrastinator 💼",
+            (14, 18): "afternoon meme addict 🍕",
+            (18, 22): "evening meme connoisseur 🌆",
+            (22, 24): "midnight scroller 🦉",
+        }
+        default_label = "meme lover"
+    label = next(
+        (v for (lo, hi), v in labels.items() if lo <= hour < hi),
+        default_label,
+    )
+    tz_label = "МСК" if is_ru else "UTC"
+    return {"hour": hour, "label": label, "tz": tz_label}
+
+
+async def get_surprise_meme(user_id: int) -> dict | None:
+    """Liked meme with the lowest global like rate (< 0.35, nmemes_sent >= 10), or None."""
+    from sqlalchemy import text
+
+    from src.database import fetch_one
+
+    row = await fetch_one(
+        text(
+            """
+        SELECT m.id AS meme_id, m.type, m.telegram_file_id,
+               ROUND(COALESCE(ms.lr_smoothed, 0.5) * 100)
+                   AS global_lr_pct
+        FROM user_meme_reaction umr
+        JOIN meme m ON m.id = umr.meme_id
+        LEFT JOIN meme_stats ms ON ms.meme_id = m.id
+        WHERE umr.user_id = :user_id
+          AND umr.reaction_id = 1
+          AND m.telegram_file_id IS NOT NULL
+          AND COALESCE(ms.lr_smoothed, 0.5) < 0.35
+          AND COALESCE(ms.nmemes_sent, 0) >= 10
+        ORDER BY ms.lr_smoothed ASC LIMIT 1
+    """
+        ),
+        {"user_id": user_id},
+    )
+    if not row:
+        return None
+    return dict(row)
+
+
+async def get_most_popular_liked_meme(user_id: int) -> dict | None:
+    """Liked meme with the highest global like rate (NULL rate ranks as 0.5), or None."""
+    from sqlalchemy import text
+
+    from src.database import fetch_one
+
+    row = await fetch_one(
+        text(
+            """
+        SELECT m.id AS meme_id, m.type, m.telegram_file_id,
+               ROUND(COALESCE(ms.lr_smoothed, 0.5) * 100)
+                   AS global_lr_pct,
+               COALESCE(ms.nlikes, 0) AS nlikes
+        FROM user_meme_reaction umr
+        JOIN meme m ON m.id = umr.meme_id
+        LEFT JOIN meme_stats ms ON ms.meme_id = m.id
+        WHERE umr.user_id = :user_id
+          AND umr.reaction_id = 1
+          AND m.telegram_file_id IS NOT NULL
+          AND COALESCE(ms.nmemes_sent, 0) >= 10
+        ORDER BY COALESCE(ms.lr_smoothed, 0.5) DESC LIMIT 1
+    """
+        ),
+        {"user_id": user_id},
+    )
+    if not row:
+        return None
+    return dict(row)
+
+
+async def get_unpopular_opinion_meme(user_id: int) -> dict | None:
+    """Disliked meme with the highest global like rate (> 0.65, nmemes_sent >= 10), or None."""
+    from sqlalchemy import text
+
+    from src.database import fetch_one
+
+    row = await fetch_one(
+        text(
+            """
+        SELECT m.id AS meme_id, m.type, m.telegram_file_id,
+               ROUND(COALESCE(ms.lr_smoothed, 0.5) * 100)
+                   AS global_lr_pct,
+               COALESCE(ms.nlikes, 0) AS nlikes
+        FROM user_meme_reaction umr
+        JOIN meme m ON m.id = umr.meme_id
+        LEFT JOIN meme_stats ms ON ms.meme_id = m.id
+        WHERE umr.user_id = :user_id
+          AND umr.reaction_id = 2
+          AND m.telegram_file_id IS NOT NULL
+          AND COALESCE(ms.lr_smoothed, 0.5) > 0.65
+          AND COALESCE(ms.nmemes_sent, 0) >= 10
+        ORDER BY ms.lr_smoothed DESC LIMIT 1
+    """
+        ),
+        {"user_id": user_id},
+    )
+    if not row:
+        return None
+    return dict(row)
+
+
+# ── GENERATION ───────────────────────────────────────────
+
+
+async def generate_wrapped_data(
+    user_id: int,
+    descriptions: list,
+    lang: str,
+    stats_report: str,
+) -> dict | None:
+    """Build the Wrapped slide payload from one DeepSeek reply plus the SQL insights.
+
+    If anything inside the ``try`` fails, the same keys come back with empty slides,
+    the current ``stats_report`` and a default prediction instead of an exception.
+    """
+    # Lock is already set by _generate_and_cache (with is_ru), don't overwrite it
+    # Bound before ``try``: the except fallback reads both names.
+    is_ru = _is_ru(lang)
+    default_prediction = (
+        "Летом ты будешь листать мемы вместо работы 🔥"
+        if is_ru
+        else "This summer you'll scroll memes instead of working 🔥"
+    )
+    try:
+        liked = [d for d in descriptions if d.get("reaction_id") == 1]
+        disliked = [d for d in descriptions if d.get("reaction_id") == 2]
+
+        liked_texts = "\n".join(
+            f"[{i}] ✅ {d.get('description') or d.get('ocr_text', '')}"
+            for i, d in enumerate(liked[:25])
+        )
+        disliked_texts = "\n".join(
+            f"❌ {d.get('description') or d.get('ocr_text', '')}" for d in disliked[:15]
+        )
+
+        # DeepSeek + SQL in parallel
+        prompt = _build_mega_prompt(liked_texts, disliked_texts, lang)
+
+        deepseek_task = asyncio.create_task(call_deepseek(prompt))
+        sql_tasks = asyncio.gather(
+            _safe(get_reaction_speed_insight(user_id)),
+            _safe(get_peak_hour_insight(user_id, is_ru)),
+            _safe(get_surprise_meme(user_id)),
+            _safe(_build_sources_report(user_id, is_ru)),
+            _safe(get_most_popular_liked_meme(user_id)),
+            _safe(get_unpopular_opinion_meme(user_id)),
+        )
+
+        raw, (speed, peak, surprise, sources, popular_meme, unpopular_meme) = await asyncio.gather(
+            deepseek_task, sql_tasks
+        )
+
+        p = parse_json_from_llm(raw or "")
+        if not isinstance(p, dict) or not p:
+            logger.warning(
+                "DeepSeek JSON failed user %d: %s",
+                user_id,
+                str(raw)[:300],
+            )
+            p = {}
+
+        your_meme = _pick_meme(p, liked)
+
+        # Use surprise meme if LLM didn't pick one
+        if not your_meme and surprise:
+            lr = surprise.get("global_lr_pct", "?")
+            if is_ru:
+                cap = f"🎲 Этот мем лайкнул только ты\n(глобальный лайк-рейт: {lr}%)"
+            else:
+                cap = f"🎲 Only you liked this meme\n(global like rate: {lr}%)"
+            your_meme = {"meme_id": surprise["meme_id"], "caption": cap}
+        if not your_meme and liked:
+            pick = random.choice(liked[:10])
+            cap = "🎲 А вот мем, который тебе зашёл:" if is_ru else "🎲 Here's a meme you liked:"
+            your_meme = {"meme_id": pick["meme_id"], "caption": cap}
+
+        # Build slides
+        # Stats report gets vibe from DeepSeek — replace placeholder vibe
+        vibe = p.get("vibe", "")
+        if vibe and stats_report:
+            if "\n" in stats_report:
+                idx = stats_report.rfind("\n")
+                stats_report = stats_report[:idx]
+            stats_report += f"\n\n{html_escape(str(vibe))}"
+
+        # Track used meme IDs globally to avoid showing the same meme twice
+        global_used_memes = set()
+        if your_meme and your_meme.get("meme_id"):
+            global_used_memes.add(your_meme["meme_id"])
+
+        # Pick oneliner meme (avoid your_meme)
+        oneliner_meme_id = None
+        if liked:
+            oneliner_candidates = [m for m in liked[:10] if m["meme_id"] not in global_used_memes]
+            if oneliner_candidates:
+                oneliner_meme_id = random.choice(oneliner_candidates)["meme_id"]
+            else:
+                oneliner_meme_id = random.choice(liked[:10])["meme_id"]
+            global_used_memes.add(oneliner_meme_id)
+
+        # Pick memes for absurd comparisons (avoid already used)
+        absurd_memes = _attach_memes_to_absurd(p, liked, global_used_memes)
+
+        return {
+            "stats_report": stats_report,
+            "zodiac": _build_zodiac_slide(p, is_ru),
+            "your_meme": your_meme,
+            "humor_dna": _build_humor_dna_slide(p, is_ru),
+            "humor_oneliner": p.get("humor_oneliner", ""),
+            "oneliner_meme_id": oneliner_meme_id,
+            "absurd_items": absurd_memes,
+            "anti_profile": _build_anti_slide(p, is_ru),
+            "popular_meme": _build_meme_data(popular_meme, is_popular=True, is_ru=is_ru),
+            "unpopular_meme": _build_meme_data(unpopular_meme, is_popular=False, is_ru=is_ru),
+            "stats_extra": _build_extra_slide(sources, speed, peak, is_ru),
+            "prediction": p.get("prediction", default_prediction),
+        }
+    except Exception as e:
+        logger.error("Wrapped failed user %d: %s", user_id, e, exc_info=True)
+        return {
+            "stats_report": stats_report,
+            "zodiac": "",
+            "your_meme": None,
+            "humor_dna": "",
+            "humor_oneliner": "",
+            "oneliner_meme_id": None,
+            "absurd_items": [],
+            "anti_profile": "",
+            "popular_meme": None,
+            "unpopular_meme": None,
+            "stats_extra": "",
+            "prediction": default_prediction,
+        }
+
+
+async def _safe(coro):  # await coro; failures are logged and become a falsy placeholder
+    try:
+        return await coro
+    except Exception as exc:
+        logger.warning("Wrapped SQL insight failed: %s", exc)
+        return None if isinstance(exc, TypeError) else {}
+
+
+def _build_mega_prompt(liked_texts: str, disliked_texts: str, lang: str = "ru") -> str:
+    categories = random.sample(ABSURD_CATEGORIES, 3)
+    # The three sampled categories are baked into the absurd_comparisons template below.
+    lang_instruction = ""
+    if lang != "ru":
+        lang_name = "English" if lang == "en" else lang
+        lang_instruction = f"\n- ЯЗЫК: пиши ВЕСЬ JSON на {lang_name}"
+    # The prompt itself is Russian; other languages only append an output-language rule.
+    return f"""Ты мем-психолог. Проанализируй чувство юмора.
+
+ЛАЙКНУТЫЕ МЕМЫ:
+{liked_texts}
+
+СКИПНУТЫЕ МЕМЫ:
+{disliked_texts}
+
+Сначала молча найди:
+1) 2-3 самые частые мотивы в лайках (офис, животные, кринж,
+токсичная мотивация, low-res chaos, семейная драма, etc.)
+2) 1-2 мотива, которые человек стабильно скипает
+3) 1 противоречие между лайками и скипами
+Рассуждения НЕ выводи. Только JSON.
+
+Верни ТОЛЬКО JSON:
+{{
+  "vibe": "подкол от друга по мемам, 10-15 слов",
+  "meme_index": число (индекс лайкнутого мема [N], который олицетворяет),
+  "meme_caption": "почему этот мем — это ты (2 предложения, подкол)",
+  "zodiac_sign": "знак зодиака + эмодзи (♈♉♊♋♌♍♎♏♐♑♒♓)",
+  "zodiac_why": "1-2 предложения. Выбирай знак НЕ по характеру, \
+а по ЛОГИКЕ мемов. Упомяни конкретный мотив.",
+  "humor_dna": [
+    {{"name": "категория", "pct": число}},
+    {{"name": "категория", "pct": число}},
+    {{"name": "категория", "pct": число}},
+    {{"name": "категория", "pct": число}},
+    {{"name": "категория", "pct": число}}
+  ],
+  "humor_oneliner": "4-8 слов. Ярлык мем-вкуса, не комплимент. \
+Как кличка от друга, не описание из гороскопа.",
+  "anti_profile": "2-3 коротких абзаца через \\n\\n. \
+На ТЫ: 'ты терпеть не можешь...'. Конкретно. \
+Последний абзац ОБЯЗАТЕЛЬНО позитивный — что в этом крутого, \
+почему такой вкус в мемах это кайф.",
+  "absurd_comparisons": [
+    {{"category": "{categories[0]}", "thing": "конкретный предмет", \
+"why": "потому что ты лайкаешь X и Y — 1 предложение", \
+"meme_ref": число}},
+    {{"category": "{categories[1]}", "thing": "конкретный предмет", \
+"why": "1 предложение", "meme_ref": число}},
+    {{"category": "{categories[2]}", "thing": "конкретный предмет", \
+"why": "1 предложение", "meme_ref": число}}
+  ],
+  "prediction": "конкретное абсурдное событие на лето 2026. 1-2 предложения."
+}}
+
+Правила:
+- humor_dna: 5 конкретных прикольных категорий по 2-3 слова, проценты ~100
+- zodiac: знак как метафора мемного поведения, не "кто он по жизни". \
+ВАЖНО: НЕ БЛИЗНЕЦЫ. Близнецы — запрещённый знак. Выбирай из остальных 11 знаков. \
+Привязывай знак к КОНКРЕТНЫМ паттернам в мемах (например: Овен если агрессивный юмор, \
+Рыбы если меланхолия, Лев если самоирония, Козерог если сухой юмор, и т.д.)
+- absurd_comparisons: thing = конкретный предмет (не "хаос-машина"). \
+Каждый comparison на ДРУГИХ мотивах, не повторяй шутку. \
+meme_ref ДОЛЖЕН быть РАЗНЫМ для каждого comparison (три разных числа!)
+- meme_ref: индекс [N] из ЛАЙКНУТЫХ мемов. Каждый meme_ref уникален!
+- meme_index: ДОЛЖЕН отличаться от всех meme_ref в absurd_comparisons
+
+АНТИСЛОП:
+- ЗАПРЕЩЕНЫ слова: уникальный, особенный, тонкий, изысканный,
+многогранный, хаотичный, вайб, ирония, абсурд (без конкретики)
+- ЗАПРЕЩЕНЫ шаблоны: "ты из тех, кто...", "генерал постиронии", "ценитель абсурда"
+- Подкалывай дружески, но ВСЕГДА заканчивай на позитивной ноте. \
+Человек должен улыбнуться, а не расстроиться. \
+Формула: подкол + комплимент ("ты залипаешь на X — но это потому что у тебя Y"). \
+Если мемы пользователя про грусть, депрессию, одиночество — будь мягче и теплее. \
+Не подчёркивай негатив, а покажи что юмор помогает справляться
+- Каждое утверждение ДОЛЖНО опираться на конкретный мем
+- Если шутка подошла бы любому — перепиши
+- Лучший юмор = противоречия: "лайкаешь X, но скипаешь Y"{lang_instruction}"""
+
+
+def _pick_meme(p: dict, liked: list) -> dict | None:
+    """Return the liked meme the LLM chose via meme_index, or None if it is unusable."""
+    idx = p.get("meme_index")
+    # LLM JSON is untyped: "3", 3.5 or null must neither raise nor index the list.
+    if not isinstance(idx, int) or not 0 <= idx < len(liked):
+        return None
+    cap = str(p.get("meme_caption") or "🎯 Этот мем олицетворяет тебя")
+    head = "🎯 Этот мем олицетворяет тебя:"
+    return {"meme_id": liked[idx]["meme_id"], "caption": f"{head}\n\n{html_escape(cap)}"}
+
+
+def _build_humor_dna_slide(p: dict, is_ru: bool = True) -> str:
+    """Render up to five humor-DNA bars (10 cells each); "" when there is nothing to show."""
+    dna = p.get("humor_dna")
+    items = [c for c in dna[:5] if isinstance(c, dict)] if isinstance(dna, list) else []
+    header = "🧬 Твоя ДНК юмора:" if is_ru else "🧬 Your Humor DNA:"
+    lines = [header + "\n"]
+    for c in items:
+        try:
+            pct = min(100, max(0, round(float(c.get("pct", 33)))))
+        except (TypeError, ValueError, OverflowError):
+            pct = 33  # LLM sent a non-number; same fallback as a missing pct
+        f = round(pct / 10)
+        name = html_escape(str(c.get("name") or "???"))
+        lines.append(f"{'█' * f}{'░' * (10 - f)} {pct}%\n{name}\n")
+    return "\n".join(lines) if len(lines) > 1 else ""
+
+
+def _build_zodiac_slide(p: dict, is_ru: bool = True) -> str:
+    """Format the zodiac slide; "" when zodiac_sign is missing. Null/non-str values are safe."""
+    sign, why = (str(p.get(key) or "") for key in ("zodiac_sign", "zodiac_why"))
+    if not sign:
+        return ""
+    header = "🔮 Твой мем-зодиак:" if is_ru else "🔮 Your Meme Zodiac:"
+    return f"{header}\n\n{html_escape(sign)}\n\n{html_escape(why)}"
+
+
+def _attach_memes_to_absurd(p: dict, liked: list, used_ids: set | None = None) -> list:
+    """Attach a distinct liked meme_id to each of the first three absurd comparisons.
+
+    A valid, unused ``meme_ref`` index into ``liked`` wins; otherwise a random unused
+    meme from ``liked[:15]`` is used, and ``meme_id`` is None when none is left.
+    """
+    comparisons = p.get("absurd_comparisons")
+    if not isinstance(comparisons, list):  # null or malformed LLM output
+        return []
+    taken = set(used_ids or ())  # private copy: don't mutate caller's set
+    result = []
+    for c in [x for x in comparisons[:3] if isinstance(x, dict)]:
+        meme_id = None
+        ref = c.get("meme_ref")
+        if isinstance(ref, int) and 0 <= ref < len(liked):
+            candidate = liked[ref]["meme_id"]
+            if candidate not in taken:
+                meme_id = candidate
+        if not meme_id:
+            available = [m["meme_id"] for m in liked[:15] if m["meme_id"] not in taken]
+            if available:
+                meme_id = random.choice(available)
+        if meme_id:
+            taken.add(meme_id)
+        result.append(
+            {
+                "category": c.get("category", "?"),
+                "thing": c.get("thing", "?"),
+                "why": c.get("why", ""),
+                "meme_id": meme_id,
+            }
+        )
+    return result
+
+
+def _build_meme_data(meme: dict | None, is_popular: bool, is_ru: bool = True) -> dict | None:
+    """Build the ``{"meme_id", "caption"}`` payload for a highlighted meme.
+
+    Args:
+        meme: Meme stats row; ``meme_id`` is required, ``global_lr_pct`` and
+            ``nlikes`` are optional. A falsy value yields ``None``.
+        is_popular: True for the "most liked" caption, False for the
+            "you skipped this one" caption.
+        is_ru: Russian caption when true, English otherwise.
+    """
+    if not meme:
+        return None
+
+    lr = meme.get("global_lr_pct", "?")
+    nlikes = meme.get("nlikes")
+
+    # The like-count suffix is only shown when a non-zero count is available.
+    if is_ru:
+        extra = f" ({nlikes} чел.)" if nlikes else ""
+        caption = (
+            f"🏆 Самый залайканный мем из твоих лайков!\n\nЕго лайкнули {lr}%{extra}"
+            if is_popular
+            else f"🤔 А этот мем ты скипнул...\n\nХотя его лайкнули {lr}%{extra}!"
+        )
+    else:
+        extra = f" ({nlikes} people)" if nlikes else ""
+        caption = (
+            f"🏆 The most liked meme from your likes!\n\nLiked by {lr}%{extra}"
+            if is_popular
+            else f"🤔 You skipped this one...\n\nBut {lr}%{extra} liked it!"
+        )
+    return {"meme_id": meme["meme_id"], "caption": caption}
+
+
+def _build_anti_slide(p: dict, is_ru: bool = True) -> str:
+    """Render the "what your skips say" slide from ``anti_profile``.
+
+    Returns ``""`` when the payload has no (or an empty) ``anti_profile``.
+    """
+    anti = p.get("anti_profile", "")
+    if anti:
+        if is_ru:
+            header = "🚫 Что говорят твои скипы:"
+        else:
+            header = "🚫 What your skips say about you:"
+        # The profile text is escaped before being placed in the slide.
+        return header + "\n\n" + html_escape(anti)
+    return ""
+
+
+def _build_extra_slide(
+    sources: str,
+    speed: dict,
+    peak: dict,
+    is_ru: bool = True,
+) -> str:
+    """Join the sources, reaction-speed and peak-activity sections into one slide.
+
+    Args:
+        sources: Pre-rendered sources section; skipped when empty.
+        speed: Reads ``median_sec``, ``median_like``, ``median_dislike``
+            (each defaults to 0); skipped when falsy.
+        peak: Reads ``hour`` (default 0), ``label`` and ``tz`` (default "");
+            skipped when falsy.
+        is_ru: Russian text when true, English otherwise.
+
+    Returns:
+        The present sections separated by blank lines, or ``""`` if none.
+    """
+    sections = [sources] if sources else []
+
+    if speed:
+        med, ml, md = (
+            speed.get(key, 0) for key in ("median_sec", "median_like", "median_dislike")
+        )
+        sections.append(
+            f"⚡ Скорость реакции: {med} сек\n(до лайка: {ml} сек, до скипа: {md} сек)"
+            if is_ru
+            else f"⚡ Reaction speed: {med}s\n(to like: {ml}s, to skip: {md}s)"
+        )
+
+    if peak:
+        h = peak.get("hour", 0)
+        label = peak.get("label", "")
+        tz = peak.get("tz", "")
+        sections.append(
+            f"🕐 Пик активности: {h}:00 {tz}\nТы — {label}"
+            if is_ru
+            else f"🕐 Peak activity: {h}:00 {tz}\nYou're a {label}"
+        )
+
+    # Joining an empty list already gives "", matching the no-sections case.
+    return "\n\n".join(sections)
+
+
+async def _build_sources_report(user_id: int, is_ru: bool = True) -> str:
+    """Render the "top meme channels" section for a user.
+
+    The user's most liked sources are kept only when they have a public
+    ``t.me/`` or ``vk.com/`` URL. When fewer than three remain, the list is
+    topped up from the global top sources (best effort).
+
+    Args:
+        user_id: User whose liked sources are queried.
+        is_ru: Russian header when true, English otherwise.
+
+    Returns:
+        Header plus up to three source URLs, or ``""`` when none are found.
+    """
+    import logging  # local import: this block cannot see the module's import list
+
+    sources = await get_most_liked_meme_source_urls(user_id, limit=10)
+    real = [
+        s
+        for s in (sources or [])
+        if s.get("url")
+        and not s["url"].startswith("tg://user")
+        and ("t.me/" in s["url"] or "vk.com/" in s["url"])
+    ]
+    if len(real) < 3:
+        # Track URLs in a set instead of rebuilding a list on every iteration.
+        seen = {s["url"] for s in real}
+        try:
+            top = await get_top_meme_source_urls(limit=5)
+            for t in top or []:
+                url = t.get("url")
+                if url and not url.startswith("tg://user") and url not in seen:
+                    real.append(t)
+                    seen.add(url)
+                    if len(real) >= 3:
+                        break
+        except Exception:
+            # The top-up stays best effort, but a failure is now visible in
+            # the logs instead of being swallowed silently.
+            logging.getLogger(__name__).warning(
+                "Failed to top up meme sources for user %s", user_id, exc_info=True
+            )
+    if not real:
+        return ""
+    src_list = "\n".join(f"▪️ {s['url']}" for s in real[:3])
+    header = "📡 Твои топ мем-паблики:" if is_ru else "📡 Your top meme channels:"
+    return f"{header}\n\n{src_list}"
+
+
+# ── STATS SLIDE ──────────────────────────────────────────
+
+
+async def get_bot_usage_report(
+    user_id: int,
+    user_stats: dict,
+    user: dict,
+    is_ru: bool = True,
+) -> str | None:
+    """Build the opening "numbers" slide of Meme Wrapped.
+
+    Args:
+        user_id: Not used by the body; kept for the existing call signature.
+        user_stats: Counters (``nsessions``, ``nmemes_sent``, ``nlikes``,
+            ``time_spent_sec``); ``None`` yields no slide.
+        user: User row; ``created_at`` is subtracted from the current naive
+            UTC time to get the number of days with the bot.
+        is_ru: Russian text when true, English otherwise.
+
+    Returns:
+        The slide text, or ``None`` when stats are missing or the user has
+        fewer than 10 likes.
+    """
+    if user_stats is None:
+        return None
+
+    days = (datetime.datetime.utcnow() - user["created_at"]).days + 1
+    likes = user_stats.get("nlikes", 0)
+    if likes < 10:
+        return None
+
+    sessions = user_stats.get("nsessions", 0)
+    memes_sent = user_stats.get("nmemes_sent", 0)
+    time_sec = user_stats.get("time_spent_sec", 0)
+    # max(..., 1) avoids dividing by zero when nothing was sent.
+    like_rate = round(100 * likes / max(memes_sent, 1))
+
+    if is_ru:
+        intro = (
+            "📊 Meme Wrapped 2026\n\n"
+            "Начнём с цифр.\n\n"
+            f"Ты с нами уже {days} дней.\n\n"
+            f"🤝 Посмотрел {memes_sent} мемов\n"
+            f"👍 Лайкнул {likes} из них "
+            f"({like_rate}%)\n"
+            f"👋 Заходил {sessions} раз\n"
+        )
+        vibes = (
+            (50, "Лайкаешь больше половины — тебе всё смешно 😄"),
+            (30, "Лайкаешь каждый третий — у тебя есть вкус 👌"),
+            (15, "Лайкаешь каждый пятый — избирательный 🧐"),
+        )
+        snob = "Менее 15% мемов достойны — мем-сноб 🎩"
+    else:
+        intro = (
+            "📊 Meme Wrapped 2026\n\n"
+            "Let's start with the numbers.\n\n"
+            f"You've been with us for {days} days.\n\n"
+            f"🤝 Seen {memes_sent} memes\n"
+            f"👍 Liked {likes} of them "
+            f"({like_rate}%)\n"
+            f"👋 Visited {sessions} times\n"
+        )
+        vibes = (
+            (50, "You like more than half — everything's funny to you 😄"),
+            (30, "You like every third one — you've got taste 👌"),
+            (15, "You like every fifth one — picky 🧐"),
+        )
+        snob = "Less than 15% are worthy — meme snob 🎩"
+
+    pieces = [intro]
+    if time_sec > 0:
+        # Seconds, minutes+seconds, or whole hours depending on magnitude.
+        if time_sec < 60:
+            spent = f"{time_sec} сек" if is_ru else f"{time_sec}s"
+        elif time_sec < 3600:
+            spent = (
+                f"{time_sec // 60} мин {time_sec % 60} сек"
+                if is_ru
+                else f"{time_sec // 60}m {time_sec % 60}s"
+            )
+        else:
+            spent = (
+                f"больше {time_sec // 3600} часов 😳"
+                if is_ru
+                else f"over {time_sec // 3600} hours 😳"
+            )
+        pieces.append(f"🕒 В боте {spent}\n" if is_ru else f"🕒 Time in bot: {spent}\n")
+
+    # The first tier whose floor the like rate exceeds wins.
+    vibe = next((text for floor, text in vibes if like_rate > floor), snob)
+    pieces.append(f"\n{vibe}")
+    return "".join(pieces)
diff --git a/src/tgbot/handlers/upload/moderation.py b/src/tgbot/handlers/upload/moderation.py
index 6b805aca..725d1034 100644
--- a/src/tgbot/handlers/upload/moderation.py
+++ b/src/tgbot/handlers/upload/moderation.py
@@ -1,6 +1,7 @@
import asyncio
import logging
import re
+from dataclasses import dataclass
from datetime import datetime
from typing import Any
@@ -26,7 +27,11 @@
from src.stats.meme import calculate_meme_reactions_and_engagement
from src.stats.meme_source import calculate_meme_source_stats
from src.storage.constants import MemeStatus, MemeType
-from src.storage.service import find_meme_duplicate, update_meme
+from src.storage.deduplication import (
+ find_duplicate_by_ocr_text,
+ resolve_duplicate,
+)
+from src.storage.service import update_meme
from src.storage.upload import download_meme_content_from_tg
from src.tgbot.handlers.treasury.constants import TrxType
from src.tgbot.handlers.treasury.payments import pay_if_not_paid_with_alert
@@ -66,6 +71,13 @@ def _telegram_download_failure_kind(exc: BadRequest) -> str:
return "telegram_download_bad_request"
+@dataclass(frozen=True)
+class UploadAutoReviewDuplicate:
+ """Immutable description of an uploaded meme that was found to be a duplicate."""
+
+ # ID of the uploaded meme that was identified as a duplicate.
+ meme_id: int
+ # ID of the meme the upload duplicates.
+ duplicate_of: int
+ # Label of the detection path (this module uses e.g. "telegram_file_id"); used in the log line.
+ reason: str
+
+
async def _notify_uploader(
bot: Bot,
meme_upload: dict[str, Any],
@@ -98,7 +110,45 @@ async def _get_uploader_lang(user_id: int) -> str | None:
return user["interface_lang"] if user else None
-async def _check_duplicate_via_ocr(meme: dict[str, Any]) -> tuple[dict[str, Any], int | None]:
+def _stored_duplicate_result(meme: dict[str, Any]) -> UploadAutoReviewDuplicate | None:
+    """Return duplicate details when the meme row is already stored as a duplicate.
+
+    A result is produced only when ``status`` equals ``MemeStatus.DUPLICATE``
+    and ``duplicate_of`` is set; the reason is always reported as
+    ``"telegram_file_id"``. Otherwise ``None`` is returned.
+    """
+    already_duplicate = meme["status"] == MemeStatus.DUPLICATE.value
+    original_id = meme.get("duplicate_of") if already_duplicate else None
+    if original_id is None:
+        return None
+    return UploadAutoReviewDuplicate(
+        meme_id=meme["id"],
+        duplicate_of=original_id,
+        reason="telegram_file_id",
+    )
+
+
+async def _reject_duplicate_upload(
+    bot: Bot,
+    meme_upload: dict[str, Any],
+    duplicate: UploadAutoReviewDuplicate,
+    uploader_lang: str | None,
+) -> None:
+    """Auto-reject an upload that has been identified as a duplicate.
+
+    Logs the decision, records an ``"uploaded_meme"`` reaction
+    (``reaction_id=1``) from the uploader on the original meme, then sends
+    the uploader the localized ``upload.rejected_duplicate`` message.
+    """
+    logging.info(
+        "Uploaded meme %s is a %s duplicate of %s, auto-rejecting",
+        duplicate.meme_id,
+        duplicate.reason,
+        duplicate.duplicate_of,
+    )
+
+    uploader_id = meme_upload["user_id"]
+    await create_user_meme_reaction(
+        uploader_id,
+        duplicate.duplicate_of,
+        "uploaded_meme",
+        reaction_id=1,
+        reacted_at=datetime.utcnow(),
+    )
+
+    rejection_text = localizer.t("upload.rejected_duplicate", uploader_lang)
+    await _notify_uploader(bot, meme_upload, rejection_text)
+
+
+async def _deduplicate_upload_via_ocr(
+ meme: dict[str, Any],
+) -> tuple[dict[str, Any], UploadAutoReviewDuplicate | None]:
"""Describe the meme inline via OpenRouter vision and check for OCR-text duplicates.
Why: describe_memes cron is intentionally slow; for uploads we can't wait — run it synchronously
@@ -106,7 +156,7 @@ async def _check_duplicate_via_ocr(meme: dict[str, Any]) -> tuple[dict[str, Any]
Non-images skip describe (OCR is image-only).
Failures (rate limit, model errors, short text) fall through silently — manual review kicks in.
- Returns: (refreshed_meme, duplicate_of_id or None).
+ Returns: (refreshed_meme, duplicate details or None).
"""
if meme["type"] != MemeType.IMAGE:
return meme, None
@@ -133,8 +183,16 @@ async def _check_duplicate_via_ocr(meme: dict[str, Any]) -> tuple[dict[str, Any]
if len(ocr_text) < 12:
return refreshed, None
- dup_id = await find_meme_duplicate(refreshed["id"], ocr_text)
- return refreshed, dup_id
+ dup_id = await find_duplicate_by_ocr_text(refreshed["id"], ocr_text)
+ if dup_id is None:
+ return refreshed, None
+
+ resolution = await resolve_duplicate(refreshed["id"], dup_id, reason="upload_ocr_text")
+ return refreshed, UploadAutoReviewDuplicate(
+ meme_id=refreshed["id"],
+ duplicate_of=resolution.original_id,
+ reason=resolution.reason,
+ )
async def uploaded_meme_auto_review(
@@ -251,6 +309,10 @@ async def _uploaded_meme_auto_review(
bot, meme_upload, localizer.t("upload.tg_upload_failed", uploader_lang)
)
+ stored_duplicate = _stored_duplicate_result(meme)
+ if stored_duplicate is not None:
+ return await _reject_duplicate_upload(bot, meme_upload, stored_duplicate, uploader_lang)
+
logging.info(f"Updating meme {meme['id']} status to WAITING_REVIEW")
meme = await update_meme(
meme["id"],
@@ -258,25 +320,9 @@ async def _uploaded_meme_auto_review(
)
# Inline OCR + trigram dedup. Auto-reject on duplicate, else fall through to manual review.
- meme, duplicate_of = await _check_duplicate_via_ocr(meme)
- if duplicate_of is not None:
- logging.info(f"Meme {meme['id']} is a duplicate of {duplicate_of}, auto-rejecting")
- await update_meme(
- meme["id"],
- status=MemeStatus.DUPLICATE,
- duplicate_of=duplicate_of,
- )
- # Credit the uploader with a like on the original, so it counts as engagement
- await create_user_meme_reaction(
- meme_upload["user_id"],
- duplicate_of,
- "uploaded_meme",
- reaction_id=1,
- reacted_at=datetime.utcnow(),
- )
- return await _notify_uploader(
- bot, meme_upload, localizer.t("upload.rejected_duplicate", uploader_lang)
- )
+ meme, ocr_duplicate = await _deduplicate_upload_via_ocr(meme)
+ if ocr_duplicate is not None:
+ return await _reject_duplicate_upload(bot, meme_upload, ocr_duplicate, uploader_lang)
return await send_uploaded_meme_to_manual_review(meme, meme_upload, bot)
diff --git a/tests/factories.py b/tests/factories.py
index e43e714c..f63f63f8 100644
--- a/tests/factories.py
+++ b/tests/factories.py
@@ -5,6 +5,7 @@
from sqlalchemy.ext.asyncio import AsyncConnection
from src.database import (
+ chat_meme_reaction,
meme,
meme_source,
meme_source_candidate,
@@ -13,6 +14,7 @@
meme_source_stats,
meme_stats,
user,
+ user_deep_link_log,
user_language,
user_meme_reaction,
user_meme_source_stats,
@@ -109,6 +111,7 @@ async def create_meme_stats(
ndislikes: int = 5,
nmemes_sent: int = 20,
lr_smoothed: float = 0.5,
+ engagement_score: float = 0.0,
age_days: int = 30,
raw_impr_rank: int = 0,
sec_to_react: float = 7.0,
@@ -120,13 +123,21 @@ async def create_meme_stats(
"ndislikes": ndislikes,
"nmemes_sent": nmemes_sent,
"lr_smoothed": lr_smoothed,
+ "engagement_score": engagement_score,
"age_days": age_days,
"raw_impr_rank": raw_impr_rank,
"sec_to_react": sec_to_react,
"invited_count": invited_count,
"updated_at": FIXED_DT,
}
- await conn.execute(insert(meme_stats).values(row).on_conflict_do_nothing())
+ await conn.execute(
+ insert(meme_stats)
+ .values(row)
+ .on_conflict_do_update(
+ index_elements=(meme_stats.c.meme_id,),
+ set_={key: value for key, value in row.items() if key != "meme_id"},
+ )
+ )
return row
@@ -228,6 +239,9 @@ async def cleanup_test_data(conn: AsyncConnection) -> None:
delete(meme_source_candidate).where(meme_source_candidate.c.id >= TEST_ID_START)
)
await conn.execute(delete(meme_stats).where(meme_stats.c.meme_id >= TEST_ID_START))
+ await conn.execute(
+ delete(user_deep_link_log).where(user_deep_link_log.c.user_id >= TEST_ID_START)
+ )
await conn.execute(
delete(meme_source_stats).where(meme_source_stats.c.meme_source_id >= TEST_ID_START)
)
@@ -237,6 +251,9 @@ async def cleanup_test_data(conn: AsyncConnection) -> None:
await conn.execute(
delete(user_meme_reaction).where(user_meme_reaction.c.user_id >= TEST_ID_START)
)
+ await conn.execute(
+ delete(chat_meme_reaction).where(chat_meme_reaction.c.user_id >= TEST_ID_START)
+ )
await conn.execute(delete(user_language).where(user_language.c.user_id >= TEST_ID_START))
await conn.execute(delete(user_stats).where(user_stats.c.user_id >= TEST_ID_START))
await conn.execute(delete(meme).where(meme.c.id >= TEST_ID_START))
diff --git a/tests/flows/storage/test_final_meme_pipeline.py b/tests/flows/storage/test_final_meme_pipeline.py
new file mode 100644
index 00000000..9d2a2861
--- /dev/null
+++ b/tests/flows/storage/test_final_meme_pipeline.py
@@ -0,0 +1,49 @@
+from unittest.mock import AsyncMock
+
+import pytest
+
+from src.flows.storage import memes
+from src.storage.deduplication import DeduplicationResult
+
+
+@pytest.mark.asyncio
+async def test_final_meme_pipeline_deduplicates_batch_before_ok_promotion(monkeypatch):
+ """Check the call order in final_meme_pipeline: dedup, then promotion, then sweep.
+
+ The fakes patched onto ``memes`` append markers to ``calls`` so their
+ relative order can be asserted at the end.
+ """
+ calls = []
+
+ class FakeLogger:
+ def info(self, *args):
+ calls.append(("log_info", args))
+
+ pending_memes = [
+ {"id": 10001, "caption": None},
+ {"id": 10002, "caption": None},
+ ]
+
+ async def fake_analyse(meme):
+ calls.append(("analyse", meme["id"]))
+
+ async def fake_deduplicate(meme):
+ calls.append(("dedup", meme["id"]))
+ return DeduplicationResult(meme["id"])
+
+ async def fake_update_ready(meme_ids):
+ calls.append(("promote", meme_ids))
+ return [{"id": meme_ids[0]}]
+
+ async def fake_sweep():
+ calls.append(("sweep",))
+ return {"resolved": 0, "reactions_moved": 0, "reactions_dropped": 0}
+
+ monkeypatch.setattr(memes, "get_run_logger", lambda: FakeLogger())
+ monkeypatch.setattr(memes, "get_pending_memes", AsyncMock(return_value=pending_memes))
+ monkeypatch.setattr(memes, "analyse_meme_caption", fake_analyse)
+ monkeypatch.setattr(memes, "deduplicate_pending_meme", fake_deduplicate)
+ monkeypatch.setattr(memes, "update_meme_status_of_ready_memes", fake_update_ready)
+ monkeypatch.setattr(memes, "sweep_file_id_duplicates", fake_sweep)
+ monkeypatch.setattr(memes, "safe_emit", lambda *args, **kwargs: calls.append(("emit", args)))
+
+ # ``.fn`` calls the undecorated function behind the flow object.
+ # NOTE(review): presumably a Prefect flow — confirm.
+ await memes.final_meme_pipeline.fn()
+
+ # Both memes must be deduplicated before the batch is promoted, and the sweep runs last.
+ assert calls.index(("dedup", 10001)) < calls.index(("promote", [10001, 10002]))
+ assert calls.index(("dedup", 10002)) < calls.index(("promote", [10001, 10002]))
+ assert calls.index(("promote", [10001, 10002])) < calls.index(("sweep",))
diff --git a/tests/recommendations/test_low_sent_pool.py b/tests/recommendations/test_low_sent_pool.py
new file mode 100644
index 00000000..4968c37c
--- /dev/null
+++ b/tests/recommendations/test_low_sent_pool.py
@@ -0,0 +1,69 @@
+import pytest
+import pytest_asyncio
+from sqlalchemy import text
+from sqlalchemy.ext.asyncio import AsyncConnection
+from tests.factories import (
+ cleanup_test_data,
+ create_meme,
+ create_meme_source,
+ create_meme_stats,
+ create_user,
+ create_user_language,
+)
+
+from src.database import engine
+from src.recommendations.pipeline import _low_sent_query
+
+# IDs of the user and meme source created by the test in this module.
+# NOTE(review): presumably chosen to fall in the range cleanup_test_data deletes — confirm.
+LOW_SENT_USER_ID = 10020
+LOW_SENT_SOURCE_ID = 10020
+
+
+@pytest_asyncio.fixture()
+async def conn():
+ """Yield an engine connection, calling cleanup_test_data before and after the test."""
+ async with engine.connect() as conn:
+ await cleanup_test_data(conn)
+ yield conn
+ await cleanup_test_data(conn)
+
+
+async def _create_low_sent_meme(
+    conn: AsyncConnection,
+    meme_id: int,
+    *,
+    nlikes: int,
+    ndislikes: int,
+    nmemes_sent: int,
+) -> None:
+    """Create a meme under LOW_SENT_SOURCE_ID together with its stats row."""
+    counters = {
+        "nlikes": nlikes,
+        "ndislikes": ndislikes,
+        "nmemes_sent": nmemes_sent,
+    }
+    await create_meme(conn, id=meme_id, meme_source_id=LOW_SENT_SOURCE_ID)
+    await create_meme_stats(conn, meme_id=meme_id, **counters)
+
+
+@pytest.mark.asyncio
+async def test_low_sent_pool_prioritizes_unreacted_memes_and_filters_failed_memes(
+ conn: AsyncConnection,
+) -> None:
+ """Seed five memes with different counters and check ordering and filtering of _low_sent_query."""
+ await create_user(conn, id=LOW_SENT_USER_ID, type="moderator")
+ await create_user_language(conn, user_id=LOW_SENT_USER_ID)
+ await create_meme_source(conn, id=LOW_SENT_SOURCE_ID)
+ await _create_low_sent_meme(conn, 10021, nlikes=0, ndislikes=0, nmemes_sent=0)
+ await _create_low_sent_meme(conn, 10022, nlikes=1, ndislikes=0, nmemes_sent=1)
+ await _create_low_sent_meme(conn, 10023, nlikes=0, ndislikes=9, nmemes_sent=9)
+ await _create_low_sent_meme(conn, 10024, nlikes=0, ndislikes=10, nmemes_sent=10)
+ await _create_low_sent_meme(conn, 10025, nlikes=2, ndislikes=8, nmemes_sent=10)
+ await conn.commit()
+
+ rows = await conn.execute(
+ text(_low_sent_query([])),
+ {"user_id": LOW_SENT_USER_ID, "limit": 10},
+ )
+ ids = [row.id for row in rows]
+
+ # The memes with the fewest sends come first, in this exact order.
+ assert ids[:3] == [10021, 10022, 10023]
+ # 10024 (0 likes out of 10 sends) must be filtered out; 10025 (2 likes out of 10) must stay.
+ assert 10024 not in ids
+ assert 10025 in ids
diff --git a/tests/recommendations/test_meme_queue.py b/tests/recommendations/test_meme_queue.py
index 1c2a5deb..5c817bee 100644
--- a/tests/recommendations/test_meme_queue.py
+++ b/tests/recommendations/test_meme_queue.py
@@ -8,7 +8,7 @@
MATURE_BLENDER_TREATMENT_WEIGHTS,
)
from src.recommendations.candidates import CandidatesRetriever
-from src.recommendations.meme_queue import generate_recommendations
+from src.recommendations.meme_queue import generate_recommendations, get_next_meme_for_user
TEST_USER_ID = 99999
@@ -53,6 +53,47 @@ def mock_redis():
# ── Cold start Phase 1 (nmemes_sent < 6): cold_start_explore ──
+@pytest.mark.asyncio
+async def test_get_next_meme_for_user_skips_stale_queue_payloads():
+ """get_next_meme_for_user must skip a queued payload that is not sendable.
+
+ The queue pop and ``_queued_meme_is_sendable`` are patched: payload 101 is
+ reported as not sendable, payload 102 as sendable.
+ """
+ queued_payloads = [
+ {
+ "id": 101,
+ "type": "image",
+ "telegram_file_id": "stale-file-id",
+ "caption": None,
+ },
+ {
+ "id": 102,
+ "type": "image",
+ "telegram_file_id": "fresh-file-id",
+ "caption": None,
+ },
+ ]
+
+ # Pops payloads in order and returns None once the fake queue is empty.
+ async def pop_queue(_queue_key):
+ return queued_payloads.pop(0) if queued_payloads else None
+
+ async def is_sendable(_user_id: int, meme_id: int) -> bool:
+ return meme_id == 102
+
+ with (
+ patch(
+ "src.recommendations.meme_queue.redis.pop_meme_from_queue_by_key",
+ new_callable=AsyncMock,
+ side_effect=pop_queue,
+ ),
+ patch(
+ "src.recommendations.meme_queue._queued_meme_is_sendable",
+ new_callable=AsyncMock,
+ side_effect=is_sendable,
+ ),
+ ):
+ meme = await get_next_meme_for_user(TEST_USER_ID)
+
+ assert meme is not None
+ assert meme.id == 102
+
+
@pytest.mark.asyncio
async def test_cold_start_phase1_uses_explore():
"""Phase 1 (<6 memes): uses cold_start_explore engine"""
diff --git a/tests/recommendations/test_queue_correctness.py b/tests/recommendations/test_queue_correctness.py
index f079f492..d5587a50 100644
--- a/tests/recommendations/test_queue_correctness.py
+++ b/tests/recommendations/test_queue_correctness.py
@@ -16,7 +16,8 @@
from src import redis
from src.database import engine
from src.recommendations.candidates import CandidatesRetriever
-from src.recommendations.meme_queue import generate_recommendations
+from src.recommendations.meme_queue import generate_recommendations, get_next_meme_for_user
+from src.storage.constants import MemeStatus
# IDs for queue tests
QUEUE_USER = 10010
@@ -228,6 +229,34 @@ async def test_queue_memes_have_required_fields(queue_user):
assert "nlikes" in c
+@pytest.mark.asyncio
+async def test_get_next_meme_skips_stale_duplicate_payload(queue_user):
+ """A queued payload whose meme row has DUPLICATE status must be skipped.
+
+ Meme 10020 is stored as DUPLICATE and 10021 with the factory default status;
+ both are queued, 10021 must be returned and the queue must end up empty.
+ """
+ async with engine.connect() as conn:
+ await create_meme(
+ conn,
+ id=10020,
+ meme_source_id=10010,
+ status=MemeStatus.DUPLICATE.value,
+ )
+ await create_meme(conn, id=10021, meme_source_id=10010)
+ await conn.commit()
+
+ queue_key = redis.get_meme_queue_key(QUEUE_USER)
+ await redis.add_memes_to_queue_by_key(
+ queue_key,
+ [
+ _meme(10020, "lr_smoothed"),
+ _meme(10021, "lr_smoothed"),
+ ],
+ )
+
+ next_meme = await get_next_meme_for_user(QUEUE_USER)
+
+ assert next_meme is not None
+ assert next_meme.id == 10021
+ assert await redis.get_all_memes_in_queue_by_key(queue_key) == []
+
+
@pytest.mark.asyncio
async def test_generate_excludes_already_queued(queue_user):
"""Second generate should not duplicate memes already in queue."""
diff --git a/tests/scripts/test_agent_doctor.py b/tests/scripts/test_agent_doctor.py
index a1cda10f..2df4cf94 100644
--- a/tests/scripts/test_agent_doctor.py
+++ b/tests/scripts/test_agent_doctor.py
@@ -40,6 +40,21 @@ def test_real_describe_memes_models_are_free() -> None:
assert result.name == "describe_memes:free_models"
+def test_describe_memes_models_can_live_in_openrouter_client(tmp_path: Path) -> None:
+    """The model check passes when VISION_MODELS lives only in openrouter_vision.py."""
+    storage_dir = tmp_path.joinpath("src", "flows", "storage")
+    storage_dir.mkdir(parents=True)
+    file_contents = {
+        "describe_memes.py": "# orchestration only\n",
+        "openrouter_vision.py": 'VISION_MODELS = ["qwen/qwen2.5-vl-72b-instruct:free"]\n',
+    }
+    for filename, content in file_contents.items():
+        (storage_dir / filename).write_text(content, encoding="utf-8")
+
+    outcome = doctor.check_describe_memes_models(tmp_path)
+
+    assert outcome.ok is True
+    assert "openrouter_vision.py" in outcome.detail
+
+
def test_paperclip_access_adapter_accepts_repo_local_wrapper(tmp_path: Path) -> None:
skill = tmp_path / ".codex" / "skills" / "paperclip"
tools = tmp_path / ".codex" / "paperclip-tools"
diff --git a/tests/scripts/test_eval_crossposting_ml.py b/tests/scripts/test_eval_crossposting_ml.py
new file mode 100644
index 00000000..cfec4ea0
--- /dev/null
+++ b/tests/scripts/test_eval_crossposting_ml.py
@@ -0,0 +1,14 @@
+import pytest
+from scripts.eval_crossposting_ml import top_quintile_lift
+
+
+def test_top_quintile_lift_is_neutral_when_scores_are_tied():
+    """With all scores equal, the expected lift is exactly 1.0."""
+    tied_scores = [0, 0, 0, 0, 0]
+    labels = [1, 0, 1, 0, 0]
+
+    lift = top_quintile_lift(tied_scores, labels)
+
+    assert lift == 1.0
+
+
+def test_top_quintile_lift_handles_boundary_ties_without_label_leakage():
+    """Three scores tie at the top-quintile boundary; the expected lift is 10 / 3."""
+    scores = [10, 9, 9, 9, 1, 0, 0, 0, 0, 0]
+    labels = [1, 0, 0, 1, 0, 0, 0, 0, 0, 0]
+
+    lift = top_quintile_lift(scores, labels)
+
+    assert lift == pytest.approx(10 / 3)
diff --git a/tests/storage/test_deduplication.py b/tests/storage/test_deduplication.py
new file mode 100644
index 00000000..7bf5e2d0
--- /dev/null
+++ b/tests/storage/test_deduplication.py
@@ -0,0 +1,373 @@
+from datetime import datetime, timedelta
+
+import pytest
+import pytest_asyncio
+from sqlalchemy import insert, select
+
+from src.database import chat_meme_reaction, engine, meme, meme_stats, user_meme_reaction
+from src.storage.constants import MemeStatus
+from src.storage.deduplication import (
+ deduplicate_described_meme,
+ deduplicate_pending_meme,
+ find_duplicate_by_file_id,
+ find_duplicate_by_ocr_text,
+ resolve_duplicate,
+ sweep_file_id_duplicates,
+)
+from tests.factories import (
+ cleanup_test_data,
+ create_meme,
+ create_meme_source,
+ create_meme_stats,
+ create_reaction,
+ create_user,
+)
+
+
+@pytest_asyncio.fixture()
+async def dedup_setup():
+ """Create meme source 10001 and users 10001-10007 before a test; call cleanup_test_data after it."""
+ async with engine.connect() as conn:
+ await create_meme_source(conn, id=10001)
+ for user_id in range(10001, 10008):
+ await create_user(conn, id=user_id)
+ await conn.commit()
+
+ yield
+
+ async with engine.connect() as conn:
+ await cleanup_test_data(conn)
+
+
+async def _row(table, **where):
+    """Fetch the first row of ``table`` matching every ``column=value`` filter.
+
+    Returns the row as a dict, or ``None`` when nothing matches.
+    """
+    stmt = select(table)
+    for column_name, expected in where.items():
+        stmt = stmt.where(getattr(table.c, column_name) == expected)
+
+    async with engine.connect() as connection:
+        first = (await connection.execute(stmt)).first()
+
+    if not first:
+        return None
+    return first._asdict()
+
+
+@pytest.mark.asyncio
+async def test_find_duplicate_by_file_id_uses_older_ok_or_created_memes(dedup_setup):
+ """Two memes share one telegram_file_id: 10001 (OK) and 10002 (CREATED).
+
+ The lookup for 10002 must return 10001; the lookup for 10001 must return None.
+ """
+ async with engine.connect() as conn:
+ await create_meme(
+ conn,
+ id=10001,
+ meme_source_id=10001,
+ status=MemeStatus.OK.value,
+ telegram_file_id="same-file-id",
+ )
+ await create_meme(
+ conn,
+ id=10002,
+ meme_source_id=10001,
+ status=MemeStatus.CREATED.value,
+ telegram_file_id="same-file-id",
+ )
+ await conn.commit()
+
+ assert await find_duplicate_by_file_id(10002, "same-file-id") == 10001
+ assert await find_duplicate_by_file_id(10001, "same-file-id") is None
+
+
+@pytest.mark.asyncio
+async def test_find_duplicate_by_file_id_prefers_published_original(dedup_setup):
+ """With an OK meme (10001) and a PUBLISHED meme (10002) sharing a file id, 10002 is expected."""
+ async with engine.connect() as conn:
+ await create_meme(
+ conn,
+ id=10001,
+ meme_source_id=10001,
+ status=MemeStatus.OK.value,
+ telegram_file_id="same-file-id",
+ )
+ await create_meme(
+ conn,
+ id=10002,
+ meme_source_id=10001,
+ status=MemeStatus.PUBLISHED.value,
+ telegram_file_id="same-file-id",
+ )
+ await conn.commit()
+
+ # 10003 is not created in this test; it is only passed as the ID of the meme being checked.
+ assert await find_duplicate_by_file_id(10003, "same-file-id") == 10002
+
+
+@pytest.mark.asyncio
+async def test_find_duplicate_by_ocr_text_skips_short_text(dedup_setup):
+    """A short OCR text ("too short") must not produce a duplicate match."""
+    found = await find_duplicate_by_ocr_text(10001, "too short")
+
+    assert found is None
+
+
+@pytest.mark.asyncio
+async def test_resolve_duplicate_moves_reactions_and_refreshes_stats(dedup_setup):
+ """Resolving 10002 as a duplicate of 10001 must move reactions and rebuild the original's stats.
+
+ The original starts with deliberately wrong stats (lr_smoothed=99,
+ engagement_score=99) so the test can tell they were recomputed.
+ """
+ async with engine.connect() as conn:
+ await create_meme(conn, id=10001, meme_source_id=10001)
+ await create_meme(conn, id=10002, meme_source_id=10001)
+ await create_meme_stats(
+ conn,
+ meme_id=10001,
+ nlikes=1,
+ ndislikes=1,
+ nmemes_sent=2,
+ lr_smoothed=99,
+ engagement_score=99,
+ )
+ await create_meme_stats(conn, meme_id=10002, nlikes=2, ndislikes=1, nmemes_sent=3)
+ # User 10002 reacted to both memes; users 10003 and 10004 reacted only to the duplicate.
+ await create_reaction(conn, user_id=10001, meme_id=10001, reaction_id=1)
+ await create_reaction(conn, user_id=10002, meme_id=10001, reaction_id=2)
+ await create_reaction(conn, user_id=10002, meme_id=10002, reaction_id=1)
+ await create_reaction(conn, user_id=10003, meme_id=10002, reaction_id=1)
+ await create_reaction(conn, user_id=10004, meme_id=10002, reaction_id=2)
+ await conn.commit()
+
+ result = await resolve_duplicate(10002, 10001, reason="test")
+
+ assert result.reactions_moved == 2
+ assert result.reactions_dropped == 3
+
+ original_stats = await _row(meme_stats, meme_id=10001)
+ assert original_stats["nlikes"] == 2
+ assert original_stats["ndislikes"] == 2
+ assert original_stats["nmemes_sent"] == 4
+ assert original_stats["lr_smoothed"] == 0
+ assert original_stats["engagement_score"] == 0
+
+ # The duplicate is marked, its stats row is gone and no reactions remain on it.
+ dupe = await _row(meme, id=10002)
+ assert dupe["status"] == MemeStatus.DUPLICATE.value
+ assert dupe["duplicate_of"] == 10001
+ assert await _row(meme_stats, meme_id=10002) is None
+
+ async with engine.connect() as conn:
+ reaction_rows = await conn.execute(
+ select(user_meme_reaction).where(user_meme_reaction.c.meme_id == 10002)
+ )
+ assert reaction_rows.all() == []
+
+
+@pytest.mark.asyncio
+async def test_resolve_duplicate_recomputes_derived_original_stats(dedup_setup):
+ """After resolve_duplicate, the original's lr_smoothed and engagement_score must be recomputed.
+
+ Three users each dislike nine other memes and then like the duplicate; the
+ original starts with placeholder values of -9.0 for both derived stats.
+ """
+ base_sent_at = datetime(2024, 1, 1, 12, 0, 0)
+ affected_users = [10001, 10002, 10003]
+
+ async with engine.connect() as conn:
+ await create_meme(conn, id=10001, meme_source_id=10001)
+ await create_meme(conn, id=10002, meme_source_id=10001)
+ await create_meme_stats(conn, meme_id=10001, lr_smoothed=-9.0)
+ await conn.execute(
+ meme_stats.update().where(meme_stats.c.meme_id == 10001).values(engagement_score=-9.0)
+ )
+
+ # Nine earlier memes, each disliked by every affected user one minute apart.
+ for index in range(9):
+ other_meme_id = 10100 + index
+ sent_at = base_sent_at + timedelta(minutes=index)
+ await create_meme(conn, id=other_meme_id, meme_source_id=10001)
+ for user_id in affected_users:
+ await create_reaction(
+ conn,
+ user_id=user_id,
+ meme_id=other_meme_id,
+ reaction_id=2,
+ sent_at=sent_at,
+ reacted_at=sent_at + timedelta(seconds=5),
+ )
+
+ # The duplicate is sent last and liked by every affected user.
+ target_sent_at = base_sent_at + timedelta(minutes=9)
+ for user_id in affected_users:
+ await create_reaction(
+ conn,
+ user_id=user_id,
+ meme_id=10002,
+ reaction_id=1,
+ sent_at=target_sent_at,
+ reacted_at=target_sent_at + timedelta(seconds=5),
+ )
+ await conn.commit()
+
+ await resolve_duplicate(10002, 10001, reason="test")
+
+ original_stats = await _row(meme_stats, meme_id=10001)
+ assert original_stats["nlikes"] == 3
+ assert original_stats["nmemes_sent"] == 3
+ assert original_stats["lr_smoothed"] == pytest.approx(1.8)
+ assert original_stats["engagement_score"] == pytest.approx(1.8)
+
+
+@pytest.mark.asyncio
+async def test_resolve_duplicate_reparents_existing_duplicate_children(dedup_setup):
+ """A meme already pointing at 10002 must point at 10001 once 10002 itself becomes a duplicate."""
+ async with engine.connect() as conn:
+ await create_meme(conn, id=10001, meme_source_id=10001)
+ await create_meme(conn, id=10002, meme_source_id=10001)
+ await create_meme(conn, id=10003, meme_source_id=10001, status=MemeStatus.DUPLICATE.value)
+ # Set duplicate_of with a direct UPDATE: 10003 starts as a child of 10002.
+ await conn.execute(meme.update().where(meme.c.id == 10003).values(duplicate_of=10002))
+ await create_reaction(conn, user_id=10001, meme_id=10002, reaction_id=1)
+ await conn.commit()
+
+ await resolve_duplicate(10002, 10001, reason="test")
+
+ dupe = await _row(meme, id=10002)
+ child = await _row(meme, id=10003)
+ assert dupe["duplicate_of"] == 10001
+ assert child["duplicate_of"] == 10001
+
+
+@pytest.mark.asyncio
+async def test_resolve_duplicate_moves_chat_reactions(dedup_setup):
+ """chat_meme_reaction rows of the duplicate must end up on the original, with none left behind."""
+ async with engine.connect() as conn:
+ await create_meme(conn, id=10001, meme_source_id=10001)
+ await create_meme(conn, id=10002, meme_source_id=10001)
+ # User 10001 reacted to both memes in chat 1; user 10002 reacted only to the duplicate.
+ await conn.execute(
+ insert(chat_meme_reaction),
+ [
+ {"chat_id": 1, "meme_id": 10001, "user_id": 10001, "reaction": 1},
+ {"chat_id": 1, "meme_id": 10002, "user_id": 10001, "reaction": 2},
+ {"chat_id": 1, "meme_id": 10002, "user_id": 10002, "reaction": 1},
+ ],
+ )
+ await conn.commit()
+
+ result = await resolve_duplicate(10002, 10001, reason="test")
+
+ assert result.chat_reactions_moved == 1
+ assert result.chat_reactions_dropped == 2
+
+ async with engine.connect() as conn:
+ original_rows = await conn.execute(
+ select(chat_meme_reaction)
+ .where(chat_meme_reaction.c.meme_id == 10001)
+ .order_by(chat_meme_reaction.c.user_id)
+ )
+ dupe_rows = await conn.execute(
+ select(chat_meme_reaction).where(chat_meme_reaction.c.meme_id == 10002)
+ )
+
+ assert [row._asdict()["user_id"] for row in original_rows.all()] == [10001, 10002]
+ assert dupe_rows.all() == []
+
+
+@pytest.mark.asyncio
+async def test_deduplicate_pending_meme_resolves_file_id_before_ok_promotion(dedup_setup):
+ """A CREATED meme sharing a telegram_file_id with an OK meme must be resolved as its duplicate."""
+ async with engine.connect() as conn:
+ await create_meme(
+ conn,
+ id=10001,
+ meme_source_id=10001,
+ status=MemeStatus.OK.value,
+ telegram_file_id="same-file-id",
+ )
+ # The value returned by the factory is passed straight to deduplicate_pending_meme.
+ pending = await create_meme(
+ conn,
+ id=10002,
+ meme_source_id=10001,
+ status=MemeStatus.CREATED.value,
+ telegram_file_id="same-file-id",
+ )
+ await conn.commit()
+
+ result = await deduplicate_pending_meme(pending)
+
+ assert result.duplicate_found is True
+ assert result.duplicate_of == 10001
+ assert result.reason == "telegram_file_id"
+ dupe = await _row(meme, id=10002)
+ assert dupe["status"] == MemeStatus.DUPLICATE.value
+
+
+@pytest.mark.asyncio
+async def test_deduplicate_described_meme_resolves_only_ok_memes(dedup_setup):
+ """The same OCR text resolves a duplicate for an OK meme but not for a WAITING_REVIEW meme."""
+ async with engine.connect() as conn:
+ # 10001 carries the stored OCR text that the other two memes are compared with.
+ await create_meme(
+ conn,
+ id=10001,
+ meme_source_id=10001,
+ ocr_result={"text": "same visible meme text", "calculated_at": "2026-05-20T00:00:00Z"},
+ )
+ await create_meme(conn, id=10002, meme_source_id=10001, status=MemeStatus.OK.value)
+ await create_meme(
+ conn,
+ id=10003,
+ meme_source_id=10001,
+ status=MemeStatus.WAITING_REVIEW.value,
+ )
+ await conn.commit()
+
+ ok_result = await deduplicate_described_meme(
+ 10002,
+ "same visible meme text",
+ status=MemeStatus.OK.value,
+ )
+ review_result = await deduplicate_described_meme(
+ 10003,
+ "same visible meme text",
+ status=MemeStatus.WAITING_REVIEW.value,
+ )
+
+ assert ok_result.duplicate_found is True
+ assert ok_result.duplicate_of == 10001
+ assert review_result.duplicate_found is False
+ # The meme under review keeps its status.
+ review_meme = await _row(meme, id=10003)
+ assert review_meme["status"] == MemeStatus.WAITING_REVIEW.value
+
+
+@pytest.mark.asyncio
+async def test_sweep_file_id_duplicates_resolves_ok_exact_duplicates(dedup_setup):
+ """Two memes with the same telegram_file_id: the sweep must resolve 10002 into 10001.
+
+ Both memes are created with the factory's default status.
+ """
+ async with engine.connect() as conn:
+ await create_meme(
+ conn,
+ id=10001,
+ meme_source_id=10001,
+ telegram_file_id="same-file-id",
+ )
+ await create_meme(
+ conn,
+ id=10002,
+ meme_source_id=10001,
+ telegram_file_id="same-file-id",
+ )
+ await create_reaction(conn, user_id=10001, meme_id=10001, reaction_id=1)
+ await create_reaction(conn, user_id=10002, meme_id=10002, reaction_id=2)
+ await conn.commit()
+
+ result = await sweep_file_id_duplicates()
+
+ assert result["resolved"] == 1
+ assert result["reactions_moved"] == 1
+
+ # The original's counters now include the reaction moved from the duplicate.
+ original_stats = await _row(meme_stats, meme_id=10001)
+ assert original_stats["nlikes"] == 1
+ assert original_stats["ndislikes"] == 1
+ assert original_stats["nmemes_sent"] == 2
+
+ dupe = await _row(meme, id=10002)
+ assert dupe["status"] == MemeStatus.DUPLICATE.value
+ assert dupe["duplicate_of"] == 10001
+
+
+@pytest.mark.asyncio
+async def test_sweep_file_id_duplicates_resolves_ok_meme_to_published_original(dedup_setup):
+ """When a PUBLISHED meme shares a file id with another meme, the PUBLISHED one stays the original.
+
+ Here the lower ID (10001) becomes the duplicate of the PUBLISHED meme 10002.
+ """
+ async with engine.connect() as conn:
+ await create_meme(
+ conn,
+ id=10001,
+ meme_source_id=10001,
+ telegram_file_id="same-file-id",
+ )
+ await create_meme(
+ conn,
+ id=10002,
+ meme_source_id=10001,
+ status=MemeStatus.PUBLISHED.value,
+ telegram_file_id="same-file-id",
+ )
+ await create_reaction(conn, user_id=10001, meme_id=10001, reaction_id=1)
+ await conn.commit()
+
+ result = await sweep_file_id_duplicates()
+
+ assert result["resolved"] == 1
+ dupe = await _row(meme, id=10001)
+ assert dupe["status"] == MemeStatus.DUPLICATE.value
+ assert dupe["duplicate_of"] == 10002
+
+ # The like that was on 10001 is now counted on the published meme.
+ published_stats = await _row(meme_stats, meme_id=10002)
+ assert published_stats["nlikes"] == 1
+ assert published_stats["nmemes_sent"] == 1
diff --git a/tests/test_crossposting_meme.py b/tests/test_crossposting_meme.py
index 9d3258b2..fbc11415 100644
--- a/tests/test_crossposting_meme.py
+++ b/tests/test_crossposting_meme.py
@@ -442,6 +442,31 @@ async def test_ru_share_max_picker_boosts_prior_inbot_shares(clean_xpost):
assert top_candidate["share_max_score"] > top_candidate["share_max_base_score"]
+@pytest.mark.asyncio
+async def test_ru_share_max_picker_keeps_cold_sources_in_pool(clean_xpost):  # Both memes must stay in the candidate pool; only 10341 has a non-default invited_count.
+ async with engine.connect() as conn:
+ await create_meme_source(conn, id=10330, language_code="ru")
+ await create_meme_source(conn, id=10340, language_code="ru")
+ await create_meme(  # one meme per source, otherwise identical attributes
+ conn, id=10331, meme_source_id=10330, language_code="ru", type="image", status="ok"
+ )
+ await create_meme(
+ conn, id=10341, meme_source_id=10340, language_code="ru", type="image", status="ok"
+ )
+ await create_meme_stats(conn, meme_id=10331, nlikes=10, ndislikes=2)  # no invited_count passed for this meme
+ await create_meme_stats(conn, meme_id=10341, nlikes=10, ndislikes=2, invited_count=5)  # same likes/dislikes, plus invited_count=5
+ await conn.commit()  # persist the fixtures before the picker runs
+
+ picked, decision = await get_next_share_max_meme_for_tgchannelru()
+ assert picked is not None
+ assert picked["id"] == 10341  # the meme with invited_count=5 is expected to win
+ assert decision is not None
+ assert decision["pool_size"] == 2  # the other meme was not filtered out of the pool
+ candidate_ids = {c["meme_id"] for c in decision["candidates"]}
+ assert candidate_ids == {10331, 10341}  # both memes appear in the logged candidates
+ assert all(c["share_source_base"] == 1.0 for c in decision["candidates"])  # every candidate carries the same source base value of 1.0
+
+
@pytest.mark.asyncio
async def test_en_share_max_picker_logs_but_does_not_boost_prior_shares(clean_xpost):
async with engine.connect() as conn:
diff --git a/tests/tgbot/test_upload_moderation.py b/tests/tgbot/test_upload_moderation.py
index a972bd2f..3c6e75be 100644
--- a/tests/tgbot/test_upload_moderation.py
+++ b/tests/tgbot/test_upload_moderation.py
@@ -1,5 +1,5 @@
from types import SimpleNamespace
-from unittest.mock import AsyncMock, patch
+from unittest.mock import ANY, AsyncMock, patch
import pytest
@@ -126,3 +126,99 @@ async def test_upload_review_chat_member_can_reject_without_moderator_user_type(
notify.assert_awaited_once()
assert "выбран не тот язык" in notify.await_args.args[2]
get_user_info.assert_awaited_once_with(7)
+
+
+@pytest.mark.asyncio
+async def test_auto_review_does_not_revive_exact_file_id_duplicate(monkeypatch):  # Auto review must not update or re-review a meme that comes back from upload already marked DUPLICATE. NOTE(review): `monkeypatch` is never referenced in this body.
+ meme = {  # the uploaded meme as handed to auto review
+ "id": 10002,
+ "type": moderation.MemeType.IMAGE,
+ "telegram_file_id": "uploaded-file-id",
+ }
+ meme_upload = {"id": 42, "user_id": 10001, "message_id": 777}  # user_id 10001 reappears in the reaction assertion below
+ stored_duplicate = {  # same meme, but flagged as a duplicate of meme 10000
+ **meme,
+ "status": moderation.MemeStatus.DUPLICATE.value,
+ "duplicate_of": 10000,
+ }
+ bot = AsyncMock()
+
+ with (  # every collaborator on `moderation` is replaced with an AsyncMock
+ patch.object(moderation, "_get_uploader_lang", new=AsyncMock(return_value="ru")),
+ patch.object(
+ moderation,
+ "download_meme_content_from_tg",
+ new=AsyncMock(return_value=b"image"),
+ ),
+ patch.object(
+ moderation,
+ "add_watermark_to_meme_content",
+ new=AsyncMock(return_value=b"watermarked"),
+ ),
+ patch.object(
+ moderation,
+ "upload_meme_content_to_tg",
+ new=AsyncMock(return_value=stored_duplicate),  # the upload step hands back the DUPLICATE-flagged dict
+ ),
+ patch.object(moderation, "update_meme", new=AsyncMock()) as update_meme,
+ patch.object(
+ moderation,
+ "create_user_meme_reaction",
+ new=AsyncMock(),
+ ) as create_reaction,
+ patch.object(moderation, "_notify_uploader", new=AsyncMock()) as notify,
+ patch.object(moderation, "send_uploaded_meme_to_manual_review", new=AsyncMock()) as review,
+ ):
+ await moderation._uploaded_meme_auto_review(meme, meme_upload, bot, {})
+
+ update_meme.assert_not_awaited()  # the duplicate row must not be updated
+ review.assert_not_awaited()  # and must not be sent to manual review
+ create_reaction.assert_awaited_once_with(  # one reaction is recorded for the uploader on the original meme
+ 10001,  # meme_upload["user_id"]
+ 10000,  # stored_duplicate["duplicate_of"], not the uploaded meme's own id
+ "uploaded_meme",
+ reaction_id=1,
+ reacted_at=ANY,  # timestamp value is not pinned
+ )
+ notify.assert_awaited_once()
+ assert "повтор" in notify.await_args.args[2].lower()  # third positional arg of the notification must contain this substring (case-insensitive)
+
+
+@pytest.mark.asyncio
+async def test_inline_ocr_duplicate_uses_dedup_resolver(monkeypatch):  # An OCR-text match found during upload must be passed to resolve_duplicate. NOTE(review): `monkeypatch` is never referenced in this body.
+ meme = {  # the uploaded meme before OCR data is attached
+ "id": 10002,
+ "type": moderation.MemeType.IMAGE,
+ "telegram_file_id": "uploaded-file-id",
+ }
+ refreshed = {  # same meme with an ocr_result, as returned by the patched get_meme_by_id
+ **meme,
+ "ocr_result": {"text": "same visible meme text"},
+ }
+
+ with (
+ patch.object(moderation, "describe_single_meme", new=AsyncMock(return_value="ok")),
+ patch(  # patched by dotted path in src.tgbot.service rather than as an attribute of `moderation`
+ "src.tgbot.service.get_meme_by_id",
+ new=AsyncMock(return_value=refreshed),
+ ),
+ patch.object(
+ moderation,
+ "find_duplicate_by_ocr_text",
+ new=AsyncMock(return_value=10000),  # the lookup reports meme 10000 as the match
+ ) as find_duplicate,
+ patch.object(
+ moderation,
+ "resolve_duplicate",
+ new=AsyncMock(
+ return_value=SimpleNamespace(original_id=10000, reason="upload_ocr_text")
+ ),
+ ) as resolve_duplicate,
+ ):
+ refreshed_result, duplicate = await moderation._deduplicate_upload_via_ocr(meme)
+
+ assert refreshed_result == refreshed  # first element is the re-fetched meme dict
+ assert duplicate is not None
+ assert duplicate.duplicate_of == 10000  # NOTE(review): the resolve_duplicate mock has no `duplicate_of` attribute, so this object presumably is built by the function under test -- confirm
+ find_duplicate.assert_awaited_once_with(10002, "same visible meme text")  # called with the meme id and refreshed["ocr_result"]["text"]
+ resolve_duplicate.assert_awaited_once_with(10002, 10000, reason="upload_ocr_text")  # (uploaded meme id, matched meme id) plus the reason tag