Skip to content

Commit f838cbc

Browse files
authored
Merge pull request #542 from Police-Data-Accessibility-Project/mc_371_html_content_duplicates
Add logic to prevent HTML content duplicates from being sent to HuggingFace
2 parents 2d32b57 + 3ce5642 commit f838cbc

12 files changed

Lines changed: 192 additions & 16 deletions

File tree

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
"""Add html duplicate url materialized view
2+
3+
Revision ID: d5f0cc2be6b6
4+
Revises: 5ac9d50b91c5
5+
Create Date: 2025-11-27 09:07:28.767553
6+
7+
"""
8+
from typing import Sequence, Union
9+
10+
from alembic import op
11+
import sqlalchemy as sa
12+
13+
14+
# revision identifiers, used by Alembic.
15+
revision: str = 'd5f0cc2be6b6'
16+
down_revision: Union[str, None] = '5ac9d50b91c5'
17+
branch_labels: Union[str, Sequence[str], None] = None
18+
depends_on: Union[str, Sequence[str], None] = None
19+
20+
21+
def upgrade() -> None:
22+
op.execute("""
23+
create extension pgcrypto;
24+
""")
25+
26+
op.execute("""
27+
CREATE MATERIALIZED VIEW mat_view__html_duplicate_url AS
28+
WITH
29+
hashes AS (
30+
SELECT
31+
url_id,
32+
digest(compressed_html, 'sha256') AS hash
33+
FROM
34+
url_compressed_html
35+
)
36+
, duplicate_hashes as (
37+
SELECT
38+
hash AS content_hash,
39+
COUNT(*) AS n,
40+
ARRAY_AGG(url_id ORDER BY url_id) AS url_ids
41+
FROM
42+
hashes
43+
GROUP BY
44+
hash
45+
HAVING
46+
COUNT(*) > 1
47+
)
48+
select
49+
urls.id as url_id
50+
from urls
51+
join hashes h on h.url_id = urls.id
52+
join duplicate_hashes dh on dh.content_hash = h.hash;
53+
""")
54+
55+
56+
def downgrade() -> None:
57+
pass

src/core/tasks/scheduled/impl/huggingface/operator.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from itertools import count
22

33
from src.core.tasks.mixins.prereq import HasPrerequisitesMixin
4-
from src.core.tasks.scheduled.impl.huggingface.queries.check.core import CheckValidURLsUpdatedQueryBuilder
4+
from src.core.tasks.scheduled.impl.huggingface.queries.prereq.core import CheckValidURLsUpdatedQueryBuilder
55
from src.core.tasks.scheduled.impl.huggingface.queries.get.core import GetForLoadingToHuggingFaceQueryBuilder
66
from src.core.tasks.scheduled.impl.huggingface.queries.get.model import GetForLoadingToHuggingFaceOutput
77
from src.core.tasks.scheduled.templates.operator import ScheduledTaskOperatorBase
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
from datetime import datetime
2+
3+
from sqlalchemy import select, Column
4+
5+
from src.db.enums import TaskType
6+
from src.db.helpers.query import exists_url, no_url_task_error, not_exists_url
7+
from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated
8+
from src.db.models.impl.url.core.sqlalchemy import URL
9+
from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML
10+
from src.db.models.materialized_views.html_duplicate_url import HTMLDuplicateURLMaterializedView
11+
12+
13+
class HuggingfacePrereqCTEContainer:
14+
15+
def __init__(self):
16+
self.cte = (
17+
select(
18+
URL.id,
19+
URL.updated_at
20+
)
21+
.join(
22+
URLCompressedHTML,
23+
URL.id == URLCompressedHTML.url_id
24+
)
25+
.where(
26+
exists_url(FlagURLValidated),
27+
not_exists_url(HTMLDuplicateURLMaterializedView),
28+
no_url_task_error(TaskType.PUSH_TO_HUGGINGFACE)
29+
)
30+
)
31+
32+
@property
33+
def url_id(self) -> Column[int]:
34+
return self.cte.c.id
35+
36+
@property
37+
def updated_at(self) -> Column[datetime]:
38+
return self.cte.c.updated_at

src/core/tasks/scheduled/impl/huggingface/queries/get/core.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from sqlalchemy import select
22
from sqlalchemy.ext.asyncio import AsyncSession
33

4+
from src.core.tasks.scheduled.impl.huggingface.queries.cte import HuggingfacePrereqCTEContainer
45
from src.core.tasks.scheduled.impl.huggingface.queries.get.convert import convert_fine_to_coarse_record_type, \
56
convert_validated_type_to_relevant
67
from src.core.tasks.scheduled.impl.huggingface.queries.get.model import GetForLoadingToHuggingFaceOutput
@@ -23,21 +24,26 @@ def __init__(self, page: int):
2324

2425

2526
async def run(self, session: AsyncSession) -> list[GetForLoadingToHuggingFaceOutput]:
26-
label_url_id = 'url_id'
2727
label_url = 'url'
2828
label_record_type_fine = 'record_type_fine'
2929
label_html = 'html'
3030
label_type = 'type'
3131

3232

33+
cte = HuggingfacePrereqCTEContainer()
34+
3335
query = (
3436
select(
35-
URL.id.label(label_url_id),
37+
cte.url_id,
3638
URL.full_url.label(label_url),
3739
URLRecordType.record_type.label(label_record_type_fine),
3840
URLCompressedHTML.compressed_html.label(label_html),
3941
FlagURLValidated.type.label(label_type)
4042
)
43+
.join(
44+
URL,
45+
cte.url_id == URL.id
46+
)
4147
.join(
4248
URLRecordType,
4349
URL.id == URLRecordType.url_id
@@ -65,7 +71,7 @@ async def run(self, session: AsyncSession) -> list[GetForLoadingToHuggingFaceOut
6571
final_results = []
6672
for result in db_results:
6773
output = GetForLoadingToHuggingFaceOutput(
68-
url_id=result[label_url_id],
74+
url_id=result[cte.url_id],
6975
url=result[label_url],
7076
relevant=convert_validated_type_to_relevant(
7177
URLType(result[label_type])

src/core/tasks/scheduled/impl/huggingface/queries/check/__init__.py renamed to src/core/tasks/scheduled/impl/huggingface/queries/prereq/__init__.py

File renamed without changes.

src/core/tasks/scheduled/impl/huggingface/queries/check/core.py renamed to src/core/tasks/scheduled/impl/huggingface/queries/prereq/core.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from sqlalchemy.ext.asyncio import AsyncSession
22

3-
from src.core.tasks.scheduled.impl.huggingface.queries.check.requester import CheckValidURLsUpdatedRequester
3+
from src.core.tasks.scheduled.impl.huggingface.queries.prereq.requester import CheckValidURLsUpdatedRequester
44
from src.db.queries.base.builder import QueryBuilderBase
55

66

src/core/tasks/scheduled/impl/huggingface/queries/check/requester.py renamed to src/core/tasks/scheduled/impl/huggingface/queries/prereq/requester.py

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from sqlalchemy.sql.functions import count
77

88
from src.collectors.enums import URLStatus
9+
from src.core.tasks.scheduled.impl.huggingface.queries.cte import HuggingfacePrereqCTEContainer
910
from src.db.enums import TaskType
1011
from src.db.helpers.query import not_exists_url, no_url_task_error, exists_url
1112
from src.db.helpers.session import session_helper as sh
@@ -32,21 +33,17 @@ async def latest_upload(self) -> datetime:
3233
)
3334

3435
async def has_valid_urls(self, last_upload_at: datetime | None) -> bool:
36+
cte = HuggingfacePrereqCTEContainer()
3537
query = (
36-
select(count(URL.id))
37-
.join(
38-
URLCompressedHTML,
39-
URL.id == URLCompressedHTML.url_id
40-
)
41-
.where(
42-
exists_url(FlagURLValidated),
43-
no_url_task_error(TaskType.PUSH_TO_HUGGINGFACE)
38+
select(
39+
cte.url_id
4440
)
4541
)
4642
if last_upload_at is not None:
47-
query = query.where(URL.updated_at > last_upload_at)
48-
url_count = await sh.scalar(
43+
query = query.where(cte.updated_at > last_upload_at)
44+
query = query.limit(1)
45+
result = await sh.one_or_none(
4946
session=self.session,
5047
query=query
5148
)
52-
return url_count > 0
49+
return result is not None

src/db/client/async_.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -919,4 +919,7 @@ async def refresh_materialized_views(self):
919919
)
920920
await self.execute(
921921
text("REFRESH MATERIALIZED VIEW batch_url_status_mat_view")
922+
)
923+
await self.execute(
924+
text("REFRESH MATERIALIZED VIEW mat_view__html_duplicate_url")
922925
)

src/db/models/materialized_views/__init__.py

Whitespace-only changes.
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
from src.db.models.mixins import URLDependentViewMixin
2+
from src.db.models.templates_.base import Base
3+
4+
5+
class HTMLDuplicateURLMaterializedView(
6+
Base,
7+
URLDependentViewMixin
8+
):
9+
__tablename__ = "mat_view__html_duplicate_url"

0 commit comments

Comments
 (0)