Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "welearn-database"
version = "1.4.0"
version = "1.4.2"
description = "All stuff related to relationnal database from the WeLearn project"
authors = [
{name = "Théo",email = "theo.nardin@cri-paris.org"}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
"""modify corpus_name_embedding_model_lang view

Revision ID: b049924f7067
Revises: f8602200fa99
Create Date: 2026-03-31 16:09:12.085443

"""

from typing import Sequence, Union

import sqlalchemy as sa
from alembic import op

# revision identifiers, used by Alembic.
revision: str = "b049924f7067"
down_revision: Union[str, None] = "f8602200fa99"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Recreate the ``corpus_name_embedding_model_lang`` materialized view.

    The existing view is dropped and rebuilt so that, for each active corpus
    and each embedding-model language, only the row with the most recent
    ``used_since`` is kept. The rebuilt view exposes ``source_name``,
    ``corpus_id``, ``embedding_model_id``, ``title``, ``lang``,
    ``used_since`` and ``category_id``.
    """
    # The old definition has to go first: the CREATE below reuses its name.
    op.execute(
        """
DROP MATERIALIZED VIEW corpus_related.corpus_name_embedding_model_lang;
"""
    )
    # Rows are ranked per (corpus_id, lang), newest ``used_since`` first.
    # ``embedding_model_id`` is a secondary sort key so that two models sharing
    # the same ``used_since`` always resolve to the same winner; without it the
    # row picked as ``rn = 1`` would be arbitrary and could change from one
    # rebuild of the view to the next.
    op.execute(
        """

CREATE MATERIALIZED VIEW corpus_related.corpus_name_embedding_model_lang
TABLESPACE pg_default
AS WITH ranked AS (
SELECT
c.source_name,
cem.corpus_id,
cem.embedding_model_id,
em.title,
em.lang,
cem.used_since,
c.category_id,
ROW_NUMBER() OVER (
PARTITION BY cem.corpus_id,
em.lang
ORDER BY
cem.used_since DESC,
cem.embedding_model_id DESC
) AS rn
FROM
corpus_related.corpus_embedding_model cem
JOIN corpus_related.corpus c ON
c.id = cem.corpus_id
JOIN corpus_related.embedding_model em ON
em.id = cem.embedding_model_id
WHERE
c.is_active
)
SELECT
source_name,
corpus_id,
embedding_model_id,
title,
lang,
used_since,
category_id
FROM
ranked
WHERE
rn = 1

WITH DATA;
"""
    )


def downgrade() -> None:
    """Restore the previous definition of the materialized view.

    Drops ``corpus_related.corpus_name_embedding_model_lang`` and recreates it
    as a plain join of ``corpus``, ``corpus_embedding_model`` and
    ``embedding_model`` restricted to active corpora, exposing only
    ``source_name``, ``title`` and ``lang``.
    """
    drop_current_view = """
DROP MATERIALIZED VIEW corpus_related.corpus_name_embedding_model_lang;
"""
    create_previous_view = """
CREATE MATERIALIZED VIEW corpus_related.corpus_name_embedding_model_lang
TABLESPACE pg_default
AS SELECT corpus.source_name,
embedding_model.title,
embedding_model.lang
FROM corpus_related.corpus
JOIN corpus_related.corpus_embedding_model ON corpus_embedding_model.corpus_id = corpus.id
JOIN corpus_related.embedding_model ON embedding_model.id = corpus_embedding_model.embedding_model_id
WHERE corpus.is_active
WITH DATA;
"""
    # Order matters: the view must be dropped before its name can be reused.
    for statement in (drop_current_view, create_previous_view):
        op.execute(statement)
4 changes: 4 additions & 0 deletions welearn_database/data/models/corpus_related.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,8 +107,12 @@ class CorpusNameEmbeddingModelLang(Base):
__table_args__ = {"schema": schema_name}
__read_only__ = True
source_name: Mapped[str] = mapped_column(primary_key=True)
corpus_id: Mapped[UUID]
embedding_model_id: Mapped[UUID]
title: Mapped[str]
lang: Mapped[str]
Comment on lines 109 to 113
Copy link

Copilot AI Mar 31, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

source_name is declared as the sole primary key, but the materialized view returns one row per (corpus_id, lang) (latest used_since), which can produce multiple rows for the same source_name across different languages. With source_name alone as the ORM PK, SQLAlchemy’s identity map can collapse/overwrite rows and return incomplete/incorrect results. Consider using a composite primary key that matches the view’s uniqueness (e.g., include lang and/or corpus_id).

Suggested change
source_name: Mapped[str] = mapped_column(primary_key=True)
corpus_id: Mapped[UUID]
embedding_model_id: Mapped[UUID]
title: Mapped[str]
lang: Mapped[str]
source_name: Mapped[str] = mapped_column()
corpus_id: Mapped[UUID] = mapped_column(primary_key=True)
embedding_model_id: Mapped[UUID]
title: Mapped[str]
lang: Mapped[str] = mapped_column(primary_key=True)

Copilot uses AI. Check for mistakes.
used_since: Mapped[datetime]
category_id: Mapped[UUID]


class CorpusEmbeddingModel(Base):
Expand Down
Loading