Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
011d489
first stab at STT evals
AkhileshNegi Jan 30, 2026
7777290
Merge branch 'main' of github.com:ProjectTech4DevAI/kaapi-backend int…
AkhileshNegi Jan 30, 2026
d8df80c
Merge branch 'main' of github.com:ProjectTech4DevAI/kaapi-backend int…
AkhileshNegi Jan 31, 2026
f1df7f9
fix migration naming
AkhileshNegi Jan 31, 2026
cda0611
fixing endpoints
AkhileshNegi Jan 31, 2026
ad5779f
update dataset endpoint
AkhileshNegi Jan 31, 2026
01e2beb
update types
AkhileshNegi Jan 31, 2026
1637007
updated dataset with URL
AkhileshNegi Jan 31, 2026
36af7e9
added few more testcases
AkhileshNegi Jan 31, 2026
78fd206
added storage to core for easy reuse
AkhileshNegi Jan 31, 2026
4ac2ca6
cleanup for audio duration
AkhileshNegi Jan 31, 2026
d8b531c
first stab at fixing celery task to cron
AkhileshNegi Jan 31, 2026
2295da5
added gemini as provider
AkhileshNegi Feb 2, 2026
25e6002
moving to batch job in gemini
AkhileshNegi Feb 2, 2026
db2512e
code refactoring, using batch requests and files similar to OpenAI
AkhileshNegi Feb 2, 2026
ff29ddd
few cleanups
AkhileshNegi Feb 2, 2026
cd979fd
updated migration
AkhileshNegi Feb 3, 2026
b6c633a
cleanup config for batch
AkhileshNegi Feb 3, 2026
b6e6649
moved documentation to separate folder
AkhileshNegi Feb 3, 2026
719584d
updated score format in stt result
AkhileshNegi Feb 3, 2026
bf0b4c2
cleaner dataset sample count
AkhileshNegi Feb 3, 2026
68e6821
got rid of redundant sample count
AkhileshNegi Feb 3, 2026
2247faa
removed deadcode
AkhileshNegi Feb 3, 2026
056612c
removing more redundant code
AkhileshNegi Feb 3, 2026
13bb9cc
clean few more cruds
AkhileshNegi Feb 3, 2026
7bbf811
more free from dead code
AkhileshNegi Feb 3, 2026
04e419c
cleanup batch request code
AkhileshNegi Feb 3, 2026
09deab2
cleanup batch
AkhileshNegi Feb 3, 2026
f6bf0c2
got rid of processed_samples as well
AkhileshNegi Feb 3, 2026
d20084b
cleanup provider_metadata from results
AkhileshNegi Feb 3, 2026
4afdd2d
cleanup optimize results
AkhileshNegi Feb 4, 2026
3e62a98
cleanup queries
AkhileshNegi Feb 4, 2026
63de270
cleanup leftovers
AkhileshNegi Feb 4, 2026
c95c044
added validation for provider
AkhileshNegi Feb 4, 2026
9aa6858
updated test suite
AkhileshNegi Feb 4, 2026
4a92416
coderabbit suggestions
AkhileshNegi Feb 4, 2026
e204416
added few more testcases
AkhileshNegi Feb 4, 2026
0210dab
added more testcases for coverage
AkhileshNegi Feb 4, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
325 changes: 325 additions & 0 deletions backend/app/alembic/versions/043_add_stt_evaluation_tables.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,325 @@
"""add stt evaluation tables

Revision ID: 043
Revises: 042
Create Date: 2026-01-28 12:00:00.000000

"""

import sqlalchemy as sa
import sqlmodel.sql.sqltypes
from alembic import op
from sqlalchemy.dialects import postgresql

# revision identifiers, used by Alembic.
revision = "043"
down_revision = "042"
branch_labels = None
depends_on = None


def upgrade():
    """Apply the STT evaluation schema (revision 043).

    Four changes, in order:
      1. Add ``type``/``language`` columns to ``evaluation_dataset``.
      2. Add ``type``/``language``/``providers`` columns to ``evaluation_run``.
      3. Create the ``stt_sample`` table (audio inputs) plus its indexes.
      4. Create the ``stt_result`` table (per-provider transcriptions) plus
         its indexes.

    The ``server_default="text"`` on both new NOT NULL ``type`` columns lets
    the ALTER succeed on tables that already contain rows: existing records
    are backfilled as ``text`` evaluations.
    """
    # Add type and language columns to evaluation_dataset table
    op.add_column(
        "evaluation_dataset",
        sa.Column(
            "type",
            sa.String(length=20),
            nullable=False,
            server_default="text",
            comment="Evaluation type: text, stt, or tts",
        ),
    )
    op.add_column(
        "evaluation_dataset",
        sa.Column(
            "language",
            sa.String(length=10),
            nullable=True,
            comment="ISO 639-1 language code (e.g., en, hi)",
        ),
    )

    # Add type, language, and providers columns to evaluation_run table
    op.add_column(
        "evaluation_run",
        sa.Column(
            "type",
            sa.String(length=20),
            nullable=False,
            server_default="text",
            comment="Evaluation type: text, stt, or tts",
        ),
    )
    op.add_column(
        "evaluation_run",
        sa.Column(
            "language",
            sa.String(length=10),
            nullable=True,
            comment="ISO 639-1 language code",
        ),
    )
    op.add_column(
        "evaluation_run",
        sa.Column(
            "providers",
            postgresql.JSONB(astext_type=sa.Text()),
            nullable=True,
            comment="List of STT/TTS providers used (e.g., ['gemini-2.5-pro'])",
        ),
    )
    # Create stt_sample table: one row per uploaded audio file in a dataset.
    op.create_table(
        "stt_sample",
        sa.Column(
            "id",
            sa.Integer(),
            nullable=False,
            comment="Unique identifier for the STT sample",
        ),
        sa.Column(
            "object_store_url",
            sqlmodel.sql.sqltypes.AutoString(),
            nullable=False,
            comment="S3 URL of the audio file",
        ),
        sa.Column(
            "language",
            sa.String(length=10),
            nullable=True,
            comment="ISO 639-1 language code for this sample",
        ),
        sa.Column(
            "ground_truth",
            sa.Text(),
            nullable=True,
            comment="Reference transcription for comparison (optional)",
        ),
        sa.Column(
            "sample_metadata",
            postgresql.JSONB(astext_type=sa.Text()),
            nullable=True,
            # Defaults to an empty JSON object so readers never see SQL NULL
            # unless a row explicitly sets it.
            server_default=sa.text("'{}'::jsonb"),
            comment="Additional metadata (format, bitrate, original filename, etc.)",
        ),
        sa.Column(
            "dataset_id",
            sa.Integer(),
            nullable=False,
            comment="Reference to the parent evaluation dataset",
        ),
        sa.Column(
            "organization_id",
            sa.Integer(),
            nullable=False,
            comment="Reference to the organization",
        ),
        sa.Column(
            "project_id",
            sa.Integer(),
            nullable=False,
            comment="Reference to the project",
        ),
        sa.Column(
            "inserted_at",
            sa.DateTime(),
            nullable=False,
            comment="Timestamp when the sample was created",
        ),
        sa.Column(
            "updated_at",
            sa.DateTime(),
            nullable=False,
            comment="Timestamp when the sample was last updated",
        ),
        # CASCADE: deleting a dataset removes its samples.
        sa.ForeignKeyConstraint(
            ["dataset_id"],
            ["evaluation_dataset.id"],
            name="fk_stt_sample_dataset_id",
            ondelete="CASCADE",
        ),
        # NOTE(review): the org/project FKs below are unnamed, so PostgreSQL
        # assigns auto-generated constraint names — consider adding explicit
        # name= for easier future ALTERs; confirm against project convention.
        sa.ForeignKeyConstraint(
            ["organization_id"],
            ["organization.id"],
            ondelete="CASCADE",
        ),
        sa.ForeignKeyConstraint(
            ["project_id"],
            ["project.id"],
            ondelete="CASCADE",
        ),
        sa.PrimaryKeyConstraint("id"),
    )
    # Lookup of a dataset's samples.
    op.create_index(
        "ix_stt_sample_dataset_id",
        "stt_sample",
        ["dataset_id"],
        unique=False,
    )
    # Tenant-scoped listing (organization + project).
    op.create_index(
        "idx_stt_sample_org_project",
        "stt_sample",
        ["organization_id", "project_id"],
        unique=False,
    )

    # Create stt_result table: one row per (sample, run, provider) transcription.
    op.create_table(
        "stt_result",
        sa.Column(
            "id",
            sa.Integer(),
            nullable=False,
            comment="Unique identifier for the STT result",
        ),
        sa.Column(
            "transcription",
            sa.Text(),
            nullable=True,
            comment="Generated transcription from STT provider",
        ),
        sa.Column(
            "provider",
            sa.String(length=50),
            nullable=False,
            comment="STT provider used (e.g., gemini-2.5-pro)",
        ),
        sa.Column(
            "status",
            sa.String(length=20),
            nullable=False,
            server_default="pending",
            comment="Result status: pending, completed, failed",
        ),
        sa.Column(
            "score",
            # JSONB rather than fixed columns so new metrics need no migration.
            postgresql.JSONB(astext_type=sa.Text()),
            nullable=True,
            comment="Evaluation metrics (e.g., wer, cer, mer, wil) - extensible for future metrics",
        ),
        sa.Column(
            "is_correct",
            sa.Boolean(),
            nullable=True,
            comment="Human feedback: transcription correctness (null=not reviewed)",
        ),
        sa.Column(
            "comment",
            sa.Text(),
            nullable=True,
            comment="Human feedback comment",
        ),
        sa.Column(
            "error_message",
            sa.Text(),
            nullable=True,
            comment="Error message if transcription failed",
        ),
        sa.Column(
            "stt_sample_id",
            sa.Integer(),
            nullable=False,
            comment="Reference to the STT sample",
        ),
        sa.Column(
            "evaluation_run_id",
            sa.Integer(),
            nullable=False,
            comment="Reference to the evaluation run",
        ),
        sa.Column(
            "organization_id",
            sa.Integer(),
            nullable=False,
            comment="Reference to the organization",
        ),
        sa.Column(
            "project_id",
            sa.Integer(),
            nullable=False,
            comment="Reference to the project",
        ),
        sa.Column(
            "inserted_at",
            sa.DateTime(),
            nullable=False,
            comment="Timestamp when the result was created",
        ),
        sa.Column(
            "updated_at",
            sa.DateTime(),
            nullable=False,
            comment="Timestamp when the result was last updated",
        ),
        # CASCADE: results disappear with their sample or their run.
        sa.ForeignKeyConstraint(
            ["stt_sample_id"],
            ["stt_sample.id"],
            name="fk_stt_result_sample_id",
            ondelete="CASCADE",
        ),
        sa.ForeignKeyConstraint(
            ["evaluation_run_id"],
            ["evaluation_run.id"],
            name="fk_stt_result_run_id",
            ondelete="CASCADE",
        ),
        # NOTE(review): unnamed FKs here as well — same naming consideration
        # as on stt_sample above.
        sa.ForeignKeyConstraint(
            ["organization_id"],
            ["organization.id"],
            ondelete="CASCADE",
        ),
        sa.ForeignKeyConstraint(
            ["project_id"],
            ["project.id"],
            ondelete="CASCADE",
        ),
        sa.PrimaryKeyConstraint("id"),
    )
    # Per-sample result lookups.
    op.create_index(
        "ix_stt_result_sample_id",
        "stt_result",
        ["stt_sample_id"],
        unique=False,
    )
    # Per-run result lookups.
    op.create_index(
        "ix_stt_result_run_id",
        "stt_result",
        ["evaluation_run_id"],
        unique=False,
    )
    # Composite indexes supporting run-scoped filtering on feedback and status.
    op.create_index(
        "idx_stt_result_feedback",
        "stt_result",
        ["evaluation_run_id", "is_correct"],
        unique=False,
    )
    op.create_index(
        "idx_stt_result_status",
        "stt_result",
        ["evaluation_run_id", "status"],
        unique=False,
    )


def downgrade():
    """Revert revision 043 in reverse dependency order.

    ``stt_result`` references ``stt_sample`` and ``evaluation_run``, so it is
    dropped first, then ``stt_sample``, and finally the columns that were
    added to the pre-existing tables.
    """
    # Drop stt_result table (indexes first, then the table itself).
    for index_name in (
        "idx_stt_result_status",
        "idx_stt_result_feedback",
        "ix_stt_result_run_id",
        "ix_stt_result_sample_id",
    ):
        op.drop_index(index_name, table_name="stt_result")
    op.drop_table("stt_result")

    # Drop stt_sample table.
    for index_name in ("idx_stt_sample_org_project", "ix_stt_sample_dataset_id"):
        op.drop_index(index_name, table_name="stt_sample")
    op.drop_table("stt_sample")

    # Remove the columns added to evaluation_run.
    for column_name in ("providers", "language", "type"):
        op.drop_column("evaluation_run", column_name)

    # Remove the columns added to evaluation_dataset.
    for column_name in ("language", "type"):
        op.drop_column("evaluation_dataset", column_name)
5 changes: 5 additions & 0 deletions backend/app/api/docs/stt_evaluation/create_dataset.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Create a new STT evaluation dataset with audio samples.

Each sample requires:
- **object_store_url**: S3 URL of the audio file (obtained from the /evaluations/stt/files/audio endpoint)
- **ground_truth**: Reference transcription (optional, for WER/CER metrics)
1 change: 1 addition & 0 deletions backend/app/api/docs/stt_evaluation/get_dataset.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Get an STT dataset with its samples.
1 change: 1 addition & 0 deletions backend/app/api/docs/stt_evaluation/get_result.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Get a single STT transcription result.
1 change: 1 addition & 0 deletions backend/app/api/docs/stt_evaluation/get_run.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Get an STT evaluation run with its results.
1 change: 1 addition & 0 deletions backend/app/api/docs/stt_evaluation/list_datasets.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
List all STT evaluation datasets for the current project.
1 change: 1 addition & 0 deletions backend/app/api/docs/stt_evaluation/list_runs.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
List all STT evaluation runs for the current project.
8 changes: 8 additions & 0 deletions backend/app/api/docs/stt_evaluation/start_evaluation.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
Start an STT evaluation run on a dataset.

The evaluation will:
1. Process each audio sample through the specified providers
2. Generate transcriptions using Gemini Batch API
3. Store results for human review

**Supported providers:** gemini-2.5-pro, gemini-2.5-flash, gemini-2.0-flash
5 changes: 5 additions & 0 deletions backend/app/api/docs/stt_evaluation/update_feedback.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Update human feedback on an STT transcription result.

**Fields:**
- **is_correct**: Boolean indicating if the transcription is correct
- **comment**: Optional feedback comment explaining issues or observations
7 changes: 7 additions & 0 deletions backend/app/api/docs/stt_evaluation/upload_audio.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
Upload a single audio file to S3 for STT evaluation.

**Supported formats:** mp3, wav, flac, m4a, ogg, webm

**Maximum file size:** 200 MB

Returns the S3 URL which can be used when creating an STT dataset.
2 changes: 2 additions & 0 deletions backend/app/api/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
collection_job,
)
from app.api.routes.evaluations import dataset as evaluation_dataset, evaluation
from app.api.routes import stt_evaluations
from app.core.config import settings

api_router = APIRouter()
Expand All @@ -39,6 +40,7 @@
api_router.include_router(doc_transformation_job.router)
api_router.include_router(evaluation_dataset.router)
api_router.include_router(evaluation.router)
api_router.include_router(stt_evaluations.router)
api_router.include_router(llm.router)
api_router.include_router(login.router)
api_router.include_router(onboarding.router)
Expand Down
5 changes: 5 additions & 0 deletions backend/app/api/routes/stt_evaluations/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"""STT Evaluation API routes."""

from .router import router

__all__ = ["router"]
Loading