From acb767d8403dc5447a59bee97cbee5de8d338e60 Mon Sep 17 00:00:00 2001 From: Prakhar-Sethi012 Date: Thu, 11 Jun 2026 01:41:09 +0530 Subject: [PATCH 1/3] saving my local changes --- .dockerignore | 5 ++ .env.example | 6 ++ .gitignore | 54 ++++++++++++++ Dockerfile | 13 ++++ alembic.ini | 149 +++++++++++++++++++++++++++++++++++++++ alembic/README | 1 + alembic/env.py | 79 +++++++++++++++++++++ alembic/script.py.mako | 28 ++++++++ app/database/database.py | 23 ++++++ app/database/models.py | 17 +++++ app/main.py | 7 ++ docker-compose.yml | 31 ++++++++ requirements.txt | Bin 0 -> 920 bytes 13 files changed, 413 insertions(+) create mode 100644 .dockerignore create mode 100644 .env.example create mode 100644 .gitignore create mode 100644 Dockerfile create mode 100644 alembic.ini create mode 100644 alembic/README create mode 100644 alembic/env.py create mode 100644 alembic/script.py.mako create mode 100644 app/database/database.py create mode 100644 app/database/models.py create mode 100644 app/main.py create mode 100644 docker-compose.yml create mode 100644 requirements.txt diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..dda73b3 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,5 @@ +.venv +__pycache__ +.git +.env +alembic/versions \ No newline at end of file diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..ccc76e2 --- /dev/null +++ b/.env.example @@ -0,0 +1,6 @@ +POSTGRES_USER=postgres +POSTGRES_PASSWORD=postgres +POSTGRES_DB=wheretf +POSTGRES_HOST=localhost +POSTGRES_PORT=5432 +DATABASE_URL=postgresql+psycopg://postgres:postgres@localhost:5432/wheretf \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ced36aa --- /dev/null +++ b/.gitignore @@ -0,0 +1,54 @@ +# ====================== +# Virtual Environment +# ====================== +.venv/ +venv/ +ENV/ +env/ + +# ====================== +# Python cache +# ====================== +__pycache__/ +*.pyc +*.pyo +*.pyd + +# ====================== +# Environment variables +# ====================== +.env + +# ====================== +# IDEs +# ====================== +.vscode/ +.idea/ + +# ====================== +# Testing / coverage +# ====================== +.pytest_cache/ +.coverage +htmlcov/ + +# ====================== +# Logs +# ====================== +*.log + +# ====================== +# Alembic +# ====================== +alembic/versions/__pycache__/ + +# ====================== +# Docker +# ====================== +docker-compose.override.yml + +# ====================== +# OS files +# ====================== +.DS_Store +Thumbs.db \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..194e62d --- /dev/null +++ b/Dockerfile @@ -0,0 +1,13 @@ +FROM python:3.13-slim + +WORKDIR /app + +COPY requirements.txt . + +RUN pip install --no-cache-dir -r requirements.txt + +COPY . . + +EXPOSE 8000 + +CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"] \ No newline at end of file diff --git a/alembic.ini b/alembic.ini new file mode 100644 index 0000000..d142807 --- /dev/null +++ b/alembic.ini @@ -0,0 +1,149 @@ +# A generic, single database configuration. + +[alembic] +# path to migration scripts. +# this is typically a path given in POSIX (e.g. forward slashes) +# format, relative to the token %(here)s which refers to the location of this +# ini file +script_location = %(here)s/alembic + +# template used to generate migration file names; The default value is %%(rev)s_%%(slug)s +# Uncomment the line below if you want the files to be prepended with date and time +# see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file +# for all available tokens +# file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s +# Or organize into date-based subdirectories (requires recursive_version_locations = true) +# file_template = %%(year)d/%%(month).2d/%%(day).2d_%%(hour).2d%%(minute).2d_%%(second).2d_%%(rev)s_%%(slug)s + +# sys.path path, will be prepended to sys.path if present. +# defaults to the current working directory. for multiple paths, the path separator +# is defined by "path_separator" below. +prepend_sys_path = . + + +# timezone to use when rendering the date within the migration file +# as well as the filename. +# If specified, requires the tzdata library which can be installed by adding +# `alembic[tz]` to the pip requirements. +# string value is passed to ZoneInfo() +# leave blank for localtime +# timezone = + +# max length of characters to apply to the "slug" field +# truncate_slug_length = 40 + +# set to 'true' to run the environment during +# the 'revision' command, regardless of autogenerate +# revision_environment = false + +# set to 'true' to allow .pyc and .pyo files without +# a source .py file to be detected as revisions in the +# versions/ directory +# sourceless = false + +# version location specification; This defaults +# to /versions. When using multiple version +# directories, initial revisions must be specified with --version-path. +# The path separator used here should be the separator specified by "path_separator" +# below. +# version_locations = %(here)s/bar:%(here)s/bat:%(here)s/alembic/versions + +# path_separator; This indicates what character is used to split lists of file +# paths, including version_locations and prepend_sys_path within configparser +# files such as alembic.ini. +# The default rendered in new alembic.ini files is "os", which uses os.pathsep +# to provide os-dependent path splitting. +# +# Note that in order to support legacy alembic.ini files, this default does NOT +# take place if path_separator is not present in alembic.ini. If this +# option is omitted entirely, fallback logic is as follows: +# +# 1. Parsing of the version_locations option falls back to using the legacy +# "version_path_separator" key, which if absent then falls back to the legacy +# behavior of splitting on spaces and/or commas. +# 2. Parsing of the prepend_sys_path option falls back to the legacy +# behavior of splitting on spaces, commas, or colons. +# +# Valid values for path_separator are: +# +# path_separator = : +# path_separator = ; +# path_separator = space +# path_separator = newline +# +# Use os.pathsep. Default configuration used for new projects. +path_separator = os + +# set to 'true' to search source files recursively +# in each "version_locations" directory +# new in Alembic version 1.10 +# recursive_version_locations = false + +# the output encoding used when revision files +# are written from script.py.mako +# output_encoding = utf-8 + +# database URL. This is consumed by the user-maintained env.py script only. +# other means of configuring database URLs may be customized within the env.py +# file. +sqlalchemy.url = postgresql+psycopg://postgres:postgres@localhost:5432/wheretf + + +[post_write_hooks] +# post_write_hooks defines scripts or Python functions that are run +# on newly generated revision scripts. See the documentation for further +# detail and examples + +# format using "black" - use the console_scripts runner, against the "black" entrypoint +# hooks = black +# black.type = console_scripts +# black.entrypoint = black +# black.options = -l 79 REVISION_SCRIPT_FILENAME + +# lint with attempts to fix using "ruff" - use the module runner, against the "ruff" module +# hooks = ruff +# ruff.type = module +# ruff.module = ruff +# ruff.options = check --fix REVISION_SCRIPT_FILENAME + +# Alternatively, use the exec runner to execute a binary found on your PATH +# hooks = ruff +# ruff.type = exec +# ruff.executable = ruff +# ruff.options = check --fix REVISION_SCRIPT_FILENAME + +# Logging configuration. This is also consumed by the user-maintained +# env.py script only. +[loggers] +keys = root,sqlalchemy,alembic + +[handlers] +keys = console + +[formatters] +keys = generic + +[logger_root] +level = WARNING +handlers = console +qualname = + +[logger_sqlalchemy] +level = WARNING +handlers = +qualname = sqlalchemy.engine + +[logger_alembic] +level = INFO +handlers = +qualname = alembic + +[handler_console] +class = StreamHandler +args = (sys.stderr,) +level = NOTSET +formatter = generic + +[formatter_generic] +format = %(levelname)-5.5s [%(name)s] %(message)s +datefmt = %H:%M:%S diff --git a/alembic/README b/alembic/README new file mode 100644 index 0000000..98e4f9c --- /dev/null +++ b/alembic/README @@ -0,0 +1 @@ +Generic single-database configuration. \ No newline at end of file diff --git a/alembic/env.py b/alembic/env.py new file mode 100644 index 0000000..804ad87 --- /dev/null +++ b/alembic/env.py @@ -0,0 +1,79 @@ +from logging.config import fileConfig + +from sqlalchemy import engine_from_config +from sqlalchemy import pool +from app.database.database import Base +from app.database import models +from alembic import context + +# this is the Alembic Config object, which provides +# access to the values within the .ini file in use. +config = context.config + +# Interpret the config file for Python logging. +# This line sets up loggers basically. +if config.config_file_name is not None: + fileConfig(config.config_file_name) + +# add your model's MetaData object here +# for 'autogenerate' support +# from myapp import mymodel +# target_metadata = mymodel.Base.metadata +target_metadata = Base.metadata + +# other values from the config, defined by the needs of env.py, +# can be acquired: +# my_important_option = config.get_main_option("my_important_option") +# ... etc. + + +def run_migrations_offline() -> None: + """Run migrations in 'offline' mode. + + This configures the context with just a URL + and not an Engine, though an Engine is acceptable + here as well. By skipping the Engine creation + we don't even need a DBAPI to be available. + + Calls to context.execute() here emit the given string to the + script output. + + """ + url = config.get_main_option("sqlalchemy.url") + context.configure( + url=url, + target_metadata=target_metadata, + literal_binds=True, + dialect_opts={"paramstyle": "named"}, + ) + + with context.begin_transaction(): + context.run_migrations() + + +def run_migrations_online() -> None: + """Run migrations in 'online' mode. + + In this scenario we need to create an Engine + and associate a connection with the context. + + """ + connectable = engine_from_config( + config.get_section(config.config_ini_section, {}), + prefix="sqlalchemy.", + poolclass=pool.NullPool, + ) + + with connectable.connect() as connection: + context.configure( + connection=connection, target_metadata=target_metadata + ) + + with context.begin_transaction(): + context.run_migrations() + + +if context.is_offline_mode(): + run_migrations_offline() +else: + run_migrations_online() diff --git a/alembic/script.py.mako b/alembic/script.py.mako new file mode 100644 index 0000000..1101630 --- /dev/null +++ b/alembic/script.py.mako @@ -0,0 +1,28 @@ +"""${message} + +Revision ID: ${up_revision} +Revises: ${down_revision | comma,n} +Create Date: ${create_date} + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +${imports if imports else ""} + +# revision identifiers, used by Alembic. +revision: str = ${repr(up_revision)} +down_revision: Union[str, Sequence[str], None] = ${repr(down_revision)} +branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)} +depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)} + + +def upgrade() -> None: + """Upgrade schema.""" + ${upgrades if upgrades else "pass"} + + +def downgrade() -> None: + """Downgrade schema.""" + ${downgrades if downgrades else "pass"} diff --git a/app/database/database.py b/app/database/database.py new file mode 100644 index 0000000..89f8575 --- /dev/null +++ b/app/database/database.py @@ -0,0 +1,23 @@ +from dotenv import load_dotenv +import os + +from sqlalchemy import create_engine +from sqlalchemy.orm import sessionmaker, DeclarativeBase + +load_dotenv() + +DATABASE_URL = os.getenv( + "DATABASE_URL", + "postgresql+psycopg://postgres:postgres@db:5432/wheretf" +) +engine = create_engine(DATABASE_URL) + +SessionLocal = sessionmaker( + bind=engine, + autoflush=False, + autocommit=False, +) + + +class Base(DeclarativeBase): + pass \ No newline at end of file diff --git a/app/database/models.py b/app/database/models.py new file mode 100644 index 0000000..186210f --- /dev/null +++ b/app/database/models.py @@ -0,0 +1,17 @@ +from sqlalchemy import String, Text +from sqlalchemy.orm import Mapped, mapped_column +from pgvector.sqlalchemy import Vector + +from app.database.database import Base + + +class Document(Base): + __tablename__ = "documents" + + id: Mapped[int] = mapped_column(primary_key=True) + + title: Mapped[str] = mapped_column(String(255)) + content: Mapped[str] = mapped_column(Text) + + # pgvector embedding (we'll generate later) + embedding: Mapped[list[float]] = mapped_column(Vector(1536)) \ No newline at end of file diff --git a/app/main.py b/app/main.py new file mode 100644 index 0000000..a4aee24 --- /dev/null +++ b/app/main.py @@ -0,0 +1,7 @@ +from fastapi import FastAPI +app = FastAPI( + title="WhereTF Backend" +) +@app.get("/") +def root(): + return {"message": "WhereTF Backend is running"} \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..434e457 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,31 @@ +services: + db: + image: pgvector/pgvector:pg17 + container_name: wheretf-db + + environment: + POSTGRES_USER: postgres + POSTGRES_PASSWORD: postgres + POSTGRES_DB: wheretf + + ports: + - "5432:5432" + + volumes: + - postgres_data:/var/lib/postgresql/data + + backend: + build: . + container_name: wheretf-backend + + ports: + - "8000:8000" + + depends_on: + - db + + environment: + DATABASE_URL: postgresql+psycopg://postgres:postgres@db:5432/wheretf + +volumes: + postgres_data: \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..c0d5e2eb153765efd1147810345a59255a75fa77 GIT binary patch literal 920 zcmZ`&T~ES55S(Wde+q>1r9SxJyFM6x0Kp2C0BzGA#`@#cnccfm9?a!{+|KOm&Mv<{ z8yvC432)dV$0JXMG4DM;=eWfUHYj;21pW%dzGK8&Mf8L?-T$!x9cuDg%P#DlaZ>J% z%9X|De9tiJqd99+RR@cDMM8*~lXK*4Sk;MFm3tjyEDf>aYim{4%#z~Dzof#a^GUify19mz|NP5W5G=$VeJh}%(Hm3hyp zs*>>&5uL@R%pqsIs~QD9$;#Q8x=(!x;Tko&?#v%yQ+6r%jIQG&o4opHqMndLbjLom z1EJp#RpfI@8Lu&GQBSRG zdl6^hb8g}km2a5KH^%pHPo;vo`fhD6<)k*SG)~$f*B^4$%=$&FYHcIKciQ(>JL8)X c@NJ&yAL9;8*pD3WtY`MBz1IAOst Date: Thu, 11 Jun 2026 16:24:42 +0530 Subject: [PATCH 2/3] feat: initialize modular database architecture with pgvector and separated alembic migrations --- alembic/env.py | 15 +++--- ...76cb765_create_file_relationships_table.py | 42 +++++++++++++++++ .../c1850b307113_create_files_table.py | 46 +++++++++++++++++++ .../cdcdffd36592_create_file_content_table.py | 46 +++++++++++++++++++ app/{database => }/database.py | 0 app/database/models.py | 17 ------- app/models/__init__.py | 3 ++ app/models/content.py | 31 +++++++++++++ app/models/file.py | 30 ++++++++++++ app/models/relationship.py | 21 +++++++++ app/schemas/__init__.py | 0 app/schemas/document.py | 11 +++++ docker-compose.yml | 2 +- 13 files changed, 237 insertions(+), 27 deletions(-) create mode 100644 alembic/versions/2f98f76cb765_create_file_relationships_table.py create mode 100644 alembic/versions/c1850b307113_create_files_table.py create mode 100644 alembic/versions/cdcdffd36592_create_file_content_table.py rename app/{database => }/database.py (100%) delete mode 100644 app/database/models.py create mode 100644 app/models/__init__.py create mode 100644 app/models/content.py create mode 100644 app/models/file.py create mode 100644 app/models/relationship.py create mode 100644 app/schemas/__init__.py create mode 100644 app/schemas/document.py diff --git a/alembic/env.py b/alembic/env.py index 804ad87..5d4475a 100644 --- a/alembic/env.py +++ b/alembic/env.py @@ -1,9 +1,8 @@ from logging.config import fileConfig - from sqlalchemy import engine_from_config -from sqlalchemy import pool -from app.database.database import Base -from app.database import models +from sqlalchemy import pool,create_engine +from app.database import Base +import app.models from alembic import context # this is the Alembic Config object, which provides @@ -58,12 +57,10 @@ def run_migrations_online() -> None: and associate a connection with the context. """ - connectable = engine_from_config( - config.get_section(config.config_ini_section, {}), - prefix="sqlalchemy.", - poolclass=pool.NullPool, - ) + +# We force Alembic to use localhost and the exact password 'postgres' + connectable = create_engine("postgresql+psycopg://postgres:postgres@localhost:5433/wheretf") with connectable.connect() as connection: context.configure( connection=connection, target_metadata=target_metadata diff --git a/alembic/versions/2f98f76cb765_create_file_relationships_table.py b/alembic/versions/2f98f76cb765_create_file_relationships_table.py new file mode 100644 index 0000000..3e24c02 --- /dev/null +++ b/alembic/versions/2f98f76cb765_create_file_relationships_table.py @@ -0,0 +1,42 @@ +"""create_file_relationships_table + +Revision ID: 2f98f76cb765 +Revises: cdcdffd36592 +Create Date: 2026-06-11 16:18:13.091065 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = '2f98f76cb765' +down_revision: Union[str, Sequence[str], None] = 'cdcdffd36592' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.create_table('file_relationships', + sa.Column('id', sa.UUID(), server_default=sa.text('gen_random_uuid()'), nullable=False), + sa.Column('source_file_id', sa.UUID(), nullable=False), + sa.Column('target_file_id', sa.UUID(), nullable=False), + sa.Column('similarity_score', sa.Float(), nullable=False), + sa.Column('relation_type', sa.String(length=50), server_default='semantic_similarity', nullable=False), + sa.ForeignKeyConstraint(['source_file_id'], ['files.id'], ondelete='CASCADE'), + sa.ForeignKeyConstraint(['target_file_id'], ['files.id'], ondelete='CASCADE'), + sa.PrimaryKeyConstraint('id'), + sa.UniqueConstraint('source_file_id', 'target_file_id', name='uq_file_relationship') + ) + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.drop_table('file_relationships') + # ### end Alembic commands ### diff --git a/alembic/versions/c1850b307113_create_files_table.py b/alembic/versions/c1850b307113_create_files_table.py new file mode 100644 index 0000000..5822e0d --- /dev/null +++ b/alembic/versions/c1850b307113_create_files_table.py @@ -0,0 +1,46 @@ +"""create_files_table + +Revision ID: c1850b307113 +Revises: +Create Date: 2026-06-11 16:12:34.237674 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision: str = 'c1850b307113' +down_revision: Union[str, Sequence[str], None] = None +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.create_table('files', + sa.Column('id', sa.UUID(), server_default=sa.text('gen_random_uuid()'), nullable=False), + sa.Column('file_path', sa.Text(), nullable=False), + sa.Column('file_hash', sa.String(length=64), nullable=False), + sa.Column('mime_type', sa.String(length=50), nullable=False), + sa.Column('last_modified', sa.DateTime(timezone=True), nullable=False), + sa.Column('indexed_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False), + sa.Column('tags', postgresql.ARRAY(sa.Text()), server_default='{}', nullable=False), + sa.PrimaryKeyConstraint('id'), + sa.UniqueConstraint('file_path') + ) + op.create_index('files_path_hash_idx', 'files', ['file_path', 'file_hash'], unique=False) + op.create_index('files_tags_idx', 'files', ['tags'], unique=False, postgresql_using='gin') + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.drop_index('files_tags_idx', table_name='files', postgresql_using='gin') + op.drop_index('files_path_hash_idx', table_name='files') + op.drop_table('files') + # ### end Alembic commands ### diff --git a/alembic/versions/cdcdffd36592_create_file_content_table.py b/alembic/versions/cdcdffd36592_create_file_content_table.py new file mode 100644 index 0000000..6707dea --- /dev/null +++ b/alembic/versions/cdcdffd36592_create_file_content_table.py @@ -0,0 +1,46 @@ +"""create_file_content_table + +Revision ID: cdcdffd36592 +Revises: c1850b307113 +Create Date: 2026-06-11 16:14:39.195041 + +""" +from typing import Sequence, Union +import pgvector +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision: str = 'cdcdffd36592' +down_revision: Union[str, Sequence[str], None] = 'c1850b307113' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + op.execute("CREATE EXTENSION IF NOT EXISTS vector;") + # ### commands auto generated by Alembic - please adjust! ### + op.create_table('file_content', + sa.Column('id', sa.UUID(), server_default=sa.text('gen_random_uuid()'), nullable=False), + sa.Column('file_id', sa.UUID(), nullable=False), + sa.Column('chunk_index', sa.Integer(), nullable=False), + sa.Column('content_text', sa.Text(), nullable=False), + sa.Column('embedding', pgvector.sqlalchemy.vector.VECTOR(dim=384), nullable=False), + sa.Column('keyword_tokens', postgresql.TSVECTOR(), sa.Computed("to_tsvector('english', content_text)", persisted=True), nullable=True), + sa.ForeignKeyConstraint(['file_id'], ['files.id'], ondelete='CASCADE'), + sa.PrimaryKeyConstraint('id') + ) + op.create_index('content_embedding_idx', 'file_content', ['embedding'], unique=False, postgresql_using='hnsw', postgresql_with={'m': 16, 'ef_construction': 64}, postgresql_ops={'embedding': 'vector_cosine_ops'}) + op.create_index('content_fts_idx', 'file_content', ['keyword_tokens'], unique=False, postgresql_using='gin') + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.drop_index('content_fts_idx', table_name='file_content', postgresql_using='gin') + op.drop_index('content_embedding_idx', table_name='file_content', postgresql_using='hnsw', postgresql_with={'m': 16, 'ef_construction': 64}, postgresql_ops={'embedding': 'vector_cosine_ops'}) + op.drop_table('file_content') + # ### end Alembic commands ### diff --git a/app/database/database.py b/app/database.py similarity index 100% rename from app/database/database.py rename to app/database.py diff --git a/app/database/models.py b/app/database/models.py deleted file mode 100644 index 186210f..0000000 --- a/app/database/models.py +++ /dev/null @@ -1,17 +0,0 @@ -from sqlalchemy import String, Text -from sqlalchemy.orm import Mapped, mapped_column -from pgvector.sqlalchemy import Vector - -from app.database.database import Base - - -class Document(Base): - __tablename__ = "documents" - - id: Mapped[int] = mapped_column(primary_key=True) - - title: Mapped[str] = mapped_column(String(255)) - content: Mapped[str] = mapped_column(Text) - - # pgvector embedding (we'll generate later) - embedding: Mapped[list[float]] = mapped_column(Vector(1536)) \ No newline at end of file diff --git a/app/models/__init__.py b/app/models/__init__.py new file mode 100644 index 0000000..bf82e82 --- /dev/null +++ b/app/models/__init__.py @@ -0,0 +1,3 @@ +from app.models.file import File +from app.models.content import FileContent +from app.models.relationship import FileRelationship \ No newline at end of file diff --git a/app/models/content.py b/app/models/content.py new file mode 100644 index 0000000..0690482 --- /dev/null +++ b/app/models/content.py @@ -0,0 +1,31 @@ +import uuid +from sqlalchemy import Integer, Text, ForeignKey, Index, Computed +from sqlalchemy.dialects.postgresql import UUID, TSVECTOR +from sqlalchemy.orm import Mapped, mapped_column, relationship +from pgvector.sqlalchemy import Vector +from sqlalchemy.sql import func +from app.database import Base + +class FileContent(Base): + __tablename__ = "file_content" + + id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, server_default=func.gen_random_uuid()) + file_id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), ForeignKey("files.id", ondelete="CASCADE")) + + chunk_index: Mapped[int] = mapped_column(Integer, nullable=False) + content_text: Mapped[str] = mapped_column(Text, nullable=False) + + # 384 dimensions for all-MiniLM-L6-v2 + embedding: Mapped[list[float]] = mapped_column(Vector(384)) + + # Generated column for Full Text Search + keyword_tokens = mapped_column(TSVECTOR, Computed("to_tsvector('english', content_text)", persisted=True)) + + # Link back to the parent file + file = relationship("File", back_populates="contents") + + # Indexes from the image + __table_args__ = ( + Index('content_fts_idx', 'keyword_tokens', postgresql_using='gin'), + Index('content_embedding_idx', 'embedding', postgresql_using='hnsw', postgresql_with={'m': 16, 'ef_construction': 64}, postgresql_ops={'embedding': 'vector_cosine_ops'}), + ) \ No newline at end of file diff --git a/app/models/file.py b/app/models/file.py new file mode 100644 index 0000000..cd02ede --- /dev/null +++ b/app/models/file.py @@ -0,0 +1,30 @@ +import uuid +from datetime import datetime +from sqlalchemy import String, Text, DateTime, Index +from sqlalchemy.dialects.postgresql import UUID, ARRAY +from sqlalchemy.sql import func +from sqlalchemy.orm import Mapped, mapped_column, relationship + +from app.database import Base + +class File(Base): + __tablename__ = "files" + + id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, server_default=func.gen_random_uuid()) + file_path: Mapped[str] = mapped_column(Text, unique=True, nullable=False) + file_hash: Mapped[str] = mapped_column(String(64), nullable=False) # SHA-256 + mime_type: Mapped[str] = mapped_column(String(50), nullable=False) + + last_modified: Mapped[datetime] = mapped_column(DateTime(timezone=True), nullable=False) + indexed_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), server_default=func.now()) + + tags: Mapped[list[str]] = mapped_column(ARRAY(Text), server_default='{}') + + # Link to the contents table + contents = relationship("FileContent", back_populates="file", cascade="all, delete-orphan") + + # Indexes from the image + __table_args__ = ( + Index('files_tags_idx', 'tags', postgresql_using='gin'), + Index('files_path_hash_idx', 'file_path', 'file_hash'), + ) \ No newline at end of file diff --git a/app/models/relationship.py b/app/models/relationship.py new file mode 100644 index 0000000..7692777 --- /dev/null +++ b/app/models/relationship.py @@ -0,0 +1,21 @@ +import uuid +from sqlalchemy import Float, String, ForeignKey, UniqueConstraint +from sqlalchemy.dialects.postgresql import UUID +from sqlalchemy.sql import func +from sqlalchemy.orm import Mapped, mapped_column + +from app.database import Base + +class FileRelationship(Base): + __tablename__ = "file_relationships" + + id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, server_default=func.gen_random_uuid()) + source_file_id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), ForeignKey("files.id", ondelete="CASCADE")) + target_file_id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), ForeignKey("files.id", ondelete="CASCADE")) + + similarity_score: Mapped[float] = mapped_column(Float, nullable=False) + relation_type: Mapped[str] = mapped_column(String(50), server_default='semantic_similarity') + + __table_args__ = ( + UniqueConstraint('source_file_id', 'target_file_id', name='uq_file_relationship'), + ) \ No newline at end of file diff --git a/app/schemas/__init__.py b/app/schemas/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/schemas/document.py b/app/schemas/document.py new file mode 100644 index 0000000..8312171 --- /dev/null +++ b/app/schemas/document.py @@ -0,0 +1,11 @@ +from pydantic import BaseModel + + +class DocumentIngest(BaseModel): + file_path: str + raw_text: str +class DocumentSearchResponse(BaseModel): + filename: str + file_path: str + snippet: str + score: float \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index 434e457..767fc1f 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -9,7 +9,7 @@ services: POSTGRES_DB: wheretf ports: - - "5432:5432" + - "5433:5432" volumes: - postgres_data:/var/lib/postgresql/data From f51d5cbcbafe2b150c5e9f57043b36f38a5638a7 Mon Sep 17 00:00:00 2001 From: Maneet Gupta <131141424+RK-NerdyBirdy@users.noreply.github.com> Date: Fri, 12 Jun 2026 20:26:16 +0530 Subject: [PATCH 3/3] Enhance db service with healthcheck and update backend dependency Added healthcheck for the database service and updated backend dependency condition. --- docker-compose.yml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 767fc1f..5b0bee0 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -2,30 +2,30 @@ services: db: image: pgvector/pgvector:pg17 container_name: wheretf-db - environment: POSTGRES_USER: postgres POSTGRES_PASSWORD: postgres POSTGRES_DB: wheretf - ports: - "5433:5432" - volumes: - postgres_data:/var/lib/postgresql/data + healthcheck: + test: ["CMD-SHELL", "pg_isready -U postgres -d wheretf"] + interval: 5s + timeout: 5s + retries: 5 backend: build: . container_name: wheretf-backend - ports: - "8000:8000" - depends_on: - - db - + db: + condition: service_healthy environment: DATABASE_URL: postgresql+psycopg://postgres:postgres@db:5432/wheretf volumes: - postgres_data: \ No newline at end of file + postgres_data: