diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..dda73b3 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,5 @@ +.venv +__pycache__ +.git +.env +alembic/versions \ No newline at end of file diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..ccc76e2 --- /dev/null +++ b/.env.example @@ -0,0 +1,6 @@ +POSTGRES_USER=postgres +POSTGRES_PASSWORD=postgres +POSTGRES_DB=wheretf +POSTGRES_HOST=localhost +POSTGRES_PORT=5432 +DATABASE_URL=postgresql+psycopg://postgres:postgres@localhost:5432/wheretf \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ced36aa --- /dev/null +++ b/.gitignore @@ -0,0 +1,54 @@ +# ====================== +# Virtual Environment +# ====================== +.venv/ +venv/ +ENV/ +env/ + +# ====================== +# Python cache +# ====================== +__pycache__/ +*.pyc +*.pyo +*.pyd + +# ====================== +# Environment variables +# ====================== +.env + +# ====================== +# IDEs +# ====================== +.vscode/ +.idea/ + +# ====================== +# Testing / coverage +# ====================== +.pytest_cache/ +.coverage +htmlcov/ + +# ====================== +# Logs +# ====================== +*.log + +# ====================== +# Alembic +# ====================== +alembic/versions/__pycache__/ + +# ====================== +# Docker +# ====================== +docker-compose.override.yml + +# ====================== +# OS files +# ====================== +.DS_Store +Thumbs.db \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..194e62d --- /dev/null +++ b/Dockerfile @@ -0,0 +1,13 @@ +FROM python:3.13-slim + +WORKDIR /app + +COPY requirements.txt . + +RUN pip install --no-cache-dir -r requirements.txt + +COPY . . 
+ +EXPOSE 8000 + +CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"] \ No newline at end of file diff --git a/alembic.ini b/alembic.ini new file mode 100644 index 0000000..d142807 --- /dev/null +++ b/alembic.ini @@ -0,0 +1,149 @@ +# A generic, single database configuration. + +[alembic] +# path to migration scripts. +# this is typically a path given in POSIX (e.g. forward slashes) +# format, relative to the token %(here)s which refers to the location of this +# ini file +script_location = %(here)s/alembic + +# template used to generate migration file names; The default value is %%(rev)s_%%(slug)s +# Uncomment the line below if you want the files to be prepended with date and time +# see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file +# for all available tokens +# file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s +# Or organize into date-based subdirectories (requires recursive_version_locations = true) +# file_template = %%(year)d/%%(month).2d/%%(day).2d_%%(hour).2d%%(minute).2d_%%(second).2d_%%(rev)s_%%(slug)s + +# sys.path path, will be prepended to sys.path if present. +# defaults to the current working directory. for multiple paths, the path separator +# is defined by "path_separator" below. +prepend_sys_path = . + + +# timezone to use when rendering the date within the migration file +# as well as the filename. +# If specified, requires the tzdata library which can be installed by adding +# `alembic[tz]` to the pip requirements. 
+# string value is passed to ZoneInfo() +# leave blank for localtime +# timezone = + +# max length of characters to apply to the "slug" field +# truncate_slug_length = 40 + +# set to 'true' to run the environment during +# the 'revision' command, regardless of autogenerate +# revision_environment = false + +# set to 'true' to allow .pyc and .pyo files without +# a source .py file to be detected as revisions in the +# versions/ directory +# sourceless = false + +# version location specification; This defaults +# to /versions. When using multiple version +# directories, initial revisions must be specified with --version-path. +# The path separator used here should be the separator specified by "path_separator" +# below. +# version_locations = %(here)s/bar:%(here)s/bat:%(here)s/alembic/versions + +# path_separator; This indicates what character is used to split lists of file +# paths, including version_locations and prepend_sys_path within configparser +# files such as alembic.ini. +# The default rendered in new alembic.ini files is "os", which uses os.pathsep +# to provide os-dependent path splitting. +# +# Note that in order to support legacy alembic.ini files, this default does NOT +# take place if path_separator is not present in alembic.ini. If this +# option is omitted entirely, fallback logic is as follows: +# +# 1. Parsing of the version_locations option falls back to using the legacy +# "version_path_separator" key, which if absent then falls back to the legacy +# behavior of splitting on spaces and/or commas. +# 2. Parsing of the prepend_sys_path option falls back to the legacy +# behavior of splitting on spaces, commas, or colons. +# +# Valid values for path_separator are: +# +# path_separator = : +# path_separator = ; +# path_separator = space +# path_separator = newline +# +# Use os.pathsep. Default configuration used for new projects. 
+path_separator = os + +# set to 'true' to search source files recursively +# in each "version_locations" directory +# new in Alembic version 1.10 +# recursive_version_locations = false + +# the output encoding used when revision files +# are written from script.py.mako +# output_encoding = utf-8 + +# database URL. This is consumed by the user-maintained env.py script only. +# other means of configuring database URLs may be customized within the env.py +# file. +sqlalchemy.url = postgresql+psycopg://postgres:postgres@localhost:5432/wheretf + + +[post_write_hooks] +# post_write_hooks defines scripts or Python functions that are run +# on newly generated revision scripts. See the documentation for further +# detail and examples + +# format using "black" - use the console_scripts runner, against the "black" entrypoint +# hooks = black +# black.type = console_scripts +# black.entrypoint = black +# black.options = -l 79 REVISION_SCRIPT_FILENAME + +# lint with attempts to fix using "ruff" - use the module runner, against the "ruff" module +# hooks = ruff +# ruff.type = module +# ruff.module = ruff +# ruff.options = check --fix REVISION_SCRIPT_FILENAME + +# Alternatively, use the exec runner to execute a binary found on your PATH +# hooks = ruff +# ruff.type = exec +# ruff.executable = ruff +# ruff.options = check --fix REVISION_SCRIPT_FILENAME + +# Logging configuration. This is also consumed by the user-maintained +# env.py script only. 
+[loggers] +keys = root,sqlalchemy,alembic + +[handlers] +keys = console + +[formatters] +keys = generic + +[logger_root] +level = WARNING +handlers = console +qualname = + +[logger_sqlalchemy] +level = WARNING +handlers = +qualname = sqlalchemy.engine + +[logger_alembic] +level = INFO +handlers = +qualname = alembic + +[handler_console] +class = StreamHandler +args = (sys.stderr,) +level = NOTSET +formatter = generic + +[formatter_generic] +format = %(levelname)-5.5s [%(name)s] %(message)s +datefmt = %H:%M:%S diff --git a/alembic/README b/alembic/README new file mode 100644 index 0000000..98e4f9c --- /dev/null +++ b/alembic/README @@ -0,0 +1 @@ +Generic single-database configuration. \ No newline at end of file diff --git a/alembic/env.py b/alembic/env.py new file mode 100644 index 0000000..5d4475a --- /dev/null +++ b/alembic/env.py @@ -0,0 +1,76 @@ +from logging.config import fileConfig +from sqlalchemy import engine_from_config +from sqlalchemy import pool,create_engine +from app.database import Base +import app.models +from alembic import context + +# this is the Alembic Config object, which provides +# access to the values within the .ini file in use. +config = context.config + +# Interpret the config file for Python logging. +# This line sets up loggers basically. +if config.config_file_name is not None: + fileConfig(config.config_file_name) + +# add your model's MetaData object here +# for 'autogenerate' support +# from myapp import mymodel +# target_metadata = mymodel.Base.metadata +target_metadata = Base.metadata + +# other values from the config, defined by the needs of env.py, +# can be acquired: +# my_important_option = config.get_main_option("my_important_option") +# ... etc. + + +def run_migrations_offline() -> None: + """Run migrations in 'offline' mode. + + This configures the context with just a URL + and not an Engine, though an Engine is acceptable + here as well. By skipping the Engine creation + we don't even need a DBAPI to be available. 
def run_migrations_online() -> None:
    """Run migrations in 'online' mode.

    Creates an Engine, opens a connection, and runs the migrations inside a
    transaction on that connection.

    The database URL is resolved in this order:

    1. the ``DATABASE_URL`` environment variable;
    2. the ``sqlalchemy.url`` option of the Alembic ini file.

    Raises:
        RuntimeError: If neither source provides a URL.
    """
    # Local import: this function is the only user of ``os`` in env.py.
    import os

    # Honor DATABASE_URL so migrations target the same database as the
    # application instead of a fixed host/port that is only reachable from
    # one particular environment.
    url = os.getenv("DATABASE_URL") or config.get_main_option("sqlalchemy.url")
    if not url:
        raise RuntimeError(
            "No database URL configured: set DATABASE_URL or sqlalchemy.url "
            "in the Alembic ini file."
        )

    # NullPool: a migration run needs a single connection and should not keep
    # pooled connections around afterwards.
    connectable = create_engine(url, poolclass=pool.NullPool)

    with connectable.connect() as connection:
        context.configure(
            connection=connection,
            target_metadata=target_metadata,
        )

        with context.begin_transaction():
            context.run_migrations()
def upgrade() -> None:
    """Upgrade schema: create the ``file_relationships`` table."""
    # ### commands auto generated by Alembic - please adjust! ###
    op.create_table(
        'file_relationships',
        # Primary key generated by the database.
        sa.Column(
            'id',
            sa.UUID(),
            server_default=sa.text('gen_random_uuid()'),
            nullable=False,
        ),
        # Both ends of the relationship point at rows of ``files``.
        sa.Column('source_file_id', sa.UUID(), nullable=False),
        sa.Column('target_file_id', sa.UUID(), nullable=False),
        sa.Column('similarity_score', sa.Float(), nullable=False),
        sa.Column(
            'relation_type',
            sa.String(length=50),
            server_default='semantic_similarity',
            nullable=False,
        ),
        # Deleting a file removes every relationship that references it.
        sa.ForeignKeyConstraint(
            ['source_file_id'],
            ['files.id'],
            ondelete='CASCADE',
        ),
        sa.ForeignKeyConstraint(
            ['target_file_id'],
            ['files.id'],
            ondelete='CASCADE',
        ),
        sa.PrimaryKeyConstraint('id'),
        # At most one row per ordered (source, target) pair.
        sa.UniqueConstraint(
            'source_file_id',
            'target_file_id',
            name='uq_file_relationship',
        ),
    )
    # ### end Alembic commands ###
def downgrade() -> None:
    """Downgrade schema: drop the ``files`` indexes, then the table."""
    # ### commands auto generated by Alembic - please adjust! ###
    # Indexes are dropped explicitly before the table they belong to.
    op.drop_index(
        'files_tags_idx',
        table_name='files',
        postgresql_using='gin',
    )
    op.drop_index(
        'files_path_hash_idx',
        table_name='files',
    )
    op.drop_table('files')
    # ### end Alembic commands ###
def upgrade() -> None:
    """Upgrade schema: enable pgvector and create the ``file_content`` table.

    Creates the table together with its HNSW index on ``embedding`` and its
    GIN index on the generated ``keyword_tokens`` column.
    """
    # Import the public SQLAlchemy type explicitly. A bare ``import pgvector``
    # is not guaranteed to load the ``pgvector.sqlalchemy`` subpackage, so
    # reaching it by attribute access only works when some other module has
    # already imported it.
    from pgvector.sqlalchemy import Vector

    # The vector column type and the ``vector_cosine_ops`` operator class are
    # provided by the extension, so it must exist before the table is created.
    op.execute("CREATE EXTENSION IF NOT EXISTS vector;")
    # ### commands auto generated by Alembic - please adjust! ###
    op.create_table(
        'file_content',
        sa.Column('id', sa.UUID(), server_default=sa.text('gen_random_uuid()'), nullable=False),
        sa.Column('file_id', sa.UUID(), nullable=False),
        sa.Column('chunk_index', sa.Integer(), nullable=False),
        sa.Column('content_text', sa.Text(), nullable=False),
        sa.Column('embedding', Vector(384), nullable=False),
        # Stored generated column: the database keeps it in sync with content_text.
        sa.Column(
            'keyword_tokens',
            postgresql.TSVECTOR(),
            sa.Computed("to_tsvector('english', content_text)", persisted=True),
            nullable=True,
        ),
        sa.ForeignKeyConstraint(['file_id'], ['files.id'], ondelete='CASCADE'),
        sa.PrimaryKeyConstraint('id'),
    )
    op.create_index(
        'content_embedding_idx',
        'file_content',
        ['embedding'],
        unique=False,
        postgresql_using='hnsw',
        postgresql_with={'m': 16, 'ef_construction': 64},
        postgresql_ops={'embedding': 'vector_cosine_ops'},
    )
    op.create_index(
        'content_fts_idx',
        'file_content',
        ['keyword_tokens'],
        unique=False,
        postgresql_using='gin',
    )
    # ### end Alembic commands ###
# Load variables from a local .env file (if present) into the process
# environment before the connection URL is read below.
load_dotenv()

# Used only when DATABASE_URL is absent from the environment.
_FALLBACK_DATABASE_URL = "postgresql+psycopg://postgres:postgres@db:5432/wheretf"

DATABASE_URL = os.getenv("DATABASE_URL", _FALLBACK_DATABASE_URL)

engine = create_engine(DATABASE_URL)

# Session factory bound to the engine, with autoflush and autocommit disabled.
SessionLocal = sessionmaker(
    autocommit=False,
    autoflush=False,
    bind=engine,
)


class Base(DeclarativeBase):
    """Declarative base class that the ORM models inherit from."""
class FileContent(Base):
    """One text chunk of a file, with its embedding and full-text tokens.

    Rows live in ``file_content`` and reference their parent row in ``files``
    through ``file_id``; the foreign key is declared with ON DELETE CASCADE.
    """

    __tablename__ = "file_content"

    # Primary key, generated by the database.
    id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        primary_key=True,
        server_default=func.gen_random_uuid(),
    )
    file_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("files.id", ondelete="CASCADE"),
    )

    chunk_index: Mapped[int] = mapped_column(Integer, nullable=False)
    content_text: Mapped[str] = mapped_column(Text, nullable=False)

    # 384 dimensions for all-MiniLM-L6-v2
    embedding: Mapped[list[float]] = mapped_column(Vector(384))

    # Stored generated column used for full-text search over content_text.
    keyword_tokens = mapped_column(
        TSVECTOR,
        Computed("to_tsvector('english', content_text)", persisted=True),
    )

    # Parent file; the other side of this relationship is File.contents.
    file = relationship("File", back_populates="contents")

    __table_args__ = (
        # GIN index over the generated tsvector column.
        Index(
            'content_fts_idx',
            'keyword_tokens',
            postgresql_using='gin',
        ),
        # HNSW index over the embedding, using the cosine operator class.
        Index(
            'content_embedding_idx',
            'embedding',
            postgresql_using='hnsw',
            postgresql_with={'m': 16, 'ef_construction': 64},
            postgresql_ops={'embedding': 'vector_cosine_ops'},
        ),
    )
class FileRelationship(Base):
    """Scored, typed link from one row of ``files`` to another.

    Each ordered ``(source_file_id, target_file_id)`` pair may appear at most
    once. Both foreign keys are declared with ON DELETE CASCADE.
    """

    __tablename__ = "file_relationships"

    # Primary key, generated by the database.
    id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        primary_key=True,
        server_default=func.gen_random_uuid(),
    )
    source_file_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("files.id", ondelete="CASCADE"),
    )
    target_file_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("files.id", ondelete="CASCADE"),
    )

    similarity_score: Mapped[float] = mapped_column(Float, nullable=False)
    # The database fills in 'semantic_similarity' when no type is supplied.
    relation_type: Mapped[str] = mapped_column(
        String(50),
        server_default='semantic_similarity',
    )

    __table_args__ = (
        UniqueConstraint(
            'source_file_id',
            'target_file_id',
            name='uq_file_relationship',
        ),
    )
# Request schema: a file path together with the raw text for that file.
class DocumentIngest(BaseModel):
    file_path: str
    raw_text: str


# Response schema for a single search hit.
class DocumentSearchResponse(BaseModel):
    filename: str
    file_path: str
    # NOTE(review): presumably an excerpt of the matched text — confirm with the producer.
    snippet: str
    # NOTE(review): scale and ordering of the score are not defined here — confirm.
    score: float