diff --git a/.assets/advanced-dashboard-webui.png b/.assets/advanced-dashboard-webui.png new file mode 100644 index 00000000..fc700443 Binary files /dev/null and b/.assets/advanced-dashboard-webui.png differ diff --git a/.assets/memory-dashboard.png b/.assets/memory-dashboard.png new file mode 100644 index 00000000..a3c1451b Binary files /dev/null and b/.assets/memory-dashboard.png differ diff --git a/.assets/plan.png b/.assets/plan.png deleted file mode 100644 index 4a1da307..00000000 Binary files a/.assets/plan.png and /dev/null differ diff --git a/.env.template b/.env.template index 083fb254..97495493 100644 --- a/.env.template +++ b/.env.template @@ -105,7 +105,7 @@ PARAKEET_ASR_URL=http://host.docker.internal:8767 # MongoDB configuration MONGODB_URI=mongodb://mongo:${MONGODB_PORT} -MONGODB_K8S_URI=mongodb://mongodb.${INFRASTRUCTURE_NAMESPACE}.svc.cluster.local:27017/friend +MONGODB_K8S_URI=mongodb://mongodb.${INFRASTRUCTURE_NAMESPACE}.svc.cluster.local:27017/friend-lite # Qdrant configuration QDRANT_BASE_URL=qdrant diff --git a/.github/workflows/advanced-docker-compose-build.yml b/.github/workflows/advanced-docker-compose-build.yml new file mode 100644 index 00000000..5acc717e --- /dev/null +++ b/.github/workflows/advanced-docker-compose-build.yml @@ -0,0 +1,248 @@ +name: Build and Deploy Advanced (Docker Compose) + +on: + workflow_dispatch: + inputs: + version: + description: Optional version tag override (e.g. 
v1.2.3) + required: false + push: + branches: [ "main" ] + paths: + - "*" + - "backends/advanced/**" + - "extras/asr-services/**" + - "extras/speaker-recognition/**" + - "extras/openmemory-mcp/**" + - ".github/workflows/advanced-docker-compose-build.yml" + tags: + - "v*" + + +permissions: + contents: read + packages: write + actions: read + +env: + REGISTRY: ghcr.io + +jobs: + build-default: + runs-on: ubuntu-latest + timeout-minutes: 60 + env: + ADVANCED_ENV: ${{ secrets.ADVANCED_ENV }} + RUNNER_FLAVOUR: ubuntu-latest + defaults: + run: + shell: bash + working-directory: backends/advanced + + steps: + - name: Show selected runner + run: echo "Workflow running on ${RUNNER_FLAVOUR} runner" + working-directory: . + + - name: Checkout + uses: actions/checkout@v4 + + - name: Print commit details + run: | + echo "Event: ${{ github.event_name }}" + echo "Ref: $GITHUB_REF" + echo "Ref name: ${{ github.ref_name }}" + echo "Repository: $GITHUB_REPOSITORY" + echo "Actor: $GITHUB_ACTOR" + echo "SHA: $GITHUB_SHA" + echo "Short SHA: ${GITHUB_SHA::7}" + echo "Commit info:" + git log -1 --pretty=format:'Author: %an <%ae>%nDate: %ad%nSubject: %s' || true + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Log in to GHCR + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Copy .env.template to .env + run: | + set -euo pipefail + copy_env() { + local dir="$1" + local template="${dir}/.env.template" + local target="${dir}/.env" + if [ -f "$template" ]; then + echo "Copying $template to $target" + cp "$template" "$target" + else + echo "$template not found; skipping" + fi + } + + copy_env . 
+ copy_env ../../extras/asr-services + copy_env ../../extras/speaker-recognition + copy_env ../../extras/openmemory-mcp + + - name: Create .env from secret (if provided) + if: env.ADVANCED_ENV != '' + run: | + echo "Writing .env from ADVANCED_ENV secret" + printf "%s\n" "${ADVANCED_ENV}" > .env + + - name: Source .env (if present) + run: | + if [ -f .env ]; then + set -a + # shellcheck disable=SC1091 + source .env + set +a + else + echo ".env not found; continuing" + fi + + - name: Determine version + id: version + run: | + if [ -n "${{ github.event.inputs.version }}" ]; then + VERSION="${{ github.event.inputs.version }}" + elif [[ "${GITHUB_REF}" == refs/tags/* ]]; then + VERSION="${GITHUB_REF#refs/tags/}" + else + VERSION="sha-${GITHUB_SHA::7}" + fi + echo "VERSION=$VERSION" >> "$GITHUB_OUTPUT" + + - name: Build, tag, and push services sequentially with version + env: + OWNER: ${{ github.repository_owner }} + VERSION: ${{ steps.version.outputs.VERSION }} + run: | + set -euo pipefail + docker compose version + OWNER_LC=$(echo "$OWNER" | tr '[:upper:]' '[:lower:]') + + # CUDA variants from pyproject.toml + CUDA_VARIANTS=("cpu" "cu121" "cu126" "cu128") + + # Base services (no CUDA variants, no profiles) + base_service_specs=( + "friend-backend|advanced-friend-backend|docker-compose.yml|." + "workers|advanced-workers|docker-compose.yml|." + "webui|advanced-webui|docker-compose.yml|." + "openmemory-mcp|openmemory-mcp|../../extras/openmemory-mcp/docker-compose.yml|../../extras/openmemory-mcp" + ) + + # Build and push base services + for spec in "${base_service_specs[@]}"; do + IFS='|' read -r svc svc_repo compose_file project_dir <<< "$spec" + + echo "::group::Building and pushing $svc_repo" + if [ "$compose_file" = "docker-compose.yml" ] && [ "$project_dir" = "." 
]; then + docker compose build --pull "$svc" + else + docker compose -f "$compose_file" --project-directory "$project_dir" build "$svc" + fi + # Resolve the built image ID via compose (avoids name mismatches) + if [ "$compose_file" = "docker-compose.yml" ] && [ "$project_dir" = "." ]; then + img_id=$(docker compose images -q "$svc" | head -n1) + else + img_id=$(docker compose -f "$compose_file" --project-directory "$project_dir" images -q "$svc" | head -n1) + fi + if [ -z "${img_id:-}" ]; then + echo "Skipping $svc_repo (no built image found after build)" + echo "::endgroup::" + continue + fi + + # Tag and push with version + target_image="$REGISTRY/$OWNER_LC/$svc_repo:$VERSION" + latest_image="$REGISTRY/$OWNER_LC/$svc_repo:latest" + echo "Tagging $img_id as $target_image" + docker tag "$img_id" "$target_image" + echo "Tagging $img_id as $latest_image" + docker tag "$img_id" "$latest_image" + + echo "Pushing $target_image" + docker push "$target_image" + echo "Pushing $latest_image" + docker push "$latest_image" + + # Clean up local tags + docker image rm -f "$target_image" || true + docker image rm -f "$latest_image" || true + echo "::endgroup::" + done + + # Build and push parakeet-asr with CUDA variants (cu121, cu126, cu128) + echo "::group::Building and pushing parakeet-asr CUDA variants" + cd ../../extras/asr-services + for cuda_variant in cu121 cu126 cu128; do + echo "Building parakeet-asr-${cuda_variant}" + export CUDA_VERSION="${cuda_variant}" + docker compose build parakeet-asr + + img_id=$(docker compose images -q parakeet-asr | head -n1) + if [ -n "${img_id:-}" ]; then + target_image="$REGISTRY/$OWNER_LC/parakeet-asr-${cuda_variant}:$VERSION" + latest_image="$REGISTRY/$OWNER_LC/parakeet-asr-${cuda_variant}:latest" + echo "Tagging $img_id as $target_image" + docker tag "$img_id" "$target_image" + echo "Tagging $img_id as $latest_image" + docker tag "$img_id" "$latest_image" + + echo "Pushing $target_image" + docker push "$target_image" + echo "Pushing 
$latest_image" + docker push "$latest_image" + + # Clean up local tags + docker image rm -f "$target_image" || true + docker image rm -f "$latest_image" || true + fi + done + cd - > /dev/null + echo "::endgroup::" + + # Build and push speaker-recognition with all CUDA variants (including CPU) + # Note: speaker-service has profiles, but we can build it directly by setting PYTORCH_CUDA_VERSION + echo "::group::Building and pushing speaker-recognition variants" + cd ../../extras/speaker-recognition + for cuda_variant in "${CUDA_VARIANTS[@]}"; do + echo "Building speaker-recognition-${cuda_variant}" + export PYTORCH_CUDA_VERSION="${cuda_variant}" + # Build speaker-service directly (profiles only affect 'up', not 'build') + docker compose build speaker-service + + img_id=$(docker compose images -q speaker-service | head -n1) + if [ -n "${img_id:-}" ]; then + target_image="$REGISTRY/$OWNER_LC/speaker-recognition-${cuda_variant}:$VERSION" + latest_image="$REGISTRY/$OWNER_LC/speaker-recognition-${cuda_variant}:latest" + echo "Tagging $img_id as $target_image" + docker tag "$img_id" "$target_image" + echo "Tagging $img_id as $latest_image" + docker tag "$img_id" "$latest_image" + + echo "Pushing $target_image" + docker push "$target_image" + echo "Pushing $latest_image" + docker push "$latest_image" + + # Clean up local tags + docker image rm -f "$target_image" || true + docker image rm -f "$latest_image" || true + fi + done + cd - > /dev/null + echo "::endgroup::" + + # Summary + echo "::group::Build Summary" + echo "Built and pushed images with version tag: ${VERSION}" + echo "Images pushed to: $REGISTRY/$OWNER_LC/" + echo "::endgroup::" diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml index 38f1eccb..ce2677b4 100644 --- a/.github/workflows/integration-tests.yml +++ b/.github/workflows/integration-tests.yml @@ -27,7 +27,25 @@ jobs: steps: - name: Checkout code uses: actions/checkout@v4 - + + - name: Verify required secrets + 
env: + DEEPGRAM_API_KEY: ${{ secrets.DEEPGRAM_API_KEY }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + run: | + echo "Verifying required secrets..." + if [ -z "$DEEPGRAM_API_KEY" ]; then + echo "❌ ERROR: DEEPGRAM_API_KEY secret is not set" + exit 1 + fi + if [ -z "$OPENAI_API_KEY" ]; then + echo "❌ ERROR: OPENAI_API_KEY secret is not set" + exit 1 + fi + echo "✓ DEEPGRAM_API_KEY is set (length: ${#DEEPGRAM_API_KEY})" + echo "✓ OPENAI_API_KEY is set (length: ${#OPENAI_API_KEY})" + echo "✓ All required secrets verified" + - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 diff --git a/.github/workflows/jekyll-gh-pages.yml b/.github/workflows/jekyll-gh-pages.yml new file mode 100644 index 00000000..e31d81c5 --- /dev/null +++ b/.github/workflows/jekyll-gh-pages.yml @@ -0,0 +1,51 @@ +# Sample workflow for building and deploying a Jekyll site to GitHub Pages +name: Deploy Jekyll with GitHub Pages dependencies preinstalled + +on: + # Runs on pushes targeting the default branch + push: + branches: ["main"] + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages +permissions: + contents: read + pages: write + id-token: write + +# Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. +# However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. 
+concurrency: + group: "pages" + cancel-in-progress: false + +jobs: + # Build job + build: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Setup Pages + uses: actions/configure-pages@v5 + - name: Build with Jekyll + uses: actions/jekyll-build-pages@v1 + with: + source: ./ + destination: ./_site + - name: Upload artifact + uses: actions/upload-pages-artifact@v3 + + # Deployment job + deploy: + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + runs-on: ubuntu-latest + needs: build + steps: + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v4 diff --git a/.github/workflows/robot-tests.yml b/.github/workflows/robot-tests.yml new file mode 100644 index 00000000..a00b7c1c --- /dev/null +++ b/.github/workflows/robot-tests.yml @@ -0,0 +1,396 @@ +name: Robot Framework Tests + +on: + pull_request: + paths: + - 'tests/**/*.robot' + - 'tests/**/*.py' + - 'backends/advanced/src/**' + - '.github/workflows/robot-tests.yml' + +permissions: + contents: read + pull-requests: write + issues: write + pages: write + id-token: write + +jobs: + robot-tests: + runs-on: ubuntu-latest + timeout-minutes: 30 + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Verify required secrets + env: + DEEPGRAM_API_KEY: ${{ secrets.DEEPGRAM_API_KEY }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + run: | + echo "Verifying required secrets..." 
+ if [ -z "$DEEPGRAM_API_KEY" ]; then + echo "❌ ERROR: DEEPGRAM_API_KEY secret is not set" + exit 1 + fi + if [ -z "$OPENAI_API_KEY" ]; then + echo "❌ ERROR: OPENAI_API_KEY secret is not set" + exit 1 + fi + echo "✓ DEEPGRAM_API_KEY is set (length: ${#DEEPGRAM_API_KEY})" + echo "✓ OPENAI_API_KEY is set (length: ${#OPENAI_API_KEY})" + echo "✓ All required secrets verified" + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + with: + driver-opts: | + image=moby/buildkit:latest + network=host + + - name: Cache Docker layers + uses: actions/cache@v4 + with: + path: /tmp/.buildx-cache + key: ${{ runner.os }}-buildx-${{ hashFiles('backends/advanced/Dockerfile', 'backends/advanced/pyproject.toml') }} + restore-keys: | + ${{ runner.os }}-buildx- + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + cache: 'pip' + + - name: Install Robot Framework and dependencies + run: | + pip install --upgrade pip + pip install robotframework robotframework-requests python-dotenv websockets + + - name: Create test environment file + working-directory: tests/setup + run: | + cat > .env.test << EOF + # API URLs + API_URL=http://localhost:8001 + BACKEND_URL=http://localhost:8001 + FRONTEND_URL=http://localhost:3001 + + # Test Admin Credentials + ADMIN_EMAIL=test-admin@example.com + ADMIN_PASSWORD=test-admin-password-123 + + # API Keys (from GitHub secrets) + OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }} + DEEPGRAM_API_KEY=${{ secrets.DEEPGRAM_API_KEY }} + + # Test Configuration + TEST_TIMEOUT=120 + TEST_DEVICE_NAME=robot-test + EOF + + - name: Start test environment + working-directory: backends/advanced + env: + DEEPGRAM_API_KEY: ${{ secrets.DEEPGRAM_API_KEY }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + LLM_PROVIDER: openai + TRANSCRIPTION_PROVIDER: deepgram + MEMORY_PROVIDER: friend_lite + run: | + # Debug: Check if secrets are available + echo "Checking environment variables..." 
+ echo "DEEPGRAM_API_KEY is set: $([ -n "$DEEPGRAM_API_KEY" ] && echo 'YES' || echo 'NO')" + echo "OPENAI_API_KEY is set: $([ -n "$OPENAI_API_KEY" ] && echo 'YES' || echo 'NO')" + echo "LLM_PROVIDER: $LLM_PROVIDER" + echo "TRANSCRIPTION_PROVIDER: $TRANSCRIPTION_PROVIDER" + + # Create memory_config.yaml from template (file is gitignored) + echo "Creating memory_config.yaml from template..." + cp memory_config.yaml.template memory_config.yaml + + # Clean any existing test containers for fresh start + echo "Cleaning up any existing test containers..." + docker compose -f docker-compose-test.yml down -v || true + + # Start ALL services in parallel - Docker Compose handles dependencies via healthchecks + echo "Starting all services in parallel (docker-compose-test.yml)..." + echo "Note: Using test compose file with source mounts for faster startup" + + # Export API keys so docker-compose can use them + export DEEPGRAM_API_KEY + export OPENAI_API_KEY + export LLM_PROVIDER + export TRANSCRIPTION_PROVIDER + export MEMORY_PROVIDER + + DOCKER_BUILDKIT=0 docker compose -f docker-compose-test.yml up -d + + # Show container status + echo "Container status:" + docker compose -f docker-compose-test.yml ps + + # Single wait for backend readiness (backend depends_on ensures infra is ready); -f makes curl exit non-zero on HTTP 4xx/5xx so an error response is not mistaken for "ready" + echo "Waiting for backend readiness (up to 120s)..." + for i in {1..40}; do + if curl -sf http://localhost:8001/readiness > /dev/null 2>&1; then + echo "✓ Backend is ready (all dependencies satisfied)" + break + fi + # Show logs every 10 attempts to help debug + if [ $((i % 10)) -eq 0 ]; then + echo "Still waiting... showing recent logs:" + docker compose -f docker-compose-test.yml logs --tail=20 friend-backend-test + fi + if [ $i -eq 40 ]; then + echo "✗ Backend failed to start - showing full logs:" + docker compose -f docker-compose-test.yml logs + exit 1 + fi + echo "Attempt $i/40..." + sleep 3 + done + + echo "✓ Backend is ready!" 
+ + # Verify workers are registered with Redis (Robot tests need stable workers) + echo "Waiting for workers to register with Redis (up to 60s)..." + for i in {1..30}; do + WORKER_COUNT=$(docker compose -f docker-compose-test.yml exec -T workers-test uv run python -c 'from rq import Worker; from redis import Redis; import os; r = Redis.from_url(os.getenv("REDIS_URL", "redis://redis-test:6379/0")); print(len(Worker.all(connection=r)))' 2>/dev/null || echo "0") + + if [ "$WORKER_COUNT" -ge 6 ]; then + echo "✓ Found $WORKER_COUNT workers registered" + # Show worker details + docker compose -f docker-compose-test.yml exec -T workers-test uv run python -c 'from rq import Worker; from redis import Redis; import os; r = Redis.from_url(os.getenv("REDIS_URL", "redis://redis-test:6379/0")); workers = Worker.all(connection=r); print(f"Total registered workers: {len(workers)}"); [print(f" - {w.name}: queues={w.queue_names()}, state={w.get_state()}") for w in workers]' + break + fi + + if [ $i -eq 30 ]; then + echo "✗ Workers failed to register after 60s" + echo "Showing worker logs:" + docker compose -f docker-compose-test.yml logs --tail=50 workers-test + exit 1 + fi + + echo "Attempt $i/30: $WORKER_COUNT workers registered (waiting for 6+)..." + sleep 2 + done + + echo "✓ All services ready!" + + - name: Verify checked out code + working-directory: tests + run: | + echo "Current git commit:" + git log -1 --oneline + echo "" + echo "Test files in current checkout:" + find . -name "*.robot" -type f | head -10 + echo "" + echo "Sample of tags in test files:" + grep -h "\[Tags\]" endpoints/*.robot infrastructure/*.robot integration/*.robot 2>/dev/null | head -20 || echo "No tag files found" + + - name: Clean previous test results + working-directory: tests + run: | + echo "Cleaning any previous test results..." 
+ rm -rf results + mkdir -p results + echo "✓ Fresh results directory created" + + - name: Run Robot Framework tests + working-directory: tests + env: + # Required for backend imports in test libraries + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + OPENAI_BASE_URL: https://api.openai.com/v1 + OPENAI_MODEL: gpt-4o-mini + DEEPGRAM_API_KEY: ${{ secrets.DEEPGRAM_API_KEY }} + run: | + # Run all tests (don't fail workflow to allow artifact upload); the step shell runs with -e, so the exit code must be captured with "||" or a failing make would abort before it is recorded + TEST_EXIT_CODE=0 + make all OUTPUTDIR=results || TEST_EXIT_CODE=$? + echo "test_exit_code=$TEST_EXIT_CODE" >> $GITHUB_ENV + exit 0 # Don't fail here, we'll fail at the end after uploading artifacts + + - name: Show service logs + if: always() + working-directory: backends/advanced + run: | + echo "=== Backend Logs (last 50 lines) ===" + docker compose -f docker-compose-test.yml logs --tail=50 friend-backend-test + echo "" + echo "=== Worker Logs (last 50 lines) ===" + docker compose -f docker-compose-test.yml logs --tail=50 workers-test + + - name: Check if test results exist + if: always() + id: check_results + run: | + if [ -f tests/results/output.xml ]; then + echo "results_exist=true" >> $GITHUB_OUTPUT + else + echo "results_exist=false" >> $GITHUB_OUTPUT + echo "⚠️ No test results found in tests/results/" + ls -la tests/results/ || echo "Results directory doesn't exist" + fi + + - name: Upload Robot Framework HTML reports + if: always() && steps.check_results.outputs.results_exist == 'true' + uses: actions/upload-artifact@v4 + with: + name: robot-test-reports-html + path: | + tests/results/report.html + tests/results/log.html + retention-days: 30 + + - name: Publish HTML Report as GitHub Pages artifact + if: always() && steps.check_results.outputs.results_exist == 'true' + uses: actions/upload-pages-artifact@v3 + with: + path: tests/results + + - name: Deploy to GitHub Pages + if: always() && steps.check_results.outputs.results_exist == 'true' + uses: actions/deploy-pages@v4 + id: deployment + + - name: Generate test summary + if: 
always() && steps.check_results.outputs.results_exist == 'true' + id: test_summary + run: | + # Parse test results + python3 << 'PYTHON_SCRIPT' > test_summary.txt + import xml.etree.ElementTree as ET + tree = ET.parse('tests/results/output.xml') + root = tree.getroot() + stats = root.find('.//total/stat') + if stats is not None: + passed = stats.get("pass", "0") + failed = stats.get("fail", "0") + total = int(passed) + int(failed) + print(f"PASSED={passed}") + print(f"FAILED={failed}") + print(f"TOTAL={total}") + PYTHON_SCRIPT + + # Source the variables + source test_summary.txt + + # Set outputs + echo "passed=$PASSED" >> $GITHUB_OUTPUT + echo "failed=$FAILED" >> $GITHUB_OUTPUT + echo "total=$TOTAL" >> $GITHUB_OUTPUT + + - name: Post PR comment with test results + if: always() && steps.check_results.outputs.results_exist == 'true' + uses: actions/github-script@v7 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + script: | + const passed = '${{ steps.test_summary.outputs.passed }}'; + const failed = '${{ steps.test_summary.outputs.failed }}'; + const total = '${{ steps.test_summary.outputs.total }}'; + const runUrl = `https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}`; + const pagesUrl = '${{ steps.deployment.outputs.page_url }}'; + + const status = failed === '0' ? '✅ All tests passed!' : '❌ Some tests failed'; + const emoji = failed === '0' ? 
'🎉' : '⚠️'; + + const comment = `## ${emoji} Robot Framework Test Results + + **Status**: ${status} + + | Metric | Count | + |--------|-------| + | ✅ Passed | ${passed} | + | ❌ Failed | ${failed} | + | 📊 Total | ${total} | + + ### 📊 View Reports + + **GitHub Pages (Live Reports):** + - [📋 Test Report](${pagesUrl}report.html) + - [📝 Detailed Log](${pagesUrl}log.html) + + **Download Artifacts:** + - [robot-test-reports-html](${runUrl}) - HTML reports + - [robot-test-results-xml](${runUrl}) - XML output + + --- + *[View full workflow run](${runUrl})*`; + + github.rest.issues.createComment({ + issue_number: context.issue.number, + owner: context.repo.owner, + repo: context.repo.repo, + body: comment + }); + + - name: Upload Robot Framework XML output + if: always() && steps.check_results.outputs.results_exist == 'true' + uses: actions/upload-artifact@v4 + with: + name: robot-test-results-xml + path: tests/results/output.xml + retention-days: 30 + + - name: Upload logs on failure + if: failure() + uses: actions/upload-artifact@v4 + with: + name: robot-test-logs + path: | + backends/advanced/.env + tests/setup/.env.test + retention-days: 7 + + - name: Display test results summary + if: always() + run: | + if [ -f tests/results/output.xml ]; then + echo "Test results generated successfully" + echo "========================================" + python3 << 'PYTHON_SCRIPT' + import xml.etree.ElementTree as ET + tree = ET.parse('tests/results/output.xml') + root = tree.getroot() + stats = root.find('.//total/stat') + if stats is not None: + passed = stats.get("pass", "0") + failed = stats.get("fail", "0") + print(f'✅ Passed: {passed}') + print(f'❌ Failed: {failed}') + print(f'📊 Total: {int(passed) + int(failed)}') + PYTHON_SCRIPT + echo "========================================" + echo "" + echo "📊 FULL TEST REPORTS AVAILABLE:" + echo " 1. Go to the 'Summary' tab at the top of this page" + echo " 2. Scroll down to 'Artifacts' section" + echo " 3. 
Download 'robot-test-reports-html'" + echo " 4. Extract and open report.html or log.html in your browser" + echo "" + echo "The HTML reports provide:" + echo " - report.html: Executive summary with statistics" + echo " - log.html: Detailed step-by-step execution log" + echo "" + fi + + - name: Cleanup + if: always() + working-directory: backends/advanced + run: | + docker compose -f docker-compose-test.yml down -v + + - name: Fail workflow if tests failed + if: always() + run: | + if [ "${{ env.test_exit_code }}" != "0" ]; then + echo "❌ Tests failed with exit code ${{ env.test_exit_code }}" + exit 1 + else + echo "✅ All tests passed" + fi diff --git a/.github/workflows/speaker-recognition-tests.yml b/.github/workflows/speaker-recognition-tests.yml index f7342848..5768ada7 100644 --- a/.github/workflows/speaker-recognition-tests.yml +++ b/.github/workflows/speaker-recognition-tests.yml @@ -32,7 +32,25 @@ jobs: steps: - name: Checkout code uses: actions/checkout@v4 - + + - name: Verify required secrets + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + DEEPGRAM_API_KEY: ${{ secrets.DEEPGRAM_API_KEY }} + run: | + echo "Verifying required secrets..." 
+ if [ -z "$HF_TOKEN" ]; then + echo "❌ ERROR: HF_TOKEN secret is not set" + exit 1 + fi + if [ -z "$DEEPGRAM_API_KEY" ]; then + echo "❌ ERROR: DEEPGRAM_API_KEY secret is not set" + exit 1 + fi + echo "✓ HF_TOKEN is set (length: ${#HF_TOKEN})" + echo "✓ DEEPGRAM_API_KEY is set (length: ${#DEEPGRAM_API_KEY})" + echo "✓ All required secrets verified" + - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 diff --git a/.gitignore b/.gitignore index 41ac1fea..b2b052b3 100644 --- a/.gitignore +++ b/.gitignore @@ -73,3 +73,12 @@ backends/charts/advanced-backend/env-configmap.yaml extras/openmemory-mcp/data/* .env.backup.* +backends/advanced/nginx.conf +backends/advanced/Caddyfile + +app/ios/Pods +results +log.html +output.xml +report.html +.secrets diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6d699ff5..6ebb6573 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,4 +1,26 @@ repos: + # Local hooks (project-specific checks) + - repo: local + hooks: + # Run Robot Framework endpoint tests before push + - id: robot-framework-tests + name: Robot Framework Tests (Endpoints) + entry: bash -c 'cd tests && make endpoints OUTPUTDIR=.pre-commit-results' + language: system + pass_filenames: false + stages: [push] + verbose: true + + # Clean up test results after hook runs + - id: cleanup-test-results + name: Cleanup Test Results + entry: bash -c 'cd tests && rm -rf .pre-commit-results' + language: system + pass_filenames: false + stages: [push] + always_run: true + + # Code formatting - repo: https://github.com/psf/black rev: 24.4.2 hooks: @@ -9,6 +31,8 @@ repos: hooks: - id: isort files: ^backends/advanced-backend/src/.*\.py$ + + # File hygiene - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.5.0 hooks: diff --git a/ACT_GUIDE.md b/ACT_GUIDE.md new file mode 100644 index 00000000..fcb3ae31 --- /dev/null +++ b/ACT_GUIDE.md @@ -0,0 +1,82 @@ +# Testing GitHub Actions Locally with Act + +## Setup Complete ✓ + +Act is 
installed and configured. Your `.secrets` file is ready (gitignored). + +## Quick Start + +### 1. Dry Run (See what would execute) +```bash +act pull_request -W .github/workflows/robot-tests.yml -n --container-architecture linux/amd64 +``` + +### 2. Run Robot Tests Locally (Full GitHub Actions simulation) +```bash +act pull_request -W .github/workflows/robot-tests.yml \ + --secret-file .secrets \ + --container-architecture linux/amd64 +``` + +### 3. Run with Verbose Output +```bash +act pull_request -W .github/workflows/robot-tests.yml \ + --secret-file .secrets \ + --container-architecture linux/amd64 \ + -v +``` + +### 4. Skip Image Pull (After first run) +```bash +act pull_request -W .github/workflows/robot-tests.yml \ + --secret-file .secrets \ + --container-architecture linux/amd64 \ + --pull=false +``` + +## Important Notes + +- **First run downloads ~20GB Docker image** - be patient +- **M-series Mac**: Always use `--container-architecture linux/amd64` +- **Secrets file**: `.secrets` contains your API keys (gitignored) +- **Resource intensive**: Docker-in-Docker uses significant CPU/RAM +- **Not 100% identical**: Some GitHub-specific features may behave differently + +## Editing Secrets + +```bash +nano .secrets +``` + +Format: +``` +DEEPGRAM_API_KEY=your-key-here +OPENAI_API_KEY=your-key-here +``` + +## Troubleshooting + +### Out of disk space +```bash +# WARNING: removes ALL stopped containers and ALL unused images, networks and build cache - not only act's +docker system prune -a +``` + +### Workflow fails differently than GitHub +- Act uses different runner images +- Some GitHub Actions may not be fully compatible +- Check act logs vs GitHub Actions logs + +### Kill running act job +```bash +# Ctrl+C or: +docker ps | grep act | awk '{print $1}' | xargs docker kill +``` + +## Why Use Act? 
+ +- Test workflows without pushing to GitHub +- Faster iteration during workflow development +- Debug CI-specific issues locally +- Save GitHub Actions minutes + diff --git a/CLAUDE.md b/CLAUDE.md index 8ee8193c..0f579d33 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -4,7 +4,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co ## Project Overview -Friend-Lite is at the core an AI-powered personal system - various devices, incuding but not limited to wearables from OMI can be used for at the very least audio capture, speaker specific transcription, memory extraction and retriaval. +Friend-Lite is at the core an AI-powered personal system - various devices, including but not limited to wearables from OMI can be used for at the very least audio capture, speaker specific transcription, memory extraction and retrieval. On top of that - it is being designed to support other services, that can help a user with these inputs such as reminders, action items, personal diagnosis etc. This supports a comprehensive web dashboard for management. 
@@ -74,36 +74,40 @@ cp .env.template .env # Configure API keys # Manual test execution (for debugging) source .env && export DEEPGRAM_API_KEY && export OPENAI_API_KEY uv run pytest tests/test_integration.py::test_full_pipeline_integration -v -s -``` - -#### Speaker Recognition Tests -```bash -cd extras/speaker-recognition -# Requires .env file with HF_TOKEN and DEEPGRAM_API_KEY -cp .env.template .env # Configure tokens +# Leave test containers running for debugging (don't auto-cleanup) +source .env && export DEEPGRAM_API_KEY && export OPENAI_API_KEY +CLEANUP_CONTAINERS=false uv run pytest tests/test_integration.py::test_full_pipeline_integration -v -s -# Run speaker recognition test suite -./run-test.sh +# Manual cleanup when needed +docker compose -f docker-compose-test.yml down -v ``` +#### Test Configuration Flags +- **CLEANUP_CONTAINERS** (default: true): Automatically stop and remove test containers after test completion + - Set to `false` for debugging: `CLEANUP_CONTAINERS=false ./run-test.sh` +- **REBUILD** (default: true): Force rebuild containers with latest code changes +- **FRESH_RUN** (default: true): Start with clean database and fresh containers +- **TRANSCRIPTION_PROVIDER** (default: deepgram): Choose transcription provider (deepgram or parakeet) + +#### Test Environment Variables +Tests use an isolated test environment with overridden credentials: +- **Test Database**: `test_db` (MongoDB on port 27018, separate from production) +- **Test Ports**: Backend (8001), Qdrant (6337/6338), WebUI (3001) +- **Test Credentials**: + - `AUTH_SECRET_KEY`: test-jwt-signing-key-for-integration-tests + - `ADMIN_EMAIL`: test-admin@example.com + - `ADMIN_PASSWORD`: test-admin-password-123 +- **API Keys**: Loaded from `.env` file (DEEPGRAM_API_KEY, OPENAI_API_KEY) +- **Test Settings**: `DISABLE_SPEAKER_RECOGNITION=true` to prevent segment duplication + #### Test Script Features - **Environment Compatibility**: Works with both local .env files and CI environment variables 
-- **Simplified Configuration**: Uses environment variables directly, no temporary .env.test files -- **Docker Cleanup**: Uses lightweight Alpine container for reliable permission-free cleanup -- **Automatic Cleanup**: Stops and removes test containers after execution +- **Isolated Test Environment**: Separate ports and database prevent conflicts with running services +- **Automatic Cleanup**: Configurable via CLEANUP_CONTAINERS flag (default: true) - **Colored Output**: Clear progress indicators and error reporting - **Timeout Protection**: 15-minute timeout for advanced backend, 30-minute for speaker recognition -- **Fresh Testing**: Uses CACHED_MODE=False for clean test environments - -#### Debugging Integration Tests -For advanced debugging, you can still use the cached mode approach: - -1. **Edit tests/test_integration.py**: Set CACHED_MODE = True -2. **Run test manually**: `uv run pytest tests/test_integration.py -v -s --tb=short` -3. **Debug containers**: `docker logs advanced-backend-friend-backend-test-1 --tail=100` -4. **Test endpoints**: `curl -X GET http://localhost:8001/health` -5. 
**Clean up**: `docker compose -f docker-compose-test.yml down -v` +- **Fresh Testing**: Clean database and containers for each test run ### Mobile App Development ```bash @@ -122,7 +126,7 @@ npm run web ```bash # ASR Services cd extras/asr-services -docker compose up parakeet # Offline ASR with Parakeet +docker compose up parakeet-asr # Offline ASR with Parakeet # Speaker Recognition (with tests) cd extras/speaker-recognition @@ -136,17 +140,11 @@ docker compose up --build ## Architecture Overview -### Core Structure -- **backends/advanced-backend/**: Primary FastAPI backend with real-time audio processing - - `src/main.py`: Central FastAPI application with WebSocket audio streaming - - `src/auth.py`: Email-based authentication with JWT tokens - - `src/memory/`: LLM-powered conversation memory system using mem0 - - `webui/`: React-based web dashboard for conversation and user management - ### Key Components - **Audio Pipeline**: Real-time Opus/PCM → Application-level processing → Deepgram/Mistral transcription → memory extraction - **Wyoming Protocol**: WebSocket communication uses Wyoming protocol (JSONL + binary) for structured audio sessions -- **Application-Level Processing**: Centralized processors for audio, transcription, memory, and cropping +- **Unified Pipeline**: Job-based tracking system for all audio processing (WebSocket and file uploads) +- **Job Tracker**: Tracks pipeline jobs with stage events (audio → transcription → memory) and completion status - **Task Management**: BackgroundTaskManager tracks all async tasks to prevent orphaned processes - **Unified Transcription**: Deepgram/Mistral transcription with fallback to offline ASR services - **Memory System**: Pluggable providers (Friend-Lite native or OpenMemory MCP) @@ -203,105 +201,6 @@ Optional: - Efficient storage utilization for speech-only content - Automatic quality filtering without manual intervention -### Versioned Transcript and Memory System - -**Version Architecture**: -- 
**`transcript_versions`**: Array of transcript processing attempts with timestamps and providers -- **`memory_versions`**: Array of memory extraction attempts with different models/prompts -- **`active_transcript_version`**: Pointer to currently displayed transcript -- **`active_memory_version`**: Pointer to currently active memory extraction - -**Reprocessing Capabilities**: -- **Transcript Reprocessing**: Re-run speech-to-text with different providers or settings -- **Memory Reprocessing**: Re-extract memories using different LLM models or prompts -- **Version Management**: Switch between different processing results -- **Backward Compatibility**: Legacy fields auto-populated from active versions - -**Data Consistency**: -- All reprocessing operations use `conversation_id` (not `audio_uuid`) -- DateTime objects stored as ISO strings for MongoDB/JSON compatibility -- Legacy field support ensures existing integrations continue working - -### Database Schema Details - -**Collections Overview**: -- **`audio_chunks`**: All audio sessions by `audio_uuid` (always created) -- **`conversations`**: Speech-detected conversations by `conversation_id` (created conditionally) -- **`users`**: User accounts and authentication data - -**Speech-Driven Schema**: -```javascript -// audio_chunks collection (always created) -{ - "_id": ObjectId, - "audio_uuid": "uuid", // Primary identifier - "user_id": ObjectId, - "client_id": "user_suffix-device_name", - "audio_file_path": "/path/to/audio.wav", - "created_at": ISODate, - "transcript": "fallback transcript", // For non-speech audio - "segments": [...], // Speaker segments - "has_speech": boolean, // Speech detection result - "speech_analysis": {...}, // Detection metadata - "conversation_id": "conv_id" | null // Link to conversations collection -} - -// conversations collection (speech-detected only) -{ - "_id": ObjectId, - "conversation_id": "conv_uuid", // Primary identifier for user-facing operations - "audio_uuid": "audio_uuid", 
// Link to audio_chunks - "user_id": ObjectId, - "client_id": "user_suffix-device_name", - "created_at": ISODate, - - // Versioned Transcript System - "transcript_versions": [ - { - "version_id": "uuid", - "transcript": "text content", - "segments": [...], // Speaker diarization - "provider": "deepgram|mistral|parakeet", - "model": "nova-3|voxtral-mini-2507", - "created_at": ISODate, - "processing_time_seconds": 12.5, - "metadata": {...} - } - ], - "active_transcript_version": "uuid", // Points to current version - - // Versioned Memory System - "memory_versions": [ - { - "version_id": "uuid", - "memory_count": 5, - "transcript_version_id": "uuid", // Which transcript was used - "provider": "friend_lite|openmemory_mcp", - "model": "gpt-4o-mini|ollama-llama3", - "created_at": ISODate, - "processing_time_seconds": 45.2, - "metadata": {...} - } - ], - "active_memory_version": "uuid", // Points to current version - - // Legacy Fields (auto-populated from active versions) - "transcript": "text", // From active_transcript_version - "segments": [...], // From active_transcript_version - "memories": [...], // From active_memory_version - "memory_count": 5 // From active_memory_version -} -``` - -**Key Architecture Benefits**: -- **Clean Separation**: Raw audio storage vs user-facing conversations -- **Speech Filtering**: Only meaningful conversations appear in UI -- **Version History**: Complete audit trail of processing attempts -- **Backward Compatibility**: Legacy fields ensure existing code works -- **Reprocessing Support**: Easy to re-run with different providers/models -- **Service Decoupling**: Conversation creation independent of memory processing -- **Error Isolation**: Memory service failures don't affect conversation storage - ## Authentication & Security - **User System**: Email-based authentication with MongoDB ObjectId user IDs @@ -330,7 +229,7 @@ DEEPGRAM_API_KEY=your-deepgram-key-here # Optional: PARAKEET_ASR_URL=http://host.docker.internal:8767 # Optional: 
TRANSCRIPTION_PROVIDER=deepgram -# Memory Provider (New) +# Memory Provider MEMORY_PROVIDER=friend_lite # or openmemory_mcp # Database @@ -347,7 +246,7 @@ CORS_ORIGINS=http://localhost:3000,http://localhost:5173 ### Memory Provider Configuration -Friend-Lite now supports two pluggable memory backends: +Friend-Lite supports two pluggable memory backends: #### Friend-Lite Memory Provider (Default) ```bash @@ -378,81 +277,6 @@ OPENMEMORY_TIMEOUT=30 OPENAI_API_KEY=your-openai-key-here ``` -#### OpenMemory MCP Interface Patterns - -**Important**: OpenMemory MCP stores memories **per-app**, not globally. Understanding this architecture is critical for proper integration. - -**App-Based Storage Architecture:** -- All memories are stored under specific "apps" (namespaces) -- Generic endpoints (`/api/v1/memories/`) return empty results -- App-specific endpoints (`/api/v1/apps/{app_id}/memories`) contain the actual memories - -**Hardcoded Values and Configuration:** -```bash -# Default app name (configurable via OPENMEMORY_CLIENT_NAME) -Default: "friend_lite" - -# Hardcoded metadata (NOT configurable) -"source": "friend_lite" # Always hardcoded in Friend-Lite - -# User ID for OpenMemory MCP server -OPENMEMORY_USER_ID=openmemory # Configurable -``` - -**API Interface Pattern:** -```python -# 1. App Discovery - Find app by client_name -GET /api/v1/apps/ -# Response: {"apps": [{"id": "uuid", "name": "friend_lite", ...}]} - -# 2. Memory Creation - Uses generic endpoint but assigns to app -POST /api/v1/memories/ -{ - "user_id": "openmemory", - "text": "memory content", - "app": "friend_lite", # Uses OPENMEMORY_CLIENT_NAME - "metadata": { - "source": "friend_lite", # Hardcoded - "client": "friend_lite" # Uses OPENMEMORY_CLIENT_NAME - } -} - -# 3. Memory Retrieval - Must use app-specific endpoint -GET /api/v1/apps/{app_id}/memories?user_id=openmemory&page=1&size=10 - -# 4. 
Memory Search - Must use app-specific endpoint with search_query -GET /api/v1/apps/{app_id}/memories?user_id=openmemory&search_query=keyword&page=1&size=10 -``` - -**Friend-Lite Integration Flow:** -1. **App Discovery**: Query `/api/v1/apps/` to find app matching `OPENMEMORY_CLIENT_NAME` -2. **Fallback**: If client app not found, use first available app -3. **Operations**: All memory operations use the app-specific endpoints with discovered `app_id` - -**Testing OpenMemory MCP Integration:** -```bash -# Configure .env file with OpenMemory MCP settings -cp .env.template .env -# Edit .env to set MEMORY_PROVIDER=openmemory_mcp and configure OPENMEMORY_* variables - -# Start OpenMemory MCP server -cd extras/openmemory-mcp && docker compose up -d - -# Run integration tests (reads configuration from .env file) -cd backends/advanced && ./run-test.sh - -# Manual testing - Check app structure -curl -s "http://localhost:8765/api/v1/apps/" | jq - -# Test memory creation -curl -X POST "http://localhost:8765/api/v1/memories/" \ - -H "Content-Type: application/json" \ - -d '{"user_id": "openmemory", "text": "test memory", "app": "friend_lite"}' - -# Retrieve memories (replace app_id with actual ID from apps endpoint) -curl -s "http://localhost:8765/api/v1/apps/{app_id}/memories?user_id=openmemory" | jq -``` - ### Transcription Provider Configuration Friend-Lite supports multiple transcription services: @@ -480,295 +304,40 @@ OLLAMA_BASE_URL=http://ollama:11434 SPEAKER_SERVICE_URL=http://speaker-recognition:8085 ``` -## Transcription Architecture - -### Provider System -Friend-Lite supports multiple transcription providers: - -**Online Providers (API-based):** -- **Deepgram**: High-quality transcription using Nova-3 model with real-time streaming -- **Mistral**: Voxtral models for transcription with REST API processing - -**Offline Providers (Local processing):** -- **Parakeet**: Local speech recognition service available in extras/asr-services - -**Provider Interface:** -The 
transcription system handles: -- Connection management and health checks -- Audio format handling (streaming vs batch) -- Error handling and reconnection -- Unified transcript format normalization - -## Wyoming Protocol Implementation - -### Overview -The system uses Wyoming protocol for WebSocket communication between mobile apps and backends. Wyoming is a peer-to-peer protocol for voice assistants that combines JSONL headers with binary audio payloads. - -### Protocol Format -``` -{JSON_HEADER}\n - -``` - -### Supported Events - -#### Audio Session Events -- **audio-start**: Signals the beginning of an audio recording session - ```json - {"type": "audio-start", "data": {"rate": 16000, "width": 2, "channels": 1}, "payload_length": null} - ``` - -- **audio-chunk**: Contains raw audio data with format metadata - ```json - {"type": "audio-chunk", "data": {"rate": 16000, "width": 2, "channels": 1}, "payload_length": 320} - <320 bytes of PCM/Opus audio data> - ``` - -- **audio-stop**: Signals the end of an audio recording session - ```json - {"type": "audio-stop", "data": {"timestamp": 1234567890}, "payload_length": null} - ``` - -### Backend Implementation - -#### Advanced Backend (`/ws_pcm`) -- **Full Wyoming Protocol Support**: Parses all Wyoming events for session management -- **Session Tracking**: Only processes audio chunks when session is active (after audio-start) -- **Conversation Boundaries**: Uses audio-start/stop events to define conversation segments -- **Backward Compatibility**: Fallback to raw binary audio for older clients - -#### Simple Backend (`/ws`) -- **Minimal Wyoming Support**: Parses audio-chunk events, ignores others -- **Opus Processing**: Handles Opus-encoded audio chunks from Wyoming protocol -- **Graceful Degradation**: Falls back to raw Opus packets for compatibility - -### Mobile App Integration - -Mobile apps should implement Wyoming protocol for proper session management: - -```javascript -// Start audio session -const audioStart = { 
- type: "audio-start", - data: { rate: 16000, width: 2, channels: 1 }, - payload_length: null -}; -websocket.send(JSON.stringify(audioStart) + '\n'); - -// Send audio chunks -const audioChunk = { - type: "audio-chunk", - data: { rate: 16000, width: 2, channels: 1 }, - payload_length: audioData.byteLength -}; -websocket.send(JSON.stringify(audioChunk) + '\n'); -websocket.send(audioData); - -// End audio session -const audioStop = { - type: "audio-stop", - data: { timestamp: Date.now() }, - payload_length: null -}; -websocket.send(JSON.stringify(audioStop) + '\n'); -``` +## Quick API Reference -### Benefits -- **Clear Session Boundaries**: No timeout-based conversation detection needed -- **Structured Communication**: Consistent protocol across all audio streaming -- **Future Extensibility**: Room for additional event types (pause, resume, metadata) -- **Backward Compatibility**: Works with existing raw audio streaming clients - -## Memory System Architecture - -### Overview -Friend-Lite supports two pluggable memory backends that can be selected via configuration: - -#### 1. Friend-Lite Memory Provider (`friend_lite`) -The sophisticated in-house memory implementation with full control and customization: - -**Features:** -- Custom LLM-powered memory extraction with enhanced prompts -- Individual fact storage (no JSON blobs) -- Smart deduplication algorithms -- Intelligent memory updates (ADD/UPDATE/DELETE decisions) -- **Semantic search** with relevance threshold filtering -- **Memory count API** with total count tracking from native Qdrant -- Direct Qdrant vector storage with accurate similarity scoring -- Custom memory prompts and processing -- No external dependencies - -**Architecture Flow:** -1. **Audio Input** → Transcription via Deepgram/Parakeet -2. **Memory Extraction** → LLM processes transcript using custom prompts -3. **Fact Parsing** → XML/JSON parsing into individual memory entries -4. **Deduplication** → Smart algorithms prevent duplicate memories -5. 
**Vector Storage** → Direct Qdrant storage with embeddings -6. **Memory Updates** → LLM-driven action proposals (ADD/UPDATE/DELETE) - -#### 2. OpenMemory MCP Provider (`openmemory_mcp`) -Thin client that delegates all memory processing to external OpenMemory MCP server: - -**Features:** -- Professional memory extraction (handled by OpenMemory) -- Battle-tested deduplication (handled by OpenMemory) -- Semantic vector search (handled by OpenMemory) -- ACL-based user isolation (handled by OpenMemory) -- Cross-client compatibility (Claude Desktop, Cursor, Windsurf) -- Web UI for memory management at http://localhost:8765 - -**Architecture Flow:** -1. **Audio Input** → Transcription via Deepgram/Parakeet -2. **MCP Delegation** → Send enriched transcript to OpenMemory MCP server -3. **External Processing** → OpenMemory handles extraction, deduplication, storage -4. **Result Mapping** → Convert MCP results to Friend-Lite MemoryEntry format -5. **Client Management** → Automatic user context switching via MCP client - -### Memory Provider Comparison - -| Feature | Friend-Lite | OpenMemory MCP | -|---------|-------------|----------------| -| **Processing** | Custom LLM extraction | Delegates to OpenMemory | -| **Deduplication** | Custom algorithms | OpenMemory handles | -| **Vector Storage** | Direct Qdrant | OpenMemory handles | -| **Search Features** | Semantic search with threshold filtering | Semantic search with relevance scoring | -| **Memory Count** | Native Qdrant count API | Varies by OpenMemory support | -| **Dependencies** | Qdrant + MongoDB | External OpenMemory server | -| **Customization** | Full control | Limited to OpenMemory features | -| **Cross-client** | Friend-Lite only | Works with Claude Desktop, Cursor, etc | -| **Web UI** | Friend-Lite WebUI with advanced search | OpenMemory UI + Friend-Lite WebUI | -| **Memory Format** | Individual facts | OpenMemory format | -| **Setup Complexity** | Medium | High (external server required) | - -### Switching 
Memory Providers - -You can switch providers by changing the `MEMORY_PROVIDER` environment variable: +### Common Endpoints +- **GET /health**: Basic application health check +- **GET /readiness**: Service dependency validation +- **WS /ws_pcm**: Primary audio streaming endpoint (Wyoming protocol + raw PCM fallback) +- **GET /api/conversations**: User's conversations with transcripts +- **GET /api/memories/search**: Semantic memory search with relevance scoring +- **POST /auth/jwt/login**: Email-based login (returns JWT token) +### Authentication Flow ```bash -# Switch to OpenMemory MCP -echo "MEMORY_PROVIDER=openmemory_mcp" >> .env +# 1. Get auth token +curl -s -X POST \ + -H "Content-Type: application/x-www-form-urlencoded" \ + -d "username=admin@example.com&password=your-password-here" \ + http://localhost:8000/auth/jwt/login -# Switch back to Friend-Lite -echo "MEMORY_PROVIDER=friend_lite" >> .env +# 2. Use token in API calls +curl -s -H "Authorization: Bearer YOUR_TOKEN" \ + http://localhost:8000/api/conversations ``` -**Note:** Existing memories are not automatically migrated between providers. Each provider maintains its own memory storage. - -### OpenMemory MCP Setup - -To use the OpenMemory MCP provider: - +### Development Reset Commands ```bash -# 1. Start external OpenMemory MCP server -cd extras/openmemory-mcp -docker compose up -d - -# 2. Configure Friend-Lite to use OpenMemory MCP +# Reset all data (development only) cd backends/advanced -echo "MEMORY_PROVIDER=openmemory_mcp" >> .env +sudo rm -rf data/ -# 3. 
Start Friend-Lite backend +# Reset Docker volumes +docker compose down -v docker compose up --build -d ``` -### When to Use Each Provider - -**Use Friend-Lite when:** -- You want full control over memory processing -- You need custom memory extraction logic -- You prefer fewer external dependencies -- You want to customize memory prompts and algorithms -- You need individual fact-based memory storage - -**Use OpenMemory MCP when:** -- You want professional, battle-tested memory processing -- You need cross-client compatibility (Claude Desktop, Cursor, etc.) -- You prefer to leverage external expertise rather than maintain custom logic -- You want access to OpenMemory's web interface -- You're already using OpenMemory in other tools - -## Versioned Processing System - -### Overview - -Friend-Lite implements a comprehensive versioning system for both transcript and memory processing, allowing multiple processing attempts with different providers, models, or settings while maintaining a clean user experience. - -### Version Data Structure - -**Transcript Versions**: -```json -{ - "transcript_versions": [ - { - "version_id": "uuid", - "transcript": "processed text", - "segments": [...], - "provider": "deepgram|mistral|parakeet", - "model": "nova-3|voxtral-mini-2507", - "created_at": "2025-01-15T10:30:00Z", - "processing_time_seconds": 12.5, - "metadata": { - "confidence_scores": [...], - "speaker_diarization": true - } - } - ], - "active_transcript_version": "uuid" -} -``` - -**Memory Versions**: -```json -{ - "memory_versions": [ - { - "version_id": "uuid", - "memory_count": 5, - "transcript_version_id": "uuid", - "provider": "friend_lite|openmemory_mcp", - "model": "gpt-4o-mini|ollama-llama3", - "created_at": "2025-01-15T10:32:00Z", - "processing_time_seconds": 45.2, - "metadata": { - "prompt_version": "v2.1", - "extraction_quality": "high" - } - } - ], - "active_memory_version": "uuid" -} -``` - -### Reprocessing Workflows - -**Transcript Reprocessing**: -1. 
Trigger via API: `POST /api/conversations/{conversation_id}/reprocess-transcript` -2. System creates new transcript version with different provider/model -3. New version added to `transcript_versions` array -4. User can activate any version via `activate-transcript` endpoint -5. Legacy `transcript` field automatically updated from active version - -**Memory Reprocessing**: -1. Trigger via API: `POST /api/conversations/{conversation_id}/reprocess-memory` -2. Specify which transcript version to use as input -3. System creates new memory version using specified transcript -4. New version added to `memory_versions` array -5. User can activate any version via `activate-memory` endpoint -6. Legacy `memories` field automatically updated from active version - -### Legacy Field Compatibility - -**Automatic Population**: -- `transcript`: Auto-populated from active transcript version -- `segments`: Auto-populated from active transcript version -- `memories`: Auto-populated from active memory version -- `memory_count`: Auto-populated from active memory version - -**Backward Compatibility**: -- Existing API clients continue working without modification -- WebUI displays active versions by default -- Advanced users can access version history and switch between versions - ## Development Notes ### Package Management @@ -822,418 +391,40 @@ The system includes comprehensive health checks: ### Cursor Rule Integration Project includes `.cursor/rules/always-plan-first.mdc` requiring understanding before coding. Always explain the task and confirm approach before implementation. - -## API Reference - -### Health & Status Endpoints -- **GET /health**: Basic application health check -- **GET /readiness**: Service dependency validation (MongoDB, Qdrant, etc.) 
-- **GET /api/metrics**: System metrics and debug tracker status (Admin only) -- **GET /api/processor/status**: Processor queue status and health (Admin only) -- **GET /api/processor/tasks**: All active processing tasks (Admin only) -- **GET /api/processor/tasks/{client_id}**: Processing task status for specific client (Admin only) - -### WebSocket Endpoints -- **WS /ws_pcm**: Primary audio streaming endpoint (Wyoming protocol + raw PCM fallback) -- **WS /ws**: Simple audio streaming endpoint (Opus packets + Wyoming audio-chunk events) - -### Memory & Conversation Debugging -- **GET /api/admin/memories**: All memories across all users with debug stats (Admin only) -- **GET /api/memories/unfiltered**: User's memories without filtering -- **GET /api/memories/search**: Semantic memory search with relevance scoring -- **GET /api/conversations**: User's conversations with transcripts -- **GET /api/conversations/{conversation_id}**: Specific conversation details -- **POST /api/conversations/{conversation_id}/reprocess-transcript**: Re-run transcript processing -- **POST /api/conversations/{conversation_id}/reprocess-memory**: Re-extract memories with different parameters -- **GET /api/conversations/{conversation_id}/versions**: Get all transcript and memory versions -- **POST /api/conversations/{conversation_id}/activate-transcript**: Switch to a different transcript version -- **POST /api/conversations/{conversation_id}/activate-memory**: Switch to a different memory version - -### Client Management -- **GET /api/clients/active**: Currently active WebSocket clients -- **GET /api/users**: List all users (Admin only) - -### File Processing -- **POST /api/process-audio-files**: Upload and process audio files (Admin only) - - Note: Processes files sequentially, may timeout for large files - - Client timeout: 5 minutes, Server processing: up to 3x audio duration + 60s - - Example usage: - ```bash - # Step 1: Read .env file for ADMIN_EMAIL and ADMIN_PASSWORD - # Step 2: Get 
auth token - # Step 3: Use token in file upload - curl -X POST \ - -H "Authorization: Bearer YOUR_TOKEN_HERE" \ - -F "files=@/path/to/audio.wav" \ - -F "device_name=test-upload" \ - http://localhost:8000/api/process-audio-files - ``` - -### Authentication -- **POST /auth/jwt/login**: Email-based login (returns JWT token) -- **GET /users/me**: Get current authenticated user -- **GET /api/auth/config**: Authentication configuration - -### Step-by-Step API Testing Guide - -When testing API endpoints that require authentication, follow these steps: - -#### Step 1: Read credentials from .env file -```bash -# Use the Read tool to view the .env file and identify credentials -# Look for: -# ADMIN_EMAIL=admin@example.com -# ADMIN_PASSWORD=your-password-here -``` - -#### Step 2: Get authentication token -```bash -curl -s -X POST \ - -H "Content-Type: application/x-www-form-urlencoded" \ - -d "username=admin@example.com&password=your-password-here" \ - http://localhost:8000/auth/jwt/login -``` -This returns: -```json -{"access_token":"eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9...","token_type":"bearer"} -``` - -#### Step 3: Use the token in API calls -```bash -# Extract the token from the response above and use it: -curl -s -H "Authorization: Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9..." \ - http://localhost:8000/api/conversations - -# For reprocessing endpoints: -curl -s -X POST \ - -H "Authorization: Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9..." \ - -H "Content-Type: application/json" \ - http://localhost:8000/api/conversations/{conversation_id}/reprocess-transcript -``` - -**Important**: Always read the .env file first using the Read tool rather than using shell commands like `grep` or `cut`. This ensures you see the exact values and can copy them accurately. 
- -#### Step 4: Testing Reprocessing Endpoints -Once you have the auth token, you can test the reprocessing functionality: - -```bash -# Get list of conversations to find a conversation_id -curl -s -H "Authorization: Bearer YOUR_TOKEN" \ - http://localhost:8000/api/conversations - -# Test transcript reprocessing (uses conversation_id) -curl -s -X POST \ - -H "Authorization: Bearer YOUR_TOKEN" \ - -H "Content-Type: application/json" \ - http://localhost:8000/api/conversations/YOUR_CONVERSATION_ID/reprocess-transcript - -# Test memory reprocessing (uses conversation_id and transcript_version_id) -curl -s -X POST \ - -H "Authorization: Bearer YOUR_TOKEN" \ - -H "Content-Type: application/json" \ - -d '{"transcript_version_id": "VERSION_ID"}' \ - http://localhost:8000/api/conversations/YOUR_CONVERSATION_ID/reprocess-memory - -# Get transcript and memory versions -curl -s -H "Authorization: Bearer YOUR_TOKEN" \ - http://localhost:8000/api/conversations/YOUR_CONVERSATION_ID/versions - -# Activate a specific transcript version -curl -s -X POST \ - -H "Authorization: Bearer YOUR_TOKEN" \ - -H "Content-Type: application/json" \ - -d '{"transcript_version_id": "VERSION_ID"}' \ - http://localhost:8000/api/conversations/YOUR_CONVERSATION_ID/activate-transcript - -# Activate a specific memory version -curl -s -X POST \ - -H "Authorization: Bearer YOUR_TOKEN" \ - -H "Content-Type: application/json" \ - -d '{"memory_version_id": "VERSION_ID"}' \ - http://localhost:8000/api/conversations/YOUR_CONVERSATION_ID/activate-memory -``` - -### Development Reset Endpoints -Useful endpoints for resetting state during development: - -#### Data Cleanup -- **DELETE /api/admin/memory/delete-all**: Delete all memories for the current user -- **DELETE /api/memories/{memory_id}**: Delete a specific memory -- **DELETE /api/conversations/{conversation_id}**: Delete a specific conversation (keeps original audio file in audio_chunks) -- **DELETE /api/chat/sessions/{session_id}**: Delete a chat session 
and all its messages -- **DELETE /api/users/{user_id}**: Delete a user (Admin only) - - Optional query params: `delete_conversations=true`, `delete_memories=true` - -#### Quick Reset Commands -```bash -# Reset all data (development only) -cd backends/advanced -sudo rm -rf data/ - -# Reset Docker volumes -docker compose down -v -docker compose up --build -d -``` - - -## Speaker Recognition Service Features - -### Speaker Analysis & Visualization -The speaker recognition service now includes advanced analysis capabilities: - -#### Embedding Analysis (/speakers/analysis endpoint) -- **2D/3D Visualization**: Interactive embedding plots using UMAP, t-SNE, or PCA -- **Clustering Analysis**: Automatic clustering using DBSCAN or K-means -- **Speaker Similarity Detection**: Identifies speakers with similar embeddings -- **Quality Metrics**: Embedding separation quality and confidence scores -- **Interactive Controls**: Adjustable analysis parameters and visualization options - -Access via: `extras/speaker-recognition/webui` → Speakers → Embedding Analysis tab - -#### Live Inference Feature (/infer-live page) -Real-time speaker identification and transcription: -- **WebRTC Audio Capture**: Live microphone access with waveform visualization -- **Deepgram Streaming**: Real-time transcription with speaker diarization -- **Live Speaker ID**: Identifies enrolled speakers in real-time using internal service -- **Session Statistics**: Live metrics for words, speakers, and confidence scores -- **Configurable Settings**: Adjustable confidence thresholds and audio parameters - -Access via: `extras/speaker-recognition/webui` → Live Inference - -### Technical Implementation - -#### Backend (Python) -- **Analysis Utils**: `src/simple_speaker_recognition/utils/analysis.py` - - UMAP/t-SNE dimensionality reduction - - DBSCAN/K-means clustering - - Cosine similarity analysis - - Quality metrics calculation -- **API Endpoint**: `/speakers/analysis` - Returns processed embedding analysis -- 
**Dependencies**: Added `umap-learn` for dimensionality reduction - -#### Frontend (React/TypeScript) -- **EmbeddingPlot Component**: Interactive Plotly.js visualizations -- **LiveAudioCapture Component**: WebRTC audio recording with waveform -- **DeepgramStreaming Service**: WebSocket integration for real-time transcription -- **InferLive Page**: Complete live inference interface - -### Usage Instructions - -#### Setting up Live Inference -1. Navigate to Live Inference page -2. Configure Deepgram API key in settings -3. Adjust speaker identification settings (confidence threshold) -4. Start live session to begin real-time transcription and speaker ID - -**Technical Details:** -- **Audio Processing**: Uses browser's native sample rate (typically 44.1kHz or 48kHz) -- **Buffer Retention**: 120 seconds of audio for improved utterance capture -- **Real-time Updates**: Live transcription with speaker identification results - -#### Using Speaker Analysis -1. Go to Speakers page → Embedding Analysis tab -2. Select analysis method (UMAP, t-SNE, PCA) -3. Choose clustering algorithm (DBSCAN, K-means) -4. Adjust similarity threshold for speaker detection -5. 
View interactive plots and quality metrics - -### Deployment Notes -- Requires Docker rebuild to pick up new Python dependencies -- Frontend dependencies (Plotly.js) already included -- Live inference requires Deepgram API key for streaming transcription -- Speaker identification uses existing enrolled speakers from database - -### Live Inference Troubleshooting -- **"NaN:NaN" timestamps**: Fixed in recent updates, ensure you're using latest version -- **Poor speaker identification**: Try adjusting confidence threshold or re-enrolling speakers -- **Audio processing delays**: Check browser console for sample rate detection logs -- **Buffer overflow issues**: Extended to 120-second retention for better performance -- **"extraction_failed" errors**: Usually indicates audio buffer timing issues - check console logs for buffer availability - -## Distributed Self-Hosting Architecture - -Friend-Lite supports distributed deployment across multiple machines, allowing you to separate GPU-intensive services from lightweight backend components. This is ideal for scenarios where you have a dedicated GPU machine and want to run the main backend on a VPS or Raspberry Pi. - -### Architecture Patterns - -#### Single Machine (Default) -All services run on one machine using Docker Compose - ideal for development and simple deployments. 
- -#### Distributed GPU Setup -**GPU Machine (High-performance):** -- LLM services (Ollama with GPU acceleration) -- ASR services (Parakeet with GPU) -- Speaker recognition service -- Deepgram fallback can remain on backend machine - -**Backend Machine (Lightweight - VPS/RPi):** -- Friend-Lite backend (FastAPI) -- React WebUI -- MongoDB -- Qdrant vector database - -### Networking with Tailscale - -Tailscale VPN provides secure, encrypted networking between distributed services: - -**Benefits:** -- **Zero configuration networking**: Services discover each other automatically -- **Encrypted communication**: All inter-service traffic is encrypted -- **Firewall friendly**: Works behind NATs and firewalls -- **Access control**: Granular permissions for service access -- **CORS support**: Built-in support for Tailscale IP ranges (100.x.x.x) - -**Installation:** -```bash -# On each machine -curl -fsSL https://tailscale.com/install.sh | sh -sudo tailscale up -``` - -### Distributed Service Configuration - -#### GPU Machine Services -```bash -# .env on GPU machine -OLLAMA_BASE_URL=http://0.0.0.0:11434 # Expose to Tailscale network -SPEAKER_SERVICE_URL=http://0.0.0.0:8085 - -# Enable GPU acceleration for Ollama -docker run -d --gpus=all -p 11434:11434 ollama/ollama:latest -``` - -#### Backend Machine Configuration -```bash -# .env on backend machine -OLLAMA_BASE_URL=http://100.x.x.x:11434 # GPU machine Tailscale IP -SPEAKER_SERVICE_URL=http://100.x.x.x:8085 # GPU machine Tailscale IP - -# Parakeet ASR services can also be distributed (if using offline ASR) -# PARAKEET_ASR_URL=http://100.x.x.x:8767 - -# CORS automatically supports Tailscale IPs (no configuration needed) -``` - -#### Service URL Examples - -**Common remote service configurations:** -```bash -# LLM Processing (GPU machine) -OLLAMA_BASE_URL=http://100.64.1.100:11434 -OPENAI_BASE_URL=http://100.64.1.100:8080 # For vLLM/OpenAI-compatible APIs - -# Speech Recognition (GPU machine) -# 
PARAKEET_ASR_URL=http://100.64.1.100:8767 # If using Parakeet ASR -SPEAKER_SERVICE_URL=http://100.64.1.100:8085 - -# Database services (can be on separate machine) -MONGODB_URI=mongodb://100.64.1.200:27017 # Database name: friend-lite -QDRANT_BASE_URL=http://100.64.1.200:6333 -``` - -### Deployment Steps - -#### 1. Set up Tailscale on all machines -```bash -# Install and connect each machine to your Tailscale network -curl -fsSL https://tailscale.com/install.sh | sh -sudo tailscale up -``` - -#### 2. Deploy GPU services -```bash -# On GPU machine - start GPU-accelerated services -cd extras/asr-services && docker compose up parakeet -d -cd extras/speaker-recognition && docker compose up --build -d - -# Start Ollama with GPU support -docker run -d --gpus=all -p 11434:11434 \ - -v ollama:/root/.ollama \ - ollama/ollama:latest -``` - -#### 3. Configure backend machine -```bash -# Update .env with Tailscale IPs of GPU machine -OLLAMA_BASE_URL=http://[gpu-machine-tailscale-ip]:11434 -SPEAKER_SERVICE_URL=http://[gpu-machine-tailscale-ip]:8085 - -# Start lightweight backend services -docker compose up --build -d -``` - -#### 4. 
Verify connectivity -```bash -# Test service connectivity from backend machine -curl http://[gpu-machine-ip]:11434/api/tags # Ollama -curl http://[gpu-machine-ip]:8085/health # Speaker recognition -``` - -### Performance Considerations - -**Network Latency:** -- Tailscale adds minimal latency (typically <5ms between nodes) -- LLM inference: Network time negligible compared to GPU processing -- ASR streaming: Use local fallback for latency-sensitive applications - -**Bandwidth Usage:** -- Audio streaming: ~128kbps for Opus, ~512kbps for PCM -- LLM requests: Typically <1MB per conversation -- Memory embeddings: ~3KB per memory vector - -**Processing Time Expectations:** -- Transcription (Deepgram): 2-5 seconds for 4-minute audio -- Transcription (Parakeet): 5-10 seconds for 4-minute audio -- Memory extraction (OpenAI GPT-4o-mini): 30-40 seconds for typical conversation -- Memory extraction (Ollama local): 45-90 seconds depending on model and GPU -- Full pipeline (4-min audio): 40-60 seconds with cloud services, 60-120 seconds with local models - -### Security Best Practices - -**Tailscale Access Control:** -```json -{ - "acls": [ - { - "action": "accept", - "src": ["tag:backend"], - "dst": ["tag:gpu:11434", "tag:gpu:8085", "tag:gpu:8767"] - } - ], - "tagOwners": { - "tag:backend": ["your-email@example.com"], - "tag:gpu": ["your-email@example.com"] - } -} -``` - -**Service Isolation:** -- Run GPU services in containers with limited network access -- Use Tailscale subnet routing for additional security -- Monitor service access logs for unauthorized requests - -### Troubleshooting Distributed Setup - -**Common Issues:** -- **CORS errors**: Tailscale IPs are automatically supported, but verify CORS_ORIGINS if using custom IPs -- **Service discovery**: Use `tailscale ip` to find machine IPs -- **Port conflicts**: Ensure services use different ports on shared machines -- **Authentication**: Services must be accessible without authentication for inter-service communication 
- -**Debugging Commands:** -```bash -# Check Tailscale connectivity -tailscale ping [machine-name] -tailscale status - -# Test service endpoints -curl http://[tailscale-ip]:11434/api/tags -curl http://[tailscale-ip]:8085/health - -# Check Docker networks -docker network ls -docker ps --format "table {{.Names}}\t{{.Ports}}" -``` +## Extended Documentation + +For detailed technical documentation, see: +- **[@docs/wyoming-protocol.md](docs/wyoming-protocol.md)**: WebSocket communication protocol details +- **[@docs/memory-providers.md](docs/memory-providers.md)**: In-depth memory provider comparison and setup +- **[@docs/versioned-processing.md](docs/versioned-processing.md)**: Transcript and memory versioning details +- **[@docs/api-reference.md](docs/api-reference.md)**: Complete endpoint documentation with examples +- **[@docs/speaker-recognition.md](docs/speaker-recognition.md)**: Advanced analysis and live inference features +- **[@docs/distributed-deployment.md](docs/distributed-deployment.md)**: Multi-machine deployment with Tailscale + +## Robot Framework Testing + +**IMPORTANT: When writing or modifying Robot Framework tests, you MUST follow the testing guidelines.** + +Before writing any Robot Framework test: +1. **Read [@tests/TESTING_GUIDELINES.md](tests/TESTING_GUIDELINES.md)** for comprehensive testing patterns and standards +2. **Check [@tests/tags.md](tests/tags.md)** for approved tags - ONLY 11 tags are permitted +3. **SCAN existing resource files** for keywords - NEVER write code that duplicates existing keywords +4. **Follow the Arrange-Act-Assert pattern** with inline verifications (not abstracted to keywords) + +Key Testing Rules: +- **Check Existing Keywords FIRST**: Before writing ANY test code, scan relevant resource files (`websocket_keywords.robot`, `queue_keywords.robot`, `conversation_keywords.robot`, etc.) 
for existing keywords +- **Tags**: ONLY use the 11 approved tags from tags.md, tab-separated (e.g., `[Tags] infra audio-streaming`) +- **Verifications**: Write assertions directly in tests, not in resource keywords +- **Keywords**: Only create keywords for reusable setup/action operations AFTER confirming no existing keyword exists +- **Resources**: Always check existing resource files before creating new keywords or duplicating logic +- **Naming**: Use descriptive names that explain business purpose, not technical implementation + +**DO NOT:** +- Write inline code without checking if a keyword already exists for that operation +- Create custom tags (use only the 11 approved tags) +- Abstract verifications into keywords (keep them inline in tests) +- Use space-separated tags (must be tab-separated) +- Skip reading the guidelines before writing tests ## Notes for Claude Check if the src/ is volume mounted. If not, do compose build so that code changes are reflected. Do not simply run `docker compose restart` as it will not rebuild the image. diff --git a/Docs/getting-started.md b/Docs/getting-started.md new file mode 100644 index 00000000..2f647b7b --- /dev/null +++ b/Docs/getting-started.md @@ -0,0 +1,731 @@ +# Getting Started + +# Friend-Lite Backend Quickstart Guide + +> 📖 **New to friend-lite?** This is your starting point! After reading this, continue with [architecture.md](./architecture.md) for technical details. + +## Overview + +Friend-Lite is an ecosystem of services to support "AI wearable" agents/functionality. 
+At the moment, the basic functionalities are: +- Audio capture (via WebSocket, from OMI device, files, or a laptop) +- Audio transcription +- **Advanced memory system** with pluggable providers (Friend-Lite native or OpenMemory MCP) +- **Enhanced memory extraction** with individual fact storage and smart updates +- **Semantic memory search** with relevance threshold filtering and live results +- Action item extraction +- Modern React web dashboard with live recording and advanced search features +- Comprehensive user management with JWT authentication + +**Core Implementation**: See `src/advanced_omi_backend/main.py` for the complete FastAPI application and WebSocket handling. + +## Prerequisites + +- Docker and Docker Compose +- API keys for your chosen providers (see setup script) + +## Quick Start + +### Step 1: Interactive Setup (Recommended) + +Run the interactive setup wizard to configure all services with guided prompts: +```bash +cd backends/advanced +./init.sh +``` + +**The setup wizard will guide you through:** +- **Authentication**: Admin email/password setup +- **Transcription Provider**: Choose Deepgram, Mistral, or Offline (Parakeet) +- **LLM Provider**: Choose OpenAI or Ollama for memory extraction +- **Memory Provider**: Choose Friend-Lite Native or OpenMemory MCP +- **Optional Services**: Speaker Recognition and other extras +- **Network Configuration**: Ports and host settings + +**Example flow:** +``` +🚀 Friend-Lite Interactive Setup +=============================================== + +► Authentication Setup +---------------------- +Admin email [admin@example.com]: john@company.com +Admin password (min 8 chars): ******** + +► Speech-to-Text Configuration +------------------------------- +Choose your transcription provider: + 1) Deepgram (recommended - high quality, requires API key) + 2) Mistral (Voxtral models - requires API key) + 3) Offline (Parakeet ASR - requires GPU, runs locally) + 4) None (skip transcription setup) +Enter choice (1-4) 
[1]: 1 + +Get your API key from: https://console.deepgram.com/ +Deepgram API key: dg_xxxxxxxxxxxxx + +► LLM Provider Configuration +---------------------------- +Choose your LLM provider for memory extraction: + 1) OpenAI (GPT-4, GPT-3.5 - requires API key) + 2) Ollama (local models - requires Ollama server) + 3) Skip (no memory extraction) +Enter choice (1-3) [1]: 1 +``` + +### Step 2: HTTPS Setup (Optional) + +For microphone access and secure connections, set up HTTPS: +```bash +cd backends/advanced +./setup-https.sh 100.83.66.30 # Your Tailscale/network IP +``` + +This creates SSL certificates and configures nginx for secure access. + +### Step 3: Start the System + +**Start all services:** +```bash +cd backends/advanced +docker compose up --build -d +``` + +This starts: +- **Backend API**: `http://localhost:8000` +- **Web Dashboard**: `http://localhost:5173` +- **MongoDB**: `localhost:27017` +- **Qdrant**: `localhost:6333` + +### Step 4: Optional Services + +**If you configured optional services during setup, start them:** + +```bash +# OpenMemory MCP (if selected) +cd ../../extras/openmemory-mcp && docker compose up -d + +# Parakeet ASR (if selected for offline transcription) +cd ../../extras/asr-services && docker compose up parakeet -d + +# Speaker Recognition (if enabled) +cd ../../extras/speaker-recognition && docker compose up --build -d +``` + +### Manual Configuration (Alternative) + +If you prefer manual configuration, copy the `.env.template` file to `.env` and configure the required values: + +**Required Environment Variables:** +```bash +AUTH_SECRET_KEY=your-super-secret-jwt-key-here +ADMIN_PASSWORD=your-secure-admin-password +ADMIN_EMAIL=admin@example.com +``` + +**Memory Provider Configuration:** +```bash +# Memory Provider (Choose One) +# Option 1: Friend-Lite Native (Default - Recommended) +MEMORY_PROVIDER=friend_lite + +# Option 2: OpenMemory MCP (Cross-client compatibility) +# MEMORY_PROVIDER=openmemory_mcp +# 
OPENMEMORY_MCP_URL=http://host.docker.internal:8765 +# OPENMEMORY_CLIENT_NAME=friend_lite +# OPENMEMORY_USER_ID=openmemory +``` + +**LLM Configuration (Choose One):** +```bash +# Option 1: OpenAI (Recommended for best memory extraction) +LLM_PROVIDER=openai +OPENAI_API_KEY=your-openai-api-key-here +OPENAI_MODEL=gpt-4o-mini + +# Option 2: Local Ollama +LLM_PROVIDER=ollama +OLLAMA_BASE_URL=http://ollama:11434 +``` + +**Transcription Services (Choose One):** +```bash +# Option 1: Deepgram (Recommended for best transcription quality) +TRANSCRIPTION_PROVIDER=deepgram +DEEPGRAM_API_KEY=your-deepgram-api-key-here + +# Option 2: Mistral (Voxtral models for transcription) +TRANSCRIPTION_PROVIDER=mistral +MISTRAL_API_KEY=your-mistral-api-key-here +MISTRAL_MODEL=voxtral-mini-2507 + +# Option 3: Local ASR service +PARAKEET_ASR_URL=http://host.docker.internal:8767 +``` + +**Important Notes:** +- **OpenAI is strongly recommended** for LLM processing as it provides much better memory extraction and eliminates JSON parsing errors +- **TRANSCRIPTION_PROVIDER** determines which service to use: + - `deepgram`: Uses Deepgram's Nova-3 model for high-quality transcription + - `mistral`: Uses Mistral's Voxtral models for transcription + - If not set, the system falls back to the offline ASR service +- The system requires either online API keys or offline ASR service configuration + +### Testing Your Setup (Optional) + +After configuration, verify everything works with the integration test suite: +```bash +./run-test.sh + +# Alternative: Manual test with detailed logging +source .env && export DEEPGRAM_API_KEY OPENAI_API_KEY && \ + uv run pytest tests/test_integration.py -vv -s --log-cli-level=INFO +``` +This end-to-end test validates the complete audio processing pipeline. + +## Using the System + +### Web Dashboard + +1. Open `http://localhost:5173` +2. 
**Login** using the sidebar: + - **Admin**: `admin@example.com` / `your-admin-password` + - **Create new users** via admin interface + +### Dashboard Features + +- **Conversations**: View audio recordings, transcripts, and cropped audio +- **Memories**: Advanced memory search with semantic search, relevance threshold filtering, and memory count display +- **Live Recording**: Real-time audio recording with WebSocket streaming (HTTPS required) +- **User Management**: Create/delete users and their data +- **Client Management**: View active connections and close conversations +- **System Monitoring**: Debug tools and system health monitoring + +### Audio Client Connection + +Connect audio clients via WebSocket with authentication: + +**WebSocket URLs:** +```javascript +// Opus audio stream +ws://your-server-ip:8000/ws?token=YOUR_JWT_TOKEN&device_name=YOUR_DEVICE_NAME + +// PCM audio stream +ws://your-server-ip:8000/ws_pcm?token=YOUR_JWT_TOKEN&device_name=YOUR_DEVICE_NAME +``` + +**Authentication Methods:** +The system uses email-based authentication with JWT tokens: + +```bash +# Login with email +curl -X POST "http://localhost:8000/auth/jwt/login" \ + -H "Content-Type: application/x-www-form-urlencoded" \ + -d "username=admin@example.com&password=your-admin-password" + +# Response: {"access_token": "eyJhbGciOiJIUzI1NiIs...", "token_type": "bearer"} +``` + +**Authentication Flow:** +1. **User Registration**: Admin creates users via API or dashboard +2. **Login**: Users authenticate with email and password +3. **Token Usage**: Include JWT token in API calls and WebSocket connections +4. **Data Access**: Users can only access their own data (admins see all) + +For detailed authentication documentation, see [`auth.md`](./auth.md). 
+ +**Create User Account:** +```bash +export ADMIN_TOKEN="your-admin-token" + +# Create user +curl -X POST "http://localhost:8000/api/create_user" \ + -H "Authorization: Bearer $ADMIN_TOKEN" \ + -H "Content-Type: application/json" \ + -d '{"email": "user@example.com", "password": "userpass", "display_name": "John Doe"}' + +# Response includes the user_id (MongoDB ObjectId) +# {"message": "User user@example.com created successfully", "user": {"id": "507f1f77bcf86cd799439011", ...}} +``` + +**Client ID Format:** +The system automatically generates client IDs using the last 6 characters of the MongoDB ObjectId plus device name (e.g., `439011-phone`, `439011-desktop`). This ensures proper user-client association and data isolation. + +## Add Existing Data + +### Audio File Upload & Processing + +The system supports processing existing audio files through the file upload API. This allows you to import and process pre-recorded conversations without requiring a live WebSocket connection. + +**Upload and Process WAV Files:** +```bash +export USER_TOKEN="your-jwt-token" + +# Upload single WAV file +curl -X POST "http://localhost:8000/api/process-audio-files" \ + -H "Authorization: Bearer $USER_TOKEN" \ + -F "files=@/path/to/audio.wav" \ + -F "device_name=file_upload" + +# Upload multiple WAV files +curl -X POST "http://localhost:8000/api/process-audio-files" \ + -H "Authorization: Bearer $USER_TOKEN" \ + -F "files=@/path/to/recording1.wav" \ + -F "files=@/path/to/recording2.wav" \ + -F "device_name=import_batch" +``` + +**Response Example:** +```json +{ + "message": "Successfully processed 2 audio files", + "processed_files": [ + { + "filename": "recording1.wav", + "sample_rate": 16000, + "channels": 1, + "duration_seconds": 120.5, + "size_bytes": 3856000 + }, + { + "filename": "recording2.wav", + "sample_rate": 44100, + "channels": 2, + "duration_seconds": 85.2, + "size_bytes": 7532800 + } + ], + "client_id": "user01-import_batch" +} +``` + +## System Features + +### Audio 
Processing +- **Real-time streaming**: WebSocket audio ingestion +- **Multiple formats**: Opus and PCM audio support +- **Per-client processing**: Isolated conversation management +- **Speech detection**: Automatic silence removal +- **Audio cropping**: Extract only speech segments + +**Implementation**: See `src/advanced_omi_backend/main.py` for WebSocket endpoints and `src/advanced_omi_backend/processors.py` for audio processing pipeline. + +### Transcription Options +- **Deepgram API**: Cloud-based batch processing, high accuracy (recommended) +- **Mistral API**: Voxtral models for transcription with REST API processing +- **Self-hosted ASR**: Local Wyoming protocol services with real-time processing +- **Collection timeout**: 1.5 minute collection for optimal online processing quality + +### Conversation Management +- **Automatic chunking**: 60-second audio segments +- **Conversation timeouts**: Auto-close after 1.5 minutes of silence +- **Speaker identification**: Track multiple speakers per conversation +- **Manual controls**: Close conversations via API or dashboard + +### Memory & Intelligence + +#### Pluggable Memory System +- **Two memory providers**: Choose between Friend-Lite native or OpenMemory MCP +- **Friend-Lite Provider**: Full control with custom extraction, individual fact storage, smart deduplication +- **OpenMemory MCP Provider**: Cross-client compatibility (Claude Desktop, Cursor, Windsurf), professional processing + +#### Enhanced Memory Processing +- **Individual fact storage**: No more generic transcript fallbacks +- **Smart memory updates**: LLM-driven ADD/UPDATE/DELETE actions +- **Enhanced prompts**: Improved fact extraction with granular, specific memories +- **User-centric storage**: All memories keyed by database user_id +- **Semantic search**: Vector-based memory retrieval with embeddings +- **Configurable extraction**: YAML-based configuration for memory extraction +- **Debug tracking**: SQLite-based tracking of transcript → memory 
conversion +- **Client metadata**: Device information preserved for debugging and reference +- **User isolation**: All data scoped to individual users with multi-device support + +**Implementation**: +- **Memory System**: `src/advanced_omi_backend/memory/memory_service.py` + `src/advanced_omi_backend/controllers/memory_controller.py` +- **Configuration**: `memory_config.yaml` + `src/advanced_omi_backend/memory_config_loader.py` + +### Authentication & Security +- **Email Authentication**: Login with email and password +- **JWT tokens**: Secure API and WebSocket authentication with 1-hour expiration +- **Role-based access**: Admin vs regular user permissions +- **Data isolation**: Users can only access their own data +- **Client ID Management**: Automatic client-user association via `objectid_suffix-device_name` format +- **Multi-device support**: Single user can connect multiple devices +- **Security headers**: Proper CORS, cookie security, and token validation + +**Implementation**: See `src/advanced_omi_backend/auth.py` for authentication logic, `src/advanced_omi_backend/users.py` for user management, and [`auth.md`](./auth.md) for comprehensive documentation. 
+ +## Verification + +```bash +# System health check +curl http://localhost:8000/health + +# Web dashboard +open http://localhost:5173 + +# View active clients (requires auth token) +curl -H "Authorization: Bearer your-token" http://localhost:8000/api/clients/active +``` + +## HAVPE Relay Configuration + +For ESP32 audio streaming using the HAVPE relay (`extras/havpe-relay/`): + +```bash +# Environment variables for HAVPE relay +export AUTH_USERNAME="user@example.com" # Email address +export AUTH_PASSWORD="your-password" +export DEVICE_NAME="havpe" # Device identifier + +# Run the relay +cd extras/havpe-relay +python main.py --backend-url http://your-server:8000 --backend-ws-url ws://your-server:8000 +``` + +The relay will automatically: +- Authenticate using `AUTH_USERNAME` (email address) +- Generate client ID as `objectid_suffix-havpe` +- Forward ESP32 audio to the backend with proper authentication +- Handle token refresh and reconnection + +## Development tip +Install an optional dependency group with `uv sync --group GROUP_NAME` +(for example, `uv sync --group deepgram`). 
+ +## Troubleshooting + +**Service Issues:** +- Check logs: `docker compose logs friend-backend` +- Restart services: `docker compose restart` +- View all services: `docker compose ps` + +**Authentication Issues:** +- Verify `AUTH_SECRET_KEY` is set and long enough (minimum 32 characters) +- Check admin credentials match `.env` file +- Ensure user email/password combinations are correct + +**Transcription Issues:** +- **Deepgram**: Verify API key is valid and `TRANSCRIPTION_PROVIDER=deepgram` +- **Mistral**: Verify API key is valid and `TRANSCRIPTION_PROVIDER=mistral` +- **Self-hosted**: Ensure ASR service is running on port 8767 +- Check transcription service connection in health endpoint + +**Memory Issues:** +- Ensure Ollama is running and model is pulled +- Check Qdrant connection in health endpoint +- Memory processing happens at conversation end + +**Connection Issues:** +- Use server's IP address, not localhost for mobile clients +- Ensure WebSocket connections include authentication token +- Check firewall/port settings for remote connections + +## Distributed Deployment + +### Single Machine vs Distributed Setup + +**Single Machine (Default):** +```bash +# Everything on one machine +docker compose up --build -d +``` + +**Distributed Setup (GPU + Backend separation):** + +#### GPU Machine Setup +```bash +# Start GPU-accelerated services +cd extras/asr-services +docker compose up parakeet -d + +cd ../speaker-recognition +docker compose up --build -d + +# Ollama with GPU support +docker run -d --gpus=all -p 11434:11434 \ + -v ollama:/root/.ollama \ + ollama/ollama:latest +``` + +#### Backend Machine Configuration +```bash +# .env configuration for distributed services +OLLAMA_BASE_URL=http://[gpu-machine-tailscale-ip]:11434 +SPEAKER_SERVICE_URL=http://[gpu-machine-tailscale-ip]:8085 +PARAKEET_ASR_URL=http://[gpu-machine-tailscale-ip]:8767 + +# Start lightweight backend services +docker compose up --build -d +``` + +#### Tailscale Networking +```bash +# 
Install on each machine +curl -fsSL https://tailscale.com/install.sh | sh +sudo tailscale up + +# Find machine IPs +tailscale ip -4 +``` + +**Benefits of Distributed Setup:** +- GPU services on dedicated hardware +- Lightweight backend on VPS/Raspberry Pi +- Automatic Tailscale IP support (100.x.x.x) - no CORS configuration needed +- Encrypted inter-service communication + +**Service Examples:** +- GPU machine: LLM inference, ASR, speaker recognition +- Backend machine: FastAPI, WebUI, databases +- Database machine: MongoDB, Qdrant (optional separation) + +## Data Architecture + +The friend-lite backend uses a **user-centric data architecture**: + +- **All memories are keyed by database user_id** (not client_id) +- **Client information is stored in metadata** for reference and debugging +- **User email is included** for easy identification in admin interfaces +- **Multi-device support**: Users can access their data from any registered device + +For detailed information, see [User Data Architecture](user-data-architecture.md). + +## Memory Provider Selection + +### Choosing a Memory Provider + +Friend-Lite offers two memory backends: + +#### 1. Friend-Lite Native +```bash +# In your .env file +MEMORY_PROVIDER=friend_lite +LLM_PROVIDER=openai +OPENAI_API_KEY=your-openai-key-here +``` + +**Benefits:** +- Full control over memory processing +- Individual fact storage with no fallbacks +- Custom prompts and extraction logic +- Smart deduplication algorithms +- LLM-driven memory updates (ADD/UPDATE/DELETE) +- No external dependencies + +#### 2. OpenMemory MCP +```bash +# First, start the external server +cd extras/openmemory-mcp +docker compose up -d + +# Then configure Friend-Lite +MEMORY_PROVIDER=openmemory_mcp +OPENMEMORY_MCP_URL=http://host.docker.internal:8765 +``` + +**Benefits:** +- Cross-client compatibility (works with Claude Desktop, Cursor, etc.) 
+- Professional memory processing +- Web UI at http://localhost:8765 +- Battle-tested deduplication + +**Use OpenMemory MCP when:** +- You want cross-client memory sharing +- You're already using OpenMemory in other tools +- You prefer external expertise over custom logic + +**See [MEMORY_PROVIDERS.md](../MEMORY_PROVIDERS.md) for a detailed comparison** + +## Memory & Action Item Configuration + +> 🎯 **New to memory configuration?** Read our [Memory Configuration Guide](./memory-configuration-guide.md) for a step-by-step setup guide with examples. + +The system uses **centralized configuration** via `memory_config.yaml` for all memory extraction settings. All hardcoded values have been removed from the code to ensure consistent, configurable behavior. + +### Configuration File Location +- **Path**: `backends/advanced/memory_config.yaml` +- **Hot-reload**: Changes are applied on the next processing cycle (no restart required) +- **Fallback**: If the file is missing, the system uses safe defaults with environment variables + +### LLM Provider & Model Configuration + +⭐ **OpenAI is STRONGLY RECOMMENDED** for optimal memory extraction performance. 
+ +The system supports **multiple LLM providers** - configure via environment variables: + +```bash +# In your .env file +LLM_PROVIDER=openai # RECOMMENDED: Use "openai" for best results +OPENAI_API_KEY=your-openai-api-key +OPENAI_MODEL=gpt-4o-mini # RECOMMENDED: "gpt-5-mini" for better memory extraction + +# Alternative: Local Ollama (may have reduced memory quality) +LLM_PROVIDER=ollama +OLLAMA_BASE_URL=http://ollama:11434 +OLLAMA_MODEL=gemma3n:e4b # Fallback if YAML config fails to load +``` + +**Why OpenAI is recommended:** +- **Enhanced memory extraction**: Creates multiple granular memories instead of fallback transcripts +- **Better fact extraction**: More reliable JSON parsing and structured output +- **No more "fallback memories"**: Eliminates generic transcript-based memory entries +- **Improved conversation understanding**: Better context awareness and detail extraction + +**YAML Configuration** (provider-specific models): +```yaml +memory_extraction: + enabled: true + prompt: | + Extract anything relevant about this conversation that would be valuable to remember. + Focus on key topics, people, decisions, dates, and emotional context. + llm_settings: + # Model selection based on LLM_PROVIDER: + # - Ollama: "gemma3n:e4b", "llama3.1:latest", "llama3.2:latest", etc. + # - OpenAI: "gpt-5-mini" (recommended for JSON reliability), "gpt-4o-mini", "gpt-3.5-turbo", etc. 
+ model: "gemma3n:e4b" + temperature: 0.1 + +fact_extraction: + enabled: false # Disabled to avoid JSON parsing issues + # RECOMMENDATION: Enable with OpenAI GPT-4o for better JSON reliability + llm_settings: + model: "gemma3n:e4b" # Auto-switches based on LLM_PROVIDER + temperature: 0.0 # Lower for factual accuracy +``` + +**Provider-Specific Behavior:** +- **Ollama**: Uses local models with Ollama embeddings (nomic-embed-text) +- **OpenAI**: Uses OpenAI models with OpenAI embeddings (text-embedding-3-small) +- **Embeddings**: Automatically selected based on provider (768 dims for Ollama, 1536 for OpenAI) + +#### Fixing JSON Parsing Errors + +If you experience JSON parsing errors in fact extraction: + +1. **Switch to OpenAI GPT-4o** (recommended solution): + ```bash + # In your .env file + LLM_PROVIDER=openai + OPENAI_API_KEY=your-openai-api-key + OPENAI_MODEL=gpt-4o-mini + ``` + +2. **Enable fact extraction** with reliable JSON output: + ```yaml + # In memory_config.yaml + fact_extraction: + enabled: true # Safe to enable with GPT-4o + ``` + +3. **Monitor logs** for JSON parsing success: + ```bash + # Check for JSON parsing errors + docker logs advanced-backend | grep "JSONDecodeError" + + # Verify OpenAI usage + docker logs advanced-backend | grep "OpenAI response" + ``` + +**Why GPT-4o helps with JSON errors:** +- More consistent JSON formatting +- Better instruction following for structured output +- Reduced malformed JSON responses +- Built-in JSON mode for reliable parsing + +#### Testing OpenAI Configuration + +To verify your OpenAI setup is working: + +1. **Check logs for OpenAI usage**: + ```bash + # Start the backend and check logs + docker logs advanced-backend | grep -i "openai" + + # You should see: + # "Using OpenAI provider with model: gpt-5-mini" + ``` + +2. 
**Test memory extraction** with a conversation: + ```bash + # The health endpoint includes LLM provider info + curl http://localhost:8000/health + + # Response should include: "llm_provider": "openai" + ``` + +3. **Monitor memory processing**: + ```bash + # After a conversation ends, check for successful processing + docker logs advanced-backend | grep "memory processing" + ``` + +If you see errors about missing API keys or models, verify your `.env` file has: +```bash +LLM_PROVIDER=openai +OPENAI_API_KEY=sk-your-actual-api-key-here +OPENAI_MODEL=gpt-4o-mini +``` + +### Quality Control Settings +```yaml +quality_control: + min_conversation_length: 50 # Skip very short conversations + max_conversation_length: 50000 # Skip extremely long conversations + skip_low_content: true # Skip conversations with mostly filler words + min_content_ratio: 0.3 # Minimum meaningful content ratio + skip_patterns: # Regex patterns to skip + - "^(um|uh|hmm|yeah|ok|okay)\\s*$" + - "^test\\s*$" + - "^testing\\s*$" +``` + +### Processing & Performance +```yaml +processing: + parallel_processing: true # Enable concurrent processing + max_concurrent_tasks: 3 # Limit concurrent LLM requests + processing_timeout: 300 # Timeout for memory extraction (seconds) + retry_failed: true # Retry failed extractions + max_retries: 2 # Maximum retry attempts + retry_delay: 5 # Delay between retries (seconds) +``` + +### Debug & Monitoring +```yaml +debug: + enabled: true + db_path: "/app/debug/memory_debug.db" + log_level: "INFO" # DEBUG, INFO, WARNING, ERROR + log_full_conversations: false # Privacy consideration + log_extracted_memories: true # Log successful extractions +``` + +### Configuration Validation +The system validates configuration on startup and provides detailed error messages for invalid settings. 
Use the debug API to verify your configuration: + +```bash +# Check current configuration +curl -H "Authorization: Bearer $ADMIN_TOKEN" \ + http://localhost:8000/api/debug/memory/config +``` + +### API Endpoints for Debugging +- `GET /api/debug/memory/stats` - Processing statistics +- `GET /api/debug/memory/sessions` - Recent memory sessions +- `GET /api/debug/memory/session/{audio_uuid}` - Detailed session info +- `GET /api/debug/memory/config` - Current configuration +- `GET /api/debug/memory/pipeline/{audio_uuid}` - Pipeline trace + +**Implementation**: See `src/advanced_omi_backend/routers/modules/system_routes.py` for debug endpoints and system utilities. + +## Next Steps + +- **Configure Google OAuth** for easy user login +- **Set up Ollama** for local memory processing +- **Deploy ASR service** for self-hosted transcription +- **Connect audio clients** using the WebSocket API +- **Explore the dashboard** to manage conversations and users +- **Review the user data architecture** for understanding data organization +- **Customize memory extraction** by editing `memory_config.yaml` +- **Monitor processing performance** using debug API endpoints \ No newline at end of file diff --git a/Docs/init-system.md b/Docs/init-system.md index fbcbcbe9..fb9c1763 100644 --- a/Docs/init-system.md +++ b/Docs/init-system.md @@ -12,7 +12,7 @@ Friend-Lite uses a unified initialization system with clean separation of concerns: -- **Configuration** (`init.py`) - Set up service configurations, API keys, and .env files +- **Configuration** (`wizard.py`) - Set up service configurations, API keys, and .env files - **Service Management** (`services.py`) - Start, stop, and manage running services The root orchestrator handles service selection and delegates configuration to individual service scripts. In general, setup scripts only configure and do not start services automatically. Exceptions: `extras/asr-services` and `extras/openmemory-mcp` are startup scripts. 
This prevents unnecessary resource usage and gives you control over when services actually run. @@ -22,14 +22,14 @@ The root orchestrator handles service selection and delegates configuration to i ## Architecture ### Root Orchestrator -- **Location**: `/init.py` +- **Location**: `/wizard.py` - **Purpose**: Service selection and delegation only - **Does NOT**: Handle service-specific configuration or duplicate setup logic ### Service Scripts - **Backend**: `backends/advanced/init.py` - Complete Python-based interactive setup -- **Speaker Recognition**: `extras/speaker-recognition/setup.sh` - Simple bash setup -- **ASR Services**: `extras/asr-services/setup.sh` - Service startup script +- **Speaker Recognition**: `extras/speaker-recognition/init.py` - Python-based interactive setup +- **ASR Services**: `extras/asr-services/setup.sh` - Service startup script - **OpenMemory MCP**: `extras/openmemory-mcp/setup.sh` - External server startup ## Usage @@ -39,7 +39,7 @@ Set up multiple services together with automatic URL coordination: ```bash # From project root -uv run --with-requirements setup-requirements.txt python init.py +uv run --with-requirements setup-requirements.txt python wizard.py ``` The orchestrator will: @@ -127,7 +127,7 @@ Note (Linux): If `host.docker.internal` is unavailable, add `extra_hosts: - "hos ### Container-to-Container Communication Services use `host.docker.internal` for inter-container communication: -- `http://host.docker.internal:8085` - Speaker Recognition +- `http://127.0.0.1:8085` - Speaker Recognition - `http://host.docker.internal:8767` - Parakeet ASR - `http://host.docker.internal:8765` - OpenMemory MCP diff --git a/Makefile b/Makefile index 29a73f75..1a5a3829 100644 --- a/Makefile +++ b/Makefile @@ -19,7 +19,7 @@ export $(shell sed 's/=.*//' config.env | grep -v '^\s*$$' | grep -v '^\s*\#') SCRIPTS_DIR := scripts K8S_SCRIPTS_DIR := $(SCRIPTS_DIR)/k8s -.PHONY: help menu setup-k8s setup-infrastructure setup-rbac setup-storage-pvc config 
config-docker config-k8s config-all clean deploy deploy-docker deploy-k8s deploy-k8s-full deploy-infrastructure deploy-apps check-infrastructure check-apps build-backend up-backend down-backend k8s-status k8s-cleanup k8s-purge audio-manage +.PHONY: help menu setup-k8s setup-infrastructure setup-rbac setup-storage-pvc config config-docker config-k8s config-all clean deploy deploy-docker deploy-k8s deploy-k8s-full deploy-infrastructure deploy-apps check-infrastructure check-apps build-backend up-backend down-backend k8s-status k8s-cleanup k8s-purge audio-manage test-robot test-robot-integration test-robot-unit test-robot-endpoints test-robot-specific test-robot-clean # Default target .DEFAULT_GOAL := menu @@ -29,6 +29,7 @@ menu: ## Show interactive menu (default) @echo "================================" @echo @echo "📋 Quick Actions:" + @echo " setup-dev 🛠️ Setup development environment (git hooks, pre-commit)" @echo " setup-k8s 🏗️ Complete Kubernetes setup (registry + infrastructure + RBAC)" @echo " config 📝 Generate all configuration files" @echo " deploy 🚀 Deploy using configured mode ($(DEPLOYMENT_MODE))" @@ -36,6 +37,11 @@ menu: ## Show interactive menu (default) @echo " k8s-cleanup 🧹 Clean up Kubernetes resources" @echo " audio-manage 🎵 Manage audio files" @echo + @echo "🧪 Testing:" + @echo " test-robot 🧪 Run all Robot Framework tests" + @echo " test-robot-integration 🔬 Run integration tests only" + @echo " test-robot-endpoints 🌐 Run endpoint tests only" + @echo @echo "📝 Configuration:" @echo " config-docker 🐳 Generate Docker Compose .env files" @echo " config-k8s ☸️ Generate Kubernetes files (Skaffold env + ConfigMap/Secret)" @@ -95,6 +101,13 @@ help: ## Show detailed help for all targets @echo "🎵 AUDIO MANAGEMENT:" @echo " audio-manage Interactive audio file management" @echo + @echo "🧪 ROBOT FRAMEWORK TESTING:" + @echo " test-robot Run all Robot Framework tests" + @echo " test-robot-integration Run integration tests only" + @echo " test-robot-endpoints Run 
endpoint tests only" + @echo " test-robot-specific FILE=path Run specific test file" + @echo " test-robot-clean Clean up test results" + @echo @echo "🔍 MONITORING:" @echo " check-infrastructure Check if infrastructure services are running" @echo " check-apps Check if application services are running" @@ -102,6 +115,29 @@ help: ## Show detailed help for all targets @echo "🧹 CLEANUP:" @echo " clean Clean up generated configuration files" +# ======================================== +# DEVELOPMENT SETUP +# ======================================== + +setup-dev: ## Setup development environment (git hooks, pre-commit) + @echo "🛠️ Setting up development environment..." + @echo "" + @echo "📦 Installing pre-commit..." + @pip install pre-commit 2>/dev/null || pip3 install pre-commit + @echo "" + @echo "🔧 Installing git hooks..." + @pre-commit install --hook-type pre-push + @pre-commit install --hook-type pre-commit + @echo "" + @echo "✅ Development environment setup complete!" + @echo "" + @echo "💡 Hooks installed:" + @echo " • Robot Framework tests run before push" + @echo " • Black/isort format Python code on commit" + @echo " • Code quality checks on commit" + @echo "" + @echo "⚙️ To skip hooks: git push --no-verify / git commit --no-verify" + # ======================================== # KUBERNETES SETUP # ======================================== @@ -170,12 +206,11 @@ config: config-all ## Generate all configuration files config-docker: ## Generate Docker Compose configuration files @echo "🐳 Generating Docker Compose configuration files..." - @python3 scripts/generate-docker-configs.py + @CONFIG_FILE=config.env.dev python3 scripts/generate-docker-configs.py @echo "✅ Docker Compose configuration files generated" -config-k8s: ## Generate Kubernetes configuration files (Skaffold env + ConfigMap/Secret) +config-k8s: ## Generate Kubernetes configuration files (ConfigMap/Secret only - no .env files) @echo "☸️ Generating Kubernetes configuration files..." 
- @python3 scripts/generate-docker-configs.py @python3 scripts/generate-k8s-configs.py @echo "📦 Applying ConfigMap and Secret to Kubernetes..." @kubectl apply -f k8s-manifests/configmap.yaml -n $(APPLICATION_NAMESPACE) 2>/dev/null || echo "⚠️ ConfigMap not applied (cluster not available?)" @@ -297,3 +332,49 @@ k8s-purge: ## Purge unused images (registry + container) audio-manage: ## Interactive audio file management @echo "🎵 Starting audio file management..." @$(SCRIPTS_DIR)/manage-audio-files.sh + +# ======================================== +# TESTING TARGETS +# ======================================== + +# Define test environment variables +TEST_ENV := BACKEND_URL=http://localhost:8001 ADMIN_EMAIL=test-admin@example.com ADMIN_PASSWORD=test-admin-password-123 + +test-robot: ## Run all Robot Framework tests + @echo "🧪 Running all Robot Framework tests..." + @cd tests && $(TEST_ENV) robot --outputdir ../results . + @echo "✅ All Robot Framework tests completed" + @echo "📊 Results available in: results/" + +test-robot-integration: ## Run integration tests only + @echo "🧪 Running Robot Framework integration tests..." + @cd tests && $(TEST_ENV) robot --outputdir ../results integration/ + @echo "✅ Robot Framework integration tests completed" + @echo "📊 Results available in: results/" + +test-robot-unit: ## Run unit tests only + @echo "🧪 Running Robot Framework unit tests..." + @cd tests && $(TEST_ENV) robot --outputdir ../results unit/ || echo "⚠️ No unit tests directory found" + @echo "✅ Robot Framework unit tests completed" + @echo "📊 Results available in: results/" + +test-robot-endpoints: ## Run endpoint tests only + @echo "🧪 Running Robot Framework endpoint tests..." 
+ @cd tests && $(TEST_ENV) robot --outputdir ../results endpoints/ + @echo "✅ Robot Framework endpoint tests completed" + @echo "📊 Results available in: results/" + +test-robot-specific: ## Run specific Robot Framework test file (usage: make test-robot-specific FILE=path/to/test.robot) + @echo "🧪 Running specific Robot Framework test: $(FILE)" + @if [ -z "$(FILE)" ]; then \ + echo "❌ FILE parameter is required. Usage: make test-robot-specific FILE=path/to/test.robot"; \ + exit 1; \ + fi + @cd tests && $(TEST_ENV) robot --outputdir ../results $(FILE) + @echo "✅ Robot Framework test completed: $(FILE)" + @echo "📊 Results available in: results/" + +test-robot-clean: ## Clean up Robot Framework test results + @echo "🧹 Cleaning up Robot Framework test results..." + @rm -rf results/ + @echo "✅ Test results cleaned" diff --git a/README.md b/README.md index 8bb8c0e8..0a43076b 100644 --- a/README.md +++ b/README.md @@ -8,17 +8,17 @@ Clone, run setup wizard, start services, access at http://localhost:5173 ## Screenshots -*[WebUI Dashboard - Screenshot coming soon]* +### WebUI Dashboard -![WebUI Dashboard](screenshots/dashboard.png) +![WebUI Dashboard](.assets/advanced-dashboard-webui.png) -*[Mobile App - Screenshot coming soon]* +### Memory Search -![Mobile App](screenshots/mobile-app.png) +![Memory Search](.assets/memory-dashboard.png) -*[Memory Search - Screenshot coming soon]* +*[Mobile App - Screenshot coming soon]* -![Memory Search](screenshots/memory-search.png) +![Mobile App](screenshots/mobile-app.png) ## What's Included diff --git a/backends/advanced/.dockerignore b/backends/advanced/.dockerignore index 0ea8a946..2dd9b44f 100644 --- a/backends/advanced/.dockerignore +++ b/backends/advanced/.dockerignore @@ -14,4 +14,8 @@ !webui !ssl !nginx.conf -!nginx.conf.template \ No newline at end of file +!nginx.conf.template +!start.sh +!start-k8s.sh +!start-workers.sh +!Caddyfile \ No newline at end of file diff --git a/backends/advanced/.env.template 
b/backends/advanced/.env.template index b00a30c8..01724f19 100644 --- a/backends/advanced/.env.template +++ b/backends/advanced/.env.template @@ -29,9 +29,10 @@ OPENAI_MODEL=gpt-4o-mini # For Ollama (OpenAI-compatible mode): # LLM_PROVIDER=ollama -# OPENAI_API_KEY=dummy -# OPENAI_BASE_URL=http://ollama:11434/v1 -# OPENAI_MODEL=llama3.1:latest +# OPENAI_API_KEY=dummy +# OLLAMA_BASE_URL=http://ollama:11434/v1 +# OLLAMA_MODEL=llama3.1:latest +# OLLAMA_EMBEDDER_MODEL=nomic-embed-text:latest # ======================================== # CHAT INTERFACE CONFIGURATION (Optional) @@ -77,6 +78,12 @@ TRANSCRIPTION_BUFFER_SECONDS=120 # Trigger transcription every N seconds # Auto-stop thresholds SPEECH_INACTIVITY_THRESHOLD_SECONDS=60 # Close conversation after N seconds of no speech +# Speaker enrollment filter (default: false) +# When enabled, only creates conversations when enrolled speakers are detected +# Requires speaker recognition service to be running and speakers to be enrolled +# Set to "true" to enable, "false" or omit to disable +RECORD_ONLY_ENROLLED_SPEAKERS=true + # ======================================== # DATABASE CONFIGURATION # ======================================== diff --git a/backends/advanced/Caddyfile.template b/backends/advanced/Caddyfile.template new file mode 100644 index 00000000..21caf0ee --- /dev/null +++ b/backends/advanced/Caddyfile.template @@ -0,0 +1,107 @@ +# Caddy reverse proxy configuration for Friend-Lite +# Provides automatic HTTPS for microphone access + +# USAGE: +# 1. Start services: docker compose up -d +# 2. Access at: https://localhost (Caddy will use self-signed cert) +# 3. Browser will warn about self-signed cert - accept it +# 4. 
Microphone access will now work via HTTPS +# +# NOTE: If using Caddy, update docker-compose.yml webui build args: +# VITE_BACKEND_URL: "" (empty for same-origin through Caddy) +# +# For production, replace 'localhost' with your domain name and Caddy +# will automatically obtain Let's Encrypt certificates. + +localhost TAILSCALE_IP { + # Enable automatic HTTPS + tls internal + + # WebSocket endpoints - proxy to backend with upgrade support + handle /ws* { + reverse_proxy friend-backend:8000 { + # Caddy automatically handles WebSocket upgrades + header_up X-Real-IP {remote_host} + header_up X-Forwarded-For {remote_host} + header_up X-Forwarded-Proto {scheme} + } + } + + # API endpoints - proxy to backend + handle /api/* { + reverse_proxy friend-backend:8000 + } + + # Auth endpoints - proxy to backend + handle /auth/* { + reverse_proxy friend-backend:8000 + } + + # Health checks - proxy to backend + handle /health { + reverse_proxy friend-backend:8000 + } + + handle /readiness { + reverse_proxy friend-backend:8000 + } + + # Users endpoints - proxy to backend + handle /users/* { + reverse_proxy friend-backend:8000 + } + + # Audio files - proxy to backend + handle /audio/* { + reverse_proxy friend-backend:8000 + } + + # Everything else - proxy to webui + handle { + reverse_proxy webui:80 + } +} + +# Production configuration (uncomment and modify for your domain) +# yourdomain.com { +# # Caddy automatically obtains Let's Encrypt certificates +# +# # WebSocket endpoints +# handle /ws* { +# reverse_proxy friend-backend:8000 +# } +# +# # API endpoints +# handle /api/* { +# reverse_proxy friend-backend:8000 +# } +# +# # Auth endpoints +# handle /auth/* { +# reverse_proxy friend-backend:8000 +# } +# +# # Health checks +# handle /health { +# reverse_proxy friend-backend:8000 +# } +# +# handle /readiness { +# reverse_proxy friend-backend:8000 +# } +# +# # Users endpoints +# handle /users/* { +# reverse_proxy friend-backend:8000 +# } +# +# # Audio files +# handle /audio/* { +# 
reverse_proxy friend-backend:8000 +# } +# +# # Everything else - webui +# handle { +# reverse_proxy webui:80 +# } +# } diff --git a/backends/advanced/Dockerfile b/backends/advanced/Dockerfile index be3e1019..c8f54ac7 100644 --- a/backends/advanced/Dockerfile +++ b/backends/advanced/Dockerfile @@ -39,5 +39,10 @@ COPY memory_config.yaml* ./ COPY diarization_config.json* ./ -# Run the application -CMD ["uv", "run", "--extra", "deepgram", "python3", "src/advanced_omi_backend/main.py"] +# Copy and make startup scripts executable +COPY start.sh ./ +COPY start-workers.sh ./ +RUN chmod +x start.sh start-workers.sh + +# Run the application with workers +CMD ["./start.sh"] diff --git a/backends/advanced/Dockerfile.blackwell b/backends/advanced/Dockerfile.blackwell deleted file mode 100644 index 892541a9..00000000 --- a/backends/advanced/Dockerfile.blackwell +++ /dev/null @@ -1,33 +0,0 @@ -FROM python:3.12-slim-bookworm AS builder - -# Install system dependencies for building -RUN apt-get update && \ - apt-get install -y --no-install-recommends \ - build-essential \ - libsndfile1 \ - git \ - curl \ - ffmpeg \ - && rm -rf /var/lib/apt/lists/* - -# Install uv -COPY --from=ghcr.io/astral-sh/uv:0.6.10 /uv /uvx /bin/ - -# Set up the working directory -WORKDIR /app - -# Copy dependency files -COPY pyproject.blackwell.toml pyproject.toml -COPY README.md . - -# Install dependencies using uv -RUN --mount=type=cache,target=/root/.cache/uv \ - uv sync - - -# Copy application code -COPY . . - - -# Run the application -CMD ["uv", "run", "python3", "src/advanced_omi_backend/main.py"] diff --git a/backends/advanced/Dockerfile.k8s b/backends/advanced/Dockerfile.k8s index edfe62db..097f5d7f 100644 --- a/backends/advanced/Dockerfile.k8s +++ b/backends/advanced/Dockerfile.k8s @@ -36,9 +36,13 @@ COPY . . 
# Copy memory config (created by init.sh from template) COPY memory_config.yaml ./ +# Copy and make K8s startup scripts executable +COPY start-k8s.sh start-workers.sh ./ +RUN chmod +x start-k8s.sh start-workers.sh -# Run the application -# CMD ["uv", "run", "python3", "src/advanced_omi_backend/main.py"] +# Activate virtual environment in PATH +ENV PATH="/app/.venv/bin:$PATH" -# don't sync if deploying prebuilt image to k8s -CMD ["uv", "run", "--no-sync", "python3", "src/advanced_omi_backend/main.py"] +# Run the application with workers +# K8s startup script starts both FastAPI backend and RQ workers +CMD ["./start-k8s.sh"] diff --git a/backends/advanced/Docs/architecture.md b/backends/advanced/Docs/architecture.md index 67919ae9..8211cb32 100644 --- a/backends/advanced/Docs/architecture.md +++ b/backends/advanced/Docs/architecture.md @@ -1005,7 +1005,7 @@ src/advanced_omi_backend/ - `GET /api/conversations/{conversation_id}/versions` - Get version history - `POST /api/conversations/{conversation_id}/activate-transcript` - Switch transcript version - `POST /api/conversations/{conversation_id}/activate-memory` - Switch memory version -- `POST /api/process-audio-files` - Batch audio file processing +- `POST /api/audio/upload` - Batch audio file upload and processing - WebSocket `/ws_omi` - Real-time Opus audio streaming with Wyoming protocol (OMI devices) - WebSocket `/ws_pcm` - Real-time PCM audio streaming with Wyoming protocol (all apps) diff --git a/backends/advanced/Docs/quickstart.md b/backends/advanced/Docs/quickstart.md index 272388f5..523218bc 100644 --- a/backends/advanced/Docs/quickstart.md +++ b/backends/advanced/Docs/quickstart.md @@ -260,13 +260,13 @@ The system supports processing existing audio files through the file upload API. 
export USER_TOKEN="your-jwt-token" # Upload single WAV file -curl -X POST "http://localhost:8000/api/process-audio-files" \ +curl -X POST "http://localhost:8000/api/audio/upload" \ -H "Authorization: Bearer $USER_TOKEN" \ -F "files=@/path/to/audio.wav" \ -F "device_name=file_upload" # Upload multiple WAV files -curl -X POST "http://localhost:8000/api/process-audio-files" \ +curl -X POST "http://localhost:8000/api/audio/upload" \ -H "Authorization: Bearer $USER_TOKEN" \ -F "files=@/path/to/recording1.wav" \ -F "files=@/path/to/recording2.wav" \ diff --git a/backends/advanced/docker-compose-ci.yml b/backends/advanced/docker-compose-ci.yml new file mode 100644 index 00000000..79d7a2b0 --- /dev/null +++ b/backends/advanced/docker-compose-ci.yml @@ -0,0 +1,192 @@ +# docker-compose-ci.yml +# CI/CD environment for GitHub Actions +# Uses built image without source code mounts to ensure memory_config.yaml is included + +services: + friend-backend-test: + build: + context: . + dockerfile: Dockerfile + ports: + - "8001:8000" # Avoid conflict with dev on 8000 + volumes: + # No src mount for CI - use built image with all files included + - ./data/test_audio_chunks:/app/audio_chunks + - ./data/test_debug_dir:/app/debug_dir + - ./data/test_data:/app/data + environment: + # Override with test-specific settings + - MONGODB_URI=mongodb://mongo-test:27017/test_db + - QDRANT_BASE_URL=qdrant-test + - QDRANT_PORT=6333 + - REDIS_URL=redis://redis-test:6379/0 + - DEBUG_DIR=/app/debug_dir + # Import API keys from environment + - DEEPGRAM_API_KEY=${DEEPGRAM_API_KEY} + - OPENAI_API_KEY=${OPENAI_API_KEY} + # LLM provider configuration (required for memory service) + - LLM_PROVIDER=${LLM_PROVIDER:-openai} + - OPENAI_BASE_URL=${OPENAI_BASE_URL:-https://api.openai.com/v1} + - OPENAI_MODEL=${OPENAI_MODEL:-gpt-4o-mini} + # Authentication (test-specific) + - AUTH_SECRET_KEY=test-jwt-signing-key-for-integration-tests + - ADMIN_PASSWORD=test-admin-password-123 + - ADMIN_EMAIL=test-admin@example.com + 
# Transcription provider configuration + - TRANSCRIPTION_PROVIDER=${TRANSCRIPTION_PROVIDER:-deepgram} + # - PARAKEET_ASR_URL=${PARAKEET_ASR_URL} + # Memory provider configuration + - MEMORY_PROVIDER=${MEMORY_PROVIDER:-friend_lite} + - OPENMEMORY_MCP_URL=${OPENMEMORY_MCP_URL:-http://host.docker.internal:8765} + - OPENMEMORY_USER_ID=${OPENMEMORY_USER_ID:-openmemory} + # Speaker recognition is enabled for tests; set to true to disable it (e.g. to prevent segment duplication) + - DISABLE_SPEAKER_RECOGNITION=false + - SPEAKER_SERVICE_URL=https://localhost:8085 + - CORS_ORIGINS=http://localhost:3001,http://localhost:8001,https://localhost:3001,https://localhost:8001 + # Set low inactivity timeout for tests (2 seconds instead of 60) + - SPEECH_INACTIVITY_THRESHOLD_SECONDS=2 + # Wait for audio queue to drain before timing out (test mode) + - WAIT_FOR_AUDIO_QUEUE_DRAIN=true + depends_on: + qdrant-test: + condition: service_started + mongo-test: + condition: service_healthy + redis-test: + condition: service_started + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8000/readiness"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 30s + restart: unless-stopped + + webui-test: + build: + context: ./webui + dockerfile: Dockerfile + args: + - VITE_BACKEND_URL=http://localhost:8001 + - BACKEND_URL=http://localhost:8001 + volumes: + - ./webui/src:/app/src # Mount source code for easier development + ports: + - "3001:80" # Avoid conflict with dev on 3000 + depends_on: + friend-backend-test: + condition: service_healthy + mongo-test: + condition: service_healthy + qdrant-test: + condition: service_started + redis-test: + condition: service_started + + qdrant-test: + image: qdrant/qdrant:latest + ports: + - "6337:6333" # HTTP - avoid conflict with dev 6333 + - "6338:6334" # gRPC - avoid conflict with dev 6334 + volumes: + - ./data/test_qdrant_data:/qdrant/storage + + mongo-test: + image: mongo:8.0.14 + ports: + - "27018:27017" # Avoid conflict with dev on 27017 + volumes: + - 
./data/test_mongo_data:/data/db + # Use test database name to ensure isolation + command: mongod --dbpath /data/db --bind_ip_all + healthcheck: + test: ["CMD", "mongosh", "--eval", "db.runCommand('ping').ok", "--quiet"] + interval: 5s + timeout: 5s + retries: 10 + start_period: 10s + + redis-test: + image: redis:7-alpine + ports: + - "6380:6379" # Avoid conflict with dev on 6379 + volumes: + - ./data/test_redis_data:/data + command: redis-server --appendonly yes + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 5s + timeout: 3s + retries: 5 + + workers-test: + build: + context: . + dockerfile: Dockerfile + command: ./start-workers.sh + volumes: + # No src mount for CI - use built image + - ./data/test_audio_chunks:/app/audio_chunks + - ./data/test_debug_dir:/app/debug_dir + - ./data/test_data:/app/data + environment: + # Same environment as backend + - MONGODB_URI=mongodb://mongo-test:27017/test_db + - QDRANT_BASE_URL=qdrant-test + - QDRANT_PORT=6333 + - REDIS_URL=redis://redis-test:6379/0 + - DEBUG_DIR=/app/debug_dir + - DEEPGRAM_API_KEY=${DEEPGRAM_API_KEY} + - OPENAI_API_KEY=${OPENAI_API_KEY} + - LLM_PROVIDER=${LLM_PROVIDER:-openai} + - OPENAI_BASE_URL=${OPENAI_BASE_URL:-https://api.openai.com/v1} + - OPENAI_MODEL=${OPENAI_MODEL:-gpt-4o-mini} + - AUTH_SECRET_KEY=test-jwt-signing-key-for-integration-tests + - ADMIN_PASSWORD=test-admin-password-123 + - ADMIN_EMAIL=test-admin@example.com + - TRANSCRIPTION_PROVIDER=${TRANSCRIPTION_PROVIDER:-deepgram} + - MEMORY_PROVIDER=${MEMORY_PROVIDER:-friend_lite} + - OPENMEMORY_MCP_URL=${OPENMEMORY_MCP_URL:-http://host.docker.internal:8765} + - OPENMEMORY_USER_ID=${OPENMEMORY_USER_ID:-openmemory} + - DISABLE_SPEAKER_RECOGNITION=false + - SPEAKER_SERVICE_URL=https://localhost:8085 + # Set low inactivity timeout for tests (2 seconds instead of 60) + - SPEECH_INACTIVITY_THRESHOLD_SECONDS=2 + # Wait for audio queue to drain before timing out (test mode) + - WAIT_FOR_AUDIO_QUEUE_DRAIN=true + depends_on: + 
friend-backend-test: + condition: service_healthy + mongo-test: + condition: service_healthy + redis-test: + condition: service_started + qdrant-test: + condition: service_started + restart: unless-stopped + + # caddy: + # image: caddy:2-alpine + # ports: + # - "443:443" + # - "80:80" # HTTP redirect to HTTPS + # volumes: + # - ./Caddyfile-test:/etc/caddy/Caddyfile:ro + # - ./data/caddy_data:/data + # - ./data/caddy_config:/config + # depends_on: + # webui-test: + # condition: service_started + # friend-backend-test: + # condition: service_healthy + # restart: unless-stopped + +# CI Considerations (for future implementation): +# - GitHub Actions can run these services in isolated containers +# - Port conflicts won't exist in CI since each job runs in isolation +# - For CI, we could add: +# - --build flag for fresh builds +# - --force-recreate for clean state +# - Volume cleanup between test runs +# - Environment variables can be injected via GitHub secrets +# - Health checks ensure services are ready before tests run \ No newline at end of file diff --git a/backends/advanced/docker-compose-test.yml b/backends/advanced/docker-compose-test.yml index c491d89b..029d0238 100644 --- a/backends/advanced/docker-compose-test.yml +++ b/backends/advanced/docker-compose-test.yml @@ -10,6 +10,7 @@ services: ports: - "8001:8000" # Avoid conflict with dev on 8000 volumes: + - ./src:/app/src # Mount source code for easier development - ./data/test_audio_chunks:/app/audio_chunks - ./data/test_debug_dir:/app/debug_dir - ./data/test_data:/app/data @@ -18,6 +19,7 @@ services: - MONGODB_URI=mongodb://mongo-test:27017/test_db - QDRANT_BASE_URL=qdrant-test - QDRANT_PORT=6333 + - REDIS_URL=redis://redis-test:6379/0 - DEBUG_DIR=/app/debug_dir # Import API keys from environment - DEEPGRAM_API_KEY=${DEEPGRAM_API_KEY} @@ -37,18 +39,26 @@ services: - OPENMEMORY_MCP_URL=${OPENMEMORY_MCP_URL:-http://host.docker.internal:8765} - OPENMEMORY_USER_ID=${OPENMEMORY_USER_ID:-openmemory} # Disable 
speaker recognition in test environment to prevent segment duplication - - DISABLE_SPEAKER_RECOGNITION=true + - DISABLE_SPEAKER_RECOGNITION=false + - SPEAKER_SERVICE_URL=https://localhost:8085 + - CORS_ORIGINS=http://localhost:3001,http://localhost:8001,https://localhost:3001,https://localhost:8001 + # Set low inactivity timeout for tests (2 seconds instead of 60) + - SPEECH_INACTIVITY_THRESHOLD_SECONDS=2 + # Wait for audio queue to drain before timing out (test mode) + - WAIT_FOR_AUDIO_QUEUE_DRAIN=true depends_on: qdrant-test: condition: service_started mongo-test: condition: service_healthy + redis-test: + condition: service_started healthcheck: test: ["CMD", "curl", "-f", "http://localhost:8000/readiness"] interval: 10s timeout: 5s retries: 5 - start_period: 5s + start_period: 30s restart: unless-stopped webui-test: @@ -57,6 +67,9 @@ services: dockerfile: Dockerfile args: - VITE_BACKEND_URL=http://localhost:8001 + - BACKEND_URL=http://localhost:8001 + volumes: + - ./webui/src:/app/src # Mount source code for easier development ports: - "3001:80" # Avoid conflict with dev on 3000 depends_on: @@ -66,6 +79,8 @@ services: condition: service_healthy qdrant-test: condition: service_started + redis-test: + condition: service_started qdrant-test: image: qdrant/qdrant:latest @@ -76,7 +91,7 @@ services: - ./data/test_qdrant_data:/qdrant/storage mongo-test: - image: mongo:4.4.18 + image: mongo:8.0.14 ports: - "27018:27017" # Avoid conflict with dev on 27017 volumes: @@ -84,12 +99,86 @@ services: # Use test database name to ensure isolation command: mongod --dbpath /data/db --bind_ip_all healthcheck: - test: ["CMD", "mongo", "--eval", "db.runCommand('ping').ok", "--quiet"] + test: ["CMD", "mongosh", "--eval", "db.runCommand('ping').ok", "--quiet"] interval: 5s timeout: 5s retries: 10 start_period: 10s + redis-test: + image: redis:7-alpine + ports: + - "6380:6379" # Avoid conflict with dev on 6379 + volumes: + - ./data/test_redis_data:/data + command: redis-server 
--appendonly yes + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 5s + timeout: 3s + retries: 5 + + workers-test: + build: + context: . + dockerfile: Dockerfile + command: ./start-workers.sh + volumes: + - ./src:/app/src + - ./data/test_audio_chunks:/app/audio_chunks + - ./data/test_debug_dir:/app/debug_dir + - ./data/test_data:/app/data + environment: + # Same environment as backend + - MONGODB_URI=mongodb://mongo-test:27017/test_db + - QDRANT_BASE_URL=qdrant-test + - QDRANT_PORT=6333 + - REDIS_URL=redis://redis-test:6379/0 + - DEBUG_DIR=/app/debug_dir + - DEEPGRAM_API_KEY=${DEEPGRAM_API_KEY} + - OPENAI_API_KEY=${OPENAI_API_KEY} + - LLM_PROVIDER=${LLM_PROVIDER:-openai} + - OPENAI_MODEL=${OPENAI_MODEL:-gpt-4o-mini} + - AUTH_SECRET_KEY=test-jwt-signing-key-for-integration-tests + - ADMIN_PASSWORD=test-admin-password-123 + - ADMIN_EMAIL=test-admin@example.com + - TRANSCRIPTION_PROVIDER=${TRANSCRIPTION_PROVIDER:-deepgram} + - MEMORY_PROVIDER=${MEMORY_PROVIDER:-friend_lite} + - OPENMEMORY_MCP_URL=${OPENMEMORY_MCP_URL:-http://host.docker.internal:8765} + - OPENMEMORY_USER_ID=${OPENMEMORY_USER_ID:-openmemory} + - DISABLE_SPEAKER_RECOGNITION=false + - SPEAKER_SERVICE_URL=https://localhost:8085 + # Set low inactivity timeout for tests (2 seconds instead of 60) + - SPEECH_INACTIVITY_THRESHOLD_SECONDS=2 + # Wait for audio queue to drain before timing out (test mode) + - WAIT_FOR_AUDIO_QUEUE_DRAIN=true + depends_on: + friend-backend-test: + condition: service_healthy + mongo-test: + condition: service_healthy + redis-test: + condition: service_started + qdrant-test: + condition: service_started + restart: unless-stopped + + # caddy: + # image: caddy:2-alpine + # ports: + # - "443:443" + # - "80:80" # HTTP redirect to HTTPS + # volumes: + # - ./Caddyfile-test:/etc/caddy/Caddyfile:ro + # - ./data/caddy_data:/data + # - ./data/caddy_config:/config + # depends_on: + # webui-test: + # condition: service_started + # friend-backend-test: + # condition: service_healthy 
+ # restart: unless-stopped + # CI Considerations (for future implementation): # - GitHub Actions can run these services in isolated containers # - Port conflicts won't exist in CI since each job runs in isolation diff --git a/backends/advanced/docker-compose.yml b/backends/advanced/docker-compose.yml index 00718452..d9d58dca 100644 --- a/backends/advanced/docker-compose.yml +++ b/backends/advanced/docker-compose.yml @@ -15,7 +15,12 @@ services: environment: - DEEPGRAM_API_KEY=${DEEPGRAM_API_KEY} - MISTRAL_API_KEY=${MISTRAL_API_KEY} + - MISTRAL_MODEL=${MISTRAL_MODEL} - TRANSCRIPTION_PROVIDER=${TRANSCRIPTION_PROVIDER} + - PARAKEET_ASR_URL=${PARAKEET_ASR_URL} + - OFFLINE_ASR_TCP_URI=${OFFLINE_ASR_TCP_URI} + - OLLAMA_BASE_URL=${OLLAMA_BASE_URL} + - HF_TOKEN=${HF_TOKEN} - SPEAKER_SERVICE_URL=${SPEAKER_SERVICE_URL} - ADMIN_PASSWORD=${ADMIN_PASSWORD} - ADMIN_EMAIL=${ADMIN_EMAIL} @@ -24,30 +29,18 @@ services: - OPENAI_API_KEY=${OPENAI_API_KEY} - OPENAI_BASE_URL=${OPENAI_BASE_URL} - OPENAI_MODEL=${OPENAI_MODEL} - - CORS_ORIGINS=${CORS_ORIGINS} - # OpenMemory MCP configuration - - MEMORY_PROVIDER=${MEMORY_PROVIDER} - - OPENMEMORY_MCP_URL=${OPENMEMORY_MCP_URL:-http://host.docker.internal:8765} - - OPENMEMORY_CLIENT_NAME=${OPENMEMORY_CLIENT_NAME:-friend_lite} - - OPENMEMORY_USER_ID=${OPENMEMORY_USER_ID:-openmemory} - - OPENMEMORY_TIMEOUT=${OPENMEMORY_TIMEOUT:-30} - # Speech Detection (Speech-Driven Conversations Architecture) - - SPEECH_DETECTION_MIN_WORDS=${SPEECH_DETECTION_MIN_WORDS:-5} - - SPEECH_DETECTION_MIN_CONFIDENCE=${SPEECH_DETECTION_MIN_CONFIDENCE:-0.5} - # Conversation Stop (Automatic Conversation Closure) - - TRANSCRIPTION_BUFFER_SECONDS=${TRANSCRIPTION_BUFFER_SECONDS:-120} - - SPEECH_INACTIVITY_THRESHOLD_SECONDS=${SPEECH_INACTIVITY_THRESHOLD_SECONDS:-60} - - LANGFUSE_PUBLIC_KEY=${LANGFUSE_PUBLIC_KEY} - - LANGFUSE_SECRET_KEY=${LANGFUSE_SECRET_KEY} - - LANGFUSE_HOST=${LANGFUSE_HOST} - - LANGFUSE_ENABLE_TELEMETRY=${LANGFUSE_ENABLE_TELEMETRY} - + - 
NEO4J_HOST=${NEO4J_HOST} + - NEO4J_USER=${NEO4J_USER} + - NEO4J_PASSWORD=${NEO4J_PASSWORD} + - CORS_ORIGINS=http://localhost:3010,http://localhost:8000,http://192.168.1.153:3010,http://192.168.1.153:8000,https://localhost:3010,https://localhost:8000,https://100.105.225.45,https://localhost + - REDIS_URL=redis://redis:6379/0 depends_on: - # You may not want qdrant if you are using openmemory_mcp - # qdrant: - # condition: service_started - mongo: + qdrant: condition: service_started + mongo: + condition: service_healthy + redis: + condition: service_healthy # neo4j-mem0: # condition: service_started healthcheck: @@ -58,68 +51,132 @@ services: start_period: 5s restart: unless-stopped - # Development webui service (default) + # Unified Worker Container + # No CUDA needed for friend-backend and workers, workers only orchestrate jobs and call external services + # Runs all workers in a single container for efficiency: + # - 3 RQ workers (transcription, memory, default queues) + # - 1 Audio stream worker (Redis Streams consumer - must be single to maintain sequential chunks) + workers: + build: + context: . 
+ dockerfile: Dockerfile + command: ["./start-workers.sh"] + env_file: + - .env + volumes: + - ./src:/app/src + - ./start-workers.sh:/app/start-workers.sh + - ./data/audio_chunks:/app/audio_chunks + - ./data:/app/data + environment: + - DEEPGRAM_API_KEY=${DEEPGRAM_API_KEY} + - MISTRAL_API_KEY=${MISTRAL_API_KEY} + - MISTRAL_MODEL=${MISTRAL_MODEL} + - TRANSCRIPTION_PROVIDER=${TRANSCRIPTION_PROVIDER} + - OPENAI_API_KEY=${OPENAI_API_KEY} + - OPENAI_BASE_URL=${OPENAI_BASE_URL} + - OPENAI_MODEL=${OPENAI_MODEL} + - LLM_PROVIDER=${LLM_PROVIDER} + - REDIS_URL=redis://redis:6379/0 + depends_on: + redis: + condition: service_healthy + mongo: + condition: service_healthy + qdrant: + condition: service_started + restart: unless-stopped + webui: + build: + context: ./webui + dockerfile: Dockerfile + args: + # Direct access (http://localhost:3010): + # - VITE_BACKEND_URL=http://localhost:8000 + # - BACKEND_URL=http://localhost:8000 + # For Caddy HTTPS (https://localhost), use: + - VITE_BACKEND_URL= + - BACKEND_URL= + ports: + # - "${WEBUI_PORT:-3010}:80" + - 3010:80 + depends_on: + friend-backend: + condition: service_healthy + restart: unless-stopped + + # Caddy reverse proxy - provides HTTPS for microphone access + # Access at: https://localhost (accepts self-signed cert warning) + # Only starts when HTTPS is configured (Caddyfile exists) + caddy: + image: caddy:2-alpine + ports: + - "443:443" + - "80:80" # HTTP redirect to HTTPS + volumes: + - ./Caddyfile:/etc/caddy/Caddyfile:ro + - caddy_data:/data + - caddy_config:/config + depends_on: + friend-backend: + condition: service_healthy + restart: unless-stopped + profiles: + - https + + # Development webui service (use with docker-compose --profile dev up) + webui-dev: build: context: ./webui dockerfile: Dockerfile.dev ports: - - "${WEBUI_PORT:-5173}:5173" + - "${WEBUI_DEV_PORT:-5173}:5173" environment: - # Don't set VITE_BACKEND_URL - let frontend auto-detect based on access method - # - 
VITE_BACKEND_URL=http://${HOST_IP}:${BACKEND_PUBLIC_PORT:-8000} - - VITE_HMR_PORT=443 + - VITE_BACKEND_URL=http://${HOST_IP}:${BACKEND_PUBLIC_PORT:-8000} volumes: - ./webui/src:/app/src - ./webui/public:/app/public depends_on: friend-backend: condition: service_healthy - restart: unless-stopped + profiles: + - dev qdrant: image: qdrant/qdrant:latest ports: - - "6333:6333" # gRPC - - "6334:6334" # HTTP + - "6033:6033" # gRPC + - "6034:6034" # HTTP volumes: - ./data/qdrant_data:/qdrant/storage mongo: - image: mongo:4.4.18 + image: mongo:8.0.14 ports: - "27017:27017" volumes: - - ./data/mongo_data:/data/db - - # OpenMemory MCP Server - Use external server from extras/openmemory-mcp - # The Friend-Lite backend connects to the external OpenMemory MCP server - # running from extras/openmemory-mcp via host.docker.internal:8765 - # - # To start the external server: - # cd extras/openmemory-mcp && docker compose up -d + - mongo_data:/data/db + healthcheck: + test: ["CMD", "mongosh", "--quiet", "--eval", "db.adminCommand({ ping: 1 })"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 10s - # Nginx reverse proxy for HTTPS access - nginx: - image: nginx:alpine + redis: + image: redis:7-alpine ports: - - "443:443" - - "80:80" + - "6379:6379" # Avoid conflict with dev on 6379 volumes: - - ./nginx.conf:/etc/nginx/nginx.conf:ro - - ./ssl:/etc/nginx/ssl:ro - depends_on: - friend-backend: - condition: service_healthy - webui: - condition: service_started - restart: unless-stopped + - ./data/redis_data:/data + command: redis-server --appendonly yes healthcheck: - test: ["CMD", "curl", "-f", "-k", "https://localhost/health"] - interval: 30s - timeout: 10s - retries: 3 + test: ["CMD", "redis-cli", "ping"] + interval: 5s + timeout: 3s + retries: 5 ## Additional @@ -155,15 +212,32 @@ services: + # Use tailscale instead + # UNCOMMENT OUT FOR LOCAL DEMO - EXPOSES to internet + # ngrok: + # image: ngrok/ngrok:latest + # depends_on: [friend-backend, proxy] + # ports: + # - 
"4040:4040" # Ngrok web interface + # environment: + # - NGROK_AUTHTOKEN=${NGROK_AUTHTOKEN} + # command: "http proxy:80 --url=${NGROK_URL} --basic-auth=${NGROK_BASIC_AUTH}" +# Shared network for cross-project communication +networks: + default: + name: friend-network -# Question: These are named volumes, but they are not being used, right? Can we remove them? -# volumes: -# ollama_data: -# driver: local -# mongo_data: -# driver: local -# neo4j_data: -# driver: local -# neo4j_logs: -# driver: local +volumes: + ollama_data: + driver: local + mongo_data: + driver: local + caddy_data: + driver: local + caddy_config: + driver: local + neo4j_data: + driver: local + neo4j_logs: + driver: local diff --git a/backends/advanced/init.py b/backends/advanced/init.py index 35776430..667f5209 100644 --- a/backends/advanced/init.py +++ b/backends/advanced/init.py @@ -13,9 +13,9 @@ import sys from datetime import datetime from pathlib import Path -from typing import Any, Dict, Optional +from typing import Any, Dict -from rich import print as rprint +from dotenv import get_key, set_key from rich.console import Console from rich.panel import Panel from rich.prompt import Confirm, Prompt @@ -100,6 +100,26 @@ def backup_existing_env(self): shutil.copy2(env_path, backup_path) self.console.print(f"[blue][INFO][/blue] Backed up existing .env file to {backup_path}") + def read_existing_env_value(self, key: str) -> str: + """Read a value from existing .env file""" + env_path = Path(".env") + if not env_path.exists(): + return None + + value = get_key(str(env_path), key) + # get_key returns None if key doesn't exist or value is empty + return value if value else None + + def mask_api_key(self, key: str, show_chars: int = 5) -> str: + """Mask API key showing only first and last few characters""" + if not key or len(key) <= show_chars * 2: + return key + + # Remove quotes if present + key_clean = key.strip("'\"") + + return f"{key_clean[:show_chars]}{'*' * min(15, len(key_clean) - show_chars * 
2)}{key_clean[-show_chars:]}" + def setup_authentication(self): """Configure authentication settings""" self.print_section("Authentication Setup") @@ -128,8 +148,17 @@ def setup_transcription(self): if choice == "1": self.console.print("[blue][INFO][/blue] Deepgram selected") self.console.print("Get your API key from: https://console.deepgram.com/") - - api_key = self.prompt_value("Deepgram API key (leave empty to skip)", "") + + # Check for existing API key + existing_key = self.read_existing_env_value("DEEPGRAM_API_KEY") + if existing_key and existing_key not in ['your_deepgram_api_key_here', 'your-deepgram-key-here']: + masked_key = self.mask_api_key(existing_key) + prompt_text = f"Deepgram API key ({masked_key}) [press Enter to reuse, or enter new]" + api_key_input = self.prompt_value(prompt_text, "") + api_key = api_key_input if api_key_input else existing_key + else: + api_key = self.prompt_value("Deepgram API key (leave empty to skip)", "") + if api_key: self.config["TRANSCRIPTION_PROVIDER"] = "deepgram" self.config["DEEPGRAM_API_KEY"] = api_key @@ -141,10 +170,19 @@ def setup_transcription(self): self.config["TRANSCRIPTION_PROVIDER"] = "mistral" self.console.print("[blue][INFO][/blue] Mistral selected") self.console.print("Get your API key from: https://console.mistral.ai/") - - api_key = self.prompt_value("Mistral API key (leave empty to skip)", "") + + # Check for existing API key + existing_key = self.read_existing_env_value("MISTRAL_API_KEY") + if existing_key and existing_key not in ['your_mistral_api_key_here', 'your-mistral-key-here']: + masked_key = self.mask_api_key(existing_key) + prompt_text = f"Mistral API key ({masked_key}) [press Enter to reuse, or enter new]" + api_key_input = self.prompt_value(prompt_text, "") + api_key = api_key_input if api_key_input else existing_key + else: + api_key = self.prompt_value("Mistral API key (leave empty to skip)", "") + model = self.prompt_value("Mistral model", "voxtral-mini-2507") - + if api_key: 
self.config["MISTRAL_API_KEY"] = api_key self.config["MISTRAL_MODEL"] = model @@ -178,11 +216,20 @@ def setup_llm(self): self.config["LLM_PROVIDER"] = "openai" self.console.print("[blue][INFO][/blue] OpenAI selected") self.console.print("Get your API key from: https://platform.openai.com/api-keys") - - api_key = self.prompt_value("OpenAI API key (leave empty to skip)", "") + + # Check for existing API key + existing_key = self.read_existing_env_value("OPENAI_API_KEY") + if existing_key and existing_key not in ['your_openai_api_key_here', 'your-openai-key-here']: + masked_key = self.mask_api_key(existing_key) + prompt_text = f"OpenAI API key ({masked_key}) [press Enter to reuse, or enter new]" + api_key_input = self.prompt_value(prompt_text, "") + api_key = api_key_input if api_key_input else existing_key + else: + api_key = self.prompt_value("OpenAI API key (leave empty to skip)", "") + model = self.prompt_value("OpenAI model", "gpt-4o-mini") base_url = self.prompt_value("OpenAI base URL (for proxies/compatible APIs)", "https://api.openai.com/v1") - + if api_key: self.config["OPENAI_API_KEY"] = api_key self.config["OPENAI_MODEL"] = model @@ -196,12 +243,19 @@ def setup_llm(self): self.console.print("[blue][INFO][/blue] Ollama selected") base_url = self.prompt_value("Ollama server URL", "http://host.docker.internal:11434") + if not base_url.endswith("/v1"): + base_url = base_url.rstrip("/") + "/v1" + self.console.print(f"[blue][INFO][/blue] Automatically appending /v1 to Ollama URL: {base_url}") + model = self.prompt_value("Ollama model", "llama3.2") + embedder_model = self.prompt_value("Ollama embedder model", "nomic-embed-text:latest") + self.config["OLLAMA_BASE_URL"] = base_url self.config["OLLAMA_MODEL"] = model + self.config["OLLAMA_EMBEDDER_MODEL"] = embedder_model self.console.print("[green][SUCCESS][/green] Ollama configured") - self.console.print("[yellow][WARNING][/yellow] Make sure Ollama is running and the model is pulled") + 
self.console.print("[yellow][WARNING][/yellow] Make sure Ollama is running and all required models (LLM and embedder) are pulled") elif choice == "3": self.console.print("[blue][INFO][/blue] Skipping LLM setup - memory extraction disabled") @@ -336,67 +390,70 @@ def setup_https(self): self.console.print(f"[yellow][WARNING][/yellow] nginx.conf generation failed: {e}") else: self.console.print("[yellow][WARNING][/yellow] nginx.conf.template not found") + + # Generate Caddyfile from template + self.console.print("[blue][INFO][/blue] Creating Caddyfile configuration...") + caddyfile_template = script_dir / "Caddyfile.template" + caddyfile_path = script_dir / "Caddyfile" + + if caddyfile_template.exists(): + try: + # Check if Caddyfile exists as a directory (common issue) + if caddyfile_path.exists() and caddyfile_path.is_dir(): + self.console.print("[red]❌ ERROR: 'Caddyfile' exists as a directory![/red]") + self.console.print("[yellow] Please remove it manually:[/yellow]") + self.console.print(f"[yellow] rm -rf {caddyfile_path}[/yellow]") + self.console.print("[red] HTTPS will NOT work without a proper Caddyfile![/red]") + self.config["HTTPS_ENABLED"] = "false" + else: + with open(caddyfile_template, 'r') as f: + caddyfile_content = f.read() + + # Replace TAILSCALE_IP with server_ip + caddyfile_content = caddyfile_content.replace('TAILSCALE_IP', server_ip) + + with open(caddyfile_path, 'w') as f: + f.write(caddyfile_content) + + self.console.print(f"[green][SUCCESS][/green] Caddyfile created for: {server_ip}") + + except Exception as e: + self.console.print(f"[red]❌ ERROR: Caddyfile generation failed: {e}[/red]") + self.console.print("[red] HTTPS will NOT work without a proper Caddyfile![/red]") + self.config["HTTPS_ENABLED"] = "false" + else: + self.console.print("[red]❌ ERROR: Caddyfile.template not found[/red]") + self.console.print("[red] HTTPS will NOT work without a proper Caddyfile![/red]") + self.config["HTTPS_ENABLED"] = "false" else: 
self.config["HTTPS_ENABLED"] = "false" def generate_env_file(self): - """Generate the .env file from configuration""" - env_content = f"""# ============================================================================= -# Friend-Lite Advanced Backend Configuration -# Generated on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} -# ============================================================================= - -# Authentication -AUTH_SECRET_KEY={self.config.get('AUTH_SECRET_KEY', '')} -ADMIN_EMAIL={self.config.get('ADMIN_EMAIL', '')} -ADMIN_PASSWORD={self.config.get('ADMIN_PASSWORD', '')} - -# Transcription Provider -TRANSCRIPTION_PROVIDER={self.config.get('TRANSCRIPTION_PROVIDER', '')} -DEEPGRAM_API_KEY={self.config.get('DEEPGRAM_API_KEY', '')} -MISTRAL_API_KEY={self.config.get('MISTRAL_API_KEY', '')} -MISTRAL_MODEL={self.config.get('MISTRAL_MODEL', '')} - -# LLM Provider -LLM_PROVIDER={self.config.get('LLM_PROVIDER', '')} -OPENAI_API_KEY={self.config.get('OPENAI_API_KEY', '')} -OPENAI_MODEL={self.config.get('OPENAI_MODEL', '')} -OPENAI_BASE_URL={self.config.get('OPENAI_BASE_URL', '')} -OLLAMA_BASE_URL={self.config.get('OLLAMA_BASE_URL', '')} -OLLAMA_MODEL={self.config.get('OLLAMA_MODEL', '')} -# Memory Provider -MEMORY_PROVIDER={self.config.get('MEMORY_PROVIDER', 'friend_lite')} -QDRANT_BASE_URL={self.config.get('QDRANT_BASE_URL', 'qdrant')} -OPENMEMORY_MCP_URL={self.config.get('OPENMEMORY_MCP_URL', '')} -OPENMEMORY_CLIENT_NAME={self.config.get('OPENMEMORY_CLIENT_NAME', '')} -OPENMEMORY_USER_ID={self.config.get('OPENMEMORY_USER_ID', '')} - -# Optional Services -SPEAKER_SERVICE_URL={self.config.get('SPEAKER_SERVICE_URL', '')} -PARAKEET_ASR_URL={self.config.get('PARAKEET_ASR_URL', '')} - -# Network Configuration -BACKEND_PUBLIC_PORT={self.config.get('BACKEND_PUBLIC_PORT', '8000')} -WEBUI_PORT={self.config.get('WEBUI_PORT', '5173')} - -# Database -MONGODB_URI=mongodb://mongo:27017 -DATABASE_NAME=friend_db - -# CORS (supports Tailscale IPs automatically) 
-CORS_ORIGINS=http://localhost:3000,http://localhost:5173 - -# Logging -LOG_LEVEL=INFO -""" - - # Create .env file with secure permissions (owner read/write only) - env_path = ".env" - fd = os.open(env_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, mode=0o600) - with os.fdopen(fd, 'w') as f: - f.write(env_content) - - self.console.print("[green][SUCCESS][/green] .env file created successfully with secure permissions") + """Generate .env file from template and update with configuration""" + env_path = Path(".env") + env_template = Path(".env.template") + + # Backup existing .env if it exists + self.backup_existing_env() + + # Copy template to .env + if env_template.exists(): + shutil.copy2(env_template, env_path) + self.console.print("[blue][INFO][/blue] Copied .env.template to .env") + else: + self.console.print("[yellow][WARNING][/yellow] .env.template not found, creating new .env") + env_path.touch(mode=0o600) + + # Update configured values using set_key + env_path_str = str(env_path) + for key, value in self.config.items(): + if value: # Only set non-empty values + set_key(env_path_str, key, value) + + # Ensure secure permissions + os.chmod(env_path, 0o600) + + self.console.print("[green][SUCCESS][/green] .env file configured successfully with secure permissions") def copy_config_templates(self): """Copy other configuration files""" diff --git a/backends/advanced/nginx.conf b/backends/advanced/nginx.conf deleted file mode 100644 index 12c811e7..00000000 --- a/backends/advanced/nginx.conf +++ /dev/null @@ -1,221 +0,0 @@ -worker_processes 1; - -events { - worker_connections 1024; -} - -http { - # Basic settings - sendfile on; - tcp_nopush on; - tcp_nodelay on; - keepalive_timeout 65; - types_hash_max_size 2048; - client_max_body_size 100M; - - # MIME types - include /etc/nginx/mime.types; - default_type application/octet-stream; - - # Logging - access_log /var/log/nginx/access.log; - error_log /var/log/nginx/error.log; - - # Gzip compression - gzip on; - gzip_vary 
on; - gzip_min_length 10240; - gzip_proxied expired no-cache no-store private auth; - gzip_types - text/plain - text/css - text/xml - text/javascript - application/x-javascript - application/xml+rss - application/javascript - application/json; - - # WebSocket proxy settings - map $http_upgrade $connection_upgrade { - default upgrade; - '' close; - } - - # Upstream services - upstream friend_backend { - server friend-backend:8000; - } - - upstream friend_webui { - server webui:5173; - } - - # HTTPS Server - server { - listen 443 ssl http2; - server_name localhost 100.83.66.30; - - # SSL Configuration - ssl_certificate /etc/nginx/ssl/server.crt; - ssl_certificate_key /etc/nginx/ssl/server.key; - ssl_protocols TLSv1.2 TLSv1.3; - ssl_ciphers ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384; - ssl_prefer_server_ciphers off; - - # Security headers - add_header X-Frame-Options DENY; - add_header X-Content-Type-Options nosniff; - add_header X-XSS-Protection "1; mode=block"; - - # Backend API endpoints - location /api/ { - proxy_pass http://friend_backend/api/; - proxy_set_header Host $host; - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - proxy_set_header X-Forwarded-Proto $scheme; - proxy_redirect off; - } - - # Authentication endpoints - location /auth/ { - proxy_pass http://friend_backend/auth/; - proxy_set_header Host $host; - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - proxy_set_header X-Forwarded-Proto $scheme; - proxy_redirect off; - } - - # Users endpoints - location /users/ { - proxy_pass http://friend_backend/users/; - proxy_set_header Host $host; - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - proxy_set_header X-Forwarded-Proto $scheme; - proxy_redirect off; - } - - # WebSocket endpoints for audio streaming - location /ws_pcm { - proxy_pass http://friend_backend/ws_pcm; - 
proxy_http_version 1.1; - proxy_set_header Upgrade $http_upgrade; - proxy_set_header Connection $connection_upgrade; - proxy_set_header Host $host; - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - proxy_set_header X-Forwarded-Proto $scheme; - proxy_cache_bypass $http_upgrade; - proxy_read_timeout 86400; - proxy_send_timeout 86400; - proxy_connect_timeout 60s; - proxy_buffering off; - proxy_request_buffering off; - } - - location /ws_omi { - proxy_pass http://friend_backend/ws_omi; - proxy_http_version 1.1; - proxy_set_header Upgrade $http_upgrade; - proxy_set_header Connection $connection_upgrade; - proxy_set_header Host $host; - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - proxy_set_header X-Forwarded-Proto $scheme; - proxy_cache_bypass $http_upgrade; - proxy_read_timeout 86400; - proxy_send_timeout 86400; - proxy_connect_timeout 60s; - proxy_buffering off; - proxy_request_buffering off; - } - - # Legacy WebSocket endpoint - location /ws { - proxy_pass http://friend_backend/ws; - proxy_http_version 1.1; - proxy_set_header Upgrade $http_upgrade; - proxy_set_header Connection $connection_upgrade; - proxy_set_header Host $host; - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - proxy_set_header X-Forwarded-Proto $scheme; - proxy_cache_bypass $http_upgrade; - proxy_read_timeout 86400; - proxy_send_timeout 86400; - proxy_connect_timeout 60s; - proxy_buffering off; - proxy_request_buffering off; - } - - # Health check endpoints - location /health { - proxy_pass http://friend_backend/health; - proxy_set_header Host $host; - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - proxy_set_header X-Forwarded-Proto $scheme; - } - - # Readiness check endpoint - location /readiness { - proxy_pass http://friend_backend/readiness; - proxy_set_header Host 
$host; - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - proxy_set_header X-Forwarded-Proto $scheme; - } - - # Audio file serving - location /audio/ { - proxy_pass http://friend_backend/audio/; - proxy_set_header Host $host; - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - proxy_set_header X-Forwarded-Proto $scheme; - proxy_redirect off; - - # Add headers for audio file serving - proxy_set_header Accept-Ranges bytes; - proxy_cache_bypass $http_range; - } - - # Vite HMR WebSocket (specific path) - location /@vite/client { - proxy_pass http://friend_webui/@vite/client; - proxy_set_header Host $host; - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - proxy_set_header X-Forwarded-Proto $scheme; - proxy_http_version 1.1; - proxy_set_header Upgrade $http_upgrade; - proxy_set_header Connection $connection_upgrade; - proxy_cache_bypass $http_upgrade; - } - - # Frontend Vite dev server (with HMR support) - location / { - proxy_pass http://friend_webui/; - proxy_set_header Host $host; - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - proxy_set_header X-Forwarded-Proto $scheme; - proxy_redirect off; - - # Handle WebSocket upgrade for Vite HMR - proxy_http_version 1.1; - proxy_set_header Upgrade $http_upgrade; - proxy_set_header Connection $connection_upgrade; - } - } - - # HTTP redirect to HTTPS - server { - listen 80; - server_name localhost 100.83.66.30; - return 301 https://$host$request_uri; - } -} \ No newline at end of file diff --git a/backends/advanced/pyproject.blackwell.toml b/backends/advanced/pyproject.blackwell.toml deleted file mode 100644 index 2f661dea..00000000 --- a/backends/advanced/pyproject.blackwell.toml +++ /dev/null @@ -1,53 +0,0 @@ -[project] -name = "advanced-omi-backend" -version = "0.1.0" -description = "Add your 
description here" -readme = "README.md" -requires-python = ">=3.12" -dependencies = [ - "easy-audio-interfaces>=0.5.1", - "fastapi>=0.115.12", - "mem0ai>=0.1.111", - "motor>=3.7.1", - "ollama>=0.4.8", - "python-dotenv>=1.1.0", - "uvicorn>=0.34.2", - "wyoming>=1.6.1", - "aiohttp>=3.8.0", - "langfuse==3.3.0", - "spacy>=3.8.2", - "en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl", -] - -[dependency-groups] -deepgram = [ - "deepgram-sdk>=4.0.0", -] -dev = [ - "black>=25.1.0", - "isort>=6.0.1", -] -tests = [ - "pytest>=8.4.1", - "pytest-asyncio>=1.0.0", -] - - -[tool.isort] -profile = "black" - -[tool.uv.sources] -useful-moonshine-onnx = { git = "https://github.com/usefulsensors/moonshine.git", subdirectory = "moonshine-onnx" } -torchaudio = { index = "pytorch-cu128", marker = "sys_platform != 'darwin'" } -torchvision = { index = "pytorch-cu128", marker = "sys_platform != 'darwin'" } -torch = [ - { index = "pytorch-cu128", marker = "sys_platform != 'darwin'" }, -] - -[[tool.uv.index]] -name = "pypi" -url = "https://pypi.org/simple" - -[[tool.uv.index]] -name = "pytorch-cu128" -url = "https://download.pytorch.org/whl/cu128" diff --git a/backends/advanced/pyproject.toml b/backends/advanced/pyproject.toml index c355509f..5af2ec2e 100644 --- a/backends/advanced/pyproject.toml +++ b/backends/advanced/pyproject.toml @@ -7,6 +7,7 @@ requires-python = ">=3.12" dependencies = [ "easy-audio-interfaces>=0.7.1", # we need to add local-audio for scripts/local-audio.py | If we don't need that, we can remove this, and then remove portaudio19-dev from Dockerfile "fastapi>=0.115.12", + "fastmcp>=0.5.0", # MCP server for conversation access "mem0ai", # Using main branch with PR #3250 AsyncMemory fix "langchain_neo4j", "motor>=3.7.1", @@ -21,6 +22,10 @@ dependencies = [ "langfuse>=3.3.0", "spacy>=3.8.2", "en-core-web-sm @ 
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl", + "redis>=5.0.0", + "rq>=1.16.0", + "soundfile>=0.12.1", + "websockets>=12.0", ] [project.optional-dependencies] @@ -48,9 +53,43 @@ line-length = 100 [tool.uv.sources] mem0ai = { git = "https://github.com/AnkushMalaker/mem0.git", rev = "async-client-unbound-var-fix" } +[tool.poetry.dependencies] +robotframework = "^6.1.1" + [tool.pytest.ini_options] +minversion = "8.0" +testpaths = ["tests"] +python_files = ["test_*.py", "*_test.py"] +python_classes = ["Test*", "*Tests"] +python_functions = ["test_*"] +addopts = [ + "-ra", + "--strict-markers", + "--strict-config", + "--spec", + "-v", + "--tb=long", + "--color=yes", + "--durations=10", + "--showlocals", + "--capture=no", +] markers = [ - "integration: marks tests as integration tests", + "integration: marks tests as integration tests (may be slow)", + "unit: marks tests as unit tests (fast, isolated)", + "smoke: marks tests as smoke tests (quick validation)", + "slow: marks tests as slow running tests", + "api: marks tests that test API endpoints", + "memory: marks tests that test memory functionality", + "transcription: marks tests that test transcription functionality", + "auth: marks tests that test authentication", + "database: marks tests that require database access", +] +filterwarnings = [ + "error", + "ignore::UserWarning", + "ignore::DeprecationWarning", + "ignore::PendingDeprecationWarning", ] [dependency-groups] @@ -63,4 +102,11 @@ dev = [ test = [ "pytest>=8.4.1", "pytest-asyncio>=1.0.0", + "pytest-spec>=3.2.0", + "pytest-cov>=6.0.0", + "pytest-xdist>=3.6.0", + "pytest-mock>=3.14.0", + "requests-mock>=1.12.1", + "pytest-json-report>=1.5.0", + "pytest-html>=4.0.0", ] diff --git a/backends/advanced/setup-requirements.txt b/backends/advanced/setup-requirements.txt index 89d23f8d..fe569286 100644 --- a/backends/advanced/setup-requirements.txt +++ b/backends/advanced/setup-requirements.txt 
@@ -1,3 +1,4 @@ # Dependencies for interactive setup script rich>=13.0.0 -pyyaml>=6.0.0 \ No newline at end of file +pyyaml>=6.0.0 +python-dotenv \ No newline at end of file diff --git a/backends/advanced/src/advanced_omi_backend/__init__.py b/backends/advanced/src/advanced_omi_backend/__init__.py index 30235e86..8eb09ac9 100644 --- a/backends/advanced/src/advanced_omi_backend/__init__.py +++ b/backends/advanced/src/advanced_omi_backend/__init__.py @@ -2,6 +2,4 @@ __version__ = "0.1.0" -from .database import AudioChunksRepository - -__all__ = ["AudioChunksRepository"] +__all__ = [] diff --git a/backends/advanced/src/advanced_omi_backend/app_config.py b/backends/advanced/src/advanced_omi_backend/app_config.py new file mode 100644 index 00000000..4caa70c5 --- /dev/null +++ b/backends/advanced/src/advanced_omi_backend/app_config.py @@ -0,0 +1,117 @@ +""" +Application configuration for Friend-Lite backend. + +Centralizes all application-level configuration including database connections, +service configurations, and environment variables that were previously in main.py. 
+""" + +import logging +import os +from pathlib import Path +from typing import Optional + +from dotenv import load_dotenv +from motor.motor_asyncio import AsyncIOMotorClient + +from advanced_omi_backend.constants import OMI_CHANNELS, OMI_SAMPLE_RATE, OMI_SAMPLE_WIDTH +from advanced_omi_backend.services.transcription import get_transcription_provider + +# Load environment variables +load_dotenv() + +logger = logging.getLogger(__name__) + + +class AppConfig: + """Centralized application configuration.""" + + def __init__(self): + # MongoDB Configuration + self.mongodb_uri = os.getenv("MONGODB_URI", "mongodb://mongo:27017") + self.mongo_client = AsyncIOMotorClient(self.mongodb_uri) + self.db = self.mongo_client.get_default_database("friend-lite") + self.users_col = self.db["users"] + self.speakers_col = self.db["speakers"] + + # Audio Configuration + self.segment_seconds = 60 # length of each stored chunk + self.target_samples = OMI_SAMPLE_RATE * self.segment_seconds + self.audio_chunk_dir = Path("./audio_chunks") + self.audio_chunk_dir.mkdir(parents=True, exist_ok=True) + + # Conversation timeout configuration + self.new_conversation_timeout_minutes = float( + os.getenv("NEW_CONVERSATION_TIMEOUT_MINUTES", "1.5") + ) + + # Audio cropping configuration + self.audio_cropping_enabled = os.getenv("AUDIO_CROPPING_ENABLED", "true").lower() == "true" + self.min_speech_segment_duration = float(os.getenv("MIN_SPEECH_SEGMENT_DURATION", "1.0")) + self.cropping_context_padding = float(os.getenv("CROPPING_CONTEXT_PADDING", "0.1")) + + # Transcription Configuration + self.transcription_provider_name = os.getenv("TRANSCRIPTION_PROVIDER") + self.deepgram_api_key = os.getenv("DEEPGRAM_API_KEY") + self.mistral_api_key = os.getenv("MISTRAL_API_KEY") + + # Get configured transcription provider + self.transcription_provider = get_transcription_provider(self.transcription_provider_name) + if self.transcription_provider: + logger.info( + f"✅ Using {self.transcription_provider.name} 
transcription provider ({self.transcription_provider.mode})" + ) + else: + logger.warning("⚠️ No transcription provider configured - speech-to-text will not be available") + + # External Services Configuration + self.qdrant_base_url = os.getenv("QDRANT_BASE_URL", "qdrant") + self.qdrant_port = os.getenv("QDRANT_PORT", "6333") + self.memory_provider = os.getenv("MEMORY_PROVIDER", "friend_lite").lower() + + # Redis Configuration + self.redis_url = os.getenv("REDIS_URL", "redis://localhost:6379/0") + + # CORS Configuration + default_origins = "http://localhost:3000,http://localhost:3001,http://127.0.0.1:3000,http://127.0.0.1:3002" + self.cors_origins = os.getenv("CORS_ORIGINS", default_origins) + self.allowed_origins = [origin.strip() for origin in self.cors_origins.split(",") if origin.strip()] + + # Tailscale support + self.tailscale_regex = r"http://100\.\d{1,3}\.\d{1,3}\.\d{1,3}:3000" + + # Thread pool configuration + self.max_workers = os.cpu_count() or 4 + + # Memory service configuration + self.memory_service_supports_threshold = self.memory_provider == "friend_lite" + + +# Global configuration instance +app_config = AppConfig() + + +def get_app_config() -> AppConfig: + """Get the global application configuration instance.""" + return app_config + + +def get_audio_chunk_dir() -> Path: + """Get the audio chunk directory.""" + return app_config.audio_chunk_dir + + +def get_mongo_collections(): + """Get MongoDB collections.""" + return { + 'users': app_config.users_col, + 'speakers': app_config.speakers_col, + } + + +def get_redis_config(): + """Get Redis configuration.""" + return { + 'url': app_config.redis_url, + 'encoding': "utf-8", + 'decode_responses': False + } \ No newline at end of file diff --git a/backends/advanced/src/advanced_omi_backend/app_factory.py b/backends/advanced/src/advanced_omi_backend/app_factory.py new file mode 100644 index 00000000..52a48093 --- /dev/null +++ b/backends/advanced/src/advanced_omi_backend/app_factory.py @@ -0,0 +1,207 @@ 
+""" +Application factory for Friend-Lite backend. + +Creates and configures the FastAPI application with all routers, middleware, +and service initializations. +""" + +import asyncio +import logging +from contextlib import asynccontextmanager +from pathlib import Path + +import redis.asyncio as redis +from fastapi import FastAPI +from fastapi.staticfiles import StaticFiles + +from advanced_omi_backend.app_config import get_app_config +from advanced_omi_backend.auth import ( + bearer_backend, + cookie_backend, + create_admin_user_if_needed, + current_superuser, + fastapi_users, + websocket_auth, +) +from advanced_omi_backend.users import ( + User, + UserRead, + UserUpdate, + register_client_to_user, +) +from advanced_omi_backend.client_manager import get_client_manager +from advanced_omi_backend.memory import get_memory_service, shutdown_memory_service +from advanced_omi_backend.middleware.app_middleware import setup_middleware +from advanced_omi_backend.routers.api_router import router as api_router +from advanced_omi_backend.routers.modules.health_routes import router as health_router +from advanced_omi_backend.routers.modules.websocket_routes import router as websocket_router +from advanced_omi_backend.services.audio_service import get_audio_stream_service +from advanced_omi_backend.task_manager import init_task_manager, get_task_manager + +logger = logging.getLogger(__name__) +application_logger = logging.getLogger("audio_processing") + + +@asynccontextmanager +async def lifespan(app: FastAPI): + """Manage application lifespan events.""" + config = get_app_config() + + # Startup + application_logger.info("Starting application...") + + # Initialize Beanie for all document models + try: + from beanie import init_beanie + from advanced_omi_backend.models.conversation import Conversation + from advanced_omi_backend.models.audio_file import AudioFile + from advanced_omi_backend.models.user import User + + await init_beanie( + database=config.db, + 
document_models=[User, Conversation, AudioFile], + ) + application_logger.info("Beanie initialized for all document models") + except Exception as e: + application_logger.error(f"Failed to initialize Beanie: {e}") + raise + + # Create admin user if needed + try: + await create_admin_user_if_needed() + except Exception as e: + application_logger.error(f"Failed to create admin user: {e}") + # Don't raise here as this is not critical for startup + + + # Initialize Redis connection for RQ + try: + from advanced_omi_backend.controllers.queue_controller import redis_conn + redis_conn.ping() + application_logger.info("Redis connection established for RQ") + application_logger.info("RQ workers can be started with: rq worker transcription memory default") + except Exception as e: + application_logger.error(f"Failed to connect to Redis for RQ: {e}") + application_logger.warning("RQ queue system will not be available - check Redis connection") + + # Initialize audio stream service for Redis Streams + try: + audio_service = get_audio_stream_service() + await audio_service.connect() + application_logger.info("Audio stream service connected to Redis Streams") + application_logger.info("Audio stream workers can be started with: python -m advanced_omi_backend.workers.audio_stream_worker") + except Exception as e: + application_logger.error(f"Failed to connect audio stream service: {e}") + application_logger.warning("Redis Streams audio processing will not be available") + + # Initialize Redis client for audio streaming producer (used by WebSocket handlers) + try: + app.state.redis_audio_stream = await redis.from_url( + config.redis_url, + encoding="utf-8", + decode_responses=False + ) + from advanced_omi_backend.services.audio_stream import AudioStreamProducer + app.state.audio_stream_producer = AudioStreamProducer(app.state.redis_audio_stream) + application_logger.info("✅ Redis client for audio streaming producer initialized") + except Exception as e: + 
application_logger.error(f"Failed to initialize Redis client for audio streaming: {e}", exc_info=True) + application_logger.warning("Audio streaming producer will not be available") + + # Skip memory service pre-initialization to avoid blocking FastAPI startup + # Memory service will be lazily initialized when first used + application_logger.info("Memory service will be initialized on first use (lazy loading)") + + # SystemTracker is used for monitoring and debugging + application_logger.info("Using SystemTracker for monitoring and debugging") + + application_logger.info("Application ready - using application-level processing architecture.") + + logger.info("App ready") + try: + yield + finally: + # Shutdown + application_logger.info("Shutting down application...") + + # Clean up all active clients + client_manager = get_client_manager() + for client_id in client_manager.get_all_client_ids(): + try: + from advanced_omi_backend.controllers.websocket_controller import cleanup_client_state + await cleanup_client_state(client_id) + except Exception as e: + application_logger.error(f"Error cleaning up client {client_id}: {e}") + + # RQ workers shut down automatically when process ends + # No special cleanup needed for Redis connections + + # Shutdown audio stream service + try: + audio_service = get_audio_stream_service() + await audio_service.disconnect() + application_logger.info("Audio stream service disconnected") + except Exception as e: + application_logger.error(f"Error disconnecting audio stream service: {e}") + + # Close Redis client for audio streaming producer + try: + if hasattr(app.state, 'redis_audio_stream') and app.state.redis_audio_stream: + await app.state.redis_audio_stream.close() + application_logger.info("Redis client for audio streaming producer closed") + except Exception as e: + application_logger.error(f"Error closing Redis audio streaming client: {e}") + + # Stop metrics collection and save final report + application_logger.info("Metrics 
collection stopped") + + # Shutdown memory service and speaker service + shutdown_memory_service() + application_logger.info("Memory and speaker services shut down.") + + application_logger.info("Shutdown complete.") + + +def create_app() -> FastAPI: + """Create and configure the FastAPI application.""" + # Create FastAPI application with lifespan management + app = FastAPI(lifespan=lifespan) + + # Set up middleware (CORS, exception handlers) + setup_middleware(app) + + # Include all routers + app.include_router(api_router) + + # Add health check router at root level (not under /api prefix) + app.include_router(health_router) + + # Add WebSocket router at root level (not under /api prefix) + app.include_router(websocket_router) + + # Add authentication routers + app.include_router( + fastapi_users.get_auth_router(cookie_backend), + prefix="/auth/cookie", + tags=["auth"], + ) + app.include_router( + fastapi_users.get_auth_router(bearer_backend), + prefix="/auth/jwt", + tags=["auth"], + ) + + # Add users router for /users/me and other user endpoints + app.include_router( + fastapi_users.get_users_router(UserRead, UserUpdate), + prefix="/users", + tags=["users"], + ) + + # Mount static files LAST (mounts are catch-all patterns) + CHUNK_DIR = Path("/app/audio_chunks") + app.mount("/audio", StaticFiles(directory=CHUNK_DIR), name="audio") + + logger.info("FastAPI application created with all routers and middleware configured") + + return app \ No newline at end of file diff --git a/backends/advanced/src/advanced_omi_backend/audio_utils.py b/backends/advanced/src/advanced_omi_backend/audio_utils.py deleted file mode 100644 index 2821d126..00000000 --- a/backends/advanced/src/advanced_omi_backend/audio_utils.py +++ /dev/null @@ -1,267 +0,0 @@ -############################################################################### -# AUDIO PROCESSING FUNCTIONS -############################################################################### - -import asyncio -import logging -import 
os -import time - -# Type import to avoid circular imports -from typing import TYPE_CHECKING, Optional - -from wyoming.audio import AudioChunk - -if TYPE_CHECKING: - from advanced_omi_backend.client import ClientState - from advanced_omi_backend.database import AudioChunksRepository - -logger = logging.getLogger(__name__) - -# Import constants from main.py (these are defined there) -MIN_SPEECH_SEGMENT_DURATION = float(os.getenv("MIN_SPEECH_SEGMENT_DURATION", "1.0")) # seconds -CROPPING_CONTEXT_PADDING = float(os.getenv("CROPPING_CONTEXT_PADDING", "0.1")) # seconds - - -async def process_audio_chunk( - audio_data: bytes, - client_id: str, - user_id: str, - user_email: str, - audio_format: dict, - client_state: Optional["ClientState"] = None -) -> None: - """Process a single audio chunk through the standard pipeline. - - This function encapsulates the common pattern used across all audio input sources: - 1. Create AudioChunk with format details - 2. Queue AudioProcessingItem to processor - 3. 
Update client state if provided - - Args: - audio_data: Raw audio bytes - client_id: Client identifier - user_id: User identifier - user_email: User email - audio_format: Dict containing {rate, width, channels, timestamp} - client_state: Optional ClientState for state updates - """ - - from advanced_omi_backend.processors import ( - AudioProcessingItem, - get_processor_manager, - ) - - # Extract format details - rate = audio_format.get("rate", 16000) - width = audio_format.get("width", 2) - channels = audio_format.get("channels", 1) - timestamp = audio_format.get("timestamp") - - # Use current time if no timestamp provided - if timestamp is None: - timestamp = int(time.time() * 1000) - - # Create AudioChunk with format details - chunk = AudioChunk( - audio=audio_data, - rate=rate, - width=width, - channels=channels, - timestamp=timestamp - ) - - # Create AudioProcessingItem and queue for processing - processor_manager = get_processor_manager() - processing_item = AudioProcessingItem( - client_id=client_id, - user_id=user_id, - user_email=user_email, - audio_chunk=chunk, - timestamp=timestamp - ) - - await processor_manager.queue_audio(processing_item) - - # Update client state if provided - if client_state is not None: - client_state.update_audio_received(chunk) - - -async def _process_audio_cropping_with_relative_timestamps( - original_path: str, - speech_segments: list[tuple[float, float]], - output_path: str, - audio_uuid: str, - chunk_repo: Optional['AudioChunksRepository'] = None, -) -> bool: - """ - Process audio cropping with automatic relative timestamp conversion. - This function handles both live processing and reprocessing scenarios. 
- """ - try: - # Convert absolute timestamps to relative timestamps - # Extract file start time from filename: timestamp_client_uuid.wav - filename = original_path.split("/")[-1] - logger.info(f"🕐 Parsing filename: {filename}") - filename_parts = filename.split("_") - if len(filename_parts) < 3: - logger.error( - f"Invalid filename format: {filename}. Expected format: timestamp_client_id_audio_uuid.wav" - ) - return False - - try: - file_start_timestamp = float(filename_parts[0]) - except ValueError as e: - logger.error(f"Cannot parse timestamp from filename {filename}: {e}") - return False - - # Convert speech segments to relative timestamps - relative_segments = [] - for start_abs, end_abs in speech_segments: - # Validate input timestamps - if start_abs >= end_abs: - logger.warning( - f"⚠️ Invalid speech segment: start={start_abs} >= end={end_abs}, skipping" - ) - continue - - start_rel = start_abs - file_start_timestamp - end_rel = end_abs - file_start_timestamp - - # Ensure relative timestamps are positive (sanity check) - if start_rel < 0: - logger.warning( - f"⚠️ Negative start timestamp: {start_rel} (absolute: {start_abs}, file_start: {file_start_timestamp}), clamping to 0.0" - ) - start_rel = 0.0 - if end_rel < 0: - logger.warning( - f"⚠️ Negative end timestamp: {end_rel} (absolute: {end_abs}, file_start: {file_start_timestamp}), skipping segment" - ) - continue - - relative_segments.append((start_rel, end_rel)) - - logger.info(f"🕐 Converting timestamps for {audio_uuid}: file_start={file_start_timestamp}") - logger.info(f"🕐 Absolute segments: {speech_segments}") - logger.info(f"🕐 Relative segments: {relative_segments}") - - # Validate that we have valid relative segments after conversion - if not relative_segments: - logger.warning( - f"No valid relative segments after timestamp conversion for {audio_uuid}" - ) - return False - - success = await _crop_audio_with_ffmpeg(original_path, relative_segments, output_path) - if success: - # Update database with 
cropped file info (keep original absolute timestamps for reference) - cropped_filename = output_path.split("/")[-1] - if chunk_repo is not None: - await chunk_repo.update_cropped_audio(audio_uuid, cropped_filename, speech_segments) - logger.info(f"Successfully processed cropped audio: {cropped_filename}") - return True - else: - logger.error(f"Failed to crop audio for {audio_uuid}") - return False - except Exception as e: - logger.error(f"Error in audio cropping task for {audio_uuid}: {e}", exc_info=True) - return False - - -async def _crop_audio_with_ffmpeg( - original_path: str, speech_segments: list[tuple[float, float]], output_path: str -) -> bool: - """Use ffmpeg to crop audio - runs as async subprocess, no GIL issues""" - logger.info(f"Cropping audio {original_path} with {len(speech_segments)} speech segments") - - if not speech_segments: - logger.warning(f"No speech segments to crop for {original_path}") - return False - - # Check if the original file exists - if not os.path.exists(original_path): - logger.error(f"Original audio file does not exist: {original_path}") - return False - - # Filter out segments that are too short - filtered_segments = [] - for start, end in speech_segments: - duration = end - start - if duration >= MIN_SPEECH_SEGMENT_DURATION: - # Add padding around speech segments - padded_start = max(0, start - CROPPING_CONTEXT_PADDING) - padded_end = end + CROPPING_CONTEXT_PADDING - filtered_segments.append((padded_start, padded_end)) - else: - logger.debug( - f"Skipping short segment: {start}-{end} ({duration:.2f}s < {MIN_SPEECH_SEGMENT_DURATION}s)" - ) - - if not filtered_segments: - logger.warning( - f"No segments meet minimum duration ({MIN_SPEECH_SEGMENT_DURATION}s) for {original_path}" - ) - return False - - logger.info( - f"Cropping audio {original_path} with {len(filtered_segments)} speech segments (filtered from {len(speech_segments)})" - ) - - try: - # Build ffmpeg filter for concatenating speech segments - filter_parts = [] - for 
i, (start, end) in enumerate(filtered_segments): - duration = end - start - filter_parts.append( - f"[0:a]atrim=start={start}:duration={duration},asetpts=PTS-STARTPTS[seg{i}]" - ) - - # Concatenate all segments - inputs = "".join(f"[seg{i}]" for i in range(len(filtered_segments))) - concat_filter = f"{inputs}concat=n={len(filtered_segments)}:v=0:a=1[out]" - - full_filter = ";".join(filter_parts + [concat_filter]) - - # Run ffmpeg as async subprocess - cmd = [ - "ffmpeg", - "-y", # -y = overwrite output - "-i", - original_path, - "-filter_complex", - full_filter, - "-map", - "[out]", - "-c:a", - "pcm_s16le", # Keep same format as original - output_path, - ] - - logger.info(f"Running ffmpeg command: {' '.join(cmd)}") - - process = await asyncio.create_subprocess_exec( - *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE - ) - - stdout, stderr = await process.communicate() - if stdout: - logger.debug(f"FFMPEG stdout: {stdout.decode()}") - - if process.returncode == 0: - # Calculate cropped duration - cropped_duration = sum(end - start for start, end in filtered_segments) - logger.info( - f"Successfully cropped {original_path} -> {output_path} ({cropped_duration:.1f}s from {len(filtered_segments)} segments)" - ) - return True - else: - error_msg = stderr.decode() if stderr else "Unknown ffmpeg error" - logger.error(f"ffmpeg failed for {original_path}: {error_msg}") - return False - - except Exception as e: - logger.error(f"Error running ffmpeg on {original_path}: {e}", exc_info=True) - return False diff --git a/backends/advanced/src/advanced_omi_backend/auth.py b/backends/advanced/src/advanced_omi_backend/auth.py index 8eefe9c9..a39637f1 100644 --- a/backends/advanced/src/advanced_omi_backend/auth.py +++ b/backends/advanced/src/advanced_omi_backend/auth.py @@ -82,7 +82,7 @@ async def get_user_manager(user_db=Depends(get_user_db)): # Transport configurations cookie_transport = CookieTransport( - cookie_max_age=3600, # 1 hour + cookie_max_age=86400, # 
24 hours (matches JWT lifetime) cookie_secure=COOKIE_SECURE, # Set to False in development if not using HTTPS cookie_httponly=True, cookie_samesite="lax", @@ -119,9 +119,38 @@ def get_jwt_strategy() -> JWTStrategy: # User dependencies for protecting endpoints current_active_user = fastapi_users.current_user(active=True) +current_active_user_optional = fastapi_users.current_user(active=True, optional=True) current_superuser = fastapi_users.current_user(active=True, superuser=True) +async def get_user_from_token_param(token: str) -> Optional[User]: + """ + Get user from JWT token string (for query parameter authentication). + + This is useful for endpoints that need to support token-based auth via query params, + such as HTML audio elements that can't set custom headers. + + Args: + token: JWT token string + + Returns: + User object if token is valid and user is active, None otherwise + """ + if not token: + return None + try: + strategy = get_jwt_strategy() + user_db_gen = get_user_db() + user_db = await user_db_gen.__anext__() + user_manager = UserManager(user_db) + user = await strategy.read_token(token, user_manager) + if user and user.is_active: + return user + except Exception: + pass + return None + + def get_accessible_user_ids(user: User) -> list[str] | None: """ Get list of user IDs that the current user can access data for. 
@@ -171,7 +200,7 @@ async def create_admin_user_if_needed(): ) except Exception as e: - logger.error(f"Failed to create admin user: {e}") + logger.error(f"Failed to create admin user: {e}", exc_info=True) async def websocket_auth(websocket, token: Optional[str] = None) -> Optional[User]: @@ -183,7 +212,7 @@ async def websocket_auth(websocket, token: Optional[str] = None) -> Optional[Use # Try JWT token from query parameter first if token: - logger.debug("Attempting WebSocket auth with query token.") + logger.info(f"Attempting WebSocket auth with query token (first 20 chars): {token[:20]}...") try: user_db_gen = get_user_db() user_db = await user_db_gen.__anext__() @@ -192,8 +221,10 @@ async def websocket_auth(websocket, token: Optional[str] = None) -> Optional[Use if user and user.is_active: logger.info(f"WebSocket auth successful for user {user.user_id} using query token.") return user + else: + logger.warning(f"Token validated but user inactive or not found: user={user}") except Exception as e: - logger.warning(f"WebSocket auth with query token failed: {e}") + logger.error(f"WebSocket auth with query token failed: {type(e).__name__}: {e}", exc_info=True) # Try cookie authentication logger.debug("Attempting WebSocket auth with cookie.") diff --git a/backends/advanced/src/advanced_omi_backend/chat_service.py b/backends/advanced/src/advanced_omi_backend/chat_service.py index 9b158679..812f8af0 100644 --- a/backends/advanced/src/advanced_omi_backend/chat_service.py +++ b/backends/advanced/src/advanced_omi_backend/chat_service.py @@ -28,7 +28,6 @@ logger = logging.getLogger(__name__) # Configuration from environment variables -CHAT_LLM_MODEL = os.getenv("CHAT_LLM_MODEL") or os.getenv("OPENAI_MODEL", "gpt-4o-mini") CHAT_TEMPERATURE = float(os.getenv("CHAT_TEMPERATURE", "0.7")) MAX_MEMORY_CONTEXT = 5 # Maximum number of memories to include in context MAX_CONVERSATION_HISTORY = 10 # Maximum conversation turns to keep in context @@ -383,7 +382,6 @@ async def 
generate_response_stream( # In the future, this should be replaced with actual streaming response_content = self.llm_client.generate( prompt=full_prompt, - model=CHAT_LLM_MODEL, temperature=CHAT_TEMPERATURE ) diff --git a/backends/advanced/src/advanced_omi_backend/client.py b/backends/advanced/src/advanced_omi_backend/client.py index 3c43a43a..be92716e 100644 --- a/backends/advanced/src/advanced_omi_backend/client.py +++ b/backends/advanced/src/advanced_omi_backend/client.py @@ -12,8 +12,6 @@ from pathlib import Path from typing import Dict, List, Optional, Tuple -from advanced_omi_backend.conversation_manager import get_conversation_manager -from advanced_omi_backend.database import AudioChunksRepository from advanced_omi_backend.task_manager import get_task_manager from wyoming.audio import AudioChunk @@ -30,14 +28,12 @@ class ClientState: def __init__( self, client_id: str, - ac_db_collection_helper: AudioChunksRepository, chunk_dir: Path, user_id: str, user_email: Optional[str] = None, ): self.client_id = client_id self.connected = True - self.db_helper = ac_db_collection_helper self.chunk_dir = chunk_dir # Store user data for memory processing @@ -133,33 +129,19 @@ async def close_current_conversation(self): audio_logger.info(f"🔒 No active conversation to close for client {self.client_id}") return - # Debug logging for memory processing investigation - audio_logger.info(f"🔍 ClientState close_current_conversation debug for {self.client_id}:") - audio_logger.info(f" - current_audio_uuid: {self.current_audio_uuid}") - audio_logger.info(f" - user_id: {self.user_id}") - audio_logger.info(f" - user_email: {self.user_email}") - audio_logger.info(f" - client_id: {self.client_id}") - - # Use ConversationManager for clean separation of concerns - conversation_manager = get_conversation_manager() - success = await conversation_manager.close_conversation( - client_id=self.client_id, - audio_uuid=self.current_audio_uuid, - user_id=self.user_id, - 
user_email=self.user_email, - conversation_start_time=self.conversation_start_time, - speech_segments=self.speech_segments, - chunk_dir=self.chunk_dir, - ) + # NOTE: ClientState is legacy V1 code. In V2 architecture, conversation closure + # is handled by the websocket controllers using RQ jobs directly. + # This method is kept minimal for backward compatibility. - if success: - # Clean up speech segments for this conversation - if self.current_audio_uuid in self.speech_segments: - del self.speech_segments[self.current_audio_uuid] - if self.current_audio_uuid in self.current_speech_start: - del self.current_speech_start[self.current_audio_uuid] - else: - audio_logger.warning(f"⚠️ Conversation closure had issues for {self.current_audio_uuid}") + audio_logger.info(f"🔒 Closing conversation for client {self.client_id}, audio_uuid: {self.current_audio_uuid}") + + # Clean up speech segments for this conversation + if self.current_audio_uuid in self.speech_segments: + del self.speech_segments[self.current_audio_uuid] + if self.current_audio_uuid in self.current_speech_start: + del self.current_speech_start[self.current_audio_uuid] + + audio_logger.info(f"✅ Cleaned up state for {self.current_audio_uuid}") async def start_new_conversation(self): """Start a new conversation by closing current and resetting state.""" @@ -187,10 +169,6 @@ async def disconnect(self): # Close current conversation await self.close_current_conversation() - # Cancel any tasks for this client - task_manager = get_task_manager() - await task_manager.cancel_tasks_for_client(self.client_id) - # Clean up state self.speech_segments.clear() self.current_speech_start.clear() diff --git a/backends/advanced/src/advanced_omi_backend/client_manager.py b/backends/advanced/src/advanced_omi_backend/client_manager.py index b48cd51c..5a3131b5 100644 --- a/backends/advanced/src/advanced_omi_backend/client_manager.py +++ b/backends/advanced/src/advanced_omi_backend/client_manager.py @@ -104,7 +104,7 @@ def 
get_client_count(self) -> int: """ return len(self._active_clients) - def create_client(self, client_id: str, ac_repository, chunk_dir, user_id: str, user_email: Optional[str] = None) -> "ClientState": + def create_client(self, client_id: str, chunk_dir, user_id: str, user_email: Optional[str] = None) -> "ClientState": """ Atomically create and register a new client. @@ -113,7 +113,6 @@ def create_client(self, client_id: str, ac_repository, chunk_dir, user_id: str, Args: client_id: Unique client identifier - ac_repository: Audio chunks repository chunk_dir: Directory for audio chunks user_id: User ID who owns this client user_email: Optional user email @@ -131,7 +130,7 @@ def create_client(self, client_id: str, ac_repository, chunk_dir, user_id: str, from advanced_omi_backend.client import ClientState # Create client state - client_state = ClientState(client_id, ac_repository, chunk_dir, user_id, user_email) + client_state = ClientState(client_id, chunk_dir, user_id, user_email) # Atomically add to internal storage and register mapping self._active_clients[client_id] = client_state diff --git a/backends/advanced/src/advanced_omi_backend/clients/__init__.py b/backends/advanced/src/advanced_omi_backend/clients/__init__.py new file mode 100644 index 00000000..099f3c45 --- /dev/null +++ b/backends/advanced/src/advanced_omi_backend/clients/__init__.py @@ -0,0 +1,11 @@ +"""Client implementations for Friend-Lite backend. 
+ +This module provides reusable client implementations that can be used for: +- Integration testing +- CLI tools +- External integrations +""" + +from advanced_omi_backend.clients.audio_stream_client import AudioStreamClient + +__all__ = ["AudioStreamClient"] diff --git a/backends/advanced/src/advanced_omi_backend/clients/audio_stream_client.py b/backends/advanced/src/advanced_omi_backend/clients/audio_stream_client.py new file mode 100644 index 00000000..af89fd51 --- /dev/null +++ b/backends/advanced/src/advanced_omi_backend/clients/audio_stream_client.py @@ -0,0 +1,556 @@ +"""WebSocket client for audio streaming using Wyoming protocol. + +This client mirrors the protocol implementation in websocket_controller.py +and can be used for integration testing and external integrations. + +Protocol flow: +1. Connect to WebSocket with token and device_name +2. Receive "ready" message from server (PCM endpoint only) +3. Send "audio-start" with format and mode +4. Send audio chunks (Wyoming protocol or raw binary) +5. Send "audio-stop" to finalize session + +Example usage (blocking): + ```python + import asyncio + from advanced_omi_backend.clients import AudioStreamClient + + async def main(): + client = AudioStreamClient("http://localhost:8000", "your-jwt-token") + await client.connect() + await client.stream_wav_file("/path/to/audio.wav") + await client.close() + + asyncio.run(main()) + ``` + +Example usage (non-blocking for testing): + ```python + from advanced_omi_backend.clients.audio_stream_client import StreamManager + + manager = StreamManager() + stream_id = manager.start_stream("http://localhost:8000", "token", "device") + manager.send_chunks_from_file(stream_id, "/path/to/audio.wav", num_chunks=10) + # ... do other things while stream is open ... 
+ manager.stop_stream(stream_id) + ``` +""" + +import asyncio +import json +import logging +import threading +import uuid +import wave +from pathlib import Path +from typing import Dict, Optional, Union + +import websockets +from websockets.client import WebSocketClientProtocol + +from advanced_omi_backend.constants import OMI_CHANNELS, OMI_SAMPLE_RATE, OMI_SAMPLE_WIDTH + +logger = logging.getLogger(__name__) + + +class AudioStreamClient: + """WebSocket client for streaming audio using Wyoming protocol. + + This client implements the same protocol as the server expects in + websocket_controller.py, ensuring consistency between client and server. + """ + + def __init__( + self, + base_url: str, + token: str, + device_name: str = "python-client", + endpoint: str = "ws_pcm", + ): + """Initialize the audio stream client. + + Args: + base_url: Base URL of the backend (e.g., "http://localhost:8000") + token: JWT authentication token + device_name: Device name for client identification + endpoint: WebSocket endpoint ("ws_pcm" or "ws_omi") + """ + self.base_url = base_url + self.token = token + self.device_name = device_name + self.endpoint = endpoint + self.ws: Optional[WebSocketClientProtocol] = None + self.chunk_count = 0 + self.total_bytes = 0 + + @property + def ws_url(self) -> str: + """Build WebSocket URL from base URL.""" + url = self.base_url.replace("http://", "ws://").replace("https://", "wss://") + return f"{url}/{self.endpoint}?token={self.token}&device_name={self.device_name}" + + async def connect(self, wait_for_ready: bool = True) -> WebSocketClientProtocol: + """Connect to the WebSocket endpoint. 
+ + Args: + wait_for_ready: If True, wait for "ready" message from server (PCM endpoint) + + Returns: + The WebSocket connection + + Raises: + RuntimeError: If connection fails or ready message not received + """ + logger.info(f"Connecting to {self.ws_url}") + self.ws = await websockets.connect(self.ws_url) + logger.info("WebSocket connected") + + if wait_for_ready and self.endpoint == "ws_pcm": + # PCM endpoint sends "ready" message after auth (line 261-268 in websocket_controller.py) + ready_msg = await self.ws.recv() + ready = json.loads(ready_msg.strip() if isinstance(ready_msg, str) else ready_msg.decode().strip()) + if ready.get("type") != "ready": + raise RuntimeError(f"Expected 'ready' message, got: {ready}") + logger.info("Received ready message from server") + + return self.ws + + async def send_audio_start( + self, + recording_mode: str = "streaming", + sample_rate: int = OMI_SAMPLE_RATE, + sample_width: int = OMI_SAMPLE_WIDTH, + channels: int = OMI_CHANNELS, + ) -> None: + """Send Wyoming audio-start event. + + Args: + recording_mode: "streaming" or "batch" + sample_rate: Audio sample rate in Hz (default: 16000) + sample_width: Bytes per sample (default: 2 for 16-bit) + channels: Number of audio channels (default: 1) + + Note: + The mode is inside the "data" dict, matching _handle_audio_session_start + in websocket_controller.py (line 618). + """ + if not self.ws: + raise RuntimeError("Not connected. 
Call connect() first.") + + header = { + "type": "audio-start", + "data": { + "rate": sample_rate, + "width": sample_width, + "channels": channels, + "mode": recording_mode, + }, + "payload_length": None, + } + await self.ws.send(json.dumps(header) + "\n") + logger.info(f"Sent audio-start with mode={recording_mode}") + + async def send_audio_chunk_wyoming( + self, + audio_data: bytes, + sample_rate: int = OMI_SAMPLE_RATE, + sample_width: int = OMI_SAMPLE_WIDTH, + channels: int = OMI_CHANNELS, + ) -> None: + """Send audio chunk using Wyoming protocol (JSON header + binary payload). + + This matches the handler at lines 979-1007 in websocket_controller.py. + + Args: + audio_data: Raw PCM audio bytes + sample_rate: Audio sample rate in Hz + sample_width: Bytes per sample + channels: Number of audio channels + """ + if not self.ws: + raise RuntimeError("Not connected. Call connect() first.") + + header = { + "type": "audio-chunk", + "payload_length": len(audio_data), + "data": { + "rate": sample_rate, + "width": sample_width, + "channels": channels, + }, + } + # Send JSON header followed by binary payload + await self.ws.send(json.dumps(header) + "\n") + await self.ws.send(audio_data) + + self.chunk_count += 1 + self.total_bytes += len(audio_data) + + if self.chunk_count <= 3 or self.chunk_count % 100 == 0: + logger.debug(f"Sent audio chunk #{self.chunk_count}: {len(audio_data)} bytes") + + async def send_audio_chunk_raw(self, audio_data: bytes) -> None: + """Send raw binary audio without Wyoming header (legacy mode). + + This matches the handler at lines 1016-1035 in websocket_controller.py. + + Args: + audio_data: Raw PCM audio bytes + """ + if not self.ws: + raise RuntimeError("Not connected. 
Call connect() first.") + + await self.ws.send(audio_data) + + self.chunk_count += 1 + self.total_bytes += len(audio_data) + + async def send_audio_stop(self) -> None: + """Send Wyoming audio-stop event to finalize the session.""" + if not self.ws: + raise RuntimeError("Not connected. Call connect() first.") + + header = {"type": "audio-stop"} + await self.ws.send(json.dumps(header) + "\n") + logger.info(f"Sent audio-stop (total: {self.chunk_count} chunks, {self.total_bytes} bytes)") + + async def send_ping(self) -> None: + """Send keepalive ping.""" + if not self.ws: + raise RuntimeError("Not connected. Call connect() first.") + + header = {"type": "ping", "payload_length": None} + await self.ws.send(json.dumps(header) + "\n") + logger.debug("Sent ping") + + async def stream_wav_file( + self, + wav_path: Union[str, Path], + chunk_duration_ms: int = 100, + use_wyoming: bool = True, + recording_mode: str = "streaming", + realtime_factor: float = 0.1, + ) -> int: + """Stream a WAV file in chunks, simulating real-time audio. 
+ + Args: + wav_path: Path to the WAV file + chunk_duration_ms: Duration of each chunk in milliseconds + use_wyoming: If True, use Wyoming protocol; if False, send raw binary + recording_mode: "streaming" or "batch" + realtime_factor: Fraction of real-time to simulate (0.1 = 10x speed) + + Returns: + Number of chunks sent + """ + wav_path = Path(wav_path) + if not wav_path.exists(): + raise FileNotFoundError(f"WAV file not found: {wav_path}") + + with wave.open(str(wav_path), "rb") as wav: + sample_rate = wav.getframerate() + channels = wav.getnchannels() + sample_width = wav.getsampwidth() + + logger.info( + f"Streaming {wav_path.name}: {sample_rate}Hz, {channels}ch, {sample_width * 8}-bit" + ) + + # Calculate chunk size + bytes_per_sample = sample_width * channels + samples_per_chunk = int(sample_rate * chunk_duration_ms / 1000) + + # Send audio-start + await self.send_audio_start( + recording_mode=recording_mode, + sample_rate=sample_rate, + sample_width=sample_width, + channels=channels, + ) + + # Reset counters + self.chunk_count = 0 + self.total_bytes = 0 + + # Stream chunks + while True: + chunk = wav.readframes(samples_per_chunk) + if not chunk: + break + + if use_wyoming: + await self.send_audio_chunk_wyoming( + chunk, + sample_rate=sample_rate, + sample_width=sample_width, + channels=channels, + ) + else: + await self.send_audio_chunk_raw(chunk) + + # Simulate real-time delay + if realtime_factor > 0: + await asyncio.sleep(chunk_duration_ms / 1000 * realtime_factor) + + # Send audio-stop + await self.send_audio_stop() + + logger.info(f"Finished streaming: {self.chunk_count} chunks, {self.total_bytes} bytes") + return self.chunk_count + + async def close(self) -> None: + """Close the WebSocket connection.""" + if self.ws: + await self.ws.close() + self.ws = None + logger.info("WebSocket connection closed") + + async def __aenter__(self) -> "AudioStreamClient": + """Async context manager entry.""" + await self.connect() + return self + + async def 
__aexit__(self, exc_type, exc_val, exc_tb) -> None: + """Async context manager exit.""" + await self.close() + + +# Synchronous wrapper for Robot Framework and other sync contexts +def stream_audio_file( + base_url: str, + token: str, + wav_path: str, + device_name: str = "robot-test", + recording_mode: str = "streaming", + use_wyoming: bool = True, +) -> int: + """Synchronous wrapper for streaming audio file. + + This function is designed for use with Robot Framework or other + synchronous test frameworks. + + Args: + base_url: Base URL of the backend + token: JWT authentication token + wav_path: Path to WAV file + device_name: Device name for client identification + recording_mode: "streaming" or "batch" + use_wyoming: If True, use Wyoming protocol + + Returns: + Number of chunks sent + """ + + async def _run() -> int: + async with AudioStreamClient(base_url, token, device_name) as client: + return await client.stream_wav_file( + wav_path, + use_wyoming=use_wyoming, + recording_mode=recording_mode, + ) + + return asyncio.run(_run()) + + +class StreamSession: + """Holds state for an active streaming session.""" + + def __init__( + self, + stream_id: str, + client: AudioStreamClient, + loop: asyncio.AbstractEventLoop, + thread: threading.Thread, + ): + self.stream_id = stream_id + self.client = client + self.loop = loop + self.thread = thread + self.connected = False + self.audio_started = False + self.chunk_count = 0 + self.error: Optional[str] = None + + +class StreamManager: + """Manages multiple non-blocking audio streams for testing. + + This allows tests to start a stream, perform checks while streaming, + and then stop the stream - mimicking real client behavior. + + Example: + manager = StreamManager() + stream_id = manager.start_stream(base_url, token, "test-device") + manager.send_chunks_from_file(stream_id, "audio.wav", num_chunks=10) + # ... check jobs, verify state ... 
+ manager.stop_stream(stream_id) + """ + + def __init__(self): + self._sessions: Dict[str, StreamSession] = {} + + def start_stream( + self, + base_url: str, + token: str, + device_name: str = "robot-test", + recording_mode: str = "streaming", + ) -> str: + """Start a new audio stream (non-blocking). + + Args: + base_url: Backend URL + token: JWT token + device_name: Device name for client ID + recording_mode: "streaming" or "batch" + + Returns: + stream_id: Unique ID for this stream session + """ + stream_id = str(uuid.uuid4())[:8] + + # Create event loop for this stream's thread + loop = asyncio.new_event_loop() + + def run_loop(): + asyncio.set_event_loop(loop) + loop.run_forever() + + thread = threading.Thread(target=run_loop, daemon=True) + thread.start() + + # Create client + client = AudioStreamClient(base_url, token, device_name) + + session = StreamSession(stream_id, client, loop, thread) + self._sessions[stream_id] = session + + # Connect and send audio-start + async def _connect_and_start(): + try: + await client.connect() + session.connected = True + await client.send_audio_start(recording_mode=recording_mode) + session.audio_started = True + logger.info(f"Stream {stream_id} started for {device_name}") + except Exception as e: + session.error = str(e) + logger.error(f"Stream {stream_id} failed to start: {e}") + + future = asyncio.run_coroutine_threadsafe(_connect_and_start(), loop) + future.result(timeout=10) # Wait for connection + + if session.error: + raise RuntimeError(f"Failed to start stream: {session.error}") + + return stream_id + + def send_chunks_from_file( + self, + stream_id: str, + wav_path: str, + num_chunks: Optional[int] = None, + chunk_duration_ms: int = 100, + realtime_pacing: bool = False, + ) -> int: + """Send audio chunks from a WAV file. 
+ + Args: + stream_id: Stream session ID + wav_path: Path to WAV file + num_chunks: Number of chunks to send (None = all) + chunk_duration_ms: Duration per chunk in ms + realtime_pacing: If True, sleep between chunks to simulate real-time streaming + + Returns: + Number of chunks sent + """ + session = self._sessions.get(stream_id) + if not session: + raise ValueError(f"Unknown stream_id: {stream_id}") + + if not session.audio_started: + raise RuntimeError("Stream not started") + + wav_path = Path(wav_path) + if not wav_path.exists(): + raise FileNotFoundError(f"WAV file not found: {wav_path}") + + async def _send_chunks() -> int: + with wave.open(str(wav_path), "rb") as wav: + sample_rate = wav.getframerate() + channels = wav.getnchannels() + sample_width = wav.getsampwidth() + + samples_per_chunk = int(sample_rate * chunk_duration_ms / 1000) + chunks_sent = 0 + chunk_duration_seconds = chunk_duration_ms / 1000.0 + + while True: + if num_chunks is not None and chunks_sent >= num_chunks: + break + + chunk = wav.readframes(samples_per_chunk) + if not chunk: + break + + await session.client.send_audio_chunk_wyoming( + chunk, + sample_rate=sample_rate, + sample_width=sample_width, + channels=channels, + ) + chunks_sent += 1 + session.chunk_count += 1 + + # Optional: Sleep to maintain real-time pacing + if realtime_pacing: + await asyncio.sleep(chunk_duration_seconds) + + return chunks_sent + + future = asyncio.run_coroutine_threadsafe(_send_chunks(), session.loop) + return future.result(timeout=60) + + def stop_stream(self, stream_id: str) -> int: + """Stop a stream and close the connection. 
+ + Args: + stream_id: Stream session ID + + Returns: + Total chunks sent during this session + """ + session = self._sessions.get(stream_id) + if not session: + raise ValueError(f"Unknown stream_id: {stream_id}") + + async def _stop(): + if session.audio_started: + await session.client.send_audio_stop() + await session.client.close() + + future = asyncio.run_coroutine_threadsafe(_stop(), session.loop) + future.result(timeout=10) + + # Stop the event loop + session.loop.call_soon_threadsafe(session.loop.stop) + session.thread.join(timeout=5) + + total_chunks = session.chunk_count + del self._sessions[stream_id] + + logger.info(f"Stream {stream_id} stopped, sent {total_chunks} chunks") + return total_chunks + + def get_session(self, stream_id: str) -> Optional[StreamSession]: + """Get session info for a stream.""" + return self._sessions.get(stream_id) + + def cleanup_all(self): + """Stop all active streams.""" + for stream_id in list(self._sessions.keys()): + try: + self.stop_stream(stream_id) + except Exception as e: + logger.warning(f"Error stopping stream {stream_id}: {e}") diff --git a/backends/advanced/src/advanced_omi_backend/config.py b/backends/advanced/src/advanced_omi_backend/config.py index 3d738d7a..ceebcad0 100644 --- a/backends/advanced/src/advanced_omi_backend/config.py +++ b/backends/advanced/src/advanced_omi_backend/config.py @@ -13,6 +13,10 @@ logger = logging.getLogger(__name__) +# Data directory paths +DATA_DIR = Path(os.getenv("DATA_DIR", "/app/data")) +CHUNK_DIR = Path("./audio_chunks") # Mounted to ./data/audio_chunks by Docker + # Default diarization settings DEFAULT_DIARIZATION_SETTINGS = { "diarization_source": "pyannote", @@ -37,6 +41,12 @@ "speech_inactivity_threshold": 60, # Speech gap threshold for closure (1 minute) } +# Default audio storage settings +DEFAULT_AUDIO_STORAGE_SETTINGS = { + "audio_base_path": "/app/data", # Main audio directory (where volume is mounted) + "audio_chunks_path": "/app/audio_chunks", # Full path to audio 
chunks subfolder +} + # Global cache for diarization settings _diarization_settings = None @@ -140,5 +150,18 @@ def get_conversation_stop_settings(): } +def get_audio_storage_settings(): + """Get audio storage settings from environment or defaults.""" + + # Get base path and derive chunks path + audio_base_path = os.getenv("AUDIO_BASE_PATH", DEFAULT_AUDIO_STORAGE_SETTINGS["audio_base_path"]) + audio_chunks_path = os.getenv("AUDIO_CHUNKS_PATH", f"{audio_base_path}/audio_chunks") + + return { + "audio_base_path": audio_base_path, + "audio_chunks_path": audio_chunks_path, + } + + # Initialize settings on module load _diarization_settings = load_diarization_settings_from_file() \ No newline at end of file diff --git a/backends/advanced/src/advanced_omi_backend/controllers/audio_controller.py b/backends/advanced/src/advanced_omi_backend/controllers/audio_controller.py new file mode 100644 index 00000000..da884eb6 --- /dev/null +++ b/backends/advanced/src/advanced_omi_backend/controllers/audio_controller.py @@ -0,0 +1,290 @@ +""" +Audio file upload and processing controller. + +Handles audio file uploads and processes them directly. +Simplified to write files immediately and enqueue transcription. + +Also includes audio cropping operations that work with the Conversation model. 
+""" + +import logging +import time +import uuid +from pathlib import Path + +from fastapi import UploadFile +from fastapi.responses import JSONResponse + +from advanced_omi_backend.utils.audio_utils import ( + AudioValidationError, + write_audio_file, +) +from advanced_omi_backend.models.job import JobPriority +from advanced_omi_backend.models.user import User +from advanced_omi_backend.models.conversation import create_conversation +from advanced_omi_backend.models.conversation import Conversation + +logger = logging.getLogger(__name__) +audio_logger = logging.getLogger("audio_processing") + + +def generate_client_id(user: User, device_name: str) -> str: + """Generate client ID for uploaded files.""" + user_id_suffix = str(user.id)[-6:] + return f"{user_id_suffix}-{device_name}" + + +async def upload_and_process_audio_files( + user: User, + files: list[UploadFile], + device_name: str = "upload", + auto_generate_client: bool = True, + folder: str = None, +) -> dict: + """ + Upload audio files and process them directly. + + Simplified flow: + 1. Validate and read WAV file + 2. Write audio file and create AudioSession immediately + 3. 
Enqueue transcription job (same as WebSocket path) + + Args: + user: Authenticated user + files: List of uploaded audio files + device_name: Device identifier + auto_generate_client: Whether to auto-generate client ID + folder: Optional subfolder for audio storage (e.g., 'fixtures') + """ + try: + if not files: + return JSONResponse(status_code=400, content={"error": "No files provided"}) + + processed_files = [] + client_id = generate_client_id(user, device_name) + + for file_index, file in enumerate(files): + try: + # Validate file type (only WAV for now) + if not file.filename or not file.filename.lower().endswith(".wav"): + processed_files.append({ + "filename": file.filename or "unknown", + "status": "error", + "error": "Only WAV files are currently supported", + }) + continue + + audio_logger.info( + f"📁 Uploading file {file_index + 1}/{len(files)}: {file.filename}" + ) + + # Read file content + content = await file.read() + + # Generate audio UUID and timestamp + audio_uuid = str(uuid.uuid4()) + timestamp = int(time.time() * 1000) + + # Determine output directory (with optional subfolder) + from advanced_omi_backend.config import CHUNK_DIR + if folder: + chunk_dir = CHUNK_DIR / folder + chunk_dir.mkdir(parents=True, exist_ok=True) + else: + chunk_dir = CHUNK_DIR + + # Validate, write audio file and create AudioSession (all in one) + try: + relative_audio_path, file_path, duration = await write_audio_file( + raw_audio_data=content, + audio_uuid=audio_uuid, + client_id=client_id, + user_id=user.user_id, + user_email=user.email, + timestamp=timestamp, + chunk_dir=chunk_dir, + validate=True # Validate WAV format, convert stereo→mono + ) + except AudioValidationError as e: + processed_files.append({ + "filename": file.filename, + "status": "error", + "error": str(e), + }) + continue + + audio_logger.info( + f"📊 {file.filename}: {duration:.1f}s → {relative_audio_path}" + ) + + # Create conversation immediately for uploaded files (conversation_id auto-generated) + 
version_id = str(uuid.uuid4()) + + # Generate title from filename + title = file.filename.rsplit('.', 1)[0][:50] if file.filename else "Uploaded Audio" + + conversation = create_conversation( + audio_uuid=audio_uuid, + user_id=user.user_id, + client_id=client_id, + title=title, + summary="Processing uploaded audio file..." + ) + # Use the relative path returned by write_audio_file (already includes folder prefix if applicable) + conversation.audio_path = relative_audio_path + await conversation.insert() + conversation_id = conversation.conversation_id # Get the auto-generated ID + + audio_logger.info(f"📝 Created conversation {conversation_id} for uploaded file") + + # Enqueue post-conversation processing job chain + from advanced_omi_backend.controllers.queue_controller import start_post_conversation_jobs + + job_ids = start_post_conversation_jobs( + conversation_id=conversation_id, + audio_uuid=audio_uuid, + audio_file_path=file_path, + user_id=user.user_id, + post_transcription=True, # Run batch transcription for uploads + client_id=client_id # Pass client_id for UI tracking + ) + + processed_files.append({ + "filename": file.filename, + "status": "processing", + "audio_uuid": audio_uuid, + "conversation_id": conversation_id, + "transcript_job_id": job_ids['transcription'], + "speaker_job_id": job_ids['speaker_recognition'], + "memory_job_id": job_ids['memory'], + "duration_seconds": round(duration, 2), + }) + + audio_logger.info( + f"✅ Processed {file.filename} → conversation {conversation_id}, " + f"jobs: {job_ids['transcription']} → {job_ids['speaker_recognition']} → {job_ids['memory']}" + ) + + except (OSError, IOError) as e: + # File I/O errors during audio processing + audio_logger.exception(f"File I/O error processing {file.filename}") + processed_files.append({ + "filename": file.filename or "unknown", + "status": "error", + "error": str(e), + }) + except Exception as e: + # Unexpected errors during file processing + audio_logger.exception(f"Unexpected 
error processing file {file.filename}") + processed_files.append({ + "filename": file.filename or "unknown", + "status": "error", + "error": str(e), + }) + + successful_files = [f for f in processed_files if f.get("status") == "processing"] + failed_files = [f for f in processed_files if f.get("status") == "error"] + + return { + "message": f"Uploaded and processing {len(successful_files)} file(s)", + "client_id": client_id, + "files": processed_files, + "summary": { + "total": len(files), + "processing": len(successful_files), + "failed": len(failed_files), + }, + } + + except (OSError, IOError) as e: + # File system errors during upload handling + audio_logger.exception("File I/O error in upload_and_process_audio_files") + return JSONResponse( + status_code=500, content={"error": f"File upload failed: {str(e)}"} + ) + except Exception as e: + # Unexpected errors in upload handler + audio_logger.exception("Unexpected error in upload_and_process_audio_files") + return JSONResponse( + status_code=500, content={"error": f"File upload failed: {str(e)}"} + ) + + +async def get_conversation_audio_path(conversation_id: str, user: User, cropped: bool = False) -> Path: + """ + Get the file path for a conversation's audio file. 
+ + Args: + conversation_id: The conversation ID + user: The authenticated user + cropped: If True, return cropped audio path; if False, return original audio path + + Returns: + Path object for the audio file + + Raises: + ValueError: If conversation not found, access denied, or audio file not available + """ + # Get conversation by conversation_id (UUID field, not _id) + conversation = await Conversation.find_one(Conversation.conversation_id == conversation_id) + + if not conversation: + raise ValueError("Conversation not found") + + # Check ownership (admins can access all files) + if not user.is_superuser and conversation.user_id != str(user.user_id): + raise ValueError("Access denied") + + # Get the appropriate audio path + audio_path = conversation.cropped_audio_path if cropped else conversation.audio_path + + if not audio_path: + audio_type = "cropped" if cropped else "original" + raise ValueError(f"No {audio_type} audio file available for this conversation") + + # Build full file path + from advanced_omi_backend.app_config import get_audio_chunk_dir + audio_dir = get_audio_chunk_dir() + file_path = audio_dir / audio_path + + # Check if file exists + if not file_path.exists() or not file_path.is_file(): + raise ValueError("Audio file not found on disk") + + return file_path + + +async def get_cropped_audio_info(audio_uuid: str, user: User): + """ + Get audio cropping metadata from the conversation. + + This is an audio service operation that retrieves cropping-related metadata + such as speech segments, cropped audio path, and cropping timestamps. + + Used for: Checking cropping status and retrieving audio processing details. + Works with: Conversation model. 
+ """ + try: + # Find the conversation + conversation = await Conversation.find_one(Conversation.audio_uuid == audio_uuid) + if not conversation: + return JSONResponse(status_code=404, content={"error": "Conversation not found"}) + + # Check ownership for non-admin users + if not user.is_superuser: + if conversation.user_id != str(user.user_id): + return JSONResponse(status_code=404, content={"error": "Conversation not found"}) + + return { + "audio_uuid": audio_uuid, + "cropped_audio_path": conversation.cropped_audio_path, + "speech_segments": conversation.speech_segments if hasattr(conversation, 'speech_segments') else [], + "cropped_duration": conversation.cropped_duration if hasattr(conversation, 'cropped_duration') else None, + "cropped_at": conversation.cropped_at if hasattr(conversation, 'cropped_at') else None, + "original_audio_path": conversation.audio_path, + } + + except Exception as e: + # Database or unexpected errors when fetching audio metadata + audio_logger.exception("Error fetching cropped audio info") + return JSONResponse(status_code=500, content={"error": "Error fetching cropped audio info"}) diff --git a/backends/advanced/src/advanced_omi_backend/controllers/conversation_controller.py b/backends/advanced/src/advanced_omi_backend/controllers/conversation_controller.py index e53eef88..b9533391 100644 --- a/backends/advanced/src/advanced_omi_backend/controllers/conversation_controller.py +++ b/backends/advanced/src/advanced_omi_backend/controllers/conversation_controller.py @@ -2,32 +2,26 @@ Conversation controller for handling conversation-related business logic. 
""" -import asyncio -import hashlib import logging import time from pathlib import Path from typing import Optional -from advanced_omi_backend.audio_utils import ( - _process_audio_cropping_with_relative_timestamps, -) from advanced_omi_backend.client_manager import ( ClientManager, client_belongs_to_user, - get_user_clients_all, ) -from advanced_omi_backend.database import AudioChunksRepository, ProcessingRunsRepository, chunks_col, processing_runs_col, conversations_col, ConversationsRepository +from advanced_omi_backend.models.audio_file import AudioFile +from advanced_omi_backend.models.conversation import Conversation from advanced_omi_backend.users import User from fastapi.responses import JSONResponse logger = logging.getLogger(__name__) audio_logger = logging.getLogger("audio_processing") -# Initialize repositories -chunk_repo = AudioChunksRepository(chunks_col) -processing_runs_repo = ProcessingRunsRepository(processing_runs_col) -conversations_repo = ConversationsRepository(conversations_col) +# Legacy audio_chunks collection is still used by some endpoints (speaker assignment, segment updates) +# But conversation queries now use the Conversation model directly +# Audio cropping operations are handled in audio_controller.py async def close_current_conversation(client_id: str, user: User, client_manager: ClientManager): @@ -90,398 +84,147 @@ async def close_current_conversation(client_id: str, user: User, client_manager: ) -async def get_conversations(user: User): - """Get conversations with speech only (speech-driven architecture).""" - try: - # Import conversations collection and repository - conversations_repo = ConversationsRepository(conversations_col) - - # Build query based on user permissions - if not user.is_superuser: - # Regular users can only see their own conversations - user_conversations = await conversations_repo.get_user_conversations(str(user.user_id)) - else: - # Admins see all conversations - cursor = 
conversations_col.find({}).sort("created_at", -1) - user_conversations = await cursor.to_list(length=None) - - # Group conversations by client_id for backwards compatibility - conversations = {} - for conversation in user_conversations: - client_id = conversation["client_id"] - if client_id not in conversations: - conversations[client_id] = [] - - # Get audio file paths from audio_chunks collection - audio_chunk = await chunk_repo.get_chunk_by_audio_uuid(conversation["audio_uuid"]) - audio_path = audio_chunk.get("audio_path") if audio_chunk else None - cropped_audio_path = audio_chunk.get("cropped_audio_path") if audio_chunk else None - - # Convert conversation to API format - conversations[client_id].append( - { - "conversation_id": conversation["conversation_id"], - "audio_uuid": conversation["audio_uuid"], - "title": conversation.get("title", "Conversation"), - "summary": conversation.get("summary", ""), - "timestamp": conversation.get("session_start").timestamp() if conversation.get("session_start") else 0, - "created_at": conversation.get("created_at").isoformat() if conversation.get("created_at") else None, - "transcript": conversation.get("transcript", []), - "speakers_identified": conversation.get("speakers_identified", []), - "speaker_names": conversation.get("speaker_names", {}), - "duration_seconds": conversation.get("duration_seconds", 0), - "memories": conversation.get("memories", []), - "has_memory": bool(conversation.get("memories", [])), - "memory_processing_status": conversation.get("memory_processing_status", "pending"), - "action_items": conversation.get("action_items", []), - # Audio file paths for playback - "audio_path": audio_path, - "cropped_audio_path": cropped_audio_path, - } - ) - - return {"conversations": conversations} - - except Exception as e: - logger.error(f"Error fetching conversations: {e}") - return JSONResponse(status_code=500, content={"error": "Error fetching conversations"}) - - -async def 
get_conversation_by_id(conversation_id: str, user: User): - """Get a specific conversation by conversation_id (speech-driven architecture).""" +async def get_conversation(conversation_id: str, user: User): + """Get a single conversation with full transcript details.""" try: - # Import conversations collection and repository - conversations_repo = ConversationsRepository(conversations_col) - - # Get the conversation - conversation = await conversations_repo.get_conversation(conversation_id) + # Find the conversation using Beanie + conversation = await Conversation.find_one(Conversation.conversation_id == conversation_id) if not conversation: - return JSONResponse( - status_code=404, - content={"error": "Conversation not found"} - ) - - # Check if user owns this conversation - if not user.is_superuser and conversation["user_id"] != str(user.user_id): - return JSONResponse( - status_code=403, - content={"error": "Access forbidden. You can only access your own conversations."} - ) - - # Get audio file paths from audio_chunks collection - audio_chunk = await chunk_repo.get_chunk_by_audio_uuid(conversation["audio_uuid"]) - audio_path = audio_chunk.get("audio_path") if audio_chunk else None - cropped_audio_path = audio_chunk.get("cropped_audio_path") if audio_chunk else None - - # Format conversation for API response - formatted_conversation = { - "conversation_id": conversation["conversation_id"], - "audio_uuid": conversation["audio_uuid"], - "title": conversation.get("title", "Conversation"), - "summary": conversation.get("summary", ""), - "timestamp": conversation.get("session_start").timestamp() if conversation.get("session_start") else 0, - "created_at": conversation.get("created_at").isoformat() if conversation.get("created_at") else None, - "transcript": conversation.get("transcript", []), - "speakers_identified": conversation.get("speakers_identified", []), - "speaker_names": conversation.get("speaker_names", {}), - "duration_seconds": 
conversation.get("duration_seconds", 0), - "memories": conversation.get("memories", []), - "has_memory": bool(conversation.get("memories", [])), - "memory_processing_status": conversation.get("memory_processing_status", "pending"), - "action_items": conversation.get("action_items", []), - # Audio file paths for playback - "audio_path": audio_path, - "cropped_audio_path": cropped_audio_path, - } - - return {"conversation": formatted_conversation} - - except Exception as e: - logger.error(f"Error fetching conversation {conversation_id}: {e}") - return JSONResponse(status_code=500, content={"error": "Error fetching conversation"}) - - -async def get_cropped_audio_info(audio_uuid: str, user: User): - """Get cropped audio information for a conversation. Users can only access their own conversations.""" - try: - # Find the conversation - chunk = await chunks_col.find_one({"audio_uuid": audio_uuid}) - if not chunk: return JSONResponse(status_code=404, content={"error": "Conversation not found"}) # Check ownership for non-admin users - if not user.is_superuser: - if not client_belongs_to_user(chunk["client_id"], user.user_id): - return JSONResponse(status_code=404, content={"error": "Conversation not found"}) - - return { - "audio_uuid": audio_uuid, - "cropped_audio_path": chunk.get("cropped_audio_path"), - "speech_segments": chunk.get("speech_segments", []), - "cropped_duration": chunk.get("cropped_duration"), - "cropped_at": chunk.get("cropped_at"), - "original_audio_path": chunk.get("audio_path"), + if not user.is_superuser and conversation.user_id != str(user.user_id): + return JSONResponse(status_code=403, content={"error": "Access forbidden"}) + + # Build response with explicit curated fields + response = { + "conversation_id": conversation.conversation_id, + "audio_uuid": conversation.audio_uuid, + "user_id": conversation.user_id, + "client_id": conversation.client_id, + "audio_path": conversation.audio_path, + "cropped_audio_path": conversation.cropped_audio_path, 
+ "created_at": conversation.created_at.isoformat() if conversation.created_at else None, + "deleted": conversation.deleted, + "deletion_reason": conversation.deletion_reason, + "deleted_at": conversation.deleted_at.isoformat() if conversation.deleted_at else None, + "end_reason": conversation.end_reason.value if conversation.end_reason else None, + "completed_at": conversation.completed_at.isoformat() if conversation.completed_at else None, + "title": conversation.title, + "summary": conversation.summary, + "detailed_summary": conversation.detailed_summary, + # Computed fields + "transcript": conversation.transcript, + "segments": [s.model_dump() for s in conversation.segments], + "segment_count": conversation.segment_count, + "memory_count": conversation.memory_count, + "has_memory": conversation.has_memory, + "active_transcript_version": conversation.active_transcript_version, + "active_memory_version": conversation.active_memory_version, + "transcript_version_count": conversation.transcript_version_count, + "memory_version_count": conversation.memory_version_count, } - except Exception as e: - logger.error(f"Error fetching cropped audio info: {e}") - return JSONResponse(status_code=500, content={"error": "Error fetching cropped audio info"}) - - -async def reprocess_audio_cropping(audio_uuid: str, user: User): - """Reprocess audio cropping for a conversation. 
Users can only reprocess their own conversations.""" - try: - # Find the conversation - chunk = await chunks_col.find_one({"audio_uuid": audio_uuid}) - if not chunk: - return JSONResponse(status_code=404, content={"error": "Conversation not found"}) - - # Check ownership for non-admin users - if not user.is_superuser: - if not client_belongs_to_user(chunk["client_id"], user.user_id): - return JSONResponse(status_code=404, content={"error": "Conversation not found"}) - - audio_path = chunk.get("audio_path") - if not audio_path: - return JSONResponse( - status_code=400, content={"error": "No audio file found for this conversation"} - ) - - # Check if file exists - try multiple possible locations - possible_paths = [ - Path("/app/data/audio_chunks") / audio_path, - Path("/app/audio_chunks") / audio_path, - Path(audio_path), # fallback to relative path - ] - - full_audio_path = None - for path in possible_paths: - if path.exists(): - full_audio_path = path - break - - if not full_audio_path: - return JSONResponse( - status_code=422, - content={ - "error": "Audio file not found on disk", - "details": f"Conversation exists but audio file '{audio_path}' is missing from expected locations", - "searched_paths": [str(p) for p in possible_paths] - } - ) - - # Get speech segments from the chunk - speech_segments = chunk.get("speech_segments", []) - if not speech_segments: - return JSONResponse( - status_code=400, - content={"error": "No speech segments found for this conversation"} - ) - - # Generate output path for cropped audio - cropped_filename = f"cropped_{audio_uuid}.wav" - output_path = Path("/app/data/audio_chunks") / cropped_filename - - # Get repository for database updates - chunk_repo = AudioChunksRepository(chunks_col) - - # Reprocess the audio cropping - try: - result = await _process_audio_cropping_with_relative_timestamps( - str(full_audio_path), - speech_segments, - str(output_path), - audio_uuid, - chunk_repo - ) - - if result: - 
audio_logger.info(f"Successfully reprocessed audio cropping for {audio_uuid}") - return JSONResponse( - content={"message": f"Audio cropping reprocessed for {audio_uuid}"} - ) - else: - audio_logger.error(f"Failed to reprocess audio cropping for {audio_uuid}") - return JSONResponse( - status_code=500, content={"error": "Failed to reprocess audio cropping"} - ) - - except Exception as processing_error: - audio_logger.error(f"Error during audio cropping reprocessing: {processing_error}") - return JSONResponse( - status_code=500, - content={"error": f"Audio processing failed: {str(processing_error)}"}, - ) + return {"conversation": response} except Exception as e: - logger.error(f"Error reprocessing audio cropping: {e}") - return JSONResponse(status_code=500, content={"error": "Error reprocessing audio cropping"}) + logger.error(f"Error fetching conversation {conversation_id}: {e}") + return JSONResponse(status_code=500, content={"error": "Error fetching conversation"}) -async def add_speaker_to_conversation(audio_uuid: str, speaker_id: str, user: User): - """Add a speaker to the speakers_identified list for a conversation. 
Users can only modify their own conversations.""" +async def get_conversations(user: User): + """Get conversations with speech only (speech-driven architecture).""" try: - # Find the conversation first - chunk = await chunks_col.find_one({"audio_uuid": audio_uuid}) - if not chunk: - return JSONResponse(status_code=404, content={"error": "Conversation not found"}) - - # Check ownership for non-admin users + # Build query based on user permissions using Beanie if not user.is_superuser: - if not client_belongs_to_user(chunk["client_id"], user.user_id): - return JSONResponse(status_code=404, content={"error": "Conversation not found"}) - - # Update the speakers_identified list - speakers = chunk.get("speakers_identified", []) - if speaker_id not in speakers: - speakers.append(speaker_id) - await chunks_col.update_one( - {"audio_uuid": audio_uuid}, {"$set": {"speakers_identified": speakers}} - ) + # Regular users can only see their own conversations + user_conversations = await Conversation.find( + Conversation.user_id == str(user.user_id) + ).sort(-Conversation.created_at).to_list() + else: + # Admins see all conversations + user_conversations = await Conversation.find_all().sort(-Conversation.created_at).to_list() + + # Build response with explicit curated fields - minimal for list view + conversations = [] + for conv in user_conversations: + conversations.append({ + "conversation_id": conv.conversation_id, + "audio_uuid": conv.audio_uuid, + "user_id": conv.user_id, + "client_id": conv.client_id, + "audio_path": conv.audio_path, + "cropped_audio_path": conv.cropped_audio_path, + "created_at": conv.created_at.isoformat() if conv.created_at else None, + "deleted": conv.deleted, + "deletion_reason": conv.deletion_reason, + "deleted_at": conv.deleted_at.isoformat() if conv.deleted_at else None, + "title": conv.title, + "summary": conv.summary, + "detailed_summary": conv.detailed_summary, + "active_transcript_version": conv.active_transcript_version, + 
"active_memory_version": conv.active_memory_version, + # Computed fields (counts only, no heavy data) + "segment_count": conv.segment_count, + "has_memory": conv.has_memory, + "memory_count": conv.memory_count, + "transcript_version_count": conv.transcript_version_count, + "memory_version_count": conv.memory_version_count, + }) - return { - "message": f"Speaker {speaker_id} added to conversation", - "speakers_identified": speakers, - } + return {"conversations": conversations} except Exception as e: - logger.error(f"Error adding speaker to conversation: {e}") - return JSONResponse( - status_code=500, content={"error": "Error adding speaker to conversation"} - ) + logger.exception(f"Error fetching conversations: {e}") + return JSONResponse(status_code=500, content={"error": "Error fetching conversations"}) -async def update_transcript_segment( - audio_uuid: str, - segment_index: int, - user: User, - speaker_id: Optional[str] = None, - start_time: Optional[float] = None, - end_time: Optional[float] = None, -): - """Update a specific transcript segment with speaker or timing information. Users can only modify their own conversations.""" +async def delete_conversation(conversation_id: str, user: User): + """Delete a conversation and its associated audio files. 
Users can only delete their own conversations.""" try: - # Find the conversation first - chunk = await chunks_col.find_one({"audio_uuid": audio_uuid}) - if not chunk: - return JSONResponse(status_code=404, content={"error": "Conversation not found"}) - - # Check ownership for non-admin users - if not user.is_superuser: - if not client_belongs_to_user(chunk["client_id"], user.user_id): - return JSONResponse(status_code=404, content={"error": "Conversation not found"}) - - update_doc = {} - - if speaker_id is not None: - update_doc[f"transcript.{segment_index}.speaker"] = speaker_id - # Add to speakers_identified if not already present - speakers = chunk.get("speakers_identified", []) - if speaker_id not in speakers: - speakers.append(speaker_id) - await chunks_col.update_one( - {"audio_uuid": audio_uuid}, {"$set": {"speakers_identified": speakers}} - ) - - if start_time is not None: - update_doc[f"transcript.{segment_index}.start"] = start_time - - if end_time is not None: - update_doc[f"transcript.{segment_index}.end"] = end_time - - if not update_doc: - return JSONResponse(status_code=400, content={"error": "No update parameters provided"}) - - result = await chunks_col.update_one({"audio_uuid": audio_uuid}, {"$set": update_doc}) - - if result.modified_count == 0: - return JSONResponse(status_code=400, content={"error": "No changes were made"}) + # Create masked identifier for logging + masked_id = f"{conversation_id[:8]}...{conversation_id[-4:]}" if len(conversation_id) > 12 else "***" + logger.info(f"Attempting to delete conversation: {masked_id}") - return JSONResponse(content={"message": "Transcript segment updated successfully"}) + # Find the conversation using Beanie + conversation = await Conversation.find_one(Conversation.conversation_id == conversation_id) - except Exception as e: - audio_logger.error(f"Error updating transcript segment: {e}") - return JSONResponse(status_code=500, content={"error": "Internal server error"}) - -async def 
delete_conversation(audio_uuid: str, user: User): - """Delete a conversation and its associated audio file. Users can only delete their own conversations.""" - try: - # Create masked identifier for logging - masked_uuid = f"{audio_uuid[:8]}...{audio_uuid[-4:]}" if len(audio_uuid) > 12 else "***" - logger.info(f"Attempting to delete conversation: {masked_uuid}") - - # Detailed debugging only when debug level is enabled - if logger.isEnabledFor(logging.DEBUG): - total_count = await chunks_col.count_documents({}) - logger.debug(f"Total conversations in collection: {total_count}") - logger.debug(f"UUID length: {len(audio_uuid)}, type: {type(audio_uuid)}") - - # First, get the audio chunk record to check ownership and get conversation_id - audio_chunk = await chunks_col.find_one({"audio_uuid": audio_uuid}) - - if logger.isEnabledFor(logging.DEBUG): - logger.debug(f"Audio chunk lookup result: {'found' if audio_chunk else 'not found'}") - if audio_chunk: - logger.debug(f"Found audio chunk with client_id: {audio_chunk.get('client_id')}") - logger.debug(f"Audio chunk has conversation_id: {audio_chunk.get('conversation_id')}") - else: - # Try alternative queries for debugging - regex_result = await chunks_col.find_one({"audio_uuid": {"$regex": f"^{audio_uuid}$", "$options": "i"}}) - contains_result = await chunks_col.find_one({"audio_uuid": {"$regex": audio_uuid}}) - logger.debug(f"Alternative query attempts - case insensitive: {'found' if regex_result else 'not found'}, substring: {'found' if contains_result else 'not found'}") - - if not audio_chunk: + if not conversation: return JSONResponse( status_code=404, - content={"error": f"Audio chunk with audio_uuid '{audio_uuid}' not found"} + content={"error": f"Conversation '{conversation_id}' not found"} ) - # Check if user has permission to delete this conversation - client_id = audio_chunk.get("client_id") - if not user.is_superuser and not client_belongs_to_user(client_id, user.user_id): + # Check ownership for non-admin 
users + if not user.is_superuser and conversation.user_id != str(user.user_id): logger.warning( - f"User {user.user_id} attempted to delete conversation {audio_uuid} without permission" + f"User {user.user_id} attempted to delete conversation {conversation_id} without permission" ) return JSONResponse( status_code=403, content={ "error": "Access forbidden. You can only delete your own conversations.", - "details": f"Conversation '{audio_uuid}' does not belong to your account." + "details": f"Conversation '{conversation_id}' does not belong to your account." } ) - # Get audio file paths for deletion - audio_path = audio_chunk.get("audio_path") - cropped_audio_path = audio_chunk.get("cropped_audio_path") - - # Get conversation_id if this audio chunk has an associated conversation - conversation_id = audio_chunk.get("conversation_id") - conversation_deleted = False - - # Delete from audio_chunks collection first - audio_result = await chunks_col.delete_one({"audio_uuid": audio_uuid}) - - if audio_result.deleted_count == 0: - return JSONResponse( - status_code=404, - content={"error": f"Failed to delete audio chunk with audio_uuid '{audio_uuid}'"} - ) + # Get file paths before deletion + audio_path = conversation.audio_path + cropped_audio_path = conversation.cropped_audio_path + audio_uuid = conversation.audio_uuid + client_id = conversation.client_id - logger.info(f"Deleted audio chunk {audio_uuid}") + # Delete the conversation from database + await conversation.delete() + logger.info(f"Deleted conversation {conversation_id}") - # If this audio chunk has an associated conversation, delete it from conversations collection too - if conversation_id: - try: - conversation_result = await conversations_col.delete_one({"conversation_id": conversation_id}) - if conversation_result.deleted_count > 0: - conversation_deleted = True - logger.info(f"Deleted conversation {conversation_id} associated with audio chunk {audio_uuid}") - else: - logger.warning(f"Conversation 
{conversation_id} not found in conversations collection, but audio chunk was deleted") - except Exception as e: - logger.warning(f"Failed to delete conversation {conversation_id}: {e}") + # Also delete from legacy AudioFile collection if it exists (backward compatibility) + audio_file = await AudioFile.find_one(AudioFile.audio_uuid == audio_uuid) + if audio_file: + await audio_file.delete() + logger.info(f"Deleted legacy audio file record for {audio_uuid}") - # Delete associated audio files + # Delete associated audio files from disk deleted_files = [] if audio_path: try: @@ -505,29 +248,26 @@ async def delete_conversation(audio_uuid: str, user: User): except Exception as e: logger.warning(f"Failed to delete cropped audio file {cropped_audio_path}: {e}") - logger.info(f"Successfully deleted conversation {audio_uuid} for user {user.user_id}") + logger.info(f"Successfully deleted conversation {conversation_id} for user {user.user_id}") # Prepare response message - delete_summary = [] - delete_summary.append("audio chunk") - if conversation_deleted: - delete_summary.append("conversation record") + delete_summary = ["conversation"] if deleted_files: delete_summary.append(f"{len(deleted_files)} audio file(s)") return JSONResponse( status_code=200, content={ - "message": f"Successfully deleted {', '.join(delete_summary)} for '{audio_uuid}'", + "message": f"Successfully deleted {', '.join(delete_summary)} '{conversation_id}'", "deleted_files": deleted_files, "client_id": client_id, "conversation_id": conversation_id, - "conversation_deleted": conversation_deleted + "audio_uuid": audio_uuid } ) except Exception as e: - logger.error(f"Error deleting conversation {audio_uuid}: {e}") + logger.error(f"Error deleting conversation {conversation_id}: {e}") return JSONResponse( status_code=500, content={"error": f"Failed to delete conversation: {str(e)}"} @@ -537,25 +277,19 @@ async def delete_conversation(audio_uuid: str, user: User): async def 
reprocess_transcript(conversation_id: str, user: User): """Reprocess transcript for a conversation. Users can only reprocess their own conversations.""" try: - # Find the conversation in conversations collection - conversations_repo = ConversationsRepository(conversations_col) - conversation = await conversations_repo.get_conversation(conversation_id) - if not conversation: + # Find the conversation using Beanie + conversation_model = await Conversation.find_one(Conversation.conversation_id == conversation_id) + if not conversation_model: return JSONResponse(status_code=404, content={"error": "Conversation not found"}) # Check ownership for non-admin users - if not user.is_superuser and conversation["user_id"] != str(user.user_id): + if not user.is_superuser and conversation_model.user_id != str(user.user_id): return JSONResponse(status_code=403, content={"error": "Access forbidden. You can only reprocess your own conversations."}) - # Get audio_uuid for file access - audio_uuid = conversation["audio_uuid"] + # Get audio_uuid and file path from conversation + audio_uuid = conversation_model.audio_uuid + audio_path = conversation_model.audio_path - # Get audio file path from audio_chunks collection - chunk = await chunks_col.find_one({"audio_uuid": audio_uuid}) - if not chunk: - return JSONResponse(status_code=404, content={"error": "Audio session not found"}) - - audio_path = chunk.get("audio_path") if not audio_path: return JSONResponse( status_code=400, content={"error": "No audio file found for this conversation"} @@ -563,7 +297,6 @@ async def reprocess_transcript(conversation_id: str, user: User): # Check if file exists - try multiple possible locations possible_paths = [ - Path("/app/data/audio_chunks") / audio_path, Path("/app/audio_chunks") / audio_path, Path(audio_path), # fallback to relative path ] @@ -584,47 +317,86 @@ async def reprocess_transcript(conversation_id: str, user: User): } ) - # Generate configuration hash for duplicate detection - 
config_data = { - "audio_path": str(full_audio_path), - "transcription_provider": "deepgram", # This would come from settings - "trigger": "manual_reprocess" - } - config_hash = hashlib.sha256(str(config_data).encode()).hexdigest()[:16] - - # Create processing run - run_id = await processing_runs_repo.create_run( - conversation_id=conversation_id, - audio_uuid=audio_uuid, - run_type="transcript", - user_id=user.user_id, - trigger="manual_reprocess", - config_hash=config_hash + # Create new transcript version ID + import uuid + version_id = str(uuid.uuid4()) + + # Enqueue job chain with RQ (transcription -> speaker recognition -> cropping -> memory) + from advanced_omi_backend.workers.transcription_jobs import transcribe_full_audio_job + from advanced_omi_backend.workers.speaker_jobs import recognise_speakers_job + from advanced_omi_backend.workers.audio_jobs import process_cropping_job + from advanced_omi_backend.workers.memory_jobs import process_memory_job + from advanced_omi_backend.controllers.queue_controller import transcription_queue, memory_queue, default_queue, JOB_RESULT_TTL + + # Job 1: Transcribe audio to text + transcript_job = transcription_queue.enqueue( + transcribe_full_audio_job, + conversation_id, + audio_uuid, + str(full_audio_path), + version_id, + "reprocess", + job_timeout=600, + result_ttl=JOB_RESULT_TTL, + job_id=f"reprocess_{conversation_id[:8]}", + description=f"Transcribe audio for {conversation_id[:8]}", + meta={'audio_uuid': audio_uuid, 'conversation_id': conversation_id} ) - - # Create new transcript version in conversations collection - version_id = await conversations_repo.create_transcript_version( - conversation_id=conversation_id, - processing_run_id=run_id + logger.info(f"📥 RQ: Enqueued transcription job {transcript_job.id}") + + # Job 2: Recognize speakers (depends on transcription) + speaker_job = transcription_queue.enqueue( + recognise_speakers_job, + conversation_id, + version_id, + str(full_audio_path), + "", # 
transcript_text - will be read from DB + [], # words - will be read from DB + depends_on=transcript_job, + job_timeout=600, + result_ttl=JOB_RESULT_TTL, + job_id=f"speaker_{conversation_id[:8]}", + description=f"Recognize speakers for {conversation_id[:8]}", + meta={'audio_uuid': audio_uuid, 'conversation_id': conversation_id} ) + logger.info(f"📥 RQ: Enqueued speaker recognition job {speaker_job.id} (depends on {transcript_job.id})") + + # Job 3: Audio cropping (depends on speaker recognition) + cropping_job = default_queue.enqueue( + process_cropping_job, + conversation_id, + str(full_audio_path), + depends_on=speaker_job, + job_timeout=300, + result_ttl=JOB_RESULT_TTL, + job_id=f"crop_{conversation_id[:8]}", + description=f"Crop audio for {conversation_id[:8]}", + meta={'audio_uuid': audio_uuid, 'conversation_id': conversation_id} + ) + logger.info(f"📥 RQ: Enqueued audio cropping job {cropping_job.id} (depends on {speaker_job.id})") + + # Job 4: Extract memories (depends on cropping) + # Note: redis_client is injected by @async_job decorator, don't pass it directly + memory_job = memory_queue.enqueue( + process_memory_job, + conversation_id, + depends_on=cropping_job, + job_timeout=1800, + result_ttl=JOB_RESULT_TTL, + job_id=f"memory_{conversation_id[:8]}", + description=f"Extract memories for {conversation_id[:8]}", + meta={'audio_uuid': audio_uuid, 'conversation_id': conversation_id} + ) + logger.info(f"📥 RQ: Enqueued memory job {memory_job.id} (depends on {cropping_job.id})") - if not version_id: - return JSONResponse( - status_code=500, content={"error": "Failed to create transcript version"} - ) - - # TODO: Queue audio for reprocessing with ProcessorManager - # This is where we would integrate with the existing processor - # For now, we'll return the version ID for the caller to handle - - logger.info(f"Created transcript reprocessing job {run_id} (version {version_id}) for conversation {conversation_id}") + job = transcript_job # For backward compatibility 
with return value + logger.info(f"Created transcript reprocessing job {job.id} (version: {version_id}) for conversation {conversation_id}") return JSONResponse(content={ "message": f"Transcript reprocessing started for conversation {conversation_id}", - "run_id": run_id, + "job_id": job.id, "version_id": version_id, - "config_hash": config_hash, - "status": "PENDING" + "status": "queued" }) except Exception as e: @@ -635,25 +407,19 @@ async def reprocess_transcript(conversation_id: str, user: User): async def reprocess_memory(conversation_id: str, transcript_version_id: str, user: User): """Reprocess memory extraction for a specific transcript version. Users can only reprocess their own conversations.""" try: - # Find the conversation in conversations collection - conversations_repo = ConversationsRepository(conversations_col) - conversation = await conversations_repo.get_conversation(conversation_id) - if not conversation: + # Find the conversation using Beanie + conversation_model = await Conversation.find_one(Conversation.conversation_id == conversation_id) + if not conversation_model: return JSONResponse(status_code=404, content={"error": "Conversation not found"}) # Check ownership for non-admin users - if not user.is_superuser and conversation["user_id"] != str(user.user_id): + if not user.is_superuser and conversation_model.user_id != str(user.user_id): return JSONResponse(status_code=403, content={"error": "Access forbidden. 
You can only reprocess your own conversations."}) - # Get audio_uuid for processing run tracking - audio_uuid = conversation["audio_uuid"] - # Resolve transcript version ID - transcript_versions = conversation.get("transcript_versions", []) - # Handle special "active" version ID if transcript_version_id == "active": - active_version_id = conversation.get("active_transcript_version") + active_version_id = conversation_model.active_transcript_version if not active_version_id: return JSONResponse( status_code=404, content={"error": "No active transcript version found"} @@ -662,8 +428,8 @@ async def reprocess_memory(conversation_id: str, transcript_version_id: str, use # Find the specific transcript version transcript_version = None - for version in transcript_versions: - if version["version_id"] == transcript_version_id: + for version in conversation_model.transcript_versions: + if version.version_id == transcript_version_id: transcript_version = version break @@ -672,48 +438,30 @@ async def reprocess_memory(conversation_id: str, transcript_version_id: str, use status_code=404, content={"error": f"Transcript version '{transcript_version_id}' not found"} ) - # Generate configuration hash for duplicate detection - config_data = { - "transcript_version_id": transcript_version_id, - "memory_provider": "friend_lite", # This would come from settings - "trigger": "manual_reprocess" - } - config_hash = hashlib.sha256(str(config_data).encode()).hexdigest()[:16] + # Create new memory version ID + import uuid + version_id = str(uuid.uuid4()) - # Create processing run - run_id = await processing_runs_repo.create_run( - conversation_id=conversation_id, - audio_uuid=audio_uuid, - run_type="memory", - user_id=user.user_id, - trigger="manual_reprocess", - config_hash=config_hash - ) + # Enqueue memory processing job with RQ (RQ handles job tracking) + from advanced_omi_backend.workers.memory_jobs import enqueue_memory_processing + from advanced_omi_backend.models.job import 
JobPriority - # Create new memory version in conversations collection - version_id = await conversations_repo.create_memory_version( + job = enqueue_memory_processing( + client_id=conversation_model.client_id, + user_id=str(user.user_id), + user_email=user.email, conversation_id=conversation_id, - transcript_version_id=transcript_version_id, - processing_run_id=run_id + priority=JobPriority.NORMAL ) - if not version_id: - return JSONResponse( - status_code=500, content={"error": "Failed to create memory version"} - ) - - # TODO: Queue memory extraction for processing - # This is where we would integrate with the existing memory processor - - logger.info(f"Created memory reprocessing job {run_id} (version {version_id}) for conversation {conversation_id}") + logger.info(f"Created memory reprocessing job {job.id} (version {version_id}) for conversation {conversation_id}") return JSONResponse(content={ "message": f"Memory reprocessing started for conversation {conversation_id}", - "run_id": run_id, + "job_id": job.id, "version_id": version_id, "transcript_version_id": transcript_version_id, - "config_hash": config_hash, - "status": "PENDING" + "status": "queued" }) except Exception as e: @@ -724,23 +472,24 @@ async def reprocess_memory(conversation_id: str, transcript_version_id: str, use async def activate_transcript_version(conversation_id: str, version_id: str, user: User): """Activate a specific transcript version. 
Users can only modify their own conversations.""" try: - # Find the conversation in conversations collection - conversations_repo = ConversationsRepository(conversations_col) - conversation = await conversations_repo.get_conversation(conversation_id) - if not conversation: + # Find the conversation using Beanie + conversation_model = await Conversation.find_one(Conversation.conversation_id == conversation_id) + if not conversation_model: return JSONResponse(status_code=404, content={"error": "Conversation not found"}) # Check ownership for non-admin users - if not user.is_superuser and conversation["user_id"] != str(user.user_id): + if not user.is_superuser and conversation_model.user_id != str(user.user_id): return JSONResponse(status_code=403, content={"error": "Access forbidden. You can only modify your own conversations."}) - # Activate the transcript version - success = await conversations_repo.activate_transcript_version(conversation_id, version_id) + # Activate the transcript version using Beanie model method + success = conversation_model.set_active_transcript_version(version_id) if not success: return JSONResponse( status_code=400, content={"error": "Failed to activate transcript version"} ) + await conversation_model.save() + # TODO: Trigger speaker recognition if configured # This would integrate with existing speaker recognition logic @@ -759,23 +508,24 @@ async def activate_transcript_version(conversation_id: str, version_id: str, use async def activate_memory_version(conversation_id: str, version_id: str, user: User): """Activate a specific memory version. 
Users can only modify their own conversations.""" try: - # Find the conversation in conversations collection - conversations_repo = ConversationsRepository(conversations_col) - conversation = await conversations_repo.get_conversation(conversation_id) - if not conversation: + # Find the conversation using Beanie + conversation_model = await Conversation.find_one(Conversation.conversation_id == conversation_id) + if not conversation_model: return JSONResponse(status_code=404, content={"error": "Conversation not found"}) # Check ownership for non-admin users - if not user.is_superuser and conversation["user_id"] != str(user.user_id): + if not user.is_superuser and conversation_model.user_id != str(user.user_id): return JSONResponse(status_code=403, content={"error": "Access forbidden. You can only modify your own conversations."}) - # Activate the memory version - success = await conversations_repo.activate_memory_version(conversation_id, version_id) + # Activate the memory version using Beanie model method + success = conversation_model.set_active_memory_version(version_id) if not success: return JSONResponse( status_code=400, content={"error": "Failed to activate memory version"} ) + await conversation_model.save() + logger.info(f"Activated memory version {version_id} for conversation {conversation_id} by user {user.user_id}") return JSONResponse(content={ @@ -791,18 +541,38 @@ async def activate_memory_version(conversation_id: str, version_id: str, user: U async def get_conversation_version_history(conversation_id: str, user: User): """Get version history for a conversation. 
Users can only access their own conversations.""" try: - # Find the conversation in conversations collection to check ownership - conversations_repo = ConversationsRepository(conversations_col) - conversation = await conversations_repo.get_conversation(conversation_id) - if not conversation: + # Find the conversation using Beanie to check ownership + conversation_model = await Conversation.find_one(Conversation.conversation_id == conversation_id) + if not conversation_model: return JSONResponse(status_code=404, content={"error": "Conversation not found"}) # Check ownership for non-admin users - if not user.is_superuser and conversation["user_id"] != str(user.user_id): + if not user.is_superuser and conversation_model.user_id != str(user.user_id): return JSONResponse(status_code=403, content={"error": "Access forbidden. You can only access your own conversations."}) - # Get version history - history = await conversations_repo.get_version_history(conversation_id) + # Get version history from model + # Convert datetime objects to ISO strings for JSON serialization + transcript_versions = [] + for v in conversation_model.transcript_versions: + version_dict = v.model_dump() + if version_dict.get('created_at'): + version_dict['created_at'] = version_dict['created_at'].isoformat() + transcript_versions.append(version_dict) + + memory_versions = [] + for v in conversation_model.memory_versions: + version_dict = v.model_dump() + if version_dict.get('created_at'): + version_dict['created_at'] = version_dict['created_at'].isoformat() + memory_versions.append(version_dict) + + history = { + "conversation_id": conversation_id, + "active_transcript_version": conversation_model.active_transcript_version, + "active_memory_version": conversation_model.active_memory_version, + "transcript_versions": transcript_versions, + "memory_versions": memory_versions + } return JSONResponse(content=history) diff --git a/backends/advanced/src/advanced_omi_backend/controllers/queue_controller.py 
b/backends/advanced/src/advanced_omi_backend/controllers/queue_controller.py new file mode 100644 index 00000000..91773756 --- /dev/null +++ b/backends/advanced/src/advanced_omi_backend/controllers/queue_controller.py @@ -0,0 +1,742 @@ +""" +Queue Controller - RQ queue configuration, management and monitoring. + +This module provides: +- Queue setup and configuration +- Job statistics and monitoring +- Queue health checks +- Beanie initialization for workers +""" + +import asyncio +import os +import logging +import uuid +from datetime import datetime +from typing import Dict, Any, Optional + +import redis +from rq import Queue, Worker +from rq.job import Job +from rq.registry import ScheduledJobRegistry, DeferredJobRegistry + +from advanced_omi_backend.models.job import JobPriority +from advanced_omi_backend.models.conversation import Conversation + +logger = logging.getLogger(__name__) + +# Redis connection configuration +REDIS_URL = os.getenv("REDIS_URL", "redis://localhost:6379/0") +redis_conn = redis.from_url(REDIS_URL) + +# Queue name constants +TRANSCRIPTION_QUEUE = "transcription" +MEMORY_QUEUE = "memory" +AUDIO_QUEUE = "audio" +DEFAULT_QUEUE = "default" + +# Centralized list of all queue names +QUEUE_NAMES = [DEFAULT_QUEUE, TRANSCRIPTION_QUEUE, MEMORY_QUEUE, AUDIO_QUEUE] + +# Job retention configuration +JOB_RESULT_TTL = int(os.getenv("RQ_RESULT_TTL", 86400)) # 24 hour default + +# Create queues with custom result TTL +transcription_queue = Queue(TRANSCRIPTION_QUEUE, connection=redis_conn, default_timeout=86400) # 24 hours for streaming jobs +memory_queue = Queue(MEMORY_QUEUE, connection=redis_conn, default_timeout=300) +audio_queue = Queue(AUDIO_QUEUE, connection=redis_conn, default_timeout=86400) # 24 hours for all-day sessions +default_queue = Queue(DEFAULT_QUEUE, connection=redis_conn, default_timeout=300) + + +def get_queue(queue_name: str = DEFAULT_QUEUE) -> Queue: + """Get an RQ queue by name.""" + queues = { + TRANSCRIPTION_QUEUE: 
transcription_queue, + MEMORY_QUEUE: memory_queue, + AUDIO_QUEUE: audio_queue, + DEFAULT_QUEUE: default_queue, + } + return queues.get(queue_name, default_queue) + + +def get_job_stats() -> Dict[str, Any]: + """Get statistics about jobs in all queues matching frontend expectations.""" + total_jobs = 0 + queued_jobs = 0 + processing_jobs = 0 + completed_jobs = 0 + failed_jobs = 0 + cancelled_jobs = 0 + deferred_jobs = 0 # Jobs waiting for dependencies (depends_on) + + for queue_name in QUEUE_NAMES: + queue = get_queue(queue_name) + + queued_jobs += len(queue) + processing_jobs += len(queue.started_job_registry) + completed_jobs += len(queue.finished_job_registry) + failed_jobs += len(queue.failed_job_registry) + cancelled_jobs += len(queue.canceled_job_registry) + deferred_jobs += len(queue.deferred_job_registry) + + total_jobs = queued_jobs + processing_jobs + completed_jobs + failed_jobs + cancelled_jobs + deferred_jobs + + return { + "total_jobs": total_jobs, + "queued_jobs": queued_jobs, + "processing_jobs": processing_jobs, + "completed_jobs": completed_jobs, + "failed_jobs": failed_jobs, + "cancelled_jobs": cancelled_jobs, + "deferred_jobs": deferred_jobs, + "timestamp": datetime.utcnow().isoformat() + } + + +def get_jobs( + limit: int = 20, + offset: int = 0, + queue_name: str = None, + job_type: str = None, + client_id: str = None +) -> Dict[str, Any]: + """ + Get jobs from a specific queue or all queues with optional filtering. 
+ + Args: + limit: Maximum number of jobs to return + offset: Number of jobs to skip + queue_name: Specific queue name or None for all queues + job_type: Filter by job type (matches func_name, e.g., "speech_detection") + client_id: Filter by client_id in job meta (partial match) + + Returns: + Dict with jobs list and pagination metadata matching frontend expectations + """ + all_jobs = [] + + queues_to_check = [queue_name] if queue_name else QUEUE_NAMES + + for qname in queues_to_check: + queue = get_queue(qname) + + # Collect jobs from all registries + registries = [ + (queue.job_ids, "queued"), + (queue.started_job_registry.get_job_ids(), "processing"), + (queue.finished_job_registry.get_job_ids(), "completed"), + (queue.failed_job_registry.get_job_ids(), "failed"), + (queue.deferred_job_registry.get_job_ids(), "deferred"), # Jobs waiting for dependencies + ] + + for job_ids, status in registries: + for job_id in job_ids: + try: + job = Job.fetch(job_id, connection=redis_conn) + + # Extract user_id from kwargs if present + user_id = job.kwargs.get("user_id", "") if job.kwargs else "" + + # Extract just the function name (e.g., "listen_for_speech_job" from "module.listen_for_speech_job") + func_name = job.func_name.split('.')[-1] if job.func_name else "unknown" + + # Apply job_type filter + if job_type and job_type not in func_name: + continue + + # Apply client_id filter (partial match in meta) + if client_id: + job_client_id = job.meta.get("client_id", "") if job.meta else "" + if client_id not in job_client_id: + continue + + all_jobs.append({ + "job_id": job.id, + "job_type": func_name, + "user_id": user_id, + "status": status, + "priority": "normal", # RQ doesn't track priority in metadata + "data": { + "description": job.description or "", + "queue": qname, + }, + "result": job.result if hasattr(job, 'result') else None, + "meta": job.meta if job.meta else {}, # Include job metadata + "error_message": str(job.exc_info) if job.exc_info else None, + 
"created_at": job.created_at.isoformat() if job.created_at else None, + "started_at": job.started_at.isoformat() if job.started_at else None, + "completed_at": job.ended_at.isoformat() if job.ended_at else None, + "retry_count": job.retries_left if hasattr(job, 'retries_left') else 0, + "max_retries": 3, # Default max retries + "progress_percent": 0, # RQ doesn't track progress by default + "progress_message": "", + }) + except Exception as e: + logger.error(f"Error fetching job {job_id}: {e}") + + # Sort by created_at (most recent first) + all_jobs.sort(key=lambda x: x.get("created_at") or "", reverse=True) + + # Paginate + total_jobs = len(all_jobs) + paginated_jobs = all_jobs[offset:offset + limit] + has_more = (offset + limit) < total_jobs + + return { + "jobs": paginated_jobs, + "pagination": { + "total": total_jobs, + "limit": limit, + "offset": offset, + "has_more": has_more, + } + } + + +def all_jobs_complete_for_session(session_id: str) -> bool: + """ + Check if all jobs associated with a session are in terminal states. + + Only checks jobs with audio_uuid in job.meta (no backward compatibility). + Traverses dependency chains to include dependent jobs. 
+ + Args: + session_id: The audio_uuid (session ID) to check jobs for + + Returns: + True if all jobs are complete (or no jobs found), False if any job is still processing + """ + processed_job_ids = set() + + def is_job_complete(job): + """Recursively check if job and all its dependents are terminal.""" + if job.id in processed_job_ids: + return True + processed_job_ids.add(job.id) + + # Check if this job is terminal + if not (job.is_finished or job.is_failed or job.is_canceled): + logger.debug(f"Job {job.id} ({job.func_name}) is not terminal") + return False + + # Check dependent jobs + for dep_id in (job.dependent_ids or []): + try: + dep_job = Job.fetch(dep_id, connection=redis_conn) + if not is_job_complete(dep_job): + return False + except Exception as e: + logger.debug(f"Error fetching dependent job {dep_id}: {e}") + + return True + + # Find all jobs for this session + all_queues = [transcription_queue, memory_queue, audio_queue, default_queue] + for queue in all_queues: + registries = [ + queue.job_ids, + queue.started_job_registry.get_job_ids(), + queue.finished_job_registry.get_job_ids(), + queue.failed_job_registry.get_job_ids(), + queue.canceled_job_registry.get_job_ids(), + ScheduledJobRegistry(queue=queue).get_job_ids(), + DeferredJobRegistry(queue=queue).get_job_ids(), + ] + + for job_ids in registries: + for job_id in job_ids: + try: + job = Job.fetch(job_id, connection=redis_conn) + + # Only check jobs with audio_uuid in meta + if job.meta and job.meta.get('audio_uuid') == session_id: + if not is_job_complete(job): + return False + except Exception as e: + logger.debug(f"Error checking job {job_id}: {e}") + + return True + + +def start_streaming_jobs( + session_id: str, + user_id: str, + client_id: str +) -> Dict[str, str]: + """ + Enqueue jobs for streaming audio session (initial session setup). + + This starts the parallel job processing for a NEW streaming session: + 1. Speech detection job - monitors transcription results for speech + 2. 
Audio persistence job - writes audio chunks to WAV file (file rotation per conversation) + + Args: + session_id: Stream session ID (audio_uuid) + user_id: User identifier + client_id: Client identifier + + Returns: + Dict with job IDs: {'speech_detection': job_id, 'audio_persistence': job_id} + + Note: user_email is fetched from the database when needed. + """ + from advanced_omi_backend.workers.transcription_jobs import stream_speech_detection_job + from advanced_omi_backend.workers.audio_jobs import audio_streaming_persistence_job + + # Enqueue speech detection job + speech_job = transcription_queue.enqueue( + stream_speech_detection_job, + session_id, + user_id, + client_id, + job_timeout=86400, # 24 hours for all-day sessions + result_ttl=JOB_RESULT_TTL, + job_id=f"speech-detect_{session_id[:12]}", + description=f"Listening for speech...", + meta={'audio_uuid': session_id, 'client_id': client_id, 'session_level': True} + ) + logger.info(f"📥 RQ: Enqueued speech detection job {speech_job.id}") + + # Store job ID for cleanup (keyed by client_id for easy WebSocket cleanup) + try: + redis_conn.set(f"speech_detection_job:{client_id}", speech_job.id, ex=86400) # 24 hour TTL + logger.info(f"📌 Stored speech detection job ID for client {client_id}") + except Exception as e: + logger.warning(f"⚠️ Failed to store job ID for {client_id}: {e}") + + # Enqueue audio persistence job on dedicated audio queue + # NOTE: This job handles file rotation for multiple conversations automatically + # Runs for entire session, not tied to individual conversations + audio_job = audio_queue.enqueue( + audio_streaming_persistence_job, + session_id, + user_id, + client_id, + job_timeout=86400, # 24 hours for all-day sessions + result_ttl=JOB_RESULT_TTL, + job_id=f"audio-persist_{session_id[:12]}", + description=f"Audio persistence for session {session_id[:12]}", + meta={'audio_uuid': session_id, 'session_level': True} # Mark as session-level job + ) + logger.info(f"📥 RQ: Enqueued audio 
persistence job {audio_job.id} on audio queue") + + return { + 'speech_detection': speech_job.id, + 'audio_persistence': audio_job.id + } + + +def start_post_conversation_jobs( + conversation_id: str, + audio_uuid: str, + audio_file_path: str, + user_id: str, + post_transcription: bool = True, + transcript_version_id: Optional[str] = None, + depends_on_job = None, + client_id: Optional[str] = None +) -> Dict[str, str]: + """ + Start post-conversation processing jobs after conversation is created. + + This creates the standard processing chain after a conversation is created: + 1. [Optional] Transcription job - Batch transcription (if post_transcription=True) + 2. Audio cropping job - Removes silence from audio + 3. Speaker recognition job - Identifies speakers in audio + 4. Memory extraction job - Extracts memories from conversation (parallel) + 5. Title/summary generation job - Generates title and summary (parallel) + + Args: + conversation_id: Conversation identifier + audio_uuid: Audio UUID for job tracking + audio_file_path: Path to audio file + user_id: User identifier + post_transcription: If True, run batch transcription step (for uploads) + If False, skip transcription (streaming already has it) + transcript_version_id: Transcript version ID (auto-generated if None) + depends_on_job: Optional job dependency for cropping job + + Returns: + Dict with job IDs (transcription will be None if post_transcription=False) + """ + from advanced_omi_backend.workers.transcription_jobs import transcribe_full_audio_job + from advanced_omi_backend.workers.speaker_jobs import recognise_speakers_job + from advanced_omi_backend.workers.audio_jobs import process_cropping_job + from advanced_omi_backend.workers.memory_jobs import process_memory_job + from advanced_omi_backend.workers.conversation_jobs import generate_title_summary_job + + version_id = transcript_version_id or str(uuid.uuid4()) + + # Build job metadata (include client_id if provided for UI tracking) + job_meta = 
{'audio_uuid': audio_uuid, 'conversation_id': conversation_id} + if client_id: + job_meta['client_id'] = client_id + + # Step 1: Batch transcription job (ALWAYS run to get correct conversation-relative timestamps) + # Even for streaming, we need batch transcription before cropping to fix cumulative timestamps + transcribe_job_id = f"transcribe_{conversation_id[:12]}" + logger.info(f"🔍 DEBUG: Creating transcribe job with job_id={transcribe_job_id}, conversation_id={conversation_id[:12]}, audio_uuid={audio_uuid[:12]}") + + transcription_job = transcription_queue.enqueue( + transcribe_full_audio_job, + conversation_id, + audio_uuid, + audio_file_path, + version_id, + "batch", # trigger + job_timeout=1800, # 30 minutes + result_ttl=JOB_RESULT_TTL, + depends_on=depends_on_job, + job_id=transcribe_job_id, + description=f"Transcribe conversation {conversation_id[:8]}", + meta=job_meta + ) + logger.info(f"📥 RQ: Enqueued transcription job {transcription_job.id}, meta={transcription_job.meta}") + crop_depends_on = transcription_job + + # Step 2: Audio cropping job (depends on transcription if it ran, otherwise depends_on_job) + crop_job_id = f"crop_{conversation_id[:12]}" + logger.info(f"🔍 DEBUG: Creating crop job with job_id={crop_job_id}, conversation_id={conversation_id[:12]}, audio_uuid={audio_uuid[:12]}") + + cropping_job = default_queue.enqueue( + process_cropping_job, + conversation_id, + audio_file_path, + job_timeout=300, # 5 minutes + result_ttl=JOB_RESULT_TTL, + depends_on=crop_depends_on, + job_id=crop_job_id, + description=f"Crop audio for conversation {conversation_id[:8]}", + meta=job_meta + ) + logger.info(f"📥 RQ: Enqueued cropping job {cropping_job.id}, meta={cropping_job.meta}") + + # Speaker recognition depends on cropping + speaker_depends_on = cropping_job + + # Step 3: Speaker recognition job + speaker_job_id = f"speaker_{conversation_id[:12]}" + logger.info(f"🔍 DEBUG: Creating speaker job with job_id={speaker_job_id}, 
conversation_id={conversation_id[:12]}, audio_uuid={audio_uuid[:12]}") + + speaker_job = transcription_queue.enqueue( + recognise_speakers_job, + conversation_id, + version_id, + audio_file_path, + "", # transcript_text - will be read from DB + [], # words - will be read from DB + job_timeout=1200, # 20 minutes + result_ttl=JOB_RESULT_TTL, + depends_on=speaker_depends_on, + job_id=speaker_job_id, + description=f"Speaker recognition for conversation {conversation_id[:8]}", + meta=job_meta + ) + logger.info(f"📥 RQ: Enqueued speaker recognition job {speaker_job.id}, meta={speaker_job.meta} (depends on {speaker_depends_on.id})") + + # Step 4: Memory extraction job (parallel with title/summary) + memory_job_id = f"memory_{conversation_id[:12]}" + logger.info(f"🔍 DEBUG: Creating memory job with job_id={memory_job_id}, conversation_id={conversation_id[:12]}, audio_uuid={audio_uuid[:12]}") + + memory_job = memory_queue.enqueue( + process_memory_job, + conversation_id, + job_timeout=900, # 15 minutes + result_ttl=JOB_RESULT_TTL, + depends_on=speaker_job, + job_id=memory_job_id, + description=f"Memory extraction for conversation {conversation_id[:8]}", + meta=job_meta + ) + logger.info(f"📥 RQ: Enqueued memory extraction job {memory_job.id}, meta={memory_job.meta} (depends on {speaker_job.id})") + + # Step 5: Title/summary generation job (parallel with memory, independent) + # This ensures conversations always get titles/summaries even if memory job fails + title_job_id = f"title_summary_{conversation_id[:12]}" + logger.info(f"🔍 DEBUG: Creating title/summary job with job_id={title_job_id}, conversation_id={conversation_id[:12]}, audio_uuid={audio_uuid[:12]}") + + title_summary_job = default_queue.enqueue( + generate_title_summary_job, + conversation_id, + job_timeout=300, # 5 minutes + result_ttl=JOB_RESULT_TTL, + depends_on=speaker_job, # Depends on speaker job, NOT memory job + job_id=title_job_id, + description=f"Generate title and summary for conversation 
{conversation_id[:8]}", + meta=job_meta + ) + logger.info(f"📥 RQ: Enqueued title/summary job {title_summary_job.id}, meta={title_summary_job.meta} (depends on {speaker_job.id})") + + return { + 'cropping': cropping_job.id, + 'transcription': transcription_job.id if transcription_job else None, + 'speaker_recognition': speaker_job.id, + 'memory': memory_job.id, + 'title_summary': title_summary_job.id + } + + + + +def get_queue_health() -> Dict[str, Any]: + """Get health status of all queues and workers.""" + health = { + "queues": {}, + "workers": [], + "redis_connection": "unknown", + "total_workers": 0, + "active_workers": 0, + "idle_workers": 0, + } + + # Check Redis connection + try: + redis_conn.ping() + health["redis_connection"] = "healthy" + except Exception as e: + health["redis_connection"] = f"unhealthy: {e}" + return health + + # Check each queue + for queue_name in QUEUE_NAMES: + queue = get_queue(queue_name) + health["queues"][queue_name] = { + "count": len(queue), + "failed_count": len(queue.failed_job_registry), + "finished_count": len(queue.finished_job_registry), + "started_count": len(queue.started_job_registry), + } + + # Check workers + workers = Worker.all(connection=redis_conn) + health["total_workers"] = len(workers) + + for worker in workers: + state = worker.get_state() + current_job = worker.get_current_job_id() + + # Count active vs idle workers + if current_job or state == "busy": + health["active_workers"] += 1 + else: + health["idle_workers"] += 1 + + health["workers"].append({ + "name": worker.name, + "state": state, + "queues": [q.name for q in worker.queues], + "current_job": current_job, + }) + + return health + +# needs tidying but works for now +async def cleanup_stuck_stream_workers(request): + """Clean up stuck Redis Stream consumers and pending messages from all active streams.""" + import time + from fastapi.responses import JSONResponse + + try: + # Get Redis client from request.app.state (initialized during startup) + 
redis_client = request.app.state.redis_audio_stream + + if not redis_client: + return JSONResponse( + status_code=503, + content={"error": "Redis client for audio streaming not initialized"} + ) + + cleanup_results = {} + total_cleaned = 0 + total_deleted_consumers = 0 + total_deleted_streams = 0 + current_time = time.time() + + # Discover all audio streams (per-client streams) + stream_keys = await redis_client.keys("audio:stream:*") + + for stream_key in stream_keys: + stream_name = stream_key.decode() if isinstance(stream_key, bytes) else stream_key + + try: + # First check stream age - delete old streams (>1 hour) immediately + stream_info = await redis_client.execute_command('XINFO', 'STREAM', stream_name) + + # Parse stream info + info_dict = {} + for i in range(0, len(stream_info), 2): + key_name = stream_info[i].decode() if isinstance(stream_info[i], bytes) else str(stream_info[i]) + info_dict[key_name] = stream_info[i+1] + + stream_length = int(info_dict.get("length", 0)) + last_entry = info_dict.get("last-entry") + + # Check if stream is old + should_delete_stream = False + stream_age = 0 + + if stream_length == 0: + should_delete_stream = True + stream_age = 0 + elif last_entry and isinstance(last_entry, list) and len(last_entry) > 0: + try: + last_id = last_entry[0] + if isinstance(last_id, bytes): + last_id = last_id.decode() + last_timestamp_ms = int(last_id.split('-')[0]) + last_timestamp_s = last_timestamp_ms / 1000 + stream_age = current_time - last_timestamp_s + + # Delete streams older than 1 hour (3600 seconds) + if stream_age > 3600: + should_delete_stream = True + except (ValueError, IndexError): + pass + + if should_delete_stream: + await redis_client.delete(stream_name) + total_deleted_streams += 1 + cleanup_results[stream_name] = { + "message": f"Deleted old stream (age: {stream_age:.0f}s, length: {stream_length})", + "cleaned": 0, + "deleted_consumers": 0, + "deleted_stream": True, + "stream_age": stream_age + } + continue + + # Get 
consumer groups + groups = await redis_client.execute_command('XINFO', 'GROUPS', stream_name) + + if not groups: + cleanup_results[stream_name] = {"message": "No consumer groups found", "cleaned": 0, "deleted_stream": False} + continue + + # Parse first group + group_dict = {} + group = groups[0] + for i in range(0, len(group), 2): + key = group[i].decode() if isinstance(group[i], bytes) else str(group[i]) + value = group[i+1] + if isinstance(value, bytes): + try: + value = value.decode() + except UnicodeDecodeError: + value = str(value) + group_dict[key] = value + + group_name = group_dict.get("name", "unknown") + if isinstance(group_name, bytes): + group_name = group_name.decode() + + pending_count = int(group_dict.get("pending", 0)) + + # Get consumers for this group to check per-consumer pending + consumers = await redis_client.execute_command('XINFO', 'CONSUMERS', stream_name, group_name) + + cleaned_count = 0 + total_consumer_pending = 0 + + # Clean up pending messages for each consumer AND delete dead consumers + deleted_consumers = 0 + for consumer in consumers: + consumer_dict = {} + for i in range(0, len(consumer), 2): + key = consumer[i].decode() if isinstance(consumer[i], bytes) else str(consumer[i]) + value = consumer[i+1] + if isinstance(value, bytes): + try: + value = value.decode() + except UnicodeDecodeError: + value = str(value) + consumer_dict[key] = value + + consumer_name = consumer_dict.get("name", "unknown") + if isinstance(consumer_name, bytes): + consumer_name = consumer_name.decode() + + consumer_pending = int(consumer_dict.get("pending", 0)) + consumer_idle_ms = int(consumer_dict.get("idle", 0)) + total_consumer_pending += consumer_pending + + # Check if consumer is dead (idle > 5 minutes = 300000ms) + is_dead = consumer_idle_ms > 300000 + + if consumer_pending > 0: + logger.info(f"Found {consumer_pending} pending messages for consumer {consumer_name} (idle: {consumer_idle_ms}ms)") + + # Get pending messages for this specific consumer + 
try: + pending_messages = await redis_client.execute_command( + 'XPENDING', stream_name, group_name, '-', '+', str(consumer_pending), consumer_name + ) + + # XPENDING returns flat list: [msg_id, consumer, idle_ms, delivery_count, msg_id, ...] + # Parse in groups of 4 + for i in range(0, len(pending_messages), 4): + if i < len(pending_messages): + msg_id = pending_messages[i] + if isinstance(msg_id, bytes): + msg_id = msg_id.decode() + + # Claim the message to a cleanup worker + try: + await redis_client.execute_command( + 'XCLAIM', stream_name, group_name, 'cleanup-worker', '0', msg_id + ) + + # Acknowledge it immediately + await redis_client.xack(stream_name, group_name, msg_id) + cleaned_count += 1 + except Exception as claim_error: + logger.warning(f"Failed to claim/ack message {msg_id}: {claim_error}") + + except Exception as consumer_error: + logger.error(f"Error processing consumer {consumer_name}: {consumer_error}") + + # Delete dead consumers (idle > 5 minutes with no pending messages) + if is_dead and consumer_pending == 0: + try: + await redis_client.execute_command( + 'XGROUP', 'DELCONSUMER', stream_name, group_name, consumer_name + ) + deleted_consumers += 1 + logger.info(f"🧹 Deleted dead consumer {consumer_name} (idle: {consumer_idle_ms}ms)") + except Exception as delete_error: + logger.warning(f"Failed to delete consumer {consumer_name}: {delete_error}") + + if total_consumer_pending == 0 and deleted_consumers == 0: + cleanup_results[stream_name] = {"message": "No pending messages or dead consumers", "cleaned": 0, "deleted_consumers": 0, "deleted_stream": False} + continue + + total_cleaned += cleaned_count + total_deleted_consumers += deleted_consumers + cleanup_results[stream_name] = { + "message": f"Cleaned {cleaned_count} pending messages, deleted {deleted_consumers} dead consumers", + "cleaned": cleaned_count, + "deleted_consumers": deleted_consumers, + "deleted_stream": False, + "original_pending": pending_count + } + + except Exception as e: + 
cleanup_results[stream_name] = { + "error": str(e), + "cleaned": 0 + } + + return { + "success": True, + "total_cleaned": total_cleaned, + "total_deleted_consumers": total_deleted_consumers, + "total_deleted_streams": total_deleted_streams, + "streams": cleanup_results, # New key for per-stream results + "providers": cleanup_results, # Keep for backward compatibility with frontend + "timestamp": time.time() + } + + except Exception as e: + logger.error(f"Error cleaning up stuck workers: {e}", exc_info=True) + return JSONResponse( + status_code=500, content={"error": f"Failed to cleanup stuck workers: {str(e)}"} + ) diff --git a/backends/advanced/src/advanced_omi_backend/controllers/session_controller.py b/backends/advanced/src/advanced_omi_backend/controllers/session_controller.py new file mode 100644 index 00000000..a3836898 --- /dev/null +++ b/backends/advanced/src/advanced_omi_backend/controllers/session_controller.py @@ -0,0 +1,586 @@ +""" +Session controller for handling audio session-related business logic. + +This module manages Redis-based audio streaming sessions, including: +- Session metadata and status +- Conversation counts per session +- Session lifecycle tracking +""" + +import logging +import time +from typing import Dict, List, Optional + +from fastapi.responses import JSONResponse + +logger = logging.getLogger(__name__) + + +async def get_session_info(redis_client, session_id: str) -> Optional[Dict]: + """ + Get detailed information about a specific session. 
+ + Args: + redis_client: Redis async client + session_id: Session UUID + + Returns: + Dict with session information or None if not found + """ + try: + session_key = f"audio:session:{session_id}" + session_data = await redis_client.hgetall(session_key) + + if not session_data: + return None + + # Get conversation count for this session + conversation_count_key = f"session:conversation_count:{session_id}" + conversation_count_bytes = await redis_client.get(conversation_count_key) + conversation_count = int(conversation_count_bytes.decode()) if conversation_count_bytes else 0 + + started_at = float(session_data.get(b"started_at", b"0")) + last_chunk_at = float(session_data.get(b"last_chunk_at", b"0")) + + return { + "session_id": session_id, + "user_id": session_data.get(b"user_id", b"").decode(), + "client_id": session_data.get(b"client_id", b"").decode(), + "provider": session_data.get(b"provider", b"").decode(), + "mode": session_data.get(b"mode", b"").decode(), + "status": session_data.get(b"status", b"").decode(), + "chunks_published": int(session_data.get(b"chunks_published", b"0")), + "started_at": started_at, + "last_chunk_at": last_chunk_at, + "age_seconds": time.time() - started_at, + "idle_seconds": time.time() - last_chunk_at, + "conversation_count": conversation_count, + # Speech detection events + "last_event": session_data.get(b"last_event", b"").decode(), + "speech_detected_at": session_data.get(b"speech_detected_at", b"").decode(), + "speaker_check_status": session_data.get(b"speaker_check_status", b"").decode(), + "identified_speakers": session_data.get(b"identified_speakers", b"").decode() + } + + except Exception as e: + logger.error(f"Error getting session info for {session_id}: {e}") + return None + + +async def get_all_sessions(redis_client, limit: int = 100) -> List[Dict]: + """ + Get information about all active sessions. 
+ + Args: + redis_client: Redis async client + limit: Maximum number of sessions to return + + Returns: + List of session info dictionaries + """ + try: + # Get all session keys + session_keys = [] + cursor = b"0" + while cursor and len(session_keys) < limit: + cursor, keys = await redis_client.scan( + cursor, match="audio:session:*", count=limit + ) + session_keys.extend(keys[:limit - len(session_keys)]) + + # Get info for each session + sessions = [] + for key in session_keys: + session_id = key.decode().replace("audio:session:", "") + session_info = await get_session_info(redis_client, session_id) + if session_info: + sessions.append(session_info) + + return sessions + + except Exception as e: + logger.error(f"Error getting all sessions: {e}") + return [] + + +async def get_session_conversation_count(redis_client, session_id: str) -> int: + """ + Get the conversation count for a specific session. + + Args: + redis_client: Redis async client + session_id: Session UUID + + Returns: + Number of conversations created in this session + """ + try: + conversation_count_key = f"session:conversation_count:{session_id}" + conversation_count_bytes = await redis_client.get(conversation_count_key) + return int(conversation_count_bytes.decode()) if conversation_count_bytes else 0 + except Exception as e: + logger.error(f"Error getting conversation count for session {session_id}: {e}") + return 0 + + +async def increment_session_conversation_count(redis_client, session_id: str) -> int: + """ + Increment and return the conversation count for a session. 
+ + Args: + redis_client: Redis async client + session_id: Session UUID + + Returns: + New conversation count + """ + try: + conversation_count_key = f"session:conversation_count:{session_id}" + count = await redis_client.incr(conversation_count_key) + await redis_client.expire(conversation_count_key, 3600) # 1 hour TTL + logger.info(f"📊 Conversation count for session {session_id}: {count}") + return count + except Exception as e: + logger.error(f"Error incrementing conversation count for session {session_id}: {e}") + return 0 + + +async def get_streaming_status(request): + """Get status of active streaming sessions and Redis Streams health.""" + from advanced_omi_backend.controllers.queue_controller import ( + transcription_queue, + memory_queue, + default_queue, + all_jobs_complete_for_session + ) + + try: + # Get Redis client from request.app.state (initialized during startup) + redis_client = request.app.state.redis_audio_stream + + if not redis_client: + return JSONResponse( + status_code=503, + content={"error": "Redis client for audio streaming not initialized"} + ) + + # Get all sessions (both active and completed) + session_keys = await redis_client.keys("audio:session:*") + active_sessions = [] + completed_sessions_from_redis = [] + + for key in session_keys: + session_id = key.decode().split(":")[-1] + + # Use session_controller to get complete session info including conversation_count + session_obj = await get_session_info(redis_client, session_id) + if not session_obj: + continue + + status = session_obj.get("status", "") + + # Separate active and completed sessions + # Check if all jobs are complete (including failed jobs) + all_jobs_done = all_jobs_complete_for_session(session_id) + + # Session is completed if: + # 1. Redis status says complete/finalized AND all jobs done, OR + # 2. 
All jobs are done (even if status isn't complete yet) + # This ensures sessions with failed jobs move to completed + if status in ["complete", "completed", "finalized"] or all_jobs_done: + if all_jobs_done: + # All jobs complete - this is truly a completed session + # Update Redis status if it wasn't already marked complete + if status not in ["complete", "completed", "finalized"]: + await redis_client.hset(key, "status", "complete") + logger.info(f"✅ Marked session {session_id} as complete (all jobs terminal)") + + # Get additional session data for completed sessions + session_key = f"audio:session:{session_id}" + session_data = await redis_client.hgetall(session_key) + + completed_sessions_from_redis.append({ + "session_id": session_id, + "client_id": session_obj.get("client_id", ""), + "conversation_id": session_data.get(b"conversation_id", b"").decode() if session_data and b"conversation_id" in session_data else None, + "has_conversation": bool(session_data and session_data.get(b"conversation_id", b"")), + "action": session_data.get(b"action", b"complete").decode() if session_data and b"action" in session_data else "complete", + "reason": session_data.get(b"reason", b"").decode() if session_data and b"reason" in session_data else "", + "completed_at": session_obj.get("last_chunk_at", 0), + "audio_file": session_data.get(b"audio_file", b"").decode() if session_data and b"audio_file" in session_data else "", + "conversation_count": session_obj.get("conversation_count", 0) + }) + else: + # Status says complete but jobs still processing - keep in active + active_sessions.append(session_obj) + else: + # This is an active session + active_sessions.append(session_obj) + + # Get stream health for all streams (per-client streams) + # Categorize as active or completed based on consumer activity + active_streams = {} + completed_streams = {} + + # Create a map of client_id to session for quick lookup + client_to_session = {} + for session in active_sessions + 
completed_sessions_from_redis: + client_id = session.get("client_id") + if client_id: + client_to_session[client_id] = session + + # Discover all audio streams + stream_keys = await redis_client.keys("audio:stream:*") + current_time = time.time() + + for stream_key in stream_keys: + stream_name = stream_key.decode() if isinstance(stream_key, bytes) else stream_key + try: + # Check if stream exists + stream_info = await redis_client.execute_command('XINFO', 'STREAM', stream_name) + + # Parse stream info (returns flat list of key-value pairs) + info_dict = {} + for i in range(0, len(stream_info), 2): + key = stream_info[i].decode() if isinstance(stream_info[i], bytes) else str(stream_info[i]) + value = stream_info[i+1] + + # Skip complex binary structures like first-entry and last-entry + # which contain message data that can't be JSON serialized + if key in ["first-entry", "last-entry"]: + # Just extract the message ID (first element) + if isinstance(value, list) and len(value) > 0: + msg_id = value[0] + if isinstance(msg_id, bytes): + msg_id = msg_id.decode() + value = msg_id + else: + value = None + elif isinstance(value, bytes): + try: + value = value.decode() + except UnicodeDecodeError: + # Binary data that can't be decoded, skip it + value = "" + + info_dict[key] = value + + # Calculate stream age from last entry (for determining if stream is stale) + stream_age_seconds = 0 + last_entry_id = info_dict.get("last-entry") + if last_entry_id: + try: + # Redis Stream IDs format: "milliseconds-sequence" + last_timestamp_ms = int(last_entry_id.split('-')[0]) + last_timestamp_s = last_timestamp_ms / 1000 + stream_age_seconds = current_time - last_timestamp_s + except (ValueError, IndexError, AttributeError): + stream_age_seconds = 0 + + # Extract client_id from stream name (audio:stream:{client_id}) + client_id = stream_name.split(":")[-1] if ":" in stream_name else "" + + # Get session age from associated session (more meaningful than stream age) + 
session_age_seconds = 0 + session_idle_seconds = 0 + if client_id and client_id in client_to_session: + session_data = client_to_session[client_id] + session_age_seconds = session_data.get("age_seconds", 0) + session_idle_seconds = session_data.get("idle_seconds", 0) + + # Get consumer groups + groups = await redis_client.execute_command('XINFO', 'GROUPS', stream_name) + + stream_data = { + "stream_length": info_dict.get("length", 0), + "first_entry_id": info_dict.get("first-entry"), + "last_entry_id": last_entry_id, + "session_age_seconds": session_age_seconds, # Age since session started + "session_idle_seconds": session_idle_seconds, # Time since last audio chunk + "client_id": client_id, # Include client_id for reference + "consumer_groups": [] + } + + # Track if stream has any active consumers + has_active_consumer = False + min_consumer_idle_ms = float('inf') + + # Parse consumer groups + for group in groups: + group_dict = {} + for i in range(0, len(group), 2): + key = group[i].decode() if isinstance(group[i], bytes) else str(group[i]) + value = group[i+1] + if isinstance(value, bytes): + try: + value = value.decode() + except UnicodeDecodeError: + value = "" + group_dict[key] = value + + group_name = group_dict.get("name", "unknown") + if isinstance(group_name, bytes): + group_name = group_name.decode() + + # Get consumers for this group + consumers = await redis_client.execute_command('XINFO', 'CONSUMERS', stream_name, group_name) + consumer_list = [] + consumer_pending_total = 0 + + for consumer in consumers: + consumer_dict = {} + for i in range(0, len(consumer), 2): + key = consumer[i].decode() if isinstance(consumer[i], bytes) else str(consumer[i]) + value = consumer[i+1] + if isinstance(value, bytes): + try: + value = value.decode() + except UnicodeDecodeError: + value = "" + consumer_dict[key] = value + + consumer_name = consumer_dict.get("name", "unknown") + if isinstance(consumer_name, bytes): + consumer_name = consumer_name.decode() + + 
consumer_pending = int(consumer_dict.get("pending", 0)) + consumer_idle_ms = int(consumer_dict.get("idle", 0)) + consumer_pending_total += consumer_pending + + # Track minimum idle time + min_consumer_idle_ms = min(min_consumer_idle_ms, consumer_idle_ms) + + # Consumer is active if idle < 5 minutes (300000ms) + if consumer_idle_ms < 300000: + has_active_consumer = True + + consumer_list.append({ + "name": consumer_name, + "pending": consumer_pending, + "idle_ms": consumer_idle_ms + }) + + # Get group-level pending count (may be 0 even if consumers have pending) + try: + pending = await redis_client.xpending(stream_name, group_name) + group_pending_count = int(pending[0]) if pending else 0 + except Exception: + group_pending_count = 0 + + # Use the maximum of group-level pending or sum of consumer pending + # (Sometimes group pending is 0 but consumers still have pending messages) + effective_pending = max(group_pending_count, consumer_pending_total) + + stream_data["consumer_groups"].append({ + "name": str(group_name), + "consumers": consumer_list, + "pending": int(effective_pending) + }) + + # Determine if stream is active or completed + # Active: has active consumers OR pending messages OR recent activity (< 5 min) + # Completed: no active consumers and idle > 5 minutes but < 1 hour + total_pending = sum(group["pending"] for group in stream_data["consumer_groups"]) + is_active = ( + has_active_consumer or + total_pending > 0 or + stream_age_seconds < 300 # Less than 5 minutes old + ) + + if is_active: + active_streams[stream_name] = stream_data + else: + # Mark as completed (will be cleaned up when > 1 hour old) + stream_data["idle_seconds"] = stream_age_seconds + completed_streams[stream_name] = stream_data + + except Exception as e: + # Stream doesn't exist or error getting info + logger.debug(f"Error processing stream {stream_name}: {e}") + continue + + # Get RQ queue stats - include all registries + rq_stats = { + "transcription_queue": { + "queued": 
transcription_queue.count, + "processing": len(transcription_queue.started_job_registry), + "completed": len(transcription_queue.finished_job_registry), + "failed": len(transcription_queue.failed_job_registry), + "cancelled": len(transcription_queue.canceled_job_registry), + "deferred": len(transcription_queue.deferred_job_registry) + }, + "memory_queue": { + "queued": memory_queue.count, + "processing": len(memory_queue.started_job_registry), + "completed": len(memory_queue.finished_job_registry), + "failed": len(memory_queue.failed_job_registry), + "cancelled": len(memory_queue.canceled_job_registry), + "deferred": len(memory_queue.deferred_job_registry) + }, + "default_queue": { + "queued": default_queue.count, + "processing": len(default_queue.started_job_registry), + "completed": len(default_queue.finished_job_registry), + "failed": len(default_queue.failed_job_registry), + "cancelled": len(default_queue.canceled_job_registry), + "deferred": len(default_queue.deferred_job_registry) + } + } + + return { + "active_sessions": active_sessions, + "completed_sessions": completed_sessions_from_redis, + "active_streams": active_streams, + "completed_streams": completed_streams, + "stream_health": active_streams, # Backward compatibility - use active_streams + "rq_queues": rq_stats, + "timestamp": time.time() + } + + except Exception as e: + logger.error(f"Error getting streaming status: {e}", exc_info=True) + return JSONResponse( + status_code=500, + content={"error": f"Failed to get streaming status: {str(e)}"} + ) + + +async def cleanup_old_sessions(request, max_age_seconds: int = 3600): + """Clean up old session tracking metadata and old audio streams from Redis.""" + import time + from fastapi.responses import JSONResponse + + try: + # Get Redis client from request.app.state (initialized during startup) + redis_client = request.app.state.redis_audio_stream + + if not redis_client: + return JSONResponse( + status_code=503, + content={"error": "Redis client for 
audio streaming not initialized"} + ) + + # Get all session keys + session_keys = await redis_client.keys("audio:session:*") + cleaned_sessions = 0 + old_sessions = [] + + current_time = time.time() + + for key in session_keys: + session_data = await redis_client.hgetall(key) + if not session_data: + continue + + session_id = key.decode().split(":")[-1] + started_at = float(session_data.get(b"started_at", b"0")) + status = session_data.get(b"status", b"").decode() + + age_seconds = current_time - started_at + + # Clean up sessions older than max_age or stuck in "finalizing" + should_clean = ( + age_seconds > max_age_seconds or + (status == "finalizing" and age_seconds > 300) # Finalizing for more than 5 minutes + ) + + if should_clean: + old_sessions.append({ + "session_id": session_id, + "age_seconds": age_seconds, + "status": status + }) + await redis_client.delete(key) + cleaned_sessions += 1 + + # Also clean up old audio streams (per-client streams that are inactive) + stream_keys = await redis_client.keys("audio:stream:*") + cleaned_streams = 0 + old_streams = [] + + for stream_key in stream_keys: + stream_name = stream_key.decode() if isinstance(stream_key, bytes) else stream_key + + try: + # Check stream info to get last activity + stream_info = await redis_client.execute_command('XINFO', 'STREAM', stream_name) + + # Parse stream info + info_dict = {} + for i in range(0, len(stream_info), 2): + key_name = stream_info[i].decode() if isinstance(stream_info[i], bytes) else str(stream_info[i]) + info_dict[key_name] = stream_info[i+1] + + stream_length = int(info_dict.get("length", 0)) + last_entry = info_dict.get("last-entry") + + # Check stream age via last entry ID (Redis Stream IDs are timestamps) + should_delete = False + age_seconds = 0 + + if stream_length == 0: + # Empty stream - safe to delete + should_delete = True + reason = "empty" + elif last_entry and isinstance(last_entry, list) and len(last_entry) > 0: + # Extract timestamp from last entry ID + 
last_id = last_entry[0] + if isinstance(last_id, bytes): + last_id = last_id.decode() + + # Redis Stream IDs format: "milliseconds-sequence" + try: + last_timestamp_ms = int(last_id.split('-')[0]) + last_timestamp_s = last_timestamp_ms / 1000 + age_seconds = current_time - last_timestamp_s + + # Delete streams older than max_age regardless of size + if age_seconds > max_age_seconds: + should_delete = True + reason = "old" + except (ValueError, IndexError): + # If we can't parse timestamp, check if first entry is old + first_entry = info_dict.get("first-entry") + if first_entry and isinstance(first_entry, list) and len(first_entry) > 0: + try: + first_id = first_entry[0] + if isinstance(first_id, bytes): + first_id = first_id.decode() + first_timestamp_ms = int(first_id.split('-')[0]) + first_timestamp_s = first_timestamp_ms / 1000 + age_seconds = current_time - first_timestamp_s + + if age_seconds > max_age_seconds: + should_delete = True + reason = "old_unparseable" + except (ValueError, IndexError): + pass + + if should_delete: + await redis_client.delete(stream_name) + cleaned_streams += 1 + old_streams.append({ + "stream_name": stream_name, + "reason": reason, + "age_seconds": age_seconds, + "length": stream_length + }) + + except Exception as e: + logger.debug(f"Error checking stream {stream_name}: {e}") + continue + + return { + "success": True, + "cleaned_sessions": cleaned_sessions, + "cleaned_streams": cleaned_streams, + "cleaned_session_details": old_sessions, + "cleaned_stream_details": old_streams, + "timestamp": time.time() + } + + except Exception as e: + logger.error(f"Error cleaning up old sessions: {e}", exc_info=True) + return JSONResponse( + status_code=500, content={"error": f"Failed to cleanup old sessions: {str(e)}"} + ) diff --git a/backends/advanced/src/advanced_omi_backend/controllers/system_controller.py b/backends/advanced/src/advanced_omi_backend/controllers/system_controller.py index 9fc7efe6..5bc0b35d 100644 --- 
a/backends/advanced/src/advanced_omi_backend/controllers/system_controller.py +++ b/backends/advanced/src/advanced_omi_backend/controllers/system_controller.py @@ -2,32 +2,19 @@ System controller for handling system-related business logic. """ -import asyncio -import io -import json import logging import os import shutil import time -import wave from datetime import UTC, datetime -from pathlib import Path -import numpy as np -from advanced_omi_backend.client_manager import generate_client_id from advanced_omi_backend.config import ( load_diarization_settings_from_file, save_diarization_settings_to_file, ) -from advanced_omi_backend.database import chunks_col -from advanced_omi_backend.job_tracker import FileStatus, JobStatus, get_job_tracker -from advanced_omi_backend.processors import AudioProcessingItem, get_processor_manager -from advanced_omi_backend.audio_utils import process_audio_chunk +from advanced_omi_backend.models.user import User from advanced_omi_backend.task_manager import get_task_manager -from advanced_omi_backend.users import User -from fastapi import BackgroundTasks, File, Query, UploadFile from fastapi.responses import JSONResponse -from wyoming.audio import AudioChunk logger = logging.getLogger(__name__) audio_logger = logging.getLogger("audio_processing") @@ -68,690 +55,7 @@ async def get_auth_config(): } -async def get_all_processing_tasks(): - """Get all active processing tasks.""" - try: - processor_manager = get_processor_manager() - return processor_manager.get_all_processing_status() - except Exception as e: - logger.error(f"Error getting processing tasks: {e}") - return JSONResponse( - status_code=500, content={"error": f"Failed to get processing tasks: {str(e)}"} - ) - - -async def get_processing_task_status(client_id: str): - """Get processing task status for a specific client.""" - try: - processor_manager = get_processor_manager() - processing_status = processor_manager.get_processing_status(client_id) - - # Check if transcription 
is marked as started but not completed, and verify with database - stages = processing_status.get("stages", {}) - transcription_stage = stages.get("transcription", {}) - - """This is a hack to update it the DB INCASE a process failed - if transcription_stage.get("status") == "started" and not transcription_stage.get("completed", False): - # Check if transcription is actually complete by checking the database - try: - chunk = await chunks_col.find_one({"client_id": client_id}) - if chunk and chunk.get("transcript") and len(chunk.get("transcript", [])) > 0: - # Transcription is complete! Update the processor state - processor_manager.track_processing_stage( - client_id, - "transcription", - "completed", - {"audio_uuid": chunk.get("audio_uuid"), "segments": len(chunk.get("transcript", []))} - ) - logger.info(f"Detected transcription completion for client {client_id} ({len(chunk.get('transcript', []))} segments)") - # Get updated status - processing_status = processor_manager.get_processing_status(client_id) - except Exception as e: - logger.debug(f"Error checking transcription completion: {e}") - """ - return processing_status - except Exception as e: - logger.error(f"Error getting processing task status for {client_id}: {e}") - return JSONResponse( - status_code=500, content={"error": f"Failed to get processing task status: {str(e)}"} - ) - - -async def get_processor_status(): - """Get processor queue status and health.""" - try: - processor_manager = get_processor_manager() - - # Get queue sizes - status = { - "queues": { - "audio_queue": processor_manager.audio_queue.qsize(), - "transcription_queue": processor_manager.transcription_queue.qsize(), - "memory_queue": processor_manager.memory_queue.qsize(), - "cropping_queue": processor_manager.cropping_queue.qsize(), - }, - "processors": { - "audio_processor": "running", - "transcription_processor": "running", - "memory_processor": "running", - "cropping_processor": "running", - }, - "active_clients": 
len(processor_manager.active_file_sinks), - "active_audio_uuids": len(processor_manager.active_audio_uuids), - "processing_tasks": len(processor_manager.processing_tasks), - "timestamp": int(time.time()), - } - - # Get task manager status if available - try: - task_manager = get_task_manager() - if task_manager: - task_status = task_manager.get_health_status() - status["task_manager"] = task_status - except Exception as e: - status["task_manager"] = {"error": str(e)} - - return status - - except Exception as e: - logger.error(f"Error getting processor status: {e}") - return JSONResponse( - status_code=500, content={"error": f"Failed to get processor status: {str(e)}"} - ) - - -async def process_audio_files( - user: User, files: list[UploadFile], device_name: str, auto_generate_client: bool -): - """Process uploaded audio files through the transcription pipeline.""" - # Need to import here because we import the routes into main, causing circular imports - from advanced_omi_backend.main import cleanup_client_state, create_client_state - - # Process files through complete transcription pipeline like WebSocket clients - try: - if not files: - return JSONResponse(status_code=400, content={"error": "No files provided"}) - - processed_files = [] - processed_conversations = [] - - for file_index, file in enumerate(files): - client_id = None - client_state = None - - try: - # Validate file type (only WAV for now) - if not file.filename or not file.filename.lower().endswith(".wav"): - processed_files.append( - { - "filename": file.filename or "unknown", - "status": "error", - "error": "Only WAV files are currently supported", - } - ) - continue - - # Generate unique client ID for each file to create separate conversations - file_device_name = f"{device_name}-{file_index + 1:03d}" - client_id = generate_client_id(user, file_device_name) - - # Create separate client state for this file - client_state = await create_client_state(client_id, user, file_device_name) - - 
audio_logger.info( - f"📁 Processing file {file_index + 1}/{len(files)}: {file.filename} with client_id: {client_id}" - ) - - processor_manager = get_processor_manager() - - # Read file content - content = await file.read() - - # Process WAV file - with wave.open(io.BytesIO(content), "rb") as wav_file: - # Get audio parameters - sample_rate = wav_file.getframerate() - sample_width = wav_file.getsampwidth() - channels = wav_file.getnchannels() - - # Read all audio data - audio_data = wav_file.readframes(wav_file.getnframes()) - - # Convert to mono if stereo - if channels == 2: - # Convert stereo to mono by averaging channels - if sample_width == 2: - audio_array = np.frombuffer(audio_data, dtype=np.int16) - else: - audio_array = np.frombuffer(audio_data, dtype=np.int32) - - # Reshape to separate channels and average - audio_array = audio_array.reshape(-1, 2) - audio_data = ( - np.mean(audio_array, axis=1).astype(audio_array.dtype).tobytes() - ) - channels = 1 - - # Ensure sample rate is 16kHz (resample if needed) - if sample_rate != 16000: - audio_logger.warning( - f"File {file.filename} has sample rate {sample_rate}Hz, expected 16kHz." - ) - raise JSONResponse(status_code=400, content={"error": f"File {file.filename} has sample rate {sample_rate}Hz, expected 16kHz. 
I'll implement this at some point sorry"}) - - # Process audio in larger chunks for faster file processing - # Use larger chunks (32KB) for optimal performance - chunk_size = 32 * 1024 # 32KB chunks - base_timestamp = int(time.time()) - - for i in range(0, len(audio_data), chunk_size): - chunk_data = audio_data[i : i + chunk_size] - - # Calculate relative timestamp for this chunk - chunk_offset_bytes = i - chunk_offset_seconds = chunk_offset_bytes / ( - sample_rate * sample_width * channels - ) - chunk_timestamp = base_timestamp + int(chunk_offset_seconds) - - # Process audio chunk through unified pipeline - await process_audio_chunk( - audio_data=chunk_data, - client_id=client_id, - user_id=user.user_id, - user_email=user.email, - audio_format={ - "rate": sample_rate, - "width": sample_width, - "channels": channels, - "timestamp": chunk_timestamp, - }, - client_state=None, # No client state needed for file upload - ) - - # Yield control occasionally to prevent blocking the event loop - if i % (chunk_size * 10) == 0: # Every 10 chunks (~320KB) - await asyncio.sleep(0) - - processed_files.append( - { - "filename": file.filename, - "sample_rate": sample_rate, - "channels": channels, - "duration_seconds": len(audio_data) - / (sample_rate * sample_width * channels), - "size_bytes": len(audio_data), - "client_id": client_id, - "status": "processed", - } - ) - - audio_logger.info( - f"✅ Processed audio file: {file.filename} ({len(audio_data)} bytes)" - ) - - # Wait briefly for transcription manager to be created by background processor - audio_logger.info( - f"⏳ Waiting for transcription manager to be created for client {client_id}" - ) - await asyncio.sleep(2.0) # Give transcription processor time to create manager - - # Close client audio to trigger transcription completion (flush_final_transcript) - audio_logger.info( - f"📞 About to call close_client_audio for upload client {client_id}" - ) - processor_manager = get_processor_manager() - audio_logger.info(f"📞 Got 
processor manager, calling close_client_audio now...") - await processor_manager.close_client_audio(client_id) - audio_logger.info( - f"🔚 Successfully called close_client_audio for upload client {client_id}" - ) - - # Wait for this file's transcription processing to complete - audio_logger.info(f"📁 Waiting for transcription to process file: {file.filename}") - - # Wait for chunks to be processed by the audio saver - await asyncio.sleep(1.0) - - # Wait for file processing to complete using task tracking - # Increase timeout based on file duration (3x duration + 60s buffer) - audio_duration = len(audio_data) / (sample_rate * sample_width * channels) - max_wait_time = max( - 120, int(audio_duration * 3) + 60 - ) # At least 2 minutes, or 3x duration + 60s - wait_interval = 2.0 # Reduced from 0.5s to 2s to reduce polling spam - elapsed_time = 0 - - audio_logger.info( - f"📁 Audio duration: {audio_duration:.1f}s, max wait time: {max_wait_time}s" - ) - - # Use concrete task tracking instead of database polling - while elapsed_time < max_wait_time: - try: - # Check processing status using task tracking - processing_status = processor_manager.get_processing_status(client_id) - - # Check if transcription stage is complete - stages = processing_status.get("stages", {}) - transcription_stage = stages.get("transcription", {}) - - # If transcription is marked as started but not completed, check database - if transcription_stage.get( - "status" - ) == "started" and not transcription_stage.get("completed", False): - # Check if transcription is actually complete by checking the database - try: - chunk = await chunks_col.find_one({"client_id": client_id}) - if ( - chunk - and chunk.get("transcript") - and len(chunk.get("transcript", [])) > 0 - ): - # Transcription is complete! 
Update the processor state - processor_manager.track_processing_stage( - client_id, - "transcription", - "completed", - { - "audio_uuid": chunk.get("audio_uuid"), - "segments": len(chunk.get("transcript", [])), - }, - ) - audio_logger.info( - f"📁 Transcription completed for file: {file.filename} ({len(chunk.get('transcript', []))} segments)" - ) - break - except Exception as e: - audio_logger.debug(f"Error checking transcription completion: {e}") - - if transcription_stage.get("completed", False): - audio_logger.info( - f"📁 Transcription completed for file: {file.filename}" - ) - break - - # Check for errors - if transcription_stage.get("error"): - audio_logger.warning( - f"📁 Transcription error for file: {file.filename}: {transcription_stage.get('error')}" - ) - break - - except Exception as e: - audio_logger.debug(f"Error checking processing status: {e}") - - await asyncio.sleep(wait_interval) - elapsed_time += wait_interval - - if elapsed_time >= max_wait_time: - audio_logger.warning(f"📁 Transcription timed out for file: {file.filename}") - - # Signal end of conversation - trigger memory processing - await client_state.close_current_conversation() - - # Give cleanup time to complete - await asyncio.sleep(0.5) - - # Track conversation created - conversation_info = { - "client_id": client_id, - "filename": file.filename, - "status": "completed" if elapsed_time < max_wait_time else "timed_out", - } - processed_conversations.append(conversation_info) - - except Exception as e: - audio_logger.error(f"Error processing file {file.filename}: {e}") - processed_files.append( - {"filename": file.filename or "unknown", "status": "error", "error": str(e)} - ) - finally: - # Always clean up client state to prevent accumulation - if client_id and client_state: - try: - await cleanup_client_state(client_id) - audio_logger.info(f"🧹 Cleaned up client state for {client_id}") - except Exception as cleanup_error: - audio_logger.error( - f"❌ Error cleaning up client state for 
{client_id}: {cleanup_error}" - ) - - return { - "message": f"Processed {len(files)} files", - "files": processed_files, - "conversations": processed_conversations, - "successful": len([f for f in processed_files if f.get("status") != "error"]), - "failed": len([f for f in processed_files if f.get("status") == "error"]), - } - - except Exception as e: - audio_logger.error(f"Error in process_audio_files: {e}") - return JSONResponse(status_code=500, content={"error": f"File processing failed: {str(e)}"}) - - -def get_audio_duration(file_content: bytes) -> float: - """Get duration of WAV file in seconds using wave library.""" - try: - with wave.open(io.BytesIO(file_content), "rb") as wav_file: - frames = wav_file.getnframes() - sample_rate = wav_file.getframerate() - duration = frames / sample_rate - return duration - except Exception as e: - logger.warning(f"Could not determine audio duration: {e}") - return 0.0 - - -async def process_audio_files_async( - background_tasks: BackgroundTasks, user: User, files: list[UploadFile], device_name: str -): - """Start async processing of uploaded audio files. 
Returns job ID immediately.""" - try: - if not files: - return JSONResponse(status_code=400, content={"error": "No files provided"}) - - # Read all file contents immediately to avoid file handle issues - file_data = [] - for file in files: - try: - content = await file.read() - file_data.append((file.filename, content)) - audio_logger.info(f"📥 Read file: {file.filename} ({len(content)} bytes)") - except Exception as e: - audio_logger.error(f"❌ Failed to read file {file.filename}: {e}") - return JSONResponse( - status_code=500, - content={"error": f"Failed to read file {file.filename}: {str(e)}"}, - ) - - # Create job - job_tracker = get_job_tracker() - filenames = [filename for filename, _ in file_data] - job_id = await job_tracker.create_job(user.user_id, device_name, filenames) - - # Start background processing with file contents - background_tasks.add_task(process_files_with_content, job_id, file_data, user, device_name) - - audio_logger.info(f"🚀 Started async processing job {job_id} with {len(files)} files") - - return { - "job_id": job_id, - "message": f"Started processing {len(files)} files", - "status_url": f"/api/process-audio-files/jobs/{job_id}", - "total_files": len(files), - } - - except Exception as e: - audio_logger.error(f"Error starting async file processing: {e}") - return JSONResponse( - status_code=500, content={"error": f"Failed to start processing: {str(e)}"} - ) - - -async def get_processing_job_status(job_id: str): - """Get status of an async file processing job.""" - try: - job_tracker = get_job_tracker() - job = await job_tracker.get_job(job_id) - - if not job: - return JSONResponse(status_code=404, content={"error": "Job not found"}) - - return job.to_dict() - - except Exception as e: - logger.error(f"Error getting job status for {job_id}: {e}") - return JSONResponse( - status_code=500, content={"error": f"Failed to get job status: {str(e)}"} - ) - - -async def list_processing_jobs(): - """List all active processing jobs.""" - try: - 
job_tracker = get_job_tracker() - active_jobs = await job_tracker.get_active_jobs() - - return {"active_jobs": len(active_jobs), "jobs": [job.to_dict() for job in active_jobs]} - - except Exception as e: - logger.error(f"Error listing jobs: {e}") - return JSONResponse(status_code=500, content={"error": f"Failed to list jobs: {str(e)}"}) - - -async def process_files_with_content( - job_id: str, file_data: list[tuple[str, bytes]], user: User, device_name: str -): - """Background task to process uploaded files using pre-read content.""" - # Import here to avoid circular imports - from advanced_omi_backend.main import cleanup_client_state, create_client_state - - audio_logger.info( - f"🚀 process_files_with_content called for job {job_id} with {len(file_data)} files" - ) - job_tracker = get_job_tracker() - - try: - # Update job status to processing - await job_tracker.update_job_status(job_id, JobStatus.PROCESSING) - - for file_index, (filename, content) in enumerate(file_data): - client_id = None - client_state = None - - try: - audio_logger.info( - f"🔧 [Job {job_id}] Processing file {file_index + 1}/{len(file_data)}: {filename}, content type: {type(content)}, size: {len(content)}" - ) - # Set current file - await job_tracker.set_current_file(job_id, filename) - await job_tracker.update_file_status(job_id, filename, FileStatus.PROCESSING) - - audio_logger.info( - f"🚀 [Job {job_id}] Processing file {file_index + 1}/{len(file_data)}: {filename}" - ) - - # Check duration and skip if too long - audio_logger.info( - f"🔍 [Job {job_id}] About to check duration for {filename}, content size: {len(content)} bytes" - ) - try: - duration = get_audio_duration(content) - audio_logger.info( - f"🔍 [Job {job_id}] Duration check successful: {duration:.2f}s for {filename}" - ) - except Exception as duration_error: - audio_logger.error( - f"❌ [Job {job_id}] Duration check failed for {filename}: {duration_error}" - ) - raise - # Duration limit removed - process files of any reasonable 
length - audio_logger.info(f"📊 File duration: {duration/60:.1f} minutes") - - # Validate file type - if not filename or not filename.lower().endswith(".wav"): - error_msg = "Only WAV files are currently supported" - await job_tracker.update_file_status( - job_id, filename, FileStatus.FAILED, error_message=error_msg - ) - continue - - # Generate unique client ID for each file - file_device_name = f"{device_name}-{file_index + 1:03d}" - client_id = generate_client_id(user, file_device_name) - - # Update job tracker with client ID - await job_tracker.update_file_status( - job_id, filename, FileStatus.PROCESSING, client_id=client_id - ) - - # Create client state - client_state = await create_client_state(client_id, user, file_device_name) - - # Process WAV file - with wave.open(io.BytesIO(content), "rb") as wav_file: - sample_rate = wav_file.getframerate() - sample_width = wav_file.getsampwidth() - channels = wav_file.getnchannels() - audio_data = wav_file.readframes(wav_file.getnframes()) - - # Convert to mono if stereo - if channels == 2: - if sample_width == 2: - audio_array = np.frombuffer(audio_data, dtype=np.int16) - else: - audio_array = np.frombuffer(audio_data, dtype=np.int32) - audio_array = audio_array.reshape(-1, 2) - audio_data = ( - np.mean(audio_array, axis=1).astype(audio_array.dtype).tobytes() - ) - channels = 1 - - # Process audio in chunks - processor_manager = get_processor_manager() - chunk_size = 32 * 1024 - base_timestamp = int(time.time()) - - for i in range(0, len(audio_data), chunk_size): - chunk_data = audio_data[i : i + chunk_size] - chunk_offset_bytes = i - chunk_offset_seconds = chunk_offset_bytes / ( - sample_rate * sample_width * channels - ) - chunk_timestamp = base_timestamp + int(chunk_offset_seconds) - - # Process audio chunk through unified pipeline - await process_audio_chunk( - audio_data=chunk_data, - client_id=client_id, - user_id=user.user_id, - user_email=user.email, - audio_format={ - "rate": sample_rate, - "width": 
sample_width, - "channels": channels, - "timestamp": chunk_timestamp, - }, - client_state=None, # No client state needed for file upload - ) - - if i % (chunk_size * 10) == 0: # Yield control occasionally - await asyncio.sleep(0) - - # Wait briefly for transcription manager to be created - await asyncio.sleep(2.0) - - # Close client audio to trigger transcription completion - await processor_manager.close_client_audio(client_id) - - # Wait for processing to complete with dynamic timeout - max_wait_time = max(120, int(duration * 2) + 60) # 2x duration + 60s buffer - wait_interval = 2.0 - elapsed_time = 0 - - audio_logger.info( - f"⏳ [Job {job_id}] Waiting for transcription (max {max_wait_time}s)" - ) - - # Track whether memory processing has been triggered to avoid duplicate calls - memory_triggered = False - - while elapsed_time < max_wait_time: - try: - # Check database for completion status - chunk = await chunks_col.find_one({"client_id": client_id}) - if chunk: - transcription_status = chunk.get("transcription_status", "PENDING") - memory_status = chunk.get("memory_processing_status", "PENDING") - - # Update job tracker with current status - await job_tracker.update_file_status( - job_id, - filename, - FileStatus.PROCESSING, - audio_uuid=chunk.get("audio_uuid"), - transcription_status=transcription_status, - memory_status=memory_status, - ) - - # Check if transcription failed - immediately fail the job - if transcription_status == "FAILED": - audio_logger.error( - f"❌ [Job {job_id}] Transcription failed, marking file as failed: {filename}" - ) - await job_tracker.update_file_status( - job_id, filename, FileStatus.FAILED, - error_message="Transcription failed" - ) - break # Exit monitoring loop for this file - - # Check if transcription is complete to trigger memory processing - elif transcription_status in ["COMPLETED", "EMPTY"]: - # Trigger memory processing if not already done - if memory_status == "PENDING" and not memory_triggered: - audio_logger.info( - 
f"🚀 [Job {job_id}] Transcription complete, triggering memory processing: {filename}" - ) - await client_state.close_current_conversation() - memory_triggered = True - # Continue to next iteration to check memory status - continue - - # Check if memory processing is also complete - if memory_status in ["COMPLETED", "FAILED", "SKIPPED"]: - audio_logger.info( - f"✅ [Job {job_id}] File processing completed: {filename}" - ) - await job_tracker.update_file_status( - job_id, filename, FileStatus.COMPLETED - ) - break - - except Exception as e: - audio_logger.debug(f"Error checking processing status: {e}") - - await asyncio.sleep(wait_interval) - elapsed_time += wait_interval - - if elapsed_time >= max_wait_time: - error_msg = f"Processing timed out after {max_wait_time}s" - audio_logger.warning(f"⏰ [Job {job_id}] {error_msg}: {filename}") - await job_tracker.update_file_status( - job_id, filename, FileStatus.FAILED, error_message=error_msg - ) - - # Signal end of conversation - trigger memory processing - await client_state.close_current_conversation() - await asyncio.sleep(0.5) - - except Exception as e: - error_msg = f"Error processing file: {str(e)}" - audio_logger.error(f"❌ [Job {job_id}] {error_msg}") - await job_tracker.update_file_status( - job_id, filename, FileStatus.FAILED, error_message=error_msg - ) - finally: - # Always clean up client state to prevent accumulation - if client_id and client_state: - try: - await cleanup_client_state(client_id) - audio_logger.info( - f"🧹 [Job {job_id}] Cleaned up client state for {client_id}" - ) - except Exception as cleanup_error: - audio_logger.error( - f"❌ [Job {job_id}] Error cleaning up client state for {client_id}: {cleanup_error}" - ) - - # Mark job as completed - await job_tracker.update_job_status(job_id, JobStatus.COMPLETED) - audio_logger.info(f"🎉 [Job {job_id}] All files processed") - - except Exception as e: - error_msg = f"Job processing failed: {str(e)}" - audio_logger.error(f"💥 [Job {job_id}] {error_msg}") - 
await job_tracker.update_job_status(job_id, JobStatus.FAILED, error_msg) +# Audio file processing functions moved to audio_controller.py # Configuration functions moved to config.py to avoid circular imports @@ -1159,3 +463,5 @@ async def delete_all_user_memories(user: User): return JSONResponse( status_code=500, content={"error": f"Failed to delete memories: {str(e)}"} ) + + diff --git a/backends/advanced/src/advanced_omi_backend/controllers/user_controller.py b/backends/advanced/src/advanced_omi_backend/controllers/user_controller.py index dd00f8a9..ba7dd753 100644 --- a/backends/advanced/src/advanced_omi_backend/controllers/user_controller.py +++ b/backends/advanced/src/advanced_omi_backend/controllers/user_controller.py @@ -15,9 +15,10 @@ UserManager, ) from advanced_omi_backend.client_manager import get_user_clients_all -from advanced_omi_backend.database import chunks_col, db, users_col +from advanced_omi_backend.database import db, users_col from advanced_omi_backend.memory import get_memory_service -from advanced_omi_backend.users import User, UserCreate +from advanced_omi_backend.models.conversation import Conversation +from advanced_omi_backend.users import User, UserCreate, UserUpdate logger = logging.getLogger(__name__) @@ -58,13 +59,13 @@ async def create_user(user_data: UserCreate): # Create the user through the user manager user = await user_manager.create(user_data) + # Return the full user object (serialized via UserRead schema) + from advanced_omi_backend.models.user import UserRead + user_read = UserRead.model_validate(user) + return JSONResponse( status_code=201, - content={ - "message": f"User {user.email} created successfully", - "user_id": str(user.id), - "user_email": user.email, - }, + content=user_read.model_dump(mode='json'), ) except Exception as e: @@ -78,7 +79,7 @@ async def create_user(user_data: UserCreate): ) -async def update_user(user_id: str, user_data: UserCreate): +async def update_user(user_id: str, user_data: UserUpdate): 
"""Update an existing user.""" try: # Validate ObjectId format @@ -106,31 +107,18 @@ async def update_user(user_id: str, user_data: UserCreate): # Convert to User object for the manager user_obj = User(**existing_user) - - # Prepare update data - only include non-None fields - update_data = {} - if user_data.email: - update_data["email"] = user_data.email - if user_data.display_name is not None: - update_data["display_name"] = user_data.display_name - if hasattr(user_data, 'is_superuser'): - update_data["is_superuser"] = user_data.is_superuser - if hasattr(user_data, 'is_active'): - update_data["is_active"] = user_data.is_active - if user_data.password: - # Hash the password if provided - update_data["hashed_password"] = user_manager.password_helper.hash(user_data.password) - - # Update the user - updated_user = await user_manager.update(user_obj, update_data) + + # Update the user using the fastapi-users manager + # Note: signature is update(user_update, user) - update data first, then user object + updated_user = await user_manager.update(user_data, user_obj) + + # Return the full user object (serialized via UserRead schema) + from advanced_omi_backend.models.user import UserRead + user_read = UserRead.model_validate(updated_user) return JSONResponse( status_code=200, - content={ - "message": f"User {updated_user.email} updated successfully", - "user_id": str(updated_user.id), - "user_email": updated_user.email, - }, + content=user_read.model_dump(mode='json'), ) except Exception as e: @@ -187,8 +175,8 @@ async def delete_user( deleted_data["user_deleted"] = user_result.deleted_count > 0 if delete_conversations: - # Delete all conversations (audio chunks) for this user - conversations_result = await chunks_col.delete_many({"client_id": user_id}) + # Delete all conversations for this user + conversations_result = await Conversation.find(Conversation.user_id == user_id).delete() deleted_data["conversations_deleted"] = conversations_result.deleted_count if 
delete_memories: diff --git a/backends/advanced/src/advanced_omi_backend/controllers/websocket_controller.py b/backends/advanced/src/advanced_omi_backend/controllers/websocket_controller.py new file mode 100644 index 00000000..a4338f2b --- /dev/null +++ b/backends/advanced/src/advanced_omi_backend/controllers/websocket_controller.py @@ -0,0 +1,1128 @@ + +""" +WebSocket controller for Friend-Lite backend. + +This module handles WebSocket connections for audio streaming. +""" + +import asyncio +import concurrent.futures +import json +import logging +import os +import time +import uuid +from functools import partial +from typing import Optional + +from fastapi import WebSocket, WebSocketDisconnect, Query +from friend_lite.decoder import OmiOpusDecoder + +from advanced_omi_backend.auth import websocket_auth +from advanced_omi_backend.client_manager import generate_client_id, get_client_manager +from advanced_omi_backend.constants import OMI_CHANNELS, OMI_SAMPLE_RATE, OMI_SAMPLE_WIDTH +from advanced_omi_backend.utils.audio_utils import process_audio_chunk +from advanced_omi_backend.services.audio_stream import AudioStreamProducer +from advanced_omi_backend.services.audio_stream.producer import get_audio_stream_producer + +# Thread pool executors for audio decoding +_DEC_IO_EXECUTOR = concurrent.futures.ThreadPoolExecutor( + max_workers=os.cpu_count() or 4, + thread_name_prefix="opus_io", +) + +# Logging setup +logger = logging.getLogger(__name__) +application_logger = logging.getLogger("audio_processing") + +# Track pending WebSocket connections to prevent race conditions +pending_connections: set[str] = set() + + +async def parse_wyoming_protocol(ws: WebSocket) -> tuple[dict, Optional[bytes]]: + """Parse Wyoming protocol: JSON header line followed by optional binary payload. 
+ + Returns: + Tuple of (header_dict, payload_bytes or None) + """ + # Read data from WebSocket + logger.debug(f"parse_wyoming_protocol: About to call ws.receive()") + message = await ws.receive() + logger.debug(f"parse_wyoming_protocol: Received message with keys: {message.keys() if message else 'None'}") + + # Handle WebSocket close frame + if "type" in message and message["type"] == "websocket.disconnect": + # This is a normal WebSocket close event + code = message.get("code", 1000) + reason = message.get("reason", "") + logger.info(f"📴 WebSocket disconnect received in parse_wyoming_protocol. Code: {code}, Reason: {reason}") + raise WebSocketDisconnect(code=code, reason=reason) + + # Handle text message (JSON header) + if "text" in message: + header_text = message["text"] + # Wyoming protocol uses newline-terminated JSON + if not header_text.endswith("\n"): + header_text += "\n" + + # Parse JSON header + json_line = header_text.strip() + header = json.loads(json_line) + + # If payload is expected, read binary data + payload = None + payload_length = header.get("payload_length") + if payload_length is not None and payload_length > 0: + payload_msg = await ws.receive() + if "bytes" in payload_msg: + payload = payload_msg["bytes"] + else: + logger.warning(f"Expected binary payload but got: {payload_msg.keys()}") + + return header, payload + + # Handle binary message (invalid - Wyoming protocol requires JSONL headers) + elif "bytes" in message: + raise ValueError( + "Raw binary messages not supported - Wyoming protocol requires JSONL headers" + ) + + else: + raise ValueError(f"Unexpected WebSocket message type: {message.keys()}") + + +async def create_client_state(client_id: str, user, device_name: Optional[str] = None): + """Create and register a new client state.""" + # Get client manager + client_manager = get_client_manager() + + # Directory where WAV chunks are written + from pathlib import Path + CHUNK_DIR = Path("./audio_chunks") # This will be mounted to 
./data/audio_chunks by Docker + + # Use ClientManager for atomic client creation and registration + client_state = client_manager.create_client( + client_id, CHUNK_DIR, user.user_id, user.email + ) + + # Also track in persistent mapping (for database queries) + from advanced_omi_backend.client_manager import track_client_user_relationship + track_client_user_relationship(client_id, user.user_id) + + # Register client in user model (persistent) + from advanced_omi_backend.users import register_client_to_user + await register_client_to_user(user, client_id, device_name) + + return client_state + + +async def cleanup_client_state(client_id: str): + """Clean up and remove client state, including cancelling speech detection job and marking session complete.""" + # Cancel the speech detection job for this client + from advanced_omi_backend.controllers.queue_controller import redis_conn + from rq.job import Job + import redis.asyncio as redis + + try: + job_id_key = f"speech_detection_job:{client_id}" + job_id_bytes = redis_conn.get(job_id_key) + + if job_id_bytes: + job_id = job_id_bytes.decode() + logger.info(f"🛑 Cancelling speech detection job {job_id} for client {client_id}") + + try: + # Fetch and cancel the job + job = Job.fetch(job_id, connection=redis_conn) + job.cancel() + logger.info(f"✅ Successfully cancelled speech detection job {job_id}") + except Exception as job_error: + logger.warning(f"⚠️ Failed to cancel job {job_id}: {job_error}") + + # Clean up the tracking key + redis_conn.delete(job_id_key) + logger.info(f"🧹 Cleaned up job tracking key for client {client_id}") + else: + logger.debug(f"No speech detection job found for client {client_id}") + except Exception as e: + logger.warning(f"⚠️ Error during job cancellation for client {client_id}: {e}") + + # Mark all active sessions for this client as complete AND delete Redis streams + try: + # Get async Redis client + redis_url = os.getenv("REDIS_URL", "redis://localhost:6379/0") + async_redis = 
redis.from_url(redis_url, decode_responses=False) + + # Find all session keys for this client and mark them complete + pattern = f"audio:session:*" + cursor = 0 + sessions_closed = 0 + + while True: + cursor, keys = await async_redis.scan(cursor, match=pattern, count=100) + + for key in keys: + # Check if this session belongs to this client + client_id_bytes = await async_redis.hget(key, "client_id") + if client_id_bytes and client_id_bytes.decode() == client_id: + # Mark session as complete (WebSocket disconnected) + await async_redis.hset(key, mapping={ + "status": "complete", + "completed_at": str(time.time()), + "completion_reason": "websocket_disconnect" + }) + session_id = key.decode().replace("audio:session:", "") + logger.info(f"📊 Marked session {session_id[:12]} as complete (WebSocket disconnect)") + sessions_closed += 1 + + if cursor == 0: + break + + if sessions_closed > 0: + logger.info(f"✅ Closed {sessions_closed} active session(s) for client {client_id}") + + # Delete Redis Streams for this client + stream_pattern = f"audio:stream:{client_id}" + stream_key = await async_redis.exists(stream_pattern) + if stream_key: + await async_redis.delete(stream_pattern) + logger.info(f"🧹 Deleted Redis stream: {stream_pattern}") + else: + logger.debug(f"No Redis stream found for client {client_id}") + + await async_redis.close() + + except Exception as session_error: + logger.warning(f"⚠️ Error marking sessions complete for client {client_id}: {session_error}") + + # Use ClientManager for atomic client removal with cleanup + client_manager = get_client_manager() + removed = await client_manager.remove_client_with_cleanup(client_id) + + if removed: + logger.info(f"Client {client_id} cleaned up successfully") + else: + logger.warning(f"Client {client_id} was not found for cleanup") + + +# Shared helper functions for WebSocket handlers +async def _setup_websocket_connection( + ws: WebSocket, + token: Optional[str], + device_name: Optional[str], + pending_client_id: 
str, + connection_type: str +) -> tuple[Optional[str], Optional[object], Optional[object]]: + """ + Setup WebSocket connection: accept, authenticate, create client state. + + Args: + ws: WebSocket connection + token: JWT authentication token + device_name: Optional device name for client ID + pending_client_id: Temporary tracking ID + connection_type: "OMI" or "PCM" for logging + + Returns: + tuple: (client_id, client_state, user) or (None, None, None) on failure + """ + # Accept WebSocket first (required before any send/close operations) + await ws.accept() + + # Authenticate user after accepting connection + user = await websocket_auth(ws, token) + if not user: + # Send error message to client before closing + try: + error_msg = json.dumps({ + "type": "error", + "error": "authentication_failed", + "message": "Authentication failed. Please log in again and ensure your token is valid.", + "code": 1008 + }) + "\n" + await ws.send_text(error_msg) + application_logger.info("Sent authentication error message to client") + except Exception as send_error: + application_logger.warning(f"Failed to send error message: {send_error}") + + # Close connection with appropriate code + await ws.close(code=1008, reason="Authentication failed") + return None, None, None + + # Generate proper client_id using user and device_name + client_id = generate_client_id(user, device_name) + + # Remove from pending now that we have real client_id + pending_connections.discard(pending_client_id) + application_logger.info( + f"🔌 {connection_type} WebSocket connection accepted - User: {user.user_id} ({user.email}), Client: {client_id}" + ) + + # Send ready message for PCM clients + if connection_type == "PCM": + try: + ready_msg = json.dumps({"type": "ready", "message": "WebSocket connection established"}) + "\n" + await ws.send_text(ready_msg) + application_logger.debug(f"✅ Sent ready message to {client_id}") + except Exception as e: + application_logger.error(f"Failed to send ready message to 
{client_id}: {e}") + + # Create client state + client_state = await create_client_state(client_id, user, device_name) + + return client_id, client_state, user + + +async def _initialize_streaming_session( + client_state, + audio_stream_producer, + user_id: str, + user_email: str, + client_id: str, + audio_format: dict +) -> None: + """ + Initialize streaming session with Redis and enqueue processing jobs. + + Args: + client_state: Client state object + audio_stream_producer: Audio stream producer instance + user_id: User ID + user_email: User email + client_id: Client ID + audio_format: Audio format dict from audio-start event + """ + if hasattr(client_state, 'stream_session_id'): + application_logger.debug(f"Session already initialized for {client_id}") + return + + # Initialize stream session + client_state.stream_session_id = str(uuid.uuid4()) + client_state.stream_chunk_count = 0 + client_state.stream_audio_format = audio_format + application_logger.info(f"🆔 Created stream session: {client_state.stream_session_id}") + + # Determine transcription provider from environment + transcription_provider = os.getenv("TRANSCRIPTION_PROVIDER", "").lower() + if transcription_provider in ["offline", "parakeet"]: + provider = "parakeet" + elif transcription_provider == "deepgram": + provider = "deepgram" + else: + # Auto-detect: prefer Parakeet if URL is set, otherwise Deepgram + parakeet_url = os.getenv("PARAKEET_ASR_URL") or os.getenv("OFFLINE_ASR_TCP_URI") + deepgram_key = os.getenv("DEEPGRAM_API_KEY") + if parakeet_url: + provider = "parakeet" + elif deepgram_key: + provider = "deepgram" + else: + raise ValueError("No transcription provider configured (DEEPGRAM_API_KEY or PARAKEET_ASR_URL required)") + + # Initialize session tracking in Redis + await audio_stream_producer.init_session( + session_id=client_state.stream_session_id, + user_id=user_id, + client_id=client_id, + mode="streaming", + provider=provider + ) + + # Enqueue streaming jobs (speech detection + audio 
persistence) + from advanced_omi_backend.controllers.queue_controller import start_streaming_jobs + + job_ids = start_streaming_jobs( + session_id=client_state.stream_session_id, + user_id=user_id, + client_id=client_id + ) + + client_state.speech_detection_job_id = job_ids['speech_detection'] + client_state.audio_persistence_job_id = job_ids['audio_persistence'] + + +async def _finalize_streaming_session( + client_state, + audio_stream_producer, + user_id: str, + user_email: str, + client_id: str +) -> None: + """ + Finalize streaming session: flush buffer, signal workers, enqueue finalize job, cleanup. + + Args: + client_state: Client state object + audio_stream_producer: Audio stream producer instance + user_id: User ID + user_email: User email + client_id: Client ID + """ + if not hasattr(client_state, 'stream_session_id'): + application_logger.debug(f"No active session to finalize for {client_id}") + return + + session_id = client_state.stream_session_id + + try: + # Flush any remaining buffered audio + audio_format = getattr(client_state, 'stream_audio_format', {}) + await audio_stream_producer.flush_session_buffer( + session_id=session_id, + sample_rate=audio_format.get("rate", 16000), + channels=audio_format.get("channels", 1), + sample_width=audio_format.get("width", 2) + ) + + # Send end-of-session signal to workers + await audio_stream_producer.send_session_end_signal(session_id) + + # Mark session as finalizing + await audio_stream_producer.finalize_session(session_id) + + # NOTE: Finalize job disabled - open_conversation_job now handles everything + # The open_conversation_job will: + # 1. Detect the "finalizing" status + # 2. Enter 5-second grace period + # 3. Get audio file path + # 4. Mark session complete + # 5. Clean up Redis streams + # 6. 
Enqueue batch transcription and memory processing + # + # If no speech was detected (open_conversation_job never started): + # - Audio is discarded (intentional - we only create conversations with speech) + # - Redis streams are cleaned up by TTL + # + # TODO: Consider adding cleanup for no-speech scenarios if needed + + application_logger.info( + f"✅ Session {session_id[:12]} marked as finalizing - open_conversation_job will handle cleanup" + ) + + # Clear session state + for attr in ['stream_session_id', 'stream_chunk_count', 'stream_audio_format', + 'speech_detection_job_id', 'audio_persistence_job_id']: + if hasattr(client_state, attr): + delattr(client_state, attr) + + except Exception as finalize_error: + application_logger.error( + f"❌ Failed to finalize streaming session: {finalize_error}", + exc_info=True + ) + + +async def _publish_audio_to_stream( + client_state, + audio_stream_producer, + audio_data: bytes, + user_id: str, + client_id: str, + sample_rate: int, + channels: int, + sample_width: int +) -> None: + """ + Publish audio chunk to Redis Stream with chunk tracking. 
+ + Args: + client_state: Client state object + audio_stream_producer: Audio stream producer instance + audio_data: Raw PCM audio bytes + user_id: User ID + client_id: Client ID + sample_rate: Sample rate (Hz) + channels: Number of channels + sample_width: Bytes per sample + """ + if not hasattr(client_state, 'stream_session_id'): + application_logger.warning(f"⚠️ Received audio chunk before session initialized for {client_id}") + return + + # Increment chunk count and format chunk ID + client_state.stream_chunk_count += 1 + chunk_id = f"{client_state.stream_chunk_count:05d}" + + # Publish to Redis Stream using producer + await audio_stream_producer.add_audio_chunk( + audio_data=audio_data, + session_id=client_state.stream_session_id, + chunk_id=chunk_id, + user_id=user_id, + client_id=client_id, + sample_rate=sample_rate, + channels=channels, + sample_width=sample_width + ) + + +async def _handle_omi_audio_chunk( + client_state, + audio_stream_producer, + opus_payload: bytes, + decode_packet_fn, + user_id: str, + client_id: str, + packet_count: int +) -> None: + """ + Handle OMI audio chunk: decode Opus to PCM, then publish to stream. 
+ + Args: + client_state: Client state object + audio_stream_producer: Audio stream producer instance + opus_payload: Opus-encoded audio bytes + decode_packet_fn: Opus decoder function + user_id: User ID + client_id: Client ID + packet_count: Current packet number for logging + """ + # Decode Opus to PCM + start_time = time.time() + loop = asyncio.get_running_loop() + pcm_data = await loop.run_in_executor(_DEC_IO_EXECUTOR, decode_packet_fn, opus_payload) + decode_time = time.time() - start_time + + if pcm_data: + if packet_count <= 5 or packet_count % 1000 == 0: + application_logger.debug( + f"🎵 Decoded OMI packet #{packet_count}: {len(opus_payload)} bytes -> " + f"{len(pcm_data)} PCM bytes (took {decode_time:.3f}s)" + ) + + # Publish decoded PCM to Redis Stream + await _publish_audio_to_stream( + client_state, + audio_stream_producer, + pcm_data, + user_id, + client_id, + OMI_SAMPLE_RATE, + OMI_CHANNELS, + OMI_SAMPLE_WIDTH + ) + else: + # Log decode failures for first 5 packets + if packet_count <= 5: + application_logger.warning( + f"❌ Failed to decode OMI packet #{packet_count}: {len(opus_payload)} bytes" + ) + + +async def _handle_streaming_mode_audio( + client_state, + audio_stream_producer, + audio_data: bytes, + audio_format: dict, + user_id: str, + user_email: str, + client_id: str +) -> None: + """ + Handle audio chunk in streaming mode. 
+ + Args: + client_state: Client state object + audio_stream_producer: Audio stream producer instance + audio_data: Raw PCM audio bytes + audio_format: Audio format dict (rate, width, channels) + user_id: User ID + user_email: User email + client_id: Client ID + """ + # Initialize session if needed + if not hasattr(client_state, 'stream_session_id'): + await _initialize_streaming_session( + client_state, + audio_stream_producer, + user_id, + user_email, + client_id, + audio_format + ) + + # Publish to Redis Stream + await _publish_audio_to_stream( + client_state, + audio_stream_producer, + audio_data, + user_id, + client_id, + audio_format.get("rate", 16000), + audio_format.get("channels", 1), + audio_format.get("width", 2) + ) + + +async def _handle_batch_mode_audio( + client_state, + audio_data: bytes, + audio_format: dict, + client_id: str +) -> None: + """ + Handle audio chunk in batch mode - accumulate in memory. + + Args: + client_state: Client state object + audio_data: Raw PCM audio bytes + audio_format: Audio format dict + client_id: Client ID + """ + # Initialize batch accumulator if needed + if not hasattr(client_state, 'batch_audio_chunks'): + client_state.batch_audio_chunks = [] + client_state.batch_audio_format = audio_format + application_logger.info(f"📦 Started batch audio accumulation for {client_id}") + + # Accumulate audio + client_state.batch_audio_chunks.append(audio_data) + application_logger.debug( + f"📦 Accumulated chunk #{len(client_state.batch_audio_chunks)} ({len(audio_data)} bytes) for {client_id}" + ) + + +async def _handle_audio_chunk( + client_state, + audio_stream_producer, + audio_data: bytes, + audio_format: dict, + user_id: str, + user_email: str, + client_id: str +) -> None: + """ + Route audio chunk to appropriate mode handler (streaming or batch). 
+ + Args: + client_state: Client state object + audio_stream_producer: Audio stream producer instance + audio_data: Raw PCM audio bytes + audio_format: Audio format dict + user_id: User ID + user_email: User email + client_id: Client ID + """ + recording_mode = getattr(client_state, 'recording_mode', 'batch') + + if recording_mode == "streaming": + await _handle_streaming_mode_audio( + client_state, audio_stream_producer, audio_data, + audio_format, user_id, user_email, client_id + ) + else: + await _handle_batch_mode_audio( + client_state, audio_data, audio_format, client_id + ) + + +async def _handle_audio_session_start( + client_state, + audio_format: dict, + client_id: str +) -> tuple[bool, str]: + """ + Handle audio-start event - set mode and switch to audio streaming. + + Args: + client_state: Client state object + audio_format: Audio format dict with mode + client_id: Client ID + + Returns: + (audio_streaming_flag, recording_mode) + """ + recording_mode = audio_format.get("mode", "batch") + client_state.recording_mode = recording_mode + + application_logger.info( + f"🎙️ Audio session started for {client_id} - " + f"Format: {audio_format.get('rate')}Hz, " + f"{audio_format.get('width')}bytes, " + f"{audio_format.get('channels')}ch, " + f"Mode: {recording_mode}" + ) + + return True, recording_mode # Switch to audio streaming mode + + +async def _handle_audio_session_stop( + client_state, + audio_stream_producer, + user_id: str, + user_email: str, + client_id: str +) -> bool: + """ + Handle audio-stop event - finalize session based on mode. 
+ + Args: + client_state: Client state object + audio_stream_producer: Audio stream producer instance + user_id: User ID + user_email: User email + client_id: Client ID + + Returns: + False to switch back to control mode + """ + recording_mode = getattr(client_state, 'recording_mode', 'batch') + application_logger.info(f"🛑 Audio session stopped for {client_id} (mode: {recording_mode})") + + if recording_mode == "streaming": + await _finalize_streaming_session( + client_state, audio_stream_producer, + user_id, user_email, client_id + ) + else: + await _process_batch_audio_complete( + client_state, user_id, user_email, client_id + ) + + return False # Switch back to control mode + + +async def _process_batch_audio_complete( + client_state, + user_id: str, + user_email: str, + client_id: str +) -> None: + """ + Process completed batch audio: write file, create conversation, enqueue jobs. + + Args: + client_state: Client state with batch_audio_chunks + user_id: User ID + user_email: User email + client_id: Client ID + """ + if not hasattr(client_state, 'batch_audio_chunks') or not client_state.batch_audio_chunks: + application_logger.warning(f"⚠️ Batch mode: No audio chunks accumulated for {client_id}") + return + + try: + from advanced_omi_backend.utils.audio_utils import write_audio_file + from advanced_omi_backend.models.conversation import create_conversation + + # Combine all chunks + complete_audio = b''.join(client_state.batch_audio_chunks) + application_logger.info( + f"📦 Batch mode: Combined {len(client_state.batch_audio_chunks)} chunks into {len(complete_audio)} bytes" + ) + + # Generate audio UUID and timestamp + audio_uuid = str(uuid.uuid4()) + timestamp = int(time.time() * 1000) + + # Write audio file and create AudioFile entry + relative_audio_path, file_path, duration = await write_audio_file( + raw_audio_data=complete_audio, + audio_uuid=audio_uuid, + client_id=client_id, + user_id=user_id, + user_email=user_email, + timestamp=timestamp, + 
validate=False # PCM data, not WAV + ) + + application_logger.info( + f"✅ Batch mode: Wrote audio file {relative_audio_path} ({duration:.1f}s)" + ) + + # Create conversation immediately for batch audio (conversation_id auto-generated) + version_id = str(uuid.uuid4()) + + conversation = create_conversation( + audio_uuid=audio_uuid, + user_id=user_id, + client_id=client_id, + title="Batch Recording", + summary="Processing batch audio..." + ) + conversation.audio_path = relative_audio_path + await conversation.insert() + conversation_id = conversation.conversation_id # Get the auto-generated ID + + application_logger.info(f"📝 Batch mode: Created conversation {conversation_id}") + + # Enqueue post-conversation processing job chain + from advanced_omi_backend.controllers.queue_controller import start_post_conversation_jobs + + job_ids = start_post_conversation_jobs( + conversation_id=conversation_id, + audio_uuid=audio_uuid, + audio_file_path=file_path, + user_id=None, # Will be read from conversation in DB by jobs + post_transcription=True, # Run batch transcription for uploads + client_id=client_id # Pass client_id for UI tracking + ) + + application_logger.info( + f"✅ Batch mode: Enqueued job chain for {conversation_id} - " + f"transcription ({job_ids['transcription']}) → " + f"speaker ({job_ids['speaker_recognition']}) → " + f"memory ({job_ids['memory']})" + ) + + # Clear accumulated chunks + client_state.batch_audio_chunks = [] + + except Exception as batch_error: + application_logger.error( + f"❌ Batch mode processing failed: {batch_error}", + exc_info=True + ) + + +async def handle_omi_websocket( + ws: WebSocket, + token: Optional[str] = None, + device_name: Optional[str] = None, +): + """Handle OMI WebSocket connections with Opus decoding.""" + # Generate pending client_id to track connection even if auth fails + pending_client_id = f"pending_{uuid.uuid4()}" + pending_connections.add(pending_client_id) + + client_id = None + client_state = None + + try: + # 
Setup connection (accept, auth, create client state) + client_id, client_state, user = await _setup_websocket_connection( + ws, token, device_name, pending_client_id, "OMI" + ) + if not user: + return + + # OMI-specific: Setup Opus decoder + decoder = OmiOpusDecoder() + _decode_packet = partial(decoder.decode_packet, strip_header=False) + + # Get singleton audio stream producer + audio_stream_producer = get_audio_stream_producer() + + packet_count = 0 + total_bytes = 0 + + while True: + # Parse Wyoming protocol + header, payload = await parse_wyoming_protocol(ws) + + if header["type"] == "audio-start": + # Handle audio session start + application_logger.info(f"🎙️ OMI audio session started for {client_id}") + await _initialize_streaming_session( + client_state, + audio_stream_producer, + user.user_id, + user.email, + client_id, + header.get("data", {"rate": OMI_SAMPLE_RATE, "width": OMI_SAMPLE_WIDTH, "channels": OMI_CHANNELS}) + ) + + elif header["type"] == "audio-chunk" and payload: + packet_count += 1 + total_bytes += len(payload) + + # Log progress + if packet_count <= 5 or packet_count % 1000 == 0: + application_logger.info( + f"🎵 Received OMI audio chunk #{packet_count}: {len(payload)} bytes" + ) + + # Handle OMI audio chunk (Opus decode + publish to stream) + await _handle_omi_audio_chunk( + client_state, + audio_stream_producer, + payload, + _decode_packet, + user.user_id, + client_id, + packet_count + ) + + # Log progress every 1000th packet + if packet_count % 1000 == 0: + application_logger.info( + f"📊 Processed {packet_count} OMI packets ({total_bytes} bytes total)" + ) + + elif header["type"] == "audio-stop": + # Handle audio session stop + application_logger.info( + f"🛑 OMI audio session stopped for {client_id} - " + f"Total chunks: {packet_count}, Total bytes: {total_bytes}" + ) + + # Finalize session using helper function + await _finalize_streaming_session( + client_state, + audio_stream_producer, + user.user_id, + user.email, + client_id + ) + + # 
Reset counters for next session + packet_count = 0 + total_bytes = 0 + + else: + # Unknown event type + application_logger.debug( + f"Ignoring Wyoming event type '{header['type']}' for OMI client {client_id}" + ) + + except WebSocketDisconnect: + application_logger.info( + f"🔌 WebSocket disconnected - Client: {client_id}, Packets: {packet_count}, Total bytes: {total_bytes}" + ) + except Exception as e: + application_logger.error(f"❌ WebSocket error for client {client_id}: {e}", exc_info=True) + finally: + # Clean up pending connection tracking + pending_connections.discard(pending_client_id) + + # Ensure cleanup happens even if client_id is None + if client_id: + try: + # Clean up client state + await cleanup_client_state(client_id) + except Exception as cleanup_error: + application_logger.error( + f"Error during cleanup for client {client_id}: {cleanup_error}", exc_info=True + ) + + +async def handle_pcm_websocket( + ws: WebSocket, + token: Optional[str] = None, + device_name: Optional[str] = None +): + """Handle PCM WebSocket connections with batch and streaming mode support.""" + # Generate pending client_id to track connection even if auth fails + pending_client_id = f"pending_{uuid.uuid4()}" + pending_connections.add(pending_client_id) + + client_id = None + client_state = None + + try: + # Setup connection (accept, auth, create client state) + client_id, client_state, user = await _setup_websocket_connection( + ws, token, device_name, pending_client_id, "PCM" + ) + if not user: + return + + # Get singleton audio stream producer + audio_stream_producer = get_audio_stream_producer() + + packet_count = 0 + total_bytes = 0 + audio_streaming = False # Track if audio session is active + + while True: + try: + if not audio_streaming: + # Control message mode - parse Wyoming protocol + application_logger.debug(f"🔄 Control mode for {client_id}, WebSocket state: {ws.client_state if hasattr(ws, 'client_state') else 'unknown'}") + application_logger.debug(f"📨 About to 
receive control message for {client_id}") + header, payload = await parse_wyoming_protocol(ws) + application_logger.debug(f"✅ Received message type: {header.get('type')} for {client_id}") + + if header["type"] == "audio-start": + application_logger.debug(f"🎙️ Processing audio-start for {client_id}") + # Handle audio session start using helper function + audio_streaming, recording_mode = await _handle_audio_session_start( + client_state, + header.get("data", {}), + client_id + ) + continue # Continue to audio streaming mode + + elif header["type"] == "ping": + # Handle keepalive ping from frontend + application_logger.debug(f"🏓 Received ping from {client_id}") + continue + + else: + # Unknown control message type + application_logger.debug( + f"Ignoring Wyoming control event type '{header['type']}' for {client_id}" + ) + continue + + else: + # Audio streaming mode - receive raw bytes (like speaker recognition) + application_logger.debug(f"🎵 Audio streaming mode for {client_id} - waiting for audio data") + + try: + # Receive raw audio bytes or check for control messages + message = await ws.receive() + + + # Check if it's a disconnect + if "type" in message and message["type"] == "websocket.disconnect": + code = message.get("code", 1000) + reason = message.get("reason", "") + application_logger.info(f"🔌 WebSocket disconnect during audio streaming for {client_id}. 
Code: {code}, Reason: {reason}") + break + + # Check if it's a text message (control message like audio-stop) + if "text" in message: + try: + control_header = json.loads(message["text"].strip()) + if control_header.get("type") == "audio-stop": + # Handle audio session stop using helper function + audio_streaming = await _handle_audio_session_stop( + client_state, + audio_stream_producer, + user.user_id, + user.email, + client_id + ) + # Reset counters for next session + packet_count = 0 + total_bytes = 0 + continue + elif control_header.get("type") == "ping": + application_logger.debug(f"🏓 Received ping during streaming from {client_id}") + continue + elif control_header.get("type") == "audio-start": + # Handle duplicate audio-start messages gracefully (idempotent behavior) + application_logger.info(f"🔄 Ignoring duplicate audio-start message during streaming for {client_id}") + continue + elif control_header.get("type") == "audio-chunk": + # Handle Wyoming protocol audio-chunk with binary payload + payload_length = control_header.get("payload_length") + if payload_length and payload_length > 0: + # Receive the binary audio data + payload_msg = await ws.receive() + if "bytes" in payload_msg: + audio_data = payload_msg["bytes"] + packet_count += 1 + total_bytes += len(audio_data) + + application_logger.debug(f"🎵 Received audio chunk #{packet_count}: {len(audio_data)} bytes") + + # Route to appropriate mode handler + audio_format = control_header.get("data", {}) + await _handle_audio_chunk( + client_state, + audio_stream_producer, + audio_data, + audio_format, + user.user_id, + user.email, + client_id + ) + else: + application_logger.warning(f"Expected binary payload for audio-chunk, got: {payload_msg.keys()}") + else: + application_logger.warning(f"audio-chunk missing payload_length: {payload_length}") + continue + else: + application_logger.warning(f"Unknown control message during streaming: {control_header.get('type')}") + continue + + except json.JSONDecodeError: 
+ application_logger.warning(f"Invalid control message during streaming for {client_id}") + continue + + # Check if it's binary data (raw audio without Wyoming protocol) + elif "bytes" in message: + # Raw binary audio data (legacy support) + audio_data = message["bytes"] + packet_count += 1 + total_bytes += len(audio_data) + + application_logger.debug(f"🎵 Received raw audio chunk #{packet_count}: {len(audio_data)} bytes") + + # Route to appropriate mode handler with default format + default_format = {"rate": 16000, "width": 2, "channels": 1} + await _handle_audio_chunk( + client_state, + audio_stream_producer, + audio_data, + default_format, + user.user_id, + user.email, + client_id + ) + + else: + application_logger.warning(f"Unexpected message format in streaming mode: {message.keys()}") + continue + + except Exception as streaming_error: + application_logger.error(f"Error in audio streaming mode: {streaming_error}") + if "disconnect" in str(streaming_error).lower(): + break + continue + + except WebSocketDisconnect as e: + application_logger.info( + f"🔌 WebSocket disconnected during message processing for {client_id}. 
" + f"Code: {e.code}, Reason: {e.reason}" + ) + break # Exit the loop on disconnect + except json.JSONDecodeError as e: + application_logger.error( + f"❌ JSON decode error in Wyoming protocol for {client_id}: {e}" + ) + continue # Skip this message but don't disconnect + except ValueError as e: + application_logger.error( + f"❌ Protocol error for {client_id}: {e}" + ) + continue # Skip this message but don't disconnect + except RuntimeError as e: + # Handle "Cannot call receive once a disconnect message has been received" + if "disconnect" in str(e).lower(): + application_logger.info( + f"🔌 WebSocket already disconnected for {client_id}: {e}" + ) + break # Exit the loop on disconnect + else: + application_logger.error( + f"❌ Runtime error for {client_id}: {e}", exc_info=True + ) + continue + except Exception as e: + application_logger.error( + f"❌ Unexpected error processing message for {client_id}: {e}", exc_info=True + ) + # Check if it's a connection-related error + error_msg = str(e).lower() + if "disconnect" in error_msg or "closed" in error_msg or "receive" in error_msg: + application_logger.info( + f"🔌 Connection issue detected for {client_id}, exiting loop" + ) + break + else: + continue # Skip this message for other errors + + except WebSocketDisconnect: + application_logger.info( + f"🔌 PCM WebSocket disconnected - Client: {client_id}, Packets: {packet_count}, Total bytes: {total_bytes}" + ) + except Exception as e: + application_logger.error( + f"❌ PCM WebSocket error for client {client_id}: {e}", exc_info=True + ) + finally: + # Clean up pending connection tracking + pending_connections.discard(pending_client_id) + + # Ensure cleanup happens even if client_id is None + if client_id: + try: + # Clean up client state + await cleanup_client_state(client_id) + except Exception as cleanup_error: + application_logger.error( + f"Error during cleanup for client {client_id}: {cleanup_error}", exc_info=True + ) diff --git 
a/backends/advanced/src/advanced_omi_backend/conversation_manager.py b/backends/advanced/src/advanced_omi_backend/conversation_manager.py deleted file mode 100644 index 92b1ee0b..00000000 --- a/backends/advanced/src/advanced_omi_backend/conversation_manager.py +++ /dev/null @@ -1,98 +0,0 @@ -"""Conversation Manager for handling conversation lifecycle and processing coordination. - -This module separates conversation management concerns from ClientState to follow -the Single Responsibility Principle. It handles conversation closure, memory processing -queuing, and audio cropping coordination. -""" - -import logging -from typing import Optional - -from advanced_omi_backend.processors import ( - get_processor_manager, -) -from advanced_omi_backend.transcript_coordinator import get_transcript_coordinator - -audio_logger = logging.getLogger("audio") - - -class ConversationManager: - """Manages conversation lifecycle and processing coordination. - - This class handles the responsibilities previously mixed into ClientState, - providing a clean separation of concerns for conversation management. - """ - - def __init__(self): - self.coordinator = get_transcript_coordinator() - audio_logger.info("ConversationManager initialized") - - async def close_conversation( - self, - client_id: str, - audio_uuid: str, - user_id: str, - user_email: Optional[str], - conversation_start_time: float, - speech_segments: dict, - chunk_dir, # Can be Path or str - ) -> bool: - """Close a conversation and coordinate all necessary processing. 
- - Args: - client_id: Client identifier - audio_uuid: Unique audio conversation identifier - user_id: User identifier - user_email: User email - db_helper: Database helper instance - conversation_start_time: When conversation started - speech_segments: Speech segments for cropping - chunk_dir: Directory for audio chunks - - Returns: - True if conversation was closed successfully - """ - audio_logger.info(f"🔒 Closing conversation {audio_uuid} for client {client_id}") - - try: - # Get processor manager - processor_manager = get_processor_manager() - - # Step 1: Close audio file in processor (only if transcription not already completed) - # Check if transcription is already completed to avoid double-flushing - processing_status = processor_manager.get_processing_status(client_id) - transcription_completed = processing_status.get("stages", {}).get("transcription", {}).get("completed", False) - - if not transcription_completed: - audio_logger.info(f"🔄 Transcription not completed, calling close_client_audio for {client_id}") - await processor_manager.close_client_audio(client_id) - else: - audio_logger.info(f"✅ Transcription already completed, skipping close_client_audio for {client_id}") - - # Step 2: Memory processing is now handled by transcription completion - # This eliminates race conditions and event coordination issues - audio_logger.info(f"💭 Memory processing will be triggered by transcription completion for {audio_uuid}") - - # Step 3: Audio cropping is now handled at processor level after transcription - # This ensures cropping happens with diarization segments when available - # See transcription.py _queue_diarization_based_cropping() method - - audio_logger.info(f"✅ Successfully closed conversation {audio_uuid}") - return True - - except Exception as e: - audio_logger.error(f"❌ Error closing conversation {audio_uuid}: {e}", exc_info=True) - return False - - - -# Global singleton instance -_conversation_manager: Optional[ConversationManager] = None - - -def 
get_conversation_manager() -> ConversationManager: - """Get the global ConversationManager instance.""" - global _conversation_manager - if _conversation_manager is None: - _conversation_manager = ConversationManager() - return _conversation_manager diff --git a/backends/advanced/src/advanced_omi_backend/database.py b/backends/advanced/src/advanced_omi_backend/database.py index e93c1d5c..cca103ea 100644 --- a/backends/advanced/src/advanced_omi_backend/database.py +++ b/backends/advanced/src/advanced_omi_backend/database.py @@ -7,10 +7,6 @@ import logging import os -import time -from datetime import UTC, datetime -from typing import Optional -import uuid from motor.motor_asyncio import AsyncIOMotorClient @@ -18,15 +14,22 @@ # MongoDB Configuration MONGODB_URI = os.getenv("MONGODB_URI", "mongodb://mongo:27017") -mongo_client = AsyncIOMotorClient(MONGODB_URI) +mongo_client = AsyncIOMotorClient( + MONGODB_URI, + maxPoolSize=50, # Increased pool size for concurrent operations + minPoolSize=10, # Keep minimum connections ready + maxIdleTimeMS=45000, # Keep idle connections for 45 seconds + serverSelectionTimeoutMS=5000, # Fail fast if server unavailable + socketTimeoutMS=20000, # 20 second timeout for operations +) db = mongo_client.get_default_database("friend-lite") -# Collection references -chunks_col = db["audio_chunks"] -processing_runs_col = db["processing_runs"] +# Collection references (for non-Beanie collections) users_col = db["users"] -speakers_col = db["speakers"] -conversations_col = db["conversations"] + +# Note: conversations collection managed by Beanie (Document model) +# Note: processing_runs replaced by RQ job tracking +# Beanie initialization happens in main.py during application startup def get_database(): @@ -37,818 +40,7 @@ def get_database(): def get_collections(): """Get commonly used collection references.""" return { - "chunks_col": chunks_col, - "processing_runs_col": processing_runs_col, "users_col": users_col, - "speakers_col": speakers_col, 
- "conversations_col": conversations_col, } -class AudioChunksRepository: - """Async helpers for the audio_chunks collection.""" - - def __init__(self, collection): - self.col = collection - - async def create_chunk( - self, - *, - audio_uuid, - audio_path, - client_id, - timestamp, - user_id=None, - user_email=None, - transcript=None, - speakers_identified=None, - memories=None, - transcription_status="PENDING", - memory_processing_status="PENDING", - ): - # Create initial transcript version if provided - transcript_versions = [] - active_transcript_version = None - - if transcript: - version_id = str(uuid.uuid4()) - transcript_versions.append({ - "version_id": version_id, - "segments": transcript, - "status": transcription_status, - "provider": None, - "created_at": datetime.now(UTC).isoformat(), - "processing_run_id": None, - "raw_data": {}, - "speakers_identified": speakers_identified or [] - }) - active_transcript_version = version_id - - # Create initial memory version if provided - memory_versions = [] - active_memory_version = None - - if memories: - version_id = str(uuid.uuid4()) - memory_versions.append({ - "version_id": version_id, - "memories": memories, - "status": memory_processing_status, - "created_at": datetime.now(UTC).isoformat(), - "processing_run_id": None, - "transcript_version_id": active_transcript_version - }) - active_memory_version = version_id - - doc = { - "audio_uuid": audio_uuid, - "audio_path": audio_path, - "client_id": client_id, - "timestamp": timestamp, - "user_id": user_id, - "user_email": user_email, - - # Versioned transcript data - "transcript_versions": transcript_versions, - "active_transcript_version": active_transcript_version, - - # Versioned memory data - "memory_versions": memory_versions, - "active_memory_version": active_memory_version, - - # Compatibility fields (computed from active versions) - "transcript": transcript or [], - "speakers_identified": speakers_identified or [], - "memories": memories or [], - 
"transcription_status": transcription_status, - "memory_processing_status": memory_processing_status, - "raw_transcript_data": {} - } - await self.col.insert_one(doc) - - async def add_transcript_segment(self, audio_uuid, transcript_segment): - """Add a single transcript segment to the conversation. - - Interface compatibility method - adds to active transcript version. - Creates first transcript version if none exists. - """ - chunk = await self.get_chunk(audio_uuid) - if not chunk: - return False - - active_version = chunk.get("active_transcript_version") - if not active_version: - # Create initial version if none exists - version_id = str(uuid.uuid4()) - version_data = { - "version_id": version_id, - "segments": [transcript_segment], - "status": "PENDING", - "provider": None, - "created_at": datetime.now(UTC).isoformat(), - "processing_run_id": None, - "raw_data": {}, - "speakers_identified": [] - } - - result = await self.col.update_one( - {"audio_uuid": audio_uuid}, - { - "$push": {"transcript_versions": version_data}, - "$set": { - "active_transcript_version": version_id, - # Update compatibility field too - "transcript": [transcript_segment] - } - } - ) - else: - # Add to existing active version - result = await self.col.update_one( - {"audio_uuid": audio_uuid}, - { - "$push": { - f"transcript_versions.$[version].segments": transcript_segment, - # Update compatibility field too - "transcript": transcript_segment - } - }, - array_filters=[{"version.version_id": active_version}] - ) - - return result.modified_count > 0 - - async def add_speaker(self, audio_uuid, speaker_id): - """Add a speaker to the speakers_identified list if not already present.""" - await self.col.update_one( - {"audio_uuid": audio_uuid}, - {"$addToSet": {"speakers_identified": speaker_id}}, - ) - - async def store_raw_transcript_data(self, audio_uuid, raw_data, provider): - """Store raw transcript data from transcription provider.""" - await self.col.update_one( - {"audio_uuid": 
audio_uuid}, - { - "$set": { - "raw_transcript_data": { - "provider": provider, - "data": raw_data, - "stored_at": datetime.now(UTC).isoformat(), - } - } - }, - ) - - async def get_chunk(self, audio_uuid): - """Get a chunk by audio_uuid.""" - return await self.col.find_one({"audio_uuid": audio_uuid}) - - async def add_memory_reference(self, audio_uuid: str, memory_id: str, status: str = "created"): - """Add memory reference to audio chunk.""" - memory_ref = { - "memory_id": memory_id, - "created_at": datetime.now(UTC).isoformat(), - "status": status, - "updated_at": datetime.now(UTC).isoformat(), - } - result = await self.col.update_one( - {"audio_uuid": audio_uuid}, {"$push": {"memories": memory_ref}} - ) - if result.modified_count > 0: - logger.info(f"Added memory reference {memory_id} to audio {audio_uuid}") - return result.modified_count > 0 - - async def update_memory_status(self, audio_uuid: str, memory_id: str, status: str): - """Update memory status in audio chunk.""" - result = await self.col.update_one( - {"audio_uuid": audio_uuid, "memories.memory_id": memory_id}, - { - "$set": { - "memories.$.status": status, - "memories.$.updated_at": datetime.now(UTC).isoformat(), - } - }, - ) - if result.modified_count > 0: - logger.info(f"Updated memory {memory_id} status to {status} for audio {audio_uuid}") - return result.modified_count > 0 - - async def remove_memory_reference(self, audio_uuid: str, memory_id: str): - """Remove memory reference from audio chunk.""" - result = await self.col.update_one( - {"audio_uuid": audio_uuid}, {"$pull": {"memories": {"memory_id": memory_id}}} - ) - if result.modified_count > 0: - logger.info(f"Removed memory reference {memory_id} from audio {audio_uuid}") - return result.modified_count > 0 - - async def get_chunk_by_audio_uuid(self, audio_uuid: str): - """Get a chunk document by audio_uuid.""" - return await self.col.find_one({"audio_uuid": audio_uuid}) - - async def get_transcript_segments(self, audio_uuid: str): - """Get 
transcript segments for a specific audio UUID from active version.""" - document = await self.col.find_one( - {"audio_uuid": audio_uuid}, - {"transcript_versions": 1, "active_transcript_version": 1, "transcript": 1} - ) - - if not document: - return [] - - # Try to get from active version first (new versioned approach) - active_version_id = document.get("active_transcript_version") - if active_version_id and "transcript_versions" in document: - for version in document["transcript_versions"]: - if version.get("version_id") == active_version_id: - return version.get("segments", []) - - # Fallback to legacy transcript field for backward compatibility - if "transcript" in document: - return document["transcript"] - - return [] - - async def update_transcript(self, audio_uuid, full_transcript): - """Update the entire transcript list (for compatibility).""" - await self.col.update_one( - {"audio_uuid": audio_uuid}, {"$set": {"transcript": full_transcript}} - ) - - async def update_segment_timing(self, audio_uuid, segment_index, start_time, end_time): - """Update timing information for a specific transcript segment.""" - await self.col.update_one( - {"audio_uuid": audio_uuid}, - { - "$set": { - f"transcript.{segment_index}.start": start_time, - f"transcript.{segment_index}.end": end_time, - } - }, - ) - - async def update_segment_speaker(self, audio_uuid, segment_index, speaker_id): - """Update the speaker for a specific transcript segment.""" - result = await self.col.update_one( - {"audio_uuid": audio_uuid}, - {"$set": {f"transcript.{segment_index}.speaker": speaker_id}}, - ) - if result.modified_count > 0: - logger.info(f"Updated segment {segment_index} speaker to {speaker_id} for {audio_uuid}") - return result.modified_count > 0 - - async def update_cropped_audio( - self, - audio_uuid: str, - cropped_path: str, - speech_segments: list[tuple[float, float]], - ): - """Update the chunk with cropped audio information.""" - cropped_duration = sum(end - start for start, end 
in speech_segments) - - result = await self.col.update_one( - {"audio_uuid": audio_uuid}, - { - "$set": { - "cropped_audio_path": cropped_path, - "speech_segments": [ - {"start": start, "end": end} for start, end in speech_segments - ], - "cropped_duration": cropped_duration, - "cropped_at": datetime.now(UTC), - } - }, - ) - if result.modified_count > 0: - logger.info(f"Updated cropped audio info for {audio_uuid}: {cropped_path}") - return result.modified_count > 0 - - - async def update_memory_processing_status( - self, audio_uuid: str, status: str, error_message: str = None - ): - """Update memory processing status and completion timestamp. - - Interface compatibility method - updates active memory version. - """ - chunk = await self.get_chunk(audio_uuid) - if not chunk: - return False - - active_version = chunk.get("active_memory_version") - if not active_version: - # Create initial memory version if none exists - version_id = str(uuid.uuid4()) - version_data = { - "version_id": version_id, - "memories": [], - "status": status, - "created_at": datetime.now(UTC).isoformat(), - "processing_run_id": None, - "transcript_version_id": chunk.get("active_transcript_version") - } - if error_message: - version_data["error_message"] = error_message - - result = await self.col.update_one( - {"audio_uuid": audio_uuid}, - { - "$push": {"memory_versions": version_data}, - "$set": { - "active_memory_version": version_id, - "memory_processing_status": status, - "memory_processing_updated_at": datetime.now(UTC).isoformat(), - } - } - ) - else: - # Update existing active version - update_doc = { - f"memory_versions.$[version].status": status, - f"memory_versions.$[version].updated_at": datetime.now(UTC), - "memory_processing_status": status, - "memory_processing_updated_at": datetime.now(UTC).isoformat(), - } - if status == "COMPLETED": - update_doc["memory_processing_completed_at"] = datetime.now(UTC).isoformat() - if error_message: - 
update_doc[f"memory_versions.$[version].error_message"] = error_message - update_doc["memory_processing_error"] = error_message - - result = await self.col.update_one( - {"audio_uuid": audio_uuid}, - {"$set": update_doc}, - array_filters=[{"version.version_id": active_version}] - ) - - if result.modified_count > 0: - logger.info(f"Updated memory processing status to {status} for {audio_uuid}") - return result.modified_count > 0 - - async def update_transcription_status( - self, audio_uuid: str, status: str, error_message: Optional[str] = None, provider: Optional[str] = None - ): - """Update transcription processing status and completion timestamp. - - Interface compatibility method - updates active transcript version. - """ - chunk = await self.get_chunk(audio_uuid) - if not chunk: - return False - - active_version = chunk.get("active_transcript_version") - if not active_version: - # Create initial transcript version if none exists - version_id = str(uuid.uuid4()) - version_data = { - "version_id": version_id, - "transcript": "", - "segments": [], - "status": status, - "provider": provider, - "created_at": datetime.now(UTC).isoformat(), - "processing_run_id": None, - "raw_data": {}, - "speakers_identified": [] - } - if error_message: - version_data["error_message"] = error_message - - update_doc = { - "active_transcript_version": version_id, - "transcription_status": status, - "transcription_updated_at": datetime.now(UTC).isoformat(), - } - if status == "COMPLETED": - update_doc["transcription_completed_at"] = datetime.now(UTC).isoformat() - - result = await self.col.update_one( - {"audio_uuid": audio_uuid}, - { - "$push": {"transcript_versions": version_data}, - "$set": update_doc - } - ) - else: - # Update existing active version - update_doc = { - "transcript_versions.$[version].status": status, - "transcript_versions.$[version].updated_at": datetime.now(UTC).isoformat(), - "transcription_status": status, - "transcription_updated_at": 
datetime.now(UTC).isoformat(), - } - if status == "COMPLETED": - update_doc["transcription_completed_at"] = datetime.now(UTC).isoformat() - if error_message: - update_doc["transcript_versions.$[version].error_message"] = error_message - update_doc["transcription_error"] = error_message - if provider: - update_doc["transcript_versions.$[version].provider"] = provider - update_doc["transcript_provider"] = provider - - result = await self.col.update_one( - {"audio_uuid": audio_uuid}, - {"$set": update_doc}, - array_filters=[{"version.version_id": active_version}] - ) - - if result.modified_count > 0: - logger.info(f"Updated transcription status to {status} for {audio_uuid}") - return result.modified_count > 0 - - # ======================================== - # SPEECH-DRIVEN CONVERSATIONS METHODS - # ======================================== - - async def add_audio_file_path(self, audio_uuid: str, file_path: str): - """Add new audio file path to existing session.""" - result = await self.col.update_one( - {"audio_uuid": audio_uuid}, - { - "$push": {"audio_file_paths": file_path}, - "$set": {"updated_at": datetime.now(UTC).isoformat()} - } - ) - if result.modified_count > 0: - logger.info(f"Added audio file path {file_path} to session {audio_uuid}") - return result.modified_count > 0 - - async def update_speech_detection(self, audio_uuid: str, **speech_data): - """Update speech detection results.""" - update_doc = { - "updated_at": datetime.now(UTC).isoformat() - } - - # Add speech detection fields - for key, value in speech_data.items(): - if key in ["has_speech", "conversation_created", "conversation_id", - "speech_start_time", "speech_end_time", "status"]: - update_doc[key] = value - - result = await self.col.update_one( - {"audio_uuid": audio_uuid}, - {"$set": update_doc} - ) - if result.modified_count > 0: - logger.info(f"Updated speech detection for {audio_uuid}: {speech_data}") - return result.modified_count > 0 - - async def mark_conversation_created(self, 
audio_uuid: str, conversation_id: str): - """Mark that conversation was created for this audio.""" - result = await self.col.update_one( - {"audio_uuid": audio_uuid}, - {"$set": { - "conversation_created": True, - "conversation_id": conversation_id, - "has_speech": True, - "status": "speech_detected", - "updated_at": datetime.now(UTC).isoformat() - }} - ) - if result.modified_count > 0: - logger.info(f"Marked conversation created for {audio_uuid} with ID {conversation_id}") - return result.modified_count > 0 - - async def get_sessions_with_speech(self, user_id: str, limit: int = 100): - """Get audio sessions that have detected speech.""" - cursor = self.col.find({ - "user_id": user_id, - "has_speech": True, - "conversation_created": True - }).sort("timestamp", -1).limit(limit) - - return await cursor.to_list(length=None) - - -class ConversationsRepository: - """Repository for user-facing conversations (speech-driven architecture).""" - - def __init__(self, collection): - self.col = collection - - async def create_conversation(self, conversation_data: dict) -> str: - """Create new user-facing conversation.""" - result = await self.col.insert_one(conversation_data) - return conversation_data["conversation_id"] - - def _populate_legacy_fields(self, conversation): - """Auto-populate legacy fields from active versions for backward compatibility.""" - if not conversation: - return conversation - - # Auto-populate transcript from active transcript version - active_transcript_version_id = conversation.get("active_transcript_version") - if active_transcript_version_id: - for version in conversation.get("transcript_versions", []): - if version.get("version_id") == active_transcript_version_id: - conversation["transcript"] = version.get("segments", []) - conversation["speakers_identified"] = version.get("speakers_identified", []) - break - else: - # No active version - ensure empty transcript - conversation["transcript"] = [] - - # Auto-populate memories from active memory 
version - active_memory_version_id = conversation.get("active_memory_version") - if active_memory_version_id: - for version in conversation.get("memory_versions", []): - if version.get("version_id") == active_memory_version_id: - conversation["memories"] = version.get("memories", []) - conversation["memory_processing_status"] = version.get("status", "pending") - break - else: - # No active version - ensure empty memories - conversation["memories"] = [] - conversation["memory_processing_status"] = "pending" - - return conversation - - async def get_conversation(self, conversation_id: str): - """Get conversation by conversation_id with auto-populated legacy fields.""" - conversation = await self.col.find_one({"conversation_id": conversation_id}) - return self._populate_legacy_fields(conversation) - - async def get_user_conversations(self, user_id: str, limit=100): - """Get all conversations for a user (only shows conversations with speech).""" - cursor = self.col.find({"user_id": user_id}) - conversations = await cursor.sort("created_at", -1).limit(limit).to_list() - # Auto-populate legacy fields for all conversations - return [self._populate_legacy_fields(conv) for conv in conversations] - - async def update_conversation(self, conversation_id: str, update_data: dict): - """Update conversation data.""" - result = await self.col.update_one( - {"conversation_id": conversation_id}, - {"$set": {**update_data, "updated_at": datetime.now(UTC)}} - ) - return result.modified_count > 0 - - async def add_memories(self, conversation_id: str, memories: list): - """Add memories to conversation.""" - result = await self.col.update_one( - {"conversation_id": conversation_id}, - { - "$push": {"memories": {"$each": memories}}, - "$set": {"updated_at": datetime.now(UTC)} - } - ) - return result.modified_count > 0 - - async def update_memory_processing_status(self, conversation_id: str, status: str): - """Update memory processing status for conversation.""" - result = await 
self.col.update_one( - {"conversation_id": conversation_id}, - { - "$set": { - "memory_processing_status": status, - "memory_processing_updated_at": datetime.now(UTC) - } - } - ) - return result.modified_count > 0 - - # ======================================== - # NEW: VERSIONING METHODS FOR REPROCESSING - # ======================================== - - async def create_transcript_version( - self, - conversation_id: str, - segments: list = None, - processing_run_id: str = None, - provider: str = None, - raw_data: dict = None - ) -> Optional[str]: - """Create a new transcript version in conversation.""" - version_id = str(uuid.uuid4()) - version_data = { - "version_id": version_id, - "segments": segments or [], - "status": "PENDING", - "provider": provider, - "created_at": datetime.now(UTC).isoformat(), - "processing_run_id": processing_run_id, - "raw_data": raw_data or {}, - "speakers_identified": [] - } - - result = await self.col.update_one( - {"conversation_id": conversation_id}, - {"$push": {"transcript_versions": version_data}} - ) - - if result.modified_count > 0: - logger.info(f"Created new transcript version {version_id} for conversation {conversation_id}") - return version_id - return None - - async def create_memory_version( - self, - conversation_id: str, - transcript_version_id: str, - memories: list = None, - processing_run_id: str = None - ) -> Optional[str]: - """Create a new memory version in conversation.""" - version_id = str(uuid.uuid4()) - version_data = { - "version_id": version_id, - "memories": memories or [], - "status": "PENDING", - "created_at": datetime.now(UTC).isoformat(), - "processing_run_id": processing_run_id, - "transcript_version_id": transcript_version_id - } - - result = await self.col.update_one( - {"conversation_id": conversation_id}, - {"$push": {"memory_versions": version_data}} - ) - - if result.modified_count > 0: - logger.info(f"Created new memory version {version_id} for conversation {conversation_id}") - return 
version_id - return None - - async def activate_transcript_version(self, conversation_id: str, version_id: str) -> bool: - """Activate a specific transcript version in conversation.""" - # First verify the version exists - conversation = await self.col.find_one( - {"conversation_id": conversation_id, "transcript_versions.version_id": version_id} - ) - if not conversation: - return False - - # Find the version and update active fields - version_data = None - for version in conversation.get("transcript_versions", []): - if version["version_id"] == version_id: - version_data = version - break - - if not version_data: - return False - - result = await self.col.update_one( - {"conversation_id": conversation_id}, - { - "$set": { - "active_transcript_version": version_id, - "transcript": version_data["segments"], - "speakers_identified": version_data["speakers_identified"], - "updated_at": datetime.now(UTC) - } - } - ) - - if result.modified_count > 0: - logger.info(f"Activated transcript version {version_id} for conversation {conversation_id}") - return result.modified_count > 0 - - async def activate_memory_version(self, conversation_id: str, version_id: str) -> bool: - """Activate a specific memory version in conversation.""" - # First verify the version exists - conversation = await self.col.find_one( - {"conversation_id": conversation_id, "memory_versions.version_id": version_id} - ) - if not conversation: - return False - - # Find the version and update active fields - version_data = None - for version in conversation.get("memory_versions", []): - if version["version_id"] == version_id: - version_data = version - break - - if not version_data: - return False - - result = await self.col.update_one( - {"conversation_id": conversation_id}, - { - "$set": { - "active_memory_version": version_id, - "memories": version_data["memories"], - "memory_processing_status": version_data["status"], - "updated_at": datetime.now(UTC) - } - } - ) - - if result.modified_count > 0: - 
logger.info(f"Activated memory version {version_id} for conversation {conversation_id}") - return result.modified_count > 0 - - async def get_version_history(self, conversation_id: str) -> dict: - """Get all version history for a conversation.""" - conversation = await self.col.find_one({"conversation_id": conversation_id}) - if not conversation: - return {} - - return { - "conversation_id": conversation_id, - "active_transcript_version": conversation.get("active_transcript_version"), - "active_memory_version": conversation.get("active_memory_version"), - "transcript_versions": conversation.get("transcript_versions", []), - "memory_versions": conversation.get("memory_versions", []) - } - - async def update_transcript_processing_status( - self, - conversation_id: str, - status: str, - provider: str = None, - error_message: str = None - ): - """Update transcript processing status for conversation.""" - update_doc = { - "transcript_processing_status": status, - "transcript_processing_updated_at": datetime.now(UTC), - "updated_at": datetime.now(UTC) - } - if provider: - update_doc["transcript_provider"] = provider - if error_message: - update_doc["transcript_processing_error"] = error_message - - result = await self.col.update_one( - {"conversation_id": conversation_id}, - {"$set": update_doc} - ) - return result.modified_count > 0 - - -class ProcessingRunsRepository: - """Repository for processing run tracking (updated for conversation_id).""" - - def __init__(self, collection): - self.col = collection - - async def create_run( - self, - *, - conversation_id: str, - audio_uuid: str, # Keep for audio file access - run_type: str, # 'transcript' or 'memory' - user_id: str, - trigger: str, # 'manual_reprocess', 'initial_processing', etc. 
- config_hash: str = None - ) -> str: - """Create a new processing run for conversation.""" - run_id = str(uuid.uuid4()) - doc = { - "run_id": run_id, - "conversation_id": conversation_id, - "audio_uuid": audio_uuid, # Keep for file access - "run_type": run_type, - "user_id": user_id, - "trigger": trigger, - "config_hash": config_hash, - "status": "PENDING", - "started_at": datetime.now(UTC), - "completed_at": None, - "error_message": None, - "result_version_id": None - } - await self.col.insert_one(doc) - logger.info(f"Created processing run {run_id} for conversation {conversation_id}") - return run_id - - async def update_run_status( - self, - run_id: str, - status: str, - error_message: str = None, - result_version_id: str = None - ) -> bool: - """Update processing run status.""" - update_doc = { - "status": status, - "updated_at": datetime.now(UTC) - } - if status in ["COMPLETED", "FAILED"]: - update_doc["completed_at"] = datetime.now(UTC) - if error_message: - update_doc["error_message"] = error_message - if result_version_id: - update_doc["result_version_id"] = result_version_id - - result = await self.col.update_one( - {"run_id": run_id}, - {"$set": update_doc} - ) - - if result.modified_count > 0: - logger.info(f"Updated processing run {run_id} status to {status}") - return result.modified_count > 0 - - async def get_run(self, run_id: str): - """Get a processing run by ID.""" - return await self.col.find_one({"run_id": run_id}) - - async def get_runs_for_conversation(self, conversation_id: str): - """Get all processing runs for a conversation.""" - cursor = self.col.find({"conversation_id": conversation_id}).sort("started_at", -1) - return await cursor.to_list(length=None) diff --git a/backends/advanced/src/advanced_omi_backend/job_tracker.py b/backends/advanced/src/advanced_omi_backend/job_tracker.py deleted file mode 100644 index f16b1c2b..00000000 --- a/backends/advanced/src/advanced_omi_backend/job_tracker.py +++ /dev/null @@ -1,266 +0,0 @@ -""" -Job 
tracking system for async file processing operations. - -Provides in-memory job tracking for file upload and processing operations. -""" - -import asyncio -import logging -import time -import uuid -from dataclasses import dataclass, field -from datetime import datetime, timezone -from enum import Enum -from typing import Dict, List, Optional - -logger = logging.getLogger(__name__) - - -class JobStatus(str, Enum): - QUEUED = "queued" - PROCESSING = "processing" - COMPLETED = "completed" - FAILED = "failed" - CANCELLED = "cancelled" - - -class FileStatus(str, Enum): - PENDING = "pending" - PROCESSING = "processing" - COMPLETED = "completed" - FAILED = "failed" - SKIPPED = "skipped" - - -@dataclass -class FileProcessingInfo: - filename: str - duration_seconds: Optional[float] = None - size_bytes: Optional[int] = None - status: FileStatus = FileStatus.PENDING - client_id: Optional[str] = None - audio_uuid: Optional[str] = None - transcription_status: Optional[str] = None - memory_status: Optional[str] = None - error_message: Optional[str] = None - started_at: Optional[datetime] = None - completed_at: Optional[datetime] = None - - -@dataclass -class ProcessingJob: - job_id: str - user_id: str - device_name: str - status: JobStatus = JobStatus.QUEUED - files: List[FileProcessingInfo] = field(default_factory=list) - created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc)) - started_at: Optional[datetime] = None - completed_at: Optional[datetime] = None - error_message: Optional[str] = None - current_file_index: int = 0 - - @property - def total_files(self) -> int: - return len(self.files) - - @property - def processed_files(self) -> int: - return len( - [ - f - for f in self.files - if f.status in [FileStatus.COMPLETED, FileStatus.FAILED, FileStatus.SKIPPED] - ] - ) - - @property - def progress_percent(self) -> float: - if self.total_files == 0: - return 0.0 - return (self.processed_files / self.total_files) * 100 - - @property - def 
current_file(self) -> Optional[FileProcessingInfo]: - if 0 <= self.current_file_index < len(self.files): - return self.files[self.current_file_index] - return None - - def to_dict(self) -> dict: - return { - "job_id": self.job_id, - "status": self.status.value, - "total_files": self.total_files, - "processed_files": self.processed_files, - "current_file": self.current_file.filename if self.current_file else None, - "progress_percent": round(self.progress_percent, 1), - "created_at": self.created_at.isoformat(), - "started_at": self.started_at.isoformat() if self.started_at else None, - "completed_at": self.completed_at.isoformat() if self.completed_at else None, - "error_message": self.error_message, - "files": [ - { - "filename": f.filename, - "duration_seconds": f.duration_seconds, - "size_bytes": f.size_bytes, - "status": f.status.value, - "client_id": f.client_id, - "audio_uuid": f.audio_uuid, - "transcription_status": f.transcription_status, - "memory_status": f.memory_status, - "error_message": f.error_message, - "started_at": f.started_at.isoformat() if f.started_at else None, - "completed_at": f.completed_at.isoformat() if f.completed_at else None, - } - for f in self.files - ], - } - - -class JobTracker: - """In-memory job tracking system.""" - - def __init__(self): - self.jobs: Dict[str, ProcessingJob] = {} - self._lock = asyncio.Lock() - - # Start cleanup task - self._cleanup_task = None - self._start_cleanup_task() - - def _start_cleanup_task(self): - """Start background task to clean up old jobs.""" - if self._cleanup_task is None or self._cleanup_task.done(): - self._cleanup_task = asyncio.create_task(self._cleanup_old_jobs()) - - async def _cleanup_old_jobs(self): - """Remove jobs older than 1 hour to prevent memory leaks.""" - while True: - try: - await asyncio.sleep(3600) # Check every hour - cutoff_time = datetime.now(timezone.utc).timestamp() - 3600 # 1 hour ago - - async with self._lock: - jobs_to_remove = [] - for job_id, job in 
self.jobs.items(): - job_age = job.created_at.timestamp() - if job_age < cutoff_time and job.status in [ - JobStatus.COMPLETED, - JobStatus.FAILED, - JobStatus.CANCELLED, - ]: - jobs_to_remove.append(job_id) - - for job_id in jobs_to_remove: - del self.jobs[job_id] - logger.info(f"Cleaned up old job: {job_id}") - - except Exception as e: - logger.error(f"Error in job cleanup task: {e}") - - async def create_job(self, user_id: str, device_name: str, files: List[str]) -> str: - """Create a new processing job.""" - job_id = str(uuid.uuid4()) - - file_infos = [] - for filename in files: - file_infos.append(FileProcessingInfo(filename=filename)) - - job = ProcessingJob( - job_id=job_id, user_id=user_id, device_name=device_name, files=file_infos - ) - - async with self._lock: - self.jobs[job_id] = job - - logger.info(f"Created job {job_id} with {len(files)} files for user {user_id}") - return job_id - - async def get_job(self, job_id: str) -> Optional[ProcessingJob]: - """Get job by ID.""" - async with self._lock: - return self.jobs.get(job_id) - - async def update_job_status(self, job_id: str, status: JobStatus, error_message: str = None): - """Update job status.""" - async with self._lock: - if job_id in self.jobs: - job = self.jobs[job_id] - job.status = status - if error_message: - job.error_message = error_message - - if status == JobStatus.PROCESSING and job.started_at is None: - job.started_at = datetime.now(timezone.utc) - elif status in [JobStatus.COMPLETED, JobStatus.FAILED]: - job.completed_at = datetime.now(timezone.utc) - - async def update_file_status( - self, - job_id: str, - filename: str, - status: FileStatus, - client_id: str = None, - audio_uuid: str = None, - transcription_status: str = None, - memory_status: str = None, - error_message: str = None, - ): - """Update status of a specific file in the job.""" - async with self._lock: - if job_id in self.jobs: - job = self.jobs[job_id] - for file_info in job.files: - if file_info.filename == filename: - 
file_info.status = status - if client_id: - file_info.client_id = client_id - if audio_uuid: - file_info.audio_uuid = audio_uuid - if transcription_status: - file_info.transcription_status = transcription_status - if memory_status: - file_info.memory_status = memory_status - if error_message: - file_info.error_message = error_message - - if status == FileStatus.PROCESSING and file_info.started_at is None: - file_info.started_at = datetime.now(timezone.utc) - elif status in [ - FileStatus.COMPLETED, - FileStatus.FAILED, - FileStatus.SKIPPED, - ]: - file_info.completed_at = datetime.now(timezone.utc) - break - - async def set_current_file(self, job_id: str, filename: str): - """Set the currently processing file.""" - async with self._lock: - if job_id in self.jobs: - job = self.jobs[job_id] - for i, file_info in enumerate(job.files): - if file_info.filename == filename: - job.current_file_index = i - break - - async def get_active_jobs(self) -> List[ProcessingJob]: - """Get all active (non-completed) jobs.""" - async with self._lock: - return [ - job - for job in self.jobs.values() - if job.status in [JobStatus.QUEUED, JobStatus.PROCESSING] - ] - - -# Global job tracker instance -_job_tracker: Optional[JobTracker] = None - - -def get_job_tracker() -> JobTracker: - """Get the global job tracker instance.""" - global _job_tracker - if _job_tracker is None: - _job_tracker = JobTracker() - return _job_tracker diff --git a/backends/advanced/src/advanced_omi_backend/llm_client.py b/backends/advanced/src/advanced_omi_backend/llm_client.py index 03c15db0..21ee3331 100644 --- a/backends/advanced/src/advanced_omi_backend/llm_client.py +++ b/backends/advanced/src/advanced_omi_backend/llm_client.py @@ -57,12 +57,25 @@ def __init__( if not self.api_key or not self.base_url or not self.model: raise ValueError("OPENAI_API_KEY, OPENAI_BASE_URL, and OPENAI_MODEL must be set") - # Initialize OpenAI client + # Initialize OpenAI client with optional Langfuse tracing try: - import 
langfuse.openai as openai + # Check if Langfuse is configured + langfuse_enabled = ( + os.getenv("LANGFUSE_PUBLIC_KEY") + and os.getenv("LANGFUSE_SECRET_KEY") + and os.getenv("LANGFUSE_HOST") + ) - self.client = openai.OpenAI(api_key=self.api_key, base_url=self.base_url) - self.logger.info(f"OpenAI client initialized with base_url: {self.base_url}") + if langfuse_enabled: + # Use Langfuse-wrapped OpenAI for tracing + import langfuse.openai as openai + self.client = openai.OpenAI(api_key=self.api_key, base_url=self.base_url) + self.logger.info(f"OpenAI client initialized with Langfuse tracing, base_url: {self.base_url}") + else: + # Use regular OpenAI client without tracing + from openai import OpenAI + self.client = OpenAI(api_key=self.api_key, base_url=self.base_url) + self.logger.info(f"OpenAI client initialized (no tracing), base_url: {self.base_url}") except ImportError: self.logger.error("OpenAI library not installed. Install with: pip install openai") raise diff --git a/backends/advanced/src/advanced_omi_backend/main.py b/backends/advanced/src/advanced_omi_backend/main.py index 1eaafabe..df51e1cc 100644 --- a/backends/advanced/src/advanced_omi_backend/main.py +++ b/backends/advanced/src/advanced_omi_backend/main.py @@ -1,1285 +1,49 @@ #!/usr/bin/env python3 -"""Unified Omi-audio service - -* Accepts Opus packets over a WebSocket (`/ws`) or PCM over a WebSocket (`/ws_pcm`). -* Uses a central queue to decouple audio ingestion from processing. -* A saver consumer buffers PCM and writes 30-second WAV chunks to `./data/audio_chunks/`. -* A transcription consumer sends each chunk to a Wyoming ASR service. -* The transcript is stored in **mem0** and MongoDB. 
- """ -import logging - -logging.basicConfig(level=logging.INFO) - -import asyncio -import concurrent.futures -import json -import os -import time -import uuid -from contextlib import asynccontextmanager -from functools import partial -from pathlib import Path -from typing import Optional - -import aiohttp - -# Import authentication components -from advanced_omi_backend.auth import ( - bearer_backend, - cookie_backend, - create_admin_user_if_needed, - fastapi_users, - websocket_auth, -) -from advanced_omi_backend.client import ClientState -from advanced_omi_backend.client_manager import generate_client_id -from advanced_omi_backend.constants import ( - OMI_CHANNELS, - OMI_SAMPLE_RATE, - OMI_SAMPLE_WIDTH, -) -from advanced_omi_backend.database import AudioChunksRepository -from advanced_omi_backend.llm_client import async_health_check -from advanced_omi_backend.memory import get_memory_service, shutdown_memory_service -from advanced_omi_backend.processors import ( - AudioProcessingItem, - get_processor_manager, - init_processor_manager, -) -from advanced_omi_backend.audio_utils import process_audio_chunk -from advanced_omi_backend.task_manager import init_task_manager, get_task_manager -from advanced_omi_backend.transcript_coordinator import get_transcript_coordinator -from advanced_omi_backend.transcription_providers import get_transcription_provider -from advanced_omi_backend.users import ( - User, - UserRead, - UserUpdate, - register_client_to_user, -) - -# Import Beanie for user management -from beanie import init_beanie -from dotenv import load_dotenv -from fastapi import ( - FastAPI, - HTTPException, - Query, - Request, - WebSocket, - WebSocketDisconnect, -) -from fastapi.middleware.cors import CORSMiddleware -from fastapi.responses import JSONResponse -from fastapi.staticfiles import StaticFiles -from friend_lite.decoder import OmiOpusDecoder -from motor.motor_asyncio import AsyncIOMotorClient -from pymongo.errors import ConnectionFailure, PyMongoError -from 
wyoming.audio import AudioChunk -from wyoming.client import AsyncTcpClient +Unified Omi-audio service + + * Accepts Opus packets over a WebSocket (`/ws`) or PCM over a WebSocket (`/ws_pcm`). + * Uses a central queue to decouple audio ingestion from processing. + * A saver consumer buffers PCM and writes 30-second WAV chunks to `./data/audio_chunks/`. + * A transcription consumer sends each chunk to a Wyoming ASR service. + * The transcript is stored in **mem0** and MongoDB. + +Refactored to use a modular architecture with proper separation of concerns: +- app_factory.py: FastAPI application creation and configuration +- app_config.py: Centralized configuration management +- middleware/app_middleware.py: CORS and exception handling +- routers/modules/: Organized route handlers +""" -############################################################################### -# SETUP -############################################################################### +import logging +import uvicorn -# Load environment variables first -load_dotenv() +from advanced_omi_backend.app_factory import create_app # Logging setup +logging.basicConfig(level=logging.INFO) logger = logging.getLogger("advanced-backend") -application_logger = logging.getLogger("audio_processing") - -# Conditional Deepgram import -try: - from deepgram import DeepgramClient, FileSource, PrerecordedOptions # type: ignore -except ImportError: - logger.warning("Deepgram SDK not available. 
Install with: uv sync --group deepgram") - -############################################################################### -# CONFIGURATION -############################################################################### - -# MongoDB Configuration -MONGODB_URI = os.getenv("MONGODB_URI", "mongodb://mongo:27017") -mongo_client = AsyncIOMotorClient(MONGODB_URI) -db = mongo_client.get_default_database("friend-lite") -chunks_col = db["audio_chunks"] -users_col = db["users"] -speakers_col = db["speakers"] - - -# Audio Configuration -SEGMENT_SECONDS = 60 # length of each stored chunk -TARGET_SAMPLES = OMI_SAMPLE_RATE * SEGMENT_SECONDS - -# Conversation timeout configuration -NEW_CONVERSATION_TIMEOUT_MINUTES = float(os.getenv("NEW_CONVERSATION_TIMEOUT_MINUTES", "1.5")) - -# Audio cropping configuration -AUDIO_CROPPING_ENABLED = os.getenv("AUDIO_CROPPING_ENABLED", "true").lower() == "true" -MIN_SPEECH_SEGMENT_DURATION = float(os.getenv("MIN_SPEECH_SEGMENT_DURATION", "1.0")) # seconds -CROPPING_CONTEXT_PADDING = float( - os.getenv("CROPPING_CONTEXT_PADDING", "0.1") -) # seconds of padding around speech - -# Directory where WAV chunks are written -CHUNK_DIR = Path("./audio_chunks") # This will be mounted to ./data/audio_chunks by Docker -CHUNK_DIR.mkdir(parents=True, exist_ok=True) - - -# Transcription Configuration -TRANSCRIPTION_PROVIDER = os.getenv("TRANSCRIPTION_PROVIDER") -DEEPGRAM_API_KEY = os.getenv("DEEPGRAM_API_KEY") -MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY") - -# Get configured transcription provider (online or offline) -transcription_provider = get_transcription_provider(TRANSCRIPTION_PROVIDER) -if transcription_provider: - logger.info( - f"✅ Using {transcription_provider.name} transcription provider ({transcription_provider.mode})" - ) -else: - logger.warning("⚠️ No transcription provider configured - speech-to-text will not be available") - -# Ollama & Qdrant Configuration -QDRANT_BASE_URL = os.getenv("QDRANT_BASE_URL", "qdrant") -QDRANT_PORT = 
os.getenv("QDRANT_PORT", "6333") - -# Speaker service configuration - -# Track pending WebSocket connections to prevent race conditions -pending_connections: set[str] = set() - -# Thread pool executors -_DEC_IO_EXECUTOR = concurrent.futures.ThreadPoolExecutor( - max_workers=os.cpu_count() or 4, - thread_name_prefix="opus_io", -) - -# Initialize memory service -memory_service = get_memory_service() - -############################################################################### -# UTILITY FUNCTIONS & HELPER CLASSES -############################################################################### - - -async def parse_wyoming_protocol(ws: WebSocket) -> tuple[dict, Optional[bytes]]: - """Parse Wyoming protocol: JSON header line followed by optional binary payload. - - Returns: - Tuple of (header_dict, payload_bytes or None) - """ - # Read data from WebSocket - logger.debug(f"parse_wyoming_protocol: About to call ws.receive()") - message = await ws.receive() - logger.debug(f"parse_wyoming_protocol: Received message with keys: {message.keys() if message else 'None'}") - - # Handle WebSocket close frame - if "type" in message and message["type"] == "websocket.disconnect": - # This is a normal WebSocket close event - code = message.get("code", 1000) - reason = message.get("reason", "") - logger.info(f"📴 WebSocket disconnect received in parse_wyoming_protocol. 
Code: {code}, Reason: {reason}") - raise WebSocketDisconnect(code=code, reason=reason) - - # Handle text message (JSON header) - if "text" in message: - header_text = message["text"] - # Wyoming protocol uses newline-terminated JSON - if not header_text.endswith("\n"): - header_text += "\n" - - # Parse JSON header - json_line = header_text.strip() - header = json.loads(json_line) - - # If payload is expected, read binary data - payload = None - payload_length = header.get("payload_length") - if payload_length is not None and payload_length > 0: - payload_msg = await ws.receive() - if "bytes" in payload_msg: - payload = payload_msg["bytes"] - else: - logger.warning(f"Expected binary payload but got: {payload_msg.keys()}") - - return header, payload - - # Handle binary message (invalid - Wyoming protocol requires JSONL headers) - elif "bytes" in message: - raise ValueError( - "Raw binary messages not supported - Wyoming protocol requires JSONL headers" - ) - - else: - raise ValueError(f"Unexpected WebSocket message type: {message.keys()}") - - -# Initialize repository and global state -ac_repository = AudioChunksRepository(chunks_col) -# Client-to-user mapping for reliable permission checking -client_to_user_mapping: dict[str, str] = {} # client_id -> user_id - -# Initialize client manager (self-initializing, no external dict needed) -from advanced_omi_backend.client_manager import get_client_manager - -client_manager = get_client_manager() - -# Initialize client utilities with the mapping dictionaries -from advanced_omi_backend.client_manager import ( - init_client_user_mapping, - register_client_user_mapping, - track_client_user_relationship, - unregister_client_user_mapping, -) - -# Client ownership tracking for database records -# Since we're in development, we'll track all client-user relationships in memory -# This will be populated when clients connect and persisted in database records -all_client_user_mappings: dict[str, str] = ( - {} -) # client_id -> 
user_id (includes disconnected clients) - -# Initialize client user mapping with both dictionaries -init_client_user_mapping(client_to_user_mapping, all_client_user_mappings) - - -async def create_client_state( - client_id: str, user: User, device_name: Optional[str] = None -) -> ClientState: - """Create and register a new client state.""" - # Use ClientManager for atomic client creation and registration - client_state = client_manager.create_client( - client_id, ac_repository, CHUNK_DIR, user.user_id, user.email - ) - - # Also track in persistent mapping (for database queries) - track_client_user_relationship(client_id, user.user_id) - - # Register client in user model (persistent) - await register_client_to_user(user, client_id, device_name) - - # Note: No need to start processing - it's handled at application level now - - return client_state - - -async def cleanup_client_state(client_id: str): - """Clean up and remove client state.""" - # Use ClientManager for atomic client removal with cleanup - removed = await client_manager.remove_client_with_cleanup(client_id) - - if removed: - # Clean up any orphaned transcript events for this client - coordinator = get_transcript_coordinator() - coordinator.cleanup_transcript_events_for_client(client_id) - - logger.info(f"Client {client_id} cleaned up successfully") - else: - logger.warning(f"Client {client_id} was not found for cleanup") - - -############################################################################### -# CORE APPLICATION LOGIC -############################################################################### - - -@asynccontextmanager -async def lifespan(app: FastAPI): - """Manage application lifespan events.""" - # Startup - application_logger.info("Starting application...") - - # Initialize Beanie for user management - try: - await init_beanie( - database=mongo_client.get_default_database("friend-lite"), - document_models=[User], - ) - application_logger.info("Beanie initialized for user management") - 
except Exception as e: - application_logger.error(f"Failed to initialize Beanie: {e}") - raise - - # Create admin user if needed - try: - await create_admin_user_if_needed() - except Exception as e: - application_logger.error(f"Failed to create admin user: {e}") - # Don't raise here as this is not critical for startup - - # Initialize task manager - task_manager = init_task_manager() - await task_manager.start() - application_logger.info("Task manager started") - - # Initialize processor manager - processor_manager = init_processor_manager(CHUNK_DIR, ac_repository) - await processor_manager.start() - - logger.info("App ready") - try: - yield - finally: - # Shutdown - application_logger.info("Shutting down application...") - - # Clean up all active clients - for client_id in client_manager.get_all_client_ids(): - await cleanup_client_state(client_id) - - # Shutdown processor manager - processor_manager = get_processor_manager() - await processor_manager.shutdown() - application_logger.info("Processor manager shut down") - - # Shutdown task manager - task_manager = get_task_manager() - await task_manager.shutdown() - application_logger.info("Task manager shut down") - - # Stop metrics collection and save final report - application_logger.info("Metrics collection stopped") - - # Shutdown memory service and speaker service - shutdown_memory_service() - application_logger.info("Memory and speaker services shut down.") - - application_logger.info("Shutdown complete.") - - -# FastAPI Application -app = FastAPI(lifespan=lifespan) - -# Configure CORS with configurable origins (includes Tailscale support by default) -default_origins = "http://localhost:3000,http://localhost:3001,http://127.0.0.1:3000,http://127.0.0.1:3002" -cors_origins = os.getenv("CORS_ORIGINS", default_origins) -allowed_origins = [origin.strip() for origin in cors_origins.split(",") if origin.strip()] - -# Support Tailscale IP range (100.x.x.x) via regex pattern -tailscale_regex = 
r"http://100\.\d{1,3}\.\d{1,3}\.\d{1,3}:3000" - -logger.info(f"🌐 CORS configured with origins: {allowed_origins}") -logger.info(f"🌐 CORS also allows Tailscale IPs via regex: {tailscale_regex}") - -app.add_middleware( - CORSMiddleware, - allow_origins=allowed_origins, - allow_origin_regex=tailscale_regex, - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], -) - - -############################################################################### -# GLOBAL EXCEPTION HANDLERS -############################################################################### - -@app.exception_handler(ConnectionFailure) -@app.exception_handler(PyMongoError) -async def database_exception_handler(request: Request, exc: Exception): - """Handle database connection failures and return structured error response.""" - logger.error(f"Database connection error: {type(exc).__name__}: {exc}") - return JSONResponse( - status_code=500, - content={ - "detail": "Unable to connect to server. Please check your connection and try again.", - "error_type": "connection_failure", - "error_category": "database" - } - ) - - -@app.exception_handler(ConnectionError) -async def connection_exception_handler(request: Request, exc: ConnectionError): - """Handle general connection errors and return structured error response.""" - logger.error(f"Connection error: {exc}") - return JSONResponse( - status_code=500, - content={ - "detail": "Unable to connect to server. 
Please check your connection and try again.", - "error_type": "connection_failure", - "error_category": "network" - } - ) - - -@app.exception_handler(HTTPException) -async def http_exception_handler(request: Request, exc: HTTPException): - """Handle HTTP exceptions with structured error response.""" - # For authentication failures (401), add error_type - if exc.status_code == 401: - return JSONResponse( - status_code=exc.status_code, - content={ - "detail": exc.detail, - "error_type": "authentication_failure" - }, - headers=getattr(exc, "headers", None), - ) - - # For other HTTP exceptions, return as-is - return JSONResponse( - status_code=exc.status_code, - content={"detail": exc.detail}, - headers=getattr(exc, "headers", None), - ) - - -############################################################################### -# HEALTH CHECK ENDPOINTS -############################################################################### - -@app.get("/api/auth/health") -async def auth_health_check(): - """Pre-flight health check for authentication service connectivity.""" - try: - # Test database connectivity - await mongo_client.admin.command("ping") - - # Test memory service if available - if memory_service: - try: - await asyncio.wait_for(memory_service.test_connection(), timeout=2.0) - memory_status = "ok" - except Exception as e: - logger.warning(f"Memory service health check failed: {e}") - memory_status = "degraded" - else: - memory_status = "unavailable" - - return { - "status": "ok", - "database": "ok", - "memory_service": memory_status, - "timestamp": int(time.time()) - } - except Exception as e: - logger.error(f"Auth health check failed: {e}") - return JSONResponse( - status_code=500, - content={ - "status": "error", - "detail": "Service connectivity check failed", - "error_type": "connection_failure", - "timestamp": int(time.time()) - } - ) - - -app.mount("/audio", StaticFiles(directory=CHUNK_DIR), name="audio") - -# Add authentication routers -app.include_router( - 
fastapi_users.get_auth_router(cookie_backend), - prefix="/auth/cookie", - tags=["auth"], -) -app.include_router( - fastapi_users.get_auth_router(bearer_backend), - prefix="/auth/jwt", - tags=["auth"], -) - -# Add users router for /users/me and other user endpoints -app.include_router( - fastapi_users.get_users_router(UserRead, UserUpdate), - prefix="/users", - tags=["users"], -) - -# API endpoints -from advanced_omi_backend.routers.api_router import router as api_router - -app.include_router(api_router) - - -@app.websocket("/ws_omi") -async def ws_endpoint_omi( - ws: WebSocket, - token: Optional[str] = Query(None), - device_name: Optional[str] = Query(None), -): - """Accepts WebSocket connections with Wyoming protocol, decodes OMI Opus audio, and processes per-client.""" - # Generate pending client_id to track connection even if auth fails - pending_client_id = f"pending_{uuid.uuid4()}" - pending_connections.add(pending_client_id) - - client_id = None - client_state = None - - try: - # Authenticate user before accepting WebSocket connection - user = await websocket_auth(ws, token) - if not user: - await ws.close(code=1008, reason="Authentication required") - return - - await ws.accept() - - # Generate proper client_id using user and device_name - client_id = generate_client_id(user, device_name) - - # Remove from pending now that we have real client_id - pending_connections.discard(pending_client_id) - application_logger.info( - f"🔌 WebSocket connection accepted - User: {user.user_id} ({user.email}), Client: {client_id}" - ) - - # Create client state - client_state = await create_client_state(client_id, user, device_name) - - # Setup decoder (only required for decoding OMI audio) - decoder = OmiOpusDecoder() - _decode_packet = partial(decoder.decode_packet, strip_header=False) - - # Get processor manager - processor_manager = get_processor_manager() - - packet_count = 0 - total_bytes = 0 - - while True: - # Parse Wyoming protocol - header, payload = await 
parse_wyoming_protocol(ws) - - if header["type"] == "audio-start": - # Handle audio session start (optional for OMI devices) - application_logger.info( - f"🎙️ OMI audio session started for {client_id} (explicit start)" - ) - - elif header["type"] == "audio-chunk" and payload: - packet_count += 1 - total_bytes += len(payload) - - # OMI devices stream continuously - always process audio chunks - if packet_count <= 5 or packet_count % 1000 == 0: # Log first 5 and every 1000th - application_logger.info( - f"🎵 Received OMI audio chunk #{packet_count}: {len(payload)} bytes" - ) - - # Decode Opus payload to PCM using OMI decoder - start_time = time.time() - loop = asyncio.get_running_loop() - pcm_data = await loop.run_in_executor(_DEC_IO_EXECUTOR, _decode_packet, payload) - decode_time = time.time() - start_time - - if pcm_data: - if packet_count <= 5 or packet_count % 1000 == 0: # Log first 5 and every 1000th - application_logger.debug( - f"🎵 Decoded OMI packet #{packet_count}: {len(payload)} bytes -> {len(pcm_data)} PCM bytes (took {decode_time:.3f}s)" - ) - - # Use timestamp from Wyoming header if provided, otherwise current time - audio_data = header.get("data", {}) - chunk_timestamp = audio_data.get("timestamp", int(time.time())) - - # Queue to application-level processor - if packet_count <= 5 or packet_count % 100 == 0: # Log first 5 and every 100th - application_logger.debug( - f"🚀 About to queue audio chunk #{packet_count} for client {client_id}" - ) - - # Process audio chunk through unified pipeline - await process_audio_chunk( - audio_data=pcm_data, - client_id=client_id, - user_id=user.user_id, - user_email=user.email, - audio_format={ - "rate": OMI_SAMPLE_RATE, - "width": OMI_SAMPLE_WIDTH, - "channels": OMI_CHANNELS, - "timestamp": chunk_timestamp, - }, - client_state=client_state, - ) - - # Log every 1000th packet to avoid spam - if packet_count % 1000 == 0: - application_logger.info( - f"📊 Processed {packet_count} OMI packets ({total_bytes} bytes total) for 
client {client_id}" - ) - else: - # Log decode failures for first 5 packets - if packet_count <= 5: - application_logger.warning( - f"❌ Failed to decode OMI packet #{packet_count}: {len(payload)} bytes" - ) - - elif header["type"] == "audio-stop": - # Handle audio session stop - application_logger.info( - f"🛑 OMI audio session stopped for {client_id} - " - f"Total chunks: {packet_count}, Total bytes: {total_bytes}" - ) - - # Signal end of audio stream to processor - await processor_manager.close_client_audio(client_id) - - # Close current conversation to trigger memory processing - if client_state: - application_logger.info( - f"📝 Closing conversation for {client_id} on audio-stop" - ) - await client_state.close_current_conversation() - - # Reset counters for next session - packet_count = 0 - total_bytes = 0 - - else: - # Unknown event type - application_logger.debug( - f"Ignoring Wyoming event type '{header['type']}' for OMI client {client_id}" - ) - - except WebSocketDisconnect: - application_logger.info( - f"🔌 WebSocket disconnected - Client: {client_id}, Packets: {packet_count}, Total bytes: {total_bytes}" - ) - except Exception as e: - application_logger.error(f"❌ WebSocket error for client {client_id}: {e}", exc_info=True) - finally: - # Clean up pending connection tracking - pending_connections.discard(pending_client_id) - - # Ensure cleanup happens even if client_id is None - if client_id: - try: - # Signal end of audio stream to processor - processor_manager = get_processor_manager() - await processor_manager.close_client_audio(client_id) - - # Clean up client state - await cleanup_client_state(client_id) - except Exception as cleanup_error: - application_logger.error( - f"Error during cleanup for client {client_id}: {cleanup_error}", exc_info=True - ) - - -@app.websocket("/ws_pcm") -async def ws_endpoint_pcm( - ws: WebSocket, token: Optional[str] = Query(None), device_name: Optional[str] = Query(None) -): - """Accepts WebSocket connections, processes PCM 
audio per-client.""" - # Generate pending client_id to track connection even if auth fails - pending_client_id = f"pending_{uuid.uuid4()}" - pending_connections.add(pending_client_id) - - client_id = None - client_state = None - - try: - # Authenticate user before accepting WebSocket connection - user = await websocket_auth(ws, token) - if not user: - await ws.close(code=1008, reason="Authentication required") - return - - # Accept WebSocket AFTER authentication succeeds (fixes race condition) - await ws.accept() - - # Generate proper client_id using user and device_name - client_id = generate_client_id(user, device_name) - - # Remove from pending now that we have real client_id - pending_connections.discard(pending_client_id) - application_logger.info( - f"🔌 PCM WebSocket connection accepted - User: {user.user_id} ({user.email}), Client: {client_id}" - ) - - # Send ready message to client (similar to speaker recognition service) - try: - ready_msg = json.dumps({"type": "ready", "message": "WebSocket connection established"}) + "\n" - await ws.send_text(ready_msg) - application_logger.debug(f"✅ Sent ready message to {client_id}") - except Exception as e: - application_logger.error(f"Failed to send ready message to {client_id}: {e}") - - # Create client state - client_state = await create_client_state(client_id, user, device_name) - - # Get processor manager - processor_manager = get_processor_manager() - - packet_count = 0 - total_bytes = 0 - audio_streaming = False # Track if audio session is active - - while True: - try: - if not audio_streaming: - # Control message mode - parse Wyoming protocol - application_logger.debug(f"🔄 Control mode for {client_id}, WebSocket state: {ws.client_state if hasattr(ws, 'client_state') else 'unknown'}") - application_logger.debug(f"📨 About to receive control message for {client_id}") - header, payload = await parse_wyoming_protocol(ws) - application_logger.debug(f"✅ Received message type: {header.get('type')} for {client_id}") - 
- if header["type"] == "audio-start": - application_logger.debug(f"🎙️ Processing audio-start for {client_id}") - # Handle audio session start - audio_streaming = True - audio_format = header.get("data", {}) - application_logger.info( - f"🎙️ Audio session started for {client_id} - " - f"Format: {audio_format.get('rate')}Hz, " - f"{audio_format.get('width')}bytes, " - f"{audio_format.get('channels')}ch" - ) - - # Create transcription manager early for this client - processor_manager = get_processor_manager() - try: - application_logger.debug(f"📋 Creating transcription manager for {client_id}") - await processor_manager.ensure_transcription_manager(client_id) - application_logger.info( - f"🔌 Created transcription manager for {client_id} on audio-start" - ) - except Exception as tm_error: - application_logger.error( - f"❌ Failed to create transcription manager for {client_id}: {tm_error}", exc_info=True - ) - - application_logger.info(f"🎵 Switching to audio streaming mode for {client_id}") - continue # Continue to audio streaming mode - - elif header["type"] == "ping": - # Handle keepalive ping from frontend - application_logger.debug(f"🏓 Received ping from {client_id}") - continue - - else: - # Unknown control message type - application_logger.debug( - f"Ignoring Wyoming control event type '{header['type']}' for {client_id}" - ) - continue - - else: - # Audio streaming mode - receive raw bytes (like speaker recognition) - application_logger.debug(f"🎵 Audio streaming mode for {client_id} - waiting for audio data") - - try: - # Receive raw audio bytes or check for control messages - message = await ws.receive() - - - # Check if it's a disconnect - if "type" in message and message["type"] == "websocket.disconnect": - code = message.get("code", 1000) - reason = message.get("reason", "") - application_logger.info(f"🔌 WebSocket disconnect during audio streaming for {client_id}. 
Code: {code}, Reason: {reason}") - break - - # Check if it's a text message (control message like audio-stop) - if "text" in message: - try: - control_header = json.loads(message["text"].strip()) - if control_header.get("type") == "audio-stop": - application_logger.info(f"🛑 Audio session stopped for {client_id}") - audio_streaming = False - - # Signal end of audio stream to processor - await processor_manager.close_client_audio(client_id) - - # Close current conversation to trigger memory processing - if client_state: - application_logger.info(f"📝 Closing conversation for {client_id} on audio-stop") - await client_state.close_current_conversation() - - # Reset counters for next session - packet_count = 0 - total_bytes = 0 - continue - elif control_header.get("type") == "ping": - application_logger.debug(f"🏓 Received ping during streaming from {client_id}") - continue - elif control_header.get("type") == "audio-start": - # Handle duplicate audio-start messages gracefully (idempotent behavior) - application_logger.info(f"🔄 Ignoring duplicate audio-start message during streaming for {client_id}") - continue - elif control_header.get("type") == "audio-chunk": - # Handle Wyoming protocol audio-chunk with binary payload - payload_length = control_header.get("payload_length") - if payload_length and payload_length > 0: - # Receive the binary audio data - payload_msg = await ws.receive() - if "bytes" in payload_msg: - audio_data = payload_msg["bytes"] - packet_count += 1 - total_bytes += len(audio_data) - - application_logger.debug(f"🎵 Received audio chunk #{packet_count}: {len(audio_data)} bytes") - - # Process audio chunk through unified pipeline - audio_format = control_header.get("data", {}) - await process_audio_chunk( - audio_data=audio_data, - client_id=client_id, - user_id=user.user_id, - user_email=user.email, - audio_format=audio_format, - client_state=None, # No client state update needed for Wyoming protocol - ) - else: - application_logger.warning(f"Expected 
binary payload for audio-chunk, got: {payload_msg.keys()}") - else: - application_logger.warning(f"audio-chunk missing payload_length: {payload_length}") - continue - else: - application_logger.warning(f"Unknown control message during streaming: {control_header.get('type')}") - continue - except json.JSONDecodeError: - application_logger.warning(f"Invalid control message during streaming for {client_id}") - continue - - # Check if it's binary data (raw audio without Wyoming protocol) - elif "bytes" in message: - # Raw binary audio data (legacy support) - audio_data = message["bytes"] - packet_count += 1 - total_bytes += len(audio_data) - - application_logger.debug(f"🎵 Received raw audio chunk #{packet_count}: {len(audio_data)} bytes") - - # Process raw audio chunk through unified pipeline (assume PCM 16kHz mono) - await process_audio_chunk( - audio_data=audio_data, - client_id=client_id, - user_id=user.user_id, - user_email=user.email, - audio_format={ - "rate": 16000, - "width": 2, - "channels": 1, - "timestamp": int(time.time()), - }, - client_state=None, # No client state update needed for raw streaming - ) - - else: - application_logger.warning(f"Unexpected message format in streaming mode: {message.keys()}") - continue - - except Exception as streaming_error: - application_logger.error(f"Error in audio streaming mode: {streaming_error}") - if "disconnect" in str(streaming_error).lower(): - break - continue - - # This section is now handled in the streaming mode above - - except WebSocketDisconnect as e: - application_logger.info( - f"🔌 WebSocket disconnected during message processing for {client_id}. 
" - f"Code: {e.code}, Reason: {e.reason}" - ) - break # Exit the loop on disconnect - except json.JSONDecodeError as e: - application_logger.error( - f"❌ JSON decode error in Wyoming protocol for {client_id}: {e}" - ) - continue # Skip this message but don't disconnect - except ValueError as e: - application_logger.error( - f"❌ Protocol error for {client_id}: {e}" - ) - continue # Skip this message but don't disconnect - except RuntimeError as e: - # Handle "Cannot call receive once a disconnect message has been received" - if "disconnect" in str(e).lower(): - application_logger.info( - f"🔌 WebSocket already disconnected for {client_id}: {e}" - ) - break # Exit the loop on disconnect - else: - application_logger.error( - f"❌ Runtime error for {client_id}: {e}", exc_info=True - ) - continue - except Exception as e: - application_logger.error( - f"❌ Unexpected error processing message for {client_id}: {e}", exc_info=True - ) - # Check if it's a connection-related error - error_msg = str(e).lower() - if "disconnect" in error_msg or "closed" in error_msg or "receive" in error_msg: - application_logger.info( - f"🔌 Connection issue detected for {client_id}, exiting loop" - ) - break - else: - continue # Skip this message for other errors - - except WebSocketDisconnect: - application_logger.info( - f"🔌 PCM WebSocket disconnected - Client: {client_id}, Packets: {packet_count}, Total bytes: {total_bytes}" - ) - except Exception as e: - application_logger.error( - f"❌ PCM WebSocket error for client {client_id}: {e}", exc_info=True - ) - finally: - # Clean up pending connection tracking - pending_connections.discard(pending_client_id) - - # Ensure cleanup happens even if client_id is None - if client_id: - try: - # Signal end of audio stream to processor - processor_manager = get_processor_manager() - await processor_manager.close_client_audio(client_id) - - # Clean up client state - await cleanup_client_state(client_id) - except Exception as cleanup_error: - 
application_logger.error( - f"Error during cleanup for client {client_id}: {cleanup_error}", exc_info=True - ) - - -@app.get("/health") -async def health_check(): - """Comprehensive health check for all services.""" - health_status = { - "status": "healthy", - "timestamp": int(time.time()), - "services": {}, - "config": { - "mongodb_uri": MONGODB_URI, - "qdrant_url": f"http://{QDRANT_BASE_URL}:{QDRANT_PORT}", - "transcription_service": ( - f"Speech to Text ({transcription_provider.name})" - if transcription_provider - else "Speech to Text (Not Configured)" - ), - "asr_uri": ( - f"{transcription_provider.mode.upper()} ({transcription_provider.name})" - if transcription_provider - else "Not configured" - ), - "transcription_provider": TRANSCRIPTION_PROVIDER or "auto-detect", - "provider_type": ( - transcription_provider.mode if transcription_provider else "none" - ), - "chunk_dir": str(CHUNK_DIR), - "active_clients": client_manager.get_client_count(), - "new_conversation_timeout_minutes": NEW_CONVERSATION_TIMEOUT_MINUTES, - "audio_cropping_enabled": AUDIO_CROPPING_ENABLED, - "llm_provider": os.getenv("LLM_PROVIDER"), - "llm_model": os.getenv("OPENAI_MODEL"), - "llm_base_url": os.getenv("OPENAI_BASE_URL"), - }, - } - - overall_healthy = True - critical_services_healthy = True - - # Get configuration once at the start - memory_provider = os.getenv("MEMORY_PROVIDER", "friend_lite") - speaker_service_url = os.getenv("SPEAKER_SERVICE_URL") - openmemory_mcp_url = os.getenv("OPENMEMORY_MCP_URL") - - # Check MongoDB (critical service) - try: - await asyncio.wait_for(mongo_client.admin.command("ping"), timeout=5.0) - health_status["services"]["mongodb"] = { - "status": "✅ Connected", - "healthy": True, - "critical": True, - } - except asyncio.TimeoutError: - health_status["services"]["mongodb"] = { - "status": "❌ Connection Timeout (5s)", - "healthy": False, - "critical": True, - } - overall_healthy = False - critical_services_healthy = False - except Exception as e: - 
health_status["services"]["mongodb"] = { - "status": f"❌ Connection Failed: {str(e)}", - "healthy": False, - "critical": True, - } - overall_healthy = False - critical_services_healthy = False - - # Check LLM service (non-critical service - may not be running) - try: - - llm_health = await asyncio.wait_for(async_health_check(), timeout=8.0) - health_status["services"]["audioai"] = { - "status": llm_health.get("status", "❌ Unknown"), - "healthy": "✅" in llm_health.get("status", ""), - "base_url": llm_health.get("base_url", ""), - "model": llm_health.get("default_model", ""), - "provider": os.getenv("LLM_PROVIDER", "openai"), - "critical": False, - } - except asyncio.TimeoutError: - health_status["services"]["audioai"] = { - "status": "⚠️ Connection Timeout (8s) - Service may not be running", - "healthy": False, - "provider": os.getenv("LLM_PROVIDER", "openai"), - "critical": False, - } - overall_healthy = False - except Exception as e: - health_status["services"]["audioai"] = { - "status": f"⚠️ Connection Failed: {str(e)} - Service may not be running", - "healthy": False, - "provider": os.getenv("LLM_PROVIDER", "openai"), - "critical": False, - } - overall_healthy = False - - # Check memory service (provider-dependent) - if memory_provider == "friend_lite": - try: - # Test Friend-Lite memory service connection with timeout - test_success = await asyncio.wait_for(memory_service.test_connection(), timeout=8.0) - if test_success: - health_status["services"]["memory_service"] = { - "status": "✅ Friend-Lite Memory Connected", - "healthy": True, - "provider": "friend_lite", - "critical": False, - } - else: - health_status["services"]["memory_service"] = { - "status": "⚠️ Friend-Lite Memory Test Failed", - "healthy": False, - "provider": "friend_lite", - "critical": False, - } - overall_healthy = False - except asyncio.TimeoutError: - health_status["services"]["memory_service"] = { - "status": "⚠️ Friend-Lite Memory Timeout (8s) - Check Qdrant", - "healthy": False, - 
"provider": "friend_lite", - "critical": False, - } - overall_healthy = False - except Exception as e: - health_status["services"]["memory_service"] = { - "status": f"⚠️ Friend-Lite Memory Failed: {str(e)}", - "healthy": False, - "provider": "friend_lite", - "critical": False, - } - overall_healthy = False - elif memory_provider == "openmemory_mcp": - # OpenMemory MCP check is handled separately above - health_status["services"]["memory_service"] = { - "status": "✅ Using OpenMemory MCP", - "healthy": True, - "provider": "openmemory_mcp", - "critical": False, - } - else: - health_status["services"]["memory_service"] = { - "status": f"❌ Unknown memory provider: {memory_provider}", - "healthy": False, - "provider": memory_provider, - "critical": False, - } - overall_healthy = False - - # Check Speech to Text service based on configured provider - if transcription_provider: - provider_name = transcription_provider.name - provider_type = transcription_provider.mode - - # Generic provider health check - let each provider handle its own connection logic - try: - # Test provider connection - await transcription_provider.connect("health-check") - await transcription_provider.disconnect() - - health_status["services"]["speech_to_text"] = { - "status": "✅ Provider Available", - "healthy": True, - "type": provider_type.title(), - "provider": provider_name, - "critical": False, - } - except Exception as e: - health_status["services"]["speech_to_text"] = { - "status": f"⚠️ Provider Error: {str(e)}", - "healthy": False, - "type": provider_type.title(), - "provider": provider_name, - "critical": False, - } - # Don't mark overall health as unhealthy for transcription issues - # since the service may be external or optional - else: - # No transcription service configured - health_status["services"]["speech_to_text"] = { - "status": "❌ No transcription service configured", - "healthy": False, - "type": "None", - "provider": "None", - "critical": False, - } - overall_healthy = False - 
- # Check Speaker Recognition service (non-critical - optional feature) - if speaker_service_url: - try: - # Make a health check request to the speaker service - async with aiohttp.ClientSession() as session: - async with session.get( - f"{speaker_service_url}/health", timeout=aiohttp.ClientTimeout(total=5) - ) as response: - if response.status == 200: - health_status["services"]["speaker_recognition"] = { - "status": "✅ Connected", - "healthy": True, - "url": speaker_service_url, - "critical": False, - } - else: - health_status["services"]["speaker_recognition"] = { - "status": f"⚠️ Unhealthy: HTTP {response.status}", - "healthy": False, - "url": speaker_service_url, - "critical": False, - } - overall_healthy = False - except asyncio.TimeoutError: - health_status["services"]["speaker_recognition"] = { - "status": "⚠️ Connection Timeout (5s)", - "healthy": False, - "url": speaker_service_url, - "critical": False, - } - overall_healthy = False - except Exception as e: - health_status["services"]["speaker_recognition"] = { - "status": f"⚠️ Connection Failed: {str(e)}", - "healthy": False, - "url": speaker_service_url, - "critical": False, - } - overall_healthy = False - - # Check OpenMemory MCP service (if configured) - if memory_provider == "openmemory_mcp" and openmemory_mcp_url: - try: - # Make a health check request to the OpenMemory MCP service - async with aiohttp.ClientSession() as session: - async with session.get( - f"{openmemory_mcp_url}/api/v1/apps/", timeout=aiohttp.ClientTimeout(total=5) - ) as response: - if response.status == 200: - health_status["services"]["openmemory_mcp"] = { - "status": "✅ Connected", - "healthy": True, - "url": openmemory_mcp_url, - "provider": "openmemory_mcp", - "critical": False, - } - else: - health_status["services"]["openmemory_mcp"] = { - "status": f"⚠️ Unhealthy: HTTP {response.status}", - "healthy": False, - "url": openmemory_mcp_url, - "provider": "openmemory_mcp", - "critical": False, - } - overall_healthy = False - 
except asyncio.TimeoutError: - health_status["services"]["openmemory_mcp"] = { - "status": "⚠️ Connection Timeout (5s)", - "healthy": False, - "url": openmemory_mcp_url, - "provider": "openmemory_mcp", - "critical": False, - } - overall_healthy = False - except Exception as e: - health_status["services"]["openmemory_mcp"] = { - "status": f"⚠️ Connection Failed: {str(e)}", - "healthy": False, - "url": openmemory_mcp_url, - "provider": "openmemory_mcp", - "critical": False, - } - overall_healthy = False - - # Track health check results in debug tracker - try: - # Can add health check tracking to debug tracker if needed - pass - except Exception as e: - application_logger.error(f"Failed to record health check metrics: {e}") - - # Set overall status - health_status["overall_healthy"] = overall_healthy - health_status["critical_services_healthy"] = critical_services_healthy - - if not critical_services_healthy: - health_status["status"] = "critical" - elif not overall_healthy: - health_status["status"] = "degraded" - else: - health_status["status"] = "healthy" - - # Add helpful messages - if not overall_healthy: - messages = [] - if not critical_services_healthy: - messages.append( - "Critical services (MongoDB) are unavailable - core functionality will not work" - ) - - unhealthy_optional = [ - name - for name, service in health_status["services"].items() - if not service["healthy"] and not service.get("critical", True) - ] - if unhealthy_optional: - messages.append(f"Optional services unavailable: {', '.join(unhealthy_optional)}") - - health_status["message"] = "; ".join(messages) - - return JSONResponse(content=health_status, status_code=200) - - -@app.get("/readiness") -async def readiness_check(): - """Simple readiness check for container orchestration.""" - # Use debug level for health check to reduce log spam - logger.debug("Readiness check requested") - - # Only check critical services for readiness - try: - # Quick MongoDB ping to ensure we can serve requests - 
await asyncio.wait_for(mongo_client.admin.command("ping"), timeout=2.0) - return JSONResponse(content={"status": "ready", "timestamp": int(time.time())}, status_code=200) - except Exception as e: - logger.error(f"Readiness check failed: {e}") - return JSONResponse( - content={"status": "not_ready", "error": str(e), "timestamp": int(time.time())}, - status_code=503 - ) +# Create FastAPI application using the app factory pattern +app = create_app() if __name__ == "__main__": - import uvicorn + """Main entry point for running the application.""" + import os + # Get port from environment or use default + port = int(os.getenv("PORT", 8000)) host = os.getenv("HOST", "0.0.0.0") - port = int(os.getenv("PORT", "8000")) - application_logger.info("Starting Omi unified service at ws://%s:%s/ws", host, port) - uvicorn.run("main:app", host=host, port=port, reload=False) + logger.info(f"Starting server on {host}:{port}") + + # Run the application + uvicorn.run( + "main:app", + host=host, + port=port, + reload=False, # Set to True for development + access_log=False, # Disabled - using custom RequestLoggingMiddleware instead + log_level="info" + ) diff --git a/backends/advanced/src/advanced_omi_backend/memory/compat_service.py b/backends/advanced/src/advanced_omi_backend/memory/compat_service.py index e3cb9827..3814f29e 100644 --- a/backends/advanced/src/advanced_omi_backend/memory/compat_service.py +++ b/backends/advanced/src/advanced_omi_backend/memory/compat_service.py @@ -281,13 +281,13 @@ async def get_memories_with_transcripts(self, user_id: str, limit: int = 100) -> # Get memories first memories = await self.get_all_memories(user_id, limit) - # Import database connection + # Import Conversation model try: - from advanced_omi_backend.database import chunks_col + from advanced_omi_backend.models.conversation import Conversation except ImportError: - memory_logger.error("Cannot import database connection") + memory_logger.error("Cannot import Conversation model") return 
memories # Return memories without transcript enrichment - + # Extract source IDs for bulk query source_ids = [] for memory in memories: @@ -295,12 +295,15 @@ async def get_memories_with_transcripts(self, user_id: str, limit: int = 100) -> source_id = metadata.get("source_id") or metadata.get("audio_uuid") # Backward compatibility if source_id: source_ids.append(source_id) - - # Bulk query for chunks (support both old audio_uuid and new source_id) - chunks_cursor = chunks_col.find({"audio_uuid": {"$in": source_ids}}) - chunks_by_id = {} - async for chunk in chunks_cursor: - chunks_by_id[chunk["audio_uuid"]] = chunk + + # Bulk query for conversations (support both old audio_uuid and new source_id) + conversations_list = await Conversation.find( + Conversation.audio_uuid.in_(source_ids) + ).to_list() + + conversations_by_id = {} + for conv in conversations_list: + conversations_by_id[conv.audio_uuid] = conv enriched_memories = [] @@ -328,15 +331,15 @@ async def get_memories_with_transcripts(self, user_id: str, limit: int = 100) -> enriched_memory["client_id"] = metadata.get("client_id") enriched_memory["user_email"] = metadata.get("user_email") - # Get transcript from bulk-loaded chunks - chunk = chunks_by_id.get(source_id) - if chunk: - transcript_segments = chunk.get("transcript", []) + # Get transcript from bulk-loaded conversations + conversation = conversations_by_id.get(source_id) + if conversation: + transcript_segments = conversation.segments if transcript_segments: full_transcript = " ".join( - segment.get("text", "") + segment.text for segment in transcript_segments - if isinstance(segment, dict) and segment.get("text") + if segment.text ) if full_transcript.strip(): diff --git a/backends/advanced/src/advanced_omi_backend/memory/config.py b/backends/advanced/src/advanced_omi_backend/memory/config.py index 7b821eab..99e79d38 100644 --- a/backends/advanced/src/advanced_omi_backend/memory/config.py +++ 
b/backends/advanced/src/advanced_omi_backend/memory/config.py @@ -1,17 +1,27 @@ """Memory service configuration utilities.""" -import os import logging -from typing import Dict, Any +import os from dataclasses import dataclass from enum import Enum +from typing import Any, Dict memory_logger = logging.getLogger("memory_service") +def _is_langfuse_enabled() -> bool: + """Check if Langfuse is properly configured.""" + return bool( + os.getenv("LANGFUSE_PUBLIC_KEY") + and os.getenv("LANGFUSE_SECRET_KEY") + and os.getenv("LANGFUSE_HOST") + ) + + class LLMProvider(Enum): """Supported LLM providers.""" OPENAI = "openai" + OLLAMA = "ollama" CUSTOM = "custom" @@ -72,6 +82,7 @@ def create_ollama_config( ) -> Dict[str, Any]: """Create Ollama configuration.""" return { + "api_key": "dummy", # Ollama doesn't require an API key "base_url": base_url, "model": model, "embedding_model": embedding_model, @@ -146,10 +157,15 @@ def build_memory_config_from_env() -> MemoryConfig: memory_config = config_loader.get_memory_extraction_config() # Get LLM provider from environment - llm_provider = os.getenv("LLM_PROVIDER", "openai").lower() - if llm_provider not in ["openai"]: + llm_provider = os.getenv("LLM_PROVIDER", "openai").lower().strip() + memory_logger.info(f"LLM_PROVIDER: {llm_provider}") + if llm_provider not in [p.value for p in LLMProvider]: raise ValueError(f"Unsupported LLM provider: {llm_provider}") + llm_config = None + llm_provider_enum = None + embedding_dims = 1536 # Default + # Build LLM configuration if llm_provider == "openai": openai_api_key = os.getenv("OPENAI_API_KEY") @@ -171,18 +187,31 @@ def build_memory_config_from_env() -> MemoryConfig: max_tokens=memory_config.get("llm_settings", {}).get("max_tokens", 2000) ) llm_provider_enum = LLMProvider.OPENAI + embedding_dims = get_embedding_dims(llm_config) + memory_logger.info(f"🔧 Setting Embedder dims {embedding_dims}") + + elif llm_provider == "ollama": + base_url = os.getenv("OLLAMA_BASE_URL") + if not base_url: + 
raise ValueError("OLLAMA_BASE_URL required for Ollama provider") - # Determine embedding dimensions based on model - if embedding_model == "text-embedding-3-small": - embedding_dims = 1536 - elif embedding_model == "text-embedding-3-large": - embedding_dims = 3072 - elif embedding_model == "text-embedding-ada-002": - embedding_dims = 1536 - else: - # Default for OpenAI embedding models - embedding_dims = 1536 - + model = os.getenv("OLLAMA_MODEL") + if not model: + raise ValueError("OLLAMA_MODEL required for Ollama provider") + embedding_model = os.getenv("OLLAMA_EMBEDDER_MODEL") + if not embedding_model: + raise ValueError("OLLAMA_EMBEDDER_MODEL required for Ollama provider") + memory_logger.info(f"🔧 Memory config: LLM={model}, Embedding={embedding_model}, Base URL={base_url}") + + llm_config = create_ollama_config( + base_url=base_url, + model=model, + embedding_model=embedding_model, + ) + llm_provider_enum = LLMProvider.OLLAMA + embedding_dims = get_embedding_dims(llm_config) + memory_logger.info(f"🔧 Setting Embedder dims {embedding_dims}") + # Build vector store configuration vector_store_provider = os.getenv("VECTOR_STORE_PROVIDER", "qdrant").lower() @@ -220,3 +249,49 @@ def build_memory_config_from_env() -> MemoryConfig: except ImportError: memory_logger.warning("Config loader not available, using environment variables only") raise + + +def get_embedding_dims(llm_config: Dict[str, Any]) -> int: + """ + Query the embedding endpoint and return the embedding vector length. + Works for OpenAI and OpenAI-compatible endpoints (e.g., Ollama). 
+ """ + embedding_model = llm_config.get('embedding_model') + try: + # Conditionally use Langfuse if configured + if _is_langfuse_enabled(): + from langfuse.openai import OpenAI + client = OpenAI( + api_key=llm_config.get('api_key'), + base_url=llm_config.get('base_url') + ) + else: + from openai import OpenAI + client = OpenAI( + api_key=llm_config.get('api_key'), + base_url=llm_config.get('base_url') + ) + response = client.embeddings.create( + model=embedding_model, + input="hello world" + ) + embedding = response.data[0].embedding + if not embedding or not isinstance(embedding, list): + return 1536 + return len(embedding) + + except (ImportError, KeyError, AttributeError, IndexError, TypeError, ValueError) as e: + embedding_dims = 1536 # default + memory_logger.exception(f"Failed to get embedding dimensions for model '{embedding_model}'") + if embedding_model == "text-embedding-3-small": + embedding_dims = 1536 + elif embedding_model == "text-embedding-3-large": + embedding_dims = 3072 + elif embedding_model == "text-embedding-ada-002": + embedding_dims = 1536 + elif embedding_model == "nomic-embed-text:latest": + embedding_dims = 768 + else: + # Default for OpenAI embedding models + memory_logger.info(f"Unrecognized embedding model '{embedding_model}', using default dimension {embedding_dims}") + return embedding_dims \ No newline at end of file diff --git a/backends/advanced/src/advanced_omi_backend/memory/memory_service.py b/backends/advanced/src/advanced_omi_backend/memory/memory_service.py index dc5bc21e..6460aa25 100644 --- a/backends/advanced/src/advanced_omi_backend/memory/memory_service.py +++ b/backends/advanced/src/advanced_omi_backend/memory/memory_service.py @@ -69,7 +69,7 @@ async def initialize(self) -> None: try: # Initialize LLM provider - if self.config.llm_provider == LLMProviderEnum.OPENAI: + if self.config.llm_provider in [LLMProviderEnum.OPENAI, LLMProviderEnum.OLLAMA]: self.llm_provider = OpenAIProvider(self.config.llm_config) else: raise 
ValueError(f"Unsupported LLM provider: {self.config.llm_provider}") @@ -201,9 +201,9 @@ async def add_memory( memory_logger.info(f"✅ Upserted {len(created_ids)} memories for {source_id}") return True, created_ids - error_msg = f"❌ No memories created for {source_id}: memory_entries={len(memory_entries) if memory_entries else 0}, allow_update={allow_update}" - memory_logger.error(error_msg) - raise RuntimeError(error_msg) + # No memories created - this is a valid outcome (duplicates, no extractable facts, etc.) + memory_logger.info(f"ℹ️ No new memories created for {source_id}: memory_entries={len(memory_entries) if memory_entries else 0}, allow_update={allow_update}") + return True, [] except asyncio.TimeoutError as e: memory_logger.error(f"⏰ Memory processing timed out for {source_id}") diff --git a/backends/advanced/src/advanced_omi_backend/memory/prompts.py b/backends/advanced/src/advanced_omi_backend/memory/prompts.py index 96aa4153..f655752e 100644 --- a/backends/advanced/src/advanced_omi_backend/memory/prompts.py +++ b/backends/advanced/src/advanced_omi_backend/memory/prompts.py @@ -229,7 +229,7 @@ ## Summary of the agent's execution history **Task Objective**: Scrape blog post titles and full content from the OpenAI blog. -**Progress Status**: 10\% \complete — 5 out of 50 blog posts processed. +**Progress Status**: 10% complete — 5 out of 50 blog posts processed. 1. 
**Agent Action**: Opened URL "https://openai.com" **Action Result**: diff --git a/backends/advanced/src/advanced_omi_backend/memory/providers/llm_providers.py b/backends/advanced/src/advanced_omi_backend/memory/providers/llm_providers.py index 2d54d3fa..a876e643 100644 --- a/backends/advanced/src/advanced_omi_backend/memory/providers/llm_providers.py +++ b/backends/advanced/src/advanced_omi_backend/memory/providers/llm_providers.py @@ -10,6 +10,8 @@ import json import logging +import os +import httpx from typing import Any, Dict, List, Optional # TODO: Re-enable spacy when Docker build is fixed @@ -30,6 +32,42 @@ memory_logger = logging.getLogger("memory_service") + +def _is_langfuse_enabled() -> bool: + """Check if Langfuse is properly configured.""" + return bool( + os.getenv("LANGFUSE_PUBLIC_KEY") + and os.getenv("LANGFUSE_SECRET_KEY") + and os.getenv("LANGFUSE_HOST") + ) + + +def _get_openai_client(api_key: str, base_url: str, is_async: bool = False): + """Get OpenAI client with optional Langfuse tracing. 
+ + Args: + api_key: OpenAI API key + base_url: OpenAI API base URL + is_async: Whether to return async or sync client + + Returns: + OpenAI client instance (with or without Langfuse tracing) + """ + if _is_langfuse_enabled(): + # Use Langfuse-wrapped OpenAI for tracing + import langfuse.openai as openai + memory_logger.debug("Using OpenAI client with Langfuse tracing") + else: + # Use regular OpenAI client without tracing + from openai import OpenAI, AsyncOpenAI + openai = type('OpenAI', (), {'OpenAI': OpenAI, 'AsyncOpenAI': AsyncOpenAI})() + memory_logger.debug("Using OpenAI client without tracing") + + if is_async: + return openai.AsyncOpenAI(api_key=api_key, base_url=base_url) + else: + return openai.OpenAI(api_key=api_key, base_url=base_url) + # TODO: Re-enable spacy when Docker build is fixed # try: # nlp = spacy.load("en_core_web_sm") @@ -119,20 +157,19 @@ def __init__(self, config: Dict[str, Any]): async def extract_memories(self, text: str, prompt: str) -> List[str]: """Extract memories using OpenAI API with the enhanced fact retrieval prompt. - + Args: text: Input text to extract memories from prompt: System prompt to guide extraction (uses default if empty) - + Returns: List of extracted memory strings """ try: - import langfuse.openai as openai - - client = openai.AsyncOpenAI( + client = _get_openai_client( api_key=self.api_key, - base_url=self.base_url + base_url=self.base_url, + is_async=True ) # Use the provided prompt or fall back to default @@ -204,19 +241,18 @@ async def _process_chunk(self, client, system_prompt: str, chunk: str, index: in async def generate_embeddings(self, texts: List[str]) -> List[List[float]]: """Generate embeddings using OpenAI API. 
- + Args: texts: List of texts to generate embeddings for - + Returns: List of embedding vectors, one per input text """ try: - import langfuse.openai as openai - - client = openai.AsyncOpenAI( + client = _get_openai_client( api_key=self.api_key, - base_url=self.base_url + base_url=self.base_url, + is_async=True ) response = await client.embeddings.create( @@ -232,16 +268,24 @@ async def generate_embeddings(self, texts: List[str]) -> List[List[float]]: async def test_connection(self) -> bool: """Test OpenAI connection. - + Returns: True if connection successful, False otherwise """ try: - import langfuse.openai as openai - - client = openai.AsyncOpenAI( + # For Ollama, just check if the base URL is reachable + if os.getenv("LLM_PROVIDER", "openai").lower() == "ollama": + import httpx + async with httpx.AsyncClient() as client: + # For Ollama, test connection by hitting the /v1/models endpoint + response = await client.get(f"{self.base_url}/models") + response.raise_for_status() + return True + + client = _get_openai_client( api_key=self.api_key, - base_url=self.base_url + base_url=self.base_url, + is_async=True ) await client.models.list() @@ -258,30 +302,29 @@ async def propose_memory_actions( custom_prompt: Optional[str] = None, ) -> Dict[str, Any]: """Use OpenAI chat completion with enhanced prompt to propose memory actions. 
- + Args: retrieved_old_memory: List of existing memories for context new_facts: List of new facts to process custom_prompt: Optional custom prompt to override default - + Returns: Dictionary containing proposed memory actions """ try: - import langfuse.openai as openai - # Generate the complete prompt using the helper function memory_logger.debug(f"🧠 Facts passed to prompt builder: {new_facts}") update_memory_messages = build_update_memory_messages( - retrieved_old_memory, - new_facts, + retrieved_old_memory, + new_facts, custom_prompt ) memory_logger.debug(f"🧠 Generated prompt user content: {update_memory_messages[1]['content'][:200]}...") - client = openai.AsyncOpenAI( + client = _get_openai_client( api_key=self.api_key, base_url=self.base_url, + is_async=True ) response = await client.chat.completions.create( diff --git a/backends/advanced/src/advanced_omi_backend/memory/providers/mcp_client.py b/backends/advanced/src/advanced_omi_backend/memory/providers/mcp_client.py index fe29266f..7942a17a 100644 --- a/backends/advanced/src/advanced_omi_backend/memory/providers/mcp_client.py +++ b/backends/advanced/src/advanced_omi_backend/memory/providers/mcp_client.py @@ -77,7 +77,37 @@ async def add_memories(self, text: str) -> List[str]: MCPError: If the server request fails """ try: - # Use REST API endpoint for creating memories + # Get app_id first to handle duplicate app names + apps_response = await self.client.get(f"{self.server_url}/api/v1/apps/") + apps_response.raise_for_status() + apps_data = apps_response.json() + + memory_logger.debug(f"Apps API response: {apps_data}") + memory_logger.debug(f"Apps data type: {type(apps_data)}") + if isinstance(apps_data, dict): + memory_logger.debug(f"Apps dict keys: {apps_data.keys()}") + if "apps" in apps_data: + memory_logger.debug(f"Number of apps: {len(apps_data['apps'])}") + memory_logger.debug(f"Apps list: {apps_data['apps']}") + + app_id = None + if apps_data.get("apps"): + # Find matching app by name, prefer one 
with most memories + matching = [a for a in apps_data["apps"] if a["name"] == self.client_name] + memory_logger.debug(f"Matching apps for '{self.client_name}': {matching}") + if matching: + matching.sort(key=lambda x: x.get("total_memories_created", 0), reverse=True) + app_id = matching[0]["id"] + memory_logger.info(f"Found matching app with ID: {app_id}") + else: + app_id = apps_data["apps"][0]["id"] + memory_logger.info(f"No matching app name, using first app ID: {app_id}") + + if not app_id: + memory_logger.error("No apps found in OpenMemory - cannot create memory") + raise MCPError("No apps found in OpenMemory") + + # Use REST API endpoint for creating memories (trailing slash required) response = await self.client.post( f"{self.server_url}/api/v1/memories/", json={ @@ -87,8 +117,8 @@ async def add_memories(self, text: str) -> List[str]: "source": "friend_lite", "client": self.client_name }, - "infer": True, # Let OpenMemory extract memories - "app": self.client_name # Use client name as app name + "infer": True, + "app_id": app_id # Use app_id to avoid duplicate name issues } ) response.raise_for_status() diff --git a/backends/advanced/src/advanced_omi_backend/memory/service_factory.py b/backends/advanced/src/advanced_omi_backend/memory/service_factory.py index 1df6ac27..df2a23c9 100644 --- a/backends/advanced/src/advanced_omi_backend/memory/service_factory.py +++ b/backends/advanced/src/advanced_omi_backend/memory/service_factory.py @@ -34,7 +34,7 @@ def create_memory_service(config: MemoryConfig) -> MemoryServiceBase: ValueError: If unsupported memory provider is specified RuntimeError: If required dependencies are missing """ - memory_logger.info(f"Creating memory service with provider: {config.memory_provider.value}") + memory_logger.info(f"🧠 Creating memory service with provider: {config.memory_provider.value}") if config.memory_provider == MemoryProvider.FRIEND_LITE: # Use the sophisticated Friend-Lite implementation diff --git 
a/backends/advanced/src/advanced_omi_backend/memory/utils.py b/backends/advanced/src/advanced_omi_backend/memory/utils.py index 8db92f51..b3c231f7 100644 --- a/backends/advanced/src/advanced_omi_backend/memory/utils.py +++ b/backends/advanced/src/advanced_omi_backend/memory/utils.py @@ -73,9 +73,9 @@ def extract_json_from_text(response_text: str) -> Optional[Dict[str, Any]]: # Try to find JSON using comprehensive regex patterns json_patterns = [ # Look for memory format: {"memory": [...]} - r'\{"memory"\s*:\s*\[.*?\]\s*\}', + r'\{"memory"\\s*:\\s*\[.*?\]\\s*\}', # Look for facts format: {"facts": [...]} - r'\{"facts"\s*:\s*\[.*?\]\s*\}', + r'\{"facts"\\s*:\\s*\[.*?\]\\s*\}', # Look for any JSON object containing memory or facts r'\{[^{}]*"(?:memory|facts)"[^{}]*\}', # Look for any balanced JSON object @@ -108,7 +108,7 @@ def extract_json_from_text(response_text: str) -> Optional[Dict[str, Any]]: # Try to extract just the facts or memory array if JSON object parsing fails for key in ["memory", "facts"]: - array_pattern = f'"{key}"\s*:\s*(\[.*?\])' + array_pattern = f'"{key}"\\s*:\\s*(\\[.*?\\])' try: match = re.search(array_pattern, response_text, re.DOTALL) if match: diff --git a/backends/advanced/src/advanced_omi_backend/middleware/app_middleware.py b/backends/advanced/src/advanced_omi_backend/middleware/app_middleware.py new file mode 100644 index 00000000..be2f2705 --- /dev/null +++ b/backends/advanced/src/advanced_omi_backend/middleware/app_middleware.py @@ -0,0 +1,240 @@ +""" +Middleware configuration for Friend-Lite backend. + +Centralizes CORS configuration and global exception handlers. 
+""" + +import json +import logging +import time +from typing import Optional + +from fastapi import FastAPI, HTTPException, Request +from fastapi.middleware.cors import CORSMiddleware +from fastapi.responses import JSONResponse, StreamingResponse +from pymongo.errors import ConnectionFailure, PyMongoError +from starlette.middleware.base import BaseHTTPMiddleware + +from advanced_omi_backend.app_config import get_app_config + +logger = logging.getLogger(__name__) +request_logger = logging.getLogger("api.requests") + + +def setup_cors_middleware(app: FastAPI) -> None: + """Configure CORS middleware for the FastAPI application.""" + config = get_app_config() + + logger.info(f"🌐 CORS configured with origins: {config.allowed_origins}") + logger.info(f"🌐 CORS also allows Tailscale IPs via regex: {config.tailscale_regex}") + + app.add_middleware( + CORSMiddleware, + allow_origins=config.allowed_origins, + allow_origin_regex=config.tailscale_regex, + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], + ) + + +class RequestLoggingMiddleware(BaseHTTPMiddleware): + """ + Middleware to log API requests and JSON responses. 
+ + Excludes: + - Authentication endpoints (login, logout) + - WebSocket connections + - Binary file responses (audio, images) + - Streaming responses + """ + + # Paths to exclude from logging + EXCLUDED_PATHS = { + "/auth/jwt/login", + "/auth/cookie/login", + "/auth/jwt/logout", + "/auth/cookie/logout", + "/ws", + "/ws_omi", + "/ws_pcm", + "/mcp", + "/health", + "/auth/health", + "/readiness", + } + + # Binary content types to exclude + BINARY_CONTENT_TYPES = { + "audio/", + "image/", + "video/", + "application/octet-stream", + } + + def should_log_request(self, path: str) -> bool: + """Determine if request should be logged.""" + # Exclude exact path matches + if path in self.EXCLUDED_PATHS: + return False + + # Exclude paths starting with excluded prefixes + for excluded in self.EXCLUDED_PATHS: + if path.startswith(excluded): + return False + + # Exclude audio file serving + if path.startswith("/audio/"): + return False + + return True + + def should_log_response_body(self, content_type: str) -> bool: + """Determine if response body should be logged.""" + if not content_type: + return True + + # Exclude binary content types + for binary_type in self.BINARY_CONTENT_TYPES: + if content_type.startswith(binary_type): + return False + + return True + + async def dispatch(self, request: Request, call_next): + """Process request and log request/response information.""" + path = request.url.path + + # Skip logging for excluded paths + if not self.should_log_request(path): + return await call_next(request) + + # Start timing + start_time = time.time() + + # Log request + request_logger.info(f"→ {request.method} {path}") + + # Process request + response = await call_next(request) + + # Calculate duration + duration_ms = (time.time() - start_time) * 1000 + + # Check if we should log response body + content_type = response.headers.get("content-type", "") + should_log_body = self.should_log_response_body(content_type) + + # Skip body logging for streaming responses + if 
isinstance(response, StreamingResponse): + request_logger.info( + f"← {request.method} {path} - {response.status_code} " + f"(streaming response) - {duration_ms:.2f}ms" + ) + return response + + # For non-streaming responses, try to extract and log JSON body + if should_log_body and response.status_code != 204: # No content + try: + # Read response body + response_body = b"" + async for chunk in response.body_iterator: + response_body += chunk + + # Try to parse as JSON for pretty printing + try: + json_body = json.loads(response_body) + formatted_json = json.dumps(json_body, indent=2) + request_logger.info( + f"← {request.method} {path} - {response.status_code} - {duration_ms:.2f}ms\n" + f"Response body:\n{formatted_json}" + ) + except (json.JSONDecodeError, UnicodeDecodeError): + # Not JSON or not UTF-8, just log the status + request_logger.info( + f"← {request.method} {path} - {response.status_code} - {duration_ms:.2f}ms " + f"(non-JSON response)" + ) + + # Recreate response with the body we consumed + from starlette.responses import Response + return Response( + content=response_body, + status_code=response.status_code, + headers=dict(response.headers), + media_type=response.media_type, + ) + except Exception as e: + # If anything goes wrong, just log basic info + request_logger.warning( + f"← {request.method} {path} - {response.status_code} - {duration_ms:.2f}ms " + f"(error reading response: {e})" + ) + return response + else: + # Just log status for responses without body + request_logger.info( + f"← {request.method} {path} - {response.status_code} - {duration_ms:.2f}ms" + ) + return response + + +def setup_exception_handlers(app: FastAPI) -> None: + """Configure global exception handlers for the FastAPI application.""" + + @app.exception_handler(ConnectionFailure) + @app.exception_handler(PyMongoError) + async def database_exception_handler(request: Request, exc: Exception): + """Handle database connection failures and return structured error response.""" + 
logger.error(f"Database connection error: {type(exc).__name__}: {exc}") + return JSONResponse( + status_code=500, + content={ + "detail": "Unable to connect to server. Please check your connection and try again.", + "error_type": "connection_failure", + "error_category": "database" + } + ) + + @app.exception_handler(ConnectionError) + async def connection_exception_handler(request: Request, exc: ConnectionError): + """Handle general connection errors and return structured error response.""" + logger.error(f"Connection error: {exc}") + return JSONResponse( + status_code=500, + content={ + "detail": "Unable to connect to server. Please check your connection and try again.", + "error_type": "connection_failure", + "error_category": "network" + } + ) + + @app.exception_handler(HTTPException) + async def http_exception_handler(request: Request, exc: HTTPException): + """Handle HTTP exceptions with structured error response.""" + # For authentication failures (401), add error_type + if exc.status_code == 401: + return JSONResponse( + status_code=exc.status_code, + content={ + "detail": exc.detail, + "error_type": "authentication_failure", + "error_category": "security" + } + ) + + # For other HTTP exceptions, return as normal + return JSONResponse( + status_code=exc.status_code, + content={"detail": exc.detail} + ) + + +def setup_middleware(app: FastAPI) -> None: + """Set up all middleware for the FastAPI application.""" + # Add request logging middleware + app.add_middleware(RequestLoggingMiddleware) + logger.info("📝 Request logging middleware enabled") + + setup_cors_middleware(app) + setup_exception_handlers(app) \ No newline at end of file diff --git a/backends/advanced/src/advanced_omi_backend/models/__init__.py b/backends/advanced/src/advanced_omi_backend/models/__init__.py new file mode 100644 index 00000000..52c63c20 --- /dev/null +++ b/backends/advanced/src/advanced_omi_backend/models/__init__.py @@ -0,0 +1,10 @@ +""" +Models package for Friend-Lite backend. 
+ +This package contains Pydantic models that define the structure and validation +for all data entities in the Friend-Lite system. +""" + +# Models can be imported directly from their files +# e.g. from .job import TranscriptionJob +# e.g. from .conversation import Conversation, create_conversation \ No newline at end of file diff --git a/backends/advanced/src/advanced_omi_backend/models/audio_file.py b/backends/advanced/src/advanced_omi_backend/models/audio_file.py new file mode 100644 index 00000000..de1c6f3f --- /dev/null +++ b/backends/advanced/src/advanced_omi_backend/models/audio_file.py @@ -0,0 +1,61 @@ +""" +AudioFile models for Friend-Lite backend. + +This module contains the Beanie Document model for audio_chunks collection, +which stores ALL audio files (both with and without speech). This is the +storage layer - all audio gets stored here with its metadata. + +Note: Named AudioFile (not AudioChunk) to avoid confusion with wyoming.audio.AudioChunk +which is the in-memory streaming audio data structure. +""" + +from datetime import datetime +from typing import Dict, List, Optional, Any +from pydantic import BaseModel, Field + +from beanie import Document, Indexed + + +class AudioFile(Document): + """ + Audio file model representing persisted audio files in MongoDB. + + The audio_chunks collection stores ALL raw audio files (both with and without speech). + This is just for audio file storage and metadata. If speech is detected, a + Conversation document is created which contains transcripts and memories. + + This is different from wyoming.audio.AudioChunk which is for streaming audio data. 
+ """ + + # Core identifiers + audio_uuid: Indexed(str, unique=True) = Field(description="Unique audio identifier") + audio_path: str = Field(description="Path to raw audio file") + client_id: Indexed(str) = Field(description="Client device identifier") + timestamp: Indexed(int) = Field(description="Unix timestamp in milliseconds") + + # User information + user_id: Indexed(str) = Field(description="User who owns this audio") + user_email: Optional[str] = Field(None, description="User email") + + # Audio processing + cropped_audio_path: Optional[str] = Field(None, description="Path to cropped audio (speech only)") + + # Speech-driven conversation linking + conversation_id: Optional[str] = Field( + None, + description="Link to Conversation if speech was detected" + ) + has_speech: bool = Field(default=False, description="Whether speech was detected") + speech_analysis: Dict[str, Any] = Field( + default_factory=dict, + description="Speech detection results" + ) + + class Settings: + name = "audio_chunks" + indexes = [ + "audio_uuid", + "client_id", + "user_id", + "timestamp" + ] \ No newline at end of file diff --git a/backends/advanced/src/advanced_omi_backend/models/conversation.py b/backends/advanced/src/advanced_omi_backend/models/conversation.py new file mode 100644 index 00000000..7caf8a55 --- /dev/null +++ b/backends/advanced/src/advanced_omi_backend/models/conversation.py @@ -0,0 +1,364 @@ +""" +Conversation models for Friend-Lite backend. + +This module contains Beanie Document and Pydantic models for conversations, +transcript versions, and memory versions. 
+""" + +from datetime import datetime +from typing import Dict, List, Optional, Any, Union +from pydantic import BaseModel, Field, model_validator, computed_field +from enum import Enum +import uuid + +from beanie import Document, Indexed + + +class Conversation(Document): + """Complete conversation model with versioned processing.""" + + # Nested Enums + class TranscriptProvider(str, Enum): + """Supported transcription providers.""" + DEEPGRAM = "deepgram" + MISTRAL = "mistral" + PARAKEET = "parakeet" + SPEECH_DETECTION = "speech_detection" # Legacy value + UNKNOWN = "unknown" # Fallback value + + class MemoryProvider(str, Enum): + """Supported memory providers.""" + FRIEND_LITE = "friend_lite" + OPENMEMORY_MCP = "openmemory_mcp" + + class ConversationStatus(str, Enum): + """Conversation processing status.""" + ACTIVE = "active" # Has running jobs or open websocket + COMPLETED = "completed" # All jobs succeeded + FAILED = "failed" # One or more jobs failed + + class EndReason(str, Enum): + """Reason for conversation ending.""" + USER_STOPPED = "user_stopped" # User manually stopped recording + INACTIVITY_TIMEOUT = "inactivity_timeout" # No speech detected for threshold period + WEBSOCKET_DISCONNECT = "websocket_disconnect" # Connection lost (Bluetooth, network, etc.) 
+ MAX_DURATION = "max_duration" # Hit maximum conversation duration + ERROR = "error" # Processing error forced conversation end + UNKNOWN = "unknown" # Unknown or legacy reason + + # Nested Models + class SpeakerSegment(BaseModel): + """Individual speaker segment in a transcript.""" + start: float = Field(description="Start time in seconds") + end: float = Field(description="End time in seconds") + text: str = Field(description="Transcript text for this segment") + speaker: str = Field(description="Speaker identifier") + confidence: Optional[float] = Field(None, description="Confidence score (0-1)") + + class TranscriptVersion(BaseModel): + """Version of a transcript with processing metadata.""" + version_id: str = Field(description="Unique version identifier") + transcript: Optional[str] = Field(None, description="Full transcript text") + segments: List["Conversation.SpeakerSegment"] = Field(default_factory=list, description="Speaker segments") + provider: Optional["Conversation.TranscriptProvider"] = Field(None, description="Transcription provider used") + model: Optional[str] = Field(None, description="Model used (e.g., nova-3, voxtral-mini-2507)") + created_at: datetime = Field(description="When this version was created") + processing_time_seconds: Optional[float] = Field(None, description="Time taken to process") + metadata: Dict[str, Any] = Field(default_factory=dict, description="Additional provider-specific metadata") + + class MemoryVersion(BaseModel): + """Version of memory extraction with processing metadata.""" + version_id: str = Field(description="Unique version identifier") + memory_count: int = Field(description="Number of memories extracted") + transcript_version_id: str = Field(description="Which transcript version was used") + provider: "Conversation.MemoryProvider" = Field(description="Memory provider used") + model: Optional[str] = Field(None, description="Model used (e.g., gpt-4o-mini, llama3)") + created_at: datetime = 
Field(description="When this version was created") + processing_time_seconds: Optional[float] = Field(None, description="Time taken to process") + metadata: Dict[str, Any] = Field(default_factory=dict, description="Additional provider-specific metadata") + + # Core identifiers + conversation_id: Indexed(str, unique=True) = Field(default_factory=lambda: str(uuid.uuid4()), description="Unique conversation identifier") + audio_uuid: Indexed(str) = Field(description="Session/audio identifier (for tracking audio files)") + user_id: Indexed(str) = Field(description="User who owns this conversation") + client_id: Indexed(str) = Field(description="Client device identifier") + + # Audio file reference + audio_path: Optional[str] = Field(None, description="Path to audio file (relative to CHUNK_DIR)") + cropped_audio_path: Optional[str] = Field(None, description="Path to cropped audio file (relative to CHUNK_DIR)") + + # Creation metadata + created_at: Indexed(datetime) = Field(default_factory=datetime.utcnow, description="When the conversation was created") + + # Processing status tracking + deleted: bool = Field(False, description="Whether this conversation was deleted due to processing failure") + deletion_reason: Optional[str] = Field(None, description="Reason for deletion (no_meaningful_speech, audio_file_not_ready, etc.)") + deleted_at: Optional[datetime] = Field(None, description="When the conversation was marked as deleted") + + # Conversation completion tracking + end_reason: Optional["Conversation.EndReason"] = Field(None, description="Reason why the conversation ended") + completed_at: Optional[datetime] = Field(None, description="When the conversation was completed/closed") + + # Summary fields (auto-generated from transcript) + title: Optional[str] = Field(None, description="Auto-generated conversation title") + summary: Optional[str] = Field(None, description="Auto-generated short summary (1-2 sentences)") + detailed_summary: Optional[str] = Field(None, 
description="Auto-generated detailed summary (comprehensive, corrected content)") + + # Versioned processing + transcript_versions: List["Conversation.TranscriptVersion"] = Field( + default_factory=list, + description="All transcript processing attempts" + ) + memory_versions: List["Conversation.MemoryVersion"] = Field( + default_factory=list, + description="All memory extraction attempts" + ) + + # Active version pointers + active_transcript_version: Optional[str] = Field( + None, + description="Version ID of currently active transcript" + ) + active_memory_version: Optional[str] = Field( + None, + description="Version ID of currently active memory extraction" + ) + + # Legacy fields removed - use transcript_versions[active_transcript_version] and memory_versions[active_memory_version] + # Frontend should access: conversation.active_transcript.segments, conversation.active_transcript.transcript + + @model_validator(mode='before') + @classmethod + def clean_legacy_data(cls, data: Any) -> Any: + """Clean up legacy/malformed data before Pydantic validation.""" + + if not isinstance(data, dict): + return data + + # Fix malformed transcript_versions (from old schema versions) + if 'transcript_versions' in data and isinstance(data['transcript_versions'], list): + for version in data['transcript_versions']: + if isinstance(version, dict): + # If segments is not a list, clear it + if 'segments' in version and not isinstance(version['segments'], list): + version['segments'] = [] + # If transcript is a dict, clear it + if 'transcript' in version and isinstance(version['transcript'], dict): + version['transcript'] = None + # Normalize provider to lowercase (legacy data had "Deepgram" instead of "deepgram") + if 'provider' in version and isinstance(version['provider'], str): + version['provider'] = version['provider'].lower() + # Fix speaker IDs in segments (legacy data had integers, need strings) + if 'segments' in version and isinstance(version['segments'], list): + for 
segment in version['segments']: + if isinstance(segment, dict) and 'speaker' in segment: + if isinstance(segment['speaker'], int): + segment['speaker'] = f"Speaker {segment['speaker']}" + elif not isinstance(segment['speaker'], str): + segment['speaker'] = "unknown" + + return data + + @computed_field + @property + def active_transcript(self) -> Optional["Conversation.TranscriptVersion"]: + """Get the currently active transcript version.""" + if not self.active_transcript_version: + return None + + for version in self.transcript_versions: + if version.version_id == self.active_transcript_version: + return version + return None + + @computed_field + @property + def active_memory(self) -> Optional["Conversation.MemoryVersion"]: + """Get the currently active memory version.""" + if not self.active_memory_version: + return None + + for version in self.memory_versions: + if version.version_id == self.active_memory_version: + return version + return None + + # Convenience properties that return data from active transcript version + @computed_field + @property + def transcript(self) -> Optional[str]: + """Get transcript text from active transcript version.""" + return self.active_transcript.transcript if self.active_transcript else None + + @computed_field + @property + def segments(self) -> List["Conversation.SpeakerSegment"]: + """Get segments from active transcript version.""" + return self.active_transcript.segments if self.active_transcript else [] + + @computed_field + @property + def segment_count(self) -> int: + """Get segment count from active transcript version.""" + return len(self.segments) if self.segments else 0 + + @computed_field + @property + def memory_count(self) -> int: + """Get memory count from active memory version.""" + return self.active_memory.memory_count if self.active_memory else 0 + + @computed_field + @property + def has_memory(self) -> bool: + """Check if conversation has any memory versions.""" + return len(self.memory_versions) > 0 + + 
@computed_field
@property
def transcript_version_count(self) -> int:
    """Number of entries in ``transcript_versions``."""
    return len(self.transcript_versions)


@computed_field
@property
def memory_version_count(self) -> int:
    """Number of entries in ``memory_versions``."""
    return len(self.memory_versions)


def add_transcript_version(
    self,
    version_id: str,
    transcript: str,
    segments: List["Conversation.SpeakerSegment"],
    provider: "Conversation.TranscriptProvider",
    model: Optional[str] = None,
    processing_time_seconds: Optional[float] = None,
    metadata: Optional[Dict[str, Any]] = None,
    set_as_active: bool = True
) -> "Conversation.TranscriptVersion":
    """Append a transcript version and, by default, make it the active one.

    Args:
        version_id: Identifier stored on the new version.
        transcript: Transcript text.
        segments: Speaker segments for the transcript.
        provider: Transcription provider that produced it.
        model: Optional model name.
        processing_time_seconds: Optional processing duration.
        metadata: Optional extra data; an empty dict is stored when falsy.
        set_as_active: When true, ``active_transcript_version`` is pointed
            at ``version_id``.

    Returns:
        The newly created ``Conversation.TranscriptVersion``.
    """
    extra = metadata if metadata else {}
    entry = Conversation.TranscriptVersion(
        version_id=version_id,
        transcript=transcript,
        segments=segments,
        provider=provider,
        model=model,
        created_at=datetime.now(),
        processing_time_seconds=processing_time_seconds,
        metadata=extra,
    )
    self.transcript_versions.append(entry)

    if set_as_active:
        self.active_transcript_version = version_id
    return entry


def add_memory_version(
    self,
    version_id: str,
    memory_count: int,
    transcript_version_id: str,
    provider: "Conversation.MemoryProvider",
    model: Optional[str] = None,
    processing_time_seconds: Optional[float] = None,
    metadata: Optional[Dict[str, Any]] = None,
    set_as_active: bool = True
) -> "Conversation.MemoryVersion":
    """Append a memory version and, by default, make it the active one.

    Args:
        version_id: Identifier stored on the new version.
        memory_count: Number of memories recorded for this version.
        transcript_version_id: Id of the transcript version it refers to.
        provider: Memory provider that produced it.
        model: Optional model name.
        processing_time_seconds: Optional processing duration.
        metadata: Optional extra data; an empty dict is stored when falsy.
        set_as_active: When true, ``active_memory_version`` is pointed at
            ``version_id``.

    Returns:
        The newly created ``Conversation.MemoryVersion``.
    """
    extra = metadata if metadata else {}
    entry = Conversation.MemoryVersion(
        version_id=version_id,
        memory_count=memory_count,
        transcript_version_id=transcript_version_id,
        provider=provider,
        model=model,
        created_at=datetime.now(),
        processing_time_seconds=processing_time_seconds,
        metadata=extra,
    )
    self.memory_versions.append(entry)

    if set_as_active:
        self.active_memory_version = version_id
    return entry
def set_active_transcript_version(self, version_id: str) -> bool:
    """Activate an existing transcript version.

    Returns:
        True when a stored transcript version has ``version_id`` (and the
        active pointer was updated), False otherwise.
    """
    known = any(item.version_id == version_id for item in self.transcript_versions)
    if known:
        self.active_transcript_version = version_id
    return known


def set_active_memory_version(self, version_id: str) -> bool:
    """Activate an existing memory version.

    Returns:
        True when a stored memory version has ``version_id`` (and the
        active pointer was updated), False otherwise.
    """
    known = any(item.version_id == version_id for item in self.memory_versions)
    if known:
        self.active_memory_version = version_id
    return known


class Settings:
    # Collection name and index declarations for this document model.
    name = "conversations"
    indexes = [
        "conversation_id",
        "user_id",
        "created_at",
        # Compound index: per-user listing, newest first.
        [("user_id", 1), ("created_at", -1)],
    ]
# Factory function for creating conversations
def create_conversation(
    audio_uuid: str,
    user_id: str,
    client_id: str,
    conversation_id: Optional[str] = None,
    title: Optional[str] = None,
    summary: Optional[str] = None,
    transcript: Optional[str] = None,
    segments: Optional[List["Conversation.SpeakerSegment"]] = None,
) -> Conversation:
    """Build a ``Conversation`` with empty version lists.

    Args:
        audio_uuid: Identifier of the audio session.
        user_id: Owner of the conversation.
        client_id: Identifier of the client device.
        conversation_id: Explicit conversation id; when ``None`` the key is
            omitted so the model can supply its own value.
        title: Optional title.
        summary: Optional summary.
        transcript: Optional transcript text (``""`` is passed when falsy).
        segments: Optional speaker segments (``[]`` is passed when falsy).

    Returns:
        A new ``Conversation`` instance.
    """
    # NOTE(review): "transcript", "segments", "memories" and "memory_count"
    # are passed as constructor data although the model also exposes some of
    # them as computed properties - confirm how the model treats these keys.
    payload = {
        "audio_uuid": audio_uuid,
        "user_id": user_id,
        "client_id": client_id,
        "created_at": datetime.now(),
        "title": title,
        "summary": summary,
        "transcript": transcript if transcript else "",
        "segments": segments if segments else [],
        "transcript_versions": [],
        "active_transcript_version": None,
        "memory_versions": [],
        "active_memory_version": None,
        "memories": [],
        "memory_count": 0,
    }

    # Leave the id out entirely when the caller did not provide one.
    explicit_id = {} if conversation_id is None else {"conversation_id": conversation_id}
    return Conversation(**payload, **explicit_id)
async def _ensure_beanie_initialized():
    """Initialise Beanie once per process (used by RQ worker jobs).

    The module-level lock serialises concurrent callers and the
    ``_beanie_initialized`` flag turns later calls into no-ops. The flag is
    set only after ``init_beanie`` has completed, so a failed attempt is
    retried by the next caller instead of being recorded as a success.

    The MongoDB URI is read from the ``MONGODB_URI`` environment variable
    (default ``mongodb://localhost:27017``); the database falls back to
    ``friend-lite`` when the URI does not name one.

    Raises:
        Exception: Any error raised while connecting or initialising Beanie
            is logged and re-raised.
    """
    global _beanie_initialized
    async with _beanie_init_lock:
        if _beanie_initialized:
            return
        try:
            # Imported lazily so that importing this module does not pull in
            # the database stack.
            import os
            from motor.motor_asyncio import AsyncIOMotorClient
            from beanie import init_beanie
            from advanced_omi_backend.models.conversation import Conversation
            from advanced_omi_backend.models.audio_file import AudioFile
            from advanced_omi_backend.models.user import User
            from pymongo.errors import ConfigurationError

            # Get MongoDB URI from environment
            mongodb_uri = os.getenv("MONGODB_URI", "mongodb://localhost:27017")

            # Create MongoDB client
            client = AsyncIOMotorClient(mongodb_uri)
            try:
                database = client.get_default_database("friend-lite")
            except ConfigurationError:
                # No database in the URI: use the fallback rather than
                # aborting (the previous code assigned it and then re-raised).
                database = client["friend-lite"]

            # Initialize Beanie
            await init_beanie(
                database=database,
                document_models=[User, Conversation, AudioFile],
            )

            # Mark success only once initialisation has really finished.
            _beanie_initialized = True
            logger.info("✅ Beanie initialized in RQ worker process")

        except Exception as e:
            logger.error(f"❌ Failed to initialize Beanie in RQ worker: {e}")
            raise
class JobPriority(str, Enum):
    """String-valued priority levels for RQ jobs, highest first.

    Intended mapping to RQ job timeouts (the mapping itself is applied by
    callers, not by this enum):

    - URGENT: 10 minutes
    - HIGH: 8 minutes
    - NORMAL: 5 minutes (default)
    - LOW: 3 minutes
    """

    # Rank 1: process immediately.
    URGENT = "urgent"
    # Rank 2: process before normal jobs.
    HIGH = "high"
    # Rank 3: default priority.
    NORMAL = "normal"
    # Rank 4: process when idle.
    LOW = "low"
class BaseRQJob(ABC):
    """Template for RQ jobs whose real work is an async ``execute()``.

    ``run()`` is the synchronous entry point. It creates a fresh event loop,
    awaits ``_setup()`` (optional Beanie initialisation, optional Redis
    client), awaits ``execute()``, always awaits ``_teardown()``, closes the
    loop, and logs success or failure with the elapsed time.

    Example:
        class MyJob(BaseRQJob):
            async def execute(self, **kwargs) -> Dict[str, Any]:
                result = await some_async_operation()
                return {"success": True, "result": result}

        # RQ job function wrapper
        def my_job_function(arg1, arg2, redis_url=None):
            return MyJob(redis_url=redis_url).run(arg1=arg1, arg2=arg2)
    """

    def __init__(self, redis_url: Optional[str] = None, initialize_beanie: bool = True):
        """Record configuration and start the elapsed-time clock.

        Args:
            redis_url: When set, ``_setup()`` opens a Redis client from it.
            initialize_beanie: When true, ``_setup()`` initialises Beanie.
        """
        self.job_start_time = time.time()
        self.redis_client: Optional[redis_async.Redis] = None
        self.initialize_beanie = initialize_beanie
        self.redis_url = redis_url

    async def _setup(self):
        """Prepare the dependencies requested in ``__init__``."""
        # MongoDB access through Beanie, only if requested.
        if self.initialize_beanie:
            await _ensure_beanie_initialized()
            logger.debug("Beanie initialized")

        # Redis client, only if a URL was supplied.
        if self.redis_url:
            self.redis_client = redis_async.from_url(self.redis_url)
            logger.debug(f"Redis client created: {self.redis_url}")

    async def _teardown(self):
        """Close the Redis client if one was opened."""
        if not self.redis_client:
            return
        await self.redis_client.close()
        logger.debug("Redis client closed")

    @abstractmethod
    async def execute(self, **kwargs) -> Dict[str, Any]:
        """Run the job-specific logic; must be overridden.

        Args:
            **kwargs: Parameters forwarded unchanged from ``run()``.

        Returns:
            Dict with job results.
        """

    def run(self, **kwargs) -> Dict[str, Any]:
        """Execute the job synchronously on a new event loop.

        Args:
            **kwargs: Parameters passed through to ``execute()``.

        Returns:
            Whatever ``execute()`` returned.

        Raises:
            Exception: Any failure is logged with a traceback and re-raised.
        """
        job_name = type(self).__name__
        logger.info(f"🚀 Starting {job_name}")

        async def lifecycle():
            # Teardown runs even if execute() raises; a failure inside
            # _setup() itself skips teardown.
            await self._setup()
            try:
                return await self.execute(**kwargs)
            finally:
                await self._teardown()

        try:
            # Each job gets its own loop, installed as the current one.
            job_loop = asyncio.new_event_loop()
            asyncio.set_event_loop(job_loop)
            try:
                outcome = job_loop.run_until_complete(lifecycle())

                elapsed = time.time() - self.job_start_time
                logger.info(f"✅ {job_name} completed in {elapsed:.2f}s")
                return outcome
            finally:
                job_loop.close()
        except Exception as e:
            elapsed = time.time() - self.job_start_time
            logger.error(f"❌ {job_name} failed after {elapsed:.2f}s: {e}", exc_info=True)
            raise
def async_job(redis: bool = True, beanie: bool = True, timeout: int = 300, result_ttl: int = 3600):
    """Wrap an async function in a synchronous, RQ-callable function.

    Each call of the wrapper creates a new event loop, optionally
    initialises Beanie, optionally opens a Redis client (injected as the
    ``redis_client`` keyword argument and closed afterwards), awaits the
    wrapped coroutine, closes the loop, and logs the outcome with the
    elapsed time.

    Args:
        redis: If True, open a Redis client and pass it as ``redis_client``.
        beanie: If True, initialise Beanie before calling the function.
        timeout: Stored on the wrapper as ``job_timeout`` (seconds).
        result_ttl: Stored on the wrapper as ``result_ttl`` (seconds).

    Returns:
        A decorator producing the synchronous wrapper.

    Example:
        @async_job(redis=True, beanie=True, timeout=600)
        async def my_job(arg1, arg2, redis_client=None):
            result = await some_async_operation()
            return {"success": True, "result": result}
    """
    def decorator(func: Callable) -> Callable:
        @wraps(func)
        def wrapper(*args, **kwargs) -> Dict[str, Any]:
            job_name = func.__name__
            start_time = time.time()
            logger.info(f"🚀 Starting {job_name}")

            async def lifecycle():
                # MongoDB access through Beanie, only if requested.
                if beanie:
                    await _ensure_beanie_initialized()
                    logger.debug("Beanie initialized")

                # Redis client, only if requested; the job receives it as a kwarg.
                client = None
                if redis:
                    from advanced_omi_backend.controllers.queue_controller import REDIS_URL
                    client = redis_async.from_url(REDIS_URL)
                    kwargs['redis_client'] = client
                    logger.debug("Redis client created")

                try:
                    return await func(*args, **kwargs)
                finally:
                    # The client is closed whether or not the job succeeded.
                    if client:
                        await client.close()
                        logger.debug("Redis client closed")

            try:
                # Each job gets its own loop, installed as the current one.
                job_loop = asyncio.new_event_loop()
                asyncio.set_event_loop(job_loop)
                try:
                    outcome = job_loop.run_until_complete(lifecycle())

                    elapsed = time.time() - start_time
                    logger.info(f"✅ {job_name} completed in {elapsed:.2f}s")
                    return outcome
                finally:
                    job_loop.close()
            except Exception as e:
                elapsed = time.time() - start_time
                logger.error(f"❌ {job_name} failed after {elapsed:.2f}s: {e}", exc_info=True)
                raise

        # Default job configuration, exposed as attributes for introspection.
        wrapper.job_timeout = timeout
        wrapper.result_ttl = result_ttl

        return wrapper
    return decorator
class TranscriptionProvider(Enum):
    """Identifiers of the transcription providers audio can be routed to."""

    DEEPGRAM = "deepgram"
    PARAKEET = "parakeet"
    MISTRAL = "mistral"


class BaseTranscriptionProvider(abc.ABC):
    """Interface that every transcription provider implements.

    Concrete providers must supply ``transcribe``, ``name`` and ``mode``;
    ``connect`` and ``disconnect`` are optional hooks that do nothing here.
    """

    @abc.abstractmethod
    async def transcribe(self, audio_data: bytes, sample_rate: int, **kwargs) -> dict:
        """Transcribe audio to text with word-level timestamps.

        Args:
            audio_data: Raw audio bytes (PCM format).
            sample_rate: Audio sample rate in Hz.
            **kwargs: Provider-specific options (e.g. ``diarize=True``).

        Returns:
            Dictionary with:
            - text: transcribed text
            - words: word-level data with timestamps (required)
            - segments: speaker segments (empty for non-RTTM providers)
        """

    @property
    @abc.abstractmethod
    def name(self) -> str:
        """Provider name used in log output."""

    @property
    @abc.abstractmethod
    def mode(self) -> str:
        """Processing mode: 'streaming' or 'batch'."""

    async def connect(self, client_id: Optional[str] = None):
        """Hook for opening provider resources; the default is a no-op."""
        return None

    async def disconnect(self):
        """Hook for releasing provider resources; the default is a no-op."""
        return None
class StreamingTranscriptionProvider(BaseTranscriptionProvider):
    """Provider base for chunk-by-chunk (streaming) transcription.

    Fixes ``mode`` to ``"streaming"`` and adds the stream lifecycle methods
    that concrete providers must implement.
    """

    @property
    def mode(self) -> str:
        return "streaming"

    @abc.abstractmethod
    async def start_stream(self, client_id: str, sample_rate: int = 16000, diarize: bool = False):
        """Open a transcription stream for one client.

        Args:
            client_id: Unique client identifier.
            sample_rate: Audio sample rate.
            diarize: Whether to enable speaker diarization (provider-dependent).
        """

    @abc.abstractmethod
    async def process_audio_chunk(self, client_id: str, audio_chunk: bytes) -> Optional[dict]:
        """Feed one audio chunk into the client's stream.

        Returns:
            None for partial results, dict with transcription for final results.
        """

    @abc.abstractmethod
    async def end_stream(self, client_id: str) -> dict:
        """Close the stream and return the final transcription with word-level timestamps."""


class BatchTranscriptionProvider(BaseTranscriptionProvider):
    """Provider base for whole-buffer (batch) transcription.

    Fixes ``mode`` to ``"batch"`` and narrows ``transcribe`` to an explicit
    ``diarize`` flag.
    """

    @property
    def mode(self) -> str:
        return "batch"

    @abc.abstractmethod
    async def transcribe(self, audio_data: bytes, sample_rate: int, diarize: bool = False) -> dict:
        """Transcribe a complete audio buffer.

        Args:
            audio_data: Raw audio bytes.
            sample_rate: Audio sample rate.
            diarize: Whether to enable speaker diarization (provider-dependent).
        """
class UserCreate(BaseUserCreate):
    """Schema for creating new users."""

    display_name: Optional[str] = None
    is_superuser: Optional[bool] = False


class UserRead(BaseUser[PydanticObjectId]):
    """Schema for reading user data."""

    display_name: Optional[str] = None
    registered_clients: dict[str, dict] = Field(default_factory=dict)
    primary_speakers: list[dict] = Field(default_factory=list)


class UserUpdate(BaseUserUpdate):
    """Schema for updating user data."""

    display_name: Optional[str] = None
    is_superuser: Optional[bool] = None

    def create_update_dict(self):
        """Return the base update dict plus ``display_name`` when it is set."""
        update_dict = super().create_update_dict()
        if self.display_name is not None:
            update_dict["display_name"] = self.display_name
        return update_dict

    def create_update_dict_superuser(self):
        """Return the superuser update dict plus ``display_name`` when it is set."""
        update_dict = super().create_update_dict_superuser()
        if self.display_name is not None:
            update_dict["display_name"] = self.display_name
        return update_dict


class User(BeanieBaseUser, Document):
    """User model extending fastapi-users BeanieBaseUser with custom fields."""

    # Pydantic v2 configuration
    model_config = ConfigDict(
        from_attributes=True,
        populate_by_name=True,
    )

    display_name: Optional[str] = None
    # Client tracking for audio devices: maps client_id -> client record.
    registered_clients: dict[str, dict] = Field(default_factory=dict)
    # Speaker processing filter configuration
    primary_speakers: list[dict] = Field(default_factory=list)

    class Settings:
        name = "users"  # Collection name in MongoDB - standardized from "fastapi_users"
        email_collation = {"locale": "en", "strength": 2}  # Case-insensitive comparison

    @property
    def user_id(self) -> str:
        """Return string representation of MongoDB ObjectId for backward compatibility."""
        return str(self.id)

    def register_client(self, client_id: str, device_name: Optional[str] = None) -> None:
        """Register a client for this user, or refresh an existing one.

        For a known ``client_id`` only ``last_seen`` is updated, and
        ``device_name`` is replaced when a truthy new name is given. For a
        new ``client_id`` a record is created whose ``first_seen`` and
        ``last_seen`` share one timestamp. The change is in memory only;
        persisting it is the caller's job.

        Args:
            client_id: Key under which the client is stored.
            device_name: Optional human-readable device name.
        """
        now = datetime.now(UTC)

        existing = self.registered_clients.get(client_id)
        if existing is not None:
            logger.info(f"Updating existing client {client_id} for user {self.user_id}")
            existing["last_seen"] = now
            # Keep the previous name when no new one was supplied.
            existing["device_name"] = device_name or existing.get("device_name")
            return

        self.registered_clients[client_id] = {
            "client_id": client_id,
            "device_name": device_name,
            "first_seen": now,
            "last_seen": now,
            "is_active": True,
        }

    def get_client_ids(self) -> list[str]:
        """Get all client IDs registered to this user."""
        return list(self.registered_clients.keys())


# Rebuild Pydantic model to ensure inherited fields are properly accessible
User.model_rebuild()


async def get_user_db():
    """Get the user database instance for dependency injection."""
    yield BeanieUserDatabase(User)  # type: ignore


async def get_user_by_id(user_id: str) -> Optional[User]:
    """Get user by MongoDB ObjectId string.

    Raises:
        Exception: Lookup or id-parsing errors are logged and re-raised so
            callers can handle them.
    """
    try:
        return await User.get(PydanticObjectId(user_id))
    except Exception as e:
        logger.error(f"Failed to get user by ID {user_id}: {e}")
        raise


async def get_user_by_client_id(client_id: str) -> Optional[User]:
    """Find the user that owns a specific client_id.

    ``registered_clients`` is a mapping keyed by client id, so ownership is
    tested by the existence of that key. The previous filter,
    ``registered_clients.client_id``, addressed a literal ``client_id`` key
    of the mapping and could not match records written by
    ``User.register_client``.
    """
    # NOTE(review): assumes client ids contain no "." - the id is used as a
    # path component of the query; confirm against how ids are generated.
    return await User.find_one({f"registered_clients.{client_id}": {"$exists": True}})


async def register_client_to_user(
    user: User, client_id: str, device_name: Optional[str] = None
) -> None:
    """Register a client to a user and save to database."""
    user.register_client(client_id, device_name)
    await user.save()
-""" - -import asyncio -import logging -import time -import uuid -from dataclasses import dataclass -from datetime import UTC, datetime -from pathlib import Path - -# Import TranscriptionManager for type hints -from typing import TYPE_CHECKING, Any, Optional - -from advanced_omi_backend.audio_utils import ( - _process_audio_cropping_with_relative_timestamps, -) -from advanced_omi_backend.client_manager import get_client_manager -from advanced_omi_backend.database import ( - AudioChunksRepository, - ConversationsRepository, - conversations_col, -) -from advanced_omi_backend.memory import get_memory_service -from advanced_omi_backend.task_manager import get_task_manager -from advanced_omi_backend.users import get_user_by_id -from easy_audio_interfaces.filesystem.filesystem_interfaces import LocalFileSink -from wyoming.audio import AudioChunk - -# Lazy import to avoid config loading issues -# from advanced_omi_backend.memory import get_memory_service - -if TYPE_CHECKING: - from advanced_omi_backend.transcription import TranscriptionManager - -logger = logging.getLogger(__name__) -audio_logger = logging.getLogger("audio_processing") - -# Audio configuration constants -OMI_SAMPLE_RATE = 16_000 -OMI_CHANNELS = 1 -OMI_SAMPLE_WIDTH = 2 -SEGMENT_SECONDS = 60 -TARGET_SAMPLES = OMI_SAMPLE_RATE * SEGMENT_SECONDS - -if TYPE_CHECKING: - from advanced_omi_backend.transcription import TranscriptionManager - - -@dataclass -class AudioProcessingItem: - """Item for audio processing queue.""" - - client_id: str - user_id: str - user_email: str - audio_chunk: AudioChunk - audio_uuid: Optional[str] = None - timestamp: Optional[int] = None - - -@dataclass -class TranscriptionItem: - """Item for transcription processing queue.""" - - client_id: str - user_id: str - audio_uuid: str - audio_chunk: AudioChunk - - -@dataclass -class MemoryProcessingItem: - """Item for memory processing queue (speech-driven conversations architecture).""" - - client_id: str - user_id: str - user_email: str - 
conversation_id: str - - -@dataclass -class AudioCroppingItem: - """Item for audio cropping queue.""" - - client_id: str - user_id: str - audio_uuid: str - original_path: str - speech_segments: list[tuple[float, float]] - output_path: str - - -class ProcessorManager: - """Manages all application-level processors and queues.""" - - def __init__(self, chunk_dir: Path, audio_chunks_repository: AudioChunksRepository): - self.chunk_dir = chunk_dir - self.repository = audio_chunks_repository - - # Global processing queues - self.audio_queue: asyncio.Queue[Optional[AudioProcessingItem]] = asyncio.Queue() - self.transcription_queue: asyncio.Queue[Optional[TranscriptionItem]] = asyncio.Queue() - self.memory_queue: asyncio.Queue[Optional[MemoryProcessingItem]] = asyncio.Queue() - self.cropping_queue: asyncio.Queue[Optional[AudioCroppingItem]] = asyncio.Queue() - - # Processor tasks - self.audio_processor_task: Optional[asyncio.Task] = None - self.transcription_processor_task: Optional[asyncio.Task] = None - self.memory_processor_task: Optional[asyncio.Task] = None - self.cropping_processor_task: Optional[asyncio.Task] = None - - # Services - lazy import - self.memory_service = None - self.task_manager = get_task_manager() - self.client_manager = get_client_manager() - - # Track active file sinks per client - self.active_file_sinks: dict[str, LocalFileSink] = {} - self.active_audio_uuids: dict[str, str] = {} - - # Transcription managers pool - self.transcription_managers: dict[str, "TranscriptionManager"] = {} - - # Shutdown flag - self.shutdown_flag = False - - # Task tracking for specific processing jobs - self.processing_tasks: dict[str, dict[str, str]] = {} # client_id -> {stage: task_id} - - # Direct state tracking for synchronous operations - self.processing_state: dict[str, dict[str, Any]] = {} # client_id -> {stage: state_info} - - # Track clients currently being closed to prevent duplicate close operations - self.closing_clients: set[str] = set() - - async def 
_update_memory_status(self, conversation_id: str, status: str): - """Update memory processing status for conversation.""" - try: - conversations_repo = ConversationsRepository(conversations_col) - await conversations_repo.update_memory_processing_status(conversation_id, status) - - audio_logger.info(f"📝 Updated memory status to {status} for conversation {conversation_id}") - except Exception as e: - audio_logger.error(f"Failed to update memory status to {status} for conversation {conversation_id}: {e}") - - async def start(self): - """Start all processors.""" - # Create processor tasks - self.audio_processor_task = asyncio.create_task( - self._audio_processor(), name="audio_processor" - ) - self.transcription_processor_task = asyncio.create_task( - self._transcription_processor(), name="transcription_processor" - ) - self.memory_processor_task = asyncio.create_task( - self._memory_processor(), name="memory_processor" - ) - self.cropping_processor_task = asyncio.create_task( - self._cropping_processor(), name="cropping_processor" - ) - - # Track processor tasks in task manager - self.task_manager.track_task( - self.audio_processor_task, "audio_processor", {"type": "processor"} - ) - self.task_manager.track_task( - self.transcription_processor_task, "transcription_processor", {"type": "processor"} - ) - self.task_manager.track_task( - self.memory_processor_task, "memory_processor", {"type": "processor"} - ) - self.task_manager.track_task( - self.cropping_processor_task, "cropping_processor", {"type": "processor"} - ) - - async def _should_process_memory(self, user_id: str, conversation_id: str) -> tuple[bool, str]: - """ - Determine if memory processing should proceed based on primary speakers configuration. 
- - Implements graceful degradation: - - No primary speakers configured → Process all (True) - - Speaker service unavailable → Process all (True) - - No speakers identified → Process all (True) - - Primary speakers found → Process (True) - - Only non-primary speakers → Skip (False) - - Args: - user_id: User ID to check primary speakers configuration - conversation_id: Conversation ID to check transcript speakers - - Returns: - Tuple of (should_process: bool, reason: str) - """ - try: - # Get user's primary speaker configuration - user = await get_user_by_id(user_id) - if not user or not user.primary_speakers: - return True, "No primary speakers configured - processing all conversations" - - audio_logger.info(f"🔍 Checking primary speakers filter for conversation {conversation_id} - user has {len(user.primary_speakers)} primary speakers configured") - - # Get conversation data from conversations collection - conversations_repo = ConversationsRepository(conversations_col) - conversation = await conversations_repo.get_conversation(conversation_id) - if not conversation or not conversation.get('transcript'): - return True, "No transcript data available - processing conversation" - - # Extract speakers from transcript segments (normalized for comparison) - transcript_speakers = set() - transcript_speaker_originals = {} # Keep original names for logging - total_segments = 0 - identified_segments = 0 - - for segment in conversation['transcript']: - total_segments += 1 - if 'identified_as' in segment and segment['identified_as'] and segment['identified_as'] != 'Unknown': - original_name = segment['identified_as'] - normalized_name = original_name.strip().lower() - transcript_speakers.add(normalized_name) - transcript_speaker_originals[normalized_name] = original_name - identified_segments += 1 - - if not transcript_speakers: - return True, f"No speakers identified in transcript ({identified_segments}/{total_segments} segments) - processing conversation" - - # Check if any 
primary speakers are present (normalized comparison) - primary_speaker_names = {ps['name'].strip().lower() for ps in user.primary_speakers} - primary_speaker_originals = {ps['name'].strip().lower(): ps['name'] for ps in user.primary_speakers} - found_primary_speakers_normalized = transcript_speakers.intersection(primary_speaker_names) - - if found_primary_speakers_normalized: - # Convert back to original names for display - found_primary_originals = [primary_speaker_originals[name] for name in found_primary_speakers_normalized] - audio_logger.info(f"✅ Primary speakers found in conversation: {found_primary_originals} - processing memory") - return True, f"Primary speakers detected: {', '.join(found_primary_originals)}" - else: - # Show original names in logs - transcript_originals = [transcript_speaker_originals[name] for name in transcript_speakers] - primary_originals = [primary_speaker_originals[name] for name in primary_speaker_names] - audio_logger.info(f"❌ No primary speakers found - transcript speakers: {transcript_originals}, primary speakers: {primary_originals} - skipping memory processing") - return False, f"Only non-primary speakers found: {', '.join(transcript_originals)}" - - except Exception as e: - # On any error, default to processing (fail-safe) - audio_logger.warning(f"Error checking primary speakers filter for {conversation_id}: {e} - defaulting to process conversation") - return True, f"Error in speaker filtering: {str(e)} - processing conversation as fallback" - - async def shutdown(self): - """Shutdown all processors gracefully.""" - logger.info("Shutting down processors...") - self.shutdown_flag = True - - # Signal all queues to stop - await self.audio_queue.put(None) - await self.transcription_queue.put(None) - await self.memory_queue.put(None) - await self.cropping_queue.put(None) - - # Wait for processors to complete with timeout - tasks = [ - ("audio_processor", self.audio_processor_task, 30.0), - ("transcription_processor", 
self.transcription_processor_task, 60.0), - ("memory_processor", self.memory_processor_task, 300.0), # 5 minutes for LLM - ("cropping_processor", self.cropping_processor_task, 60.0), - ] - - for name, task, timeout in tasks: - if task: - try: - await asyncio.wait_for(task, timeout=timeout) - logger.info(f"{name} shut down gracefully") - except asyncio.TimeoutError: - logger.warning(f"{name} did not shut down within {timeout}s, cancelling") - task.cancel() - try: - await task - except asyncio.CancelledError: - logger.info(f"{name} cancelled successfully") - - # Clean up transcription managers - for manager in self.transcription_managers.values(): - try: - await manager.disconnect() - except Exception as e: - logger.error(f"Error disconnecting transcription manager: {e}") - - # Close any remaining file sinks - for sink in self.active_file_sinks.values(): - try: - await sink.close() - except Exception as e: - logger.error(f"Error closing file sink: {e}") - - logger.info("All processors shut down") - - def _new_local_file_sink( - self, file_path: str, sample_rate: Optional[int] = None - ) -> LocalFileSink: - """Create a properly configured LocalFileSink with dynamic sample rate.""" - effective_sample_rate = sample_rate or OMI_SAMPLE_RATE - return LocalFileSink( - file_path=file_path, - sample_rate=int(effective_sample_rate), - channels=int(OMI_CHANNELS), - sample_width=int(OMI_SAMPLE_WIDTH), - ) - - async def queue_audio(self, item: AudioProcessingItem): - """Queue audio for processing.""" - audio_logger.debug( - f"📥 queue_audio called for client {item.client_id}, audio chunk: {len(item.audio_chunk.audio)} bytes" - ) - await self.audio_queue.put(item) - queue_size = self.audio_queue.qsize() - audio_logger.debug( - f"✅ Successfully queued audio for client {item.client_id}, queue size: {queue_size}" - ) - - async def queue_transcription(self, item: TranscriptionItem): - """Queue audio for transcription.""" - audio_logger.debug( - f"📥 queue_transcription called for client 
{item.client_id}, audio_uuid: {item.audio_uuid}" - ) - await self.transcription_queue.put(item) - audio_logger.debug( - f"📤 Successfully put item in transcription_queue for client {item.client_id}, queue size: {self.transcription_queue.qsize()}" - ) - - async def queue_memory(self, item: MemoryProcessingItem): - """Queue conversation for memory processing.""" - audio_logger.info( - f"📥 queue_memory called for conversation {item.conversation_id} (client {item.client_id})" - ) - audio_logger.info(f"📥 Memory queue size before: {self.memory_queue.qsize()}") - await self.memory_queue.put(item) - audio_logger.info(f"📥 Memory queue size after: {self.memory_queue.qsize()}") - audio_logger.info(f"✅ Successfully queued memory processing item for conversation {item.conversation_id}") - - async def queue_cropping(self, item: AudioCroppingItem): - """Queue audio for cropping.""" - await self.cropping_queue.put(item) - - def track_processing_task( - self, client_id: str, stage: str, task_id: str, metadata: dict[str, Any] | None = None - ): - """Track a processing task for a specific client and stage.""" - if client_id not in self.processing_tasks: - self.processing_tasks[client_id] = {} - self.processing_tasks[client_id][stage] = task_id - logger.info(f"Tracking task {task_id} for client {client_id} stage {stage}") - - def track_processing_stage( - self, client_id: str, stage: str, status: str, metadata: dict[str, Any] | None = None - ): - """Track processing stage completion directly for synchronous operations.""" - if client_id not in self.processing_state: - self.processing_state[client_id] = {} - - self.processing_state[client_id][stage] = { - "status": status, # "started", "completed", "failed" - "completed": status == "completed", - "error": None if status != "failed" else metadata.get("error") if metadata else None, - "metadata": metadata or {}, - "timestamp": time.time(), - } - logger.info(f"Tracking stage {stage} as {status} for client {client_id}") - - def 
get_processing_status(self, client_id: str) -> dict[str, Any]: - """Get processing status for a specific client using both direct state and task tracking.""" - logger.debug(f"Getting processing status for client {client_id}") - logger.debug( - f"Available client_ids in processing_tasks: {list(self.processing_tasks.keys())}" - ) - logger.debug( - f"Available client_ids in processing_state: {list(self.processing_state.keys())}" - ) - - stages = {} - - # First, get task tracking (for asynchronous operations like memory/cropping) - if client_id in self.processing_tasks: - client_tasks = self.processing_tasks[client_id] - for stage, task_id in client_tasks.items(): - logger.info(f"Looking up task {task_id} for stage {stage}") - task_info = self.task_manager.get_task_info(task_id) - logger.info(f"Task info for {task_id}: {task_info}") - if task_info: - stages[stage] = { - "task_id": task_id, - "completed": task_info.completed_at is not None, - "error": task_info.error, - "created_at": task_info.created_at, - "completed_at": task_info.completed_at, - "cancelled": task_info.cancelled, - } - else: - stages[stage] = { - "task_id": task_id, - "completed": False, - "error": "Task not found", - "created_at": None, - "completed_at": None, - "cancelled": False, - } - - # Then, get direct state tracking (for synchronous operations like audio, transcription) - # Direct state takes PRECEDENCE over task tracking for the same stage - if client_id in self.processing_state: - client_state = self.processing_state[client_id] - for stage, state_info in client_state.items(): - stages[stage] = { - "completed": state_info["completed"], - "error": state_info["error"], - "status": state_info["status"], - "metadata": state_info["metadata"], - "timestamp": state_info["timestamp"], - } - logger.debug(f"Direct state - {stage}: {state_info['status']} (takes precedence)") - - # If no stages found, return no_tasks - if not stages: - return {"status": "no_tasks", "stages": {}} - - # Check if all stages 
are complete - all_complete = all(stage_info["completed"] for stage_info in stages.values()) - - return { - "status": "complete" if all_complete else "processing", - "stages": stages, - "client_id": client_id, - } - - def cleanup_processing_tasks(self, client_id: str): - """Clean up processing task tracking for a client.""" - if client_id in self.processing_tasks: - del self.processing_tasks[client_id] - logger.debug(f"Cleaned up processing tasks for client {client_id}") - - if client_id in self.processing_state: - del self.processing_state[client_id] - logger.debug(f"Cleaned up processing state for client {client_id}") - - def get_all_processing_status(self) -> dict[str, Any]: - """Get processing status for all clients.""" - # Get all client IDs from both tracking types - all_client_ids = set(self.processing_tasks.keys()) | set(self.processing_state.keys()) - return {client_id: self.get_processing_status(client_id) for client_id in all_client_ids} - - async def mark_transcription_failed(self, client_id: str, error: str): - """Mark transcription as failed and clean up transcription manager. - - This method handles transcription failures without closing audio files, - allowing long recordings to continue even if intermediate transcriptions fail. 
- - Args: - client_id: The client ID whose transcription failed - error: The error message describing the failure - """ - # Mark as failed in state tracking - self.track_processing_stage(client_id, "transcription", "failed", {"error": error}) - - # Remove transcription manager to allow fresh retry - if client_id in self.transcription_managers: - try: - manager = self.transcription_managers.pop(client_id) - await manager.disconnect() - audio_logger.info(f"🧹 Removed failed transcription manager for {client_id}") - except Exception as cleanup_error: - audio_logger.error( - f"❌ Error cleaning up transcription manager for {client_id}: {cleanup_error}" - ) - - # Do NOT close audio files - client may still be streaming - # Audio will be closed when client disconnects or sends audio-stop - audio_logger.warning( - f"❌ Transcription failed for {client_id}: {error}, keeping audio session open" - ) - - async def close_client_audio(self, client_id: str): - """Close audio file for a client when conversation ends.""" - audio_logger.info(f"🔚 close_client_audio called for client {client_id}") - - # Check if already closing to prevent duplicate operations - if client_id in self.closing_clients: - audio_logger.info(f"⏭️ Client {client_id} already being closed, skipping duplicate close") - return - - # Mark as being closed - self.closing_clients.add(client_id) - - # First, flush ASR to complete any pending transcription - if client_id in self.transcription_managers: - try: - manager = self.transcription_managers[client_id] - audio_logger.info( - f"🔄 Found transcription manager - flushing ASR for client {client_id}" - ) - audio_logger.info( - f"📊 Transcription manager state - has manager: {manager is not None}, type: {type(manager).__name__}" - ) - - flush_start_time = time.time() - audio_logger.info( - f"📤 Calling flush_final_transcript for client {client_id} (manager: {manager})" - ) - try: - await manager.process_collected_audio() - flush_duration = time.time() - flush_start_time - 
audio_logger.info( - f"✅ ASR flush completed for client {client_id} in {flush_duration:.2f}s" - ) - # Mark transcription as completed after successful flush - self.track_processing_stage( - client_id, "transcription", "completed", {"flushed": True} - ) - except Exception as flush_error: - audio_logger.error( - f"❌ Error during flush_final_transcript: {flush_error}", exc_info=True - ) - # Mark transcription as failed on flush error - self.track_processing_stage( - client_id, "transcription", "failed", {"error": str(flush_error)} - ) - raise - - # Verify that transcription was marked as completed after flush - current_status = self.get_processing_status(client_id) - transcription_stage = current_status.get("stages", {}).get("transcription", {}) - audio_logger.info( - f"🔍 Post-flush transcription status: {transcription_stage.get('status', 'unknown')} (completed: {transcription_stage.get('completed', False)})" - ) - except Exception as e: - audio_logger.error( - f"❌ Error flushing ASR for client {client_id}: {e}", exc_info=True - ) - else: - audio_logger.warning( - f"⚠️ No transcription manager found for client {client_id} - cannot flush transcription" - ) - - # Then close the audio file - if client_id in self.active_file_sinks: - try: - sink = self.active_file_sinks[client_id] - await sink.close() - del self.active_file_sinks[client_id] - - if client_id in self.active_audio_uuids: - del self.active_audio_uuids[client_id] - - audio_logger.info(f"Closed audio file for client {client_id}") - except Exception as e: - audio_logger.error(f"Error closing audio file for client {client_id}: {e}") - - # Remove from closing set now that we're done - self.closing_clients.discard(client_id) - audio_logger.info(f"✅ Completed close_client_audio for client {client_id}") - - async def ensure_transcription_manager(self, client_id: str): - """Ensure a transcription manager exists for the given client. 
- - This can be called early (e.g., on audio-start) to create the manager - before audio chunks arrive. - """ - from advanced_omi_backend.transcription import TranscriptionManager - if client_id not in self.transcription_managers: - audio_logger.info( - f"🔌 Creating transcription manager for client {client_id} (early creation)" - ) - manager = TranscriptionManager( - chunk_repo=self.repository, processor_manager=self - ) - try: - await manager.connect(client_id) - self.transcription_managers[client_id] = manager - audio_logger.info( - f"✅ Successfully created transcription manager for {client_id}" - ) - except Exception as e: - audio_logger.error( - f"❌ Failed to create transcription manager for {client_id}: {e}" - ) - raise - else: - audio_logger.debug( - f"♻️ Transcription manager already exists for client {client_id}" - ) - - async def _audio_processor(self): - """Process audio chunks and save to files.""" - audio_logger.info("Audio processor started") - - try: - while not self.shutdown_flag: - try: - # Get item with timeout to allow periodic health checks - queue_size = self.audio_queue.qsize() - if queue_size > 0: - audio_logger.debug( - f"🔄 Audio processor waiting for items, queue size: {queue_size}" - ) - item = await asyncio.wait_for(self.audio_queue.get(), timeout=30.0) - - audio_logger.debug( - f"📦 Audio processor dequeued item for client {item.client_id if item else 'None'}" - ) - - if item is None: # Shutdown signal - audio_logger.info("🛑 Audio processor received shutdown signal") - self.audio_queue.task_done() - break - - try: - # Get or create file sink for this client - if item.client_id not in self.active_file_sinks: - audio_logger.debug( - f"🆕 Creating new audio file sink for client {item.client_id}" - ) - # Get client state to access/store sample rate - client_state = self.client_manager.get_client(item.client_id) - audio_logger.debug( - f"👤 Client state lookup for {item.client_id}: {client_state is not None}" - ) - - # Store sample rate from 
first audio chunk - if client_state and client_state.sample_rate is None: - client_state.sample_rate = item.audio_chunk.rate - audio_logger.info( - f"📊 Set sample rate to {client_state.sample_rate}Hz for client {item.client_id}" - ) - - # Get sample rate for file sink (use client state or fallback to chunk rate) - file_sample_rate = None - if client_state and client_state.sample_rate: - file_sample_rate = client_state.sample_rate - else: - file_sample_rate = item.audio_chunk.rate - audio_logger.warning( - f"Using chunk sample rate {file_sample_rate}Hz for {item.client_id} (no client state)" - ) - - # Create new file - audio_uuid = uuid.uuid4().hex - timestamp = item.timestamp or int(time.time()) - wav_filename = f"{timestamp}_{item.client_id}_{audio_uuid}.wav" - - sink = self._new_local_file_sink( - f"{self.chunk_dir}/{wav_filename}", file_sample_rate - ) - await sink.open() - - self.active_file_sinks[item.client_id] = sink - self.active_audio_uuids[item.client_id] = audio_uuid - - # Create database entry - await self.repository.create_chunk( - audio_uuid=audio_uuid, - audio_path=wav_filename, - client_id=item.client_id, - timestamp=timestamp, - user_id=item.user_id, - user_email=item.user_email, - ) - - # Notify client state about new audio UUID - if client_state: - client_state.set_current_audio_uuid(audio_uuid) - - # Track audio processing completion directly (synchronous operation) - self.track_processing_stage( - item.client_id, - "audio", - "completed", - { - "audio_uuid": audio_uuid, - "wav_filename": wav_filename, - "file_created": True, - }, - ) - - audio_logger.info( - f"Created new audio file for client {item.client_id}: {wav_filename}" - ) - - # Write audio chunk - sink = self.active_file_sinks[item.client_id] - await sink.write(item.audio_chunk) - - # Queue for transcription - audio_uuid = self.active_audio_uuids[item.client_id] - audio_logger.debug( - f"🔄 About to queue transcription for client {item.client_id}, audio_uuid: {audio_uuid}" - ) - await 
self.queue_transcription( - TranscriptionItem( - client_id=item.client_id, - user_id=item.user_id, - audio_uuid=audio_uuid, - audio_chunk=item.audio_chunk, - ) - ) - audio_logger.debug( - f"✅ Successfully queued transcription for client {item.client_id}, audio_uuid: {audio_uuid}" - ) - - except Exception as e: - audio_logger.error( - f"Error processing audio for client {item.client_id}: {e}", - exc_info=True, - ) - finally: - self.audio_queue.task_done() - audio_logger.debug( - f"✅ Completed processing audio item for client {item.client_id if item else 'None'}" - ) - - except asyncio.TimeoutError: - # Periodic health check - active_clients = len(self.active_file_sinks) - queue_size = self.audio_queue.qsize() - if queue_size > 0 or active_clients > 0: - audio_logger.info( - f"⏰ Audio processor timeout (periodic health check): {active_clients} active files, " - f"{queue_size} items in queue" - ) - - except Exception as e: - audio_logger.error(f"Fatal error in audio processor: {e}", exc_info=True) - finally: - audio_logger.info("Audio processor stopped") - - async def _transcription_processor(self): - """Process transcription requests.""" - audio_logger.info("Transcription processor started") - from advanced_omi_backend.transcription import TranscriptionManager - - try: - while not self.shutdown_flag: - try: - item = await asyncio.wait_for(self.transcription_queue.get(), timeout=30.0) - - if item is None: # Shutdown signal - self.transcription_queue.task_done() - break - - try: - # Get or create transcription manager for client - if item.client_id not in self.transcription_managers: - # Import here to avoid circular imports - - audio_logger.info( - f"🔌 Creating new transcription manager for client {item.client_id}" - ) - manager = TranscriptionManager( - chunk_repo=self.repository, processor_manager=self - ) - try: - await manager.connect(item.client_id) - self.transcription_managers[item.client_id] = manager - audio_logger.info( - f"✅ Successfully created 
transcription manager for {item.client_id}" - ) - except Exception as e: - audio_logger.error( - f"❌ Failed to create transcription manager for {item.client_id}: {e}" - ) - # Mark transcription as failed when manager creation fails - self.track_processing_stage( - item.client_id, "transcription", "failed", {"error": str(e)} - ) - self.transcription_queue.task_done() - continue - else: - audio_logger.debug( - f"♻️ Reusing existing transcription manager for client {item.client_id}" - ) - - manager = self.transcription_managers[item.client_id] - - # Process transcription chunk - audio_logger.debug( - f"🎵 Processing transcribe_chunk for client {item.client_id}, audio_uuid: {item.audio_uuid}" - ) - - try: - # Add timeout for transcription processing (5 minutes) - async with asyncio.timeout(300): # 5 minute timeout - await manager.transcribe_chunk( - item.audio_uuid, item.audio_chunk, item.client_id - ) - audio_logger.debug( - f"✅ Completed transcribe_chunk for client {item.client_id}" - ) - except asyncio.TimeoutError: - audio_logger.error( - f"❌ Transcription timeout for client {item.client_id} after 5 minutes" - ) - # Mark transcription as failed on timeout - self.track_processing_stage( - item.client_id, - "transcription", - "failed", - {"error": "Transcription timeout (5 minutes)"}, - ) - except Exception as e: - audio_logger.error( - f"❌ Error in transcribe_chunk for client {item.client_id}: {e}", - exc_info=True, - ) - # Mark transcription as failed when chunk processing fails - self.track_processing_stage( - item.client_id, "transcription", "failed", {"error": str(e)} - ) - - # Track transcription as started using direct state tracking - ONLY ONCE per audio session - # Check if we haven't already marked this transcription as started for this audio UUID - current_transcription_status = self.processing_state.get( - item.client_id, {} - ).get("transcription", {}) - current_audio_uuid = current_transcription_status.get("metadata", {}).get( - "audio_uuid" - ) - - # 
Only mark as started if this is a new audio UUID or no transcription status exists - if current_audio_uuid != item.audio_uuid: - audio_logger.info( - f"🎯 Starting transcription tracking for new audio UUID: {item.audio_uuid}" - ) - self.track_processing_stage( - item.client_id, - "transcription", - "started", - {"audio_uuid": item.audio_uuid, "chunk_processing": True}, - ) - else: - audio_logger.debug( - f"⏩ Skipping transcription status update - already tracking audio UUID: {item.audio_uuid}" - ) - - except Exception as e: - audio_logger.error( - f"Error processing transcription for client {item.client_id}: {e}", - exc_info=True, - ) - finally: - self.transcription_queue.task_done() - - except asyncio.TimeoutError: - # Periodic health check only (NO cleanup based on client active status) - queue_size = self.transcription_queue.qsize() - active_managers = len(self.transcription_managers) - audio_logger.debug( - f"Transcription processor health: {active_managers} managers, " - f"{queue_size} items in queue" - ) - - except Exception as e: - audio_logger.error(f"Fatal error in transcription processor: {e}", exc_info=True) - finally: - audio_logger.info("Transcription processor stopped") - - async def _memory_processor(self): - """Process memory/LLM requests.""" - audio_logger.info("Memory processor started") - - try: - while not self.shutdown_flag: - try: - item = await asyncio.wait_for(self.memory_queue.get(), timeout=30.0) - - if item is None: # Shutdown signal - self.memory_queue.task_done() - break - - try: - # Create background task for memory processing - task = asyncio.create_task(self._process_memory_item(item)) - - # Track task with 5 minute timeout - task_name = f"memory_{item.client_id}_{item.conversation_id}" - actual_task_id = self.task_manager.track_task( - task, - task_name, - { - "client_id": item.client_id, - "conversation_id": item.conversation_id, - "type": "memory", - "timeout": 3600, # 60 minutes - }, - ) - - # Register task with client for 
tracking (use the actual task_id from TaskManager) - self.track_processing_task( - item.client_id, - "memory", - actual_task_id, - {"conversation_id": item.conversation_id}, - ) - - except Exception as e: - audio_logger.error( - f"Error queuing memory processing for {item.conversation_id}: {e}", - exc_info=True, - ) - finally: - self.memory_queue.task_done() - - except asyncio.TimeoutError: - # Periodic health check - queue_size = self.memory_queue.qsize() - audio_logger.debug(f"Memory processor health: {queue_size} items in queue") - - except Exception as e: - audio_logger.error(f"Fatal error in memory processor: {e}", exc_info=True) - finally: - audio_logger.info("Memory processor stopped") - - async def _process_memory_item(self, item: MemoryProcessingItem): - """Process a single memory item (speech-driven conversations architecture).""" - start_time = time.time() - audio_logger.info(f"🚀 MEMORY PROCESSING STARTED for conversation {item.conversation_id} at {start_time}") - - # Track memory processing start - self.track_processing_stage( - item.client_id, - "memory", - "started", - {"conversation_id": item.conversation_id, "started_at": start_time}, - ) - - try: - # Get conversation data directly from conversations collection (speech-driven architecture) - conversations_repo = ConversationsRepository(conversations_col) - conversation = await conversations_repo.get_conversation(item.conversation_id) - - if not conversation: - audio_logger.warning( - f"No conversation found for {item.conversation_id}, skipping memory processing" - ) - return None - - # Extract conversation text from transcript segments - full_conversation = "" - transcript = conversation.get("transcript", []) - if transcript: - dialogue_lines = [] - for segment in transcript: - text = segment.get("text", "").strip() - if text: - speaker = segment.get("speaker", "Unknown") - dialogue_lines.append(f"{speaker}: {text}") - full_conversation = "\n".join(dialogue_lines) - else: - audio_logger.warning( - 
f"No transcript found in conversation {item.conversation_id}, skipping memory processing" - ) - return None - if len(full_conversation) < 10: # Minimum length check - audio_logger.warning( - f"Conversation too short for memory processing ({len(full_conversation)} chars): conversation {item.conversation_id}" - ) - return None - - # Debug tracking removed for cleaner architecture - - # Check if memory processing should proceed based on primary speakers configuration - should_process, filter_reason = await self._should_process_memory(item.user_id, item.conversation_id) - audio_logger.info(f"🎯 Speaker filter decision for conversation {item.conversation_id}: {filter_reason}") - - if not should_process: - # Update memory processing status to skipped - await self._update_memory_status(item.conversation_id, "skipped") - - # Track completion - self.track_processing_stage( - item.client_id, - "memory", - "completed", - { - "conversation_id": item.conversation_id, - "status": "skipped", - "reason": filter_reason, - "completed_at": time.time(), - }, - ) - audio_logger.info(f"⏭️ Skipped memory processing for conversation {item.conversation_id}: {filter_reason}") - return None - - # Lazy import memory service - if self.memory_service is None: - audio_logger.info(f"🔧 Initializing memory service for conversation {item.conversation_id}...") - self.memory_service = get_memory_service() - audio_logger.info(f"✅ Memory service initialized for conversation {item.conversation_id}") - - # Process memory with timeout - memory_result = await asyncio.wait_for( - self.memory_service.add_memory( - full_conversation, - item.client_id, - item.conversation_id, # Use conversation_id instead of audio_uuid - item.user_id, - item.user_email, - allow_update=True, - ), - timeout=3600, # 60 minutes - ) - - if memory_result: - # Check if this was a successful result with actual memories created - success, created_memory_ids = memory_result - logger.info(f"Memory result: {memory_result}") - - if success 
and created_memory_ids: - # Memories were actually created - audio_logger.info( - f"✅ Successfully processed memory for conversation {item.conversation_id} - created {len(created_memory_ids)} memories" - ) - - # Add memory references to conversations collection (speech-driven architecture) - try: - conversations_repo = ConversationsRepository(conversations_col) - - # Add memory references to conversation - memory_refs = [{"memory_id": mid, "created_at": datetime.now(UTC).isoformat(), "status": "created"} for mid in created_memory_ids] - await conversations_repo.add_memories(item.conversation_id, memory_refs) - - # Update memory processing status - await conversations_repo.update_memory_processing_status(item.conversation_id, "completed") - - audio_logger.info( - f"📝 Added {len(created_memory_ids)} memories to conversation {item.conversation_id}" - ) - except Exception as e: - audio_logger.warning(f"Failed to add memory references: {e}") - - # Track memory processing completion - self.track_processing_stage( - item.client_id, - "memory", - "completed", - { - "conversation_id": item.conversation_id, - "memories_created": len(created_memory_ids), - "processing_time": time.time() - start_time, - }, - ) - elif success and not created_memory_ids: - # Successful processing but no memories created (likely empty transcript) - audio_logger.info( - f"✅ Memory processing completed for conversation {item.conversation_id} but no memories created (likely empty transcript)" - ) - - # Update database memory processing status to skipped - await self._update_memory_status(item.conversation_id, "skipped") - - # Track memory processing completion (even though no memories created) - self.track_processing_stage( - item.client_id, - "memory", - "completed", - { - "conversation_id": item.conversation_id, - "memories_created": 0, - "processing_time": time.time() - start_time, - "status": "skipped", - }, - ) - else: - # This shouldn't happen, but handle it gracefully - audio_logger.warning( 
- f"⚠️ Unexpected memory result for conversation {item.conversation_id}: success={success}, ids={created_memory_ids}" - ) - - # Update database memory processing status to failed - await self._update_memory_status(item.conversation_id, "failed") - - # Track memory processing failure - self.track_processing_stage( - item.client_id, - "memory", - "failed", - { - "conversation_id": item.conversation_id, - "error": f"Unexpected result: success={success}, ids={created_memory_ids}", - "processing_time": time.time() - start_time, - }, - ) - - else: - audio_logger.warning(f"⚠️ Memory service returned False for conversation {item.conversation_id}") - - # Update database memory processing status to failed - await self._update_memory_status(item.conversation_id, "failed") - - # Track memory processing failure - self.track_processing_stage( - item.client_id, - "memory", - "failed", - { - "conversation_id": item.conversation_id, - "error": "Memory service returned False", - "processing_time": time.time() - start_time, - }, - ) - - except asyncio.TimeoutError: - audio_logger.error(f"Memory processing timed out for conversation {item.conversation_id}") - - # Update database memory processing status to failed - await self._update_memory_status(item.conversation_id, "failed") - - # Track memory processing timeout failure - self.track_processing_stage( - item.client_id, - "memory", - "failed", - { - "conversation_id": item.conversation_id, - "error": "Processing timeout (5 minutes)", - "processing_time": time.time() - start_time, - }, - ) - - except Exception as e: - audio_logger.error(f"Error processing memory for conversation {item.conversation_id}: {e}") - - # Update database memory processing status to failed - await self._update_memory_status(item.conversation_id, "failed") - - # Track memory processing exception failure - self.track_processing_stage( - item.client_id, - "memory", - "failed", - { - "conversation_id": item.conversation_id, - "error": f"Exception: {str(e)}", - 
"processing_time": time.time() - start_time, - }, - ) - - end_time = time.time() - processing_time_ms = (end_time - start_time) * 1000 - audio_logger.info( - f"🏁 MEMORY PROCESSING COMPLETED for conversation {item.conversation_id} in {processing_time_ms:.1f}ms (end time: {end_time})" - ) - - async def _cropping_processor(self): - """Process audio cropping requests.""" - audio_logger.info("Audio cropping processor started") - - try: - while not self.shutdown_flag: - try: - item = await asyncio.wait_for(self.cropping_queue.get(), timeout=30.0) - - if item is None: # Shutdown signal - self.cropping_queue.task_done() - break - - try: - # Create background task for cropping - task = asyncio.create_task( - _process_audio_cropping_with_relative_timestamps( - item.original_path, - item.speech_segments, - item.output_path, - item.audio_uuid, - self.repository, - ) - ) - - # Track task - task_name = f"cropping_{item.client_id}_{item.audio_uuid}" - actual_task_id = self.task_manager.track_task( - task, - task_name, - { - "client_id": item.client_id, - "audio_uuid": item.audio_uuid, - "type": "cropping", - "segments": len(item.speech_segments), - }, - ) - - # Register task with client for tracking (use the actual task_id from TaskManager) - self.track_processing_task( - item.client_id, - "cropping", - actual_task_id, - {"audio_uuid": item.audio_uuid, "segments": len(item.speech_segments)}, - ) - - audio_logger.info( - f"✂️ Queued audio cropping for {item.audio_uuid} " - f"with {len(item.speech_segments)} segments" - ) - - except Exception as e: - audio_logger.error( - f"Error queuing audio cropping for {item.audio_uuid}: {e}", - exc_info=True, - ) - finally: - self.cropping_queue.task_done() - - except asyncio.TimeoutError: - # Periodic health check - queue_size = self.cropping_queue.qsize() - audio_logger.debug(f"Cropping processor health: {queue_size} items in queue") - - except Exception as e: - audio_logger.error(f"Fatal error in cropping processor: {e}", exc_info=True) - 
finally: - audio_logger.info("Audio cropping processor stopped") - - -# Global processor manager instance -_processor_manager: Optional[ProcessorManager] = None - - -def init_processor_manager(chunk_dir: Path, db_helper: AudioChunksRepository): - """Initialize the global processor manager.""" - global _processor_manager - _processor_manager = ProcessorManager(chunk_dir, db_helper) - return _processor_manager - - -def get_processor_manager() -> ProcessorManager: - """Get the global processor manager instance.""" - if _processor_manager is None: - raise RuntimeError("ProcessorManager not initialized. Call init_processor_manager first.") - return _processor_manager diff --git a/backends/advanced/src/advanced_omi_backend/routers/api_router.py b/backends/advanced/src/advanced_omi_backend/routers/api_router.py index 4a6ab878..a510d396 100644 --- a/backends/advanced/src/advanced_omi_backend/routers/api_router.py +++ b/backends/advanced/src/advanced_omi_backend/routers/api_router.py @@ -10,13 +10,16 @@ from fastapi import APIRouter from .modules import ( + audio_router, chat_router, client_router, conversation_router, memory_router, + queue_router, system_router, user_router, ) +from .modules.health_routes import router as health_router logger = logging.getLogger(__name__) audio_logger = logging.getLogger("audio_processing") @@ -25,12 +28,15 @@ router = APIRouter(prefix="/api", tags=["api"]) # Include all sub-routers +router.include_router(audio_router) router.include_router(user_router) router.include_router(chat_router) router.include_router(client_router) router.include_router(conversation_router) router.include_router(memory_router) router.include_router(system_router) +router.include_router(queue_router) +router.include_router(health_router) # Also include under /api for frontend compatibility logger.info("API router initialized with all sub-modules") diff --git a/backends/advanced/src/advanced_omi_backend/routers/modules/__init__.py 
b/backends/advanced/src/advanced_omi_backend/routers/modules/__init__.py index 54fcf543..371fd38d 100644 --- a/backends/advanced/src/advanced_omi_backend/routers/modules/__init__.py +++ b/backends/advanced/src/advanced_omi_backend/routers/modules/__init__.py @@ -7,14 +7,33 @@ - client_routes: Active client monitoring and management - conversation_routes: Conversation CRUD and audio processing - memory_routes: Memory management, search, and debug -- system_routes: System utilities, metrics, and file processing +- system_routes: System utilities and metrics +- queue_routes: Job queue management and monitoring +- audio_routes: Audio file uploads and processing +- health_routes: Health check endpoints +- websocket_routes: WebSocket connection handling """ +from .audio_routes import router as audio_router from .chat_routes import router as chat_router from .client_routes import router as client_router from .conversation_routes import router as conversation_router +from .health_routes import router as health_router from .memory_routes import router as memory_router +from .queue_routes import router as queue_router from .system_routes import router as system_router from .user_routes import router as user_router +from .websocket_routes import router as websocket_router -__all__ = ["user_router", "chat_router", "client_router", "conversation_router", "memory_router", "system_router"] +__all__ = [ + "audio_router", + "chat_router", + "client_router", + "conversation_router", + "health_router", + "memory_router", + "queue_router", + "system_router", + "user_router", + "websocket_router", +] diff --git a/backends/advanced/src/advanced_omi_backend/routers/modules/audio_routes.py b/backends/advanced/src/advanced_omi_backend/routers/modules/audio_routes.py new file mode 100644 index 00000000..5fb24a1a --- /dev/null +++ b/backends/advanced/src/advanced_omi_backend/routers/modules/audio_routes.py @@ -0,0 +1,100 @@ +""" +Audio file upload and serving routes. 
+ +Handles audio file uploads, processing job management, and audio file serving. +""" + +from typing import Optional +from fastapi import APIRouter, Depends, File, HTTPException, Query, UploadFile +from fastapi.responses import FileResponse + +from advanced_omi_backend.auth import current_superuser, current_active_user_optional, get_user_from_token_param +from advanced_omi_backend.controllers import audio_controller +from advanced_omi_backend.models.user import User + +router = APIRouter(prefix="/audio", tags=["audio"]) + + +@router.get("/get_audio/{conversation_id}") +async def get_conversation_audio( + conversation_id: str, + cropped: bool = Query(default=False, description="Serve cropped (speech-only) audio instead of original"), + token: Optional[str] = Query(default=None, description="JWT token for audio element access"), + current_user: Optional[User] = Depends(current_active_user_optional), +): + """ + Serve audio file for a conversation. + + This endpoint uses conversation_id for direct lookup and ownership verification, + which is more efficient than querying by filename. + + Supports both header-based auth (Authorization: Bearer) and query param token + for