diff --git a/.coderabbit.yaml b/.coderabbit.yaml index 00f37c1..d9de751 100644 --- a/.coderabbit.yaml +++ b/.coderabbit.yaml @@ -1,12 +1,15 @@ # yaml-language-server: $schema=https://coderabbit.ai/integrations/schema.v2.json language: "ko-KR" reviews: - profile: "assertive" # 깐깐하게 로직 검토 - request_changes_workflow: false # AI가 승인을 막지 않도록 설정 + profile: "assertive" + request_changes_workflow: false high_level_summary: true review_status: true review_details: false - poem: false # 불필요한 기능 제거 + poem: false + pre_merge_checks: + docstrings: + mode: "off" auto_review: enabled: true drafts: false @@ -14,4 +17,4 @@ reviews: - "develop" - "main" chat: - auto_reply: true \ No newline at end of file + auto_reply: true diff --git a/.github/PULL_REQUEST_TEMPLATE/release.md b/.github/PULL_REQUEST_TEMPLATE/release.md new file mode 100644 index 0000000..b8cff89 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE/release.md @@ -0,0 +1,31 @@ +## 📌 작업 요약 + +- 요약: + - develop 브랜치의 누적 변경사항을 main으로 릴리즈 배포 +- 관련 이슈: closes # + +## 🌿 브랜치 정보 + +- **Source**: `develop` (기본) +- **Target**: `main` (릴리즈) + +## ✅ 체크리스트 + +- [ ] 브랜치 컨벤션 준수 (`feat/refac/hotfix/chore/design/bugfix`) +- [ ] 커밋 컨벤션 준수 (`feat/fix/refactor/docs/style/chore`) +- [ ] self-review 완료 +- [ ] 테스트 및 로컬 실행 확인 완료 + +## 🧪 테스트 결과 + +- GitHub Actions `Deploy AI to EC2` 실행 확인 (`workflow_dispatch`, ref: `main`) + - 결과: + - 스크린샷: ![ssm-send-step]() + +- 원격 배포 순서/재기동 확인 + - 결과: + - 스크린샷: ![ssm-order]() + +- 배포 후 컨테이너 상태 확인 + - 결과: + - 스크린샷: ![compose-ps]() diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 1939f6d..2ce0da1 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -1,24 +1,20 @@ ## 📌 작업 요약 -- 요약: + +- 요약: - 관련 이슈: closes # ## 🌿 브랜치 정보 + - **Source**: `feat/#이슈번호-기능명` - **Target**: `develop` (기본) / `main` (릴리즈, 핫픽스) -## 🧩 변경 타입 -- [ ] feat: 새로운 기능 추가 -- [ ] fix: 버그 수정 -- [ ] refactor: 코드 리팩토링 -- [ ] docs: 문서 수정 -- [ ] style: 코드 포맷팅, 세미콜론 누락 등 -- [ ] chore: 빌드 업무, 패키지 매니저 설정 등 - ## ✅ 체크리스트 + - [ ] 브랜치 컨벤션 준수 (`feat/refac/hotfix/chore/design/bugfix`) - [ ] 커밋 컨벤션 준수 (`feat/fix/refactor/docs/style/chore`) - [ ] self-review 완료 - [ ] 테스트 및 로컬 실행 확인 완료 ## 🧪 테스트 결과 -- (테스트 코드 실행 결과 스크린샷이나 로그, 또는 테스트 방법) \ No newline at end of file + +- (테스트 코드 실행 결과 스크린샷이나 로그, 또는 테스트 방법) diff --git a/.github/workflows/deploy-ai-ec2.yml b/.github/workflows/deploy-ai-ec2.yml new file mode 100644 index 0000000..83c462c --- /dev/null +++ b/.github/workflows/deploy-ai-ec2.yml @@ -0,0 +1,203 @@ +name: Deploy AI to EC2 + +"on": + workflow_run: + workflows: ["AI Docker CI"] + types: [completed] + branches: ["main"] + workflow_dispatch: + +permissions: + contents: read + id-token: write + +env: + FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true + +jobs: + deploy: + if: github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success' + runs-on: ubuntu-latest + concurrency: + group: deploy-ai-ec2-ssm + cancel-in-progress: false + + steps: + - name: Resolve deploy path + id: deploy-path + env: + EC2_DEPLOY_PATH_SECRET: ${{ secrets.EC2_DEPLOY_PATH }} + run: | + VALUE="${EC2_DEPLOY_PATH_SECRET:-}" + VALUE="${VALUE%$'\r'}" + VALUE="$(printf '%s' "$VALUE" | sed 's/[[:space:]]*$//')" + if [ -n "$VALUE" ]; then + echo "value=$VALUE" >> "$GITHUB_OUTPUT" + else + echo "value=/home/ubuntu/GACHI-BE/deploy" >> "$GITHUB_OUTPUT" + fi + + - name: Resolve AWS region + id: aws-region + env: + AWS_REGION_SECRET: ${{ secrets.AWS_REGION }} + run: | + VALUE="${AWS_REGION_SECRET:-}" + VALUE="${VALUE%$'\r'}" + VALUE="$(printf '%s' "$VALUE" | sed 's/[[:space:]]*$//')" + if [ -n "$VALUE" ]; then + echo "value=$VALUE" >> "$GITHUB_OUTPUT" + else + echo "value=ap-northeast-2" >> "$GITHUB_OUTPUT" + fi + + - name: Validate deploy inputs + id: auth-check + env: + EC2_INSTANCE_ID: ${{ secrets.EC2_INSTANCE_ID }} + AWS_OIDC_ROLE_ARN: ${{ secrets.AWS_OIDC_ROLE_ARN }} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_SESSION_TOKEN: ${{ secrets.AWS_SESSION_TOKEN }} + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} + run: | + set -Eeuo pipefail + if [ -z "${EC2_INSTANCE_ID:-}" ]; then + echo "::error::EC2_INSTANCE_ID secret is required." + exit 1 + fi + if [ -z "${DOCKERHUB_USERNAME:-}" ]; then + echo "::error::DOCKERHUB_USERNAME secret is required." + exit 1 + fi + if [ -n "${AWS_OIDC_ROLE_ARN:-}" ]; then + echo "auth_mode=oidc" >> "$GITHUB_OUTPUT" + echo "has_session_token=false" >> "$GITHUB_OUTPUT" + else + if [ -z "${AWS_ACCESS_KEY_ID:-}" ] || [ -z "${AWS_SECRET_ACCESS_KEY:-}" ]; then + echo "::error::Set AWS_OIDC_ROLE_ARN or AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY." + exit 1 + fi + echo "auth_mode=access_key" >> "$GITHUB_OUTPUT" + if [ -n "${AWS_SESSION_TOKEN:-}" ]; then + echo "has_session_token=true" >> "$GITHUB_OUTPUT" + else + echo "has_session_token=false" >> "$GITHUB_OUTPUT" + fi + fi + + - name: Configure AWS credentials with OIDC + if: ${{ steps.auth-check.outputs.auth_mode == 'oidc' }} + uses: aws-actions/configure-aws-credentials@v6 + with: + aws-region: ${{ steps.aws-region.outputs.value }} + role-to-assume: ${{ secrets.AWS_OIDC_ROLE_ARN }} + role-session-name: gachi-ai-deploy-ssm + + - name: Configure AWS credentials with access key + if: ${{ steps.auth-check.outputs.auth_mode == 'access_key' && steps.auth-check.outputs.has_session_token != 'true' }} + uses: aws-actions/configure-aws-credentials@v6 + with: + aws-region: ${{ steps.aws-region.outputs.value }} + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + + - name: Configure AWS credentials with access key session + if: ${{ steps.auth-check.outputs.auth_mode == 'access_key' && steps.auth-check.outputs.has_session_token == 'true' }} + uses: aws-actions/configure-aws-credentials@v6 + with: + aws-region: ${{ steps.aws-region.outputs.value }} + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-session-token: ${{ secrets.AWS_SESSION_TOKEN }} + + - name: Send AI deploy command via SSM + id: ssm-send + env: + AWS_REGION: ${{ steps.aws-region.outputs.value }} + EC2_INSTANCE_ID: ${{ secrets.EC2_INSTANCE_ID }} + EC2_DEPLOY_PATH: ${{ steps.deploy-path.outputs.value }} + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} + run: | + set -Eeuo pipefail + COMMANDS="$(jq -cn \ + --arg deployPath "$EC2_DEPLOY_PATH" \ + --arg image "$DOCKERHUB_USERNAME/gachi-ai:latest" \ + '{ + commands: [[ + "set -Eeuo pipefail", + "cd " + ($deployPath | @sh), + "test -f .env", + "if grep -q ^AI_IMAGE= .env; then sed -i " + ("s|^AI_IMAGE=.*|AI_IMAGE=" + $image + "|" | @sh) + " .env; else echo " + ("AI_IMAGE=" + $image | @sh) + " >> .env; fi", + "docker compose --env-file .env pull ai", + "docker compose --env-file .env up -d --remove-orphans --force-recreate ai", + "AI_CID=$(docker compose --env-file .env ps -q ai)", + "test -n \"$AI_CID\"", + "for i in $(seq 1 24); do AI_HEALTH=$(docker inspect --format '\''{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'\'' \"$AI_CID\" 2>/dev/null || echo missing); echo \"ai health: $AI_HEALTH\"; [ \"$AI_HEALTH\" = healthy ] && break; sleep 5; done", + "test \"$AI_HEALTH\" = healthy", + "docker compose --env-file .env up -d --remove-orphans --force-recreate --no-deps nginx", + "docker compose --env-file .env ps", + "docker compose --env-file .env logs --tail=80 ai nginx || true" + ] | join(" && ")], + executionTimeout: ["900"] + }')" + COMMAND_ID="$(aws ssm send-command \ + --region "$AWS_REGION" \ + --document-name "AWS-RunShellScript" \ + --instance-ids "$EC2_INSTANCE_ID" \ + --comment "deploy gachi-ai (run=${GITHUB_RUN_ID})" \ + --max-concurrency "1" \ + --max-errors "0" \ + --parameters "$COMMANDS" \ + --query "Command.CommandId" \ + --output text)" + echo "command_id=$COMMAND_ID" >> "$GITHUB_OUTPUT" + + - name: Wait for SSM command completion + env: + AWS_REGION: ${{ steps.aws-region.outputs.value }} + EC2_INSTANCE_ID: ${{ secrets.EC2_INSTANCE_ID }} + COMMAND_ID: ${{ steps.ssm-send.outputs.command_id }} + run: | + set -Eeuo pipefail + for attempt in $(seq 1 90); do + STATUS="$(aws ssm get-command-invocation \ + --region "$AWS_REGION" \ + --command-id "$COMMAND_ID" \ + --instance-id "$EC2_INSTANCE_ID" \ + --query "Status" \ + --output text 2>/dev/null || true)" + echo "[$attempt/90] status: ${STATUS:-not-ready}" + case "$STATUS" in + Success|Failed|TimedOut|Cancelled|Cancelling) + break + ;; + *) + sleep 10 + ;; + esac + done + + INVOCATION_JSON="$(aws ssm get-command-invocation \ + --region "$AWS_REGION" \ + --command-id "$COMMAND_ID" \ + --instance-id "$EC2_INSTANCE_ID" \ + --output json)" + FINAL_STATUS="$(printf '%s' "$INVOCATION_JSON" | jq -r '.Status // "Unknown"')" + STDOUT_CONTENT="$(printf '%s' "$INVOCATION_JSON" | jq -r '.StandardOutputContent // ""')" + STDERR_CONTENT="$(printf '%s' "$INVOCATION_JSON" | jq -r '.StandardErrorContent // ""')" + + echo "::group::SSM stdout" + printf '%s\n' "$STDOUT_CONTENT" + echo "::endgroup::" + + if [ -n "$STDERR_CONTENT" ]; then + echo "::group::SSM stderr" + printf '%s\n' "$STDERR_CONTENT" + echo "::endgroup::" + fi + + if [ "$FINAL_STATUS" != "Success" ]; then + echo "::error::SSM command failed. status=$FINAL_STATUS" + exit 1 + fi diff --git a/.github/workflows/docker-ai.yml b/.github/workflows/docker-ai.yml index f30c075..b139edb 100644 --- a/.github/workflows/docker-ai.yml +++ b/.github/workflows/docker-ai.yml @@ -1,8 +1,10 @@ name: AI Docker CI -on: +"on": + pull_request: + branches: ["develop", "main"] push: - branches: [ "develop", "main" ] + branches: ["develop", "main"] workflow_dispatch: permissions: @@ -12,7 +14,7 @@ env: FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true jobs: - build-and-push: + build: runs-on: ubuntu-latest steps: @@ -24,13 +26,16 @@ jobs: with: python-version: "3.11" - - name: Install deps - run: pip install -r requirements.txt + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt - - name: Syntax check - run: python -m py_compile app/main.py + - name: Compile app + run: python -m compileall app - name: Login Docker Hub + if: (github.event_name == 'push' || github.event_name == 'workflow_dispatch') && (github.ref_name == 'main' || github.ref_name == 'develop') uses: docker/login-action@v4 with: username: ${{ secrets.DOCKERHUB_USERNAME }} @@ -39,18 +44,34 @@ jobs: - name: Set image tags id: tags run: | - IMAGE="${{ secrets.DOCKERHUB_USERNAME }}/gachi-ai" SHORT_SHA="${GITHUB_SHA::7}" - if [ "${GITHUB_REF_NAME}" = "main" ]; then - echo "tags=${IMAGE}:latest,${IMAGE}:sha-${SHORT_SHA}" >> $GITHUB_OUTPUT + if [ "${GITHUB_EVENT_NAME}" = "pull_request" ]; then + echo "tags=gachi-ai:pr-${{ github.event.pull_request.number }}" >> "$GITHUB_OUTPUT" + elif [ "${GITHUB_REF_NAME}" = "main" ]; then + IMAGE="${{ secrets.DOCKERHUB_USERNAME }}/gachi-ai" + echo "tags=${IMAGE}:latest,${IMAGE}:sha-${SHORT_SHA}" >> "$GITHUB_OUTPUT" + elif [ "${GITHUB_REF_NAME}" = "develop" ]; then + IMAGE="${{ secrets.DOCKERHUB_USERNAME }}/gachi-ai" + echo "tags=${IMAGE}:develop,${IMAGE}:sha-${SHORT_SHA}" >> "$GITHUB_OUTPUT" else - echo "tags=${IMAGE}:develop,${IMAGE}:sha-${SHORT_SHA}" >> $GITHUB_OUTPUT + SAFE_REF="$(echo "${GITHUB_REF_NAME}" | tr '/' '-')" + echo "tags=gachi-ai:${SAFE_REF},gachi-ai:sha-${SHORT_SHA}" >> "$GITHUB_OUTPUT" fi - - name: Build and Push + - name: Build image without push + if: github.event_name == 'pull_request' || (github.event_name == 'workflow_dispatch' && github.ref_name != 'main' && github.ref_name != 'develop') + uses: docker/build-push-action@v7 + with: + context: . + push: false + tags: ${{ steps.tags.outputs.tags }} + platforms: linux/amd64 + + - name: Build and push image + if: (github.event_name == 'push' || github.event_name == 'workflow_dispatch') && (github.ref_name == 'main' || github.ref_name == 'develop') uses: docker/build-push-action@v7 with: context: . push: true tags: ${{ steps.tags.outputs.tags }} - platforms: linux/amd64 \ No newline at end of file + platforms: linux/amd64 diff --git a/.github/workflows/quality-ai.yml b/.github/workflows/quality-ai.yml index d22f745..842aeec 100644 --- a/.github/workflows/quality-ai.yml +++ b/.github/workflows/quality-ai.yml @@ -1,10 +1,10 @@ name: AI Quality -on: +"on": pull_request: - branches: [ "develop", "main" ] + branches: ["develop", "main"] push: - branches: [ "develop", "main" ] + branches: ["develop", "main"] permissions: contents: read @@ -13,8 +13,9 @@ env: FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true jobs: - ruff: + test: runs-on: ubuntu-latest + steps: - name: Checkout uses: actions/checkout@v6 @@ -24,11 +25,25 @@ jobs: with: python-version: "3.11" - - name: Install Ruff - run: pip install ruff + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + pip install ruff pytest - name: Ruff check run: ruff check . - name: Ruff format check run: ruff format --check . + + - name: Compile app + run: python -m compileall app + + - name: Test if present + run: | + if [ -d tests ]; then + pytest + else + echo "tests 디렉터리가 없어 pytest를 건너뜁니다." + fi diff --git a/README.md b/README.md index 9f97f63..ee0601a 100644 --- a/README.md +++ b/README.md @@ -1,18 +1,45 @@ # GACHI-AI -GACHI 프로젝트 AI 서버(FastAPI) 레포지토리입니다. +GACHI 프로젝트의 AI 서버입니다. 백엔드와 분리된 FastAPI 애플리케이션으로 운영하며, 기존 EC2 `docker-compose`의 `ai` 서비스로 배포합니다. -## 문서 -- `docs/env.md`: 환경 변수 가이드 -- `docs/deploy.md`: 이미지 태그/배포 가이드 +## 역할 + +- 가정통신문 원문과 날짜 후보를 기반으로 일정, 마감, 체크리스트, 알림 항목을 추출합니다. +- OpenAI API 호출 전에도 검증할 수 있도록 비용 없는 rule-based baseline을 제공합니다. +- 실제 LLM에 전달할 prompt-preview API를 제공합니다. + +## 로컬 실행 + +```bash +python -m venv .venv +source .venv/bin/activate +pip install -r requirements.txt +uvicorn app.main:app --reload --host 0.0.0.0 --port 8000 +``` + +Windows PowerShell에서는 다음처럼 가상환경을 활성화합니다. + +```powershell +.\.venv\Scripts\Activate.ps1 +``` + +## 주요 엔드포인트 + +- `GET /ai/health`: 헬스체크 +- `GET /ai/docs`: Swagger UI +- `POST /ai/newsletters/extract-items`: 날짜 후보 기반 baseline 추출 +- `POST /ai/newsletters/prompt-preview`: LLM 입력용 prompt와 response schema 미리보기 + +## 작업 규칙 -## 협업 규칙 - 기본 브랜치: `develop` -- 브랜치 전략: `feat/xx`, `refac/xx`, `hotfix/xx`, `chore/xx`, `design/xx`, `bugfix/xx` +- 브랜치 예시: `feat/#1-feature-name`, `chore/#1-ci-cd-setup` - 커밋 타입: `feat`, `fix`, `refactor`, `docs`, `style`, `chore` -- `main`, `develop` 직접 push 금지, PR 승인 후 머지 -- CI 체크(`build-and-push`) 통과 후 머지 +- `main`, `develop` 직접 커밋은 피하고 PR로 병합합니다. + +## 문서 -## 배포 태그 규칙 -- `develop` push: `/gachi-ai:develop` -- `main` push: `/gachi-ai:latest` +- `docs/env.md`: 환경변수 +- `docs/deploy.md`: Docker image와 EC2 배포 방식 +- `docs/newsletter-extraction.md`: 가정통신문 추출 API와 프롬프트 흐름 +- `docs/newsletter-labeling-guide.md`: 정답 데이터 라벨링 기준 diff --git a/app/main.py b/app/main.py index ecadd8e..e535acd 100644 --- a/app/main.py +++ b/app/main.py @@ -1,5 +1,6 @@ from fastapi import FastAPI -from pydantic import BaseModel + +from app.routers import health, newsletters app = FastAPI( title="GACHI-AI", @@ -9,21 +10,5 @@ openapi_url="/ai/openapi.json", ) - -class EchoRequest(BaseModel): - text: str - - -@app.get("/ai/health") -def health() -> dict: - return {"status": "ok"} - - -@app.get("/ai/ping") -def ping() -> dict: - return {"message": "pong"} - - -@app.post("/ai/echo") -def echo(req: EchoRequest) -> dict: - return {"text": req.text} +app.include_router(health.router) +app.include_router(newsletters.router) diff --git a/app/routers/health.py b/app/routers/health.py new file mode 100644 index 0000000..25bdb9b --- /dev/null +++ b/app/routers/health.py @@ -0,0 +1,13 @@ +from fastapi import APIRouter + +router = APIRouter(prefix="/ai", tags=["health"]) + + +@router.get("/health") +def health() -> dict[str, str]: + return {"status": "ok"} + + +@router.get("/ping") +def ping() -> dict[str, str]: + return {"message": "pong"} diff --git a/app/routers/newsletters.py b/app/routers/newsletters.py new file mode 100644 index 0000000..6c34554 --- /dev/null +++ b/app/routers/newsletters.py @@ -0,0 +1,22 @@ +from fastapi import APIRouter + +from app.schemas import ( + NewsletterExtractionRequest, + NewsletterExtractionResponse, + PromptPreviewResponse, +) +from app.services.newsletter_extractor import extract_newsletter_items +from app.services.newsletter_prompt import EXTRACTION_RESPONSE_SCHEMA, build_prompt_messages + +router = APIRouter(prefix="/ai/newsletters", tags=["newsletters"]) + + +@router.post("/extract-items", response_model=NewsletterExtractionResponse) +def extract_items(req: NewsletterExtractionRequest) -> NewsletterExtractionResponse: + return extract_newsletter_items(req) + + +@router.post("/prompt-preview", response_model=PromptPreviewResponse) +def prompt_preview(req: NewsletterExtractionRequest) -> PromptPreviewResponse: + messages = build_prompt_messages(req) + return PromptPreviewResponse(messages=messages, responseSchema=EXTRACTION_RESPONSE_SCHEMA) diff --git a/app/schemas.py b/app/schemas.py new file mode 100644 index 0000000..867c90c --- /dev/null +++ b/app/schemas.py @@ -0,0 +1,89 @@ +from datetime import date +from enum import StrEnum +from typing import Any + +from pydantic import BaseModel, ConfigDict, Field, model_validator + + +class ExtractedItemType(StrEnum): + SCHEDULE = "schedule" + DEADLINE = "deadline" + CHECKLIST = "checklist" + REMINDER = "reminder" + + +class DateStatus(StrEnum): + CONFIRMED = "confirmed" + AMBIGUOUS = "ambiguous" + MISSING = "missing" + + +class DateCandidate(BaseModel): + model_config = ConfigDict(populate_by_name=True) + + candidate_id: str | None = Field(default=None, alias="candidateId") + original_text: str = Field(alias="originalText") + normalized_date: date = Field(alias="normalizedDate") + start_offset: int = Field(alias="startOffset", ge=0) + end_offset: int = Field(alias="endOffset", ge=0) + extraction_type: str | None = Field(default=None, alias="extractionType") + + @model_validator(mode="after") + def validate_offsets(self) -> "DateCandidate": + if self.end_offset < self.start_offset: + raise ValueError("endOffset must be greater than or equal to startOffset") + return self + + +class NewsletterExtractionRequest(BaseModel): + model_config = ConfigDict(populate_by_name=True) + + original_text: str = Field(alias="originalText") + translated_text: str | None = Field(default=None, alias="translatedText") + language: str = "KO" + reference_date: date | None = Field(default=None, alias="referenceDate") + timezone: str = "Asia/Seoul" + date_candidates: list[DateCandidate] = Field(default_factory=list, alias="dateCandidates") + + +class SelectedDateCandidate(BaseModel): + model_config = ConfigDict(populate_by_name=True) + + index: int + candidate_id: str | None = Field(default=None, alias="candidateId") + original_text: str = Field(alias="originalText") + normalized_date: date = Field(alias="normalizedDate") + + +class ExtractedItem(BaseModel): + model_config = ConfigDict(populate_by_name=True) + + type: ExtractedItemType + title: str + selected_date_candidate: SelectedDateCandidate | None = Field( + default=None, alias="selectedDateCandidate" + ) + date_status: DateStatus = Field(alias="dateStatus") + datetime: str | None = None + timezone: str + evidence_text: str = Field(alias="evidenceText") + confidence: float = Field(ge=0.0, le=1.0) + needs_user_confirmation: bool = Field(alias="needsUserConfirmation") + confirmation_question: str | None = Field(default=None, alias="confirmationQuestion") + + +class NewsletterExtractionResponse(BaseModel): + items: list[ExtractedItem] + meta: dict[str, Any] = Field(default_factory=dict) + + +class PromptMessage(BaseModel): + role: str + content: str + + +class PromptPreviewResponse(BaseModel): + model_config = ConfigDict(populate_by_name=True) + + messages: list[PromptMessage] + response_schema: dict[str, Any] = Field(alias="responseSchema") diff --git a/app/services/newsletter_extractor.py b/app/services/newsletter_extractor.py new file mode 100644 index 0000000..7899eb4 --- /dev/null +++ b/app/services/newsletter_extractor.py @@ -0,0 +1,195 @@ +import re +from collections.abc import Iterable + +from app.schemas import ( + DateCandidate, + DateStatus, + ExtractedItem, + ExtractedItemType, + NewsletterExtractionRequest, + NewsletterExtractionResponse, + SelectedDateCandidate, +) + +DEADLINE_KEYWORDS = ( + "마감", + "까지", + "제출", + "신청", + "접수", + "납부", + "등록", + "동의서", + "회신", +) +SCHEDULE_KEYWORDS = ( + "일정", + "행사", + "체험", + "상담", + "설명회", + "교육", + "운영", + "참여", + "개최", + "방문", + "학습", +) +CHECKLIST_KEYWORDS = ( + "준비물", + "지참", + "가져", + "챙겨", + "확인", + "작성", + "서명", + "제출", +) + + +def extract_newsletter_items( + request: NewsletterExtractionRequest, +) -> NewsletterExtractionResponse: + text = request.original_text or "" + items = _extract_candidate_backed_items(text, request) + items.extend(_extract_missing_date_checklists(text, request)) + return NewsletterExtractionResponse( + items=_dedupe_items(items), + meta={ + "mode": "rule_based_baseline", + "dateCandidateCount": len(request.date_candidates), + "requiresLLMReview": True, + }, + ) + + +def _extract_candidate_backed_items( + text: str, + request: NewsletterExtractionRequest, +) -> list[ExtractedItem]: + items = [] + for index, candidate in enumerate(request.date_candidates): + evidence = _evidence_window(text, candidate) + item_type = _classify_item_type(evidence) + selected = SelectedDateCandidate( + index=index, + candidateId=candidate.candidate_id, + originalText=candidate.original_text, + normalizedDate=candidate.normalized_date, + ) + items.append( + ExtractedItem( + type=item_type, + title=_build_title(evidence, item_type), + selectedDateCandidate=selected, + dateStatus=DateStatus.CONFIRMED, + datetime=candidate.normalized_date.isoformat(), + timezone=request.timezone, + evidenceText=evidence, + confidence=_confidence_for(item_type), + needsUserConfirmation=False, + confirmationQuestion=None, + ) + ) + return items + + +def _extract_missing_date_checklists( + text: str, + request: NewsletterExtractionRequest, +) -> list[ExtractedItem]: + items = [] + for sentence in _split_sentences(text): + if not _contains_any(sentence, CHECKLIST_KEYWORDS): + continue + if _overlaps_any_candidate(sentence, request.date_candidates): + continue + items.append( + ExtractedItem( + type=ExtractedItemType.CHECKLIST, + title=_compact_title(sentence), + selectedDateCandidate=None, + dateStatus=DateStatus.MISSING, + datetime=None, + timezone=request.timezone, + evidenceText=sentence, + confidence=0.55, + needsUserConfirmation=True, + confirmationQuestion="이 항목을 체크리스트에 추가할까요?", + ) + ) + return items + + +def _classify_item_type(evidence: str) -> ExtractedItemType: + if _contains_any(evidence, DEADLINE_KEYWORDS): + return ExtractedItemType.DEADLINE + if _contains_any(evidence, SCHEDULE_KEYWORDS): + return ExtractedItemType.SCHEDULE + if _contains_any(evidence, CHECKLIST_KEYWORDS): + return ExtractedItemType.CHECKLIST + return ExtractedItemType.REMINDER + + +def _build_title(evidence: str, item_type: ExtractedItemType) -> str: + title = _compact_title(evidence) + if item_type == ExtractedItemType.DEADLINE and "마감" not in title: + return f"{title} 마감" + return title + + +def _compact_title(text: str) -> str: + title = re.sub(r"\s+", " ", text).strip(" \n\t-::") + if len(title) <= 40: + return title + return title[:39].rstrip() + "..." + + +def _evidence_window(text: str, candidate: DateCandidate) -> str: + if not text: + return candidate.original_text + + start = max(candidate.start_offset - 45, 0) + end = min(candidate.end_offset + 70, len(text)) + window = text[start:end] + left_break = max(window.rfind("\n", 0, candidate.start_offset - start), 0) + right_break = window.find("\n", candidate.end_offset - start) + if right_break == -1: + right_break = len(window) + evidence = window[left_break:right_break] + return re.sub(r"\s+", " ", evidence).strip() or candidate.original_text + + +def _split_sentences(text: str) -> Iterable[str]: + for part in re.split(r"[\n.!?。]+", text): + sentence = re.sub(r"\s+", " ", part).strip(" -::") + if sentence: + yield sentence + + +def _overlaps_any_candidate(sentence: str, candidates: list[DateCandidate]) -> bool: + return any(candidate.original_text in sentence for candidate in candidates) + + +def _contains_any(text: str, keywords: tuple[str, ...]) -> bool: + return any(keyword in text for keyword in keywords) + + +def _confidence_for(item_type: ExtractedItemType) -> float: + if item_type in (ExtractedItemType.DEADLINE, ExtractedItemType.SCHEDULE): + return 0.82 + if item_type == ExtractedItemType.CHECKLIST: + return 0.74 + return 0.62 + + +def _dedupe_items(items: list[ExtractedItem]) -> list[ExtractedItem]: + seen = set() + result = [] + for item in items: + key = (item.type, item.datetime, item.evidence_text) + if key in seen: + continue + seen.add(key) + result.append(item) + return result diff --git a/app/services/newsletter_prompt.py b/app/services/newsletter_prompt.py new file mode 100644 index 0000000..f35be02 --- /dev/null +++ b/app/services/newsletter_prompt.py @@ -0,0 +1,111 @@ +from app.schemas import NewsletterExtractionRequest + +EXTRACTION_RESPONSE_SCHEMA = { + "type": "object", + "additionalProperties": False, + "required": ["items"], + "properties": { + "items": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": False, + "required": [ + "type", + "title", + "selectedDateCandidate", + "dateStatus", + "datetime", + "timezone", + "evidenceText", + "confidence", + "needsUserConfirmation", + "confirmationQuestion", + ], + "properties": { + "type": { + "type": "string", + "enum": ["schedule", "deadline", "checklist", "reminder"], + }, + "title": {"type": "string"}, + "selectedDateCandidate": {"type": ["object", "null"]}, + "dateStatus": { + "type": "string", + "enum": ["confirmed", "ambiguous", "missing"], + }, + "datetime": {"type": ["string", "null"]}, + "timezone": {"type": "string"}, + "evidenceText": {"type": "string"}, + "confidence": {"type": "number", "minimum": 0, "maximum": 1}, + "needsUserConfirmation": {"type": "boolean"}, + "confirmationQuestion": {"type": ["string", "null"]}, + }, + }, + } + }, +} + + +def build_prompt_messages(request: NewsletterExtractionRequest) -> list[dict[str, str]]: + return [ + {"role": "system", "content": _build_system_prompt()}, + {"role": "user", "content": _build_user_prompt(request)}, + ] + + +def _build_system_prompt() -> str: + return """ +역할: 학교 가정통신문에서 캘린더, 알림, 체크리스트로 만들 항목을 추출한다. + +핵심 규칙: +- 구체적인 날짜는 반드시 제공된 date candidates 중 하나만 선택할 것. +- date candidates에 없는 날짜를 새로 만들거나 추론하지 말 것. +- 날짜 근거가 명확할 때만 dateStatus를 "confirmed"로 설정할 것. +- 날짜 후보가 없거나 근거가 약하면 "ambiguous" 또는 "missing"을 사용할 것. +- evidenceText는 원문에서 짧게 가져올 것. +- response schema에 맞는 JSON만 반환할 것. + +항목 분류 기준: +- deadline: 제출, 신청, 납부, 등록, 동의, 회신, 마감 행동 +- schedule: 행사, 수업, 상담, 체험학습, 설명회, 운영일 +- checklist: 준비물, 지참물, 확인 문서, 보호자나 학생이 해야 할 행동 +- reminder: deadline이나 schedule은 아니지만 알림으로 보여줄 가치가 있는 항목 +""".strip() + + +def _build_user_prompt(request: NewsletterExtractionRequest) -> str: + translated_text = request.translated_text.strip() if request.translated_text else "" + reference_date = request.reference_date.isoformat() if request.reference_date else "null" + sections = [ + f"referenceDate: {reference_date}", + f"timezone: {request.timezone}", + f"language: {request.language}", + "", + "", + _format_candidates(request), + "", + "", + "", + request.original_text.strip(), + "", + ] + if translated_text: + sections.extend(["", "", translated_text, ""]) + return "\n".join(sections) + + +def _format_candidates(request: NewsletterExtractionRequest) -> str: + if not request.date_candidates: + return "[]" + + lines = [] + for index, candidate in enumerate(request.date_candidates): + lines.append( + f"- index: {index}, " + f"candidateId: {candidate.candidate_id or 'null'}, " + f"originalText: {candidate.original_text}, " + f"normalizedDate: {candidate.normalized_date.isoformat()}, " + f"startOffset: {candidate.start_offset}, " + f"endOffset: {candidate.end_offset}" + ) + return "\n".join(lines) diff --git a/docs/deploy.md b/docs/deploy.md index db88849..9d86667 100644 --- a/docs/deploy.md +++ b/docs/deploy.md @@ -1,42 +1,43 @@ -# GACHI-AI Deploy Guide +# GACHI-AI 배포 가이드 -## 1) 이미지 태그 규칙 -- `develop` 브랜치 push: `/gachi-ai:develop`, `sha-<7자리>` -- `main` 브랜치 push: `/gachi-ai:latest`, `sha-<7자리>` +## Docker image 태그 + +- `develop` push: `/gachi-ai:develop`, `/gachi-ai:sha-xxxxxxx` +- `main` push: `/gachi-ai:latest`, `/gachi-ai:sha-xxxxxxx` + +## GitHub Actions secrets + +Docker image build/push에 필요합니다. -## 2) GitHub Actions 필수 시크릿 - `DOCKERHUB_USERNAME` - `DOCKERHUB_TOKEN` -## 3) EC2 반영 (BE compose에서 함께 기동) +EC2 배포에 필요합니다. -### develop 반영 -```bash -cd ~/GACHI-BE/deploy -sed -i 's|^AI_IMAGE=.*|AI_IMAGE=/gachi-ai:develop|' .env -docker compose --env-file .env pull ai -docker compose --env-file .env up -d --force-recreate ai nginx -``` +- `EC2_INSTANCE_ID` +- `AWS_REGION` +- `AWS_OIDC_ROLE_ARN` 또는 `AWS_ACCESS_KEY_ID` / `AWS_SECRET_ACCESS_KEY` +- 선택: `AWS_SESSION_TOKEN` +- 선택: `EC2_DEPLOY_PATH` 기본값은 `/home/ubuntu/GACHI-BE/deploy` -### main 반영 -```bash -cd ~/GACHI-BE/deploy -sed -i 's|^AI_IMAGE=.*|AI_IMAGE=/gachi-ai:latest|' .env -docker compose --env-file .env pull ai -docker compose --env-file .env up -d --force-recreate ai nginx -``` +## 배포 방식 + +AI 서버는 별도 EC2를 만들지 않고 기존 백엔드 EC2의 compose 파일에 정의된 `ai` 서비스로 배포합니다. + +`main` push에서 `AI Docker CI`가 성공하거나, `workflow_dispatch`로 `deploy-ai-ec2.yml`을 직접 실행하면 다음 작업을 수행합니다. + +1. EC2의 deploy path로 이동 +2. `.env`의 `AI_IMAGE`를 `/gachi-ai:latest`로 갱신 +3. `docker compose --env-file .env pull ai` +4. `ai` 컨테이너 재생성 +5. `ai` health check 확인 +6. 필요 시 `nginx` 재생성 + +## 수동 확인 -## 4) 상태 확인 ```bash +cd /home/ubuntu/GACHI-BE/deploy docker compose --env-file .env ps +curl -i http://localhost:8000/ai/health curl -i http://localhost/ai/health -curl -i http://localhost/ai/docs -``` - -## 5) 롤백 -```bash -cd ~/GACHI-BE/deploy -sed -i 's|^AI_IMAGE=.*|AI_IMAGE=/gachi-ai:sha-<원하는태그>|' .env -docker compose --env-file .env pull ai -docker compose --env-file .env up -d --force-recreate ai nginx ``` diff --git a/docs/env.md b/docs/env.md index 3b19f18..8cc1b13 100644 --- a/docs/env.md +++ b/docs/env.md @@ -1,7 +1,13 @@ -# AI Environment Variables +# AI 서버 환경변수 -## Required -- `OPENAI_API_KEY`: LLM API key +## 필수 -## Optional -- `LOG_LEVEL`: default `INFO` \ No newline at end of file +- `OPENAI_API_KEY`: 추후 LLM API 호출을 붙일 때 사용할 OpenAI API key + +## 선택 + +- `LOG_LEVEL`: 로그 레벨. 기본값은 `INFO` + +## 현재 상태 + +현재 구현은 OpenAI API를 직접 호출하지 않습니다. `OPENAI_API_KEY`는 기존 EC2 compose 환경과 향후 LLM client 연결을 고려해 유지합니다. diff --git a/docs/newsletter-extraction.md b/docs/newsletter-extraction.md new file mode 100644 index 0000000..b626a5a --- /dev/null +++ b/docs/newsletter-extraction.md @@ -0,0 +1,55 @@ +# 가정통신문 항목 추출 설계 + +## 목적 + +가정통신문 본문에서 일정, 마감, 체크리스트, 알림 항목을 추출합니다. + +핵심 원칙은 날짜를 AI가 새로 만들지 않게 하는 것입니다. 구체적인 날짜는 백엔드나 전처리 단계에서 만든 `dateCandidates` 중 하나만 선택해야 합니다. + +## 엔드포인트 + +### `POST /ai/newsletters/extract-items` + +비용 없이 실행되는 rule-based baseline입니다. OpenAI API를 붙이기 전에도 스키마, 날짜 후보 매칭, 샘플 케이스를 확인할 수 있습니다. + +### `POST /ai/newsletters/prompt-preview` + +LLM에 전달할 system/user prompt와 응답 JSON schema를 생성합니다. ChatGPT Plus에서 수동 실험하거나, 추후 OpenAI API 호출에 그대로 사용할 수 있습니다. + +## 요청 예시 + +```json +{ + "originalText": "5월 10일까지 참가 신청서를 제출해주세요.", + "translatedText": null, + "language": "KO", + "referenceDate": "2026-05-06", + "timezone": "Asia/Seoul", + "dateCandidates": [ + { + "candidateId": "dc_1", + "originalText": "5월 10일", + "normalizedDate": "2026-05-10", + "startOffset": 0, + "endOffset": 6, + "extractionType": "REGEX" + } + ] +} +``` + +## 추출 규칙 + +- `confirmed`는 항목이 제공된 date candidate 중 하나를 사용할 때만 부여합니다. +- 날짜 표현이 있지만 후보 매칭이 불확실하면 `ambiguous`로 둡니다. +- 실행 가능한 체크리스트인데 날짜가 없으면 `missing`으로 둡니다. +- `evidenceText`는 원문 근거를 짧게 담습니다. +- 캘린더와 알림 생성은 `confirmed` 항목만 대상으로 삼습니다. + +## 작업 흐름 + +1. 백엔드 또는 전처리 단계에서 날짜 후보를 만듭니다. +2. AI 서버에 원문과 `dateCandidates`를 전달합니다. +3. `extract-items`로 비용 없는 baseline 결과를 먼저 확인합니다. +4. 부족한 케이스는 `prompt-preview` 결과를 ChatGPT Plus에 넣어 비교합니다. +5. 충분히 안정화된 뒤 OpenAI API 호출을 AI 서버 내부에 붙입니다. diff --git a/docs/newsletter-labeling-guide.md b/docs/newsletter-labeling-guide.md new file mode 100644 index 0000000..b347776 --- /dev/null +++ b/docs/newsletter-labeling-guide.md @@ -0,0 +1,39 @@ +# 가정통신문 라벨링 기준 + +본 문서는 사람이 직접 구축하는 정답 데이터와 AI 모델의 판단 기준을 일치시키기 위한 라벨링 가이드라인입니다. + +## 기본 원칙 + +- 원문 근거가 있는 항목만 라벨링한다. +- 날짜는 제공된 `dateCandidates` 중 하나만 선택한다. +- 날짜 후보에 없는 날짜를 사람이 임의로 만들지 않는다. +- 캘린더/알림 생성은 `dateStatus=confirmed` 항목만 대상으로 본다. + +## 항목 분류 + +- `deadline`: 제출, 신청, 납부, 등록, 동의, 회신, 마감처럼 특정 날짜까지 해야 하는 행동 +- `schedule`: 행사, 수업, 상담, 체험학습, 설명회, 운영일처럼 실제로 진행되는 일정 +- `checklist`: 준비물, 지참물, 확인할 문서, 보호자나 학생이 해야 할 행동 +- `reminder`: deadline이나 schedule은 아니지만 알림으로 보여줄 가치가 있는 정보 + +## 날짜 상태 + +- `confirmed`: 원문 근거와 `dateCandidates` 중 하나가 명확히 연결된 상태 +- `ambiguous`: 날짜 표현은 있지만 후보 매칭이 불확실한 상태 +- `missing`: 행동 항목은 있지만 날짜가 없는 상태 + +## 라벨링 예시 + +```json +{ + "type": "deadline", + "title": "체험학습 동의서 제출", + "evidenceText": "5월 20일까지 체험학습 동의서를 제출해주세요.", + "selectedDateCandidateId": "dc_1", + "dateStatus": "confirmed", + "date": "2026-05-20", + "target": "parent", + "actionRequired": true, + "schoolContext": "보호자가 동의서를 확인하고 제출해야 하는 안내" +} +``` diff --git a/requirements.txt b/requirements.txt index f037fb5..60d3c14 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ fastapi==0.116.1 -uvicorn[standard]==0.35.0 \ No newline at end of file +pydantic>=2.0,<3.0 +uvicorn[standard]==0.35.0