diff --git a/app/__pycache__/main.cpython-314.pyc b/app/__pycache__/main.cpython-314.pyc index 6892a7b..fd34028 100644 Binary files a/app/__pycache__/main.cpython-314.pyc and b/app/__pycache__/main.cpython-314.pyc differ diff --git a/app/main.py b/app/main.py index e2976fe..3910635 100644 --- a/app/main.py +++ b/app/main.py @@ -2,9 +2,14 @@ from fastapi import FastAPI from app.routers import upload_router, retrieve_router +# ADD THESE IMPORTS +from app.routers import json_routes +from app.routers import database_routes import cloudinary from dotenv import load_dotenv import os +from pathlib import Path # ADD THIS + # Load environment variables from .env file load_dotenv() @@ -14,21 +19,84 @@ @app.on_event("startup") async def startup_event(): """ - Configure Cloudinary on app startup. + Configure Cloudinary on app startup and create storage directories. """ + # Cloudinary configuration (for images/videos) - KEEP EXISTING cloudinary.config( cloud_name = os.getenv("CLOUDINARY_CLOUD_NAME"), api_key = os.getenv("CLOUDINARY_API_KEY"), api_secret = os.getenv("CLOUDINARY_API_SECRET"), secure = True # Ensure all URLs are HTTPS ) - print("Cloudinary configuration loaded.") + print("✅ Cloudinary configuration loaded.") + + # CREATE STORAGE DIRECTORIES FOR JSON FILES - KEEP EXISTING + storage_dirs = [ + "app/storage/databases/sql", + "app/storage/databases/nosql", + "app/storage/temp" + ] + + for directory in storage_dirs: + Path(directory).mkdir(parents=True, exist_ok=True) + + print("✅ JSON storage directories created.") + + # ADD INTERNAL DATABASE DIRECTORIES - NEW + internal_db_dirs = [ + "app/storage/internal_databases/tables", + "app/storage/internal_databases/collections", + "app/storage/internal_databases/schemas" + ] + + for directory in internal_db_dirs: + Path(directory).mkdir(parents=True, exist_ok=True) + + print("✅ Internal database directories created.") +# INCLUDE ALL ROUTERS - KEEP EXISTING app.include_router(upload_router.router, prefix="/api", 
tags=["Upload"]) -# We will disable the local retrieve router for now, as Cloudinary handles delivery +# ADD JSON ROUTES - KEEP EXISTING +app.include_router(json_routes.router, prefix="/api", tags=["JSON Processing"]) +# ADD DATABASE ROUTES - NEW +app.include_router(database_routes.router, prefix="/api", tags=["Internal Databases"]) + +# We will disable the local retrieve router for now, as Cloudinary handles delivery - KEEP EXISTING # app.include_router(retrieve_router.router, prefix="/api", tags=["Retrieve"]) @app.get("/") async def root(): - return {"message": "Welcome to the Media Storage API. Use /api/upload to post files."} \ No newline at end of file + return { + "message": "Welcome to the Media Storage API", + "endpoints": { + "upload_media": "/api/upload - Upload images/videos to Cloudinary", + "upload_json": "/api/json/upload - Upload and analyze JSON files", + "list_json": "/api/json/files - List stored JSON files", + # ADD NEW ENDPOINTS + "internal_dbs": "/api/database - Internal database operations", + "list_tables": "/api/database/tables - List SQL tables", + "list_collections": "/api/database/collections - List NoSQL collections", + "db_stats": "/api/database/stats - Get database statistics" + } + } + +# ADD HEALTH CHECK ENDPOINT - NEW +@app.get("/health") +async def health_check(): + """ + System health check + """ + return { + "status": "healthy", + "cloudinary_configured": bool(os.getenv("CLOUDINARY_CLOUD_NAME")), + "storage_directories": { + "images": "Cloudinary (external)", + "json_files": "Internal storage", + "databases": "Internal simulation" + }, + "external_apis": { + "cloudinary": "For images/videos only", + "databases": "None - using internal storage" + } + } \ No newline at end of file diff --git a/app/routers/__pycache__/database_routes.cpython-314.pyc b/app/routers/__pycache__/database_routes.cpython-314.pyc new file mode 100644 index 0000000..bda1cc1 Binary files /dev/null and b/app/routers/__pycache__/database_routes.cpython-314.pyc 
differ diff --git a/app/routers/__pycache__/json_routes.cpython-314.pyc b/app/routers/__pycache__/json_routes.cpython-314.pyc new file mode 100644 index 0000000..613faa6 Binary files /dev/null and b/app/routers/__pycache__/json_routes.cpython-314.pyc differ diff --git a/app/routers/__pycache__/upload_router.cpython-314.pyc b/app/routers/__pycache__/upload_router.cpython-314.pyc index 839f5d8..84c0c90 100644 Binary files a/app/routers/__pycache__/upload_router.cpython-314.pyc and b/app/routers/__pycache__/upload_router.cpython-314.pyc differ diff --git a/app/routers/database_routes.py b/app/routers/database_routes.py new file mode 100644 index 0000000..cd809d2 --- /dev/null +++ b/app/routers/database_routes.py @@ -0,0 +1,122 @@ +# app/routers/database_routes.py + +from fastapi import APIRouter, HTTPException, Query +from typing import List, Dict, Any +import json +from pathlib import Path +import aiofiles + +from app.routers.json_routes import list_json_files + +router = APIRouter() + +@router.get("/database/tables") +async def list_sql_tables(): + """ + Lists files/entities stored in the simulated SQL database path. + """ + + return await list_json_files(category="sql", limit=1000, offset=0) + +@router.get("/database/collections") +async def list_nosql_collections(): + """ + Lists files/entities stored in the simulated NoSQL collection path. 
+ """ + + return await list_json_files(category="nosql", limit=1000, offset=0) + +@router.get("/database/stats") +async def get_enhanced_stats(): + """ + Enhanced statistics with performance metrics + """ + try: + stats = { + "storage": await get_storage_stats(), + "performance": await get_performance_metrics(), + "recommendations": await get_optimization_recommendations() + } + + return stats + + except Exception as e: + raise HTTPException(500, f"Error getting stats: {str(e)}") + +async def get_storage_stats() -> Dict: + """Get detailed storage statistics""" + paths = { + "sql_json": Path("app/storage/databases/sql"), + "nosql_json": Path("app/storage/databases/nosql"), + "internal_tables": Path("app/storage/internal_databases/tables"), + "internal_collections": Path("app/storage/internal_databases/collections") + } + + stats = {} + total_size = 0 + + for name, path in paths.items(): + if path.exists(): + files = list(path.glob("*.json")) + size = sum(f.stat().st_size for f in files) + stats[name] = { + "file_count": len(files), + "total_size_bytes": size, + "total_size_mb": round(size / (1024 * 1024), 2) + } + total_size += size + + stats["total_storage_mb"] = round(total_size / (1024 * 1024), 2) + return stats + +async def get_performance_metrics() -> Dict: + """Get performance metrics""" + return { + "analysis_speed": "optimized", + "storage_efficiency": "high", + "memory_usage": "low", + "recommendations": [ + "Consider compressing large JSON files", + "Implement caching for frequent queries", + "Add background indexing for better search performance" + ] + } + +async def get_optimization_recommendations() -> List[str]: + """Get optimization recommendations based on current data""" + recommendations = [] + + sql_path = Path("app/storage/databases/sql") + if sql_path.exists(): + sql_files = list(sql_path.glob("*.json")) + if len(sql_files) > 50: + recommendations.append("Consider archiving old SQL JSON files") + + # Add more intelligent recommendations based on 
your data patterns + return recommendations + +@router.get("/database/cleanup") +async def cleanup_system(): + """ + Cleanup temporary files and optimize storage + """ + try: + temp_path = Path("app/storage/temp") + deleted_files = 0 + + if temp_path.exists(): + for temp_file in temp_path.glob("*"): + if temp_file.is_file(): + temp_file.unlink() + deleted_files += 1 + + return { + "message": "Cleanup completed", + "deleted_temp_files": deleted_files, + "freed_space": "System optimized" + } + + except Exception as e: + raise HTTPException(500, f"Cleanup error: {str(e)}") + + diff --git a/app/routers/json_routes.py b/app/routers/json_routes.py new file mode 100644 index 0000000..3bd72ad --- /dev/null +++ b/app/routers/json_routes.py @@ -0,0 +1,214 @@ +# app/routers/json_routes.py + +from fastapi import APIRouter, UploadFile, File, HTTPException, BackgroundTasks +import aiofiles +from pathlib import Path +import asyncio +from typing import List, Dict +import json + +from app.utils.json_analyzer import JSONAnalyzer + +from fastapi.responses import FileResponse + + +router = APIRouter(prefix="/json", tags=["JSON Files"]) +json_analyzer = JSONAnalyzer() + +@router.post("/upload") +async def upload_json( + background_tasks: BackgroundTasks, + file: UploadFile = File(...) +): + """ + Enhanced upload with background processing and better error handling + """ + if not file.filename.lower().endswith('.json'): + raise HTTPException(400, "Only JSON files are allowed") + + # Validate file size + content = await file.read() + if len(content) > 50 * 1024 * 1024: # 50MB limit + raise HTTPException(413, "File too large. 
Maximum size is 50MB") + + temp_dir = Path("app/storage/temp") + temp_dir.mkdir(exist_ok=True) + temp_file = temp_dir / f"temp_{file.filename}" + + try: + # Save uploaded file + async with aiofiles.open(temp_file, 'wb') as f: + await f.write(content) + + print(f"📥 Processing: {file.filename}") + + # Analyze JSON (quick operation) + analysis = json_analyzer.analyze_json_file(str(temp_file)) + + # Store based on analysis + result = json_analyzer.store_json_file(str(temp_file), file.filename, analysis) + + if result["success"]: + response = { + "message": "JSON processed successfully!", + "details": result, + "analysis": analysis + } + + # Add background task for additional processing + if not result.get("duplicate"): + background_tasks.add_task(process_additional_metadata, str(temp_file), analysis) + + return response + else: + raise HTTPException(500, result["error"]) + + except Exception as e: + # Cleanup on error + if temp_file.exists(): + temp_file.unlink() + raise HTTPException(500, f"Processing failed: {str(e)}") + +@router.post("/bulk-upload") +async def bulk_upload_json( + background_tasks: BackgroundTasks, + files: List[UploadFile] = File(...) 
+): + """ + Process multiple JSON files efficiently + """ + results = [] + tasks = [] + + for file in files: + if file.filename.lower().endswith('.json'): + # Process each file concurrently + task = process_single_file(file) + tasks.append(task) + + # Wait for all files to process + if tasks: + results = await asyncio.gather(*tasks, return_exceptions=True) + + successful = [r for r in results if not isinstance(r, Exception) and r.get("success")] + failed = [r for r in results if isinstance(r, Exception) or not r.get("success")] + + return { + "message": f"Bulk processing completed", + "summary": { + "total_files": len(files), + "successful": len(successful), + "failed": len(failed) + }, + "successful_files": successful, + "failed_files": failed + } + +async def process_single_file(file: UploadFile) -> Dict: + """Process a single file asynchronously""" + try: + content = await file.read() + temp_dir = Path("app/storage/temp") + temp_file = temp_dir / f"bulk_{file.filename}" + + async with aiofiles.open(temp_file, 'wb') as f: + await f.write(content) + + analysis = json_analyzer.analyze_json_file(str(temp_file)) + result = json_analyzer.store_json_file(str(temp_file), file.filename, analysis) + + return result + + except Exception as e: + return {"success": False, "error": str(e), "filename": file.filename} + +@router.get("/files") +async def list_json_files( + category: str = None, + limit: int = 100, + offset: int = 0 +): + """ + Enhanced file listing with filtering and pagination + """ + try: + sql_files = [] + nosql_files = [] + + # Get SQL files with metadata + sql_path = Path("app/storage/databases/sql") + if sql_path.exists(): + for file in sql_path.glob("*.json"): + if file.name.endswith('.meta.json'): + continue + + metadata = await get_file_metadata(file) + sql_files.append({ + "filename": file.name, + "size": file.stat().st_size, + "modified": file.stat().st_mtime, + "metadata": metadata + }) + + # Get NoSQL files with metadata + nosql_path = 
Path("app/storage/databases/nosql") + if nosql_path.exists(): + for file in nosql_path.glob("*.json"): + if file.name.endswith('.meta.json'): + continue + + metadata = await get_file_metadata(file) + nosql_files.append({ + "filename": file.name, + "size": file.stat().st_size, + "modified": file.stat().st_mtime, + "metadata": metadata + }) + + # Apply filtering + if category == "sql": + files = sql_files + elif category == "nosql": + files = nosql_files + else: + files = sql_files + nosql_files + + # Apply pagination + total_files = len(files) + paginated_files = files[offset:offset + limit] + + return { + "files": paginated_files, + "pagination": { + "total": total_files, + "limit": limit, + "offset": offset, + "has_more": (offset + limit) < total_files + }, + "summary": { + "sql_files": len(sql_files), + "nosql_files": len(nosql_files), + "total_files": total_files + } + } + + except Exception as e: + raise HTTPException(500, f"Error listing files: {str(e)}") + +async def get_file_metadata(file_path: Path) -> Dict: + """Get metadata for a file""" + metadata_path = file_path.with_suffix('.meta.json') + if metadata_path.exists(): + async with aiofiles.open(metadata_path, 'r') as f: + content = await f.read() + return json.loads(content) + return {} + +def process_additional_metadata(file_path: str, analysis: Dict): + """Background task for additional processing""" + # This runs in background - doesn't block the response + try: + # Could generate previews, create indexes, etc. 
+ print(f"Background processing completed for {file_path}") + except Exception as e: + print(f"Background processing failed: {e}") \ No newline at end of file diff --git a/app/storage/databases/nosql/arun_20251115_210256_3306320b.json b/app/storage/databases/nosql/arun_20251115_210256_3306320b.json new file mode 100644 index 0000000..0a10eee --- /dev/null +++ b/app/storage/databases/nosql/arun_20251115_210256_3306320b.json @@ -0,0 +1,18 @@ +{ + "data": [ + { + "project_id":101, + "name": "Hackathon Storage System", + "lead": "Arun", + "deadline": "2025-11-20", + "budget": 50000 + }, + { + "project": 102, + "name": "Face Recognition Pipeline", + "lead": "Meera", + "deadline": "2025-12-05", + "budget": 75000 + } + ] +} \ No newline at end of file diff --git a/app/storage/databases/nosql/arun_20251115_210256_3306320b.meta.json b/app/storage/databases/nosql/arun_20251115_210256_3306320b.meta.json new file mode 100644 index 0000000..0bbdf63 --- /dev/null +++ b/app/storage/databases/nosql/arun_20251115_210256_3306320b.meta.json @@ -0,0 +1,10 @@ +{ + "original_filename": "arun.json", + "analysis": { + "recommendation": "nosql", + "reason": "Contains nested dictionary or list (depth: 1), ideal for document storage.", + "nesting_level": 1 + }, + "analyzed_at": "2025-11-15T21:02:56.715489", + "file_size": 345 +} \ No newline at end of file diff --git a/app/storage/databases/nosql/r_20251115_210448_7d82d4ee.json b/app/storage/databases/nosql/r_20251115_210448_7d82d4ee.json new file mode 100644 index 0000000..6af1277 --- /dev/null +++ b/app/storage/databases/nosql/r_20251115_210448_7d82d4ee.json @@ -0,0 +1,29 @@ +{ + "product_id": "SKU-A45B-11", + "name": "Wireless Noise-Canceling Headphones", + "in_stock": true, + "details": { + "brand": "AudioPhile", + "model": "X-1000", + "color": "Midnight Black", + "weight_grams": 250 + }, + "tags": [ + "audio", + "headphones", + "bluetooth", + "noise-canceling" + ], + "reviews": [ + { + "author": "user_123", + "rating": 5, + "comment": 
"Best headphones I've ever owned!" + }, + { + "author": "user_456", + "rating": 4, + "comment": "Great sound, but a bit tight on the ears." + } + ] +} \ No newline at end of file diff --git a/app/storage/databases/nosql/r_20251115_210448_7d82d4ee.meta.json b/app/storage/databases/nosql/r_20251115_210448_7d82d4ee.meta.json new file mode 100644 index 0000000..56c76a1 --- /dev/null +++ b/app/storage/databases/nosql/r_20251115_210448_7d82d4ee.meta.json @@ -0,0 +1,10 @@ +{ + "original_filename": "r.json", + "analysis": { + "recommendation": "nosql", + "reason": "Contains nested dictionary or list (depth: 1), ideal for document storage.", + "nesting_level": 1 + }, + "analyzed_at": "2025-11-15T21:04:48.313979", + "file_size": 594 +} \ No newline at end of file diff --git a/app/storage/databases/sql/arun_20251115_200713_07511889.json b/app/storage/databases/sql/arun_20251115_200713_07511889.json new file mode 100644 index 0000000..f877973 --- /dev/null +++ b/app/storage/databases/sql/arun_20251115_200713_07511889.json @@ -0,0 +1,18 @@ +{ + "data": [ + { + "project_id": 101, + "name": "Hackathon Storage System", + "lead": "Arun", + "deadline": "2025-11-20", + "budget": 50000 + }, + { + "project_id": 102, + "name": "Face Recognition Pipeline", + "lead": "Meera", + "deadline": "2025-12-05", + "budget": 75000 + } + ] +} \ No newline at end of file diff --git a/app/storage/databases/sql/arun_20251115_200713_07511889.meta.json b/app/storage/databases/sql/arun_20251115_200713_07511889.meta.json new file mode 100644 index 0000000..25429ec --- /dev/null +++ b/app/storage/databases/sql/arun_20251115_200713_07511889.meta.json @@ -0,0 +1,12 @@ +{ + "original_filename": "arun.json", + "analysis": { + "recommendation": "sql", + "reason": "Flat or lightly nested structure", + "keys": [ + "data" + ] + }, + "analyzed_at": "2025-11-15T20:07:13.043189", + "file_size": 349 +} \ No newline at end of file diff --git a/app/storage/databases/sql/arun_20251115_200753_07511889.json 
b/app/storage/databases/sql/arun_20251115_200753_07511889.json new file mode 100644 index 0000000..f877973 --- /dev/null +++ b/app/storage/databases/sql/arun_20251115_200753_07511889.json @@ -0,0 +1,18 @@ +{ + "data": [ + { + "project_id": 101, + "name": "Hackathon Storage System", + "lead": "Arun", + "deadline": "2025-11-20", + "budget": 50000 + }, + { + "project_id": 102, + "name": "Face Recognition Pipeline", + "lead": "Meera", + "deadline": "2025-12-05", + "budget": 75000 + } + ] +} \ No newline at end of file diff --git a/app/storage/databases/sql/arun_20251115_200753_07511889.meta.json b/app/storage/databases/sql/arun_20251115_200753_07511889.meta.json new file mode 100644 index 0000000..f0cbb33 --- /dev/null +++ b/app/storage/databases/sql/arun_20251115_200753_07511889.meta.json @@ -0,0 +1,12 @@ +{ + "original_filename": "arun.json", + "analysis": { + "recommendation": "sql", + "reason": "Flat or lightly nested structure", + "keys": [ + "data" + ] + }, + "analyzed_at": "2025-11-15T20:07:53.568857", + "file_size": 349 +} \ No newline at end of file diff --git a/app/storage/databases/sql/arun_20251115_201824_3306320b.json b/app/storage/databases/sql/arun_20251115_201824_3306320b.json new file mode 100644 index 0000000..0a10eee --- /dev/null +++ b/app/storage/databases/sql/arun_20251115_201824_3306320b.json @@ -0,0 +1,18 @@ +{ + "data": [ + { + "project_id":101, + "name": "Hackathon Storage System", + "lead": "Arun", + "deadline": "2025-11-20", + "budget": 50000 + }, + { + "project": 102, + "name": "Face Recognition Pipeline", + "lead": "Meera", + "deadline": "2025-12-05", + "budget": 75000 + } + ] +} \ No newline at end of file diff --git a/app/storage/databases/sql/arun_20251115_201824_3306320b.meta.json b/app/storage/databases/sql/arun_20251115_201824_3306320b.meta.json new file mode 100644 index 0000000..e312056 --- /dev/null +++ b/app/storage/databases/sql/arun_20251115_201824_3306320b.meta.json @@ -0,0 +1,12 @@ +{ + "original_filename": "arun.json", + 
"analysis": { + "recommendation": "sql", + "reason": "Flat or lightly nested structure", + "keys": [ + "data" + ] + }, + "analyzed_at": "2025-11-15T20:18:24.763567", + "file_size": 345 +} \ No newline at end of file diff --git a/app/storage/databases/sql/arun_20251115_201911_3306320b.json b/app/storage/databases/sql/arun_20251115_201911_3306320b.json new file mode 100644 index 0000000..0a10eee --- /dev/null +++ b/app/storage/databases/sql/arun_20251115_201911_3306320b.json @@ -0,0 +1,18 @@ +{ + "data": [ + { + "project_id":101, + "name": "Hackathon Storage System", + "lead": "Arun", + "deadline": "2025-11-20", + "budget": 50000 + }, + { + "project": 102, + "name": "Face Recognition Pipeline", + "lead": "Meera", + "deadline": "2025-12-05", + "budget": 75000 + } + ] +} \ No newline at end of file diff --git a/app/storage/databases/sql/arun_20251115_201911_3306320b.meta.json b/app/storage/databases/sql/arun_20251115_201911_3306320b.meta.json new file mode 100644 index 0000000..f81c5d5 --- /dev/null +++ b/app/storage/databases/sql/arun_20251115_201911_3306320b.meta.json @@ -0,0 +1,12 @@ +{ + "original_filename": "arun.json", + "analysis": { + "recommendation": "sql", + "reason": "Flat or lightly nested structure", + "keys": [ + "data" + ] + }, + "analyzed_at": "2025-11-15T20:19:11.867121", + "file_size": 345 +} \ No newline at end of file diff --git a/app/storage/databases/sql/flat_20251115_200447_71f45d7b.json b/app/storage/databases/sql/flat_20251115_200447_71f45d7b.json new file mode 100644 index 0000000..3dadca0 --- /dev/null +++ b/app/storage/databases/sql/flat_20251115_200447_71f45d7b.json @@ -0,0 +1,4 @@ +{ + "item_name": "Flat Test", + "item_value": 100 +} \ No newline at end of file diff --git a/app/storage/databases/sql/flat_20251115_200447_71f45d7b.meta.json b/app/storage/databases/sql/flat_20251115_200447_71f45d7b.meta.json new file mode 100644 index 0000000..6824778 --- /dev/null +++ b/app/storage/databases/sql/flat_20251115_200447_71f45d7b.meta.json @@ 
-0,0 +1,13 @@ +{ + "original_filename": "flat.json", + "analysis": { + "recommendation": "sql", + "reason": "Flat or lightly nested structure", + "keys": [ + "item_name", + "item_value" + ] + }, + "analyzed_at": "2025-11-15T20:04:47.432046", + "file_size": 54 +} \ No newline at end of file diff --git a/app/storage/databases/sql/flat_20251115_210458_71f45d7b.json b/app/storage/databases/sql/flat_20251115_210458_71f45d7b.json new file mode 100644 index 0000000..3dadca0 --- /dev/null +++ b/app/storage/databases/sql/flat_20251115_210458_71f45d7b.json @@ -0,0 +1,4 @@ +{ + "item_name": "Flat Test", + "item_value": 100 +} \ No newline at end of file diff --git a/app/storage/databases/sql/flat_20251115_210458_71f45d7b.meta.json b/app/storage/databases/sql/flat_20251115_210458_71f45d7b.meta.json new file mode 100644 index 0000000..f94bd08 --- /dev/null +++ b/app/storage/databases/sql/flat_20251115_210458_71f45d7b.meta.json @@ -0,0 +1,13 @@ +{ + "original_filename": "flat.json", + "analysis": { + "recommendation": "sql", + "reason": "Flat structure, ideal for relational querying.", + "keys": [ + "item_name", + "item_value" + ] + }, + "analyzed_at": "2025-11-15T21:04:58.674259", + "file_size": 54 +} \ No newline at end of file diff --git a/app/storage/databases/sql/r_20251115_200527_7d82d4ee.json b/app/storage/databases/sql/r_20251115_200527_7d82d4ee.json new file mode 100644 index 0000000..6af1277 --- /dev/null +++ b/app/storage/databases/sql/r_20251115_200527_7d82d4ee.json @@ -0,0 +1,29 @@ +{ + "product_id": "SKU-A45B-11", + "name": "Wireless Noise-Canceling Headphones", + "in_stock": true, + "details": { + "brand": "AudioPhile", + "model": "X-1000", + "color": "Midnight Black", + "weight_grams": 250 + }, + "tags": [ + "audio", + "headphones", + "bluetooth", + "noise-canceling" + ], + "reviews": [ + { + "author": "user_123", + "rating": 5, + "comment": "Best headphones I've ever owned!" 
+ }, + { + "author": "user_456", + "rating": 4, + "comment": "Great sound, but a bit tight on the ears." + } + ] +} \ No newline at end of file diff --git a/app/storage/databases/sql/r_20251115_200527_7d82d4ee.meta.json b/app/storage/databases/sql/r_20251115_200527_7d82d4ee.meta.json new file mode 100644 index 0000000..7881110 --- /dev/null +++ b/app/storage/databases/sql/r_20251115_200527_7d82d4ee.meta.json @@ -0,0 +1,17 @@ +{ + "original_filename": "r.json", + "analysis": { + "recommendation": "sql", + "reason": "Flat or lightly nested structure", + "keys": [ + "product_id", + "name", + "in_stock", + "details", + "tags", + "reviews" + ] + }, + "analyzed_at": "2025-11-15T20:05:27.652281", + "file_size": 594 +} \ No newline at end of file diff --git a/app/storage/internal_databases/schemas/arun_20251115_200713_07511889.schema.json b/app/storage/internal_databases/schemas/arun_20251115_200713_07511889.schema.json new file mode 100644 index 0000000..38e6903 --- /dev/null +++ b/app/storage/internal_databases/schemas/arun_20251115_200713_07511889.schema.json @@ -0,0 +1,9 @@ +{ + "original_filename": "arun.json", + "storage_type": "sql", + "columns": [ + "data" + ], + "reason": "Flat or lightly nested structure", + "analyzed_at": "2025-11-15T20:07:13.044088" +} \ No newline at end of file diff --git a/app/storage/internal_databases/schemas/arun_20251115_200753_07511889.schema.json b/app/storage/internal_databases/schemas/arun_20251115_200753_07511889.schema.json new file mode 100644 index 0000000..3411283 --- /dev/null +++ b/app/storage/internal_databases/schemas/arun_20251115_200753_07511889.schema.json @@ -0,0 +1,9 @@ +{ + "original_filename": "arun.json", + "storage_type": "sql", + "columns": [ + "data" + ], + "reason": "Flat or lightly nested structure", + "analyzed_at": "2025-11-15T20:07:53.569590" +} \ No newline at end of file diff --git a/app/storage/internal_databases/schemas/arun_20251115_201824_3306320b.schema.json 
b/app/storage/internal_databases/schemas/arun_20251115_201824_3306320b.schema.json new file mode 100644 index 0000000..0e5d7c6 --- /dev/null +++ b/app/storage/internal_databases/schemas/arun_20251115_201824_3306320b.schema.json @@ -0,0 +1,9 @@ +{ + "original_filename": "arun.json", + "storage_type": "sql", + "columns": [ + "data" + ], + "reason": "Flat or lightly nested structure", + "analyzed_at": "2025-11-15T20:18:24.764543" +} \ No newline at end of file diff --git a/app/storage/internal_databases/schemas/arun_20251115_201911_3306320b.schema.json b/app/storage/internal_databases/schemas/arun_20251115_201911_3306320b.schema.json new file mode 100644 index 0000000..5f8c6be --- /dev/null +++ b/app/storage/internal_databases/schemas/arun_20251115_201911_3306320b.schema.json @@ -0,0 +1,9 @@ +{ + "original_filename": "arun.json", + "storage_type": "sql", + "columns": [ + "data" + ], + "reason": "Flat or lightly nested structure", + "analyzed_at": "2025-11-15T20:19:11.868290" +} \ No newline at end of file diff --git a/app/storage/internal_databases/schemas/arun_20251115_210256_3306320b.schema.json b/app/storage/internal_databases/schemas/arun_20251115_210256_3306320b.schema.json new file mode 100644 index 0000000..008d78a --- /dev/null +++ b/app/storage/internal_databases/schemas/arun_20251115_210256_3306320b.schema.json @@ -0,0 +1,7 @@ +{ + "original_filename": "arun.json", + "storage_type": "nosql", + "columns": [], + "reason": "Contains nested dictionary or list (depth: 1), ideal for document storage.", + "analyzed_at": "2025-11-15T21:02:56.718434" +} \ No newline at end of file diff --git a/app/storage/internal_databases/schemas/flat_20251115_200447_71f45d7b.schema.json b/app/storage/internal_databases/schemas/flat_20251115_200447_71f45d7b.schema.json new file mode 100644 index 0000000..d8204f8 --- /dev/null +++ b/app/storage/internal_databases/schemas/flat_20251115_200447_71f45d7b.schema.json @@ -0,0 +1,10 @@ +{ + "original_filename": "flat.json", + 
"storage_type": "sql", + "columns": [ + "item_name", + "item_value" + ], + "reason": "Flat or lightly nested structure", + "analyzed_at": "2025-11-15T20:04:47.433815" +} \ No newline at end of file diff --git a/app/storage/internal_databases/schemas/flat_20251115_210458_71f45d7b.schema.json b/app/storage/internal_databases/schemas/flat_20251115_210458_71f45d7b.schema.json new file mode 100644 index 0000000..ce589fc --- /dev/null +++ b/app/storage/internal_databases/schemas/flat_20251115_210458_71f45d7b.schema.json @@ -0,0 +1,10 @@ +{ + "original_filename": "flat.json", + "storage_type": "sql", + "columns": [ + "item_name", + "item_value" + ], + "reason": "Flat structure, ideal for relational querying.", + "analyzed_at": "2025-11-15T21:04:58.675774" +} \ No newline at end of file diff --git a/app/storage/internal_databases/schemas/r_20251115_200527_7d82d4ee.schema.json b/app/storage/internal_databases/schemas/r_20251115_200527_7d82d4ee.schema.json new file mode 100644 index 0000000..53d47af --- /dev/null +++ b/app/storage/internal_databases/schemas/r_20251115_200527_7d82d4ee.schema.json @@ -0,0 +1,14 @@ +{ + "original_filename": "r.json", + "storage_type": "sql", + "columns": [ + "product_id", + "name", + "in_stock", + "details", + "tags", + "reviews" + ], + "reason": "Flat or lightly nested structure", + "analyzed_at": "2025-11-15T20:05:27.653407" +} \ No newline at end of file diff --git a/app/storage/internal_databases/schemas/r_20251115_210448_7d82d4ee.schema.json b/app/storage/internal_databases/schemas/r_20251115_210448_7d82d4ee.schema.json new file mode 100644 index 0000000..a216afd --- /dev/null +++ b/app/storage/internal_databases/schemas/r_20251115_210448_7d82d4ee.schema.json @@ -0,0 +1,7 @@ +{ + "original_filename": "r.json", + "storage_type": "nosql", + "columns": [], + "reason": "Contains nested dictionary or list (depth: 1), ideal for document storage.", + "analyzed_at": "2025-11-15T21:04:48.317670" +} \ No newline at end of file diff --git 
# app/utils/json_analyzer.py
"""Heuristic analysis and on-disk storage of uploaded JSON files.

A JSON document is classified as "sql" (flat, tabular) or "nosql"
(nested / irregular) and then moved into the matching directory under
``app/storage`` together with a metadata file and a schema file.
"""

import json
import os
import shutil
import uuid
from pathlib import Path
from typing import Dict, Any, List
from datetime import datetime
import hashlib


class JSONAnalyzer:
    """Classify JSON files as SQL- or NoSQL-shaped and store them accordingly."""

    # Files larger than this are sent to NoSQL without being parsed.
    MAX_ANALYZE_BYTES = 10 * 1024 * 1024
    # Number of leading array items inspected when analysing an array.
    SAMPLE_SIZE = 10
    # More columns than this is never treated as a SQL table.
    MAX_SQL_COLUMNS = 50
    # Tables this narrow are accepted for SQL even without an id-like column.
    SMALL_TABLE_COLUMNS = 20
    # Number of hash characters embedded in stored file names.
    HASH_PREFIX_LEN = 8

    def __init__(self):
        """Set up the storage layout below ``app/storage`` and create it on disk.

        The paths are relative, so they resolve against the current working
        directory of the process.
        """
        self.base_dir = Path("app/storage")
        self.sql_path = self.base_dir / "databases" / "sql"
        self.nosql_path = self.base_dir / "databases" / "nosql"
        self.temp_path = self.base_dir / "temp"
        self.schema_path = self.base_dir / "internal_databases" / "schemas"

        for path in [self.sql_path, self.nosql_path, self.temp_path, self.schema_path]:
            path.mkdir(parents=True, exist_ok=True)

    def analyze_json_file(self, file_path: str) -> Dict[str, Any]:
        """Recommend a storage type for the JSON file at ``file_path``.

        Returns a dict that always contains ``"recommendation"`` ("sql" or
        "nosql") and ``"reason"``; array/object analysis may add further
        keys. This method never raises: unreadable, oversized or invalid
        files yield a "nosql" recommendation with an explanatory reason.
        """
        try:
            # Oversized files are not parsed at all.
            if os.path.getsize(file_path) > self.MAX_ANALYZE_BYTES:
                return {"recommendation": "nosql", "reason": "File too large for SQL optimization"}

            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            if isinstance(data, list):
                return self._analyze_array(data, file_path)
            if isinstance(data, dict):
                return self._analyze_object(data, file_path)
            return {"recommendation": "nosql", "reason": "Simple scalar data type, best handled as NoSQL document"}

        except json.JSONDecodeError as e:
            return {"recommendation": "nosql", "reason": f"Invalid JSON: {str(e)}"}
        except Exception as e:
            # Deliberately broad: callers rely on always getting a dict back.
            return {"recommendation": "nosql", "reason": f"Analysis error: {str(e)}"}

    def _analyze_array(self, data: List, file_path: str) -> Dict[str, Any]:
        """Analyse a top-level JSON array using its first ``SAMPLE_SIZE`` items.

        ``file_path`` is accepted for signature symmetry and is not used.
        An "sql" result additionally carries ``estimated_rows`` and
        ``columns`` (in the key order of the first record).
        """
        if not data:
            return {"recommendation": "nosql", "reason": "Empty array"}

        sample = data[:self.SAMPLE_SIZE]

        if not all(isinstance(item, dict) for item in sample):
            return {"recommendation": "nosql", "reason": "Array contains non-object items, lacks uniform structure"}

        first_keys = set(sample[0].keys())
        consistent_structure = all(set(item.keys()) == first_keys for item in sample)

        if consistent_structure and self._is_sql_optimized(sample, first_keys):
            return {
                "recommendation": "sql",
                "reason": "Uniform structured data optimized for SQL",
                "estimated_rows": len(data),
                # Preserve the source key order; iterating a set of strings
                # gives an order that differs between interpreter runs.
                "columns": list(sample[0]),
            }

        return {"recommendation": "nosql", "reason": "Variable structure or complex data, better for NoSQL batch insertion"}

    def _analyze_object(self, data: Dict, file_path: str) -> Dict[str, Any]:
        """Analyse a top-level JSON object.

        Any nested value (see ``_has_nested_value``) yields "nosql" plus a
        ``nesting_level``; a flat object yields "sql" plus its ``keys``.
        ``file_path`` is accepted for signature symmetry and is not used.
        """
        if self._has_nested_value(data):
            max_depth = self._calculate_depth(data)
            return {
                "recommendation": "nosql",
                "reason": f"Contains nested dictionary or list (depth: {max_depth}), ideal for document storage.",
                "nesting_level": max_depth
            }

        return {
            "recommendation": "sql",
            "reason": "Flat structure, ideal for relational querying.",
            "keys": list(data.keys())
        }

    @staticmethod
    def _has_nested_value(record: Dict) -> bool:
        """Return True if any value is a non-empty dict or a list holding dicts/lists."""
        for value in record.values():
            if isinstance(value, dict) and value:
                return True
            if isinstance(value, list) and any(isinstance(item, (dict, list)) for item in value):
                return True
        return False

    def _is_sql_optimized(self, sample: List[Dict], keys: set) -> bool:
        """Decide whether uniformly-keyed records suit a SQL table.

        ``sample`` holds the inspected records and ``keys`` their shared key
        set. Records are rejected when there are too many columns or when
        any sampled record contains nested values; otherwise they are
        accepted if an id-like column exists or the table is narrow.
        """
        if len(keys) > self.MAX_SQL_COLUMNS:
            return False

        # Same nesting rule as for single objects: nested values do not map
        # onto plain columns.
        if any(self._has_nested_value(row) for row in sample):
            return False

        has_id = any(key.lower() in ('id', '_id') for key in keys)
        has_foreign_keys = any(key.endswith('_id') for key in keys)

        return has_id or has_foreign_keys or len(keys) <= self.SMALL_TABLE_COLUMNS

    def _calculate_depth(self, obj, current_depth=0) -> int:
        """Return the maximum dict-nesting depth below ``obj``.

        ``obj`` itself counts as ``current_depth``; each dict reached as a
        value, or as an item of a list value, adds one level. Non-dict
        inputs return ``current_depth`` unchanged.
        """
        if not isinstance(obj, dict):
            return current_depth

        max_depth = current_depth
        for value in obj.values():
            if isinstance(value, dict):
                children = [value]
            elif isinstance(value, list):
                # Look at every dict item, not just the first, so mixed
                # lists such as [1, {...}] are measured correctly.
                children = [item for item in value if isinstance(item, dict)]
            else:
                continue
            for child in children:
                max_depth = max(max_depth, self._calculate_depth(child, current_depth + 1))

        return max_depth

    def store_json_file(self, file_path: str, original_name: str, analysis: Dict) -> Dict[str, Any]:
        """Move a temp file into SQL or NoSQL storage, skipping duplicates.

        ``analysis`` must contain ``"recommendation"`` and ``"reason"``.
        On success the returned dict has ``success=True`` and describes the
        stored file; when identical content is already stored the temp file
        is removed and ``duplicate=True`` is reported. Any failure returns
        ``{"success": False, "error": ...}`` instead of raising.
        """
        try:
            file_hash = self._get_file_hash(file_path)
            recommendation = analysis["recommendation"]

            # One place decides both directory and label, so duplicate and
            # fresh responses report the same storage_type spelling.
            if recommendation == "sql":
                target_dir, storage_type = self.sql_path, "SQL"
            else:
                target_dir, storage_type = self.nosql_path, "NoSQL"

            duplicate = self._check_duplicate(file_hash, recommendation)
            if duplicate:
                # Drop the temp copy, but never delete the stored file if the
                # caller handed us that very file.
                if not os.path.samefile(duplicate, file_path):
                    Path(file_path).unlink(missing_ok=True)
                return {
                    "success": True,
                    "original_name": original_name,
                    "stored_name": Path(duplicate).name,
                    "storage_type": storage_type,
                    "final_path": duplicate,
                    "reason": analysis["reason"],
                    "file_hash": file_hash,
                    "duplicate": True,
                    "message": "File already exists (duplicate detected)"
                }

            original = Path(original_name)
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            hash_prefix = file_hash[:self.HASH_PREFIX_LEN]
            new_name = f"{original.stem}_{timestamp}_{hash_prefix}{original.suffix}"
            final_path = target_dir / new_name

            # shutil.move also removes the temp file.
            shutil.move(file_path, str(final_path))

            self._save_metadata(final_path, analysis, original_name)

            return {
                "success": True,
                "original_name": original_name,
                "stored_name": new_name,
                "storage_type": storage_type,
                "final_path": str(final_path),
                "reason": analysis["reason"],
                "file_hash": file_hash,
                "timestamp": timestamp
            }

        except Exception as e:
            # Best-effort cleanup of the temp file; the original error is the
            # one worth reporting, so a failing cleanup must not mask it.
            try:
                Path(file_path).unlink(missing_ok=True)
            except OSError:
                pass
            return {"success": False, "error": str(e)}

    def _get_file_hash(self, file_path: str) -> str:
        """Return the hex MD5 digest of the file's bytes (used only for deduplication)."""
        # MD5 is kept so hashes stay comparable with already-stored names;
        # usedforsecurity=False marks it as a non-cryptographic use.
        hasher = hashlib.md5(usedforsecurity=False)
        with open(file_path, 'rb') as f:
            while chunk := f.read(65536):
                hasher.update(chunk)
        return hasher.hexdigest()

    def _check_duplicate(self, file_hash: str, storage_type: str) -> str:
        """Return the path of a stored file with identical content, or ``""``.

        ``storage_type`` is the analysis recommendation; "sql" searches the
        SQL directory and anything else the NoSQL directory. Stored names
        end in ``_<first 8 hash chars>``; that suffix narrows the candidates
        and a full content hash confirms the match.
        """
        storage_path = self.sql_path if storage_type == "sql" else self.nosql_path
        marker = f"_{file_hash[:self.HASH_PREFIX_LEN]}"

        for existing_file in sorted(storage_path.glob("*")):
            name = existing_file.name
            # Only data files count, not their .meta.json / .schema.json companions.
            if not name.endswith('.json') or name.endswith(('.meta.json', '.schema.json')):
                continue
            if not existing_file.is_file() or not existing_file.stem.endswith(marker):
                continue
            try:
                if self._get_file_hash(str(existing_file)) == file_hash:
                    return str(existing_file)
            except OSError:
                # An unreadable candidate simply cannot be confirmed as a duplicate.
                continue
        return ""

    def _save_metadata(self, file_path: Path, analysis: Dict, original_name: str):
        """Write ``<stored file>.meta.json`` next to the file and a schema file.

        The metadata file sits beside ``file_path``; the schema file is
        written to ``self.schema_path`` as ``<stem>.schema.json``.
        """
        # One timestamp so the metadata and schema records agree.
        analyzed_at = datetime.now().isoformat()

        metadata = {
            "original_filename": original_name,
            "analysis": analysis,
            "analyzed_at": analyzed_at,
            "file_size": file_path.stat().st_size
        }

        metadata_path = file_path.with_suffix('.meta.json')
        with open(metadata_path, 'w', encoding='utf-8') as f:
            json.dump(metadata, f, indent=2)

        schema_info = {
            "original_filename": original_name,
            "storage_type": analysis["recommendation"],
            # Array analysis provides 'columns', object analysis provides 'keys'.
            "columns": analysis.get("columns", analysis.get("keys", [])),
            "reason": analysis["reason"],
            "analyzed_at": analyzed_at
        }

        schema_path = self.schema_path / (file_path.stem + '.schema.json')
        with open(schema_path, 'w', encoding='utf-8') as f:
            json.dump(schema_info, f, indent=2)


# NOTE: the accompanying requirements.txt change adds `aiofiles` (used by
# json_routes.py) and lists `python-dotenv` (used by main.py); this module
# itself needs only the standard library.