diff --git a/bugtrace/agents/gospider_agent.py b/bugtrace/agents/gospider_agent.py index 9f7812c..97f312f 100644 --- a/bugtrace/agents/gospider_agent.py +++ b/bugtrace/agents/gospider_agent.py @@ -1,4 +1,4 @@ -from typing import List, Dict, Any, Set +from typing import List, Dict, Any, Set, Optional from loguru import logger from bugtrace.tools.external import external_tools from bugtrace.core.ui import dashboard @@ -29,12 +29,13 @@ class GoSpiderAgent(BaseAgent): IMPROVED 2026-01-30: Extract ALL testable parameters, not just URLs. """ - def __init__(self, target: str, report_dir: Path, max_depth: int = 2, max_urls: int = 10, event_bus: Any = None): + def __init__(self, target: str, report_dir: Path, max_depth: int = 2, max_urls: int = 10, event_bus: Any = None, scan_ctx_id: Optional[str] = None): super().__init__("GoSpiderAgent", "URL Discovery", event_bus=event_bus, agent_id="gospider_agent") self.target = target self.report_dir = report_dir self.max_depth = max_depth self.max_urls = max_urls + self.scan_ctx_id = scan_ctx_id self.target_domain = urlparse(target).hostname.lower() if urlparse(target).hostname else "" # Load extension filters from config @@ -73,11 +74,27 @@ def _should_analyze_url(self, url: str) -> bool: async def _discover_urls(self) -> List[str]: """Run GoSpider and fallback discovery if needed.""" + cookies: List[Dict[str, str]] = [] + auth_extra_headers: Dict[str, str] = {} + if self.scan_ctx_id: + from bugtrace.services.scan_context import get_scan_auth_headers + auth_headers = get_scan_auth_headers(self.scan_ctx_id) + if "Cookie" in auth_headers: + cookie_str = auth_headers["Cookie"] + cookies = [{"name": p.split("=")[0], "value": "=".join(p.split("=")[1:])} + for p in cookie_str.split("; ") if "=" in p] + dashboard.log(f"[{self.name}] Using {len(cookies)} auth cookies for crawling", "INFO") + if "Authorization" in auth_headers: + auth_extra_headers["Authorization"] = auth_headers["Authorization"] + dashboard.log(f"[{self.name}] Using 
Authorization header for crawling", "INFO") + # Pass max_urls to support early exit (optimization) gospider_urls = await external_tools.run_gospider( self.target, + cookies=cookies, depth=self.max_depth, - max_urls=self.max_urls + max_urls=self.max_urls, + extra_headers=auth_extra_headers if auth_extra_headers else None, ) # If GoSpider only returns 1 URL (the target itself), trigger fallback diff --git a/bugtrace/api/routes/reports.py b/bugtrace/api/routes/reports.py index cc9565a..af36134 100644 --- a/bugtrace/api/routes/reports.py +++ b/bugtrace/api/routes/reports.py @@ -150,46 +150,82 @@ def _find_report_dir(scan_id: int) -> FilePath | None: with db.get_session() as session: from bugtrace.schemas.db_models import ScanTable, TargetTable scan = session.get(ScanTable, scan_id) - if not scan: - return None - target = session.get(TargetTable, scan.target_id) - if not target: - return None - - # Pattern 0: Direct DB match (new v5.1 architecture) - if hasattr(scan, 'report_dir') and scan.report_dir: - db_dir = FilePath(scan.report_dir) - if db_dir.is_dir() and _has_report_files(db_dir): - return db_dir - - # Pattern 1: Pipeline-generated reports ({domain}_{timestamp}) - from urllib.parse import urlparse - domain = urlparse(target.url).hostname or "" - scan_ts = scan.timestamp.strftime("%Y%m%d_%H%M%S") - - # Priority 1a: Exact timestamp match - exact_match = report_base / f"{domain}_{scan_ts}" - if exact_match.is_dir() and _has_report_files(exact_match): - return exact_match - - # Priority 1b: Fuzzy match (latest for domain) - matches = sorted( - report_base.glob(f"{domain}_*"), - key=lambda p: p.stat().st_mtime, - reverse=True, - ) - for match in matches: - if _has_report_files(match): - return match - - # Pattern 2: API-generated reports (fallback) - api_dir = report_base / f"scan_{scan_id}" - if api_dir.is_dir() and _has_report_files(api_dir): - return api_dir + if scan: + target = session.get(TargetTable, scan.target_id) + + # Pattern 0: Direct DB match (new v5.1 
architecture) + if hasattr(scan, 'report_dir') and scan.report_dir: + db_dir = FilePath(scan.report_dir) + if db_dir.is_dir() and _has_report_files(db_dir): + return db_dir + + # Pattern 1: Pipeline-generated reports ({domain}_{timestamp}) + from urllib.parse import urlparse + if target: + domain = urlparse(target.url).hostname or "" + scan_ts = scan.timestamp.strftime("%Y%m%d_%H%M%S") + + # Priority 1a: Exact timestamp match + exact_match = report_base / f"{domain}_{scan_ts}" + if exact_match.is_dir() and _has_report_files(exact_match): + return exact_match + + # Priority 1b: Fuzzy match (latest for domain) + matches = sorted( + report_base.glob(f"{domain}_*"), + key=lambda p: p.stat().st_mtime, + reverse=True, + ) + for match in matches: + if _has_report_files(match): + return match + + # Pattern 2: API-generated reports (fallback) + api_dir = report_base / f"scan_{scan_id}" + if api_dir.is_dir() and _has_report_files(api_dir): + return api_dir + + # DB has no scan record - fall through to filesystem scan below except Exception as e: logger.warning(f"Error resolving report dir for scan {scan_id}: {e}") + # Filesystem fallback: scan ALL report dirs for scan_id in metadata + import json as _json + for report_dir in sorted( + report_base.glob("*_*"), + key=lambda p: p.stat().st_mtime, + reverse=True, + ): + if not report_dir.is_dir(): + continue + if not _has_report_files(report_dir): + continue + vf = report_dir / "validated_findings.json" + if vf.is_file(): + try: + data = _json.loads(vf.read_text()) + if isinstance(data, dict): + if data.get("scan_id") == scan_id: + return report_dir + meta = data.get("meta", {}) + if isinstance(meta, dict) and meta.get("scan_id") == scan_id: + return report_dir + except Exception: + pass + rf = report_dir / "raw_findings.json" + if rf.is_file(): + try: + data = _json.loads(rf.read_text()) + if isinstance(data, dict): + if data.get("scan_id") == scan_id: + return report_dir + meta = data.get("meta", {}) + if isinstance(meta, 
dict) and meta.get("scan_id") == scan_id: + return report_dir + except Exception: + pass + # Last resort: check scan_{id} without DB access api_dir = report_base / f"scan_{scan_id}" if api_dir.is_dir() and _has_report_files(api_dir): diff --git a/bugtrace/api/routes/scans.py b/bugtrace/api/routes/scans.py index dc9b9d8..a0ce8e0 100644 --- a/bugtrace/api/routes/scans.py +++ b/bugtrace/api/routes/scans.py @@ -100,6 +100,7 @@ def _build_scan_options(request: CreateScanRequest) -> ScanOptions: scan_depth=request.scan_depth, auth_token=request.auth_token, auth=request.auth, + auth_format=request.auth_format, url_list=request.url_list, ) diff --git a/bugtrace/api/schemas.py b/bugtrace/api/schemas.py index e4e1d67..1fb13ce 100644 --- a/bugtrace/api/schemas.py +++ b/bugtrace/api/schemas.py @@ -34,6 +34,7 @@ class CreateScanRequest(BaseModel): param: Optional[str] = Field(default=None, description="Specific parameter to target") auth_token: Optional[str] = Field(default=None, description="Pre-authenticated Bearer token (Level 1)") auth: Optional[Dict[str, Any]] = Field(default=None, description="Auto-login credentials: {login_url, credentials: {email, password}} (Level 2)") + auth_format: Optional[str] = Field(default=None, description="Login format for Auth Level 2: json or form") url_list: Optional[List[str]] = Field(default=None, description="Pre-defined URL list (from URL list file or Swagger import)") diff --git a/bugtrace/core/team.py b/bugtrace/core/team.py index 0039ac3..b06f77b 100644 --- a/bugtrace/core/team.py +++ b/bugtrace/core/team.py @@ -1502,7 +1502,7 @@ async def _run_gospider(self, recon_dir) -> list: """Run GoSpider agent for URL discovery.""" logger.info(f"Triggering GoSpiderAgent for {self.target}") self._v.emit("recon.gospider.started", {"target": self.target}) - gospider = GoSpiderAgent(self.target, recon_dir, max_depth=self.max_depth, max_urls=self.max_urls) + gospider = GoSpiderAgent(self.target, recon_dir, max_depth=self.max_depth, 
max_urls=self.max_urls, scan_ctx_id=self.scan_context) urls_to_scan = await gospider.run() self._v.emit("recon.gospider.completed", {"urls_found": len(urls_to_scan)}) logger.info(f"GoSpiderAgent finished. Found {len(urls_to_scan)} URLs") diff --git a/bugtrace/services/scan_context.py b/bugtrace/services/scan_context.py index a78b70d..7c7c7a1 100644 --- a/bugtrace/services/scan_context.py +++ b/bugtrace/services/scan_context.py @@ -89,6 +89,7 @@ class ScanOptions(BaseModel): scan_depth: str = "" # empty = use settings.SCAN_DEPTH default auth_token: Optional[str] = None # Level 1: pre-authenticated Bearer token auth: Optional[Dict[str, Any]] = None # Level 2: {login_url, credentials: {email, password}} + auth_format: Optional[str] = None # "json" or "form". If None, defaults to "json" in scan_service url_list: Optional[List[str]] = None # Pre-defined URL list (from file upload or Swagger import) diff --git a/bugtrace/services/scan_service.py b/bugtrace/services/scan_service.py index 9fc6ab0..8eeab8f 100644 --- a/bugtrace/services/scan_service.py +++ b/bugtrace/services/scan_service.py @@ -273,14 +273,30 @@ async def _setup_auth_tokens(self, options: ScanOptions, scan_ctx_id: str): if login_url.startswith("/"): login_url = options.target_url.rstrip("/") + login_url - logger.info(f"Auth Level 2: Attempting login at {login_url}") + auth_format = (options.auth_format or "json").lower() + logger.info(f"Auth Level 2: Attempting login at {login_url} (format={auth_format})") try: import httpx + import re async with httpx.AsyncClient(verify=False, timeout=15) as client: - resp = await client.post(login_url, json=credentials) + post_data = dict(credentials) + + if auth_format == "form": + login_page = await client.get(login_url) + for match in re.finditer(r'<input[^>]+>', login_page.text, re.IGNORECASE): + name_match = re.search(r'name=["\']([^"\']+)["\']', match.group(0), re.IGNORECASE) + value_match = re.search(r'value=["\']([^"\']*)["\']', match.group(0), re.IGNORECASE) + if
name_match and value_match: + field_name = name_match.group(1) + if any(marker in field_name.lower() for marker in ("csrf", "token", "_token")): + post_data[field_name] = value_match.group(1) + + resp = await client.post(login_url, data=post_data) + else: + resp = await client.post(login_url, json=credentials) - if resp.status_code not in (200, 201): + if resp.status_code not in (200, 201, 302, 303): logger.warning( f"Auth Level 2: Login failed (HTTP {resp.status_code})" ) @@ -291,7 +307,17 @@ async def _setup_auth_tokens(self, options: ScanOptions, scan_ctx_id: str): if token: store_auth_token(scan_ctx_id, "auto_login", token) logger.info("Auth Level 2: JWT extracted and stored from login response") - else: + + # Capture session cookies regardless of JWT presence + # Use client.cookies (jar) not resp.cookies — resp only has cookies + # from the last response; the jar accumulates across GET + POST + redirects. + all_cookies = client.cookies + if all_cookies: + cookie_str = "; ".join([f"{name}={value}" for name, value in all_cookies.items()]) + store_auth_token(scan_ctx_id, "session_cookies", token=None, token_type="Cookie", cookies=cookie_str) # pyright: ignore[reportArgumentType] + logger.info(f"Auth Level 2: Captured {len(all_cookies)} session cookies from login ({cookie_str[:80]}...)") + + if not token: logger.warning("Auth Level 2: Login succeeded but no JWT found in response") except Exception as e: @@ -424,34 +450,73 @@ async def get_scan_status(self, scan_id: int) -> Dict[str, Any]: statement = select(ScanTable).where(ScanTable.id == scan_id) scan = session.exec(statement).first() - if not scan: - raise ValueError(f"Scan {scan_id} not found") + if scan: + # Get target info + target = session.get(TargetTable, scan.target_id) - # Get target info - target = session.get(TargetTable, scan.target_id) + # Count findings + from bugtrace.schemas.db_models import FindingTable + findings_statement = select(FindingTable).where(FindingTable.scan_id == scan_id) + findings 
= session.exec(findings_statement).all() + + return { + "scan_id": scan_id, + "target": target.url if target else "unknown", + "status": scan.status.value, + "progress": scan.progress_percent, + "uptime_seconds": None, # No longer running + "findings_count": len(findings), + "active_agent": None, + "phase": None, + "origin": getattr(scan, "origin", "cli"), + "enrichment_status": getattr(scan, "enrichment_status", None), + "scan_type": scan.scan_type, + "max_depth": scan.max_depth, + "max_urls": scan.max_urls, + "provider": getattr(scan, "provider", None), + } + + # Fallback: scan not in DB - try to build status from filesystem + import json + report_dir = self._find_report_dir_for_scan(scan_id) + if report_dir: + # Try to extract target from directory name (format: {domain}_{timestamp}) + dir_name = report_dir.name + parts = dir_name.rsplit("_", 1) + target_url = parts[0] if len(parts) >= 2 else "unknown" + + # Count findings from files + findings_count = 0 + vf = report_dir / "validated_findings.json" + if vf.is_file(): + try: + data = json.loads(vf.read_text()) + findings_count = len(data.get("findings", [])) if isinstance(data, dict) else len(data) if isinstance(data, list) else 0 + except Exception: + pass - # Count findings - from bugtrace.schemas.db_models import FindingTable - findings_statement = select(FindingTable).where(FindingTable.scan_id == scan_id) - findings = session.exec(findings_statement).all() + # Check if final report exists (means completed) + has_final = (report_dir / "final_report.md").is_file() return { "scan_id": scan_id, - "target": target.url if target else "unknown", - "status": scan.status.value, - "progress": scan.progress_percent, - "uptime_seconds": None, # No longer running - "findings_count": len(findings), + "target": target_url, + "status": "COMPLETED" if has_final else "STOPPED", + "progress": 100 if has_final else 50, + "uptime_seconds": None, + "findings_count": findings_count, "active_agent": None, "phase": None, - 
"origin": getattr(scan, "origin", "cli"), - "enrichment_status": getattr(scan, "enrichment_status", None), - "scan_type": scan.scan_type, - "max_depth": scan.max_depth, - "max_urls": scan.max_urls, - "provider": getattr(scan, "provider", None), + "origin": "cli", + "enrichment_status": None, + "scan_type": "full", + "max_depth": 2, + "max_urls": 20, + "provider": None, } + raise ValueError(f"Scan {scan_id} not found") + async def stop_scan(self, scan_id: int) -> Dict[str, Any]: """Stop a running or paused scan gracefully.""" async with self._lock: @@ -929,13 +994,17 @@ async def get_findings( Returns: Dictionary with findings, total, page, per_page """ - # Verify scan exists before loading findings - with self.db.get_session() as session: - from sqlmodel import select - from bugtrace.schemas.db_models import ScanTable - scan = session.exec(select(ScanTable).where(ScanTable.id == scan_id)).first() - if not scan: - raise ValueError(f"Scan {scan_id} not found") + # Verify scan exists in DB, but don't fail - filesystem may have results + scan_found_in_db = False + try: + with self.db.get_session() as session: + from sqlmodel import select + from bugtrace.schemas.db_models import ScanTable + scan = session.exec(select(ScanTable).where(ScanTable.id == scan_id)).first() + if scan: + scan_found_in_db = True + except Exception: + pass # Load all findings from files (source of truth) all_findings = self._load_findings_from_files(scan_id) @@ -981,37 +1050,85 @@ def _find_report_dir_for_scan(self, scan_id: int) -> Optional[Path]: with self.db.get_session() as session: from bugtrace.schemas.db_models import ScanTable, TargetTable scan = session.get(ScanTable, scan_id) - if not scan: - return None - target = session.get(TargetTable, scan.target_id) - if not target: - return None - - # Pattern 0: Direct DB match (v5.1 architecture) - if hasattr(scan, 'report_dir') and scan.report_dir: - db_dir = Path(scan.report_dir) - if db_dir.is_dir() and self._dir_has_report_files(db_dir): - 
return db_dir - - # Pattern 1: Pipeline-generated reports ({domain}_{timestamp}) - domain = urlparse(target.url).hostname or "" - matches = sorted( - report_base.glob(f"{domain}_*"), - key=lambda p: p.stat().st_mtime, - reverse=True, - ) - for match in matches: - if self._dir_has_report_files(match): - return match + if scan: + target = session.get(TargetTable, scan.target_id) + + # Pattern 0: Direct DB match (v5.1 architecture) + if hasattr(scan, 'report_dir') and scan.report_dir: + db_dir = Path(scan.report_dir) + if db_dir.is_dir() and self._dir_has_report_files(db_dir): + return db_dir + + # Pattern 1: Pipeline-generated reports ({domain}_{timestamp}) + if target: + domain = urlparse(target.url).hostname or "" + matches = sorted( + report_base.glob(f"{domain}_*"), + key=lambda p: p.stat().st_mtime, + reverse=True, + ) + for match in matches: + if self._dir_has_report_files(match): + return match - # Pattern 2: API-generated reports (fallback) - api_dir = report_base / f"scan_{scan_id}" - if api_dir.is_dir() and self._dir_has_report_files(api_dir): - return api_dir + # Pattern 2: API-generated reports (fallback) + api_dir = report_base / f"scan_{scan_id}" + if api_dir.is_dir() and self._dir_has_report_files(api_dir): + return api_dir + + # DB has no scan record - fall through to filesystem scan below except Exception as e: logger.warning(f"Error resolving report dir for scan {scan_id}: {e}") + # Filesystem fallback: scan ALL report dirs for scan_id in metadata + for report_dir in sorted( + report_base.glob("*_*"), + key=lambda p: p.stat().st_mtime, + reverse=True, + ): + if not report_dir.is_dir(): + continue + if not self._dir_has_report_files(report_dir): + continue + # Check validated_findings.json for scan_id match + vf = report_dir / "validated_findings.json" + if vf.is_file(): + try: + import json + data = json.loads(vf.read_text()) + # Check root-level scan_id, meta.scan_id, or list items + if isinstance(data, dict): + if data.get("scan_id") == scan_id: + 
return report_dir + meta = data.get("meta", {}) + if isinstance(meta, dict) and meta.get("scan_id") == scan_id: + return report_dir + if isinstance(data, list): + for item in data: + if isinstance(item, dict) and item.get("scan_id") == scan_id: + return report_dir + except Exception: + pass + # Also check raw_findings.json + rf = report_dir / "raw_findings.json" + if rf.is_file(): + try: + import json + data = json.loads(rf.read_text()) + if isinstance(data, dict): + if data.get("scan_id") == scan_id: + return report_dir + meta = data.get("meta", {}) + if isinstance(meta, dict) and meta.get("scan_id") == scan_id: + return report_dir + if isinstance(data, list): + for item in data: + if isinstance(item, dict) and item.get("scan_id") == scan_id: + return report_dir + except Exception: + pass + # Last resort without DB api_dir = report_base / f"scan_{scan_id}" if api_dir.is_dir() and self._dir_has_report_files(api_dir): diff --git a/bugtrace/tools/external.py b/bugtrace/tools/external.py index d6fed3b..b3c989f 100644 --- a/bugtrace/tools/external.py +++ b/bugtrace/tools/external.py @@ -688,7 +688,7 @@ def _parse_url_if_in_scope(self, url: str, target_domain: str) -> str: logger.debug(f"URL parsing error in GoSpider output: {e}") return None - async def run_gospider(self, url: str, cookies: List[Dict] = None, depth: int = 3, max_urls: int = None) -> List[str]: + async def run_gospider(self, url: str, cookies: List[Dict] = None, depth: int = 3, max_urls: int = None, extra_headers: Dict[str, str] = None) -> List[str]: """ Runs GoSpider crawler (native preferred, Docker fallback). Respects max_urls by counting unique in-scope URLs in real-time. 
@@ -712,7 +712,7 @@ async def run_gospider(self, url: str, cookies: List[Dict] = None, depth: int = self._record_tool_run("gospider") mode = "native" if native else "Docker" - logger.info(f"Starting GoSpider ({mode}) on {url} (depth={depth}, limit={max_urls})...") + logger.info(f"Starting GoSpider ({mode}) on {url} (depth={depth}, limit={max_urls}, cookies={'yes' if cookies else 'no'})...") dashboard.log(f"[External] Launching GoSpider ({mode}, depth={depth}) against {url}", "INFO") dashboard.update_task("gospider", name="GoSpider", status=f"Crawling: {url}") @@ -729,6 +729,10 @@ async def run_gospider(self, url: str, cookies: List[Dict] = None, depth: int = cookie_str = "; ".join([f"{c['name']}={c['value']}" for c in cookies]) cmd.extend(["--cookie", cookie_str]) + if extra_headers: + for header_name, header_value in extra_headers.items(): + cmd.extend(["-H", f"{header_name}: {header_value}"]) + if settings.GOSPIDER_NO_REDIRECT: cmd.append("--no-redirect")