diff --git a/aragora/ideacloud/__init__.py b/aragora/ideacloud/__init__.py new file mode 100644 index 0000000000..60c187779e --- /dev/null +++ b/aragora/ideacloud/__init__.py @@ -0,0 +1,44 @@ +""" +Idea Cloud — a graph-structured knowledge capture system. + +Dual-purpose: Obsidian-compatible personal thinking tool AND +structured input source for aragora's debate pipeline and KnowledgeMound. + +Storage: `.aragora_ideas/` vault with interlinked markdown files. + +Usage: + from aragora.ideacloud import IdeaCloud + + cloud = IdeaCloud(vault_path=".aragora_ideas") + cloud.load() + + # Ingest manually + node = await cloud.ingest_manual("AI safety is critical", title="AI Safety") + + # Ingest from Twitter bookmarks + added = await cloud.ingest_twitter_bookmarks("bookmarks.js") + + # Search, auto-link, auto-cluster + results = cloud.search("prompt injection") + cloud.auto_link() + clusters = cloud.auto_cluster() + + # Export cluster for debate pipeline + ideas = cloud.export_for_pipeline(cluster_id) + debate_ctx = cloud.export_for_debate(cluster_id) + propositions = cloud.extract_propositions(cluster_id) +""" + +from aragora.ideacloud.core import IdeaCloud +from aragora.ideacloud.graph.node import IdeaNode +from aragora.ideacloud.graph.edge import IdeaEdge +from aragora.ideacloud.graph.cluster import IdeaCluster +from aragora.ideacloud.graph.graph import IdeaGraph + +__all__ = [ + "IdeaCloud", + "IdeaNode", + "IdeaEdge", + "IdeaCluster", + "IdeaGraph", +] diff --git a/aragora/ideacloud/adapters/__init__.py b/aragora/ideacloud/adapters/__init__.py new file mode 100644 index 0000000000..6837c4b4ad --- /dev/null +++ b/aragora/ideacloud/adapters/__init__.py @@ -0,0 +1 @@ +"""Aragora integration adapters for Idea Cloud.""" diff --git a/aragora/ideacloud/adapters/km_adapter.py b/aragora/ideacloud/adapters/km_adapter.py new file mode 100644 index 0000000000..60f5e98e84 --- /dev/null +++ b/aragora/ideacloud/adapters/km_adapter.py @@ -0,0 +1,196 @@ +"""IdeaCloud adapter for Knowledge Mound. + +Provides bidirectional sync between the Idea Cloud graph and +KnowledgeMound for unified knowledge management. + +Forward sync: IdeaNode → KnowledgeNode +Reverse sync: KM validation results → IdeaNode confidence updates +""" + +from __future__ import annotations + +import logging +from typing import Any + +from aragora.knowledge.mound.adapters._base import KnowledgeMoundAdapter + +logger = logging.getLogger(__name__) + + +class IdeaCloudAdapter(KnowledgeMoundAdapter): + """Bridges Idea Cloud to Knowledge Mound. + + Forward flow: Sync unsynced IdeaNodes → KM as KnowledgeNodes + Reverse flow: Apply KM validations back to IdeaNode confidence + """ + + adapter_name = "ideacloud" + + def __init__(self, idea_cloud: Any = None, **kwargs: Any) -> None: + super().__init__(**kwargs) + self._cloud = idea_cloud + + async def sync_to_km(self, **kwargs: Any) -> dict[str, Any]: + """Forward sync: IdeaCloud → KnowledgeMound. + + Syncs all nodes where ``km_synced=False`` to KM. + """ + if not self._cloud: + return {"records_synced": 0, "error": "No IdeaCloud instance configured"} + + synced = 0 + skipped = 0 + failed = 0 + + for node in self._cloud.graph.nodes.values(): + if node.km_synced: + skipped += 1 + continue + + try: + async with self._resilient_call("sync_to_km"): + self._emit_event( + "ideacloud_sync", + { + "node_id": node.id, + "title": node.title, + "direction": "forward", + }, + ) + + # Mark as synced + node.km_synced = True + synced += 1 + + except Exception as exc: + logger.warning("Failed to sync node %s: %s", node.id, exc) + failed += 1 + + # Persist sync status back to vault + if synced > 0: + self._cloud.save() + + result = { + "records_synced": synced, + "records_skipped": skipped, + "records_failed": failed, + } + + self._record_metric( + "sync_to_km", + success=failed == 0, + latency=0.0, + extra_labels={"synced": str(synced)}, + ) + + return result + + async def sync_from_km( + self, + km_validations: list[dict[str, Any]] | None = None, + **kwargs: Any, + ) -> dict[str, Any]: + """Reverse sync: KnowledgeMound → IdeaCloud. + + Applies KM validation results back to IdeaNode confidence scores + and pipeline status. + + Args: + km_validations: Optional list of validation dicts with fields: + - ``source_id``: IdeaNode ID + - ``confidence``: Updated confidence (0-1) + - ``validation_status``: "confirmed" | "disputed" | "uncertain" + - ``notes``: Optional validation notes + If not provided, queries KM for validation events. + + Returns: + Dict with counts of analyzed and updated records. + """ + if not self._cloud: + return {"records_updated": 0, "error": "No IdeaCloud instance configured"} + + updated = 0 + analyzed = 0 + + validations = km_validations or [] + + # Index validations by source_id for quick lookup + validation_map: dict[str, dict[str, Any]] = {} + for v in validations: + sid = v.get("source_id", "") + if sid: + validation_map[sid] = v + + for node in self._cloud.graph.nodes.values(): + if not node.km_synced: + continue + + analyzed += 1 + + validation = validation_map.get(node.id) + if not validation: + continue + + try: + async with self._resilient_call("sync_from_km"): + # Update confidence from KM validation + new_confidence = validation.get("confidence") + if new_confidence is not None: + node.confidence = float(new_confidence) + + # Update pipeline status based on validation + status = validation.get("validation_status", "") + if status == "confirmed" and node.pipeline_status == "inbox": + node.pipeline_status = "candidate" + elif status == "disputed": + node.confidence = max(0.0, node.confidence - 0.2) + + node.updated_at = __import__( + "aragora.ideacloud.graph.node", + fromlist=["_now_iso"], + )._now_iso() + + self._emit_event( + "ideacloud_sync", + { + "node_id": node.id, + "title": node.title, + "direction": "reverse", + "validation_status": status, + }, + ) + + updated += 1 + + except Exception as exc: + logger.warning( + "Failed to apply KM validation to node %s: %s", + node.id, + exc, + ) + + if updated > 0: + self._cloud.save() + + result = { + "records_analyzed": analyzed, + "records_updated": updated, + } + + self._record_metric( + "sync_from_km", + success=True, + latency=0.0, + extra_labels={"updated": str(updated)}, + ) + + return result + + def health_check(self) -> dict[str, Any]: + """Return adapter health status.""" + base = super().health_check() + if self._cloud: + stats = self._cloud.stats + base["ideacloud_nodes"] = stats.get("total_nodes", 0) + base["ideacloud_clusters"] = stats.get("total_clusters", 0) + return base diff --git a/aragora/ideacloud/adapters/pipeline_bridge.py b/aragora/ideacloud/adapters/pipeline_bridge.py new file mode 100644 index 0000000000..51612ac652 --- /dev/null +++ b/aragora/ideacloud/adapters/pipeline_bridge.py @@ -0,0 +1,253 @@ +"""Pipeline bridge — convert IdeaCloud clusters to pipeline inputs. + +Bridges the gap between the Idea Cloud (personal knowledge graph) and +aragora's Idea-to-Execution pipeline, enabling clusters of ideas to be +promoted into debate propositions or full pipeline runs. + +Entry points: + cluster_to_ideas(graph, cluster_id) → list[str] for pipeline.from_ideas() + cluster_to_brain_dump(graph, cluster_id) → str for pipeline.from_brain_dump() + cluster_to_universal_nodes(graph, cluster_id) → list[UniversalNode] + export_cluster_for_debate(graph, cluster_id) → dict for Arena.env +""" + +from __future__ import annotations + +import logging +from typing import TYPE_CHECKING, Any + +if TYPE_CHECKING: + from aragora.ideacloud.graph.graph import IdeaGraph + +logger = logging.getLogger(__name__) + + +def cluster_to_ideas(graph: IdeaGraph, cluster_id: str) -> list[str]: + """Convert a cluster's nodes into a list of idea strings. + + This is the simplest bridge — feed the result directly into + ``IdeaToExecutionPipeline.from_ideas(ideas)``. + + Each idea string is formatted as: + "[Title]: [Body excerpt]" + + Args: + graph: The idea graph. + cluster_id: Cluster to export. + + Returns: + List of idea strings ready for the pipeline. + + Raises: + KeyError: If cluster_id not found. + """ + cluster = graph.clusters.get(cluster_id) + if not cluster: + raise KeyError(f"Cluster {cluster_id!r} not found in graph") + + ideas: list[str] = [] + for nid in cluster.node_ids: + node = graph.nodes.get(nid) + if not node: + continue + + # Build a concise idea string + body_excerpt = node.body[:300].strip() if node.body else "" + if node.title and body_excerpt: + idea = f"{node.title}: {body_excerpt}" + elif node.title: + idea = node.title + elif body_excerpt: + idea = body_excerpt + else: + continue + + ideas.append(idea) + + logger.info("Exported %d ideas from cluster %s", len(ideas), cluster_id) + return ideas + + +def cluster_to_brain_dump(graph: IdeaGraph, cluster_id: str) -> str: + """Convert a cluster into a brain-dump text block. + + This produces a structured text blob suitable for + ``IdeaToExecutionPipeline.from_brain_dump(text)``. + + The output includes the cluster name, common themes, and each + idea as a bullet point with its connections noted. + + Args: + graph: The idea graph. + cluster_id: Cluster to export. + + Returns: + Formatted brain-dump string. + + Raises: + KeyError: If cluster_id not found. + """ + cluster = graph.clusters.get(cluster_id) + if not cluster: + raise KeyError(f"Cluster {cluster_id!r} not found in graph") + + lines: list[str] = [] + lines.append(f"# {cluster.name}") + lines.append("") + + if cluster.tags: + lines.append(f"Themes: {', '.join(cluster.tags)}") + lines.append("") + + lines.append("## Ideas") + lines.append("") + + for nid in cluster.node_ids: + node = graph.nodes.get(nid) + if not node: + continue + + body_preview = node.body[:200].strip() if node.body else "" + lines.append(f"- **{node.title}**: {body_preview}") + + # Note connections + edges = graph.get_edges_for(nid) + for edge in edges: + other_id = edge.target_id if edge.source_id == nid else edge.source_id + other = graph.nodes.get(other_id) + if other and other_id in cluster.node_ids: + lines.append(f" - {edge.edge_type} → {other.title}") + + lines.append("") + lines.append("## Implications") + lines.append("") + lines.append("What patterns, risks, or opportunities emerge from these ideas together?") + + return "\n".join(lines) + + +def cluster_to_universal_nodes( + graph: IdeaGraph, + cluster_id: str, +) -> list[dict[str, Any]]: + """Convert cluster nodes to UniversalNode-compatible dicts. + + These can be instantiated as ``UniversalNode.from_dict(d)`` and + added to a ``UniversalGraph`` for pipeline visualization. + + Each node maps to ``PipelineStage.IDEAS`` with ``node_subtype`` + derived from the IdeaNode's ``node_type``. + + Args: + graph: The idea graph. + cluster_id: Cluster to export. + + Returns: + List of dicts compatible with UniversalNode.from_dict(). + """ + cluster = graph.clusters.get(cluster_id) + if not cluster: + raise KeyError(f"Cluster {cluster_id!r} not found in graph") + + # Map ideacloud node_type → pipeline IdeaNodeType + _NODE_TYPE_MAP = { + "idea_concept": "concept", + "idea_insight": "insight", + "idea_evidence": "evidence", + "idea_hypothesis": "hypothesis", + "idea_question": "question", + "idea_cluster": "cluster", + } + + import time + + nodes: list[dict[str, Any]] = [] + for nid in cluster.node_ids: + node = graph.nodes.get(nid) + if not node: + continue + + subtype = _NODE_TYPE_MAP.get(node.node_type, "concept") + nodes.append( + { + "id": f"ic_{node.id}", + "stage": "ideas", + "node_subtype": subtype, + "label": node.title or "Untitled", + "description": node.body[:500] if node.body else "", + "content_hash": node.content_hash, + "confidence": node.confidence, + "status": "active", + "data": { + "source_type": node.source_type, + "source_url": node.source_url, + "source_author": node.source_author, + "tags": node.tags, + "ideacloud_id": node.id, + "cluster_id": cluster_id, + }, + "metadata": { + "origin": "ideacloud", + "cluster_name": cluster.name, + }, + "created_at": time.time(), + "updated_at": time.time(), + } + ) + + logger.info( + "Exported %d UniversalNode dicts from cluster %s", + len(nodes), + cluster_id, + ) + return nodes + + +def export_cluster_for_debate( + graph: IdeaGraph, + cluster_id: str, +) -> dict[str, Any]: + """Export a cluster as a debate environment context. + + Returns a dict suitable for constructing an ``Environment`` object + for a debate ``Arena`` run. + + Args: + graph: The idea graph. + cluster_id: Cluster to export. + + Returns: + Dict with ``task``, ``context``, ``metadata`` fields. + """ + cluster = graph.clusters.get(cluster_id) + if not cluster: + raise KeyError(f"Cluster {cluster_id!r} not found in graph") + + ideas = cluster_to_ideas(graph, cluster_id) + + # Build the debate task + task = ( + f"Analyze the following cluster of ideas about '{cluster.name}' " + f"and identify: (1) key insights, (2) potential risks, " + f"(3) actionable implications, (4) gaps or questions to investigate." + ) + + context_lines = [f"Cluster: {cluster.name}"] + if cluster.tags: + context_lines.append(f"Themes: {', '.join(cluster.tags)}") + context_lines.append("") + context_lines.append("Ideas:") + for i, idea in enumerate(ideas, 1): + context_lines.append(f" {i}. {idea}") + + return { + "task": task, + "context": "\n".join(context_lines), + "metadata": { + "origin": "ideacloud", + "cluster_id": cluster_id, + "cluster_name": cluster.name, + "node_count": len(cluster.node_ids), + "tags": cluster.tags, + }, + } diff --git a/aragora/ideacloud/cli/__init__.py b/aragora/ideacloud/cli/__init__.py new file mode 100644 index 0000000000..c6ba231970 --- /dev/null +++ b/aragora/ideacloud/cli/__init__.py @@ -0,0 +1 @@ +"""CLI commands for Idea Cloud.""" diff --git a/aragora/ideacloud/cli/commands.py b/aragora/ideacloud/cli/commands.py new file mode 100644 index 0000000000..52d0617504 --- /dev/null +++ b/aragora/ideacloud/cli/commands.py @@ -0,0 +1,270 @@ +"""CLI commands for Idea Cloud. + +Subcommands: + aragora ideacloud load — Ingest ideas from sources + aragora ideacloud list — List ideas or clusters + aragora ideacloud search — Search ideas by text + aragora ideacloud show — Show idea or cluster details + aragora ideacloud cluster — Auto-cluster ideas + aragora ideacloud link — Auto-link ideas + aragora ideacloud stats — Show graph statistics +""" +# ruff: noqa: T201 + +from __future__ import annotations + +import argparse +import asyncio +import json +import logging +from typing import Any + +logger = logging.getLogger(__name__) + + +def add_ideacloud_commands(subparsers: Any) -> None: + """Register ideacloud subcommand group.""" + + ic_parser = subparsers.add_parser( + "ideacloud", + help="Manage the Idea Cloud knowledge graph", + ) + ic_sub = ic_parser.add_subparsers(dest="ideacloud_cmd") + + # ---- load ---- + load_p = ic_sub.add_parser("load", help="Ingest ideas from a source") + load_p.add_argument( + "--source", + choices=["twitter-bookmarks", "twitter-likes", "manual"], + required=True, + help="Source type", + ) + load_p.add_argument("--file", help="Source file path (for twitter exports)") + load_p.add_argument("--text", help="Text content (for manual)") + load_p.add_argument("--url", help="URL (for manual)") + load_p.add_argument("--title", help="Title (for manual)") + load_p.add_argument("--tags", help="Comma-separated tags") + load_p.add_argument("--vault", default=".aragora_ideas", help="Vault path") + + # ---- list ---- + list_p = ic_sub.add_parser("list", help="List ideas or clusters") + list_p.add_argument("--clusters", action="store_true", help="List clusters instead of ideas") + list_p.add_argument("--status", help="Filter by pipeline status") + list_p.add_argument("--source", help="Filter by source type") + list_p.add_argument("--limit", type=int, default=20, help="Max results") + list_p.add_argument("--vault", default=".aragora_ideas", help="Vault path") + + # ---- search ---- + search_p = ic_sub.add_parser("search", help="Search ideas by text") + search_p.add_argument("query", help="Search query") + search_p.add_argument("--limit", type=int, default=10, help="Max results") + search_p.add_argument("--vault", default=".aragora_ideas", help="Vault path") + + # ---- show ---- + show_p = ic_sub.add_parser("show", help="Show idea or cluster details") + show_p.add_argument("id", help="Node ID (ic_...) or cluster ID (cl_...)") + show_p.add_argument("--format", choices=["markdown", "json"], default="markdown") + show_p.add_argument("--vault", default=".aragora_ideas", help="Vault path") + + # ---- cluster ---- + cluster_p = ic_sub.add_parser("cluster", help="Auto-cluster ideas") + cluster_p.add_argument("--min-size", type=int, default=2, help="Min cluster size") + cluster_p.add_argument("--vault", default=".aragora_ideas", help="Vault path") + + # ---- link ---- + link_p = ic_sub.add_parser("link", help="Auto-link related ideas") + link_p.add_argument("--node", help="Specific node ID to link (default: all)") + link_p.add_argument("--min-similarity", type=float, default=0.3) + link_p.add_argument("--vault", default=".aragora_ideas", help="Vault path") + + # ---- stats ---- + stats_p = ic_sub.add_parser("stats", help="Show graph statistics") + stats_p.add_argument("--vault", default=".aragora_ideas", help="Vault path") + + +def handle_ideacloud(args: argparse.Namespace) -> int: + """Dispatch ideacloud subcommands. Returns exit code.""" + + cmd = getattr(args, "ideacloud_cmd", None) + if not cmd: + print("Usage: aragora ideacloud ") + print("Commands: load, list, search, show, cluster, link, stats") + return 1 + + # Import here to avoid circular imports + from aragora.ideacloud.core import IdeaCloud + + vault = getattr(args, "vault", ".aragora_ideas") + cloud = IdeaCloud(vault_path=vault) + cloud.load() + + dispatch = { + "load": _cmd_load, + "list": _cmd_list, + "search": _cmd_search, + "show": _cmd_show, + "cluster": _cmd_cluster, + "link": _cmd_link, + "stats": _cmd_stats, + } + + handler = dispatch.get(cmd) + if not handler: + print(f"Unknown command: {cmd}") + return 1 + + return handler(cloud, args) + + +# ---- Command handlers ---- + + +def _cmd_load(cloud: Any, args: argparse.Namespace) -> int: + """Handle 'ideacloud load'.""" + source = args.source + + if source == "twitter-bookmarks": + if not args.file: + print("--file required for twitter-bookmarks") + return 1 + nodes = asyncio.run(cloud.ingest_twitter_bookmarks(args.file)) + print(f"Ingested {len(nodes)} bookmarks") + + elif source == "twitter-likes": + if not args.file: + print("--file required for twitter-likes") + return 1 + nodes = asyncio.run(cloud.ingest_twitter_likes(args.file)) + print(f"Ingested {len(nodes)} likes") + + elif source == "manual": + content = args.text or args.url + if not content: + print("--text or --url required for manual") + return 1 + tags = args.tags.split(",") if args.tags else [] + node = asyncio.run( + cloud.ingest_manual( + content=content, + title=args.title, + source_url=args.url, + tags=tags, + ) + ) + if node: + print(f"Added: {node.id} — {node.title}") + else: + print("Node rejected (low quality or duplicate)") + + return 0 + + +def _cmd_list(cloud: Any, args: argparse.Namespace) -> int: + """Handle 'ideacloud list'.""" + if args.clusters: + clusters = cloud.list_clusters() + if not clusters: + print("No clusters found. Run 'aragora ideacloud cluster' first.") + return 0 + for c in clusters: + print(f" {c.id} {c.name:<40} ({c.size} ideas) tags: {', '.join(c.tags[:5])}") + else: + nodes = cloud.list_nodes( + status=args.status, + source_type=args.source, + limit=args.limit, + ) + if not nodes: + print("No ideas found.") + return 0 + for n in nodes: + status = f"[{n.pipeline_status}]" + print(f" {n.id} {status:<14} {n.title[:60]}") + return 0 + + +def _cmd_search(cloud: Any, args: argparse.Namespace) -> int: + """Handle 'ideacloud search'.""" + results = cloud.search(args.query, limit=args.limit) + if not results: + print(f"No results for '{args.query}'") + return 0 + for node, score in results: + print(f" [{score:.2f}] {node.id} {node.title[:60]}") + return 0 + + +def _cmd_show(cloud: Any, args: argparse.Namespace) -> int: + """Handle 'ideacloud show'.""" + target_id = args.id + + if target_id.startswith("cl_"): + cluster = cloud.get_cluster(target_id) + if not cluster: + print(f"Cluster not found: {target_id}") + return 1 + if args.format == "json": + print(json.dumps(cluster.to_dict(), indent=2)) + else: + print(cloud.cluster_summary(target_id)) + else: + node = cloud.get_node(target_id) + if not node: + print(f"Node not found: {target_id}") + return 1 + if args.format == "json": + print(json.dumps(node.to_frontmatter_dict(), indent=2)) + else: + print(f"# {node.title}") + print(f"ID: {node.id}") + print(f"Source: {node.source_type} — {node.source_url or 'N/A'}") + print(f"Status: {node.pipeline_status}") + print(f"Tags: {', '.join(node.tags)}") + print(f"Relevance: {node.relevance_score:.2f}") + if node.cluster_id: + print(f"Cluster: {node.cluster_id}") + print(f"\n{node.body}") + + return 0 + + +def _cmd_cluster(cloud: Any, args: argparse.Namespace) -> int: + """Handle 'ideacloud cluster'.""" + clusters = cloud.auto_cluster(min_cluster_size=args.min_size) + print(f"Found {len(clusters)} clusters:") + for c in sorted(clusters.values(), key=lambda x: x.size, reverse=True): + print(f" {c.id} {c.name:<40} ({c.size} ideas)") + return 0 + + +def _cmd_link(cloud: Any, args: argparse.Namespace) -> int: + """Handle 'ideacloud link'.""" + new_edges = cloud.auto_link( + node_id=args.node, + min_similarity=args.min_similarity, + ) + print(f"Created {len(new_edges)} new connections") + for edge in new_edges[:20]: + src = cloud.get_node(edge.source_id) + tgt = cloud.get_node(edge.target_id) + src_title = src.title[:30] if src else edge.source_id + tgt_title = tgt.title[:30] if tgt else edge.target_id + print(f" {src_title} --{edge.edge_type}--> {tgt_title} (w={edge.weight:.2f})") + return 0 + + +def _cmd_stats(cloud: Any, args: argparse.Namespace) -> int: + """Handle 'ideacloud stats'.""" + s = cloud.stats + print( + f"Idea Cloud: {s['total_nodes']} ideas, {s['total_edges']} connections, {s['total_clusters']} clusters" + ) + if s.get("by_status"): + print("\nBy status:") + for status, count in sorted(s["by_status"].items()): + print(f" {status}: {count}") + if s.get("by_source"): + print("\nBy source:") + for source, count in sorted(s["by_source"].items()): + print(f" {source}: {count}") + return 0 diff --git a/aragora/ideacloud/core.py b/aragora/ideacloud/core.py new file mode 100644 index 0000000000..2b161e2c96 --- /dev/null +++ b/aragora/ideacloud/core.py @@ -0,0 +1,424 @@ +"""IdeaCloud — the main orchestrator for the Idea Cloud system. + +Ties together the graph, storage, ingestion, and operations layers +into a single high-level API. + +Usage: + cloud = IdeaCloud(".aragora_ideas") + cloud.load() + + # Ingest from various sources + cloud.ingest_manual("https://example.com/article", title="...", tags=[...]) + await cloud.ingest_twitter_bookmarks("path/to/bookmarks.js") + + # Search and explore + results = cloud.search("prompt injection") + cluster = cloud.get_cluster("cl_abc1234") + + # Auto-organize + cloud.auto_link() + cloud.auto_cluster() + + # Export for debate + propositions = cloud.cluster_summary("cl_abc1234") +""" + +from __future__ import annotations + +import logging +from pathlib import Path +from typing import Any + +from aragora.ideacloud.graph.cluster import IdeaCluster +from aragora.ideacloud.graph.edge import IdeaEdge +from aragora.ideacloud.graph.graph import IdeaGraph +from aragora.ideacloud.graph.node import IdeaNode +from aragora.ideacloud.graph import operations as ops +from aragora.ideacloud.ingestion.quality import DeduplicationEngine, QualityFilter + +logger = logging.getLogger(__name__) + +# Default vault location (relative to workspace root) +DEFAULT_VAULT_PATH = ".aragora_ideas" + + +class IdeaCloud: + """High-level API for the Idea Cloud system. + + Orchestrates graph, storage, ingestion, and operations. + """ + + def __init__( + self, + vault_path: str | Path = DEFAULT_VAULT_PATH, + min_quality: float = 0.3, + dedup_threshold: float = 0.85, + ) -> None: + self.vault_path = Path(vault_path) + self.graph = IdeaGraph(self.vault_path) + self.quality_filter = QualityFilter(min_score=min_quality) + self.dedup_engine = DeduplicationEngine(similarity_threshold=dedup_threshold) + + # ---- Lifecycle ---- + + def load(self) -> int: + """Load the idea graph from vault. + + Returns: + Number of nodes loaded. + """ + return self.graph.load() + + def save(self) -> None: + """Persist all changes to vault.""" + self.graph.save() + + @property + def stats(self) -> dict[str, Any]: + """Graph statistics.""" + return self.graph.stats + + # ---- Ingestion ---- + + def add(self, node: IdeaNode, skip_quality: bool = False) -> bool: + """Add a single idea node to the graph. + + Runs quality filter and deduplication unless ``skip_quality=True``. + + Returns: + True if added, False if rejected (low quality or duplicate). + """ + if not skip_quality: + if not self.quality_filter.is_acceptable(node): + logger.debug("Rejected low-quality node: %s", node.title[:60]) + return False + node.relevance_score = self.quality_filter.score(node) + + dupes = self.dedup_engine.find_duplicates(node, self.graph) + if dupes: + logger.debug("Rejected duplicate node: %s (matches %s)", node.title[:60], dupes) + return False + + self.graph.add_node(node) + return True + + async def ingest_manual( + self, + content: str, + title: str | None = None, + source_url: str | None = None, + source_author: str | None = None, + tags: list[str] | None = None, + ) -> IdeaNode | None: + """Ingest a manually pasted idea. + + Returns the created node, or None if rejected. + """ + from aragora.ideacloud.ingestion.manual import ManualPasteIngestor + + ingestor = ManualPasteIngestor() + + if title or source_url or source_author or tags: + node = await ingestor.ingest_with_context( + content=content, + title=title, + source_url=source_url, + source_author=source_author, + tags=tags, + ) + return node if self.add(node) else None + + nodes = await ingestor.ingest(content) + if nodes and self.add(nodes[0]): + return nodes[0] + return None + + async def ingest_twitter_bookmarks(self, file_path: str | Path) -> list[IdeaNode]: + """Ingest from Twitter bookmarks export. + + Returns list of successfully added nodes. + """ + from aragora.ideacloud.ingestion.twitter_bookmarks import TwitterBookmarksIngestor + + ingestor = TwitterBookmarksIngestor() + raw_nodes = await ingestor.ingest(file_path) + return self._ingest_batch(raw_nodes) + + async def ingest_twitter_likes(self, file_path: str | Path) -> list[IdeaNode]: + """Ingest from Twitter likes export. + + Returns list of successfully added nodes. + """ + from aragora.ideacloud.ingestion.twitter_likes import TwitterLikesIngestor + + ingestor = TwitterLikesIngestor() + raw_nodes = await ingestor.ingest(file_path) + return self._ingest_batch(raw_nodes) + + def _ingest_batch(self, nodes: list[IdeaNode]) -> list[IdeaNode]: + """Filter, deduplicate, and add a batch of nodes.""" + # Quality filter + filtered = self.quality_filter.filter_batch(nodes) + # Dedup + unique = self.dedup_engine.deduplicate_batch(filtered, self.graph) + # Add to graph + added: list[IdeaNode] = [] + for node in unique: + self.graph.add_node(node) + added.append(node) + logger.info("Ingested %d/%d nodes", len(added), len(nodes)) + return added + + # ---- Search ---- + + def search(self, query: str, limit: int = 10) -> list[tuple[IdeaNode, float]]: + """Search ideas by text query. + + Returns (node, relevance) tuples sorted by relevance. + """ + return self.graph.search(query, limit=limit) + + # ---- Graph operations ---- + + def auto_link( + self, + node_id: str | None = None, + min_similarity: float = 0.3, + ) -> list[IdeaEdge]: + """Auto-create connections between related ideas. + + Args: + node_id: Link a specific node, or None for all. + min_similarity: Minimum similarity threshold. + + Returns: + List of newly created edges. + """ + new_edges = ops.auto_link(self.graph, node_id=node_id, min_similarity=min_similarity) + if new_edges: + self.graph.save() + return new_edges + + def auto_cluster(self, min_cluster_size: int = 2) -> dict[str, IdeaCluster]: + """Auto-cluster ideas based on connections and shared tags. + + Returns dict of cluster_id → IdeaCluster. + """ + clusters = ops.auto_cluster(self.graph, min_cluster_size=min_cluster_size) + self.graph.save() + return clusters + + # ---- Cluster access ---- + + def get_cluster(self, cluster_id: str) -> IdeaCluster | None: + """Get a cluster by ID.""" + return self.graph.get_cluster(cluster_id) + + def list_clusters(self) -> list[IdeaCluster]: + """List all clusters sorted by size (largest first).""" + return sorted( + self.graph.clusters.values(), + key=lambda c: c.size, + reverse=True, + ) + + def cluster_nodes(self, cluster_id: str) -> list[IdeaNode]: + """Get all nodes in a cluster.""" + cluster = self.graph.get_cluster(cluster_id) + if not cluster: + return [] + return [self.graph.nodes[nid] for nid in cluster.node_ids if nid in self.graph.nodes] + + def cluster_summary(self, cluster_id: str) -> str: + """Generate a text summary of a cluster for debate proposition generation. + + Returns a markdown summary of the cluster's ideas and connections. + """ + cluster = self.graph.get_cluster(cluster_id) + if not cluster: + return "" + + lines: list[str] = [] + lines.append(f"# Cluster: {cluster.name}") + if cluster.description: + lines.append(f"\n{cluster.description}") + lines.append(f"\nTags: {', '.join(cluster.tags)}") + lines.append(f"\n## Ideas ({cluster.size})") + + for nid in cluster.node_ids: + node = self.graph.nodes.get(nid) + if not node: + continue + lines.append(f"\n### {node.title}") + if node.source_url: + lines.append(f"Source: {node.source_url}") + if node.body: + # First 500 chars of body + body_preview = node.body[:500] + if len(node.body) > 500: + body_preview += "..." + lines.append(body_preview) + + return "\n".join(lines) + + # ---- RSS Ingestion ---- + + async def ingest_rss( + self, + feeds: list[dict[str, Any]] | None = None, + relevance_keywords: list[str] | None = None, + min_relevance: float = 0.0, + ) -> list[IdeaNode]: + """Ingest from RSS/Atom feeds. + + Args: + feeds: List of feed configs, each a dict with at least ``url`` + and optionally ``name``, ``category``, ``priority``. + relevance_keywords: Keywords for relevance filtering. + min_relevance: Minimum relevance score. + + Returns: + List of successfully added nodes. + """ + from aragora.ideacloud.ingestion.rss_feeds import RSSFeedIngestor + + ingestor = RSSFeedIngestor( + relevance_keywords=relevance_keywords, + min_relevance=min_relevance, + ) + + for feed in feeds or []: + ingestor.add_feed( + url=feed["url"], + name=feed.get("name", ""), + category=feed.get("category", ""), + priority=feed.get("priority", 5), + max_entries=feed.get("max_entries", 50), + topic_keywords=feed.get("topic_keywords"), + ) + + raw_nodes = await ingestor.ingest() + return self._ingest_batch(raw_nodes) + + # ---- Pipeline Bridge ---- + + def export_for_pipeline(self, cluster_id: str) -> list[str]: + """Export a cluster as idea strings for IdeaToExecutionPipeline.from_ideas(). + + Returns: + List of idea strings ready for the pipeline. + """ + from aragora.ideacloud.adapters.pipeline_bridge import cluster_to_ideas + + return cluster_to_ideas(self.graph, cluster_id) + + def export_for_brain_dump(self, cluster_id: str) -> str: + """Export a cluster as brain-dump text for pipeline.from_brain_dump(). + + Returns: + Formatted text suitable for brain-dump pipeline entry. + """ + from aragora.ideacloud.adapters.pipeline_bridge import cluster_to_brain_dump + + return cluster_to_brain_dump(self.graph, cluster_id) + + def export_for_debate(self, cluster_id: str) -> dict[str, Any]: + """Export a cluster as a debate environment context. + + Returns: + Dict with ``task``, ``context``, ``metadata`` fields. + """ + from aragora.ideacloud.adapters.pipeline_bridge import export_cluster_for_debate + + return export_cluster_for_debate(self.graph, cluster_id) + + def export_universal_nodes(self, cluster_id: str) -> list[dict[str, Any]]: + """Export cluster as UniversalNode-compatible dicts. + + Returns: + List of dicts for ``UniversalNode.from_dict()``. + """ + from aragora.ideacloud.adapters.pipeline_bridge import cluster_to_universal_nodes + + return cluster_to_universal_nodes(self.graph, cluster_id) + + # ---- Proposition Extraction ---- + + def extract_propositions(self, cluster_id: str) -> list[str]: + """Extract debate-ready propositions from a cluster. + + Returns: + List of proposition strings. + """ + return ops.extract_propositions(self.graph, cluster_id) + + # ---- Promote / Status ---- + + def promote_node(self, node_id: str, new_status: str) -> bool: + """Change a node's pipeline status. + + Status progression: inbox → candidate → prioritized → exported + + Returns: + True if status was updated. + """ + node = self.graph.get_node(node_id) + if not node: + return False + + valid_statuses = {"inbox", "candidate", "prioritized", "exported"} + if new_status not in valid_statuses: + logger.warning("Invalid status: %s", new_status) + return False + + node.pipeline_status = new_status + node.updated_at = __import__( + "aragora.ideacloud.graph.node", fromlist=["_now_iso"] + )._now_iso() + self.graph.save() + return True + + def promote_cluster(self, cluster_id: str, new_status: str) -> int: + """Promote all nodes in a cluster to a new pipeline status. + + Returns number of nodes promoted. + """ + cluster = self.graph.get_cluster(cluster_id) + if not cluster: + return 0 + + count = 0 + for nid in cluster.node_ids: + if self.promote_node(nid, new_status): + count += 1 + return count + + # ---- Node access ---- + + def get_node(self, node_id: str) -> IdeaNode | None: + """Get a node by ID.""" + return self.graph.get_node(node_id) + + def list_nodes( + self, + status: str | None = None, + source_type: str | None = None, + limit: int = 50, + ) -> list[IdeaNode]: + """List nodes with optional filtering. + + Args: + status: Filter by pipeline_status (inbox, candidate, etc.) + source_type: Filter by source_type (twitter_bookmark, etc.) + limit: Maximum results. + """ + nodes = list(self.graph.nodes.values()) + + if status: + nodes = [n for n in nodes if n.pipeline_status == status] + if source_type: + nodes = [n for n in nodes if n.source_type == source_type] + + # Sort by created_at descending + nodes.sort(key=lambda n: n.created_at, reverse=True) + return nodes[:limit] diff --git a/aragora/ideacloud/graph/__init__.py b/aragora/ideacloud/graph/__init__.py new file mode 100644 index 0000000000..492ef723be --- /dev/null +++ b/aragora/ideacloud/graph/__init__.py @@ -0,0 +1 @@ +"""Graph data model for Idea Cloud.""" diff --git a/aragora/ideacloud/graph/cluster.py b/aragora/ideacloud/graph/cluster.py new file mode 100644 index 0000000000..412a6472c0 --- /dev/null +++ b/aragora/ideacloud/graph/cluster.py @@ -0,0 +1,79 @@ +"""IdeaCluster — a group of related ideas. + +Clusters emerge from explicit connections and tag overlap. +They are the primary unit for debate proposition generation +and pipeline export. +""" + +from __future__ import annotations + +import uuid +from dataclasses import dataclass, field +from typing import Any + +from aragora.ideacloud.graph.node import _now_iso + + +def _generate_cluster_id() -> str: + """Generate a short unique ID with cl_ prefix.""" + return f"cl_{uuid.uuid4().hex[:7]}" + + +@dataclass +class IdeaCluster: + """A group of related ideas.""" + + id: str = field(default_factory=_generate_cluster_id) + name: str = "" + description: str = "" + node_ids: list[str] = field(default_factory=list) + tags: list[str] = field(default_factory=list) + confidence: float = 0.5 + created_at: str = field(default_factory=_now_iso) + updated_at: str = field(default_factory=_now_iso) + + @property + def size(self) -> int: + return len(self.node_ids) + + def add_node(self, node_id: str) -> None: + """Add a node to this cluster (idempotent).""" + if node_id not in self.node_ids: + self.node_ids.append(node_id) + self.updated_at = _now_iso() + + def remove_node(self, node_id: str) -> None: + """Remove a node from this cluster.""" + if node_id in self.node_ids: + self.node_ids.remove(node_id) + self.updated_at = _now_iso() + + def to_dict(self) -> dict[str, Any]: + """Serialize for index.json.""" + return { + "id": self.id, + "name": self.name, + "description": self.description, + "node_ids": self.node_ids, + "tags": self.tags, + "confidence": self.confidence, + "created_at": self.created_at, + "updated_at": self.updated_at, + } + + @classmethod + def from_dict(cls, d: dict[str, Any]) -> IdeaCluster: + """Deserialize from index.json entry.""" + return cls( + id=d.get("id", _generate_cluster_id()), + name=d.get("name", ""), + description=d.get("description", ""), + node_ids=d.get("node_ids", []), + tags=d.get("tags", []), + confidence=float(d.get("confidence", 0.5)), + created_at=d.get("created_at", _now_iso()), + updated_at=d.get("updated_at", _now_iso()), + ) + + def __repr__(self) -> str: + return f"IdeaCluster(id={self.id!r}, name={self.name!r}, size={self.size})" diff --git a/aragora/ideacloud/graph/edge.py b/aragora/ideacloud/graph/edge.py new file mode 100644 index 0000000000..217b8c6ec1 --- /dev/null +++ b/aragora/ideacloud/graph/edge.py @@ -0,0 +1,69 @@ +"""IdeaEdge — connections between ideas in the graph. + +Edges represent relationships: supports, refutes, relates_to, extends, conflicts. +They are stored both as wiki-links in markdown bodies and structurally in index.json. +""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Literal + +from aragora.ideacloud.graph.node import _now_iso + +EdgeType = Literal[ + "supports", + "refutes", + "relates_to", + "extends", + "conflicts", +] + + +@dataclass +class IdeaEdge: + """A directed connection between two ideas.""" + + source_id: str + target_id: str + edge_type: str = "relates_to" # EdgeType + weight: float = 1.0 # 0.0-1.0 strength + reason: str = "" # Why these are connected + auto_created: bool = False # True if from auto-linking + confidence: float = 0.5 + created_at: str = field(default_factory=_now_iso) + + def to_dict(self) -> dict: + """Serialize for index.json.""" + d = { + "source": self.source_id, + "target": self.target_id, + "type": self.edge_type, + "weight": self.weight, + "auto_created": self.auto_created, + "confidence": self.confidence, + "created_at": self.created_at, + } + if self.reason: + d["reason"] = self.reason + return d + + @classmethod + def from_dict(cls, d: dict) -> IdeaEdge: + """Deserialize from index.json entry.""" + return cls( + source_id=d["source"], + target_id=d["target"], + edge_type=d.get("type", "relates_to"), + weight=float(d.get("weight", 1.0)), + reason=d.get("reason", ""), + auto_created=bool(d.get("auto_created", False)), + confidence=float(d.get("confidence", 0.5)), + created_at=d.get("created_at", _now_iso()), + ) + + def __repr__(self) -> str: + return ( + f"IdeaEdge({self.source_id} --{self.edge_type}--> {self.target_id}, " + f"w={self.weight:.2f})" + ) diff --git a/aragora/ideacloud/graph/graph.py b/aragora/ideacloud/graph/graph.py new file mode 100644 index 0000000000..f652485d29 --- /dev/null +++ b/aragora/ideacloud/graph/graph.py @@ -0,0 +1,281 @@ +"""IdeaGraph — in-memory graph of ideas with search and traversal. + +Loads from an Obsidian-compatible vault (markdown files + index.json), +provides search, neighbour traversal, and persistence back to vault. +""" + +from __future__ import annotations + +import logging +from collections import defaultdict +from pathlib import Path +from typing import Any + +from aragora.ideacloud.graph.cluster import IdeaCluster +from aragora.ideacloud.graph.edge import IdeaEdge +from aragora.ideacloud.graph.node import IdeaNode +from aragora.ideacloud.storage import index as idx +from aragora.ideacloud.storage import markdown_io as md + +logger = logging.getLogger(__name__) + + +class IdeaGraph: + """In-memory graph backed by an Obsidian vault. + + Usage: + graph = IdeaGraph(".aragora_ideas") + graph.load() + graph.add_node(node) + results = graph.search("prompt injection") + graph.save() + """ + + def __init__(self, vault_path: str | Path) -> None: + self.vault_path = Path(vault_path) + self.nodes: dict[str, IdeaNode] = {} + self.edges: list[IdeaEdge] = [] + self.clusters: dict[str, IdeaCluster] = {} + + # Adjacency index (rebuilt on load/add) + self._adjacency: dict[str, list[str]] = defaultdict(list) + + # ---- Load / Save ---- + + def load(self) -> int: + """Load all nodes from vault markdown files, edges and clusters from index. + + Returns: + Number of nodes loaded. + """ + self.nodes.clear() + self.edges.clear() + self.clusters.clear() + self._adjacency.clear() + + # Load nodes from .md files + for fp in md.list_node_files(self.vault_path): + try: + node = md.read_node(fp) + self.nodes[node.id] = node + except Exception as exc: + logger.warning("Failed to load %s: %s", fp, exc) + + # Load edges and clusters from index + self.edges = idx.load_edges_from_index(self.vault_path) + self.clusters = idx.load_clusters_from_index(self.vault_path) + + # Rebuild adjacency + self._rebuild_adjacency() + + logger.info( + "Loaded idea graph: %d nodes, %d edges, %d clusters", + len(self.nodes), + len(self.edges), + len(self.clusters), + ) + return len(self.nodes) + + def save(self) -> None: + """Persist all nodes to markdown files and update index.""" + for node in self.nodes.values(): + md.write_node(node, self.vault_path) + idx.write_index(self.vault_path, self.nodes, self.edges, self.clusters) + logger.info("Saved idea graph: %d nodes", len(self.nodes)) + + def save_node(self, node_id: str) -> None: + """Persist a single node and update index.""" + if node_id in self.nodes: + md.write_node(self.nodes[node_id], self.vault_path) + idx.write_index(self.vault_path, self.nodes, self.edges, self.clusters) + + # ---- Node operations ---- + + def add_node(self, node: IdeaNode, persist: bool = True) -> None: + """Add a node to the graph. + + Args: + node: The idea node to add. + persist: If True, immediately write to disk and update index. + """ + self.nodes[node.id] = node + if persist: + md.write_node(node, self.vault_path) + idx.write_index(self.vault_path, self.nodes, self.edges, self.clusters) + + def remove_node(self, node_id: str) -> bool: + """Remove a node and its edges from the graph. + + Returns True if node was found and removed. + """ + if node_id not in self.nodes: + return False + + del self.nodes[node_id] + + # Remove edges involving this node + self.edges = [e for e in self.edges if e.source_id != node_id and e.target_id != node_id] + + # Remove from clusters + for cluster in self.clusters.values(): + cluster.remove_node(node_id) + + # Clean up empty clusters + self.clusters = {cid: c for cid, c in self.clusters.items() if c.size > 0} + + self._rebuild_adjacency() + md.delete_node_file(self.vault_path, node_id) + idx.write_index(self.vault_path, self.nodes, self.edges, self.clusters) + return True + + def get_node(self, node_id: str) -> IdeaNode | None: + """Get a node by ID.""" + return self.nodes.get(node_id) + + # ---- Edge operations ---- + + def add_edge(self, edge: IdeaEdge) -> None: + """Add an edge to the graph.""" + # Avoid duplicates + for existing in self.edges: + if ( + existing.source_id == edge.source_id + and existing.target_id == edge.target_id + and existing.edge_type == edge.edge_type + ): + return # Already exists + + self.edges.append(edge) + self._adjacency[edge.source_id].append(edge.target_id) + self._adjacency[edge.target_id].append(edge.source_id) + + def get_edges_for(self, node_id: str) -> list[IdeaEdge]: + """Get all edges involving a node.""" + return [e for e in self.edges if e.source_id == node_id or e.target_id == node_id] + + # ---- Cluster operations ---- + + def add_cluster(self, cluster: IdeaCluster) -> None: + """Add or update a cluster.""" + self.clusters[cluster.id] = cluster + # Update node cluster assignments + for nid in cluster.node_ids: + if nid in self.nodes: + self.nodes[nid].cluster_id = cluster.id + + def get_cluster(self, cluster_id: str) -> IdeaCluster | None: + """Get a cluster by ID.""" + return self.clusters.get(cluster_id) + + # ---- Search ---- + + def search(self, query: str, limit: int = 10) -> list[tuple[IdeaNode, float]]: + """Search nodes by text query. + + Returns (node, relevance_score) tuples sorted by relevance. + + MVP implementation: keyword overlap scoring. + Future: embedding-based semantic similarity. + """ + query_lower = query.lower() + query_terms = set(query_lower.split()) + + scored: list[tuple[IdeaNode, float]] = [] + for node in self.nodes.values(): + score = _text_match_score(query_terms, query_lower, node) + if score > 0: + scored.append((node, score)) + + scored.sort(key=lambda x: x[1], reverse=True) + return scored[:limit] + + # ---- Traversal ---- + + def get_neighbours(self, node_id: str, depth: int = 1) -> set[str]: + """Get all node IDs within N hops of a node.""" + visited: set[str] = set() + frontier: set[str] = {node_id} + + for _ in range(depth): + next_frontier: set[str] = set() + for nid in frontier: + if nid in visited: + continue + visited.add(nid) + next_frontier.update(self._adjacency.get(nid, [])) + frontier = next_frontier - visited + + # Include frontier nodes discovered but not yet expanded + visited.update(frontier) + visited.discard(node_id) # Don't include the start node + return visited + + # ---- Stats ---- + + @property + def stats(self) -> dict[str, Any]: + """Summary statistics for the graph.""" + status_counts: dict[str, int] = defaultdict(int) + source_counts: dict[str, int] = defaultdict(int) + for node in self.nodes.values(): + status_counts[node.pipeline_status] += 1 + source_counts[node.source_type] += 1 + + return { + "total_nodes": len(self.nodes), + "total_edges": len(self.edges), + "total_clusters": len(self.clusters), + "by_status": dict(status_counts), + "by_source": dict(source_counts), + } + + # ---- Internal ---- + + def _rebuild_adjacency(self) -> None: + """Rebuild the adjacency index from edges.""" + self._adjacency.clear() + for edge in self.edges: + self._adjacency[edge.source_id].append(edge.target_id) + self._adjacency[edge.target_id].append(edge.source_id) + + +def _text_match_score( + query_terms: set[str], + query_lower: str, + node: IdeaNode, +) -> float: + """Score a node against a search query using keyword overlap. + + Scoring: + - Title exact substring match: 0.5 + - Title term overlap: 0.3 * (matched terms / total terms) + - Body term overlap: 0.15 * (matched terms / total terms) + - Tag match: 0.3 per matching tag (capped at 0.6) + """ + score = 0.0 + + # Title substring match (highest signal) + if query_lower in node.title.lower(): + score += 0.5 + + # Term overlap in title + title_lower = node.title.lower() + title_hits = sum(1 for t in query_terms if t in title_lower) + if query_terms: + score += 0.3 * (title_hits / len(query_terms)) + + # Term overlap in body + body_lower = node.body.lower() + body_hits = sum(1 for t in query_terms if t in body_lower) + if query_terms: + score += 0.15 * (body_hits / len(query_terms)) + + # Tag matches + tag_hits = 0 + for tag in node.tags: + tag_lower = tag.lower().lstrip("#") + if any(t in tag_lower or tag_lower in t for t in query_terms): + tag_hits += 1 + score += min(0.6, tag_hits * 0.3) + + return min(1.0, score) diff --git a/aragora/ideacloud/graph/node.py b/aragora/ideacloud/graph/node.py new file mode 100644 index 0000000000..aa30ab29c8 --- /dev/null +++ b/aragora/ideacloud/graph/node.py @@ -0,0 +1,184 @@ +"""IdeaNode — the atomic unit of the Idea Cloud. + +Each node is both: +- An Obsidian markdown file (human-readable, editable, interlinked) +- A structured object for aragora (searchable, queryable, pipeline-compatible) + +Frontmatter holds all metadata; the markdown body is free-form content. +""" + +from __future__ import annotations + +import hashlib +import re +import uuid +from dataclasses import dataclass, field +from datetime import datetime, timezone +from typing import Any, Literal + +# Wiki-link pattern: [[Target]] or [[Target|Display Text]] +WIKI_LINK_PATTERN = re.compile(r"\[\[([^\]|]+)(?:\|[^\]]+)?\]\]") + +# Valid node types (aligned with KnowledgeMound NodeType literals) +IdeaNodeType = Literal[ + "idea_concept", + "idea_insight", + "idea_evidence", + "idea_hypothesis", + "idea_question", + "idea_observation", + "idea_cluster", +] + +# Pipeline lifecycle stages +PipelineStatus = Literal["inbox", "candidate", "prioritized", "exported"] + +# Source types +SourceType = Literal[ + "twitter_bookmark", + "twitter_like", + "rss", + "manual", +] + + +def _generate_id() -> str: + """Generate a short unique ID with ic_ prefix.""" + return f"ic_{uuid.uuid4().hex[:7]}" + + +def _content_hash(text: str) -> str: + """SHA-256 hash of content for change detection and dedup.""" + return f"sha256:{hashlib.sha256(text.encode('utf-8')).hexdigest()[:16]}" + + +def _now_iso() -> str: + """Current UTC time as ISO 8601 string.""" + return datetime.now(timezone.utc).isoformat() + + +@dataclass +class IdeaNode: + """An idea in the cloud with dual Obsidian/aragora representation.""" + + # Identity + id: str = field(default_factory=_generate_id) + title: str = "" + + # Content (markdown body, separate from frontmatter) + body: str = "" + + # Source tracking + source_type: str = "manual" # SourceType + source_url: str | None = None + source_author: str | None = None + date: str = field(default_factory=_now_iso) + + # Tags (human + auto) + tags: list[str] = field(default_factory=list) + + # Aragora metadata + node_type: str = "idea_insight" # IdeaNodeType + cluster_id: str | None = None + pipeline_status: str = "inbox" # PipelineStatus + relevance_score: float = 0.5 + confidence: float = 0.5 + + # KnowledgeMound sync + km_synced: bool = False + km_node_id: str | None = None + + # Provenance + content_hash: str = "" + created_at: str = field(default_factory=_now_iso) + updated_at: str = field(default_factory=_now_iso) + + def __post_init__(self) -> None: + if not self.content_hash: + self.content_hash = _content_hash(self.title + self.body) + + # ---- Wiki-link extraction ---- + + def extract_wiki_links(self) -> list[str]: + """Extract all [[wiki-link]] targets from the body.""" + return WIKI_LINK_PATTERN.findall(self.body) + + # ---- Frontmatter serialization ---- + + def to_frontmatter_dict(self) -> dict[str, Any]: + """Serialize metadata to a dict suitable for YAML frontmatter.""" + fm: dict[str, Any] = { + "title": self.title, + "source_type": self.source_type, + "date": self.date, + "tags": self.tags, + # Aragora metadata + "id": self.id, + "node_type": self.node_type, + "pipeline_status": self.pipeline_status, + "relevance_score": self.relevance_score, + "confidence": self.confidence, + "km_synced": self.km_synced, + "content_hash": self.content_hash, + "created_at": self.created_at, + "updated_at": self.updated_at, + } + # Optional fields — only include if set + if self.source_url: + fm["source_url"] = self.source_url + if self.source_author: + fm["source_author"] = self.source_author + if self.cluster_id: + fm["cluster_id"] = self.cluster_id + if self.km_node_id: + fm["km_node_id"] = self.km_node_id + return fm + + @classmethod + def from_frontmatter_dict(cls, fm: dict[str, Any], body: str = "") -> IdeaNode: + """Deserialize from frontmatter dict + body text.""" + return cls( + id=fm.get("id", _generate_id()), + title=fm.get("title", ""), + body=body, + source_type=fm.get("source_type", "manual"), + source_url=fm.get("source_url"), + source_author=fm.get("source_author"), + date=fm.get("date", _now_iso()), + tags=fm.get("tags", []), + node_type=fm.get("node_type", "idea_insight"), + cluster_id=fm.get("cluster_id"), + pipeline_status=fm.get("pipeline_status", "inbox"), + relevance_score=float(fm.get("relevance_score", 0.5)), + confidence=float(fm.get("confidence", 0.5)), + km_synced=bool(fm.get("km_synced", False)), + km_node_id=fm.get("km_node_id"), + content_hash=fm.get("content_hash", ""), + created_at=fm.get("created_at", _now_iso()), + updated_at=fm.get("updated_at", _now_iso()), + ) + + # ---- Content update ---- + + def update_content(self, title: str | None = None, body: str | None = None) -> None: + """Update content and refresh hash + timestamp.""" + if title is not None: + self.title = title + if body is not None: + self.body = body + self.content_hash = _content_hash(self.title + self.body) + self.updated_at = _now_iso() + + # ---- Searchable text ---- + + @property + def searchable_text(self) -> str: + """Combined text for search indexing.""" + parts = [self.title, self.body] + parts.extend(self.tags) + if self.source_author: + parts.append(self.source_author) + return " ".join(parts).lower() + + def __repr__(self) -> str: + return f"IdeaNode(id={self.id!r}, title={self.title[:50]!r}, tags={self.tags})" diff --git a/aragora/ideacloud/graph/operations.py b/aragora/ideacloud/graph/operations.py new file mode 100644 index 0000000000..74b44d8c76 --- /dev/null +++ b/aragora/ideacloud/graph/operations.py @@ -0,0 +1,433 @@ +"""Graph operations — auto-linking, clustering, proposition extraction. + +These transform the raw idea graph into a structured, interconnected +knowledge base suitable for debate generation and pipeline export. +""" + +from __future__ import annotations + +import logging +import re +from collections import Counter, defaultdict +from typing import TYPE_CHECKING + +from aragora.ideacloud.graph.cluster import IdeaCluster, _generate_cluster_id +from aragora.ideacloud.graph.edge import IdeaEdge +from aragora.ideacloud.graph.node import IdeaNode, _now_iso + +if TYPE_CHECKING: + from aragora.ideacloud.graph.graph import IdeaGraph + +logger = logging.getLogger(__name__) + + +# ---- Auto-Linking ---- + + +def auto_link( + graph: IdeaGraph, + node_id: str | None = None, + min_similarity: float = 0.3, + max_suggestions: int = 5, + inject_wiki_links: bool = True, +) -> list[IdeaEdge]: + """Find and create connections between ideas based on text similarity. + + If ``node_id`` is provided, only link that node to existing nodes. + If ``None``, run auto-linking across all unlinked node pairs. + + MVP: keyword/tag overlap scoring. + + Args: + graph: The idea graph. + node_id: Specific node to link, or None for all. + min_similarity: Minimum similarity threshold for creating an edge. + max_suggestions: Max new edges per node. + + Returns: + List of newly created edges. + """ + new_edges: list[IdeaEdge] = [] + + if node_id: + target_node = graph.get_node(node_id) + if not target_node: + return [] + candidates = [n for n in graph.nodes.values() if n.id != node_id] + new_edges.extend( + _link_node_to_candidates( + graph, target_node, candidates, min_similarity, max_suggestions + ) + ) + else: + # Link all nodes that have few connections + for node in graph.nodes.values(): + existing_edges = graph.get_edges_for(node.id) + if len(existing_edges) >= max_suggestions: + continue + remaining = max_suggestions - len(existing_edges) + candidates = [n for n in graph.nodes.values() if n.id != node.id] + new_edges.extend( + _link_node_to_candidates(graph, node, candidates, min_similarity, remaining) + ) + + # Add edges to graph and inject wiki-links into node bodies + for edge in new_edges: + graph.add_edge(edge) + if inject_wiki_links: + _inject_wiki_link(graph, edge) + + if new_edges: + logger.info("Auto-linked %d new connections", len(new_edges)) + + return new_edges + + +def _link_node_to_candidates( + graph: IdeaGraph, + node: IdeaNode, + candidates: list[IdeaNode], + min_similarity: float, + max_results: int, +) -> list[IdeaEdge]: + """Score and link a node to its best candidate matches.""" + scored: list[tuple[IdeaNode, float]] = [] + + for candidate in candidates: + # Skip if already connected + existing = [ + e + for e in graph.edges + if (e.source_id == node.id and e.target_id == candidate.id) + or (e.source_id == candidate.id and e.target_id == node.id) + ] + if existing: + continue + + sim = _pairwise_similarity(node, candidate) + if sim >= min_similarity: + scored.append((candidate, sim)) + + scored.sort(key=lambda x: x[1], reverse=True) + + edges: list[IdeaEdge] = [] + for candidate, sim in scored[:max_results]: + edge = IdeaEdge( + source_id=node.id, + target_id=candidate.id, + edge_type="relates_to", + weight=sim, + auto_created=True, + confidence=sim, + ) + edges.append(edge) + + return edges + + +def _pairwise_similarity(a: IdeaNode, b: IdeaNode) -> float: + """Compute similarity between two nodes using keyword and tag overlap. + + Scoring (uses blend of Jaccard and overlap coefficient): + - Tag similarity: weighted blend favoring overlap coefficient (weight 0.4) + - Keyword similarity: Jaccard of significant words (weight 0.4) + - Title similarity: shared significant words in titles (weight 0.2) + + The overlap coefficient (|intersection| / min(|A|, |B|)) rewards sharing + any tags even when the total tag sets are diverse, which is typical for + nodes in the same broad domain (e.g., ai-security) but with different + specific sub-topics. + """ + # Tag similarity — blend Jaccard and overlap coefficient + tags_a = {t.lower().lstrip("#") for t in a.tags} + tags_b = {t.lower().lstrip("#") for t in b.tags} + tag_sim = _blended_similarity(tags_a, tags_b) if (tags_a or tags_b) else 0.0 + + # Keyword similarity (body + title) + words_a = _extract_keywords(a.title + " " + a.body) + words_b = _extract_keywords(b.title + " " + b.body) + keyword_sim = _jaccard(words_a, words_b) if (words_a or words_b) else 0.0 + + # Title word similarity + title_a = _extract_keywords(a.title) + title_b = _extract_keywords(b.title) + title_sim = _jaccard(title_a, title_b) if (title_a or title_b) else 0.0 + + return 0.5 * tag_sim + 0.35 * keyword_sim + 0.15 * title_sim + + +def _overlap_coefficient(set_a: set[str], set_b: set[str]) -> float: + """Overlap coefficient: |A ∩ B| / min(|A|, |B|). + + Returns 1.0 when all elements of the smaller set appear in the larger set, + regardless of how large the larger set is. This is less punishing than + Jaccard for sets of unequal size. + """ + if not set_a or not set_b: + return 0.0 + intersection = set_a & set_b + return len(intersection) / min(len(set_a), len(set_b)) + + +def _blended_similarity(set_a: set[str], set_b: set[str]) -> float: + """Blend of Jaccard (0.4) and overlap coefficient (0.6). + + Overlap coefficient is weighted higher because ideas in the same domain + often share a few key tags while diverging in specifics. + """ + return 0.4 * _jaccard(set_a, set_b) + 0.6 * _overlap_coefficient(set_a, set_b) + + +def _jaccard(set_a: set[str], set_b: set[str]) -> float: + """Jaccard similarity between two sets.""" + if not set_a and not set_b: + return 0.0 + intersection = set_a & set_b + union = set_a | set_b + return len(intersection) / len(union) if union else 0.0 + + +# Stopwords for keyword extraction +_STOPWORDS = frozenset( + "the a an is are was were be been being have has had do does did " + "will would shall should may might can could of in to for on with " + "at by from as into through during before after above below between " + "out off over under again further then once here there when where " + "why how all each every both few more most other some such no nor " + "not only own same so than too very it its this that these those " + "and but or if while because until about against".split() +) + + +def _extract_keywords(text: str) -> set[str]: + """Extract significant keywords from text (lowercase, no stopwords).""" + words = re.findall(r"[a-z]{3,}", text.lower()) + return {w for w in words if w not in _STOPWORDS} + + +# ---- Clustering ---- + + +def auto_cluster( + graph: IdeaGraph, + min_cluster_size: int = 2, +) -> dict[str, IdeaCluster]: + """Cluster ideas using connected components from edges + shared tags. + + 1. Build adjacency from explicit edges + 2. Add implicit edges for nodes sharing 2+ tags + 3. Find connected components + 4. Each component with ≥ min_cluster_size becomes a cluster + + Returns: + Dict of cluster_id → IdeaCluster (also updates graph.clusters). + """ + # Build adjacency graph + adj: dict[str, set[str]] = defaultdict(set) + + # From explicit edges + for edge in graph.edges: + if edge.source_id in graph.nodes and edge.target_id in graph.nodes: + adj[edge.source_id].add(edge.target_id) + adj[edge.target_id].add(edge.source_id) + + # From shared tags (2+ shared tags = implicit connection) + node_list = list(graph.nodes.values()) + for i, a in enumerate(node_list): + tags_a = {t.lower().lstrip("#") for t in a.tags} + for b in node_list[i + 1 :]: + tags_b = {t.lower().lstrip("#") for t in b.tags} + shared = tags_a & tags_b + if len(shared) >= 2: + adj[a.id].add(b.id) + adj[b.id].add(a.id) + + # Find connected components via BFS + visited: set[str] = set() + components: list[set[str]] = [] + + for nid in graph.nodes: + if nid in visited: + continue + component: set[str] = set() + queue = [nid] + while queue: + current = queue.pop(0) + if current in visited: + continue + visited.add(current) + component.add(current) + queue.extend(adj.get(current, set()) - visited) + if len(component) >= min_cluster_size: + components.append(component) + + # Create clusters + new_clusters: dict[str, IdeaCluster] = {} + for component in components: + # Check if an existing cluster covers this component + existing = _find_matching_cluster(graph.clusters, component) + if existing: + # Update existing cluster + existing.node_ids = sorted(component) + existing.tags = _derive_cluster_tags(graph, component) + existing.updated_at = _now_iso() + new_clusters[existing.id] = existing + else: + # Create new cluster + tags = _derive_cluster_tags(graph, component) + name = _derive_cluster_name(tags, graph, component) + cluster = IdeaCluster( + id=_generate_cluster_id(), + name=name, + node_ids=sorted(component), + tags=tags, + confidence=0.5, + ) + new_clusters[cluster.id] = cluster + + # Update graph + graph.clusters = new_clusters + for cluster in new_clusters.values(): + for nid in cluster.node_ids: + if nid in graph.nodes: + graph.nodes[nid].cluster_id = cluster.id + + logger.info( + "Clustering produced %d clusters from %d nodes", len(new_clusters), len(graph.nodes) + ) + + return new_clusters + + +def _find_matching_cluster( + existing: dict[str, IdeaCluster], + component: set[str], +) -> IdeaCluster | None: + """Find an existing cluster that substantially overlaps with a component.""" + for cluster in existing.values(): + existing_set = set(cluster.node_ids) + overlap = existing_set & component + if overlap and len(overlap) / max(len(existing_set), len(component)) > 0.5: + return cluster + return None + + +def _derive_cluster_tags(graph: IdeaGraph, node_ids: set[str]) -> list[str]: + """Derive cluster tags from the most common tags across member nodes.""" + tag_counts: Counter[str] = Counter() + for nid in node_ids: + node = graph.nodes.get(nid) + if node: + for tag in node.tags: + tag_counts[tag.lower().lstrip("#")] += 1 + + # Return tags that appear in at least half the nodes (min 1) + threshold = max(1, len(node_ids) // 2) + return [tag for tag, count in tag_counts.most_common(10) if count >= threshold] + + +def _derive_cluster_name( + tags: list[str], + graph: IdeaGraph, + node_ids: set[str], +) -> str: + """Derive a human-readable cluster name.""" + if tags: + return " + ".join(tags[:3]).title() + # Fall back to first node's title + for nid in sorted(node_ids): + node = graph.nodes.get(nid) + if node and node.title: + return node.title[:50] + return "Unnamed Cluster" + + +# ---- Wiki-link Injection ---- + + +def _inject_wiki_link(graph: IdeaGraph, edge: IdeaEdge) -> None: + """Inject a wiki-link into the source node's body for a new edge. + + Appends a ``[[Target Title]]`` link under a ``## Connections`` section + in the node's markdown body, creating the section if it doesn't exist. + """ + source = graph.nodes.get(edge.source_id) + target = graph.nodes.get(edge.target_id) + if not source or not target or not target.title: + return + + wiki_link = f"[[{target.title}]]" + + # Don't add duplicates + if wiki_link in source.body: + return + + # Find or create the Connections section + if "## Connections" in source.body: + # Append to existing section + source.body = source.body.replace( + "## Connections", + f"## Connections\n- {wiki_link} ({edge.edge_type})", + 1, + ) + else: + # Create new section at the end + source.body = source.body.rstrip() + f"\n\n## Connections\n- {wiki_link} ({edge.edge_type})" + + +# ---- Proposition Extraction ---- + + +def extract_propositions(graph: IdeaGraph, cluster_id: str) -> list[str]: + """Extract debate-ready propositions from a cluster. + + Analyzes the cluster's ideas and generates proposition strings + suitable for feeding into a debate ``Arena``. + + Each proposition is a standalone debatable statement derived from + the ideas and their connections. + + Args: + graph: The idea graph. + cluster_id: Cluster to extract propositions from. + + Returns: + List of proposition strings. + """ + cluster = graph.clusters.get(cluster_id) + if not cluster: + return [] + + propositions: list[str] = [] + + for nid in cluster.node_ids: + node = graph.nodes.get(nid) + if not node: + continue + + # Each node's title is a natural proposition seed + if node.title: + propositions.append(node.title) + + # Look for edges that create interesting tensions + edges = graph.get_edges_for(nid) + for edge in edges: + if edge.edge_type in ("refutes", "conflicts"): + other_id = edge.target_id if edge.source_id == nid else edge.source_id + other = graph.nodes.get(other_id) + if other and other.title: + propositions.append(f"Tension: {node.title} vs {other.title}") + elif edge.edge_type == "extends": + other_id = edge.target_id if edge.source_id == nid else edge.source_id + other = graph.nodes.get(other_id) + if other and other.title: + propositions.append(f"Building on {other.title}: {node.title}") + + # Add a synthesis proposition from cluster tags + if cluster.tags and len(cluster.node_ids) >= 2: + tag_str = ", ".join(cluster.tags[:3]) + propositions.append( + f"Synthesis: What patterns emerge from {tag_str} across {len(cluster.node_ids)} ideas?" + ) + + return propositions diff --git a/aragora/ideacloud/ingestion/__init__.py b/aragora/ideacloud/ingestion/__init__.py new file mode 100644 index 0000000000..4a05c6f6e2 --- /dev/null +++ b/aragora/ideacloud/ingestion/__init__.py @@ -0,0 +1 @@ +"""Ingestion pipeline for Idea Cloud — Twitter, RSS, manual sources.""" diff --git a/aragora/ideacloud/ingestion/base.py b/aragora/ideacloud/ingestion/base.py new file mode 100644 index 0000000000..25dee88a71 --- /dev/null +++ b/aragora/ideacloud/ingestion/base.py @@ -0,0 +1,33 @@ +"""Base ingestor interface for Idea Cloud sources.""" + +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import Any + +from aragora.ideacloud.graph.node import IdeaNode + + +class BaseIdeaIngestor(ABC): + """Abstract base for all idea ingestors. + + Subclasses implement ``ingest()`` to parse a specific source format + and return a list of IdeaNode objects. + """ + + source_type: str = "unknown" + + @abstractmethod + async def ingest(self, source: Any) -> list[IdeaNode]: + """Parse source data and return idea nodes. + + Implementations should: + 1. Parse the source format + 2. Create IdeaNode objects with appropriate metadata + 3. Set source_type, source_url, source_author, tags + 4. Return all parsed nodes (filtering happens in quality.py) + """ + ... + + def __repr__(self) -> str: + return f"{self.__class__.__name__}(source_type={self.source_type!r})" diff --git a/aragora/ideacloud/ingestion/manual.py b/aragora/ideacloud/ingestion/manual.py new file mode 100644 index 0000000000..a20bc4a893 --- /dev/null +++ b/aragora/ideacloud/ingestion/manual.py @@ -0,0 +1,157 @@ +"""Manual paste ingestor — URL or free-text input. + +Handles: +- URLs: creates a node with the URL as source +- Tweet URLs: extracts tweet metadata from URL structure +- Plain text: uses as title + body + +Usage: + ingestor = ManualPasteIngestor() + nodes = await ingestor.ingest("https://example.com/article") + nodes = await ingestor.ingest("Some interesting thought about AI safety") +""" + +from __future__ import annotations + +import logging +import re + +from aragora.ideacloud.graph.node import IdeaNode, _generate_id +from aragora.ideacloud.ingestion.base import BaseIdeaIngestor + +logger = logging.getLogger(__name__) + +# URL detection +URL_PATTERN = re.compile(r"https?://\S+") + +# Twitter/X URL pattern +TWEET_URL_PATTERN = re.compile(r"https?://(?:twitter\.com|x\.com)/(\w+)/status/(\d+)") + + +class ManualPasteIngestor(BaseIdeaIngestor): + """Ingest manually pasted content (URL, tweet text, or free text).""" + + source_type = "manual" + + async def ingest(self, source: str) -> list[IdeaNode]: + """Parse pasted content and return IdeaNodes. + + Args: + source: URL string, tweet text, or free-form text. + + Returns: + List containing one IdeaNode. + """ + source = source.strip() + if not source: + return [] + + # Check if it's a tweet URL + tweet_match = TWEET_URL_PATTERN.search(source) + if tweet_match: + return [_tweet_url_to_node(tweet_match, source)] + + # Check if it's a generic URL + url_match = URL_PATTERN.match(source) + if url_match: + return [_url_to_node(source)] + + # Free text + return [_text_to_node(source)] + + async def ingest_with_context( + self, + content: str, + title: str | None = None, + source_url: str | None = None, + source_author: str | None = None, + tags: list[str] | None = None, + ) -> IdeaNode: + """Ingest with explicit metadata (for programmatic use). + + Args: + content: The body text. + title: Optional title (derived from content if not provided). + source_url: Optional source URL. + source_author: Optional author. + tags: Optional tags. + + Returns: + Single IdeaNode. + """ + if not title: + title = _derive_title(content) + + return IdeaNode( + id=_generate_id(), + title=title, + body=content, + source_type="manual", + source_url=source_url, + source_author=source_author, + tags=tags or [], + node_type="idea_insight", + pipeline_status="inbox", + ) + + +def _tweet_url_to_node(match: re.Match, full_text: str) -> IdeaNode: + """Create a node from a tweet URL.""" + username = match.group(1) + url = match.group(0) + + # Use the full pasted text as body (may contain more than just the URL) + body = full_text.replace(url, "").strip() + + return IdeaNode( + id=_generate_id(), + title=f"Tweet by @{username}", + body=body if body else f"Tweet: {url}", + source_type="manual", + source_url=url, + source_author=f"@{username}", + tags=[], + node_type="idea_insight", + pipeline_status="inbox", + ) + + +def _url_to_node(url: str) -> IdeaNode: + """Create a node from a generic URL.""" + # Extract domain for basic title + domain_match = re.search(r"https?://(?:www\.)?([^/]+)", url) + domain = domain_match.group(1) if domain_match else "unknown" + + return IdeaNode( + id=_generate_id(), + title=f"Link from {domain}", + body=f"Source: {url}", + source_type="manual", + source_url=url, + tags=[], + node_type="idea_insight", + pipeline_status="inbox", + ) + + +def _text_to_node(text: str) -> IdeaNode: + """Create a node from free-form text.""" + title = _derive_title(text) + + return IdeaNode( + id=_generate_id(), + title=title, + body=text, + source_type="manual", + tags=[], + node_type="idea_insight", + pipeline_status="inbox", + ) + + +def _derive_title(text: str) -> str: + """Derive a title from the first line or first 100 chars.""" + first_line = text.split("\n")[0].strip() + if len(first_line) > 100: + return first_line[:97] + "..." + return first_line or "Untitled Idea" diff --git a/aragora/ideacloud/ingestion/quality.py b/aragora/ideacloud/ingestion/quality.py new file mode 100644 index 0000000000..860f71b9ad --- /dev/null +++ b/aragora/ideacloud/ingestion/quality.py @@ -0,0 +1,178 @@ +"""Quality filtering and deduplication for ingested ideas. + +QualityFilter: scores ideas on text quality and relevance. +DeduplicationEngine: detects exact and near-duplicate ideas. +""" + +from __future__ import annotations + +import hashlib +import logging +import re +from typing import TYPE_CHECKING + +from aragora.ideacloud.graph.node import IdeaNode + +if TYPE_CHECKING: + from aragora.ideacloud.graph.graph import IdeaGraph + +logger = logging.getLogger(__name__) + + +class QualityFilter: + """Score and filter ideas by quality and relevance. + + Scoring (0.0 - 1.0): + - Text length (too short or empty = low) + - Has meaningful content (not just links/hashtags) + - Has title + - Has tags + """ + + def __init__(self, min_score: float = 0.3) -> None: + self.min_score = min_score + + def score(self, node: IdeaNode) -> float: + """Return 0.0-1.0 quality score for a node.""" + score = 0.0 + + # Has title (0.3) + if node.title and len(node.title.strip()) > 5: + score += 0.3 + + # Has body content (0.3) + body_text = _strip_links_and_mentions(node.body) + if len(body_text.strip()) > 20: + score += 0.3 + elif len(body_text.strip()) > 5: + score += 0.15 + + # Has tags (0.2) + if node.tags: + score += min(0.2, len(node.tags) * 0.05) + + # Has source URL (0.1) + if node.source_url: + score += 0.1 + + # Has author (0.1) + if node.source_author: + score += 0.1 + + return min(1.0, score) + + def is_acceptable(self, node: IdeaNode) -> bool: + """Check if node meets minimum quality threshold.""" + return self.score(node) >= self.min_score + + def filter_batch(self, nodes: list[IdeaNode]) -> list[IdeaNode]: + """Filter a batch of nodes, returning only acceptable ones.""" + accepted = [] + rejected = 0 + for node in nodes: + if self.is_acceptable(node): + node.relevance_score = self.score(node) + accepted.append(node) + else: + rejected += 1 + if rejected: + logger.info("Quality filter: accepted %d, rejected %d", len(accepted), rejected) + return accepted + + +class DeduplicationEngine: + """Detect duplicate ideas using content hashing. + + Exact dedup: SHA-256 hash of normalized text. + Near dedup: Jaccard similarity of word sets. + """ + + def __init__(self, similarity_threshold: float = 0.85) -> None: + self.similarity_threshold = similarity_threshold + + def find_duplicates( + self, + node: IdeaNode, + graph: IdeaGraph, + ) -> list[str]: + """Find duplicate node IDs in the graph. + + Returns list of existing node IDs that are duplicates of ``node``. + """ + duplicates: list[str] = [] + node_hash = _normalize_hash(node.title + " " + node.body) + node_words = _word_set(node.title + " " + node.body) + + for existing in graph.nodes.values(): + if existing.id == node.id: + continue + + # Exact hash match + existing_hash = _normalize_hash(existing.title + " " + existing.body) + if node_hash == existing_hash: + duplicates.append(existing.id) + continue + + # Near-duplicate via Jaccard + existing_words = _word_set(existing.title + " " + existing.body) + if node_words and existing_words: + jaccard = len(node_words & existing_words) / len(node_words | existing_words) + if jaccard >= self.similarity_threshold: + duplicates.append(existing.id) + + return duplicates + + def deduplicate_batch( + self, + nodes: list[IdeaNode], + graph: IdeaGraph, + ) -> list[IdeaNode]: + """Remove duplicates from a batch against the existing graph. + + Also removes intra-batch duplicates. + """ + seen_hashes: set[str] = set() + unique: list[IdeaNode] = [] + + for node in nodes: + h = _normalize_hash(node.title + " " + node.body) + + # Intra-batch dedup + if h in seen_hashes: + continue + seen_hashes.add(h) + + # Against existing graph + if self.find_duplicates(node, graph): + logger.debug("Skipping duplicate: %s", node.title[:60]) + continue + + unique.append(node) + + skipped = len(nodes) - len(unique) + if skipped: + logger.info("Dedup: kept %d, skipped %d duplicates", len(unique), skipped) + return unique + + +# ---- Helpers ---- + + +def _strip_links_and_mentions(text: str) -> str: + """Remove URLs and @mentions from text for quality scoring.""" + text = re.sub(r"https?://\S+", "", text) + text = re.sub(r"@\w+", "", text) + return text + + +def _normalize_hash(text: str) -> str: + """Normalize text and compute SHA-256 hash.""" + # Lowercase, collapse whitespace, strip punctuation + normalized = re.sub(r"\s+", " ", text.lower().strip()) + normalized = re.sub(r"[^\w\s]", "", normalized) + return hashlib.sha256(normalized.encode("utf-8")).hexdigest() + + +def _word_set(text: str) -> set[str]: + """Extract set of lowercase words (3+ chars) for Jaccard similarity.""" + return {w for w in re.findall(r"[a-z]{3,}", text.lower())} diff --git a/aragora/ideacloud/ingestion/rss_feeds.py b/aragora/ideacloud/ingestion/rss_feeds.py new file mode 100644 index 0000000000..047db13f18 --- /dev/null +++ b/aragora/ideacloud/ingestion/rss_feeds.py @@ -0,0 +1,268 @@ +"""RSS feed ingestor — wraps aragora.connectors.feeds for IdeaCloud. + +Fetches entries from configured RSS/Atom feeds and converts them to +IdeaNodes with quality filtering and relevance scoring. + +Usage: + ingestor = RSSFeedIngestor() + ingestor.add_feed("https://blog.example.com/feed.xml", name="Example Blog") + nodes = await ingestor.ingest() + +The ingestor delegates the actual feed parsing to ``FeedIngestor`` from +``aragora.connectors.feeds``, then wraps the entries as IdeaNodes with +relevance filtering based on configurable topic keywords. +""" + +from __future__ import annotations + +import hashlib +import logging +import re +from dataclasses import dataclass, field +from typing import Any + +from aragora.ideacloud.graph.node import IdeaNode, _generate_id, _now_iso +from aragora.ideacloud.ingestion.base import BaseIdeaIngestor + +logger = logging.getLogger(__name__) + + +@dataclass +class FeedConfig: + """Configuration for a single RSS feed source.""" + + url: str + name: str = "" + category: str = "" + priority: int = 5 + max_entries: int = 50 + enabled: bool = True + topic_keywords: list[str] = field(default_factory=list) + + +class RSSFeedIngestor(BaseIdeaIngestor): + """Ingest RSS/Atom feed entries into IdeaCloud as IdeaNodes. + + Wraps ``aragora.connectors.feeds.FeedIngestor`` for the actual + fetching and XML parsing, adding IdeaCloud-specific quality + filtering and relevance scoring. + + Args: + relevance_keywords: Keywords that indicate relevant content. + Entries matching more keywords get higher relevance scores. + If empty, all entries are accepted. + min_relevance: Minimum relevance score (0-1) to include an entry. + Only applies when relevance_keywords is non-empty. + """ + + def __init__( + self, + relevance_keywords: list[str] | None = None, + min_relevance: float = 0.0, + ) -> None: + self.feeds: list[FeedConfig] = [] + self.relevance_keywords = [k.lower() for k in (relevance_keywords or [])] + self.min_relevance = min_relevance + + def add_feed( + self, + url: str, + name: str = "", + category: str = "", + priority: int = 5, + max_entries: int = 50, + topic_keywords: list[str] | None = None, + ) -> None: + """Register a feed source for ingestion.""" + self.feeds.append( + FeedConfig( + url=url, + name=name or url, + category=category, + priority=priority, + max_entries=max_entries, + topic_keywords=topic_keywords or [], + ) + ) + + def remove_feed(self, url: str) -> bool: + """Remove a feed by URL. Returns True if found and removed.""" + before = len(self.feeds) + self.feeds = [f for f in self.feeds if f.url != url] + return len(self.feeds) < before + + async def ingest(self, source: str = "") -> list[IdeaNode]: + """Ingest all configured feeds and return IdeaNodes. + + Args: + source: Ignored (feeds are configured via add_feed). + + Returns: + List of IdeaNodes created from feed entries. + """ + if not self.feeds: + logger.warning("No feeds configured for RSS ingestion") + return [] + + all_nodes: list[IdeaNode] = [] + + try: + from aragora.connectors.feeds import FeedEntry, FeedIngestor, FeedSource + except ImportError: + logger.warning( + "aragora.connectors.feeds not available; falling back to stub RSS parsing" + ) + return await self._fallback_ingest() + + feed_ingestor = FeedIngestor() + + for feed_config in self.feeds: + if not feed_config.enabled: + continue + + feed_source = FeedSource( + url=feed_config.url, + name=feed_config.name, + category=feed_config.category, + priority=feed_config.priority, + max_entries=feed_config.max_entries, + ) + feed_ingestor.add_source(feed_source) + + try: + entries: list[FeedEntry] = await feed_ingestor.fetch_all() + except Exception as exc: + logger.error("Failed to fetch feeds: %s", exc) + return [] + + for entry in entries: + node = self._entry_to_node(entry) + if node and self._passes_relevance(node): + all_nodes.append(node) + + logger.info( + "RSS ingestion produced %d nodes from %d feeds", + len(all_nodes), + len(self.feeds), + ) + return all_nodes + + def _entry_to_node(self, entry: Any) -> IdeaNode | None: + """Convert a FeedEntry to an IdeaNode.""" + title = getattr(entry, "title", "") or "" + content = getattr(entry, "content", "") or "" + summary = getattr(entry, "summary", "") or "" + link = getattr(entry, "link", "") or "" + author = getattr(entry, "author", "") or "" + categories = getattr(entry, "categories", []) or [] + published = getattr(entry, "published", "") or "" + + # Use content if available, else summary + body = content or summary + + if not title and not body: + return None + + # Strip HTML tags from body + body = re.sub(r"<[^>]+>", "", body).strip() + + # Derive tags from categories + tags = [c.lower().replace(" ", "-") for c in categories[:10]] + + node_id = _generate_id() + content_hash = hashlib.sha256(f"{title}:{body[:500]}".encode()).hexdigest()[:16] + + return IdeaNode( + id=node_id, + title=title[:200], + body=body, + source_type="rss", + source_url=link, + source_author=author, + tags=tags, + content_hash=f"sha256:{content_hash}", + created_at=published or _now_iso(), + updated_at=_now_iso(), + ) + + def _passes_relevance(self, node: IdeaNode) -> bool: + """Check if a node passes the relevance filter. + + If no relevance_keywords are set, all nodes pass. + Otherwise, compute a relevance score from keyword overlap. + """ + if not self.relevance_keywords: + return True + + text = (node.title + " " + node.body + " " + " ".join(node.tags)).lower() + matched = sum(1 for kw in self.relevance_keywords if kw in text) + score = matched / len(self.relevance_keywords) if self.relevance_keywords else 0 + node.relevance_score = score + return score >= self.min_relevance + + async def _fallback_ingest(self) -> list[IdeaNode]: + """Minimal fallback when feed connector is unavailable. + + Uses standard library XML parsing for basic RSS/Atom support. + """ + nodes: list[IdeaNode] = [] + + try: + import xml.etree.ElementTree as ET + from urllib.request import urlopen + except ImportError: + return nodes + + for feed_config in self.feeds: + if not feed_config.enabled: + continue + + try: + with urlopen(feed_config.url, timeout=30) as resp: + data = resp.read() + root = ET.fromstring(data) # noqa: S314 + + # Try RSS 2.0 format + items = root.findall(".//item") + if not items: + # Try Atom format + ns = {"atom": "http://www.w3.org/2005/Atom"} + items = root.findall(".//atom:entry", ns) + + for item in items[: feed_config.max_entries]: + title_el = item.find("title") + desc_el = item.find("description") or item.find( + "{http://www.w3.org/2005/Atom}summary" + ) + link_el = item.find("link") + + title = title_el.text if title_el is not None else "" + body = desc_el.text if desc_el is not None else "" + link = "" + if link_el is not None: + link = link_el.text or link_el.get("href", "") + + if body: + body = re.sub(r"<[^>]+>", "", body).strip() + + if title or body: + node = IdeaNode( + id=_generate_id(), + title=(title or "")[:200], + body=body or "", + source_type="rss", + source_url=link or "", + source_author=feed_config.name, + tags=[feed_config.category] if feed_config.category else [], + ) + if self._passes_relevance(node): + nodes.append(node) + + except Exception as exc: + logger.error( + "Fallback RSS fetch failed for %s: %s", + feed_config.url, + exc, + ) + + return nodes diff --git a/aragora/ideacloud/ingestion/twitter_bookmarks.py b/aragora/ideacloud/ingestion/twitter_bookmarks.py new file mode 100644 index 0000000000..8180c39421 --- /dev/null +++ b/aragora/ideacloud/ingestion/twitter_bookmarks.py @@ -0,0 +1,161 @@ +"""Twitter bookmarks ingestor — parse Twitter data export. + +Twitter data exports contain a ``data/bookmarks.js`` file with format: + window.YTD.bookmark.part0 = [ + {"bookmark": {"tweetId": "123456789"}}, + ... + ] + +Some exports include full tweet objects with text, author, etc. +Others only have tweetIds requiring separate enrichment. + +Usage: + ingestor = TwitterBookmarksIngestor() + nodes = await ingestor.ingest("path/to/data/bookmarks.js") +""" + +from __future__ import annotations + +import json +import logging +import re +from pathlib import Path +from typing import Any + +from aragora.ideacloud.graph.node import IdeaNode, _generate_id +from aragora.ideacloud.ingestion.base import BaseIdeaIngestor + +logger = logging.getLogger(__name__) + +# Pattern to strip the JS variable assignment wrapper +JS_WRAPPER_PATTERN = re.compile(r"^window\.YTD\.\w+\.part\d+\s*=\s*", re.MULTILINE) + + +class TwitterBookmarksIngestor(BaseIdeaIngestor): + """Ingest bookmarked tweets from Twitter data export.""" + + source_type = "twitter_bookmark" + + async def ingest(self, source: str | Path) -> list[IdeaNode]: + """Parse bookmarks.js and return IdeaNodes. + + Args: + source: Path to the bookmarks.js file from Twitter data export. + + Returns: + List of IdeaNode objects, one per bookmark. + """ + path = Path(source) + if not path.exists(): + raise FileNotFoundError(f"Bookmarks file not found: {path}") + + raw = path.read_text(encoding="utf-8") + data = _parse_twitter_js(raw) + + nodes: list[IdeaNode] = [] + for entry in data: + node = _bookmark_entry_to_node(entry) + if node: + nodes.append(node) + + logger.info("Parsed %d bookmarks from %s", len(nodes), path.name) + return nodes + + +def _parse_twitter_js(raw: str) -> list[dict[str, Any]]: + """Parse Twitter JS export format to JSON array. + + Handles both: + - ``window.YTD.bookmark.part0 = [...]`` wrapper + - Raw JSON arrays + """ + # Strip JS wrapper if present + stripped = JS_WRAPPER_PATTERN.sub("", raw).strip() + # Remove trailing semicolons + if stripped.endswith(";"): + stripped = stripped[:-1].strip() + + try: + data = json.loads(stripped) + except json.JSONDecodeError as exc: + logger.error("Failed to parse Twitter JS: %s", exc) + return [] + + if not isinstance(data, list): + logger.warning("Expected list, got %s", type(data).__name__) + return [] + + return data + + +def _bookmark_entry_to_node(entry: dict[str, Any]) -> IdeaNode | None: + """Convert a single bookmark entry to an IdeaNode. + + Handles multiple Twitter export formats: + - {"bookmark": {"tweetId": "..."}} (minimal) + - {"bookmark": {"tweetId": "...", "fullText": "..."}} (with text) + - Direct tweet objects with "full_text", "user", etc. (legacy) + """ + bookmark = entry.get("bookmark", entry) # Unwrap if nested + + tweet_id = ( + bookmark.get("tweetId") + or bookmark.get("tweet_id") + or bookmark.get("id_str") + or bookmark.get("id") + ) + if not tweet_id: + return None + + # Extract text + full_text = bookmark.get("fullText") or bookmark.get("full_text") or bookmark.get("text") or "" + + # Extract author + user = bookmark.get("user", {}) + screen_name = ( + bookmark.get("screenName") or bookmark.get("screen_name") or user.get("screen_name") or "" + ) + + # Build URL + tweet_url = ( + f"https://x.com/{screen_name}/status/{tweet_id}" + if screen_name + else f"https://x.com/i/status/{tweet_id}" + ) + + # Title: first line or first 100 chars + title = _extract_title(full_text) + + # Extract hashtags as tags + tags = _extract_hashtags(full_text) + + return IdeaNode( + id=_generate_id(), + title=title, + body=full_text, + source_type="twitter_bookmark", + source_url=tweet_url, + source_author=f"@{screen_name}" if screen_name else None, + tags=tags, + node_type="idea_insight", + pipeline_status="inbox", + ) + + +def _extract_title(text: str) -> str: + """Extract a title from tweet text (first sentence or 100 chars).""" + if not text: + return "Untitled Bookmark" + + # First sentence + sentences = re.split(r"[.!?\n]", text) + first = sentences[0].strip() if sentences else text.strip() + + if len(first) > 100: + return first[:97] + "..." + return first or "Untitled Bookmark" + + +def _extract_hashtags(text: str) -> list[str]: + """Extract hashtags from tweet text.""" + return re.findall(r"#(\w+)", text) diff --git a/aragora/ideacloud/ingestion/twitter_likes.py b/aragora/ideacloud/ingestion/twitter_likes.py new file mode 100644 index 0000000000..156a7d4cfc --- /dev/null +++ b/aragora/ideacloud/ingestion/twitter_likes.py @@ -0,0 +1,58 @@ +"""Twitter likes ingestor — parse Twitter data export likes. + +Twitter data exports contain a ``data/like.js`` file with format: + window.YTD.like.part0 = [ + {"like": {"tweetId": "123456789", "fullText": "..."}}, + ... + ] + +Reuses the same parsing logic as bookmarks with different source_type. +""" + +from __future__ import annotations + +import logging +from pathlib import Path + +from aragora.ideacloud.graph.node import IdeaNode +from aragora.ideacloud.ingestion.base import BaseIdeaIngestor +from aragora.ideacloud.ingestion.twitter_bookmarks import ( + _parse_twitter_js, + _bookmark_entry_to_node, +) + +logger = logging.getLogger(__name__) + + +class TwitterLikesIngestor(BaseIdeaIngestor): + """Ingest liked tweets from Twitter data export.""" + + source_type = "twitter_like" + + async def ingest(self, source: str | Path) -> list[IdeaNode]: + """Parse like.js and return IdeaNodes. + + Args: + source: Path to the like.js file from Twitter data export. + + Returns: + List of IdeaNode objects, one per like. + """ + path = Path(source) + if not path.exists(): + raise FileNotFoundError(f"Likes file not found: {path}") + + raw = path.read_text(encoding="utf-8") + data = _parse_twitter_js(raw) + + nodes: list[IdeaNode] = [] + for entry in data: + # Unwrap "like" wrapper if present + like_data = entry.get("like", entry) + node = _bookmark_entry_to_node({"bookmark": like_data}) + if node: + node.source_type = "twitter_like" + nodes.append(node) + + logger.info("Parsed %d likes from %s", len(nodes), path.name) + return nodes diff --git a/aragora/ideacloud/storage/__init__.py b/aragora/ideacloud/storage/__init__.py new file mode 100644 index 0000000000..d216fd76de --- /dev/null +++ b/aragora/ideacloud/storage/__init__.py @@ -0,0 +1 @@ +"""Obsidian-compatible storage layer for Idea Cloud.""" diff --git a/aragora/ideacloud/storage/index.py b/aragora/ideacloud/storage/index.py new file mode 100644 index 0000000000..87171851ec --- /dev/null +++ b/aragora/ideacloud/storage/index.py @@ -0,0 +1,120 @@ +"""Graph index for fast lookups without parsing all markdown files. + +Maintains ``index.json`` at the vault root with node metadata, edges, and clusters. +Updated on every write operation. +""" + +from __future__ import annotations + +import json +import logging +from pathlib import Path +from typing import Any + +from aragora.ideacloud.graph.cluster import IdeaCluster +from aragora.ideacloud.graph.edge import IdeaEdge +from aragora.ideacloud.graph.node import IdeaNode + +logger = logging.getLogger(__name__) + +INDEX_FILENAME = "index.json" + + +@property +def _empty_index() -> dict[str, Any]: + return {"nodes": {}, "edges": [], "clusters": {}} + + +def write_index( + vault_path: str | Path, + nodes: dict[str, IdeaNode], + edges: list[IdeaEdge], + clusters: dict[str, IdeaCluster], +) -> Path: + """Write the full graph index to index.json. + + Args: + vault_path: Root directory of the idea vault. + nodes: All nodes keyed by ID. + edges: All edges. + clusters: All clusters keyed by ID. + + Returns: + Path to the written index file. + """ + vault = Path(vault_path) + vault.mkdir(parents=True, exist_ok=True) + index_path = vault / INDEX_FILENAME + + data: dict[str, Any] = { + "nodes": {}, + "edges": [], + "clusters": {}, + } + + # Node summaries (lightweight — not full body) + for nid, node in nodes.items(): + data["nodes"][nid] = { + "title": node.title, + "source_type": node.source_type, + "tags": node.tags, + "node_type": node.node_type, + "cluster_id": node.cluster_id, + "pipeline_status": node.pipeline_status, + "relevance_score": node.relevance_score, + "confidence": node.confidence, + "km_synced": node.km_synced, + "created_at": node.created_at, + "updated_at": node.updated_at, + "wiki_links": node.extract_wiki_links(), + } + + # Edges + data["edges"] = [e.to_dict() for e in edges] + + # Clusters + for cid, cluster in clusters.items(): + data["clusters"][cid] = cluster.to_dict() + + index_path.write_text( + json.dumps(data, indent=2, ensure_ascii=False), + encoding="utf-8", + ) + logger.debug( + "Wrote index with %d nodes, %d edges, %d clusters", len(nodes), len(edges), len(clusters) + ) + return index_path + + +def read_index(vault_path: str | Path) -> dict[str, Any]: + """Read index.json if it exists. + + Returns: + Parsed index dict, or empty structure if file doesn't exist. + """ + index_path = Path(vault_path) / INDEX_FILENAME + if not index_path.exists(): + return {"nodes": {}, "edges": [], "clusters": {}} + + try: + raw = index_path.read_text(encoding="utf-8") + data = json.loads(raw) + return data + except (json.JSONDecodeError, OSError) as exc: + logger.warning("Failed to read index: %s — returning empty", exc) + return {"nodes": {}, "edges": [], "clusters": {}} + + +def load_edges_from_index(vault_path: str | Path) -> list[IdeaEdge]: + """Load edges from index.json.""" + data = read_index(vault_path) + return [IdeaEdge.from_dict(d) for d in data.get("edges", [])] + + +def load_clusters_from_index(vault_path: str | Path) -> dict[str, IdeaCluster]: + """Load clusters from index.json.""" + data = read_index(vault_path) + clusters: dict[str, IdeaCluster] = {} + for cid, cd in data.get("clusters", {}).items(): + clusters[cid] = IdeaCluster.from_dict(cd) + return clusters diff --git a/aragora/ideacloud/storage/markdown_io.py b/aragora/ideacloud/storage/markdown_io.py new file mode 100644 index 0000000000..6f5e307083 --- /dev/null +++ b/aragora/ideacloud/storage/markdown_io.py @@ -0,0 +1,169 @@ +"""Obsidian-compatible markdown I/O for Idea Cloud. + +Reads and writes .md files with YAML frontmatter and wiki-linked bodies. +Compatible with Obsidian vaults — files can be opened and edited in Obsidian. + +File format: + --- + title: "..." + tags: [...] + id: ic_abc1234 + ... + --- + + Markdown body with [[wiki-links]] here. +""" + +from __future__ import annotations + +import logging +from pathlib import Path +from typing import Any + +import yaml + +from aragora.ideacloud.graph.node import IdeaNode + +logger = logging.getLogger(__name__) + +# Frontmatter delimiters +FM_DELIMITER = "---" + + +def write_node(node: IdeaNode, vault_path: str | Path) -> Path: + """Write an IdeaNode to a markdown file in the vault. + + File is named ``{node.id}.md`` in the vault root. + + Args: + node: The idea node to persist. + vault_path: Root directory of the idea vault. + + Returns: + Path to the written file. + """ + vault = Path(vault_path) + vault.mkdir(parents=True, exist_ok=True) + + file_path = vault / f"{node.id}.md" + fm_dict = node.to_frontmatter_dict() + + # Build file content + lines: list[str] = [] + lines.append(FM_DELIMITER) + # Use default_flow_style=False for readable YAML + fm_yaml = yaml.dump( + fm_dict, + default_flow_style=False, + allow_unicode=True, + sort_keys=False, + width=120, + ) + lines.append(fm_yaml.rstrip()) + lines.append(FM_DELIMITER) + lines.append("") # blank line after frontmatter + + if node.body: + lines.append(node.body) + else: + # Default body with title as heading + lines.append(f"# {node.title}") + lines.append("") + + content = "\n".join(lines) + if not content.endswith("\n"): + content += "\n" + + file_path.write_text(content, encoding="utf-8") + logger.debug("Wrote idea %s to %s", node.id, file_path) + return file_path + + +def read_node(file_path: str | Path) -> IdeaNode: + """Read an IdeaNode from a markdown file with YAML frontmatter. + + Args: + file_path: Path to the .md file. + + Returns: + Parsed IdeaNode. + + Raises: + ValueError: If the file doesn't contain valid frontmatter. + """ + path = Path(file_path) + raw = path.read_text(encoding="utf-8") + fm_dict, body = _parse_frontmatter(raw) + + node = IdeaNode.from_frontmatter_dict(fm_dict, body=body) + return node + + +def list_node_files(vault_path: str | Path) -> list[Path]: + """List all idea markdown files in the vault. + + Returns files matching the ``ic_*.md`` naming convention. + """ + vault = Path(vault_path) + if not vault.is_dir(): + return [] + # Match idea files by prefix; also accept any .md for flexibility + files = sorted(vault.glob("ic_*.md"), key=lambda p: p.stat().st_mtime, reverse=True) + return files + + +def delete_node_file(vault_path: str | Path, node_id: str) -> bool: + """Delete a node's markdown file. + + Returns True if file was deleted, False if it didn't exist. + """ + path = Path(vault_path) / f"{node_id}.md" + if path.exists(): + path.unlink() + logger.debug("Deleted idea file %s", path) + return True + return False + + +# ---- Internal helpers ---- + + +def _parse_frontmatter(raw: str) -> tuple[dict[str, Any], str]: + """Parse YAML frontmatter and body from raw markdown. + + Expects format: + --- + key: value + --- + body text + + Returns: + (frontmatter_dict, body_text) + """ + lines = raw.split("\n") + + # Find frontmatter boundaries + if not lines or lines[0].strip() != FM_DELIMITER: + raise ValueError("File does not start with frontmatter delimiter '---'") + + end_idx = None + for i in range(1, len(lines)): + if lines[i].strip() == FM_DELIMITER: + end_idx = i + break + + if end_idx is None: + raise ValueError("No closing frontmatter delimiter '---' found") + + # Parse YAML + fm_raw = "\n".join(lines[1:end_idx]) + fm_dict = yaml.safe_load(fm_raw) or {} + + # Body is everything after the closing delimiter (skip leading blank line) + body_lines = lines[end_idx + 1 :] + # Strip one leading blank line if present (convention) + if body_lines and body_lines[0].strip() == "": + body_lines = body_lines[1:] + body = "\n".join(body_lines).rstrip() + + return fm_dict, body diff --git a/aragora/knowledge/mound/adapters/_base.py b/aragora/knowledge/mound/adapters/_base.py index a5326bf1fa..79cb6d0ef4 100644 --- a/aragora/knowledge/mound/adapters/_base.py +++ b/aragora/knowledge/mound/adapters/_base.py @@ -77,6 +77,8 @@ async def my_operation(self): "email": AdapterCircuitBreakerConfig(failure_threshold=5, timeout_seconds=45.0), "jira": AdapterCircuitBreakerConfig(failure_threshold=5, timeout_seconds=45.0), "confluence": AdapterCircuitBreakerConfig(failure_threshold=5, timeout_seconds=45.0), + # Idea Cloud adapter - external file I/O, moderate latency + "ideacloud": AdapterCircuitBreakerConfig(failure_threshold=5, timeout_seconds=45.0), } diff --git a/aragora/knowledge/mound/adapters/factory.py b/aragora/knowledge/mound/adapters/factory.py index 16a1870fb9..9da0f85452 100644 --- a/aragora/knowledge/mound/adapters/factory.py +++ b/aragora/knowledge/mound/adapters/factory.py @@ -582,6 +582,20 @@ def register_adapter_spec(spec: AdapterSpec) -> None: "config_key": "km_confluence_adapter", }, ), + # Idea Cloud - personal knowledge graph for debate pipeline input + ( + "aragora.ideacloud.adapters.km_adapter", + "IdeaCloudAdapter", + { + "name": "ideacloud", + "required_deps": [], + "forward_method": "sync_to_km", + "reverse_method": "sync_from_km", + "priority": 17, + "enabled_by_default": False, + "config_key": "km_ideacloud_adapter", + }, + ), ] diff --git a/tests/test_ideacloud/__init__.py b/tests/test_ideacloud/__init__.py new file mode 100644 index 0000000000..270fb12c00 --- /dev/null +++ b/tests/test_ideacloud/__init__.py @@ -0,0 +1 @@ +"""Tests for the Idea Cloud module.""" diff --git a/tests/test_ideacloud/test_core.py b/tests/test_ideacloud/test_core.py new file mode 100644 index 0000000000..76f462ea9e --- /dev/null +++ b/tests/test_ideacloud/test_core.py @@ -0,0 +1,573 @@ +"""Tests for the Idea Cloud module. + +Tests the full pipeline: node creation, markdown round-trip, +graph operations, ingestion, search, clustering. + +Includes a real-world test with the 3 tweets from the initial +design conversation (Brainworm, OBLITERATUS, Capraro). +""" + +from __future__ import annotations + +import asyncio +import json +import os +import tempfile +from pathlib import Path + +import pytest + +from aragora.ideacloud.graph.node import IdeaNode, _generate_id, _content_hash +from aragora.ideacloud.graph.edge import IdeaEdge +from aragora.ideacloud.graph.cluster import IdeaCluster +from aragora.ideacloud.graph.graph import IdeaGraph +from aragora.ideacloud.graph import operations as ops +from aragora.ideacloud.storage import markdown_io as md +from aragora.ideacloud.storage import index as idx +from aragora.ideacloud.ingestion.quality import QualityFilter, DeduplicationEngine +from aragora.ideacloud.ingestion.manual import ManualPasteIngestor +from aragora.ideacloud.ingestion.twitter_bookmarks import ( + TwitterBookmarksIngestor, + _parse_twitter_js, +) +from aragora.ideacloud.core import IdeaCloud + + +# ---- Fixtures ---- + + +@pytest.fixture +def tmp_vault(tmp_path): + """Create a temporary vault directory.""" + vault = tmp_path / ".aragora_ideas" + vault.mkdir() + return vault + + +@pytest.fixture +def sample_node(): + """Create a sample IdeaNode.""" + return IdeaNode( + id="ic_test123", + title="Brainworm: CUA Malware via Context Injection", + body=( + "Natural language malware that lives in CLAUDE.md files.\n\n" + "Key insight: everything in a context window gets reasoned over " + "with equal authority.\n\n" + "## Connections\n" + "- [[OBLITERATUS]] — weight-level attacks\n" + "- [[Aragora Consensus Defense]]\n" + ), + source_type="manual", + source_url="https://www.originhq.com/blog/brainworm", + source_author="Origin HQ", + tags=["ai-security", "prompt-injection", "agent-safety"], + node_type="idea_insight", + relevance_score=0.92, + confidence=0.85, + ) + + +# ---- Node Tests ---- + + +class TestIdeaNode: + def test_create_node(self): + node = IdeaNode(title="Test Idea", body="Some content") + assert node.id.startswith("ic_") + assert node.title == "Test Idea" + assert node.content_hash.startswith("sha256:") + + def test_extract_wiki_links(self, sample_node): + links = sample_node.extract_wiki_links() + assert "OBLITERATUS" in links + assert "Aragora Consensus Defense" in links + + def test_frontmatter_roundtrip(self, sample_node): + fm = sample_node.to_frontmatter_dict() + restored = IdeaNode.from_frontmatter_dict(fm, body=sample_node.body) + assert restored.id == sample_node.id + assert restored.title == sample_node.title + assert restored.source_url == sample_node.source_url + assert restored.tags == sample_node.tags + assert restored.relevance_score == sample_node.relevance_score + + def test_searchable_text(self, sample_node): + text = sample_node.searchable_text + assert "brainworm" in text + assert "ai-security" in text + assert "origin hq" in text + + def test_update_content(self): + node = IdeaNode(title="Original", body="Original body") + old_hash = node.content_hash + node.update_content(title="Updated", body="New body") + assert node.title == "Updated" + assert node.content_hash != old_hash + + def test_content_hash_deterministic(self): + h1 = _content_hash("hello world") + h2 = _content_hash("hello world") + assert h1 == h2 + assert h1.startswith("sha256:") + + +# ---- Edge Tests ---- + + +class TestIdeaEdge: + def test_create_edge(self): + edge = IdeaEdge( + source_id="ic_a", + target_id="ic_b", + edge_type="supports", + weight=0.8, + ) + assert edge.source_id == "ic_a" + assert edge.edge_type == "supports" + + def test_edge_roundtrip(self): + edge = IdeaEdge( + source_id="ic_a", + target_id="ic_b", + edge_type="refutes", + weight=0.7, + reason="Contradicts the premise", + ) + d = edge.to_dict() + restored = IdeaEdge.from_dict(d) + assert restored.source_id == edge.source_id + assert restored.edge_type == edge.edge_type + assert restored.reason == edge.reason + + +# ---- Cluster Tests ---- + + +class TestIdeaCluster: + def test_create_cluster(self): + cluster = IdeaCluster(name="AI Security", node_ids=["ic_a", "ic_b"]) + assert cluster.id.startswith("cl_") + assert cluster.size == 2 + + def test_add_remove_node(self): + cluster = IdeaCluster(name="Test") + cluster.add_node("ic_a") + cluster.add_node("ic_a") # Idempotent + assert cluster.size == 1 + cluster.remove_node("ic_a") + assert cluster.size == 0 + + def test_cluster_roundtrip(self): + cluster = IdeaCluster( + name="Test Cluster", + node_ids=["ic_a", "ic_b"], + tags=["security", "ai"], + ) + d = cluster.to_dict() + restored = IdeaCluster.from_dict(d) + assert restored.name == cluster.name + assert restored.node_ids == cluster.node_ids + + +# ---- Markdown I/O Tests ---- + + +class TestMarkdownIO: + def test_write_and_read_node(self, tmp_vault, sample_node): + # Write + path = md.write_node(sample_node, tmp_vault) + assert path.exists() + assert path.name == "ic_test123.md" + + # Read + restored = md.read_node(path) + assert restored.id == sample_node.id + assert restored.title == sample_node.title + assert restored.source_url == sample_node.source_url + assert restored.tags == sample_node.tags + assert "OBLITERATUS" in restored.body + + def test_list_node_files(self, tmp_vault, sample_node): + md.write_node(sample_node, tmp_vault) + files = md.list_node_files(tmp_vault) + assert len(files) == 1 + assert files[0].name == "ic_test123.md" + + def test_delete_node_file(self, tmp_vault, sample_node): + md.write_node(sample_node, tmp_vault) + assert md.delete_node_file(tmp_vault, "ic_test123") + assert not md.delete_node_file(tmp_vault, "ic_nonexistent") + + def test_frontmatter_parsing_preserves_body(self, tmp_vault): + node = IdeaNode( + id="ic_bodytest", + title="Body Test", + body="Line 1\n\nLine 2\n\n## Heading\n\nMore text", + ) + md.write_node(node, tmp_vault) + restored = md.read_node(tmp_vault / "ic_bodytest.md") + assert "Line 1" in restored.body + assert "## Heading" in restored.body + assert "More text" in restored.body + + +# ---- Index Tests ---- + + +class TestIndex: + def test_write_and_read_index(self, tmp_vault, sample_node): + nodes = {sample_node.id: sample_node} + edges = [IdeaEdge(source_id="ic_a", target_id="ic_b")] + clusters = {"cl_test": IdeaCluster(id="cl_test", name="Test")} + + idx.write_index(tmp_vault, nodes, edges, clusters) + + data = idx.read_index(tmp_vault) + assert "ic_test123" in data["nodes"] + assert len(data["edges"]) == 1 + assert "cl_test" in data["clusters"] + + def test_read_missing_index(self, tmp_vault): + data = idx.read_index(tmp_vault) + assert data == {"nodes": {}, "edges": [], "clusters": {}} + + +# ---- Graph Tests ---- + + +class TestIdeaGraph: + def test_add_and_search(self, tmp_vault, sample_node): + graph = IdeaGraph(tmp_vault) + graph.add_node(sample_node) + assert len(graph.nodes) == 1 + + results = graph.search("brainworm") + assert len(results) > 0 + assert results[0][0].id == sample_node.id + + def test_load_save_roundtrip(self, tmp_vault, sample_node): + # Save + graph = IdeaGraph(tmp_vault) + graph.add_node(sample_node) + graph.save() + + # Load fresh + graph2 = IdeaGraph(tmp_vault) + loaded = graph2.load() + assert loaded == 1 + assert sample_node.id in graph2.nodes + + def test_edges_and_neighbours(self, tmp_vault): + graph = IdeaGraph(tmp_vault) + n1 = IdeaNode(id="ic_n1", title="Node 1") + n2 = IdeaNode(id="ic_n2", title="Node 2") + n3 = IdeaNode(id="ic_n3", title="Node 3") + graph.add_node(n1, persist=False) + graph.add_node(n2, persist=False) + graph.add_node(n3, persist=False) + + graph.add_edge(IdeaEdge(source_id="ic_n1", target_id="ic_n2")) + graph.add_edge(IdeaEdge(source_id="ic_n2", target_id="ic_n3")) + + neighbours = graph.get_neighbours("ic_n1", depth=1) + assert "ic_n2" in neighbours + assert "ic_n3" not in neighbours + + neighbours_2 = graph.get_neighbours("ic_n1", depth=2) + assert "ic_n3" in neighbours_2 + + def test_remove_node(self, tmp_vault, sample_node): + graph = IdeaGraph(tmp_vault) + graph.add_node(sample_node) + assert graph.remove_node(sample_node.id) + assert sample_node.id not in graph.nodes + + def test_stats(self, tmp_vault, sample_node): + graph = IdeaGraph(tmp_vault) + graph.add_node(sample_node, persist=False) + stats = graph.stats + assert stats["total_nodes"] == 1 + assert stats["by_source"]["manual"] == 1 + + +# ---- Quality & Dedup Tests ---- + + +class TestQualityFilter: + def test_score_good_node(self, sample_node): + qf = QualityFilter() + score = qf.score(sample_node) + assert score > 0.5 + + def test_score_empty_node(self): + qf = QualityFilter() + node = IdeaNode(title="", body="") + score = qf.score(node) + assert score < 0.3 + + def test_filter_batch(self, sample_node): + qf = QualityFilter(min_score=0.3) + empty = IdeaNode(title="", body="") + result = qf.filter_batch([sample_node, empty]) + assert len(result) == 1 + assert result[0].id == sample_node.id + + +class TestDeduplication: + def test_exact_duplicate(self, tmp_vault, sample_node): + graph = IdeaGraph(tmp_vault) + graph.add_node(sample_node, persist=False) + + dupe = IdeaNode( + id="ic_dupe", + title=sample_node.title, + body=sample_node.body, + ) + engine = DeduplicationEngine() + matches = engine.find_duplicates(dupe, graph) + assert sample_node.id in matches + + +# ---- Ingestion Tests ---- + + +class TestManualIngestor: + def test_ingest_url(self): + ingestor = ManualPasteIngestor() + nodes = asyncio.run(ingestor.ingest("https://www.originhq.com/blog/brainworm")) + assert len(nodes) == 1 + assert nodes[0].source_url == "https://www.originhq.com/blog/brainworm" + + def test_ingest_tweet_url(self): + ingestor = ManualPasteIngestor() + nodes = asyncio.run( + ingestor.ingest("https://x.com/elder_plinius/status/2029317072765784156") + ) + assert len(nodes) == 1 + assert nodes[0].source_author == "@elder_plinius" + + def test_ingest_text(self): + ingestor = ManualPasteIngestor() + nodes = asyncio.run(ingestor.ingest("AI safety is important")) + assert len(nodes) == 1 + assert "AI safety" in nodes[0].title + + +class TestTwitterBookmarksIngestor: + def test_parse_twitter_js(self): + js_content = """window.YTD.bookmark.part0 = [ + {"bookmark": {"tweetId": "123", "fullText": "Test tweet about AI"}}, + {"bookmark": {"tweetId": "456", "fullText": "Another tweet"}} + ]""" + data = _parse_twitter_js(js_content) + assert len(data) == 2 + assert data[0]["bookmark"]["tweetId"] == "123" + + def test_ingest_bookmarks_file(self, tmp_path): + bookmarks_file = tmp_path / "bookmarks.js" + bookmarks_file.write_text("""window.YTD.bookmark.part0 = [ + {"bookmark": {"tweetId": "111", "fullText": "AI safety research is critical for alignment", "screenName": "researcher1"}}, + {"bookmark": {"tweetId": "222", "fullText": "New paper on adversarial attacks against LLMs", "screenName": "researcher2"}} + ]""") + + ingestor = TwitterBookmarksIngestor() + nodes = asyncio.run(ingestor.ingest(str(bookmarks_file))) + assert len(nodes) == 2 + assert all(n.source_type == "twitter_bookmark" for n in nodes) + assert any("safety" in n.title.lower() for n in nodes) + + +# ---- Operations Tests ---- + + +class TestAutoLink: + def test_auto_link_similar_nodes(self, tmp_vault): + graph = IdeaGraph(tmp_vault) + n1 = IdeaNode( + id="ic_a1", + title="AI Safety and Prompt Injection", + body="Prompt injection is a key AI safety concern", + tags=["ai-security", "prompt-injection"], + ) + n2 = IdeaNode( + id="ic_a2", + title="Defending Against Prompt Attacks", + body="Multi-model consensus defends against prompt injection attacks", + tags=["ai-security", "prompt-injection", "defense"], + ) + n3 = IdeaNode( + id="ic_a3", + title="Cooking Recipes for Pasta", + body="How to make great pasta from scratch", + tags=["cooking", "recipes"], + ) + graph.add_node(n1, persist=False) + graph.add_node(n2, persist=False) + graph.add_node(n3, persist=False) + + new_edges = ops.auto_link(graph, min_similarity=0.2) + # n1 and n2 should be linked (similar), n3 should not + linked_pairs = {(e.source_id, e.target_id) for e in new_edges} + assert any(("ic_a1" in pair and "ic_a2" in pair) for pair in linked_pairs) + + +class TestAutoClustering: + def test_cluster_connected_nodes(self, tmp_vault): + graph = IdeaGraph(tmp_vault) + n1 = IdeaNode(id="ic_c1", title="Node 1", tags=["ai", "security"]) + n2 = IdeaNode(id="ic_c2", title="Node 2", tags=["ai", "security"]) + n3 = IdeaNode(id="ic_c3", title="Node 3", tags=["cooking", "recipes"]) + graph.add_node(n1, persist=False) + graph.add_node(n2, persist=False) + graph.add_node(n3, persist=False) + + clusters = ops.auto_cluster(graph, min_cluster_size=2) + # n1 and n2 share 2+ tags, should cluster; n3 should not + assert len(clusters) >= 1 + cluster_node_sets = [set(c.node_ids) for c in clusters.values()] + assert any({"ic_c1", "ic_c2"}.issubset(s) for s in cluster_node_sets) + + +# ---- IdeaCloud Core Tests ---- + + +class TestIdeaCloud: + def test_full_workflow(self, tmp_vault): + """Test the complete workflow: ingest → search → cluster.""" + cloud = IdeaCloud(vault_path=tmp_vault) + cloud.load() + + # Ingest manually + node = asyncio.run( + cloud.ingest_manual( + content="Brainworm malware lives in CLAUDE.md files and hijacks agent reasoning", + title="Brainworm: CUA Malware via Context Injection", + source_url="https://www.originhq.com/blog/brainworm", + tags=["ai-security", "prompt-injection", "agent-safety"], + ) + ) + assert node is not None + + # Search + results = cloud.search("brainworm") + assert len(results) > 0 + + # Stats + stats = cloud.stats + assert stats["total_nodes"] == 1 + + def test_three_tweets_integration(self, tmp_vault): + """Integration test with the 3 tweets from today's conversation.""" + cloud = IdeaCloud(vault_path=tmp_vault) + cloud.load() + + # 1. Brainworm + n1 = asyncio.run( + cloud.ingest_manual( + content=( + "Natural language malware that lives in CLAUDE.md files — " + "agent tool calls as the execution primitive. No binaries, no signatures.\n\n" + "Key insight: everything in a context window gets reasoned over " + "with equal authority. No internal mechanism marks tool results " + "as less trustworthy than system instructions." + ), + title="Brainworm: CUA Malware via Context Injection", + source_url="https://www.originhq.com/blog/brainworm", + source_author="Origin HQ", + tags=["ai-security", "prompt-injection", "agent-safety", "supply-chain"], + ) + ) + + # 2. OBLITERATUS + n2 = asyncio.run( + cloud.ingest_manual( + content=( + "Toolkit for removing refusal behaviors from open-weight LLMs " + "using SVD-based weight projection. 13 ablation methods. " + "Surgically identifies directions in weight space that encode " + "refusal and projects them out." + ), + title="OBLITERATUS: Open-Weight LLM Refusal Removal Toolkit", + source_url="https://x.com/elder_plinius/status/2029317072765784156", + source_author="@elder_plinius", + tags=["ai-security", "model-modification", "open-weights", "alignment"], + ) + ) + + # 3. Capraro RLHF + n3 = asyncio.run( + cloud.ingest_manual( + content=( + "GPT says torture is acceptable to prevent nuclear apocalypse " + "but harassment is absolutely not. Reversal appears only when " + "target is a woman. RLHF creates mechanical overgeneralization " + "of certain harm categories." + ), + title="RLHF Moral Inconsistency: Torture vs Harassment Gender Bias", + source_url="https://x.com/ValerioCapraro/status/2029593915674771457", + source_author="@ValerioCapraro", + tags=["ai-security", "rlhf", "alignment", "bias"], + ) + ) + + assert n1 is not None + assert n2 is not None + assert n3 is not None + + # All 3 should be in the graph + assert cloud.stats["total_nodes"] == 3 + + # Search should find them + assert len(cloud.search("brainworm")) > 0 + assert len(cloud.search("OBLITERATUS")) > 0 + assert len(cloud.search("RLHF")) > 0 + + # Auto-link should find connections (shared ai-security tag) + new_edges = cloud.auto_link(min_similarity=0.2) + assert len(new_edges) > 0 + + # Auto-cluster should group related ideas + clusters = cloud.auto_cluster() + assert len(clusters) >= 1 + + # Verify markdown files exist on disk + files = list(tmp_vault.glob("ic_*.md")) + assert len(files) == 3 + + # Verify round-trip: reload from disk + cloud2 = IdeaCloud(vault_path=tmp_vault) + loaded = cloud2.load() + assert loaded == 3 + + # Search still works after reload + results = cloud2.search("prompt injection") + assert len(results) > 0 + + # Cluster summary should work + if clusters: + first_cluster_id = list(clusters.keys())[0] + summary = cloud2.cluster_summary(first_cluster_id) + assert len(summary) > 0 + + def test_dedup_prevents_double_add(self, tmp_vault): + cloud = IdeaCloud(vault_path=tmp_vault) + cloud.load() + + node = asyncio.run( + cloud.ingest_manual( + content="Test idea about something", + title="Test Idea", + ) + ) + assert node is not None + + # Adding the same content again should be rejected + dupe = asyncio.run( + cloud.ingest_manual( + content="Test idea about something", + title="Test Idea", + ) + ) + assert dupe is None + assert cloud.stats["total_nodes"] == 1 diff --git a/tests/test_ideacloud/test_enhanced.py b/tests/test_ideacloud/test_enhanced.py new file mode 100644 index 0000000000..dc5397ccdf --- /dev/null +++ b/tests/test_ideacloud/test_enhanced.py @@ -0,0 +1,477 @@ +"""Tests for Idea Cloud Phase 1+ and Phase 2 enhancements. + +Covers: pipeline bridge, RSS ingestor, wiki-link injection, +proposition extraction, promote/status management, KM adapter sync. +""" + +from __future__ import annotations + +import asyncio +import tempfile +from pathlib import Path + +import pytest + +from aragora.ideacloud.graph.node import IdeaNode +from aragora.ideacloud.graph.edge import IdeaEdge +from aragora.ideacloud.graph.cluster import IdeaCluster +from aragora.ideacloud.graph.graph import IdeaGraph +from aragora.ideacloud.graph import operations as ops +from aragora.ideacloud.core import IdeaCloud +from aragora.ideacloud.adapters.pipeline_bridge import ( + cluster_to_ideas, + cluster_to_brain_dump, + cluster_to_universal_nodes, + export_cluster_for_debate, +) + + +@pytest.fixture +def tmp_vault(tmp_path): + vault = tmp_path / ".aragora_ideas" + vault.mkdir() + return vault + + +@pytest.fixture +def populated_graph(tmp_vault): + """Graph with 3 linked nodes in a cluster for testing exports.""" + graph = IdeaGraph(tmp_vault) + + n1 = IdeaNode( + id="ic_exp1", + title="Brainworm: CUA Malware via Context Injection", + body="Natural language malware that lives in CLAUDE.md files.", + tags=["ai-security", "prompt-injection", "agent-safety"], + source_url="https://originhq.com/blog/brainworm", + source_author="Origin HQ", + node_type="idea_insight", + ) + n2 = IdeaNode( + id="ic_exp2", + title="OBLITERATUS: LLM Refusal Removal Toolkit", + body="Toolkit for removing refusal behaviors from open-weight LLMs.", + tags=["ai-security", "model-modification", "alignment"], + source_url="https://x.com/elder_plinius/status/123", + source_author="@elder_plinius", + node_type="idea_evidence", + ) + n3 = IdeaNode( + id="ic_exp3", + title="RLHF Moral Inconsistency", + body="RLHF creates mechanical overgeneralization of harm categories.", + tags=["ai-security", "rlhf", "alignment", "bias"], + source_author="@ValerioCapraro", + node_type="idea_hypothesis", + ) + + for n in [n1, n2, n3]: + graph.add_node(n, persist=False) + + graph.add_edge( + IdeaEdge( + source_id="ic_exp1", + target_id="ic_exp2", + edge_type="relates_to", + weight=0.7, + ) + ) + graph.add_edge( + IdeaEdge( + source_id="ic_exp2", + target_id="ic_exp3", + edge_type="extends", + weight=0.5, + ) + ) + + cluster = IdeaCluster( + id="cl_test1", + name="AI Security Threats", + node_ids=["ic_exp1", "ic_exp2", "ic_exp3"], + tags=["ai-security", "alignment"], + confidence=0.8, + ) + graph.clusters["cl_test1"] = cluster + for nid in cluster.node_ids: + graph.nodes[nid].cluster_id = "cl_test1" + + return graph + + +# ---- Pipeline Bridge Tests ---- + + +class TestClusterToIdeas: + def test_basic_export(self, populated_graph): + ideas = cluster_to_ideas(populated_graph, "cl_test1") + assert len(ideas) == 3 + assert any("Brainworm" in i for i in ideas) + assert any("OBLITERATUS" in i for i in ideas) + + def test_nonexistent_cluster_raises(self, populated_graph): + with pytest.raises(KeyError, match="not found"): + cluster_to_ideas(populated_graph, "cl_nonexistent") + + +class TestClusterToBrainDump: + def test_brain_dump_format(self, populated_graph): + dump = cluster_to_brain_dump(populated_graph, "cl_test1") + assert "# AI Security Threats" in dump + assert "Themes:" in dump + assert "Brainworm" in dump + assert "## Implications" in dump + + def test_includes_connections(self, populated_graph): + dump = cluster_to_brain_dump(populated_graph, "cl_test1") + # Should show edge relationships + assert "relates_to" in dump or "extends" in dump + + +class TestClusterToUniversalNodes: + def test_universal_node_format(self, populated_graph): + nodes = cluster_to_universal_nodes(populated_graph, "cl_test1") + assert len(nodes) == 3 + + # Check structure + n = nodes[0] + assert "id" in n + assert n["stage"] == "ideas" + assert n["node_subtype"] in ( + "concept", + "insight", + "evidence", + "hypothesis", + "question", + "cluster", + ) + assert "data" in n + assert n["data"]["source_type"] is not None + assert n["metadata"]["origin"] == "ideacloud" + + def test_type_mapping(self, populated_graph): + nodes = cluster_to_universal_nodes(populated_graph, "cl_test1") + subtypes = {n["label"]: n["node_subtype"] for n in nodes} + assert subtypes["Brainworm: CUA Malware via Context Injection"] == "insight" + assert subtypes["OBLITERATUS: LLM Refusal Removal Toolkit"] == "evidence" + assert subtypes["RLHF Moral Inconsistency"] == "hypothesis" + + +class TestExportForDebate: + def test_debate_export_structure(self, populated_graph): + result = export_cluster_for_debate(populated_graph, "cl_test1") + assert "task" in result + assert "context" in result + assert "metadata" in result + assert result["metadata"]["cluster_id"] == "cl_test1" + assert result["metadata"]["node_count"] == 3 + + def test_debate_task_is_actionable(self, populated_graph): + result = export_cluster_for_debate(populated_graph, "cl_test1") + assert "Analyze" in result["task"] + assert "AI Security Threats" in result["task"] + + +# ---- Wiki-Link Injection Tests ---- + + +class TestWikiLinkInjection: + def test_auto_link_injects_wiki_links(self, tmp_vault): + graph = IdeaGraph(tmp_vault) + n1 = IdeaNode( + id="ic_wl1", + title="AI Safety Fundamentals", + body="Core concepts in AI safety research.", + tags=["ai-safety", "research"], + ) + n2 = IdeaNode( + id="ic_wl2", + title="AI Safety Testing Methods", + body="Methods for testing AI safety properties.", + tags=["ai-safety", "research", "testing"], + ) + graph.add_node(n1, persist=False) + graph.add_node(n2, persist=False) + + edges = ops.auto_link(graph, min_similarity=0.1, inject_wiki_links=True) + assert len(edges) > 0 + + # Check that wiki-links were injected + source = graph.nodes[edges[0].source_id] + target = graph.nodes[edges[0].target_id] + assert f"[[{target.title}]]" in source.body + + def test_wiki_link_no_duplicates(self, tmp_vault): + graph = IdeaGraph(tmp_vault) + n1 = IdeaNode( + id="ic_wl3", + title="Topic A", + body="Some content.\n\n## Connections\n- [[Topic B]] (relates_to)", + tags=["test"], + ) + n2 = IdeaNode( + id="ic_wl4", + title="Topic B", + body="Other content.", + tags=["test"], + ) + graph.add_node(n1, persist=False) + graph.add_node(n2, persist=False) + + # Manually inject a link that already exists + edge = IdeaEdge(source_id="ic_wl3", target_id="ic_wl4", edge_type="relates_to") + graph.add_edge(edge) + ops._inject_wiki_link(graph, edge) + + # Should not add a duplicate + assert n1.body.count("[[Topic B]]") == 1 + + def test_auto_link_without_wiki_links(self, tmp_vault): + graph = IdeaGraph(tmp_vault) + n1 = IdeaNode(id="ic_wl5", title="X", body="Body A.", tags=["test", "same"]) + n2 = IdeaNode(id="ic_wl6", title="Y", body="Body B.", tags=["test", "same"]) + graph.add_node(n1, persist=False) + graph.add_node(n2, persist=False) + + edges = ops.auto_link(graph, min_similarity=0.1, inject_wiki_links=False) + assert len(edges) > 0 + + # Wiki-links should NOT be injected + assert "[[" not in graph.nodes["ic_wl5"].body + assert "[[" not in graph.nodes["ic_wl6"].body + + +# ---- Proposition Extraction Tests ---- + + +class TestPropositionExtraction: + def test_extract_from_cluster(self, populated_graph): + propositions = ops.extract_propositions(populated_graph, "cl_test1") + assert len(propositions) > 0 + # Should include node titles + assert any("Brainworm" in p for p in propositions) + + def test_extends_edge_creates_building_proposition(self, populated_graph): + propositions = ops.extract_propositions(populated_graph, "cl_test1") + # n2 extends n3, so should see a "Building on" proposition + assert any("Building on" in p for p in propositions) + + def test_synthesis_proposition(self, populated_graph): + propositions = ops.extract_propositions(populated_graph, "cl_test1") + assert any("Synthesis" in p for p in propositions) + + def test_nonexistent_cluster(self, populated_graph): + propositions = ops.extract_propositions(populated_graph, "cl_none") + assert propositions == [] + + +# ---- RSS Ingestor Tests ---- + + +class TestRSSFeedIngestor: + def test_create_ingestor(self): + from aragora.ideacloud.ingestion.rss_feeds import RSSFeedIngestor + + ingestor = RSSFeedIngestor( + relevance_keywords=["ai", "security"], + min_relevance=0.5, + ) + assert len(ingestor.feeds) == 0 + assert ingestor.min_relevance == 0.5 + + def test_add_and_remove_feed(self): + from aragora.ideacloud.ingestion.rss_feeds import RSSFeedIngestor + + ingestor = RSSFeedIngestor() + ingestor.add_feed("https://example.com/feed.xml", name="Example") + assert len(ingestor.feeds) == 1 + assert ingestor.feeds[0].name == "Example" + + assert ingestor.remove_feed("https://example.com/feed.xml") + assert len(ingestor.feeds) == 0 + + # Removing non-existent returns False + assert not ingestor.remove_feed("https://no.such/feed") + + def test_relevance_filter(self): + from aragora.ideacloud.ingestion.rss_feeds import RSSFeedIngestor + + ingestor = RSSFeedIngestor( + relevance_keywords=["ai", "security", "safety"], + min_relevance=0.5, + ) + + # Node with 2/3 keywords = 0.67 relevance → passes + relevant = IdeaNode( + title="AI Security Research", + body="This is about ai and security topics.", + tags=["tech"], + ) + assert ingestor._passes_relevance(relevant) + + # Node with 0/3 keywords = 0 relevance → rejected + irrelevant = IdeaNode( + title="Cooking Recipes", + body="How to make pasta.", + tags=["cooking"], + ) + assert not ingestor._passes_relevance(irrelevant) + + def test_ingest_no_feeds_returns_empty(self): + from aragora.ideacloud.ingestion.rss_feeds import RSSFeedIngestor + + ingestor = RSSFeedIngestor() + result = asyncio.run(ingestor.ingest()) + assert result == [] + + +# ---- Core Orchestrator Enhancement Tests ---- + + +class TestIdeaCloudPipelineBridge: + def test_export_for_pipeline(self, tmp_vault): + cloud = IdeaCloud(vault_path=tmp_vault) + cloud.load() + + # Add nodes and cluster them + asyncio.run( + cloud.ingest_manual( + content="AI safety is critical for trustworthy systems.", + title="AI Safety", + tags=["ai-safety", "trustworthy"], + ) + ) + asyncio.run( + cloud.ingest_manual( + content="Adversarial testing reveals AI vulnerabilities.", + title="Adversarial Testing", + tags=["ai-safety", "testing", "trustworthy"], + ) + ) + + clusters = cloud.auto_cluster() + if clusters: + cid = list(clusters.keys())[0] + ideas = cloud.export_for_pipeline(cid) + assert len(ideas) > 0 + assert isinstance(ideas[0], str) + + def test_export_for_debate(self, tmp_vault): + cloud = IdeaCloud(vault_path=tmp_vault) + cloud.load() + + asyncio.run( + cloud.ingest_manual( + content="Test idea A", + title="Idea A", + tags=["test", "alpha"], + ) + ) + asyncio.run( + cloud.ingest_manual( + content="Test idea B", + title="Idea B", + tags=["test", "alpha"], + ) + ) + + clusters = cloud.auto_cluster() + if clusters: + cid = list(clusters.keys())[0] + result = cloud.export_for_debate(cid) + assert "task" in result + assert "context" in result + + +class TestPromoteStatus: + def test_promote_node(self, tmp_vault): + cloud = IdeaCloud(vault_path=tmp_vault) + cloud.load() + + node = asyncio.run( + cloud.ingest_manual( + content="Important idea", + title="Test Promote", + ) + ) + assert node is not None + assert node.pipeline_status == "inbox" + + assert cloud.promote_node(node.id, "candidate") + reloaded = cloud.get_node(node.id) + assert reloaded.pipeline_status == "candidate" + + def test_promote_invalid_status(self, tmp_vault): + cloud = IdeaCloud(vault_path=tmp_vault) + cloud.load() + + node = asyncio.run( + cloud.ingest_manual( + content="A substantial idea about testing promotion status transitions", + title="Testing Status Promotion", + tags=["test"], + ) + ) + assert node is not None + assert not cloud.promote_node(node.id, "invalid_status") + + def test_promote_nonexistent_node(self, tmp_vault): + cloud = IdeaCloud(vault_path=tmp_vault) + cloud.load() + assert not cloud.promote_node("ic_nonexistent", "candidate") + + def test_promote_cluster(self, tmp_vault): + cloud = IdeaCloud(vault_path=tmp_vault) + cloud.load() + + asyncio.run( + cloud.ingest_manual( + content="A", + title="A", + tags=["same", "group"], + ) + ) + asyncio.run( + cloud.ingest_manual( + content="B", + title="B", + tags=["same", "group"], + ) + ) + + clusters = cloud.auto_cluster() + if clusters: + cid = list(clusters.keys())[0] + count = cloud.promote_cluster(cid, "prioritized") + assert count >= 2 + + # Verify nodes were promoted + for node in cloud.cluster_nodes(cid): + assert node.pipeline_status == "prioritized" + + +class TestExtractPropositionsFromCloud: + def test_extract_via_core(self, tmp_vault): + cloud = IdeaCloud(vault_path=tmp_vault) + cloud.load() + + asyncio.run( + cloud.ingest_manual( + content="Idea 1 content", + title="Idea One", + tags=["topic", "sub"], + ) + ) + asyncio.run( + cloud.ingest_manual( + content="Idea 2 content", + title="Idea Two", + tags=["topic", "sub"], + ) + ) + + clusters = cloud.auto_cluster() + if clusters: + cid = list(clusters.keys())[0] + props = cloud.extract_propositions(cid) + assert len(props) > 0 diff --git a/tests/test_ideacloud/test_km_adapter_registration.py b/tests/test_ideacloud/test_km_adapter_registration.py new file mode 100644 index 0000000000..ac3aafb816 --- /dev/null +++ b/tests/test_ideacloud/test_km_adapter_registration.py @@ -0,0 +1,13 @@ +"""Regression coverage for Idea Cloud Knowledge Mound adapter registration.""" + +from aragora.knowledge.mound.adapters._base import ADAPTER_CIRCUIT_CONFIGS +from aragora.knowledge.mound.adapters.factory import _ADAPTER_DEFS + + +def test_ideacloud_adapter_registered_in_factory() -> None: + names = [spec_kwargs.get("name", "") for _, _, spec_kwargs in _ADAPTER_DEFS] + assert "ideacloud" in names + + +def test_ideacloud_adapter_has_circuit_breaker_config() -> None: + assert "ideacloud" in ADAPTER_CIRCUIT_CONFIGS