From 07c641b60198b8f3ae82b39008418c0e1306a992 Mon Sep 17 00:00:00 2001 From: laxmanclo Date: Wed, 17 Jun 2026 13:51:47 +0530 Subject: [PATCH 1/8] feat(backend): add telemetry event plumbing --- .../adapters/telemetry/localsqlite.go | 121 ++++++++++++++++++ backend/internal/adapters/telemetry/noop.go | 14 ++ backend/internal/config/config.go | 81 ++++++++++++ backend/internal/config/config_test.go | 19 ++- backend/internal/daemon/daemon.go | 15 +++ backend/internal/daemon/telemetry_wiring.go | 17 +++ .../internal/daemon/telemetry_wiring_test.go | 32 +++++ backend/internal/ports/telemetry.go | 40 ++++++ backend/internal/storage/sqlite/gen/models.go | 12 ++ .../storage/sqlite/gen/telemetry.sql.go | 115 +++++++++++++++++ .../migrations/0014_telemetry_events.sql | 35 +++++ .../storage/sqlite/queries/telemetry.sql | 21 +++ .../storage/sqlite/store/telemetry_store.go | 73 +++++++++++ .../sqlite/store/telemetry_store_test.go | 70 ++++++++++ 14 files changed, 664 insertions(+), 1 deletion(-) create mode 100644 backend/internal/adapters/telemetry/localsqlite.go create mode 100644 backend/internal/adapters/telemetry/noop.go create mode 100644 backend/internal/daemon/telemetry_wiring.go create mode 100644 backend/internal/daemon/telemetry_wiring_test.go create mode 100644 backend/internal/ports/telemetry.go create mode 100644 backend/internal/storage/sqlite/gen/telemetry.sql.go create mode 100644 backend/internal/storage/sqlite/migrations/0014_telemetry_events.sql create mode 100644 backend/internal/storage/sqlite/queries/telemetry.sql create mode 100644 backend/internal/storage/sqlite/store/telemetry_store.go create mode 100644 backend/internal/storage/sqlite/store/telemetry_store_test.go diff --git a/backend/internal/adapters/telemetry/localsqlite.go b/backend/internal/adapters/telemetry/localsqlite.go new file mode 100644 index 00000000..5876baf0 --- /dev/null +++ b/backend/internal/adapters/telemetry/localsqlite.go @@ -0,0 +1,121 @@ +package telemetry + +import ( + "context" + "encoding/json" + "log/slog" + "sync" + "time" + + "github.com/google/uuid" + + "github.com/aoagents/agent-orchestrator/backend/internal/ports" + sqlitestore "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite/store" +) + +const ( + localBufferSize = 128 + localRetention = 30 * 24 * time.Hour + localPruneEvery = time.Hour + localPruneBatchLimit = int64(1000) +) + +type localStore interface { + CreateTelemetryEvent(ctx context.Context, rec sqlitestore.TelemetryEventRecord) error + PruneTelemetryEventsBefore(ctx context.Context, before time.Time, limit int64) (int64, error) +} + +// LocalSQLiteSink persists telemetry events into the daemon's SQLite database +// behind a small buffered worker so event emission stays best-effort. +type LocalSQLiteSink struct { + store localStore + log *slog.Logger + ch chan ports.TelemetryEvent + wg sync.WaitGroup + closeOnce sync.Once + now func() time.Time + newID func() string + + pruneMu sync.Mutex + lastPrune time.Time +} + +func NewLocalSQLiteSink(store localStore, log *slog.Logger) *LocalSQLiteSink { + s := &LocalSQLiteSink{ + store: store, + log: log, + ch: make(chan ports.TelemetryEvent, localBufferSize), + now: time.Now, + newID: func() string { return "tev_" + uuid.NewString() }, + } + s.wg.Add(1) + go s.loop() + return s +} + +func (s *LocalSQLiteSink) Emit(_ context.Context, ev ports.TelemetryEvent) { + select { + case s.ch <- ev: + default: + s.log.Warn("telemetry local sink buffer full; dropping event", "name", ev.Name, "source", ev.Source) + } +} + +func (s *LocalSQLiteSink) Close(ctx context.Context) error { + s.closeOnce.Do(func() { close(s.ch) }) + done := make(chan struct{}) + go func() { + defer close(done) + s.wg.Wait() + }() + select { + case <-ctx.Done(): + return ctx.Err() + case <-done: + return nil + } +} + +func (s *LocalSQLiteSink) loop() { + defer s.wg.Done() + for ev := range s.ch { + s.persist(ev) + } +} + +func (s *LocalSQLiteSink) persist(ev ports.TelemetryEvent) { + payloadJSON, err := json.Marshal(ev.Payload) + if err != nil { + s.log.Warn("telemetry payload marshal failed", "name", ev.Name, "error", err) + return + } + rec := sqlitestore.TelemetryEventRecord{ + ID: s.newID(), + OccurredAt: ev.OccurredAt.UTC(), + Name: ev.Name, + Source: ev.Source, + Level: string(ev.Level), + ProjectID: ev.ProjectID, + SessionID: ev.SessionID, + RequestID: ev.RequestID, + PayloadJSON: string(payloadJSON), + } + if err := s.store.CreateTelemetryEvent(context.Background(), rec); err != nil { + s.log.Warn("telemetry local sink write failed", "name", ev.Name, "error", err) + return + } + s.maybePrune() +} + +func (s *LocalSQLiteSink) maybePrune() { + s.pruneMu.Lock() + defer s.pruneMu.Unlock() + now := s.now().UTC() + if !s.lastPrune.IsZero() && now.Sub(s.lastPrune) < localPruneEvery { + return + } + s.lastPrune = now + if _, err := s.store.PruneTelemetryEventsBefore(context.Background(), now.Add(-localRetention), localPruneBatchLimit); err != nil { + s.log.Warn("telemetry local sink prune failed", "error", err) + } +} diff --git a/backend/internal/adapters/telemetry/noop.go b/backend/internal/adapters/telemetry/noop.go new file mode 100644 index 00000000..afd54c76 --- /dev/null +++ b/backend/internal/adapters/telemetry/noop.go @@ -0,0 +1,14 @@ +package telemetry + +import ( + "context" + + "github.com/aoagents/agent-orchestrator/backend/internal/ports" +) + +// NoopSink discards every event. +type NoopSink struct{} + +func (NoopSink) Emit(context.Context, ports.TelemetryEvent) {} + +func (NoopSink) Close(context.Context) error { return nil } diff --git a/backend/internal/config/config.go b/backend/internal/config/config.go index e2a9386c..56a4200e 100644 --- a/backend/internal/config/config.go +++ b/backend/internal/config/config.go @@ -32,8 +32,28 @@ const ( // DefaultAgent is the agent adapter id the daemon wires when AO_AGENT is // unset. It matches the claude-code adapter's manifest id. DefaultAgent = "claude-code" + // DefaultTelemetryPostHogHost is the default PostHog ingestion host when + // remote telemetry is enabled and AO_TELEMETRY_POSTHOG_HOST is unset. + DefaultTelemetryPostHogHost = "https://us.i.posthog.com" ) +// TelemetryRemote selects the remote telemetry exporter. +type TelemetryRemote string + +const ( + TelemetryRemoteOff TelemetryRemote = "off" + TelemetryRemotePostHog TelemetryRemote = "posthog" +) + +// TelemetryConfig controls local and remote telemetry behavior. +type TelemetryConfig struct { + Events bool + Metrics bool + Remote TelemetryRemote + PostHogKey string + PostHogHost string +} + // DefaultAllowedOrigins are the browser origins the daemon's CORS boundary // trusts, beyond loopback-served content (which the middleware always trusts — // local pages can reach the no-auth daemon directly anyway). The daemon has no @@ -70,6 +90,8 @@ type Config struct { // AllowedOrigins are the browser origins granted CORS read access (see // DefaultAllowedOrigins). Overridden by AO_ALLOWED_ORIGINS. AllowedOrigins []string + // Telemetry controls local/remote telemetry sinks. + Telemetry TelemetryConfig } // Addr returns the host:port the HTTP server binds. It uses net.JoinHostPort so @@ -91,6 +113,11 @@ func (c Config) Addr() string { // AO_DATA_DIR durable state dir (default ~/.ao/data) // AO_AGENT agent adapter id (default claude-code) // AO_ALLOWED_ORIGINS CORS origins, comma-separated (default DefaultAllowedOrigins) +// AO_TELEMETRY_EVENTS local event capture off|on (default off) +// AO_TELEMETRY_METRICS local metric capture off|on (default off) +// AO_TELEMETRY_REMOTE remote exporter off|posthog (default off) +// AO_TELEMETRY_POSTHOG_KEY PostHog project key +// AO_TELEMETRY_POSTHOG_HOST PostHog host (default DefaultTelemetryPostHogHost) // // The bind host is not configurable: the daemon is loopback-only by design. func Load() (Config, error) { @@ -101,6 +128,10 @@ func Load() (Config, error) { ShutdownTimeout: DefaultShutdownTimeout, Agent: DefaultAgent, AllowedOrigins: DefaultAllowedOrigins, + Telemetry: TelemetryConfig{ + Remote: TelemetryRemoteOff, + PostHogHost: DefaultTelemetryPostHogHost, + }, } if raw := os.Getenv("AO_PORT"); raw != "" { @@ -153,6 +184,34 @@ func Load() (Config, error) { cfg.AllowedOrigins = origins } + if raw := os.Getenv("AO_TELEMETRY_EVENTS"); raw != "" { + v, err := parseToggleEnv("AO_TELEMETRY_EVENTS", raw) + if err != nil { + return Config{}, err + } + cfg.Telemetry.Events = v + } + if raw := os.Getenv("AO_TELEMETRY_METRICS"); raw != "" { + v, err := parseToggleEnv("AO_TELEMETRY_METRICS", raw) + if err != nil { + return Config{}, err + } + cfg.Telemetry.Metrics = v + } + if raw := os.Getenv("AO_TELEMETRY_REMOTE"); raw != "" { + remote, err := parseTelemetryRemote(raw) + if err != nil { + return Config{}, fmt.Errorf("invalid AO_TELEMETRY_REMOTE %q: %w", raw, err) + } + cfg.Telemetry.Remote = remote + } + if raw := os.Getenv("AO_TELEMETRY_POSTHOG_KEY"); raw != "" { + cfg.Telemetry.PostHogKey = raw + } + if raw := os.Getenv("AO_TELEMETRY_POSTHOG_HOST"); raw != "" { + cfg.Telemetry.PostHogHost = raw + } + runFile, err := resolveRunFilePath() if err != nil { return Config{}, err @@ -168,6 +227,28 @@ func Load() (Config, error) { return cfg, nil } +func parseToggleEnv(name, raw string) (bool, error) { + switch strings.ToLower(strings.TrimSpace(raw)) { + case "on", "true", "1", "yes": + return true, nil + case "off", "false", "0", "no": + return false, nil + default: + return false, fmt.Errorf("%s must be off|on", name) + } +} + +func parseTelemetryRemote(raw string) (TelemetryRemote, error) { + switch TelemetryRemote(strings.ToLower(strings.TrimSpace(raw))) { + case TelemetryRemoteOff: + return TelemetryRemoteOff, nil + case TelemetryRemotePostHog: + return TelemetryRemotePostHog, nil + default: + return "", fmt.Errorf("must be off|posthog") + } +} + // parsePositiveDuration rejects zero and negative durations: a zero // RequestTimeout would expire every request instantly, and a non-positive // ShutdownTimeout would defeat graceful shutdown. diff --git a/backend/internal/config/config_test.go b/backend/internal/config/config_test.go index 4ce22512..d2ef8cb5 100644 --- a/backend/internal/config/config_test.go +++ b/backend/internal/config/config_test.go @@ -10,7 +10,7 @@ import ( func TestLoadDefaults(t *testing.T) { // Clear every recognised var so we observe pure defaults regardless of the // surrounding environment. - for _, k := range []string{"AO_PORT", "AO_REQUEST_TIMEOUT", "AO_SHUTDOWN_TIMEOUT", "AO_RUN_FILE", "AO_DATA_DIR", "AO_AGENT", "AO_ALLOWED_ORIGINS"} { + for _, k := range []string{"AO_PORT", "AO_REQUEST_TIMEOUT", "AO_SHUTDOWN_TIMEOUT", "AO_RUN_FILE", "AO_DATA_DIR", "AO_AGENT", "AO_ALLOWED_ORIGINS", "AO_TELEMETRY_EVENTS", "AO_TELEMETRY_METRICS", "AO_TELEMETRY_REMOTE", "AO_TELEMETRY_POSTHOG_KEY", "AO_TELEMETRY_POSTHOG_HOST"} { t.Setenv(k, "") } @@ -48,6 +48,9 @@ func TestLoadDefaults(t *testing.T) { if cfg.DataDir != wantDataDir { t.Errorf("DataDir = %q, want %q", cfg.DataDir, wantDataDir) } + if cfg.Telemetry.Remote != TelemetryRemoteOff || cfg.Telemetry.PostHogHost != DefaultTelemetryPostHogHost { + t.Fatalf("Telemetry defaults = %+v", cfg.Telemetry) + } } func TestLoadOverrides(t *testing.T) { @@ -56,6 +59,11 @@ func TestLoadOverrides(t *testing.T) { t.Setenv("AO_SHUTDOWN_TIMEOUT", "3s") t.Setenv("AO_RUN_FILE", "/tmp/ao-test-running.json") t.Setenv("AO_DATA_DIR", "/tmp/ao-test-data") + t.Setenv("AO_TELEMETRY_EVENTS", "on") + t.Setenv("AO_TELEMETRY_METRICS", "off") + t.Setenv("AO_TELEMETRY_REMOTE", "posthog") + t.Setenv("AO_TELEMETRY_POSTHOG_KEY", "phc_test") + t.Setenv("AO_TELEMETRY_POSTHOG_HOST", "https://eu.i.posthog.com") cfg, err := Load() if err != nil { @@ -76,6 +84,12 @@ func TestLoadOverrides(t *testing.T) { if cfg.DataDir != "/tmp/ao-test-data" { t.Errorf("DataDir = %q, want /tmp/ao-test-data", cfg.DataDir) } + if !cfg.Telemetry.Events || cfg.Telemetry.Metrics { + t.Fatalf("Telemetry toggles = %+v", cfg.Telemetry) + } + if cfg.Telemetry.Remote != TelemetryRemotePostHog || cfg.Telemetry.PostHogKey != "phc_test" || cfg.Telemetry.PostHogHost != "https://eu.i.posthog.com" { + t.Fatalf("Telemetry remote = %+v", cfg.Telemetry) + } } func TestLoadInvalid(t *testing.T) { @@ -93,6 +107,9 @@ func TestLoadInvalid(t *testing.T) { {"negative shutdown timeout", map[string]string{"AO_SHUTDOWN_TIMEOUT": "-5s"}}, {"null origin", map[string]string{"AO_ALLOWED_ORIGINS": "app://renderer,null"}}, {"wildcard origin", map[string]string{"AO_ALLOWED_ORIGINS": "*"}}, + {"bad telemetry events", map[string]string{"AO_TELEMETRY_EVENTS": "maybe"}}, + {"bad telemetry metrics", map[string]string{"AO_TELEMETRY_METRICS": "maybe"}}, + {"bad telemetry remote", map[string]string{"AO_TELEMETRY_REMOTE": "otlp"}}, } for _, tc := range tests { t.Run(tc.name, func(t *testing.T) { diff --git a/backend/internal/daemon/daemon.go b/backend/internal/daemon/daemon.go index 747f5251..a4d620cb 100644 --- a/backend/internal/daemon/daemon.go +++ b/backend/internal/daemon/daemon.go @@ -10,11 +10,13 @@ import ( "os" "os/signal" "syscall" + "time" "github.com/aoagents/agent-orchestrator/backend/internal/adapters/runtime/zellij" "github.com/aoagents/agent-orchestrator/backend/internal/config" "github.com/aoagents/agent-orchestrator/backend/internal/httpd" "github.com/aoagents/agent-orchestrator/backend/internal/notify" + "github.com/aoagents/agent-orchestrator/backend/internal/ports" "github.com/aoagents/agent-orchestrator/backend/internal/runfile" notificationsvc "github.com/aoagents/agent-orchestrator/backend/internal/service/notification" projectsvc "github.com/aoagents/agent-orchestrator/backend/internal/service/project" @@ -50,6 +52,19 @@ func Run() error { } defer func() { _ = store.Close() }() + telemetrySink := newTelemetrySink(cfg, store, log) + defer func() { _ = telemetrySink.Close(context.Background()) }() + telemetrySink.Emit(context.Background(), ports.TelemetryEvent{ + Name: "ao.daemon.started", + Source: "daemon", + OccurredAt: time.Now().UTC(), + Level: ports.TelemetryLevelInfo, + Payload: map[string]any{ + "port": cfg.Port, + "agent": cfg.Agent, + }, + }) + // signal.NotifyContext cancels ctx on SIGINT/SIGTERM, which drives the // graceful shutdown inside Server.Run and stops the background goroutines. ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM) diff --git a/backend/internal/daemon/telemetry_wiring.go b/backend/internal/daemon/telemetry_wiring.go new file mode 100644 index 00000000..4b936450 --- /dev/null +++ b/backend/internal/daemon/telemetry_wiring.go @@ -0,0 +1,17 @@ +package daemon + +import ( + "log/slog" + + telemetryadapter "github.com/aoagents/agent-orchestrator/backend/internal/adapters/telemetry" + "github.com/aoagents/agent-orchestrator/backend/internal/config" + "github.com/aoagents/agent-orchestrator/backend/internal/ports" + "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite" +) + +func newTelemetrySink(cfg config.Config, store *sqlite.Store, log *slog.Logger) ports.EventSink { + if !cfg.Telemetry.Events && !cfg.Telemetry.Metrics { + return telemetryadapter.NoopSink{} + } + return telemetryadapter.NewLocalSQLiteSink(store, log) +} diff --git a/backend/internal/daemon/telemetry_wiring_test.go b/backend/internal/daemon/telemetry_wiring_test.go new file mode 100644 index 00000000..839dae90 --- /dev/null +++ b/backend/internal/daemon/telemetry_wiring_test.go @@ -0,0 +1,32 @@ +package daemon + +import ( + "log/slog" + "testing" + + telemetryadapter "github.com/aoagents/agent-orchestrator/backend/internal/adapters/telemetry" + "github.com/aoagents/agent-orchestrator/backend/internal/config" + "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite" +) + +func TestNewTelemetrySink_DefaultsToNoopWhenDisabled(t *testing.T) { + sink := newTelemetrySink(config.Config{}, nil, slog.Default()) + if _, ok := sink.(telemetryadapter.NoopSink); !ok { + t.Fatalf("sink type = %T, want telemetry.NoopSink", sink) + } +} + +func TestNewTelemetrySink_UsesLocalSQLiteWhenEnabled(t *testing.T) { + store, err := sqlite.Open(t.TempDir()) + if err != nil { + t.Fatalf("open store: %v", err) + } + t.Cleanup(func() { _ = store.Close() }) + + sink := newTelemetrySink(config.Config{Telemetry: config.TelemetryConfig{Events: true}}, store, slog.Default()) + local, ok := sink.(*telemetryadapter.LocalSQLiteSink) + if !ok { + t.Fatalf("sink type = %T, want *telemetry.LocalSQLiteSink", sink) + } + t.Cleanup(func() { _ = local.Close(t.Context()) }) +} diff --git a/backend/internal/ports/telemetry.go b/backend/internal/ports/telemetry.go new file mode 100644 index 00000000..622376b8 --- /dev/null +++ b/backend/internal/ports/telemetry.go @@ -0,0 +1,40 @@ +package ports + +import ( + "context" + "time" + + "github.com/aoagents/agent-orchestrator/backend/internal/domain" +) + +// TelemetryLevel is the severity of a telemetry event. +type TelemetryLevel string + +const ( + TelemetryLevelDebug TelemetryLevel = "debug" + TelemetryLevelInfo TelemetryLevel = "info" + TelemetryLevelWarn TelemetryLevel = "warn" + TelemetryLevelError TelemetryLevel = "error" +) + +// TelemetryEvent is a structured operational/product event emitted by the +// daemon. Payload must be allowlisted at the call site; sinks may serialize it +// but must not mutate it. +type TelemetryEvent struct { + Name string + Source string + OccurredAt time.Time + Level TelemetryLevel + ProjectID *domain.ProjectID + SessionID *domain.SessionID + RequestID string + Payload map[string]any +} + +// EventSink consumes structured telemetry events. Implementations should be +// best-effort: a slow or failing sink must not break the user action that +// emitted the event. +type EventSink interface { + Emit(ctx context.Context, ev TelemetryEvent) + Close(ctx context.Context) error +} diff --git a/backend/internal/storage/sqlite/gen/models.go b/backend/internal/storage/sqlite/gen/models.go index 589bfed0..f9899869 100644 --- a/backend/internal/storage/sqlite/gen/models.go +++ b/backend/internal/storage/sqlite/gen/models.go @@ -178,6 +178,18 @@ type SessionWorktree struct { State string } +type TelemetryEvent struct { + ID string + OccurredAt time.Time + Name string + Source string + Level string + ProjectID sql.NullString + SessionID sql.NullString + RequestID string + PayloadJson string +} + type WorkspaceRepo struct { ProjectID domain.ProjectID Name string diff --git a/backend/internal/storage/sqlite/gen/telemetry.sql.go b/backend/internal/storage/sqlite/gen/telemetry.sql.go new file mode 100644 index 00000000..ba0c994b --- /dev/null +++ b/backend/internal/storage/sqlite/gen/telemetry.sql.go @@ -0,0 +1,115 @@ +// Code generated by sqlc. DO NOT EDIT. +// versions: +// sqlc v1.31.1 +// source: telemetry.sql + +package gen + +import ( + "context" + "database/sql" + "time" +) + +const createTelemetryEvent = `-- name: CreateTelemetryEvent :exec +INSERT INTO telemetry_event ( + id, occurred_at, name, source, level, project_id, session_id, request_id, payload_json +) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) +` + +type CreateTelemetryEventParams struct { + ID string + OccurredAt time.Time + Name string + Source string + Level string + ProjectID sql.NullString + SessionID sql.NullString + RequestID string + PayloadJson string +} + +func (q *Queries) CreateTelemetryEvent(ctx context.Context, arg CreateTelemetryEventParams) error { + _, err := q.db.ExecContext(ctx, createTelemetryEvent, + arg.ID, + arg.OccurredAt, + arg.Name, + arg.Source, + arg.Level, + arg.ProjectID, + arg.SessionID, + arg.RequestID, + arg.PayloadJson, + ) + return err +} + +const listTelemetryEventsSince = `-- name: ListTelemetryEventsSince :many +SELECT id, occurred_at, name, source, level, project_id, session_id, request_id, payload_json +FROM telemetry_event +WHERE occurred_at >= ? +ORDER BY occurred_at ASC +LIMIT ? +` + +type ListTelemetryEventsSinceParams struct { + OccurredAt time.Time + Limit int64 +} + +func (q *Queries) ListTelemetryEventsSince(ctx context.Context, arg ListTelemetryEventsSinceParams) ([]TelemetryEvent, error) { + rows, err := q.db.QueryContext(ctx, listTelemetryEventsSince, arg.OccurredAt, arg.Limit) + if err != nil { + return nil, err + } + defer rows.Close() + items := []TelemetryEvent{} + for rows.Next() { + var i TelemetryEvent + if err := rows.Scan( + &i.ID, + &i.OccurredAt, + &i.Name, + &i.Source, + &i.Level, + &i.ProjectID, + &i.SessionID, + &i.RequestID, + &i.PayloadJson, + ); err != nil { + return nil, err + } + items = append(items, i) + } + if err := rows.Close(); err != nil { + return nil, err + } + if err := rows.Err(); err != nil { + return nil, err + } + return items, nil +} + +const pruneTelemetryEventsBefore = `-- name: PruneTelemetryEventsBefore :execrows +DELETE FROM telemetry_event +WHERE id IN ( + SELECT te.id + FROM telemetry_event te + WHERE te.occurred_at < ? + ORDER BY te.occurred_at ASC + LIMIT ? +) +` + +type PruneTelemetryEventsBeforeParams struct { + OccurredAt time.Time + Limit int64 +} + +func (q *Queries) PruneTelemetryEventsBefore(ctx context.Context, arg PruneTelemetryEventsBeforeParams) (int64, error) { + result, err := q.db.ExecContext(ctx, pruneTelemetryEventsBefore, arg.OccurredAt, arg.Limit) + if err != nil { + return 0, err + } + return result.RowsAffected() +} diff --git a/backend/internal/storage/sqlite/migrations/0014_telemetry_events.sql b/backend/internal/storage/sqlite/migrations/0014_telemetry_events.sql new file mode 100644 index 00000000..2f240e20 --- /dev/null +++ b/backend/internal/storage/sqlite/migrations/0014_telemetry_events.sql @@ -0,0 +1,35 @@ +-- +goose Up +-- +goose StatementBegin +CREATE TABLE telemetry_event ( + id TEXT PRIMARY KEY, + occurred_at TIMESTAMP NOT NULL, + name TEXT NOT NULL, + source TEXT NOT NULL, + level TEXT NOT NULL CHECK (level IN ('debug', 'info', 'warn', 'error')), + project_id TEXT, + session_id TEXT, + request_id TEXT NOT NULL DEFAULT '', + payload_json TEXT NOT NULL +); + +CREATE INDEX idx_telemetry_event_occurred_at + ON telemetry_event(occurred_at DESC); + +CREATE INDEX idx_telemetry_event_name + ON telemetry_event(name, occurred_at DESC); + +CREATE INDEX idx_telemetry_event_project + ON telemetry_event(project_id, occurred_at DESC); + +CREATE INDEX idx_telemetry_event_session + ON telemetry_event(session_id, occurred_at DESC); +-- +goose StatementEnd + +-- +goose Down +-- +goose StatementBegin +DROP INDEX IF EXISTS idx_telemetry_event_session; +DROP INDEX IF EXISTS idx_telemetry_event_project; +DROP INDEX IF EXISTS idx_telemetry_event_name; +DROP INDEX IF EXISTS idx_telemetry_event_occurred_at; +DROP TABLE IF EXISTS telemetry_event; +-- +goose StatementEnd diff --git a/backend/internal/storage/sqlite/queries/telemetry.sql b/backend/internal/storage/sqlite/queries/telemetry.sql new file mode 100644 index 00000000..cb437017 --- /dev/null +++ b/backend/internal/storage/sqlite/queries/telemetry.sql @@ -0,0 +1,21 @@ +-- name: CreateTelemetryEvent :exec +INSERT INTO telemetry_event ( + id, occurred_at, name, source, level, project_id, session_id, request_id, payload_json +) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?); + +-- name: ListTelemetryEventsSince :many +SELECT id, occurred_at, name, source, level, project_id, session_id, request_id, payload_json +FROM telemetry_event +WHERE occurred_at >= ? +ORDER BY occurred_at ASC +LIMIT ?; + +-- name: PruneTelemetryEventsBefore :execrows +DELETE FROM telemetry_event +WHERE id IN ( + SELECT te.id + FROM telemetry_event te + WHERE te.occurred_at < ? + ORDER BY te.occurred_at ASC + LIMIT ? +); diff --git a/backend/internal/storage/sqlite/store/telemetry_store.go b/backend/internal/storage/sqlite/store/telemetry_store.go new file mode 100644 index 00000000..711a2fc0 --- /dev/null +++ b/backend/internal/storage/sqlite/store/telemetry_store.go @@ -0,0 +1,73 @@ +package store + +import ( + "context" + "database/sql" + "fmt" + "time" + + "github.com/aoagents/agent-orchestrator/backend/internal/domain" + "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite/gen" +) + +// TelemetryEventRecord is the store-facing representation of a telemetry row. +type TelemetryEventRecord struct { + ID string + OccurredAt time.Time + Name string + Source string + Level string + ProjectID *domain.ProjectID + SessionID *domain.SessionID + RequestID string + PayloadJSON string +} + +// CreateTelemetryEvent persists one telemetry event row. +func (s *Store) CreateTelemetryEvent(ctx context.Context, rec TelemetryEventRecord) error { + arg := gen.CreateTelemetryEventParams{ + ID: rec.ID, + OccurredAt: rec.OccurredAt.UTC(), + Name: rec.Name, + Source: rec.Source, + Level: rec.Level, + RequestID: rec.RequestID, + PayloadJson: rec.PayloadJSON, + } + if rec.ProjectID != nil { + arg.ProjectID = sql.NullString{String: string(*rec.ProjectID), Valid: true} + } + if rec.SessionID != nil { + arg.SessionID = sql.NullString{String: string(*rec.SessionID), Valid: true} + } + if err := s.qw.CreateTelemetryEvent(ctx, arg); err != nil { + return fmt.Errorf("create telemetry event %s: %w", rec.ID, err) + } + return nil +} + +// ListTelemetryEventsSince returns telemetry rows oldest-first from a time +// boundary, capped by limit. +func (s *Store) ListTelemetryEventsSince(ctx context.Context, since time.Time, limit int64) ([]gen.TelemetryEvent, error) { + rows, err := s.qr.ListTelemetryEventsSince(ctx, gen.ListTelemetryEventsSinceParams{ + OccurredAt: since.UTC(), + Limit: limit, + }) + if err != nil { + return nil, fmt.Errorf("list telemetry events since %s: %w", since.UTC().Format(time.RFC3339), err) + } + return rows, nil +} + +// PruneTelemetryEventsBefore deletes at most limit rows older than before and +// returns how many rows were removed. +func (s *Store) PruneTelemetryEventsBefore(ctx context.Context, before time.Time, limit int64) (int64, error) { + n, err := s.qw.PruneTelemetryEventsBefore(ctx, gen.PruneTelemetryEventsBeforeParams{ + OccurredAt: before.UTC(), + Limit: limit, + }) + if err != nil { + return 0, fmt.Errorf("prune telemetry events before %s: %w", before.UTC().Format(time.RFC3339), err) + } + return n, nil +} diff --git a/backend/internal/storage/sqlite/store/telemetry_store_test.go b/backend/internal/storage/sqlite/store/telemetry_store_test.go new file mode 100644 index 00000000..15491629 --- /dev/null +++ b/backend/internal/storage/sqlite/store/telemetry_store_test.go @@ -0,0 +1,70 @@ +package store_test + +import ( + "context" + "testing" + "time" + + "github.com/aoagents/agent-orchestrator/backend/internal/domain" + sqlitestore "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite/store" +) + +func TestTelemetryStore_CreateListAndPrune(t *testing.T) { + s := newTestStore(t) + ctx := context.Background() + + projectID := domain.ProjectID("mer") + sessionID := domain.SessionID("mer-1") + seedProject(t, s, string(projectID)) + + oldAt := time.Now().UTC().Add(-31 * 24 * time.Hour).Truncate(time.Second) + newAt := time.Now().UTC().Truncate(time.Second) + + if err := s.CreateTelemetryEvent(ctx, telemetryRecord("tev_old", oldAt, &projectID, &sessionID)); err != nil { + t.Fatalf("CreateTelemetryEvent old: %v", err) + } + if err := s.CreateTelemetryEvent(ctx, telemetryRecord("tev_new", newAt, &projectID, &sessionID)); err != nil { + t.Fatalf("CreateTelemetryEvent new: %v", err) + } + + rows, err := s.ListTelemetryEventsSince(ctx, oldAt.Add(-time.Second), 10) + if err != nil { + t.Fatalf("ListTelemetryEventsSince: %v", err) + } + if len(rows) != 2 { + t.Fatalf("rows = %d, want 2", len(rows)) + } + if rows[0].ID != "tev_old" || rows[1].ID != "tev_new" { + t.Fatalf("ids = %q, %q", rows[0].ID, rows[1].ID) + } + + n, err := s.PruneTelemetryEventsBefore(ctx, newAt.Add(-24*time.Hour), 100) + if err != nil { + t.Fatalf("PruneTelemetryEventsBefore: %v", err) + } + if n != 1 { + t.Fatalf("pruned = %d, want 1", n) + } + + rows, err = s.ListTelemetryEventsSince(ctx, oldAt.Add(-time.Second), 10) + if err != nil { + t.Fatalf("ListTelemetryEventsSince after prune: %v", err) + } + if len(rows) != 1 || rows[0].ID != "tev_new" { + t.Fatalf("remaining rows = %+v", rows) + } +} + +func telemetryRecord(id string, at time.Time, projectID *domain.ProjectID, sessionID *domain.SessionID) sqlitestore.TelemetryEventRecord { + return sqlitestore.TelemetryEventRecord{ + ID: id, + OccurredAt: at, + Name: "ao.daemon.started", + Source: "daemon", + Level: "info", + ProjectID: projectID, + SessionID: sessionID, + RequestID: "req_123", + PayloadJSON: `{"port":3001}`, + } +} From 1107b2907571b1cd9d2aef79e17a10ec6426f240 Mon Sep 17 00:00:00 2001 From: laxmanclo Date: Wed, 17 Jun 2026 13:54:04 +0530 Subject: [PATCH 2/8] feat(backend): emit session telemetry events --- backend/internal/daemon/daemon.go | 2 +- backend/internal/daemon/lifecycle_wiring.go | 3 +- backend/internal/daemon/wiring_test.go | 3 +- backend/internal/service/session/service.go | 45 ++++++++++++- .../internal/service/session/service_test.go | 67 ++++++++++++++++++- 5 files changed, 115 insertions(+), 5 deletions(-) diff --git a/backend/internal/daemon/daemon.go b/backend/internal/daemon/daemon.go index a4d620cb..753c12f0 100644 --- a/backend/internal/daemon/daemon.go +++ b/backend/internal/daemon/daemon.go @@ -112,7 +112,7 @@ func Run() error { // zellij runtime, a gitworktree workspace, the per-session agent resolver // (AO_AGENT default, validated here), and the agent messenger, then mount it // on the API. - sessionSvc, reviewSvc, err := startSession(cfg, runtimeAdapter, store, lcStack.LCM, messenger, log) + sessionSvc, reviewSvc, err := startSession(cfg, runtimeAdapter, store, lcStack.LCM, messenger, telemetrySink, log) if err != nil { stop() lcStack.Stop() diff --git a/backend/internal/daemon/lifecycle_wiring.go b/backend/internal/daemon/lifecycle_wiring.go index a66d3dea..76195515 100644 --- a/backend/internal/daemon/lifecycle_wiring.go +++ b/backend/internal/daemon/lifecycle_wiring.go @@ -62,7 +62,7 @@ func (l *lifecycleStack) Stop() { // over the real zellij runtime, a per-session gitworktree workspace, the shared // store + LCM, the per-session agent resolver (AO_AGENT default), and the // agent messenger. The returned service is mounted at httpd APIDeps.Sessions. -func startSession(cfg config.Config, runtime *zellij.Runtime, store *sqlite.Store, lcm *lifecycle.Manager, messenger ports.AgentMessenger, log *slog.Logger) (*sessionsvc.Service, reviewsvc.Manager, error) { +func startSession(cfg config.Config, runtime *zellij.Runtime, store *sqlite.Store, lcm *lifecycle.Manager, messenger ports.AgentMessenger, telemetry ports.EventSink, log *slog.Logger) (*sessionsvc.Service, reviewsvc.Manager, error) { // Resolve the default agent once and share it with both the resolver (which // launches it for an unspecified harness) and the session manager (which // persists it onto the seed row), so the stored harness matches what runs. @@ -106,6 +106,7 @@ func startSession(cfg config.Config, runtime *zellij.Runtime, store *sqlite.Stor Store: store, PRClaimer: store, SCM: scmProvider, + Telemetry: telemetry, // no_signal only makes sense for harnesses whose adapters install // activity hooks; the deriver registry is the source of truth for that. SignalCapable: activitydispatch.SupportsHarness, diff --git a/backend/internal/daemon/wiring_test.go b/backend/internal/daemon/wiring_test.go index 36e67344..0722b819 100644 --- a/backend/internal/daemon/wiring_test.go +++ b/backend/internal/daemon/wiring_test.go @@ -11,6 +11,7 @@ import ( "github.com/aoagents/agent-orchestrator/backend/internal/adapters" "github.com/aoagents/agent-orchestrator/backend/internal/adapters/runtime/zellij" + telemetryadapter "github.com/aoagents/agent-orchestrator/backend/internal/adapters/telemetry" "github.com/aoagents/agent-orchestrator/backend/internal/cdc" "github.com/aoagents/agent-orchestrator/backend/internal/config" "github.com/aoagents/agent-orchestrator/backend/internal/domain" @@ -149,7 +150,7 @@ func TestWiring_StartSessionBuildsSessionService(t *testing.T) { runtime := zellij.New(zellij.Options{}) messenger := newSessionMessenger(store, runtime, log) - svc, reviewSvc, err := startSession(cfg, runtime, store, lcm, messenger, log) + svc, reviewSvc, err := startSession(cfg, runtime, store, lcm, messenger, telemetryadapter.NoopSink{}, log) if err != nil { t.Fatalf("startSession: %v", err) } diff --git a/backend/internal/service/session/service.go b/backend/internal/service/session/service.go index 00bd6d90..d10875a2 100644 --- a/backend/internal/service/session/service.go +++ b/backend/internal/service/session/service.go @@ -80,6 +80,7 @@ type Service struct { prClaimer ports.PRClaimer scm scmProvider clock func() time.Time + telemetry ports.EventSink // signalCapable reports whether a harness has a hook pipeline that can // deliver activity signals at all. Only capable harnesses are eligible for // the no_signal downgrade — a hook-less harness staying silent forever is @@ -101,6 +102,7 @@ type Deps struct { PRClaimer ports.PRClaimer SCM scmProvider Clock func() time.Time + Telemetry ports.EventSink // SignalCapable gates the no_signal status downgrade per harness; daemon // wiring passes activitydispatch.SupportsHarness. Left nil, no session is // ever downgraded to no_signal. @@ -109,7 +111,7 @@ type Deps struct { // NewWithDeps wires a session service with optional PR-claim dependencies. func NewWithDeps(d Deps) *Service { - s := &Service{manager: d.Manager, store: d.Store, prClaimer: d.PRClaimer, scm: d.SCM, clock: d.Clock, signalCapable: d.SignalCapable} + s := &Service{manager: d.Manager, store: d.Store, prClaimer: d.PRClaimer, scm: d.SCM, clock: d.Clock, signalCapable: d.SignalCapable, telemetry: d.Telemetry} if s.prClaimer == nil { if w, ok := d.Store.(ports.PRClaimer); ok { s.prClaimer = w @@ -128,8 +130,10 @@ func (s *Service) Spawn(ctx context.Context, cfg ports.SpawnConfig) (domain.Sess } rec, err := s.manager.Spawn(ctx, cfg) if err != nil { + s.emitSpawnFailed(cfg, err) return domain.Session{}, toAPIError(err) } + s.emitSpawned(rec) return s.toSession(ctx, rec) } @@ -153,6 +157,45 @@ func (s *Service) requireProject(ctx context.Context, id domain.ProjectID) error return nil } +func (s *Service) emitSpawned(rec domain.SessionRecord) { + if s.telemetry == nil { + return + } + projectID := rec.ProjectID + sessionID := rec.ID + s.telemetry.Emit(context.Background(), ports.TelemetryEvent{ + Name: "ao.session.spawned", + Source: "session_service", + OccurredAt: s.clock().UTC(), + Level: ports.TelemetryLevelInfo, + ProjectID: &projectID, + SessionID: &sessionID, + Payload: map[string]any{ + "kind": string(rec.Kind), + "harness": string(rec.Harness), + }, + }) +} + +func (s *Service) emitSpawnFailed(cfg ports.SpawnConfig, err error) { + if s.telemetry == nil { + return + } + projectID := cfg.ProjectID + s.telemetry.Emit(context.Background(), ports.TelemetryEvent{ + Name: "ao.session.spawn_failed", + Source: "session_service", + OccurredAt: s.clock().UTC(), + Level: ports.TelemetryLevelError, + ProjectID: &projectID, + Payload: map[string]any{ + "kind": string(cfg.Kind), + "harness": string(cfg.Harness), + "error": err.Error(), + }, + }) +} + // SpawnOrchestrator spawns an orchestrator session for a project. When clean is // true it first tears down any active orchestrator(s) for that project so the new // one is the only live coordinator — a business rule that belongs here, not in the diff --git a/backend/internal/service/session/service_test.go b/backend/internal/service/session/service_test.go index 51fef9ce..180afd92 100644 --- a/backend/internal/service/session/service_test.go +++ b/backend/internal/service/session/service_test.go @@ -13,6 +13,13 @@ import ( sessionmanager "github.com/aoagents/agent-orchestrator/backend/internal/session_manager" ) +type fakeTelemetrySink struct{ events []ports.TelemetryEvent } + +func (f *fakeTelemetrySink) Emit(_ context.Context, ev ports.TelemetryEvent) { + f.events = append(f.events, ev) +} +func (f *fakeTelemetrySink) Close(context.Context) error { return nil } + type fakeStore struct { sessions map[domain.SessionID]domain.SessionRecord pr map[domain.SessionID]domain.PRFacts @@ -131,14 +138,18 @@ type fakeCommander struct { cleanupProjects []domain.ProjectID killErr error cleanupErr error + spawnErr error spawned bool killsAtSpawn int } func (f *fakeCommander) Spawn(_ context.Context, cfg ports.SpawnConfig) (domain.SessionRecord, error) { + if f.spawnErr != nil { + return domain.SessionRecord{}, f.spawnErr + } f.spawned = true f.killsAtSpawn = len(f.killed) - return domain.SessionRecord{ID: "mer-9", ProjectID: cfg.ProjectID, Kind: cfg.Kind}, nil + return domain.SessionRecord{ID: "mer-9", ProjectID: cfg.ProjectID, Kind: cfg.Kind, Harness: cfg.Harness}, nil } func (f *fakeCommander) Restore(context.Context, domain.SessionID) (domain.SessionRecord, error) { return domain.SessionRecord{}, nil @@ -260,6 +271,60 @@ func TestSpawnUnknownProjectReturns404(t *testing.T) { } } +func TestSpawnEmitsTelemetryOnSuccess(t *testing.T) { + st := newFakeStore() + st.projects["mer"] = domain.ProjectRecord{ID: "mer"} + fc := &fakeCommander{} + ts := &fakeTelemetrySink{} + svc := NewWithDeps(Deps{Manager: fc, Store: st, Telemetry: ts, Clock: func() time.Time { return time.Unix(1700000000, 0).UTC() }}) + + _, err := svc.Spawn(context.Background(), ports.SpawnConfig{ + ProjectID: "mer", + Kind: domain.KindWorker, + Harness: domain.HarnessCodex, + }) + if err != nil { + t.Fatalf("Spawn: %v", err) + } + if len(ts.events) != 1 { + t.Fatalf("telemetry events = %d, want 1", len(ts.events)) + } + ev := ts.events[0] + if ev.Name != "ao.session.spawned" || ev.Source != "session_service" { + t.Fatalf("event = %+v", ev) + } + if ev.ProjectID == nil || *ev.ProjectID != "mer" || ev.SessionID == nil || *ev.SessionID != "mer-9" { + t.Fatalf("event ids = %+v", ev) + } +} + +func TestSpawnEmitsTelemetryOnFailure(t *testing.T) { + st := newFakeStore() + st.projects["mer"] = domain.ProjectRecord{ID: "mer"} + fc := &fakeCommander{spawnErr: errors.New("boom")} + ts := &fakeTelemetrySink{} + svc := NewWithDeps(Deps{Manager: fc, Store: st, Telemetry: ts, Clock: func() time.Time { return time.Unix(1700000000, 0).UTC() }}) + + _, err := svc.Spawn(context.Background(), ports.SpawnConfig{ + ProjectID: "mer", + Kind: domain.KindWorker, + Harness: domain.HarnessCodex, + }) + if err == nil { + t.Fatal("Spawn error = nil, want failure") + } + if len(ts.events) != 1 { + t.Fatalf("telemetry events = %d, want 1", len(ts.events)) + } + ev := ts.events[0] + if ev.Name != "ao.session.spawn_failed" || ev.Source != "session_service" || ev.Level != ports.TelemetryLevelError { + t.Fatalf("event = %+v", ev) + } + if ev.ProjectID == nil || *ev.ProjectID != "mer" || ev.SessionID != nil { + t.Fatalf("event ids = %+v", ev) + } +} + // TestSpawnOrchestratorUnknownProjectReturns404 is the orchestrator-side guard // for Bug 1: same pre-validation, same typed envelope. func TestSpawnOrchestratorUnknownProjectReturns404(t *testing.T) { From 887c190d70021c9c1ffbc58ca04d1c57390f987b Mon Sep 17 00:00:00 2001 From: laxmanclo Date: Wed, 17 Jun 2026 16:45:06 +0530 Subject: [PATCH 3/8] feat(backend): add http and cli telemetry export --- backend/internal/adapters/telemetry/fanout.go | 39 ++++ .../internal/adapters/telemetry/posthog.go | 177 ++++++++++++++++++ .../adapters/telemetry/posthog_test.go | 75 ++++++++ backend/internal/cli/client.go | 10 +- backend/internal/cli/root.go | 25 +++ backend/internal/cli/root_test.go | 93 +++++++++ backend/internal/cli/start.go | 1 + backend/internal/daemon/daemon.go | 1 + backend/internal/daemon/telemetry_wiring.go | 11 +- .../internal/daemon/telemetry_wiring_test.go | 29 ++- backend/internal/httpd/api.go | 2 + backend/internal/httpd/log.go | 22 ++- backend/internal/httpd/log_test.go | 21 ++- backend/internal/httpd/recover.go | 62 ++++++ backend/internal/httpd/router.go | 57 +++++- backend/internal/httpd/telemetry_test.go | 83 ++++++++ 16 files changed, 698 insertions(+), 10 deletions(-) create mode 100644 backend/internal/adapters/telemetry/fanout.go create mode 100644 backend/internal/adapters/telemetry/posthog.go create mode 100644 backend/internal/adapters/telemetry/posthog_test.go create mode 100644 backend/internal/httpd/recover.go create mode 100644 backend/internal/httpd/telemetry_test.go diff --git a/backend/internal/adapters/telemetry/fanout.go b/backend/internal/adapters/telemetry/fanout.go new file mode 100644 index 00000000..52e4b8d7 --- /dev/null +++ b/backend/internal/adapters/telemetry/fanout.go @@ -0,0 +1,39 @@ +package telemetry + +import ( + "context" + "errors" + + "github.com/aoagents/agent-orchestrator/backend/internal/ports" +) + +// FanoutSink emits each event to multiple sinks. +type FanoutSink struct { + sinks []ports.EventSink +} + +func NewFanoutSink(sinks ...ports.EventSink) *FanoutSink { + filtered := make([]ports.EventSink, 0, len(sinks)) + for _, sink := range sinks { + if sink != nil { + filtered = append(filtered, sink) + } + } + return &FanoutSink{sinks: filtered} +} + +func (s *FanoutSink) Emit(ctx context.Context, ev ports.TelemetryEvent) { + for _, sink := range s.sinks { + sink.Emit(ctx, ev) + } +} + +func (s *FanoutSink) Close(ctx context.Context) error { + var errs []error + for _, sink := range s.sinks { + if err := sink.Close(ctx); err != nil { + errs = append(errs, err) + } + } + return errors.Join(errs...) +} diff --git a/backend/internal/adapters/telemetry/posthog.go b/backend/internal/adapters/telemetry/posthog.go new file mode 100644 index 00000000..2b98d51b --- /dev/null +++ b/backend/internal/adapters/telemetry/posthog.go @@ -0,0 +1,177 @@ +package telemetry + +import ( + "bytes" + "context" + "crypto/sha256" + "encoding/hex" + "encoding/json" + "fmt" + "io" + "log/slog" + "net/http" + "os" + "path/filepath" + "strings" + "sync" + "time" + + "github.com/google/uuid" + + "github.com/aoagents/agent-orchestrator/backend/internal/ports" +) + +const postHogBufferSize = 128 + +type postHogClient interface { + Do(req *http.Request) (*http.Response, error) +} + +// PostHogSink exports allowlisted telemetry events to PostHog. +type PostHogSink struct { + apiKey string + host string + distinctID string + client postHogClient + log *slog.Logger + ch chan ports.TelemetryEvent + wg sync.WaitGroup + closeOnce sync.Once +} + +func NewPostHogSink(dataDir, apiKey, host string, client postHogClient, log *slog.Logger) (*PostHogSink, error) { + if strings.TrimSpace(apiKey) == "" { + return nil, fmt.Errorf("posthog api key is required") + } + if strings.TrimSpace(host) == "" { + return nil, fmt.Errorf("posthog host is required") + } + if client == nil { + client = &http.Client{Timeout: 5 * time.Second} + } + distinctID, err := loadOrCreateInstallID(dataDir) + if err != nil { + return nil, err + } + s := &PostHogSink{ + apiKey: apiKey, + host: strings.TrimRight(host, "/"), + distinctID: distinctID, + client: client, + log: telemetryLogger(log), + ch: make(chan ports.TelemetryEvent, postHogBufferSize), + } + s.wg.Add(1) + go s.loop() + return s, nil +} + +func (s *PostHogSink) Emit(_ context.Context, ev ports.TelemetryEvent) { + select { + case s.ch <- ev: + default: + s.log.Warn("telemetry posthog sink buffer full; dropping event", "name", ev.Name, "source", ev.Source) + } +} + +func (s *PostHogSink) Close(ctx context.Context) error { + s.closeOnce.Do(func() { close(s.ch) }) + done := make(chan struct{}) + go func() { + defer close(done) + s.wg.Wait() + }() + select { + case <-ctx.Done(): + return ctx.Err() + case <-done: + return nil + } +} + +func (s *PostHogSink) loop() { + defer s.wg.Done() + for ev := range s.ch { + s.send(ev) + } +} + +func (s *PostHogSink) send(ev ports.TelemetryEvent) { + body := map[string]any{ + "api_key": s.apiKey, + "event": ev.Name, + "distinct_id": s.distinctID, + "properties": s.properties(ev), + "timestamp": ev.OccurredAt.UTC().Format(time.RFC3339Nano), + } + payload, err := json.Marshal(body) + if err != nil { + s.log.Warn("telemetry posthog payload marshal failed", "name", ev.Name, "error", err) + return + } + req, err := http.NewRequestWithContext(context.Background(), http.MethodPost, s.host+"/capture/", bytes.NewReader(payload)) + if err != nil { + s.log.Warn("telemetry posthog request build failed", "name", ev.Name, "error", err) + return + } + req.Header.Set("Content-Type", "application/json") + + resp, err := s.client.Do(req) + if err != nil { + s.log.Warn("telemetry posthog export failed", "name", ev.Name, "error", err) + return + } + defer func() { _ = resp.Body.Close() }() + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + b, _ := io.ReadAll(io.LimitReader(resp.Body, 2048)) + s.log.Warn("telemetry posthog rejected event", "name", ev.Name, "status", resp.StatusCode, "body", strings.TrimSpace(string(b))) + } +} + +func (s *PostHogSink) properties(ev ports.TelemetryEvent) map[string]any { + props := map[string]any{ + "source": ev.Source, + "level": string(ev.Level), + } + if ev.RequestID != "" { + props["request_id"] = ev.RequestID + } + if ev.ProjectID != nil { + props["project_id_hash"] = sha256String(string(*ev.ProjectID)) + } + if ev.SessionID != nil { + props["session_id_hash"] = sha256String(string(*ev.SessionID)) + } + for k, v := range ev.Payload { + props[k] = v + } + return props +} + +func loadOrCreateInstallID(dataDir string) (string, error) { + path := filepath.Join(dataDir, "telemetry_install_id") + if b, err := os.ReadFile(path); err == nil { + if id := strings.TrimSpace(string(b)); id != "" { + return id, nil + } + } else if !os.IsNotExist(err) { + return "", fmt.Errorf("read telemetry install id: %w", err) + } + id := "ins_" + uuid.NewString() + if err := os.WriteFile(path, []byte(id+"\n"), 0o600); err != nil { + return "", fmt.Errorf("write telemetry install id: %w", err) + } + return id, nil +} + +func sha256String(raw string) string { + sum := sha256.Sum256([]byte(raw)) + return hex.EncodeToString(sum[:]) +} + +func telemetryLogger(log *slog.Logger) *slog.Logger { + if log != nil { + return log + } + return slog.Default() +} diff --git a/backend/internal/adapters/telemetry/posthog_test.go b/backend/internal/adapters/telemetry/posthog_test.go new file mode 100644 index 00000000..67e238ab --- /dev/null +++ b/backend/internal/adapters/telemetry/posthog_test.go @@ -0,0 +1,75 @@ +package telemetry + +import ( + "context" + "encoding/json" + "net/http" + "testing" + "time" + + "github.com/aoagents/agent-orchestrator/backend/internal/domain" + "github.com/aoagents/agent-orchestrator/backend/internal/ports" +) + +func TestPostHogSinkCapturesEvent(t *testing.T) { + requests := make(chan map[string]any, 1) + sink, err := NewPostHogSink(t.TempDir(), "phc_test", "https://us.i.posthog.com", roundTripClient(func(req *http.Request) (*http.Response, error) { + defer req.Body.Close() + var body map[string]any + if err := json.NewDecoder(req.Body).Decode(&body); err != nil { + return nil, err + } + requests <- body + return &http.Response{ + StatusCode: http.StatusOK, + Header: make(http.Header), + Body: http.NoBody, + }, nil + }), nil) + if err != nil { + t.Fatalf("NewPostHogSink: %v", err) + } + + projectID := domain.ProjectID("proj-1") + sessionID := domain.SessionID("sess-1") + sink.Emit(context.Background(), ports.TelemetryEvent{ + Name: "ao.session.spawned", + Source: "session_service", + OccurredAt: time.Unix(1700000000, 0).UTC(), + Level: ports.TelemetryLevelInfo, + ProjectID: &projectID, + SessionID: &sessionID, + RequestID: "req-1", + Payload: map[string]any{ + "kind": "worker", + }, + }) + if err := sink.Close(context.Background()); err != nil { + t.Fatalf("Close: %v", err) + } + + select { + case req := <-requests: + if got := req["event"]; got != "ao.session.spawned" { + t.Fatalf("event = %#v, want ao.session.spawned", got) + } + props, ok := req["properties"].(map[string]any) + if !ok { + t.Fatalf("properties type = %T, want map[string]any", req["properties"]) + } + if props["kind"] != "worker" { + t.Fatalf("properties.kind = %#v, want worker", props["kind"]) + } + if props["project_id_hash"] == "" || props["session_id_hash"] == "" { + t.Fatalf("hashed ids missing from properties: %#v", props) + } + case <-time.After(2 * time.Second): + t.Fatal("PostHog sink did not send request") + } +} + +type roundTripClient func(*http.Request) (*http.Response, error) + +func (f roundTripClient) Do(req *http.Request) (*http.Response, error) { return f(req) } + +var _ postHogClient = roundTripClient(nil) diff --git a/backend/internal/cli/client.go b/backend/internal/cli/client.go index 4c8ddff2..2f5ca9df 100644 --- a/backend/internal/cli/client.go +++ b/backend/internal/cli/client.go @@ -75,6 +75,14 @@ func (c *commandContext) deleteJSON(ctx context.Context, path string, out any) e } func (c *commandContext) doJSON(ctx context.Context, method, path string, body, out any) error { + return c.doJSONPath(ctx, method, "/api/v1/"+path, body, out) +} + +func (c *commandContext) postLoopbackJSON(ctx context.Context, path string, body any) error { + return c.doJSONPath(ctx, http.MethodPost, path, body, nil) +} + +func (c *commandContext) doJSONPath(ctx context.Context, method, path string, body, out any) error { cfg, err := config.Load() if err != nil { return err @@ -98,7 +106,7 @@ func (c *commandContext) doJSON(ctx context.Context, method, path string, body, } reader = bytes.NewReader(payload) } - url := fmt.Sprintf("http://%s:%d/api/v1/%s", config.LoopbackHost, info.Port, path) + url := fmt.Sprintf("http://%s:%d%s", config.LoopbackHost, info.Port, path) req, err := http.NewRequestWithContext(ctx, method, url, reader) // #nosec G704 -- daemon host is fixed loopback; path is an internal API route. if err != nil { return err diff --git a/backend/internal/cli/root.go b/backend/internal/cli/root.go index f536459c..942f8f2b 100644 --- a/backend/internal/cli/root.go +++ b/backend/internal/cli/root.go @@ -9,6 +9,7 @@ import ( "net/http" "os" "os/exec" + "strings" "time" "github.com/spf13/cobra" @@ -149,6 +150,12 @@ func NewRootCommand(deps Deps) *cobra.Command { Version: VersionString(), SilenceUsage: true, SilenceErrors: true, + PersistentPreRunE: func(cmd *cobra.Command, _ []string) error { + if shouldEmitCLIInvocation(cmd) { + ctx.emitCLIInvoked(cmd.Context(), cmd) + } + return nil + }, } root.SetIn(deps.In) root.SetOut(deps.Out) @@ -182,6 +189,24 @@ type commandContext struct { deps Deps } +func shouldEmitCLIInvocation(cmd *cobra.Command) bool { + switch strings.TrimSpace(cmd.CommandPath()) { + case "ao daemon", "ao start", "ao completion", "ao help": + return false + default: + return true + } +} + +func (c *commandContext) emitCLIInvoked(ctx context.Context, cmd *cobra.Command) { + reqCtx, cancel := context.WithTimeout(ctx, probeTimeout) + defer cancel() + _ = c.postLoopbackJSON(reqCtx, "/internal/telemetry/cli-invoked", map[string]string{ + "command": cmd.Name(), + "commandPath": cmd.CommandPath(), + }) +} + func noArgs(cmd *cobra.Command, args []string) error { if err := cobra.ExactArgs(0)(cmd, args); err != nil { return usageError{err} diff --git a/backend/internal/cli/root_test.go b/backend/internal/cli/root_test.go index f9576cb9..27416797 100644 --- a/backend/internal/cli/root_test.go +++ b/backend/internal/cli/root_test.go @@ -3,6 +3,7 @@ package cli import ( "bytes" "fmt" + "io" "net" "net/http" "net/http/httptest" @@ -55,6 +56,35 @@ func TestCommandsRejectUnexpectedArgs(t *testing.T) { } } +func TestVersionEmitsCLIInvocationBestEffort(t *testing.T) { + cfg := setConfigEnv(t) + called := make(chan string, 1) + if err := runfile.Write(cfg.runFile, runfile.Info{PID: os.Getpid(), Port: 3001, StartedAt: time.Unix(100, 0).UTC()}); err != nil { + t.Fatal(err) + } + + if _, _, err := executeCLI(t, Deps{ + HTTPClient: &http.Client{Transport: roundTripFunc(func(req *http.Request) (*http.Response, error) { + if req.URL.Path == "/internal/telemetry/cli-invoked" { + called <- req.URL.Path + return jsonResponse(http.StatusAccepted, ""), nil + } + return jsonResponse(http.StatusNotFound, ""), nil + })}, + ProcessAlive: func(pid int) bool { return pid == os.Getpid() }, + }, "version"); err != nil { + t.Fatal(err) + } + select { + case path := <-called: + if path != "/internal/telemetry/cli-invoked" { + t.Fatalf("telemetry path = %q, want /internal/telemetry/cli-invoked", path) + } + default: + t.Fatal("version did not emit CLI invocation") + } +} + func TestStatusStoppedJSON(t *testing.T) { setConfigEnv(t) @@ -160,6 +190,54 @@ func TestStartClearsStaleRunFileBeforeSpawning(t *testing.T) { } } +func TestStartEmitsCLIInvocationAfterReady(t *testing.T) { + cfg := setConfigEnv(t) + var spawned atomic.Bool + called := make(chan string, 1) + port := 3001 + out, _, err := executeCLI(t, Deps{ + HTTPClient: &http.Client{Transport: roundTripFunc(func(req *http.Request) (*http.Response, error) { + switch req.URL.Path { + case "/healthz": + if !spawned.Load() { + return jsonResponse(http.StatusOK, `{"status":"ok","service":"not-ao","pid":4242}`), nil + } + return jsonResponse(http.StatusOK, fmt.Sprintf(`{"status":"ok","service":%q,"pid":%d}`, daemonmeta.ServiceName, os.Getpid())), nil + case "/readyz": + if !spawned.Load() { + return jsonResponse(http.StatusOK, `{"status":"not_ready","service":"not-ao","pid":4242}`), nil + } + return jsonResponse(http.StatusOK, fmt.Sprintf(`{"status":"ready","service":%q,"pid":%d}`, daemonmeta.ServiceName, os.Getpid())), nil + case "/internal/telemetry/cli-invoked": + called <- req.URL.Path + return jsonResponse(http.StatusAccepted, ""), nil + default: + return jsonResponse(http.StatusNotFound, ""), nil + } + })}, + ProcessAlive: func(pid int) bool { return pid == os.Getpid() }, + StartProcess: func(processStartConfig) error { + spawned.Store(true) + return runfile.Write(cfg.runFile, runfile.Info{PID: os.Getpid(), Port: port, StartedAt: time.Unix(110, 0).UTC()}) + }, + Now: func() time.Time { return time.Unix(120, 0).UTC() }, + }, "start", "--json") + if err != nil { + t.Fatal(err) + } + if !strings.Contains(out, `"state": "ready"`) { + t.Fatalf("start did not report ready:\n%s", out) + } + select { + case path := <-called: + if path != "/internal/telemetry/cli-invoked" { + t.Fatalf("telemetry path = %q, want /internal/telemetry/cli-invoked", path) + } + default: + t.Fatal("start did not emit CLI invocation after readiness") + } +} + func TestStopRemovesStaleRunFile(t *testing.T) { cfg := setConfigEnv(t) if err := runfile.Write(cfg.runFile, runfile.Info{PID: 999999, Port: 3001, StartedAt: time.Unix(100, 0).UTC()}); err != nil { @@ -404,3 +482,18 @@ func closedPort(t *testing.T) int { } return port } + +type roundTripFunc func(*http.Request) (*http.Response, error) + +func (f roundTripFunc) RoundTrip(req *http.Request) (*http.Response, error) { return f(req) } + +func jsonResponse(status int, body string) *http.Response { + if body == "" { + body = "{}" + } + return &http.Response{ + StatusCode: status, + Header: make(http.Header), + Body: io.NopCloser(strings.NewReader(body)), + } +} diff --git a/backend/internal/cli/start.go b/backend/internal/cli/start.go index a67e4007..6d6f3716 100644 --- a/backend/internal/cli/start.go +++ b/backend/internal/cli/start.go @@ -32,6 +32,7 @@ func newStartCommand(ctx *commandContext) *cobra.Command { if err != nil { return err } + ctx.emitCLIInvoked(cmd.Context(), cmd) if opts.json { return writeJSON(cmd.OutOrStdout(), st) } diff --git a/backend/internal/daemon/daemon.go b/backend/internal/daemon/daemon.go index 753c12f0..3ed5f3e8 100644 --- a/backend/internal/daemon/daemon.go +++ b/backend/internal/daemon/daemon.go @@ -131,6 +131,7 @@ func Run() error { CDC: store, Events: cdcPipe.Broadcaster, Activity: lcStack.LCM, + Telemetry: telemetrySink, }) if err != nil { stop() diff --git a/backend/internal/daemon/telemetry_wiring.go b/backend/internal/daemon/telemetry_wiring.go index 4b936450..f045086c 100644 --- a/backend/internal/daemon/telemetry_wiring.go +++ b/backend/internal/daemon/telemetry_wiring.go @@ -13,5 +13,14 @@ func newTelemetrySink(cfg config.Config, store *sqlite.Store, log *slog.Logger) if !cfg.Telemetry.Events && !cfg.Telemetry.Metrics { return telemetryadapter.NoopSink{} } - return telemetryadapter.NewLocalSQLiteSink(store, log) + local := telemetryadapter.NewLocalSQLiteSink(store, log) + if cfg.Telemetry.Remote != config.TelemetryRemotePostHog { + return local + } + remote, err := telemetryadapter.NewPostHogSink(cfg.DataDir, cfg.Telemetry.PostHogKey, cfg.Telemetry.PostHogHost, nil, log) + if err != nil { + log.Warn("telemetry remote sink disabled", "remote", cfg.Telemetry.Remote, "error", err) + return local + } + return telemetryadapter.NewFanoutSink(local, remote) } diff --git a/backend/internal/daemon/telemetry_wiring_test.go b/backend/internal/daemon/telemetry_wiring_test.go index 839dae90..eca0b902 100644 --- a/backend/internal/daemon/telemetry_wiring_test.go +++ b/backend/internal/daemon/telemetry_wiring_test.go @@ -17,16 +17,41 @@ func TestNewTelemetrySink_DefaultsToNoopWhenDisabled(t *testing.T) { } func TestNewTelemetrySink_UsesLocalSQLiteWhenEnabled(t *testing.T) { - store, err := sqlite.Open(t.TempDir()) + dataDir := t.TempDir() + store, err := sqlite.Open(dataDir) if err != nil { t.Fatalf("open store: %v", err) } t.Cleanup(func() { _ = store.Close() }) - sink := newTelemetrySink(config.Config{Telemetry: config.TelemetryConfig{Events: true}}, store, slog.Default()) + sink := newTelemetrySink(config.Config{Telemetry: config.TelemetryConfig{Events: true}, DataDir: dataDir}, store, slog.Default()) local, ok := sink.(*telemetryadapter.LocalSQLiteSink) if !ok { t.Fatalf("sink type = %T, want *telemetry.LocalSQLiteSink", sink) } t.Cleanup(func() { _ = local.Close(t.Context()) }) } + +func TestNewTelemetrySink_FanoutIncludesPostHogWhenConfigured(t *testing.T) { + dataDir := t.TempDir() + store, err := sqlite.Open(dataDir) + if err != nil { + t.Fatalf("open store: %v", err) + } + t.Cleanup(func() { _ = store.Close() }) + + sink := newTelemetrySink(config.Config{ + DataDir: dataDir, + Telemetry: config.TelemetryConfig{ + Events: true, + Remote: config.TelemetryRemotePostHog, + PostHogKey: "phc_test", + PostHogHost: "https://us.i.posthog.com", + }, + }, store, slog.Default()) + fanout, ok := sink.(*telemetryadapter.FanoutSink) + if !ok { + t.Fatalf("sink type = %T, want *telemetry.FanoutSink", sink) + } + t.Cleanup(func() { _ = fanout.Close(t.Context()) }) +} diff --git a/backend/internal/httpd/api.go b/backend/internal/httpd/api.go index 40b65d8a..9026376d 100644 --- a/backend/internal/httpd/api.go +++ b/backend/internal/httpd/api.go @@ -11,6 +11,7 @@ import ( "github.com/aoagents/agent-orchestrator/backend/internal/httpd/apispec" "github.com/aoagents/agent-orchestrator/backend/internal/httpd/controllers" "github.com/aoagents/agent-orchestrator/backend/internal/httpd/envelope" + "github.com/aoagents/agent-orchestrator/backend/internal/ports" prsvc "github.com/aoagents/agent-orchestrator/backend/internal/service/pr" projectsvc "github.com/aoagents/agent-orchestrator/backend/internal/service/project" reviewsvc "github.com/aoagents/agent-orchestrator/backend/internal/service/review" @@ -27,6 +28,7 @@ type APIDeps struct { NotificationStream controllers.NotificationStream CDC cdc.Source Events cdcSubscriber + Telemetry ports.EventSink } // API owns one controller per resource and is the single Register call the diff --git a/backend/internal/httpd/log.go b/backend/internal/httpd/log.go index 43eb0d78..03fc4a9f 100644 --- a/backend/internal/httpd/log.go +++ b/backend/internal/httpd/log.go @@ -8,6 +8,7 @@ import ( "github.com/go-chi/chi/v5/middleware" "github.com/aoagents/agent-orchestrator/backend/internal/httpd/envelope" + "github.com/aoagents/agent-orchestrator/backend/internal/ports" ) // requestLogger emits one structured access-log line per request via the @@ -24,7 +25,7 @@ import ( // A 5xx line additionally carries the raw service error recorded by // envelope.WriteError: the wire envelope hides internals ("Internal server // error"), so without this the cause of a 500 was lost entirely. -func requestLogger(log *slog.Logger) func(http.Handler) http.Handler { +func requestLogger(log *slog.Logger, sink ports.EventSink) func(http.Handler) http.Handler { return func(next http.Handler) http.Handler { return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { ww := middleware.NewWrapResponseWriter(w, r.ProtoMajor) @@ -44,6 +45,25 @@ func requestLogger(log *slog.Logger) func(http.Handler) http.Handler { attrs = append(attrs, "error", err) } log.Info("http request", attrs...) + if sink != nil && ww.Status() >= http.StatusInternalServerError { + payload := map[string]any{ + "method": r.Method, + "path": r.URL.Path, + "status": ww.Status(), + "duration": time.Since(start).Milliseconds(), + } + if err := capturedErr(); err != nil { + payload["error"] = err.Error() + } + sink.Emit(r.Context(), ports.TelemetryEvent{ + Name: "ao.http.5xx", + Source: "http", + OccurredAt: time.Now().UTC(), + Level: ports.TelemetryLevelError, + RequestID: middleware.GetReqID(r.Context()), + Payload: payload, + }) + } }() next.ServeHTTP(ww, r) }) diff --git a/backend/internal/httpd/log_test.go b/backend/internal/httpd/log_test.go index c1d1f23a..cb044c6a 100644 --- a/backend/internal/httpd/log_test.go +++ b/backend/internal/httpd/log_test.go @@ -2,6 +2,7 @@ package httpd import ( "bytes" + "context" "errors" "log/slog" "net/http" @@ -11,6 +12,7 @@ import ( "github.com/aoagents/agent-orchestrator/backend/internal/httpd/apierr" "github.com/aoagents/agent-orchestrator/backend/internal/httpd/envelope" + "github.com/aoagents/agent-orchestrator/backend/internal/ports" ) // TestRequestLoggerRecords5xxCause: the wire envelope collapses unrecognized @@ -31,7 +33,8 @@ func TestRequestLoggerRecords5xxCause(t *testing.T) { t.Run(tc.name, func(t *testing.T) { var buf bytes.Buffer log := slog.New(slog.NewTextHandler(&buf, nil)) - handler := requestLogger(log)(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + sink := &captureSink{} + handler := requestLogger(log, sink)(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { envelope.WriteError(w, r, tc.err) })) @@ -43,11 +46,27 @@ func TestRequestLoggerRecords5xxCause(t *testing.T) { if strings.Contains(got, tc.wantInLog) { t.Fatalf("log line unexpectedly contains %q:\n%s", tc.wantInLog, got) } + if len(sink.events) != 0 { + t.Fatalf("5xx telemetry events = %d, want 0 for typed 4xx", len(sink.events)) + } return } if !strings.Contains(got, tc.wantInLog) { t.Fatalf("log line missing %q:\n%s", tc.wantInLog, got) } + if len(sink.events) != 1 || sink.events[0].Name != "ao.http.5xx" { + t.Fatalf("telemetry events = %#v, want one ao.http.5xx event", sink.events) + } }) } } + +type captureSink struct { + events []ports.TelemetryEvent +} + +func (s *captureSink) Emit(_ context.Context, ev ports.TelemetryEvent) { + s.events = append(s.events, ev) +} + +func (s *captureSink) Close(context.Context) error { return nil } diff --git a/backend/internal/httpd/recover.go b/backend/internal/httpd/recover.go new file mode 100644 index 00000000..00daa5eb --- /dev/null +++ b/backend/internal/httpd/recover.go @@ -0,0 +1,62 @@ +package httpd + +import ( + "fmt" + "log/slog" + "net/http" + "runtime/debug" + "strings" + "time" + + "github.com/go-chi/chi/v5/middleware" + + "github.com/aoagents/agent-orchestrator/backend/internal/httpd/envelope" + "github.com/aoagents/agent-orchestrator/backend/internal/ports" +) + +func recoverTelemetry(log *slog.Logger, sink ports.EventSink) func(http.Handler) http.Handler { + log = loggerOrDefault(log) + return func(next http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + defer func() { + if rec := recover(); rec != nil { + stack := string(debug.Stack()) + log.Error("http handler panic", + "id", middleware.GetReqID(r.Context()), + "method", r.Method, + "path", r.URL.Path, + "panic", fmt.Sprint(rec), + "stack", stack, + ) + if sink != nil { + sink.Emit(r.Context(), ports.TelemetryEvent{ + Name: "ao.daemon.panic", + Source: "http", + OccurredAt: time.Now().UTC(), + Level: ports.TelemetryLevelError, + RequestID: middleware.GetReqID(r.Context()), + Payload: map[string]any{ + "method": r.Method, + "path": r.URL.Path, + "panic": fmt.Sprint(rec), + "stack": stack, + }, + }) + } + writeRecoveredError(w, r) + } + }() + next.ServeHTTP(w, r) + }) + } +} + +func writeRecoveredError(w http.ResponseWriter, r *http.Request) { + if strings.HasPrefix(r.URL.Path, "/api/") { + envelope.WriteAPIError(w, r, http.StatusInternalServerError, "internal_error", "INTERNAL_ERROR", "Internal server error", nil) + return + } + envelope.WriteJSON(w, http.StatusInternalServerError, map[string]any{ + "status": "error", + }) +} diff --git a/backend/internal/httpd/router.go b/backend/internal/httpd/router.go index 3a0dd668..894de985 100644 --- a/backend/internal/httpd/router.go +++ b/backend/internal/httpd/router.go @@ -3,10 +3,12 @@ package httpd import ( + "encoding/json" "log/slog" "net" "net/http" "os" + "time" "github.com/go-chi/chi/v5" "github.com/go-chi/chi/v5/middleware" @@ -14,6 +16,7 @@ import ( "github.com/aoagents/agent-orchestrator/backend/internal/config" "github.com/aoagents/agent-orchestrator/backend/internal/daemonmeta" "github.com/aoagents/agent-orchestrator/backend/internal/httpd/envelope" + "github.com/aoagents/agent-orchestrator/backend/internal/ports" "github.com/aoagents/agent-orchestrator/backend/internal/terminal" ) @@ -30,10 +33,10 @@ type ControlDeps struct { // // Middleware order (outermost first): // -// Recoverer → turn a handler panic into 500 instead of crashing the daemon // RequestID → attach a request id for correlation -// requestLogger → slog-backed access log, stderr, carries the request id // RealIP → normalise client IP (loopback proxy from the dev server) +// requestLogger → slog-backed access log + 5xx telemetry, carries the request id +// recoverer → turn a handler panic into 500 instead of crashing the daemon // cors → CORS allowlist for the Electron renderer / dev origins // // The per-request timeout is deliberately not global: it wraps only bounded @@ -42,10 +45,10 @@ func NewRouterWithControl(cfg config.Config, log *slog.Logger, termMgr *terminal log = loggerOrDefault(log) r := chi.NewRouter() - r.Use(middleware.Recoverer) r.Use(middleware.RequestID) - r.Use(requestLogger(log)) r.Use(middleware.RealIP) + r.Use(requestLogger(log, deps.Telemetry)) + r.Use(recoverTelemetry(log, deps.Telemetry)) r.Use(corsMiddleware(cfg.AllowedOrigins)) // JSON envelopes for unmatched routes / methods — chi's defaults are @@ -57,6 +60,7 @@ func NewRouterWithControl(cfg config.Config, log *slog.Logger, termMgr *terminal mountHealth(r) mountTerminalMux(r, termMgr, log) mountControl(r, control) + mountTelemetry(r, deps.Telemetry) NewAPI(cfg, deps).Register(r) return r @@ -94,6 +98,51 @@ func mountControl(r chi.Router, deps ControlDeps) { }) } +type cliInvokedRequest struct { + Command string `json:"command"` + CommandPath string `json:"commandPath"` +} + +func mountTelemetry(r chi.Router, sink ports.EventSink) { + if sink == nil { + return + } + r.Post("/internal/telemetry/cli-invoked", func(w http.ResponseWriter, req *http.Request) { + if !localControlRequest(req) { + envelope.WriteJSON(w, http.StatusForbidden, map[string]any{ + "status": "forbidden", + "service": daemonmeta.ServiceName, + }) + return + } + + var body cliInvokedRequest + dec := json.NewDecoder(req.Body) + dec.DisallowUnknownFields() + if err := dec.Decode(&body); err != nil { + envelope.WriteAPIError(w, req, http.StatusBadRequest, "bad_request", "INVALID_JSON", "request body must be valid JSON", nil) + return + } + if body.CommandPath == "" { + envelope.WriteAPIError(w, req, http.StatusBadRequest, "bad_request", "COMMAND_PATH_REQUIRED", "commandPath is required", nil) + return + } + + sink.Emit(req.Context(), ports.TelemetryEvent{ + Name: "ao.cli.invoked", + Source: "cli", + OccurredAt: time.Now().UTC(), + Level: ports.TelemetryLevelInfo, + RequestID: middleware.GetReqID(req.Context()), + Payload: map[string]any{ + "command": body.Command, + "command_path": body.CommandPath, + }, + }) + w.WriteHeader(http.StatusAccepted) + }) +} + // localControlRequest reports whether a control request is a trusted local // caller. The Go CLI client addresses the daemon by its loopback host and // never sets an Origin header; a cross-site browser fetch always carries an diff --git a/backend/internal/httpd/telemetry_test.go b/backend/internal/httpd/telemetry_test.go new file mode 100644 index 00000000..708af258 --- /dev/null +++ b/backend/internal/httpd/telemetry_test.go @@ -0,0 +1,83 @@ +package httpd + +import ( + "net/http" + "net/http/httptest" + "strings" + "testing" + + "github.com/aoagents/agent-orchestrator/backend/internal/config" +) + +func TestCLIInvokedRouteEmitsTelemetry(t *testing.T) { + sink := &captureSink{} + r := NewRouterWithControl(config.Config{}, discardLogger(), nil, APIDeps{Telemetry: sink}, ControlDeps{}) + + req := httptest.NewRequest(http.MethodPost, "http://127.0.0.1/internal/telemetry/cli-invoked", strings.NewReader(`{"command":"status","commandPath":"ao status"}`)) + req.Host = "127.0.0.1:3001" + req.Header.Set("Content-Type", "application/json") + rec := httptest.NewRecorder() + r.ServeHTTP(rec, req) + + if rec.Code != http.StatusAccepted { + t.Fatalf("status = %d, want 202", rec.Code) + } + if len(sink.events) != 1 { + t.Fatalf("events = %d, want 1", len(sink.events)) + } + if sink.events[0].Name != "ao.cli.invoked" { + t.Fatalf("event name = %q, want ao.cli.invoked", sink.events[0].Name) + } + if got := sink.events[0].Payload["command_path"]; got != "ao status" { + t.Fatalf("command_path = %#v, want ao status", got) + } +} + +func TestCLIInvokedRouteRequiresLoopback(t *testing.T) { + sink := &captureSink{} + r := NewRouterWithControl(config.Config{}, discardLogger(), nil, APIDeps{Telemetry: sink}, ControlDeps{}) + + req := httptest.NewRequest(http.MethodPost, "http://evil.example/internal/telemetry/cli-invoked", strings.NewReader(`{"command":"status","commandPath":"ao status"}`)) + req.Host = "evil.example" + rec := httptest.NewRecorder() + r.ServeHTTP(rec, req) + + if rec.Code != http.StatusForbidden { + t.Fatalf("status = %d, want 403", rec.Code) + } + if len(sink.events) != 0 { + t.Fatalf("events = %d, want 0", len(sink.events)) + } +} + +func TestRecoverTelemetryEmitsPanicEvent(t *testing.T) { + sink := &captureSink{} + r := NewRouterWithControl(config.Config{}, discardLogger(), nil, APIDeps{Telemetry: sink}, ControlDeps{}) + r.Get("/panic", func(http.ResponseWriter, *http.Request) { + panic("boom") + }) + + req := httptest.NewRequest(http.MethodGet, "http://127.0.0.1/panic", nil) + req.Host = "127.0.0.1:3001" + rec := httptest.NewRecorder() + r.ServeHTTP(rec, req) + + if rec.Code != http.StatusInternalServerError { + t.Fatalf("status = %d, want 500", rec.Code) + } + var sawPanic, saw5xx bool + for _, ev := range sink.events { + switch ev.Name { + case "ao.daemon.panic": + sawPanic = true + case "ao.http.5xx": + saw5xx = true + } + } + if !sawPanic { + t.Fatalf("events = %#v, want ao.daemon.panic", sink.events) + } + if !saw5xx { + t.Fatalf("events = %#v, want ao.http.5xx after recovery", sink.events) + } +} From 896794cb675038e8ff3870770f7edd4da76d98ad Mon Sep 17 00:00:00 2001 From: laxmanclo Date: Wed, 17 Jun 2026 18:37:04 +0530 Subject: [PATCH 4/8] feat(backend): add onboarding and dwell telemetry --- backend/internal/cli/root.go | 40 +++++++- backend/internal/cli/root_test.go | 31 +++++++ backend/internal/daemon/daemon.go | 4 +- backend/internal/daemon/lifecycle_wiring.go | 4 +- backend/internal/daemon/wiring_test.go | 2 +- backend/internal/httpd/router.go | 41 +++++++++ backend/internal/httpd/telemetry_test.go | 19 ++++ backend/internal/lifecycle/manager.go | 66 +++++++++++++- backend/internal/lifecycle/manager_test.go | 41 +++++++++ backend/internal/service/project/service.go | 65 ++++++++++++- .../internal/service/project/service_test.go | 59 ++++++++++++ backend/internal/service/session/service.go | 91 ++++++++++++++----- .../internal/service/session/service_test.go | 58 ++++++++++++ 13 files changed, 482 insertions(+), 39 deletions(-) diff --git a/backend/internal/cli/root.go b/backend/internal/cli/root.go index 942f8f2b..56479faa 100644 --- a/backend/internal/cli/root.go +++ b/backend/internal/cli/root.go @@ -20,7 +20,18 @@ import ( // Execute runs the ao CLI with process stdio. func Execute() error { - return NewRootCommand(DefaultDeps()).Execute() + return executeWithDeps(DefaultDeps(), os.Args[1:]) +} + +func executeWithDeps(deps Deps, args []string) error { + deps = deps.withDefaults() + cmd := NewRootCommand(deps) + cmd.SetArgs(args) + err := cmd.Execute() + if err != nil && ExitCode(err) == 2 { + (&commandContext{deps: deps}).emitCLIUsageError(context.Background(), args, err) + } + return err } // usageError marks a command-line misuse (bad flag, wrong arg count). It lets @@ -207,6 +218,33 @@ func (c *commandContext) emitCLIInvoked(ctx context.Context, cmd *cobra.Command) }) } +func (c *commandContext) emitCLIUsageError(ctx context.Context, args []string, err error) { + command, commandPath := usageErrorCommand(args) + reqCtx, cancel := context.WithTimeout(ctx, probeTimeout) + defer cancel() + _ = c.postLoopbackJSON(reqCtx, "/internal/telemetry/cli-usage-error", map[string]string{ + "command": command, + "commandPath": commandPath, + "error": err.Error(), + }) +} + +func usageErrorCommand(args []string) (string, string) { + tokens := []string{"ao"} + for _, arg := range args { + if strings.HasPrefix(arg, "-") { + break + } + tokens = append(tokens, arg) + } + commandPath := strings.Join(tokens, " ") + command := "ao" + if len(tokens) > 1 { + command = tokens[len(tokens)-1] + } + return command, commandPath +} + func noArgs(cmd *cobra.Command, args []string) error { if err := cobra.ExactArgs(0)(cmd, args); err != nil { return usageError{err} diff --git a/backend/internal/cli/root_test.go b/backend/internal/cli/root_test.go index 27416797..f2807816 100644 --- a/backend/internal/cli/root_test.go +++ b/backend/internal/cli/root_test.go @@ -85,6 +85,37 @@ func TestVersionEmitsCLIInvocationBestEffort(t *testing.T) { } } +func TestUsageErrorEmitsCLIUsageTelemetryBestEffort(t *testing.T) { + cfg := setConfigEnv(t) + called := make(chan string, 1) + if err := runfile.Write(cfg.runFile, runfile.Info{PID: os.Getpid(), Port: 3001, StartedAt: time.Unix(100, 0).UTC()}); err != nil { + t.Fatal(err) + } + + deps := Deps{ + HTTPClient: &http.Client{Transport: roundTripFunc(func(req *http.Request) (*http.Response, error) { + if req.URL.Path == "/internal/telemetry/cli-usage-error" { + called <- req.URL.Path + return jsonResponse(http.StatusAccepted, ""), nil + } + return jsonResponse(http.StatusNotFound, ""), nil + })}, + ProcessAlive: func(pid int) bool { return pid == os.Getpid() }, + } + err := executeWithDeps(deps, []string{"status", "extra"}) + if err == nil { + t.Fatal("expected usage error") + } + select { + case path := <-called: + if path != "/internal/telemetry/cli-usage-error" { + t.Fatalf("telemetry path = %q, want /internal/telemetry/cli-usage-error", path) + } + default: + t.Fatal("usage error did not emit CLI usage telemetry") + } +} + func TestStatusStoppedJSON(t *testing.T) { setConfigEnv(t) diff --git a/backend/internal/daemon/daemon.go b/backend/internal/daemon/daemon.go index 3ed5f3e8..241fa410 100644 --- a/backend/internal/daemon/daemon.go +++ b/backend/internal/daemon/daemon.go @@ -105,7 +105,7 @@ func Run() error { // Bring up the Lifecycle Manager and the reaper first: it makes the session // lifecycle write path live (reducer write -> store -> DB trigger -> // change_log -> poller -> broadcaster) and gives startSession the shared LCM. - lcStack := startLifecycle(ctx, store, runtimeAdapter, messenger, notificationWriter, log) + lcStack := startLifecycle(ctx, store, runtimeAdapter, messenger, notificationWriter, telemetrySink, log) lcStack.scmDone = startSCMObserver(ctx, store, lcStack.LCM, log) // Wire the controller-facing session service over the same store + LCM, the @@ -123,7 +123,7 @@ func Run() error { } srv, err := httpd.NewWithDeps(cfg, log, termMgr, httpd.APIDeps{ - Projects: projectsvc.NewWithDeps(projectsvc.Deps{Store: store, Sessions: sessionSvc}), + Projects: projectsvc.NewWithDeps(projectsvc.Deps{Store: store, Sessions: sessionSvc, Telemetry: telemetrySink}), Sessions: sessionSvc, Reviews: reviewSvc, Notifications: notifier, diff --git a/backend/internal/daemon/lifecycle_wiring.go b/backend/internal/daemon/lifecycle_wiring.go index 76195515..5d79ec31 100644 --- a/backend/internal/daemon/lifecycle_wiring.go +++ b/backend/internal/daemon/lifecycle_wiring.go @@ -43,8 +43,8 @@ type lifecycleStack struct { // reaper. The goroutine stops when ctx is cancelled; Stop waits for it to drain. // The messenger is the per-daemon agent messenger the LCM uses to nudge agents // in response to SCM observations (CI failure, review feedback, merge conflict). -func startLifecycle(ctx context.Context, store *sqlite.Store, runtime ports.Runtime, messenger ports.AgentMessenger, notifier notificationSink, logger *slog.Logger) *lifecycleStack { - lcm := lifecycle.New(store, messenger, lifecycle.WithNotificationSink(notifier)) +func startLifecycle(ctx context.Context, store *sqlite.Store, runtime ports.Runtime, messenger ports.AgentMessenger, notifier notificationSink, telemetry ports.EventSink, logger *slog.Logger) *lifecycleStack { + lcm := lifecycle.New(store, messenger, lifecycle.WithNotificationSink(notifier), lifecycle.WithTelemetry(telemetry)) rp := reaper.New(lcm, store, runtime, reaper.Config{Logger: logger}) return &lifecycleStack{LCM: lcm, reaperDone: rp.Start(ctx)} } diff --git a/backend/internal/daemon/wiring_test.go b/backend/internal/daemon/wiring_test.go index 0722b819..95dc523b 100644 --- a/backend/internal/daemon/wiring_test.go +++ b/backend/internal/daemon/wiring_test.go @@ -319,7 +319,7 @@ func TestWiring_StartLifecycleThreadsMessengerIntoLCM(t *testing.T) { log := slog.New(slog.NewTextHandler(io.Discard, nil)) messenger := &captureMessenger{} - stack := startLifecycle(ctx, store, zellij.New(zellij.Options{}), messenger, nil, log) + stack := startLifecycle(ctx, store, zellij.New(zellij.Options{}), messenger, nil, nil, log) t.Cleanup(stack.Stop) t.Cleanup(cancel) diff --git a/backend/internal/httpd/router.go b/backend/internal/httpd/router.go index 894de985..e57c29c6 100644 --- a/backend/internal/httpd/router.go +++ b/backend/internal/httpd/router.go @@ -103,6 +103,12 @@ type cliInvokedRequest struct { CommandPath string `json:"commandPath"` } +type cliUsageErrorRequest struct { + Command string `json:"command"` + CommandPath string `json:"commandPath"` + Error string `json:"error"` +} + func mountTelemetry(r chi.Router, sink ports.EventSink) { if sink == nil { return @@ -141,6 +147,41 @@ func mountTelemetry(r chi.Router, sink ports.EventSink) { }) w.WriteHeader(http.StatusAccepted) }) + r.Post("/internal/telemetry/cli-usage-error", func(w http.ResponseWriter, req *http.Request) { + if !localControlRequest(req) { + envelope.WriteJSON(w, http.StatusForbidden, map[string]any{ + "status": "forbidden", + "service": daemonmeta.ServiceName, + }) + return + } + + var body cliUsageErrorRequest + dec := json.NewDecoder(req.Body) + dec.DisallowUnknownFields() + if err := dec.Decode(&body); err != nil { + envelope.WriteAPIError(w, req, http.StatusBadRequest, "bad_request", "INVALID_JSON", "request body must be valid JSON", nil) + return + } + if body.CommandPath == "" { + envelope.WriteAPIError(w, req, http.StatusBadRequest, "bad_request", "COMMAND_PATH_REQUIRED", "commandPath is required", nil) + return + } + + sink.Emit(req.Context(), ports.TelemetryEvent{ + Name: "ao.cli.usage_errors", + Source: "cli", + OccurredAt: time.Now().UTC(), + Level: ports.TelemetryLevelWarn, + RequestID: middleware.GetReqID(req.Context()), + Payload: map[string]any{ + "command": body.Command, + "command_path": body.CommandPath, + "error": body.Error, + }, + }) + w.WriteHeader(http.StatusAccepted) + }) } // localControlRequest reports whether a control request is a trusted local diff --git a/backend/internal/httpd/telemetry_test.go b/backend/internal/httpd/telemetry_test.go index 708af258..b53326ff 100644 --- a/backend/internal/httpd/telemetry_test.go +++ b/backend/internal/httpd/telemetry_test.go @@ -6,6 +6,8 @@ import ( "strings" "testing" + "github.com/go-chi/chi/v5" + "github.com/aoagents/agent-orchestrator/backend/internal/config" ) @@ -50,6 +52,23 @@ func TestCLIInvokedRouteRequiresLoopback(t *testing.T) { } } +func TestCLIUsageErrorRouteEmitsTelemetry(t *testing.T) { + sink := &captureSink{} + r := chi.NewRouter() + mountTelemetry(r, sink) + + req := httptest.NewRequest(http.MethodPost, "http://127.0.0.1/internal/telemetry/cli-usage-error", strings.NewReader(`{"command":"status","commandPath":"ao status","error":"too many args"}`)) + rec := httptest.NewRecorder() + r.ServeHTTP(rec, req) + + if rec.Code != http.StatusAccepted { + t.Fatalf("status = %d, want 202", rec.Code) + } + if len(sink.events) != 1 || sink.events[0].Name != "ao.cli.usage_errors" { + t.Fatalf("events = %#v, want one ao.cli.usage_errors event", sink.events) + } +} + func TestRecoverTelemetryEmitsPanicEvent(t *testing.T) { sink := &captureSink{} r := NewRouterWithControl(config.Config{}, discardLogger(), nil, APIDeps{Telemetry: sink}, ControlDeps{}) diff --git a/backend/internal/lifecycle/manager.go b/backend/internal/lifecycle/manager.go index 95ad708a..30128b8c 100644 --- a/backend/internal/lifecycle/manager.go +++ b/backend/internal/lifecycle/manager.go @@ -37,6 +37,11 @@ func WithNotificationSink(sink notificationSink) Option { return func(m *Manager) { m.notifications = sink } } +// WithTelemetry wires lifecycle activity transitions to the shared telemetry sink. +func WithTelemetry(sink ports.EventSink) Option { + return func(m *Manager) { m.telemetry = sink } +} + // Manager reduces runtime, activity, spawn, and termination observations into durable session facts. // It also owns agent nudges caused by PR observations, including merge-conflict, CI-failure, and review-feedback prompts. type Manager struct { @@ -44,10 +49,11 @@ type Manager struct { messenger ports.AgentMessenger notifications notificationSink - mu sync.Mutex - window time.Duration - clock func() time.Time - react reactionState + mu sync.Mutex + window time.Duration + clock func() time.Time + react reactionState + telemetry ports.EventSink } // New builds a Lifecycle Manager over the session store it writes and the messenger it uses for agent nudges. @@ -119,6 +125,8 @@ func (m *Manager) ApplyActivitySignal(ctx context.Context, id domain.SessionID, m.mu.Unlock() return nil } + prevState := rec.Activity.State + prevAt := rec.Activity.LastActivityAt next := rec act := domain.Activity{State: s.State, LastActivityAt: timeOr(s.Timestamp, now)} // A same-state repeat is still a write when it is the FIRST signal for @@ -151,11 +159,61 @@ func (m *Manager) ApplyActivitySignal(ctx context.Context, id domain.SessionID, SessionDisplayName: next.DisplayName, } } + waitingEvents := m.waitingInputEvents(rec, next, prevState, prevAt, now) m.mu.Unlock() + for _, ev := range waitingEvents { + m.emitTelemetry(ctx, ev) + } m.emitNotification(ctx, intent) return nil } +func (m *Manager) waitingInputEvents(prev, next domain.SessionRecord, prevState domain.ActivityState, prevAt, now time.Time) []ports.TelemetryEvent { + if m.telemetry == nil { + return nil + } + projectID := next.ProjectID + sessionID := next.ID + var events []ports.TelemetryEvent + if prevState != domain.ActivityWaitingInput && next.Activity.State == domain.ActivityWaitingInput && !next.IsTerminated { + events = append(events, ports.TelemetryEvent{ + Name: "ao.session.waiting_input_entered", + Source: "lifecycle", + OccurredAt: now.UTC(), + Level: ports.TelemetryLevelInfo, + ProjectID: &projectID, + SessionID: &sessionID, + Payload: map[string]any{ + "state": string(next.Activity.State), + }, + }) + } + if prevState == domain.ActivityWaitingInput && next.Activity.State != domain.ActivityWaitingInput { + payload := map[string]any{ + "state": string(next.Activity.State), + "dwell_ms": now.Sub(prevAt).Milliseconds(), + "exited_to": string(next.Activity.State), + } + events = append(events, ports.TelemetryEvent{ + Name: "ao.session.waiting_input_exited", + Source: "lifecycle", + OccurredAt: now.UTC(), + Level: ports.TelemetryLevelInfo, + ProjectID: &projectID, + SessionID: &sessionID, + Payload: payload, + }) + } + return events +} + +func (m *Manager) emitTelemetry(ctx context.Context, ev ports.TelemetryEvent) { + if m.telemetry == nil { + return + } + m.telemetry.Emit(ctx, ev) +} + func (m *Manager) emitNotification(ctx context.Context, intent *ports.NotificationIntent) { if intent == nil || m.notifications == nil { return diff --git a/backend/internal/lifecycle/manager_test.go b/backend/internal/lifecycle/manager_test.go index c3f84478..28a7400f 100644 --- a/backend/internal/lifecycle/manager_test.go +++ b/backend/internal/lifecycle/manager_test.go @@ -56,6 +56,16 @@ type fakeMessenger struct { err error } +type telemetrySink struct { + events []ports.TelemetryEvent +} + +func (s *telemetrySink) Emit(_ context.Context, ev ports.TelemetryEvent) { + s.events = append(s.events, ev) +} + +func (*telemetrySink) Close(context.Context) error { return nil } + func (f *fakeMessenger) Send(_ context.Context, _ domain.SessionID, msg string) error { if f.err != nil { return f.err @@ -161,6 +171,37 @@ func TestMarkSpawned_StampsUTCActivity(t *testing.T) { } } +func TestActivity_WaitingInputEntryAndExitEmitTelemetry(t *testing.T) { + st := newFakeStore() + sink := &telemetrySink{} + m := New(st, nil, WithTelemetry(sink)) + now := time.Unix(100, 0).UTC() + m.clock = func() time.Time { return now } + st.sessions["mer-1"] = domain.SessionRecord{ + ID: "mer-1", + ProjectID: "mer", + Activity: domain.Activity{State: domain.ActivityIdle, LastActivityAt: now.Add(-time.Minute)}, + } + + if err := m.ApplyActivitySignal(ctx, "mer-1", ports.ActivitySignal{Valid: true, State: domain.ActivityWaitingInput, Timestamp: now}); err != nil { + t.Fatal(err) + } + now = now.Add(3 * time.Second) + if err := m.ApplyActivitySignal(ctx, "mer-1", ports.ActivitySignal{Valid: true, State: domain.ActivityActive, Timestamp: now}); err != nil { + t.Fatal(err) + } + + if len(sink.events) != 2 { + t.Fatalf("events = %#v, want waiting_input entered/exited", sink.events) + } + if sink.events[0].Name != "ao.session.waiting_input_entered" || sink.events[1].Name != "ao.session.waiting_input_exited" { + t.Fatalf("event names = %#v", []string{sink.events[0].Name, sink.events[1].Name}) + } + if got := sink.events[1].Payload["dwell_ms"]; got != int64(3000) { + t.Fatalf("dwell_ms = %#v, want 3000", got) + } +} + func TestPRObservation_CIFailingNudgesAgentWithLogs(t *testing.T) { m, st, msg := newManager() st.sessions["mer-1"] = working("mer-1") diff --git a/backend/internal/service/project/service.go b/backend/internal/service/project/service.go index 3e32f73b..cbccb9e7 100644 --- a/backend/internal/service/project/service.go +++ b/backend/internal/service/project/service.go @@ -13,6 +13,7 @@ import ( "github.com/aoagents/agent-orchestrator/backend/internal/domain" "github.com/aoagents/agent-orchestrator/backend/internal/httpd/apierr" + "github.com/aoagents/agent-orchestrator/backend/internal/ports" ) // Manager is the controller-facing contract for the /api/v1/projects surface. @@ -44,8 +45,10 @@ type SessionTeardowner interface { // Service implements project registration and lookup use-cases for controllers. type Service struct { - store Store - sessions SessionTeardowner + store Store + sessions SessionTeardowner + clock func() time.Time + telemetry ports.EventSink // addMu serialises the whole body of Add. Workspace registration performs // filesystem mutations (git init, .gitignore writes, commits) that are not // covered by the store's own writeMu, so path/id conflict checks plus the @@ -57,8 +60,10 @@ var _ Manager = (*Service)(nil) // Deps captures optional collaborators for project use-cases. type Deps struct { - Store Store - Sessions SessionTeardowner + Store Store + Sessions SessionTeardowner + Clock func() time.Time + Telemetry ports.EventSink } // New returns a project service backed by the given durable store. @@ -68,7 +73,11 @@ func New(store Store) *Service { // NewWithDeps returns a project service with optional teardown dependencies. func NewWithDeps(d Deps) *Service { - return &Service{store: d.Store, sessions: d.Sessions} + s := &Service{store: d.Store, sessions: d.Sessions, clock: d.Clock, telemetry: d.Telemetry} + if s.clock == nil { + s.clock = time.Now + } + return s } // List returns every active registered project. @@ -135,6 +144,11 @@ func (m *Service) Add(ctx context.Context, in AddInput) (Project, error) { m.addMu.Lock() defer m.addMu.Unlock() + projectCountBefore, err := m.activeProjectCount(ctx) + if err != nil { + return Project{}, apierr.Internal("PROJECT_LOAD_FAILED", "Failed to load project") + } + name := string(id) if in.Name != nil { name = strings.TrimSpace(*in.Name) @@ -187,6 +201,7 @@ func (m *Service) Add(ctx context.Context, in AddInput) (Project, error) { if err := m.store.UpsertWorkspaceProject(ctx, row, repos); err != nil { return Project{}, apierr.Internal("PROJECT_ADD_FAILED", "Failed to register workspace project") } + m.emitProjectAdded(row, projectCountBefore == 0) p := projectFromRow(row) p.WorkspaceRepos = workspaceReposFromRecords(repos) return p, nil @@ -208,9 +223,49 @@ func (m *Service) Add(ctx context.Context, in AddInput) (Project, error) { if err := m.store.UpsertProject(ctx, row); err != nil { return Project{}, apierr.Internal("PROJECT_ADD_FAILED", "Failed to register project") } + m.emitProjectAdded(row, projectCountBefore == 0) return projectFromRow(row), nil } +func (m *Service) activeProjectCount(ctx context.Context) (int, error) { + projects, err := m.store.ListProjects(ctx) + if err != nil { + return 0, err + } + return len(projects), nil +} + +func (m *Service) emitProjectAdded(row domain.ProjectRecord, firstProject bool) { + if m.telemetry == nil { + return + } + projectID := domain.ProjectID(row.ID) + at := m.clock().UTC() + payload := map[string]any{ + "kind": string(row.Kind.WithDefault()), + "has_git_remote": row.RepoOriginURL != "", + } + m.telemetry.Emit(context.Background(), ports.TelemetryEvent{ + Name: "ao.projects.created", + Source: "project_service", + OccurredAt: at, + Level: ports.TelemetryLevelInfo, + ProjectID: &projectID, + Payload: payload, + }) + if !firstProject { + return + } + m.telemetry.Emit(context.Background(), ports.TelemetryEvent{ + Name: "ao.onboarding.first_project_added", + Source: "project_service", + OccurredAt: at, + Level: ports.TelemetryLevelInfo, + ProjectID: &projectID, + Payload: payload, + }) +} + // SetConfig replaces the project's stored config. The typed config is validated // here so a bad value is rejected when set rather than surfacing at spawn. func (m *Service) SetConfig(ctx context.Context, id domain.ProjectID, in SetConfigInput) (Project, error) { diff --git a/backend/internal/service/project/service_test.go b/backend/internal/service/project/service_test.go index 057afc84..44228648 100644 --- a/backend/internal/service/project/service_test.go +++ b/backend/internal/service/project/service_test.go @@ -12,6 +12,7 @@ import ( "github.com/aoagents/agent-orchestrator/backend/internal/domain" "github.com/aoagents/agent-orchestrator/backend/internal/httpd/apierr" + "github.com/aoagents/agent-orchestrator/backend/internal/ports" "github.com/aoagents/agent-orchestrator/backend/internal/service/project" "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite" ) @@ -70,6 +71,16 @@ type fakeProjectTeardowner struct { err error } +type captureSink struct { + events []ports.TelemetryEvent +} + +func (s *captureSink) Emit(_ context.Context, ev ports.TelemetryEvent) { + s.events = append(s.events, ev) +} + +func (*captureSink) Close(context.Context) error { return nil } + func (f *fakeProjectTeardowner) TeardownProject(_ context.Context, project domain.ProjectID) error { f.projects = append(f.projects, project) return f.err @@ -122,6 +133,54 @@ func TestManager_AddListGetRemove(t *testing.T) { wantCode(t, err, "PROJECT_NOT_FOUND") } +func TestManager_AddEmitsProjectAndFirstProjectTelemetry(t *testing.T) { + ctx := context.Background() + store, err := sqlite.Open(t.TempDir()) + if err != nil { + t.Fatalf("open store: %v", err) + } + t.Cleanup(func() { _ = store.Close() }) + sink := &captureSink{} + m := project.NewWithDeps(project.Deps{Store: store, Telemetry: sink}) + + if _, err := m.Add(ctx, project.AddInput{Path: gitRepo(t), ProjectID: ptr("ao")}); err != nil { + t.Fatalf("Add: %v", err) + } + if len(sink.events) != 2 { + t.Fatalf("events = %#v, want projects.created + first_project_added", sink.events) + } + if sink.events[0].Name != "ao.projects.created" || sink.events[1].Name != "ao.onboarding.first_project_added" { + t.Fatalf("event names = %#v", []string{sink.events[0].Name, sink.events[1].Name}) + } +} + +func TestManager_AddDoesNotRepeatFirstProjectTelemetry(t *testing.T) { + ctx := context.Background() + store, err := sqlite.Open(t.TempDir()) + if err != nil { + t.Fatalf("open store: %v", err) + } + t.Cleanup(func() { _ = store.Close() }) + sink := &captureSink{} + m := project.NewWithDeps(project.Deps{Store: store, Telemetry: sink}) + + if _, err := m.Add(ctx, project.AddInput{Path: gitRepo(t), ProjectID: ptr("ao")}); err != nil { + t.Fatalf("Add first: %v", err) + } + if _, err := m.Add(ctx, project.AddInput{Path: gitRepo(t), ProjectID: ptr("ao2")}); err != nil { + t.Fatalf("Add second: %v", err) + } + var firstProjectCount int + for _, ev := range sink.events { + if ev.Name == "ao.onboarding.first_project_added" { + firstProjectCount++ + } + } + if firstProjectCount != 1 { + t.Fatalf("first project telemetry count = %d, want 1", firstProjectCount) + } +} + func TestManager_RemoveTeardownsBeforeArchive(t *testing.T) { ctx := context.Background() store, err := sqlite.Open(t.TempDir()) diff --git a/backend/internal/service/session/service.go b/backend/internal/service/session/service.go index d10875a2..906ad903 100644 --- a/backend/internal/service/session/service.go +++ b/backend/internal/service/session/service.go @@ -125,39 +125,59 @@ func NewWithDeps(d Deps) *Service { // Spawn creates a session and returns the API-facing read model. func (s *Service) Spawn(ctx context.Context, cfg ports.SpawnConfig) (domain.Session, error) { - if err := s.requireProject(ctx, cfg.ProjectID); err != nil { + project, err := s.requireProject(ctx, cfg.ProjectID) + if err != nil { return domain.Session{}, err } + start := s.now() + firstSession, err := s.isFirstSession(ctx) + if err != nil { + return domain.Session{}, fmt.Errorf("count sessions: %w", err) + } rec, err := s.manager.Spawn(ctx, cfg) if err != nil { - s.emitSpawnFailed(cfg, err) + s.emitSpawnFailed(cfg, err, s.now().Sub(start).Milliseconds()) return domain.Session{}, toAPIError(err) } - s.emitSpawned(rec) + s.emitSpawned(rec, s.now().Sub(start).Milliseconds()) + if firstSession { + s.emitFirstSessionSpawned(rec, project) + } return s.toSession(ctx, rec) } // requireProject verifies the project is registered before any spawn write // touches the session store, so an unknown projectId surfaces as a typed 404 // rather than an opaque 500 with an orphan terminated row left behind. -func (s *Service) requireProject(ctx context.Context, id domain.ProjectID) error { +func (s *Service) requireProject(ctx context.Context, id domain.ProjectID) (domain.ProjectRecord, error) { if id == "" { - return apierr.Invalid("PROJECT_ID_REQUIRED", "projectId is required", nil) + return domain.ProjectRecord{}, apierr.Invalid("PROJECT_ID_REQUIRED", "projectId is required", nil) } if s.store == nil { - return nil + return domain.ProjectRecord{ID: string(id)}, nil } - _, ok, err := s.store.GetProject(ctx, string(id)) + rec, ok, err := s.store.GetProject(ctx, string(id)) if err != nil { - return fmt.Errorf("get project %s: %w", id, err) + return domain.ProjectRecord{}, fmt.Errorf("get project %s: %w", id, err) } if !ok { - return apierr.NotFound("PROJECT_NOT_FOUND", "Unknown project — register it with `ao project add`") + return domain.ProjectRecord{}, apierr.NotFound("PROJECT_NOT_FOUND", "Unknown project — register it with `ao project add`") } - return nil + return rec, nil } -func (s *Service) emitSpawned(rec domain.SessionRecord) { +func (s *Service) isFirstSession(ctx context.Context) (bool, error) { + if s.store == nil { + return false, nil + } + rows, err := s.store.ListAllSessions(ctx) + if err != nil { + return false, err + } + return len(rows) == 0, nil +} + +func (s *Service) emitSpawned(rec domain.SessionRecord, durationMs int64) { if s.telemetry == nil { return } @@ -166,18 +186,43 @@ func (s *Service) emitSpawned(rec domain.SessionRecord) { s.telemetry.Emit(context.Background(), ports.TelemetryEvent{ Name: "ao.session.spawned", Source: "session_service", - OccurredAt: s.clock().UTC(), + OccurredAt: s.now(), Level: ports.TelemetryLevelInfo, ProjectID: &projectID, SessionID: &sessionID, Payload: map[string]any{ - "kind": string(rec.Kind), - "harness": string(rec.Harness), + "kind": string(rec.Kind), + "harness": string(rec.Harness), + "duration_ms": durationMs, }, }) } -func (s *Service) emitSpawnFailed(cfg ports.SpawnConfig, err error) { +func (s *Service) emitFirstSessionSpawned(rec domain.SessionRecord, project domain.ProjectRecord) { + if s.telemetry == nil { + return + } + projectID := rec.ProjectID + sessionID := rec.ID + payload := map[string]any{ + "kind": string(rec.Kind), + "harness": string(rec.Harness), + } + if !project.RegisteredAt.IsZero() { + payload["since_first_project_ms"] = s.now().Sub(project.RegisteredAt).Milliseconds() + } + s.telemetry.Emit(context.Background(), ports.TelemetryEvent{ + Name: "ao.onboarding.first_session_spawned", + Source: "session_service", + OccurredAt: s.now(), + Level: ports.TelemetryLevelInfo, + ProjectID: &projectID, + SessionID: &sessionID, + Payload: payload, + }) +} + +func (s *Service) emitSpawnFailed(cfg ports.SpawnConfig, err error, durationMs int64) { if s.telemetry == nil { return } @@ -185,13 +230,14 @@ func (s *Service) emitSpawnFailed(cfg ports.SpawnConfig, err error) { s.telemetry.Emit(context.Background(), ports.TelemetryEvent{ Name: "ao.session.spawn_failed", Source: "session_service", - OccurredAt: s.clock().UTC(), + OccurredAt: s.now(), Level: ports.TelemetryLevelError, ProjectID: &projectID, Payload: map[string]any{ - "kind": string(cfg.Kind), - "harness": string(cfg.Harness), - "error": err.Error(), + "kind": string(cfg.Kind), + "harness": string(cfg.Harness), + "error": err.Error(), + "duration_ms": durationMs, }, }) } @@ -201,9 +247,6 @@ func (s *Service) emitSpawnFailed(cfg ports.SpawnConfig, err error) { // one is the only live coordinator — a business rule that belongs here, not in the // HTTP controller. func (s *Service) SpawnOrchestrator(ctx context.Context, projectID domain.ProjectID, clean bool) (domain.Session, error) { - if err := s.requireProject(ctx, projectID); err != nil { - return domain.Session{}, err - } if clean { active := true existing, err := s.List(ctx, ListFilter{ProjectID: projectID, Active: &active, OrchestratorOnly: true}) @@ -411,9 +454,9 @@ func (s *Service) toSession(ctx context.Context, rec domain.SessionRecord) (doma // without going through New, which is where clock gets its default). func (s *Service) now() time.Time { if s.clock == nil { - return time.Now() + return time.Now().UTC() } - return s.clock() + return s.clock().UTC() } // harnessSignals tolerates a zero-value Service the same way now does. Without diff --git a/backend/internal/service/session/service_test.go b/backend/internal/service/session/service_test.go index 180afd92..f1017b1c 100644 --- a/backend/internal/service/session/service_test.go +++ b/backend/internal/service/session/service_test.go @@ -271,9 +271,67 @@ func TestSpawnUnknownProjectReturns404(t *testing.T) { } } +func TestSpawnEmitsFirstSessionOnboardingAndDuration(t *testing.T) { + st := newFakeStore() + st.projects["mer"] = domain.ProjectRecord{ID: "mer", RegisteredAt: time.Unix(100, 0).UTC()} + sink := &fakeTelemetrySink{} + fc := &fakeCommander{} + svc := NewWithDeps(Deps{ + Manager: fc, + Store: st, + Telemetry: sink, + Clock: func() time.Time { return time.Unix(102, 0).UTC() }, + }) + + if _, err := svc.Spawn(context.Background(), ports.SpawnConfig{ProjectID: "mer"}); err != nil { + t.Fatalf("Spawn: %v", err) + } + if len(sink.events) != 2 { + t.Fatalf("events = %#v, want spawned + first_session", sink.events) + } + if sink.events[0].Name != "ao.session.spawned" || sink.events[1].Name != "ao.onboarding.first_session_spawned" { + t.Fatalf("event names = %#v", []string{sink.events[0].Name, sink.events[1].Name}) + } + if got := sink.events[0].Payload["duration_ms"]; got != int64(0) { + t.Fatalf("spawn duration_ms = %#v, want 0 with fixed clock", got) + } + if got := sink.events[1].Payload["since_first_project_ms"]; got != int64(2000) { + t.Fatalf("since_first_project_ms = %#v, want 2000", got) + } +} + +func TestSpawnFailedEmitsDuration(t *testing.T) { + st := newFakeStore() + st.projects["mer"] = domain.ProjectRecord{ID: "mer"} + sink := &fakeTelemetrySink{} + fc := &fakeCommander{spawnErr: errors.New("boom")} + now := time.Unix(200, 0).UTC() + svc := NewWithDeps(Deps{ + Manager: fc, + Store: st, + Telemetry: sink, + Clock: func() time.Time { + v := now + now = now.Add(1500 * time.Millisecond) + return v + }, + }) + + if _, err := svc.Spawn(context.Background(), ports.SpawnConfig{ProjectID: "mer"}); err == nil { + t.Fatal("Spawn should fail") + } + if len(sink.events) != 1 || sink.events[0].Name != "ao.session.spawn_failed" { + t.Fatalf("events = %#v, want one spawn_failed", sink.events) + } + if got := sink.events[0].Payload["duration_ms"]; got != int64(1500) { + t.Fatalf("spawn_failed duration_ms = %#v, want 1500", got) + } +} + func TestSpawnEmitsTelemetryOnSuccess(t *testing.T) { st := newFakeStore() st.projects["mer"] = domain.ProjectRecord{ID: "mer"} + st.sessions["old-1"] = domain.SessionRecord{ID: "old-1", ProjectID: "other"} fc := &fakeCommander{} ts := &fakeTelemetrySink{} svc := NewWithDeps(Deps{Manager: fc, Store: st, Telemetry: ts, Clock: func() time.Time { return time.Unix(1700000000, 0).UTC() }}) From 99d4efd2d81a441c7a96a6a20e03cd711efa6f4f Mon Sep 17 00:00:00 2001 From: laxmanclo Date: Thu, 18 Jun 2026 13:14:18 +0530 Subject: [PATCH 5/8] feat(frontend): add renderer posthog telemetry --- frontend/package-lock.json | 102 +++++++++++++++--- frontend/package.json | 1 + frontend/src/main.ts | 2 + frontend/src/preload.ts | 4 + .../src/renderer/components/ShellTopbar.tsx | 3 + .../renderer/components/TelemetryBoundary.tsx | 39 +++++++ frontend/src/renderer/global.d.ts | 5 + frontend/src/renderer/lib/bridge.ts | 3 + frontend/src/renderer/lib/telemetry.ts | 82 ++++++++++++++ frontend/src/renderer/main.tsx | 11 +- frontend/src/renderer/routes/__root.tsx | 14 ++- frontend/src/renderer/routes/_shell.tsx | 16 ++- frontend/src/renderer/test/setup.ts | 3 + frontend/src/shared/telemetry.test.ts | 32 ++++++ frontend/src/shared/telemetry.ts | 50 +++++++++ frontend/vite.renderer.config.ts | 12 ++- 16 files changed, 357 insertions(+), 22 deletions(-) create mode 100644 frontend/src/renderer/components/TelemetryBoundary.tsx create mode 100644 frontend/src/renderer/lib/telemetry.ts create mode 100644 frontend/src/shared/telemetry.test.ts create mode 100644 frontend/src/shared/telemetry.ts diff --git a/frontend/package-lock.json b/frontend/package-lock.json index 63d6ce8c..9866c17a 100644 --- a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ -25,6 +25,7 @@ "clsx": "^2.1.1", "lucide-react": "^1.17.0", "openapi-fetch": "^0.17.0", + "posthog-js": "^1.390.2", "radix-ui": "^1.5.0", "react": "^19.2.7", "react-dom": "^19.2.7", @@ -3239,21 +3240,6 @@ "@emnapi/runtime": "^1.7.1" } }, - "node_modules/@noble/hashes": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/@noble/hashes/-/hashes-2.2.0.tgz", - "integrity": "sha512-IYqDGiTXab6FniAgnSdZwgWbomxpy9FtYvLKs7wCUs2a8RkITG+DFGO1DM9cr+E3/RgADRpFjrKVaJ1z6sjtEg==", - "dev": true, - "license": "MIT", - "optional": true, - "peer": true, - "engines": { - "node": ">= 20.19.0" - }, - "funding": { - "url": "https://paulmillr.com/funding/" - } - }, "node_modules/@nodelib/fs.scandir": { "version": "2.1.5", "resolved": "https://registry.npmjs.org/@nodelib/fs.scandir/-/fs.scandir-2.1.5.tgz", @@ -3711,6 +3697,21 @@ "node": ">=18" } }, + "node_modules/@posthog/core": { + "version": "1.35.1", + "resolved": "https://registry.npmjs.org/@posthog/core/-/core-1.35.1.tgz", + "integrity": "sha512-2a9JgJgR+Ow8lrUQHVZYXH9EgXskYDlpbgNW6UvtsVxp8pEEFD8PxjZMnzq73Dx3NhAwNUrnVpb8KeZzHAtoSA==", + "license": "MIT", + "dependencies": { + "@posthog/types": "^1.389.0" + } + }, + "node_modules/@posthog/types": { + "version": "1.390.0", + "resolved": "https://registry.npmjs.org/@posthog/types/-/types-1.390.0.tgz", + "integrity": "sha512-zMjK6nrUWhAlL8ECrM4WldvgawqdoAE5B0ys7eA0lCoWuyzfFoSyh6zYtEuBSDRyN7fQLSfNCK+mQH4ngOl7Zw==", + "license": "MIT" + }, "node_modules/@radix-ui/number": { "version": "1.1.2", "resolved": "https://registry.npmjs.org/@radix-ui/number/-/number-1.1.2.tgz", @@ -6325,6 +6326,13 @@ "@types/node": "*" } }, + "node_modules/@types/trusted-types": { + "version": "2.0.7", + "resolved": "https://registry.npmjs.org/@types/trusted-types/-/trusted-types-2.0.7.tgz", + "integrity": "sha512-ScaPdn1dQczgbl0QFTeTOmVHFULt394XJgOQNoyVhZ6r2vLnMLJfBPd53SB52T/3G36VI1/g2MZaX0cwDuXsfw==", + "license": "MIT", + "optional": true + }, "node_modules/@types/wrap-ansi": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/@types/wrap-ansi/-/wrap-ansi-3.0.0.tgz", @@ -7772,6 +7780,17 @@ "integrity": "sha512-UaXxwISYJPTr9hwQxMFYZ7kNhSXboMXP+Z3TRX6f1/NyaGPfuNUZOWP1pUEb75B2HjfklIYLVRfWiFZJyC6Npg==", "license": "MIT" }, + "node_modules/core-js": { + "version": "3.49.0", + "resolved": "https://registry.npmjs.org/core-js/-/core-js-3.49.0.tgz", + "integrity": "sha512-es1U2+YTtzpwkxVLwAFdSpaIMyQaq0PBgm3YD1W3Qpsn1NAmO3KSgZfu+oGSWVu6NvLHoHCV/aYcsE5wiB7ALg==", + "hasInstallScript": true, + "license": "MIT", + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/core-js" + } + }, "node_modules/cross-dirname": { "version": "0.1.0", "resolved": "https://registry.npmjs.org/cross-dirname/-/cross-dirname-0.1.0.tgz", @@ -8099,6 +8118,15 @@ "license": "MIT", "peer": true }, + "node_modules/dompurify": { + "version": "3.4.11", + "resolved": "https://registry.npmjs.org/dompurify/-/dompurify-3.4.11.tgz", + "integrity": "sha512-zhlUV12GsaRzMsf9q5M254YhA4+VuF0fG+QFqu6aYpoGlKtz+w8//jBcGVYBgQkR5GHjUomejY84AV+/uPbWdw==", + "license": "(MPL-2.0 OR Apache-2.0)", + "optionalDependencies": { + "@types/trusted-types": "^2.0.7" + } + }, "node_modules/eastasianwidth": { "version": "0.2.0", "resolved": "https://registry.npmjs.org/eastasianwidth/-/eastasianwidth-0.2.0.tgz", @@ -9053,6 +9081,12 @@ } } }, + "node_modules/fflate": { + "version": "0.4.8", + "resolved": "https://registry.npmjs.org/fflate/-/fflate-0.4.8.tgz", + "integrity": "sha512-FJqqoDBR00Mdj9ppamLa/Y7vxm+PRmNWA67N846RvsoYVMKB4q3y/de5PA7gUmRMYK/8CMz2GDZQmCRN1wBcWA==", + "license": "MIT" + }, "node_modules/filename-reserved-regex": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/filename-reserved-regex/-/filename-reserved-regex-2.0.0.tgz", @@ -11869,6 +11903,22 @@ "node": "^10 || ^12 || >=14" } }, + "node_modules/posthog-js": { + "version": "1.390.2", + "resolved": "https://registry.npmjs.org/posthog-js/-/posthog-js-1.390.2.tgz", + "integrity": "sha512-z1zh0mMokecCILXxabmo5Xag6uCVYEDhP2JnMYLNxwmKN7d7u1S94XYmhUTF3iyAo2JOg6c7n86mGaTXliz4MA==", + "license": "SEE LICENSE IN LICENSE", + "dependencies": { + "@posthog/core": "^1.35.1", + "@posthog/types": "^1.390.0", + "core-js": "^3.38.1", + "dompurify": "^3.3.2", + "fflate": "^0.4.8", + "preact": "^10.28.2", + "query-selector-shadow-dom": "^1.0.1", + "web-vitals": "^5.1.0" + } + }, "node_modules/postject": { "version": "1.0.0-alpha.6", "resolved": "https://registry.npmjs.org/postject/-/postject-1.0.0-alpha.6.tgz", @@ -11895,6 +11945,16 @@ "node": "^12.20.0 || >=14" } }, + "node_modules/preact": { + "version": "10.29.2", + "resolved": "https://registry.npmjs.org/preact/-/preact-10.29.2.tgz", + "integrity": "sha512-7tNmwg/7mzzAoB/8kSg6Hl37JraAZw3Z3A0JSY7VXlZwo82Xn0G7wKbNNs2qoF4ZEEsQGTwDAroNdqKs1ofJxQ==", + "license": "MIT", + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/preact" + } + }, "node_modules/prettier": { "version": "3.8.4", "resolved": "https://registry.npmjs.org/prettier/-/prettier-3.8.4.tgz", @@ -11993,6 +12053,12 @@ "node": ">=6" } }, + "node_modules/query-selector-shadow-dom": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/query-selector-shadow-dom/-/query-selector-shadow-dom-1.0.1.tgz", + "integrity": "sha512-lT5yCqEBgfoMYpf3F2xQRK7zEr1rhIIZuceDK6+xRkJQ4NMbHTwXqk4NkwDwQMNqXgG9r9fyHnzwNVs6zV5KRw==", + "license": "MIT" + }, "node_modules/queue-microtask": { "version": "1.2.3", "resolved": "https://registry.npmjs.org/queue-microtask/-/queue-microtask-1.2.3.tgz", @@ -13804,6 +13870,12 @@ "defaults": "^1.0.3" } }, + "node_modules/web-vitals": { + "version": "5.3.0", + "resolved": "https://registry.npmjs.org/web-vitals/-/web-vitals-5.3.0.tgz", + "integrity": "sha512-q6LWsLatGYZp5VGBIOvbTj6JBV2nOmC8KvWztXBmwJcfFAzhwKwbOxhUH306XY3CcaZDUlSmSuNPBsCn0bFu+g==", + "license": "Apache-2.0" + }, "node_modules/webidl-conversions": { "version": "8.0.1", "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-8.0.1.tgz", diff --git a/frontend/package.json b/frontend/package.json index f5ae6033..cb0822de 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -67,6 +67,7 @@ "clsx": "^2.1.1", "lucide-react": "^1.17.0", "openapi-fetch": "^0.17.0", + "posthog-js": "^1.390.2", "radix-ui": "^1.5.0", "react": "^19.2.7", "react-dom": "^19.2.7", diff --git a/frontend/src/main.ts b/frontend/src/main.ts index ca05ed6f..dd8183c6 100644 --- a/frontend/src/main.ts +++ b/frontend/src/main.ts @@ -9,6 +9,7 @@ import { pathToFileURL } from "node:url"; import { resolveDaemonLaunch } from "./shared/daemon-launch"; import { createListenPortScanner, defaultRunFilePath, parseRunFile } from "./shared/daemon-discovery"; import type { DaemonStatus } from "./shared/daemon-status"; +import { buildTelemetryBootstrap } from "./shared/telemetry"; // Globals injected at compile time by @electron-forge/plugin-vite. declare const MAIN_WINDOW_VITE_DEV_SERVER_URL: string | undefined; @@ -308,6 +309,7 @@ ipcMain.handle("daemon:getStatus", () => daemonStatus); ipcMain.handle("daemon:start", () => startDaemon()); ipcMain.handle("daemon:stop", () => stopDaemon()); ipcMain.handle("app:getVersion", () => app.getVersion()); +ipcMain.handle("telemetry:getBootstrap", () => buildTelemetryBootstrap(process.env, app.getVersion(), process.platform)); ipcMain.handle("app:chooseDirectory", async () => { const options: OpenDialogOptions = { properties: ["openDirectory"], diff --git a/frontend/src/preload.ts b/frontend/src/preload.ts index 2e762f2c..15d69fa9 100644 --- a/frontend/src/preload.ts +++ b/frontend/src/preload.ts @@ -1,5 +1,6 @@ import { contextBridge, ipcRenderer } from "electron"; import type { DaemonStatus } from "./shared/daemon-status"; +import type { TelemetryBootstrap } from "./shared/telemetry"; const api = { app: { @@ -18,6 +19,9 @@ const api = { }; }, }, + telemetry: { + getBootstrap: () => ipcRenderer.invoke("telemetry:getBootstrap") as Promise, + }, }; contextBridge.exposeInMainWorld("ao", api); diff --git a/frontend/src/renderer/components/ShellTopbar.tsx b/frontend/src/renderer/components/ShellTopbar.tsx index 63539287..13bf0c59 100644 --- a/frontend/src/renderer/components/ShellTopbar.tsx +++ b/frontend/src/renderer/components/ShellTopbar.tsx @@ -11,6 +11,7 @@ import { } from "../types/workspace"; import { useWorkspaceQuery, workspaceQueryKey } from "../hooks/useWorkspaceQuery"; import { spawnOrchestrator } from "../lib/spawn-orchestrator"; +import { captureRendererEvent, captureRendererException } from "../lib/telemetry"; import { useUiStore } from "../stores/ui-store"; import { cn } from "../lib/utils"; @@ -70,6 +71,7 @@ export function ShellTopbar() { const openOrchestrator = async () => { if (!projectId) return; + void captureRendererEvent("ao.renderer.orchestrator_open_requested", { project_id: projectId }); if (orchestrator) { void navigate({ to: "/projects/$projectId/sessions/$sessionId", @@ -86,6 +88,7 @@ export function ShellTopbar() { params: { projectId, sessionId }, }); } catch (error) { + void captureRendererException(error, { source: "orchestrator-open", project_id: projectId }); console.error("Failed to spawn orchestrator:", error); } finally { setIsSpawning(false); diff --git a/frontend/src/renderer/components/TelemetryBoundary.tsx b/frontend/src/renderer/components/TelemetryBoundary.tsx new file mode 100644 index 00000000..bf4a4c58 --- /dev/null +++ b/frontend/src/renderer/components/TelemetryBoundary.tsx @@ -0,0 +1,39 @@ +import React from "react"; +import { captureRendererException } from "../lib/telemetry"; + +type Props = { + children: React.ReactNode; +}; + +type State = { + hasError: boolean; +}; + +export class TelemetryBoundary extends React.Component { + state: State = { hasError: false }; + + static getDerivedStateFromError() { + return { hasError: true }; + } + + componentDidCatch(error: Error, info: React.ErrorInfo) { + void captureRendererException(error, { + source: "react-error-boundary", + component_stack: info.componentStack, + }); + } + + render() { + if (this.state.hasError) { + return ( +
+
+

The app hit an unexpected error.

+

Restart the app or check the daemon logs if this keeps happening.

+
+
+ ); + } + return this.props.children; + } +} diff --git a/frontend/src/renderer/global.d.ts b/frontend/src/renderer/global.d.ts index ca9ca402..c12f68b9 100644 --- a/frontend/src/renderer/global.d.ts +++ b/frontend/src/renderer/global.d.ts @@ -4,6 +4,11 @@ declare global { interface Window { ao?: AoBridge; } + + interface ImportMetaEnv { + readonly VITE_AO_POSTHOG_KEY?: string; + readonly VITE_AO_POSTHOG_HOST?: string; + } } export {}; diff --git a/frontend/src/renderer/lib/bridge.ts b/frontend/src/renderer/lib/bridge.ts index 6e907e13..d480fc5c 100644 --- a/frontend/src/renderer/lib/bridge.ts +++ b/frontend/src/renderer/lib/bridge.ts @@ -16,4 +16,7 @@ export const aoBridge: AoBridge = stop: async () => ({ state: "stopped" }), onStatus: () => () => undefined, }, + telemetry: { + getBootstrap: async () => null, + }, } satisfies AoBridge); diff --git a/frontend/src/renderer/lib/telemetry.ts b/frontend/src/renderer/lib/telemetry.ts new file mode 100644 index 00000000..71622bfd --- /dev/null +++ b/frontend/src/renderer/lib/telemetry.ts @@ -0,0 +1,82 @@ +import posthog from "posthog-js/dist/module.full.no-external"; +import { aoBridge } from "./bridge"; + +const POSTHOG_KEY = import.meta.env.VITE_AO_POSTHOG_KEY?.trim() ?? ""; +const POSTHOG_HOST = import.meta.env.VITE_AO_POSTHOG_HOST?.trim() || "https://us.i.posthog.com"; +const RELEASE_TAG = "2026-01-30"; + +let initPromise: Promise | null = null; +let errorHandlersBound = false; + +function normalizeException(reason: unknown): Error { + if (reason instanceof Error) return reason; + if (typeof reason === "string") return new Error(reason); + try { + return new Error(JSON.stringify(reason)); + } catch { + return new Error("Unknown renderer exception"); + } +} + +function bindErrorHandlers() { + if (errorHandlersBound) return; + errorHandlersBound = true; + window.addEventListener("error", (event) => { + posthog.captureException(event.error ?? new Error(event.message), { + source: "renderer", + filename: event.filename, + lineno: event.lineno, + colno: event.colno, + }); + }); + window.addEventListener("unhandledrejection", (event) => { + posthog.captureException(normalizeException(event.reason), { + source: "renderer", + unhandled: true, + }); + }); +} + +export async function initTelemetry(): Promise { + if (initPromise) return initPromise; + initPromise = (async () => { + if (!POSTHOG_KEY) return false; + const bootstrap = await aoBridge.telemetry.getBootstrap(); + if (!bootstrap) return false; + posthog.init(POSTHOG_KEY, { + api_host: POSTHOG_HOST, + defaults: RELEASE_TAG, + autocapture: false, + capture_pageview: false, + persistence: "localStorage", + }); + posthog.identify(bootstrap.distinctId, { + app_version: bootstrap.appVersion, + platform: bootstrap.platform, + surface: "renderer", + }); + posthog.register({ + app_version: bootstrap.appVersion, + platform: bootstrap.platform, + surface: "renderer", + build_mode: import.meta.env.DEV ? "dev" : "packaged", + }); + bindErrorHandlers(); + posthog.capture("ao.renderer.loaded"); + return true; + })().catch(() => false); + return initPromise; +} + +export async function captureRendererEvent(event: string, properties?: Record): Promise { + if (!(await initTelemetry())) return; + posthog.capture(event, properties); +} + +export async function captureRendererException( + error: unknown, + properties?: Record, +): Promise { + if (!(await initTelemetry())) return; + posthog.captureException(normalizeException(error), properties); +} diff --git a/frontend/src/renderer/main.tsx b/frontend/src/renderer/main.tsx index e612d6bc..48ce485a 100644 --- a/frontend/src/renderer/main.tsx +++ b/frontend/src/renderer/main.tsx @@ -6,8 +6,11 @@ import "@xterm/xterm/css/xterm.css"; import "./styles.css"; import { queryClient } from "./lib/query-client"; import { createAppRouter } from "./router"; +import { TelemetryBoundary } from "./components/TelemetryBoundary"; +import { initTelemetry } from "./lib/telemetry"; const router = createAppRouter(queryClient); +void initTelemetry(); declare module "@tanstack/react-router" { interface Register { @@ -17,8 +20,10 @@ declare module "@tanstack/react-router" { createRoot(document.getElementById("root") as HTMLElement).render( - - - + + + + + , ); diff --git a/frontend/src/renderer/routes/__root.tsx b/frontend/src/renderer/routes/__root.tsx index 66a53f18..8a836967 100644 --- a/frontend/src/renderer/routes/__root.tsx +++ b/frontend/src/renderer/routes/__root.tsx @@ -1,6 +1,8 @@ -import { createRootRouteWithContext, Outlet } from "@tanstack/react-router"; +import { createRootRouteWithContext, Outlet, useRouterState } from "@tanstack/react-router"; +import { useEffect } from "react"; import { TooltipProvider } from "../components/ui/tooltip"; import type { QueryClient } from "@tanstack/react-query"; +import { captureRendererEvent } from "../lib/telemetry"; export const Route = createRootRouteWithContext<{ queryClient: QueryClient; @@ -9,6 +11,16 @@ export const Route = createRootRouteWithContext<{ }); function RootComponent() { + const location = useRouterState({ select: (state) => state.location }); + + useEffect(() => { + void captureRendererEvent("ao.renderer.route_viewed", { + pathname: location.pathname, + search: location.searchStr, + hash: window.location.hash, + }); + }, [location.pathname, location.searchStr]); + return ( diff --git a/frontend/src/renderer/routes/_shell.tsx b/frontend/src/renderer/routes/_shell.tsx index fc276ee4..7eb54235 100644 --- a/frontend/src/renderer/routes/_shell.tsx +++ b/frontend/src/renderer/routes/_shell.tsx @@ -8,6 +8,7 @@ import { TitlebarNav } from "../components/TitlebarNav"; import { useDaemonStatus } from "../hooks/useDaemonStatus"; import { useWorkspaceQuery, workspaceQueryKey, workspaceQueryOptions } from "../hooks/useWorkspaceQuery"; import { apiClient, apiErrorMessage } from "../lib/api-client"; +import { captureRendererEvent, captureRendererException } from "../lib/telemetry"; import { ShellProvider } from "../lib/shell-context"; import { readStoredTheme, type Theme, useUiStore } from "../stores/ui-store"; import type { WorkspaceSummary } from "../types/workspace"; @@ -49,8 +50,13 @@ function ShellLayout() { const createProject = useCallback( async (input: { path: string }) => { + void captureRendererEvent("ao.renderer.project_add_requested"); const { data, error } = await apiClient.POST("/api/v1/projects", { body: { path: input.path } }); - if (error) throw new Error(apiErrorMessage(error)); + if (error) { + const failure = new Error(apiErrorMessage(error)); + void captureRendererException(failure, { source: "project-add" }); + throw failure; + } if (!data?.project) throw new Error("Project creation returned no project"); const workspace: WorkspaceSummary = { @@ -60,6 +66,7 @@ function ShellLayout() { type: "main", sessions: [], }; + void captureRendererEvent("ao.renderer.project_add_succeeded", { project_id: workspace.id }); updateWorkspaces((current) => [workspace, ...current.filter((item) => item.id !== workspace.id)]); void navigate({ to: "/projects/$projectId", params: { projectId: workspace.id } }); }, @@ -71,7 +78,12 @@ function ShellLayout() { const { error } = await apiClient.DELETE("/api/v1/projects/{id}", { params: { path: { id: projectId } }, }); - if (error) throw new Error(apiErrorMessage(error)); + if (error) { + const failure = new Error(apiErrorMessage(error)); + void captureRendererException(failure, { source: "project-remove", project_id: projectId }); + throw failure; + } + void captureRendererEvent("ao.renderer.project_removed", { project_id: projectId }); updateWorkspaces((current) => current.filter((item) => item.id !== projectId)); }, [updateWorkspaces], diff --git a/frontend/src/renderer/test/setup.ts b/frontend/src/renderer/test/setup.ts index 6ad62a0c..c4501999 100644 --- a/frontend/src/renderer/test/setup.ts +++ b/frontend/src/renderer/test/setup.ts @@ -61,4 +61,7 @@ window.ao = { stop: async () => ({ state: "stopped" }), onStatus: () => () => undefined, }, + telemetry: { + getBootstrap: async () => null, + }, }; diff --git a/frontend/src/shared/telemetry.test.ts b/frontend/src/shared/telemetry.test.ts new file mode 100644 index 00000000..8cb4323e --- /dev/null +++ b/frontend/src/shared/telemetry.test.ts @@ -0,0 +1,32 @@ +import { mkdtemp, readFile } from "node:fs/promises"; +import os from "node:os"; +import path from "node:path"; +import { afterEach, expect, test } from "vitest"; +import { buildTelemetryBootstrap, defaultDataDir, loadOrCreateTelemetryInstallId } from "./telemetry"; + +const tempDirs: string[] = []; + +afterEach(async () => { + await Promise.all(tempDirs.splice(0).map((dir) => import("node:fs/promises").then(({ rm }) => rm(dir, { recursive: true, force: true })))); +}); + +test("defaultDataDir prefers AO_DATA_DIR", () => { + expect(defaultDataDir("linux", { AO_DATA_DIR: "/tmp/custom" }, "/home/test")).toBe("/tmp/custom"); +}); + +test("loadOrCreateTelemetryInstallId persists a stable install id", async () => { + const dir = await mkdtemp(path.join(os.tmpdir(), "ao-telemetry-")); + tempDirs.push(dir); + + const first = await loadOrCreateTelemetryInstallId(dir); + const second = await loadOrCreateTelemetryInstallId(dir); + const stored = (await readFile(path.join(dir, "telemetry_install_id"), "utf8")).trim(); + + expect(first).toMatch(/^ins_/); + expect(second).toBe(first); + expect(stored).toBe(first); +}); + +test("buildTelemetryBootstrap returns null when no home dir is available", async () => { + await expect(buildTelemetryBootstrap({}, "1.2.3", "linux", "")).resolves.toBeNull(); +}); diff --git a/frontend/src/shared/telemetry.ts b/frontend/src/shared/telemetry.ts new file mode 100644 index 00000000..01357848 --- /dev/null +++ b/frontend/src/shared/telemetry.ts @@ -0,0 +1,50 @@ +import { mkdir, readFile, writeFile } from "node:fs/promises"; +import os from "node:os"; +import path from "node:path"; +import { randomUUID } from "node:crypto"; + +export type TelemetryBootstrap = { + distinctId: string; + appVersion: string; + platform: NodeJS.Platform; +}; + +export function defaultDataDir( + platform: NodeJS.Platform, + env: Record, + homeDir: string, +): string | null { + void platform; + if (env.AO_DATA_DIR) return env.AO_DATA_DIR; + if (!homeDir) return null; + return path.join(homeDir, ".ao", "data"); +} + +export async function loadOrCreateTelemetryInstallId(dataDir: string): Promise { + const file = path.join(dataDir, "telemetry_install_id"); + try { + const existing = (await readFile(file, "utf8")).trim(); + if (existing) return existing; + } catch { + // Create the id on first use. + } + await mkdir(dataDir, { recursive: true }); + const distinctId = `ins_${randomUUID()}`; + await writeFile(file, `${distinctId}\n`, { mode: 0o600 }); + return distinctId; +} + +export async function buildTelemetryBootstrap( + env: Record, + appVersion: string, + platform: NodeJS.Platform, + homeDir = os.homedir(), +): Promise { + const dataDir = defaultDataDir(platform, env, homeDir); + if (!dataDir) return null; + return { + distinctId: await loadOrCreateTelemetryInstallId(dataDir), + appVersion, + platform, + }; +} diff --git a/frontend/vite.renderer.config.ts b/frontend/vite.renderer.config.ts index 6aa56328..6c8118a8 100644 --- a/frontend/vite.renderer.config.ts +++ b/frontend/vite.renderer.config.ts @@ -8,6 +8,16 @@ import { TanStackRouterVite } from "@tanstack/router-plugin/vite"; import react from "@vitejs/plugin-react"; import tailwindcss from "@tailwindcss/vite"; +const POSTHOG_ORIGIN = (() => { + const configured = process.env.VITE_AO_POSTHOG_HOST?.trim(); + if (!configured) return ""; + try { + return new URL(configured).origin; + } catch { + return ""; + } +})(); + // CSP for the built renderer. The daemon is loopback-only, so network access is // pinned to 127.0.0.1 (REST + SSE over http, terminal mux over ws). Injected at // build time rather than written into index.html because the dev server needs @@ -18,7 +28,7 @@ const CONTENT_SECURITY_POLICY = [ "style-src 'self' 'unsafe-inline'", "img-src 'self' data:", "font-src 'self' data:", - "connect-src 'self' http://127.0.0.1:* ws://127.0.0.1:*", + ["connect-src", "'self'", "http://127.0.0.1:*", "ws://127.0.0.1:*", POSTHOG_ORIGIN].filter(Boolean).join(" "), "object-src 'none'", "base-uri 'self'", "frame-src 'none'", From 81050d08aee4c8e6d3d4d75b99ad6586096f57ca Mon Sep 17 00:00:00 2001 From: laxmanclo Date: Thu, 18 Jun 2026 13:17:44 +0530 Subject: [PATCH 6/8] feat(frontend): bundle posthog project defaults --- frontend/src/main.ts | 13 ++++++++++++- frontend/src/renderer/lib/telemetry.ts | 5 +++-- frontend/src/shared/posthog-config.ts | 2 ++ frontend/vite.renderer.config.ts | 3 ++- 4 files changed, 19 insertions(+), 4 deletions(-) create mode 100644 frontend/src/shared/posthog-config.ts diff --git a/frontend/src/main.ts b/frontend/src/main.ts index dd8183c6..112cb75a 100644 --- a/frontend/src/main.ts +++ b/frontend/src/main.ts @@ -9,6 +9,7 @@ import { pathToFileURL } from "node:url"; import { resolveDaemonLaunch } from "./shared/daemon-launch"; import { createListenPortScanner, defaultRunFilePath, parseRunFile } from "./shared/daemon-discovery"; import type { DaemonStatus } from "./shared/daemon-status"; +import { DEFAULT_POSTHOG_HOST, DEFAULT_POSTHOG_PROJECT_KEY } from "./shared/posthog-config"; import { buildTelemetryBootstrap } from "./shared/telemetry"; // Globals injected at compile time by @electron-forge/plugin-vite. @@ -147,6 +148,16 @@ function runFilePath(): string | null { return defaultRunFilePath(process.platform, process.env, os.homedir()); } +function daemonEnv(): NodeJS.ProcessEnv { + return { + ...process.env, + AO_TELEMETRY_EVENTS: process.env.AO_TELEMETRY_EVENTS ?? "on", + AO_TELEMETRY_REMOTE: process.env.AO_TELEMETRY_REMOTE ?? "posthog", + AO_TELEMETRY_POSTHOG_KEY: process.env.AO_TELEMETRY_POSTHOG_KEY ?? DEFAULT_POSTHOG_PROJECT_KEY, + AO_TELEMETRY_POSTHOG_HOST: process.env.AO_TELEMETRY_POSTHOG_HOST ?? DEFAULT_POSTHOG_HOST, + }; +} + function startDaemon(): DaemonStatus { if (daemonProcess) { return daemonStatus; @@ -187,7 +198,7 @@ function startDaemon(): DaemonStatus { // the whole group via killDaemon() reaches the daemon and any PTY children. const child = spawn(launch.command, launch.args, { cwd: launch.cwd, - env: process.env, + env: daemonEnv(), shell: launch.shell, detached: true, }); diff --git a/frontend/src/renderer/lib/telemetry.ts b/frontend/src/renderer/lib/telemetry.ts index 71622bfd..bbe123aa 100644 --- a/frontend/src/renderer/lib/telemetry.ts +++ b/frontend/src/renderer/lib/telemetry.ts @@ -1,8 +1,9 @@ import posthog from "posthog-js/dist/module.full.no-external"; import { aoBridge } from "./bridge"; +import { DEFAULT_POSTHOG_HOST, DEFAULT_POSTHOG_PROJECT_KEY } from "../../shared/posthog-config"; -const POSTHOG_KEY = import.meta.env.VITE_AO_POSTHOG_KEY?.trim() ?? ""; -const POSTHOG_HOST = import.meta.env.VITE_AO_POSTHOG_HOST?.trim() || "https://us.i.posthog.com"; +const POSTHOG_KEY = import.meta.env.VITE_AO_POSTHOG_KEY?.trim() || DEFAULT_POSTHOG_PROJECT_KEY; +const POSTHOG_HOST = import.meta.env.VITE_AO_POSTHOG_HOST?.trim() || DEFAULT_POSTHOG_HOST; const RELEASE_TAG = "2026-01-30"; let initPromise: Promise | null = null; diff --git a/frontend/src/shared/posthog-config.ts b/frontend/src/shared/posthog-config.ts new file mode 100644 index 00000000..07d9aef3 --- /dev/null +++ b/frontend/src/shared/posthog-config.ts @@ -0,0 +1,2 @@ +export const DEFAULT_POSTHOG_PROJECT_KEY = "phc_uXAqS8nokL2QLSGBZSEMHTUNVXsFeXu3SrcWG7fjEyVH"; +export const DEFAULT_POSTHOG_HOST = "https://us.i.posthog.com"; diff --git a/frontend/vite.renderer.config.ts b/frontend/vite.renderer.config.ts index 6c8118a8..21ef729b 100644 --- a/frontend/vite.renderer.config.ts +++ b/frontend/vite.renderer.config.ts @@ -7,9 +7,10 @@ import { fileURLToPath, URL } from "node:url"; import { TanStackRouterVite } from "@tanstack/router-plugin/vite"; import react from "@vitejs/plugin-react"; import tailwindcss from "@tailwindcss/vite"; +import { DEFAULT_POSTHOG_HOST } from "./src/shared/posthog-config"; const POSTHOG_ORIGIN = (() => { - const configured = process.env.VITE_AO_POSTHOG_HOST?.trim(); + const configured = process.env.VITE_AO_POSTHOG_HOST?.trim() || DEFAULT_POSTHOG_HOST; if (!configured) return ""; try { return new URL(configured).origin; From 4578d557d7b05e51fda8f6fa7f36f46de7b9ed0f Mon Sep 17 00:00:00 2001 From: laxmanclo Date: Thu, 18 Jun 2026 13:46:21 +0530 Subject: [PATCH 7/8] feat(telemetry): add canonical active-user event --- backend/internal/httpd/router.go | 12 ++++++++++++ backend/internal/httpd/telemetry_test.go | 10 ++++++++-- frontend/src/renderer/lib/telemetry.ts | 1 + 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/backend/internal/httpd/router.go b/backend/internal/httpd/router.go index e57c29c6..f23f9283 100644 --- a/backend/internal/httpd/router.go +++ b/backend/internal/httpd/router.go @@ -145,6 +145,18 @@ func mountTelemetry(r chi.Router, sink ports.EventSink) { "command_path": body.CommandPath, }, }) + sink.Emit(req.Context(), ports.TelemetryEvent{ + Name: "ao.app.active", + Source: "cli", + OccurredAt: time.Now().UTC(), + Level: ports.TelemetryLevelInfo, + RequestID: middleware.GetReqID(req.Context()), + Payload: map[string]any{ + "channel": "cli", + "command": body.Command, + "command_path": body.CommandPath, + }, + }) w.WriteHeader(http.StatusAccepted) }) r.Post("/internal/telemetry/cli-usage-error", func(w http.ResponseWriter, req *http.Request) { diff --git a/backend/internal/httpd/telemetry_test.go b/backend/internal/httpd/telemetry_test.go index b53326ff..00bfb012 100644 --- a/backend/internal/httpd/telemetry_test.go +++ b/backend/internal/httpd/telemetry_test.go @@ -24,8 +24,8 @@ func TestCLIInvokedRouteEmitsTelemetry(t *testing.T) { if rec.Code != http.StatusAccepted { t.Fatalf("status = %d, want 202", rec.Code) } - if len(sink.events) != 1 { - t.Fatalf("events = %d, want 1", len(sink.events)) + if len(sink.events) != 2 { + t.Fatalf("events = %d, want 2", len(sink.events)) } if sink.events[0].Name != "ao.cli.invoked" { t.Fatalf("event name = %q, want ao.cli.invoked", sink.events[0].Name) @@ -33,6 +33,12 @@ func TestCLIInvokedRouteEmitsTelemetry(t *testing.T) { if got := sink.events[0].Payload["command_path"]; got != "ao status" { t.Fatalf("command_path = %#v, want ao status", got) } + if sink.events[1].Name != "ao.app.active" { + t.Fatalf("second event name = %q, want ao.app.active", sink.events[1].Name) + } + if got := sink.events[1].Payload["channel"]; got != "cli" { + t.Fatalf("channel = %#v, want cli", got) + } } func TestCLIInvokedRouteRequiresLoopback(t *testing.T) { diff --git a/frontend/src/renderer/lib/telemetry.ts b/frontend/src/renderer/lib/telemetry.ts index bbe123aa..1e6044c5 100644 --- a/frontend/src/renderer/lib/telemetry.ts +++ b/frontend/src/renderer/lib/telemetry.ts @@ -63,6 +63,7 @@ export async function initTelemetry(): Promise { build_mode: import.meta.env.DEV ? "dev" : "packaged", }); bindErrorHandlers(); + posthog.capture("ao.app.active", { channel: "renderer" }); posthog.capture("ao.renderer.loaded"); return true; })().catch(() => false); From c0fb42e424e6e4b965e300484ae1dedae233fd24 Mon Sep 17 00:00:00 2001 From: laxmanclo Date: Thu, 18 Jun 2026 15:49:51 +0530 Subject: [PATCH 8/8] fix(telemetry): repair cli test expectations --- backend/internal/adapters/telemetry/fanout.go | 3 +++ .../adapters/telemetry/localsqlite.go | 3 +++ backend/internal/adapters/telemetry/noop.go | 2 ++ .../internal/adapters/telemetry/posthog.go | 3 +++ backend/internal/cli/session_test.go | 22 +++++++++++++++---- backend/internal/cli/spawn_test.go | 4 ++-- backend/internal/config/config.go | 4 +++- backend/internal/lifecycle/manager.go | 4 ++-- backend/internal/ports/telemetry.go | 8 +++++-- 9 files changed, 42 insertions(+), 11 deletions(-) diff --git a/backend/internal/adapters/telemetry/fanout.go b/backend/internal/adapters/telemetry/fanout.go index 52e4b8d7..7bc10366 100644 --- a/backend/internal/adapters/telemetry/fanout.go +++ b/backend/internal/adapters/telemetry/fanout.go @@ -12,6 +12,7 @@ type FanoutSink struct { sinks []ports.EventSink } +// NewFanoutSink builds a sink that forwards each event to every non-nil sink. func NewFanoutSink(sinks ...ports.EventSink) *FanoutSink { filtered := make([]ports.EventSink, 0, len(sinks)) for _, sink := range sinks { @@ -22,12 +23,14 @@ func NewFanoutSink(sinks ...ports.EventSink) *FanoutSink { return &FanoutSink{sinks: filtered} } +// Emit forwards the event to each configured sink. func (s *FanoutSink) Emit(ctx context.Context, ev ports.TelemetryEvent) { for _, sink := range s.sinks { sink.Emit(ctx, ev) } } +// Close closes every configured sink and joins any returned errors. func (s *FanoutSink) Close(ctx context.Context) error { var errs []error for _, sink := range s.sinks { diff --git a/backend/internal/adapters/telemetry/localsqlite.go b/backend/internal/adapters/telemetry/localsqlite.go index 5876baf0..53b7f7f8 100644 --- a/backend/internal/adapters/telemetry/localsqlite.go +++ b/backend/internal/adapters/telemetry/localsqlite.go @@ -40,6 +40,7 @@ type LocalSQLiteSink struct { lastPrune time.Time } +// NewLocalSQLiteSink starts a buffered SQLite-backed telemetry sink. func NewLocalSQLiteSink(store localStore, log *slog.Logger) *LocalSQLiteSink { s := &LocalSQLiteSink{ store: store, @@ -53,6 +54,7 @@ func NewLocalSQLiteSink(store localStore, log *slog.Logger) *LocalSQLiteSink { return s } +// Emit enqueues an event for best-effort persistence. func (s *LocalSQLiteSink) Emit(_ context.Context, ev ports.TelemetryEvent) { select { case s.ch <- ev: @@ -61,6 +63,7 @@ func (s *LocalSQLiteSink) Emit(_ context.Context, ev ports.TelemetryEvent) { } } +// Close drains the worker until completion or context cancellation. func (s *LocalSQLiteSink) Close(ctx context.Context) error { s.closeOnce.Do(func() { close(s.ch) }) done := make(chan struct{}) diff --git a/backend/internal/adapters/telemetry/noop.go b/backend/internal/adapters/telemetry/noop.go index afd54c76..66bba392 100644 --- a/backend/internal/adapters/telemetry/noop.go +++ b/backend/internal/adapters/telemetry/noop.go @@ -9,6 +9,8 @@ import ( // NoopSink discards every event. type NoopSink struct{} +// Emit discards the event. func (NoopSink) Emit(context.Context, ports.TelemetryEvent) {} +// Close is a no-op. func (NoopSink) Close(context.Context) error { return nil } diff --git a/backend/internal/adapters/telemetry/posthog.go b/backend/internal/adapters/telemetry/posthog.go index 2b98d51b..8096de21 100644 --- a/backend/internal/adapters/telemetry/posthog.go +++ b/backend/internal/adapters/telemetry/posthog.go @@ -39,6 +39,7 @@ type PostHogSink struct { closeOnce sync.Once } +// NewPostHogSink starts a buffered PostHog exporter with a stable install ID. func NewPostHogSink(dataDir, apiKey, host string, client postHogClient, log *slog.Logger) (*PostHogSink, error) { if strings.TrimSpace(apiKey) == "" { return nil, fmt.Errorf("posthog api key is required") @@ -66,6 +67,7 @@ func NewPostHogSink(dataDir, apiKey, host string, client postHogClient, log *slo return s, nil } +// Emit enqueues an event for best-effort export. func (s *PostHogSink) Emit(_ context.Context, ev ports.TelemetryEvent) { select { case s.ch <- ev: @@ -74,6 +76,7 @@ func (s *PostHogSink) Emit(_ context.Context, ev ports.TelemetryEvent) { } } +// Close drains the exporter until completion or context cancellation. func (s *PostHogSink) Close(ctx context.Context) error { s.closeOnce.Do(func() { close(s.ch) }) done := make(chan struct{}) diff --git a/backend/internal/cli/session_test.go b/backend/internal/cli/session_test.go index d7aa2f51..5c8696b6 100644 --- a/backend/internal/cli/session_test.go +++ b/backend/internal/cli/session_test.go @@ -17,14 +17,28 @@ type sessionRequestLog struct { requests []string } -func (l *sessionRequestLog) append(r *http.Request) { - l.mu.Lock() - defer l.mu.Unlock() +const cliInvokedRequest = "POST /internal/telemetry/cli-invoked" + +func requestLogEntry(r *http.Request) string { entry := r.Method + " " + r.URL.Path if r.URL.RawQuery != "" { entry += "?" + r.URL.RawQuery } - l.requests = append(l.requests, entry) + return entry +} + +func appendPrimaryRequest(dst *[]string, r *http.Request) { + entry := requestLogEntry(r) + if entry == cliInvokedRequest { + return + } + *dst = append(*dst, entry) +} + +func (l *sessionRequestLog) append(r *http.Request) { + l.mu.Lock() + defer l.mu.Unlock() + appendPrimaryRequest(&l.requests, r) } func (l *sessionRequestLog) all() []string { diff --git a/backend/internal/cli/spawn_test.go b/backend/internal/cli/spawn_test.go index 850b298f..2abfcfaa 100644 --- a/backend/internal/cli/spawn_test.go +++ b/backend/internal/cli/spawn_test.go @@ -45,7 +45,7 @@ func TestSpawnClaimPRWiring(t *testing.T) { cfg := setConfigEnv(t) var requests []string srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - requests = append(requests, r.Method+" "+r.URL.Path) + appendPrimaryRequest(&requests, r) w.Header().Set("Content-Type", "application/json") switch { case r.Method == http.MethodGet && r.URL.Path == "/api/v1/projects/demo": @@ -84,7 +84,7 @@ func TestSpawnClaimPRFailureRollsBackSession(t *testing.T) { var requests []string sessions := map[string]bool{} srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - requests = append(requests, r.Method+" "+r.URL.Path) + appendPrimaryRequest(&requests, r) w.Header().Set("Content-Type", "application/json") switch { case r.Method == http.MethodGet && r.URL.Path == "/api/v1/projects/demo": diff --git a/backend/internal/config/config.go b/backend/internal/config/config.go index 56a4200e..dd94193d 100644 --- a/backend/internal/config/config.go +++ b/backend/internal/config/config.go @@ -41,7 +41,9 @@ const ( type TelemetryRemote string const ( - TelemetryRemoteOff TelemetryRemote = "off" + // TelemetryRemoteOff disables remote telemetry export. + TelemetryRemoteOff TelemetryRemote = "off" + // TelemetryRemotePostHog exports allowlisted events to PostHog. TelemetryRemotePostHog TelemetryRemote = "posthog" ) diff --git a/backend/internal/lifecycle/manager.go b/backend/internal/lifecycle/manager.go index 30128b8c..c5342647 100644 --- a/backend/internal/lifecycle/manager.go +++ b/backend/internal/lifecycle/manager.go @@ -159,7 +159,7 @@ func (m *Manager) ApplyActivitySignal(ctx context.Context, id domain.SessionID, SessionDisplayName: next.DisplayName, } } - waitingEvents := m.waitingInputEvents(rec, next, prevState, prevAt, now) + waitingEvents := m.waitingInputEvents(next, prevState, prevAt, now) m.mu.Unlock() for _, ev := range waitingEvents { m.emitTelemetry(ctx, ev) @@ -168,7 +168,7 @@ func (m *Manager) ApplyActivitySignal(ctx context.Context, id domain.SessionID, return nil } -func (m *Manager) waitingInputEvents(prev, next domain.SessionRecord, prevState domain.ActivityState, prevAt, now time.Time) []ports.TelemetryEvent { +func (m *Manager) waitingInputEvents(next domain.SessionRecord, prevState domain.ActivityState, prevAt, now time.Time) []ports.TelemetryEvent { if m.telemetry == nil { return nil } diff --git a/backend/internal/ports/telemetry.go b/backend/internal/ports/telemetry.go index 622376b8..7c58e9db 100644 --- a/backend/internal/ports/telemetry.go +++ b/backend/internal/ports/telemetry.go @@ -11,9 +11,13 @@ import ( type TelemetryLevel string const ( + // TelemetryLevelDebug marks verbose diagnostic events. TelemetryLevelDebug TelemetryLevel = "debug" - TelemetryLevelInfo TelemetryLevel = "info" - TelemetryLevelWarn TelemetryLevel = "warn" + // TelemetryLevelInfo marks normal operational events. + TelemetryLevelInfo TelemetryLevel = "info" + // TelemetryLevelWarn marks degraded but non-fatal events. + TelemetryLevelWarn TelemetryLevel = "warn" + // TelemetryLevelError marks failed operations. TelemetryLevelError TelemetryLevel = "error" )