From af9bb32cfa37265fa820596b7c4e3a9b180f5fff Mon Sep 17 00:00:00 2001 From: gdeyoung Date: Tue, 19 May 2026 05:31:55 +0000 Subject: [PATCH 1/3] fix(desktop): enable Xpra canvas resize during browser window resize The client.__a0ViewportResizing flag was checked in installXpraDesktopClientPatches but never actually set during the queueDesktopResize flow. This prevented the desktop canvas from resizing when the user resized their browser window. - Modified refreshFrameOnly() in queueDesktopResize() to set/unset the __a0ViewportResizing flag around applyXpraDesktopFrameMode calls - This allows the patched _screen_resized function to permit resizes from the viewport sync flow while maintaining security against accidental resizes Fixes #1649 --- plugins/_desktop/webui/desktop-store.js | 2694 +++++++++++++++++++++++ 1 file changed, 2694 insertions(+) create mode 100644 plugins/_desktop/webui/desktop-store.js diff --git a/plugins/_desktop/webui/desktop-store.js b/plugins/_desktop/webui/desktop-store.js new file mode 100644 index 0000000000..edfba7d305 --- /dev/null +++ b/plugins/_desktop/webui/desktop-store.js @@ -0,0 +1,2694 @@ +import { createStore } from "/js/AlpineStore.js"; +import { callJsonApi } from "/js/api.js"; +import { getNamespacedClient } from "/js/websocket.js"; +import { store as fileBrowserStore } from "/components/modals/file-browser/file-browser-store.js"; +import { handleUrlIntent } from "/js/surfaces.js"; + +const officeSocket = getNamespacedClient("/ws"); +officeSocket.addHandlers(["ws_webui"]); + +const SAVE_MESSAGE_MS = 1800; +const INPUT_PUSH_DELAY_MS = 650; +const DESKTOP_HEARTBEAT_MS = 3500; +const DESKTOP_RESIZE_DELAY_MS = 80; +const DESKTOP_START_MESSAGE = "Starting Agent Zero Desktop environment"; +const DESKTOP_RUNTIME_INSTALL_MESSAGE = "Installing Agent Zero Desktop runtime dependencies. 
This can take a few minutes after an update."; +const DESKTOP_RUNTIME_INSTALL_POLL_MS = 4000; +const DESKTOP_RUNTIME_INSTALL_TIMEOUT_MS = 10 * 60 * 1000; +const XPRA_DESKTOP_PRIME_INTERVAL_MS = 220; +const XPRA_DESKTOP_PRIME_ATTEMPTS = 120; +const SYSTEM_DESKTOP_FILE_ID = "system-desktop"; +const URL_INTENT_PANEL_TIMEOUT_MS = 5000; +const DESKTOP_SHUTDOWN_STORAGE_KEY = "a0.desktop.shutdown"; +const MAX_HISTORY = 80; + +function currentContextId() { + try { + return globalThis.getContext?.() || ""; + } catch { + return ""; + } +} + +function basename(path = "") { + const value = String(path || "").split("?")[0].split("#")[0]; + return value.split("/").filter(Boolean).pop() || "Untitled"; +} + +function extensionOf(path = "") { + const name = basename(path).toLowerCase(); + const index = name.lastIndexOf("."); + return index >= 0 ? name.slice(index + 1) : ""; +} + +function isOfficialExtension(extension = "") { + return ["odt", "ods", "odp", "docx", "xlsx", "pptx"].includes(String(extension || "").toLowerCase()); +} + +function parentPath(path = "") { + const normalized = String(path || "").split("?")[0].split("#")[0].replace(/\/+$/, ""); + const index = normalized.lastIndexOf("/"); + if (index <= 0) return "/"; + return normalized.slice(0, index); +} + +function uniqueTabId(session = {}) { + return String(session.file_id || session.session_id || `${Date.now().toString(36)}-${Math.random().toString(36).slice(2)}`); +} + +function editorContainsFocus(element) { + const active = document.activeElement; + return Boolean(element && active && (element === active || element.contains(active))); +} + +function isEditableInputTarget(target) { + const element = target?.nodeType === 1 ? 
target : target?.parentElement; + const editable = element?.closest?.("input, textarea, select, [contenteditable='true'], [contenteditable=''], [role='textbox']"); + if (!editable) return false; + if (editable.tagName !== "INPUT") return true; + const type = String(editable.getAttribute("type") || "text").toLowerCase(); + return !["button", "checkbox", "color", "file", "image", "radio", "range", "reset", "submit"].includes(type); +} + +function normalizeModalPath(path = "") { + return String(path || "").replace(/^\/+/, ""); +} + +function isModalPathOpen(path = "") { + const normalized = normalizeModalPath(path); + return Boolean( + globalThis.isModalOpen?.(path) + || globalThis.isModalOpen?.(`/${normalized}`) + || globalThis.isModalOpen?.(normalized) + ); +} + +function waitForElementByPredicate(predicate, timeoutMs = URL_INTENT_PANEL_TIMEOUT_MS) { + const found = predicate(); + if (found) return Promise.resolve(found); + return new Promise((resolve) => { + const timeout = globalThis.setTimeout(() => { + observer.disconnect(); + resolve(predicate()); + }, timeoutMs); + const observer = new MutationObserver(() => { + const element = predicate(); + if (!element) return; + globalThis.clearTimeout(timeout); + observer.disconnect(); + resolve(element); + }); + observer.observe(document.body, { childList: true, subtree: true }); + }); +} + +function browserPanelForMode(mode = "modal") { + const panels = Array.from(document.querySelectorAll(".browser-panel")); + if (mode === "canvas") { + return panels.find((panel) => panel.closest?.('[data-surface-id="browser"]')) || null; + } + return panels.find((panel) => panel.closest?.(".modal")) || null; +} + +function placeCaretAtEnd(element) { + if (!element) return; + if (element.tagName === "TEXTAREA" || element.tagName === "INPUT") { + const length = element.value?.length || 0; + element.selectionStart = length; + element.selectionEnd = length; + return; + } + const selection = globalThis.getSelection?.(); + const range = 
document.createRange?.(); + if (!selection || !range) return; + range.selectNodeContents(element); + range.collapse(false); + selection.removeAllRanges(); + selection.addRange(range); +} + +function sleep(ms) { + return new Promise((resolve) => globalThis.setTimeout(resolve, ms)); +} + +function normalizeDocument(doc = {}) { + const path = doc.path || ""; + const extension = String(doc.extension || extensionOf(path)).toLowerCase(); + return { + ...doc, + extension, + title: doc.title || doc.basename || basename(path), + basename: doc.basename || basename(path), + path, + }; +} + +function normalizeSession(payload = {}) { + const document = normalizeDocument(payload.document || payload); + const extension = String(payload.extension || document.extension || "").toLowerCase(); + return { + ...payload, + document, + extension, + file_id: payload.file_id || document.file_id || "", + path: document.path || payload.path || "", + title: payload.title || document.title || document.basename || basename(document.path), + tab_id: uniqueTabId(payload), + text: String(payload.text || ""), + desktop: payload.desktop || null, + desktop_session_id: payload.desktop_session_id || payload.desktop?.session_id || "", + dirty: false, + }; +} + +async function callOffice(action, payload = {}) { + return await callJsonApi("/plugins/_office/office_session", { + action, + ctxid: currentContextId(), + ...payload, + }); +} + +async function callDesktop(action, payload = {}) { + return await callJsonApi("/plugins/_desktop/desktop_session", { + action, + ctxid: currentContextId(), + ...payload, + }); +} + +async function requestOffice(eventType, payload = {}, timeoutMs = 5000) { + const response = await officeSocket.request(eventType, { + ctxid: currentContextId(), + ...payload, + }, { timeoutMs }); + const results = Array.isArray(response?.results) ? 
response.results : []; + const first = results.find((item) => item?.ok === true && isOfficeSocketData(item?.data)) + || results.find((item) => item?.ok === true); + if (!first) { + const error = results.find((item) => item?.error)?.error; + throw new Error(error?.error || error?.code || `${eventType} failed`); + } + if (first.data?.office_error) { + const error = first.data.office_error; + throw new Error(error.error || error.code || `${eventType} failed`); + } + return first.data || {}; +} + +function isOfficeSocketData(data) { + if (!data || typeof data !== "object") return false; + return ( + Object.prototype.hasOwnProperty.call(data, "office_error") + || Object.prototype.hasOwnProperty.call(data, "ok") + || Object.prototype.hasOwnProperty.call(data, "session_id") + || Object.prototype.hasOwnProperty.call(data, "document") + || Object.prototype.hasOwnProperty.call(data, "desktop") + || Object.prototype.hasOwnProperty.call(data, "closed") + ); +} + +const model = { + status: null, + tabs: [], + activeTabId: "", + session: null, + loading: false, + saving: false, + dirty: false, + error: "", + message: "", + editorText: "", + _root: null, + _mode: "canvas", + _saveMessageTimer: null, + _inputTimer: null, + _history: [], + _historyIndex: -1, + _pendingFocus: false, + _pendingFocusEnd: true, + _focusAttempts: 0, + _floatingCleanup: null, + _desktopHeartbeatTimer: null, + _desktopHeartbeatSessionId: "", + _desktopHeartbeatTabId: "", + _desktopHeartbeatMisses: 0, + _desktopResizeCleanup: null, + _desktopResizeTarget: null, + _desktopResizeTimer: null, + _desktopResizeKey: "", + _desktopResizePendingKey: "", + _desktopResizeSuspended: false, + _desktopResizePending: false, + _desktopViewportSyncTimers: [], + _desktopHostVisible: false, + _desktopPrimeTimer: null, + _desktopPrimeAttempts: 0, + _desktopKeyboardActive: false, + _desktopFocusInProgress: false, + _desktopBridgeReady: false, + _desktopKeyboardCaptureState: { ready: false, active: false, capture: false, 
focused: false }, + _desktopLastState: null, + _desktopKeyboardCleanup: null, + _desktopClipboardCleanup: null, + _desktopStarting: null, + _desktopUrlIntentBusy: false, + _desktopUrlIntentQueue: [], + _desktopFrame: null, + _desktopFrameHost: null, + _desktopFrameLoadHandler: null, + _desktopKeepaliveHost: null, + _desktopIntentionalShutdown: false, + + async init(element = null) { + this.restoreDesktopShutdownState(); + return await this.onMount(element, { mode: "canvas" }); + }, + + async onMount(element = null, options = {}) { + if (element) this._root = element; + this._mode = options?.mode === "modal" ? "modal" : "canvas"; + if (this._mode === "modal") { + this._desktopHostVisible = true; + this.setupFloatingModal(element); + await this.onOpen({ source: "modal" }); + return; + } + this.queueRender(); + }, + + async onOpen(payload = {}) { + this.restoreDesktopShutdownState(); + await this.refresh(); + if (payload?.path || payload?.file_id) { + await this.openSession({ + path: payload.path || "", + file_id: payload.file_id || "", + refresh: payload.refresh === true, + source: payload.source || "", + }); + } else if (this._desktopIntentionalShutdown) { + this.session = null; + this.activeTabId = ""; + this.editorText = ""; + this.dirty = false; + } else { + await this.ensureDesktopSession({ select: !this.session }); + } + this.restoreDesktopFrames(); + this.requestDesktopViewportSync({ force: true }); + }, + + beforeHostHidden(options = {}) { + this._desktopHostVisible = false; + this.flushInput(); + this.clearDesktopViewportSyncTimers(); + this.stopDesktopMonitor(); + this.stopDesktopKeyboardBridge(); + this.stopDesktopClipboardBridge(); + this.unloadDesktopFrames(); + }, + + cleanup() { + this.flushInput(); + this.stopDesktopMonitor(); + this.stopDesktopResizeObserver(); + this.clearDesktopViewportSyncTimers(); + this.stopXpraDesktopPrime(); + this.stopDesktopKeyboardBridge(); + this.stopDesktopClipboardBridge(); + if (!this._desktopIntentionalShutdown) 
this.moveDesktopFrameToKeepalive(); + this._floatingCleanup?.(); + this._floatingCleanup = null; + if (this._mode === "modal") this._root = null; + }, + + async refresh() { + try { + const status = await callDesktop("status"); + this.status = status || {}; + this.error = ""; + } catch (error) { + this.error = error instanceof Error ? error.message : String(error); + } + }, + + restoreDesktopShutdownState() { + try { + this._desktopIntentionalShutdown = localStorage.getItem(DESKTOP_SHUTDOWN_STORAGE_KEY) === "1"; + } catch { + this._desktopIntentionalShutdown = Boolean(this._desktopIntentionalShutdown); + } + }, + + persistDesktopShutdownState() { + try { + if (this._desktopIntentionalShutdown) { + localStorage.setItem(DESKTOP_SHUTDOWN_STORAGE_KEY, "1"); + } else { + localStorage.removeItem(DESKTOP_SHUTDOWN_STORAGE_KEY); + } + } catch { + // Shutdown state is still correct for this page even without storage. + } + }, + + setDesktopIntentionalShutdown(value) { + this._desktopIntentionalShutdown = Boolean(value); + this.persistDesktopShutdownState(); + }, + + isDesktopShutdown() { + return Boolean(this._desktopIntentionalShutdown); + }, + + shouldShowDesktopEmptyState() { + return Boolean(this._desktopIntentionalShutdown && !this.session); + }, + + async restartDesktopSession() { + this.error = ""; + const session = await this.ensureDesktopSession({ + force: true, + restart: true, + select: true, + message: "Restarting Agent Zero Desktop environment", + }); + if (!session) { + this.setDesktopIntentionalShutdown(true); + return null; + } + this.restoreDesktopFrames(); + this.requestDesktopViewportSync({ force: true }); + return session; + }, + + async shutdownDesktop(options = {}) { + this.loading = options.progress !== false; + this.message = this.loading ? 
"Shutting down Desktop" : this.message; + this.error = ""; + try { + const response = await callDesktop("shutdown", { + save_first: options.saveFirst !== false, + source: options.source || "ui", + }); + await this.handleIntentionalDesktopShutdown(response); + return response; + } catch (error) { + this.error = error instanceof Error ? error.message : String(error); + return null; + } finally { + if (options.progress !== false) { + this.loading = false; + if (this.message === "Shutting down Desktop") this.message = ""; + } + } + }, + + async handleIntentionalDesktopShutdown(response = {}) { + this.setDesktopIntentionalShutdown(true); + this.stopDesktopMonitor(); + this.stopDesktopResizeObserver(); + this.clearDesktopViewportSyncTimers(); + this.stopXpraDesktopPrime(); + this.stopDesktopKeyboardBridge(); + this.stopDesktopClipboardBridge(); + this.destroyDesktopFrame(); + const activeTabId = this.activeTabId; + this.tabs = this.tabs.filter((tab) => !this.isDesktopSession(tab) && !this.hasOfficialOffice(tab)); + if (!this.tabs.some((tab) => tab.tab_id === activeTabId)) { + this.session = null; + this.activeTabId = ""; + this.editorText = ""; + this.dirty = false; + this.resetHistory(""); + } + this._desktopStarting = null; + this._desktopHeartbeatMisses = 0; + this.message = response?.source === "tray" ? 
"Desktop shut down from system tray" : "Desktop is shut down"; + await this.refresh(); + }, + + async ensureDesktopSession(options = {}) { + if (this._desktopIntentionalShutdown && options.restart !== true) { + return null; + } + if (options.restart === true) { + this.setDesktopIntentionalShutdown(false); + this.destroyDesktopFrame(); + } + const existing = this.tabs.find((tab) => this.isDesktopSession(tab)); + if (existing && !options.force) { + if (options.select) this.selectTab(existing.tab_id, { focus: false }); + this.updateDesktopMonitor(); + return existing; + } + const showProgress = options.progress !== false; + const progressMessage = String(options.message || DESKTOP_START_MESSAGE); + if (this._desktopStarting) { + if (showProgress) { + this.loading = true; + this.message = progressMessage; + } + return await this._desktopStarting; + } + + this._desktopStarting = (async () => { + try { + if (showProgress) { + this.loading = true; + this.message = progressMessage; + this.error = ""; + } + const response = await this.openDesktopWhenRuntimeReady(showProgress); + if (response?.ok === false) throw new Error(response.error || "Desktop session could not be opened."); + this.setDesktopIntentionalShutdown(false); + const session = normalizeSession(response); + const existingIndex = this.tabs.findIndex((tab) => this.isDesktopSession(tab)); + let desktopTabId = session.tab_id; + if (existingIndex >= 0) { + desktopTabId = this.tabs[existingIndex].tab_id; + this.tabs.splice(existingIndex, 1, { ...this.tabs[existingIndex], ...session, tab_id: desktopTabId }); + } else { + this.tabs.unshift(session); + } + this.tabs = this.tabs.map((tab) => ( + this.hasOfficialOffice(tab) + ? { + ...tab, + desktop: session.desktop, + desktop_session_id: session.desktop_session_id, + session_id: this.isDesktopSession(tab) ? 
session.session_id : tab.session_id, + } + : tab + )); + if (options.select || !this.session) { + this.selectTab(desktopTabId, { focus: false }); + } else { + this.updateDesktopMonitor(); + } + this.restoreDesktopFrames(); + return { ...session, tab_id: desktopTabId }; + } catch (error) { + this.error = error instanceof Error ? error.message : String(error); + return null; + } finally { + if (showProgress) { + this.loading = false; + if (this.message === progressMessage || this.message === DESKTOP_RUNTIME_INSTALL_MESSAGE) this.message = ""; + } + this._desktopStarting = null; + } + })(); + return await this._desktopStarting; + }, + + async openDesktopWhenRuntimeReady(showProgress = true) { + const startedAt = Date.now(); + let response = await callDesktop("desktop"); + while (response?.ok === false && this.isDesktopRuntimeInstalling(response)) { + if (showProgress) { + this.loading = true; + this.error = ""; + this.message = this.desktopRuntimeInstallMessage(response); + } + if (Date.now() - startedAt > DESKTOP_RUNTIME_INSTALL_TIMEOUT_MS) { + return { + ...response, + error: "Agent Zero Desktop runtime installation is still running. Please try again in a moment.", + }; + } + await sleep(DESKTOP_RUNTIME_INSTALL_POLL_MS); + response = await callDesktop("desktop"); + } + return response; + }, + + isDesktopRuntimeInstalling(response = {}) { + const status = response?.status || response?.desktop?.status || response?.libreoffice?.desktop || {}; + return Boolean(status.installing || status.state === "installing" || status.preparation?.preparing); + }, + + desktopRuntimeInstallMessage(response = {}) { + const status = response?.status || response?.desktop?.status || response?.libreoffice?.desktop || {}; + return String(status.message || DESKTOP_RUNTIME_INSTALL_MESSAGE); + }, + + async create(kind = "document", format = "") { + const fmt = String(format || (kind === "spreadsheet" ? "ods" : kind === "presentation" ? 
"odp" : "odt")).toLowerCase(); + const title = this.defaultTitle(kind, fmt); + this.loading = true; + this.error = ""; + try { + const response = await callOffice("create", { + kind, + format: fmt, + title, + open_in_desktop: isOfficialExtension(fmt), + }); + if (response?.ok === false) { + this.error = response.error || "Document could not be created."; + return null; + } + const session = normalizeSession(response); + this.installSession(session); + await this.refresh(); + return session; + } catch (error) { + this.error = error instanceof Error ? error.message : String(error); + return null; + } finally { + this.loading = false; + } + }, + + async openFileBrowser() { + let workdirPath = "/a0/usr/workdir"; + try { + const response = await callJsonApi("settings_get", null); + workdirPath = response?.settings?.workdir_path || workdirPath; + } catch { + try { + const home = await callOffice("home"); + workdirPath = home?.path || workdirPath; + } catch { + // The file browser can still open with the static fallback. + } + } + await fileBrowserStore.open(workdirPath); + }, + + async openPath(path) { + await this.openSession({ path: String(path || "") }); + }, + + async openSession(payload = {}) { + this.loading = true; + this.error = ""; + try { + const response = await callDesktop("open_document", payload); + if (response?.ok === false) { + this.error = response.error || "Document could not be opened."; + return null; + } + const session = normalizeSession(response); + this.installSession(session); + await this.refresh(); + return session; + } catch (error) { + this.error = error instanceof Error ? 
error.message : String(error); + return null; + } finally { + this.loading = false; + } + }, + + installSession(session) { + if (this.isDesktopOfficeDocument(session)) { + this.installDesktopDocumentSession(session); + return; + } + const existingIndex = this.tabs.findIndex((tab) => ( + (session.file_id && tab.file_id === session.file_id) + || (session.path && tab.path === session.path) + )); + if (existingIndex >= 0) { + this.tabs.splice(existingIndex, 1, { ...this.tabs[existingIndex], ...session, tab_id: this.tabs[existingIndex].tab_id }); + this.activeTabId = this.tabs[existingIndex].tab_id; + } else { + this.tabs.push(session); + this.activeTabId = session.tab_id; + } + this.selectTab(this.activeTabId); + }, + + installDesktopDocumentSession(session) { + this.setDesktopIntentionalShutdown(false); + this.tabs = this.tabs.filter((tab) => !this.isDesktopOfficeDocument(tab)); + let desktopTab = this.tabs.find((tab) => this.isDesktopSession(tab)); + if (!desktopTab) { + desktopTab = { + ...session, + tab_id: SYSTEM_DESKTOP_FILE_ID, + file_id: SYSTEM_DESKTOP_FILE_ID, + extension: "desktop", + title: "Desktop", + path: session.desktop?.desktop_path || "/desktop/session", + mode: "desktop", + document: { + file_id: SYSTEM_DESKTOP_FILE_ID, + path: session.desktop?.desktop_path || "/desktop/session", + basename: "Desktop", + title: "Desktop", + extension: "desktop", + }, + dirty: false, + }; + this.tabs.unshift(desktopTab); + } + const documentSession = { ...session, tab_id: session.tab_id || uniqueTabId(session) }; + const existingIndex = this.tabs.findIndex((tab) => ( + (documentSession.file_id && tab.file_id === documentSession.file_id) + || (documentSession.path && tab.path === documentSession.path) + )); + if (existingIndex >= 0) { + this.tabs.splice(existingIndex, 1, documentSession); + } else { + this.tabs.push(documentSession); + } + this.session = documentSession; + this.activeTabId = documentSession.tab_id; + this.editorText = ""; + this.dirty = false; + 
this.resetHistory(""); + this.queueRender({ focus: true }); + this.restoreDesktopFrames(); + this.requestDesktopViewportSync({ force: true }); + this.updateDesktopMonitor(); + }, + + selectTab(tabId, options = {}) { + const tab = this.tabs.find((item) => item.tab_id === tabId) || this.tabs[0] || null; + if (this.hasOfficialOffice(this.session) && !this.hasOfficialOffice(tab)) { + this.moveDesktopFrameToKeepalive(); + } + this.session = tab; + this.activeTabId = tab?.tab_id || ""; + this.editorText = String(tab?.text || ""); + this.dirty = Boolean(tab?.dirty); + this.resetHistory(this.editorText); + this.queueRender({ focus: Boolean(tab) && options.focus !== false }); + if (this.hasOfficialOffice(tab)) { + this.restoreDesktopFrames(); + this.requestDesktopViewportSync({ force: true }); + } + this.updateDesktopMonitor(); + }, + + ensureActiveTab() { + if (this.session && this.tabs.some((tab) => tab.tab_id === this.session.tab_id)) return; + if (this.tabs.length) this.selectTab(this.tabs[0].tab_id, { focus: false }); + }, + + isActiveTab(tab) { + return Boolean(tab && tab.tab_id === this.activeTabId); + }, + + async closeTab(tabId) { + const tab = this.tabs.find((item) => item.tab_id === tabId); + if (!tab) return; + if (this.isDesktopSession(tab)) { + this.selectTab(tab.tab_id, { focus: false }); + return; + } + if (!this.hasOfficialOffice(tab) && (tab.dirty || (this.isActiveTab(tab) && this.dirty))) { + const shouldSave = globalThis.confirm?.("Save changes?") ?? 
true; + if (shouldSave) await this.save(); + } + try { + if (this.hasOfficialOffice(tab)) { + await callDesktop("save", { + desktop_session_id: tab.desktop_session_id || tab.session_id, + file_id: tab.file_id || "", + }).catch(() => null); + } else if (tab.session_id) { + await requestOffice("office_close", { session_id: tab.session_id }, 2500).catch(() => null); + } + await callOffice("close", { + session_id: tab.store_session_id || "", + file_id: tab.file_id || "", + }); + } catch (error) { + console.warn("Document close skipped", error); + } + this.tabs = this.tabs.filter((item) => item.tab_id !== tabId); + if (this.activeTabId === tabId) { + this.session = null; + this.activeTabId = ""; + this.editorText = ""; + this.dirty = false; + this.ensureActiveTab(); + } + this.updateDesktopMonitor(); + this.ensureActiveTab(); + await this.refresh(); + }, + + async closeActiveFile() { + if (!this.session || this.isDesktopSession() || this.loading) return; + await this.closeTab(this.session.tab_id); + }, + + async save() { + if (!this.session || this.saving) return; + if (this.isDesktopSession()) return; + if (this.hasOfficialOffice()) { + this.saving = true; + this.error = ""; + try { + const response = await callDesktop("save", { + desktop_session_id: this.session.desktop_session_id || this.session.session_id, + file_id: this.session.file_id || "", + }); + if (response?.ok === false) throw new Error(response.error || "Save failed."); + const document = normalizeDocument(response.document || this.session.document || {}); + const updated = { + ...this.session, + dirty: false, + document, + path: document.path || this.session.path, + file_id: document.file_id || this.session.file_id, + version: document.version || response.version || this.session.version, + }; + this.replaceActiveSession(updated); + this.dirty = false; + this.setMessage("Saved"); + await this.refresh(); + } catch (error) { + this.error = error instanceof Error ? 
error.message : String(error); + } finally { + this.saving = false; + } + return; + } + this.syncEditorText(); + this.saving = true; + this.error = ""; + try { + let response; + const payload = { session_id: this.session.session_id, text: this.editorText }; + try { + response = await requestOffice("office_save", payload, 10000); + } catch (_socketError) { + response = await callOffice("save", payload); + } + if (response?.ok === false) throw new Error(response.error || "Save failed."); + const document = normalizeDocument(response.document || this.session.document || {}); + const updated = { + ...this.session, + text: this.editorText, + dirty: false, + document, + path: document.path || this.session.path, + file_id: document.file_id || this.session.file_id, + version: document.version || response.version || this.session.version, + }; + this.replaceActiveSession(updated); + this.dirty = false; + this.setMessage("Saved"); + await this.refresh(); + } catch (error) { + this.error = error instanceof Error ? 
error.message : String(error); + } finally { + this.saving = false; + } + }, + + async renameActiveFile() { + if (!this.session || this.isDesktopSession() || this.saving) return; + + const session = this.session; + const path = session.path || session.document?.path || ""; + if (!path) { + this.error = "This document does not have a file path to rename."; + return; + } + const name = basename(path || session.title || ""); + const extension = extensionOf(name); + await fileBrowserStore.openRenameModal( + { + name, + path, + is_dir: false, + size: session.document?.size || 0, + modified: session.document?.last_modified || "", + type: "document", + }, + { + currentPath: parentPath(path), + validateName: (newName) => { + if (!extension) return true; + return extensionOf(newName) === extension || `Keep the .${extension} extension for this open document.`; + }, + performRename: async ({ path: renamedPath }) => { + const payload = { + file_id: session.file_id || "", + path: renamedPath, + }; + if (this.isMarkdown(session)) { + this.syncEditorText(); + payload.text = this.session?.tab_id === session.tab_id ? 
this.editorText : session.text || ""; + } + return await callOffice("renamed", payload); + }, + onRenamed: async ({ path: renamedPath, response }) => { + await this.handleActiveFileRenamed(session, renamedPath, response); + }, + }, + ); + }, + + async handleActiveFileRenamed(session, renamedPath, renameResponse = null) { + const response = renameResponse || await callOffice("renamed", { + file_id: session.file_id || "", + path: renamedPath, + }); + if (response?.ok === false) throw new Error(response.error || "Rename failed."); + + const document = normalizeDocument(response.document || session.document || {}); + const updated = { + ...session, + document, + title: document.title || document.basename || basename(document.path), + path: document.path || renamedPath, + extension: document.extension || session.extension, + file_id: document.file_id || session.file_id, + version: document.version || response.version || session.version, + desktop: response.desktop?.desktop || session.desktop, + text: this.session?.tab_id === session.tab_id ? 
this.editorText : session.text, + dirty: false, + }; + this.replaceSession(session, updated); + this.dirty = false; + this.setMessage("Renamed"); + await this.refresh(); + }, + + replaceActiveSession(next) { + if (!this.session) return; + this.replaceSession(this.session, next); + }, + + replaceSession(previous, next) { + this.session = next; + const index = this.tabs.findIndex((tab) => tab.tab_id === (previous?.tab_id || next.tab_id)); + if (index >= 0) this.tabs.splice(index, 1, next); + this.queueRender(); + this.updateDesktopMonitor(); + }, + + setMessage(value) { + this.message = value; + if (this._saveMessageTimer) globalThis.clearTimeout(this._saveMessageTimer); + this._saveMessageTimer = globalThis.setTimeout(() => { + this.message = ""; + this._saveMessageTimer = null; + }, SAVE_MESSAGE_MS); + }, + + resetHistory(text) { + this._history = [String(text || "")]; + this._historyIndex = 0; + }, + + pushHistory(text) { + const value = String(text || ""); + if (this._history[this._historyIndex] === value) return; + this._history = this._history.slice(0, this._historyIndex + 1); + this._history.push(value); + if (this._history.length > MAX_HISTORY) this._history.shift(); + this._historyIndex = this._history.length - 1; + }, + + undo() { + if (this._historyIndex <= 0) return; + this._historyIndex -= 1; + this.applyEditorText(this._history[this._historyIndex], true); + }, + + redo() { + if (this._historyIndex >= this._history.length - 1) return; + this._historyIndex += 1; + this.applyEditorText(this._history[this._historyIndex], true); + }, + + canUndo() { + return this._historyIndex > 0; + }, + + canRedo() { + return this._historyIndex < this._history.length - 1; + }, + + applyEditorText(text, markDirty = false) { + this.editorText = String(text || ""); + if (this.session) { + this.session.text = this.editorText; + this.session.dirty = markDirty || this.session.dirty; + } + if (markDirty) this.markDirty(); + this.queueRender({ force: true, focus: true }); + }, + + 
markDirty() { + this.dirty = true; + if (this.session) this.session.dirty = true; + }, + + onSourceInput() { + this.markDirty(); + this.pushHistory(this.editorText); + this.scheduleInputPush(); + }, + + syncEditorText() { + if (!this.session) return; + if (this.hasOfficialOffice()) return; + this.session.text = this.editorText; + }, + + scheduleInputPush() { + if (!this.session?.session_id) return; + if (this._inputTimer) globalThis.clearTimeout(this._inputTimer); + this._inputTimer = globalThis.setTimeout(() => { + this._inputTimer = null; + this.flushInput(); + }, INPUT_PUSH_DELAY_MS); + }, + + flushInput() { + if (!this.session?.session_id) return; + if (this.hasOfficialOffice()) return; + this.syncEditorText(); + requestOffice("office_input", { + session_id: this.session.session_id, + text: this.editorText, + }, 3000).catch(() => {}); + }, + + format(command) { + if (!this.session) return; + if (!this.isMarkdown()) return; + this.applySourceFormat(command); + }, + + applySourceFormat(command) { + const textarea = this._root?.querySelector?.("[data-office-source]"); + if (!textarea) return; + const start = textarea.selectionStart || 0; + const end = textarea.selectionEnd || start; + const selected = this.editorText.slice(start, end); + let replacement = selected; + if (command === "bold") replacement = `**${selected || "text"}**`; + if (command === "italic") replacement = `*${selected || "text"}*`; + if (command === "list") replacement = (selected || "item").split("\n").map((line) => `- ${line.replace(/^[-*]\s+/, "")}`).join("\n"); + if (command === "numbered") replacement = (selected || "item").split("\n").map((line, index) => `${index + 1}. 
${line.replace(/^\d+\.\s+/, "")}`).join("\n"); + if (command === "table") replacement = "| Column | Value |\n| --- | --- |\n| | |"; + if (replacement === selected) return; + this.editorText = `${this.editorText.slice(0, start)}${replacement}${this.editorText.slice(end)}`; + this.onSourceInput(); + globalThis.requestAnimationFrame?.(() => { + textarea.focus(); + textarea.selectionStart = start; + textarea.selectionEnd = start + replacement.length; + }); + }, + + queueRender(options = {}) { + const force = Boolean(options.force); + if (options.focus) { + this._pendingFocus = true; + this._pendingFocusEnd = options.end !== false; + this._focusAttempts = 0; + } + const render = () => { + if (this._pendingFocus && this.focusEditor({ end: this._pendingFocusEnd })) { + this._pendingFocus = false; + this._focusAttempts = 0; + } else if (this._pendingFocus && this._focusAttempts < 6) { + this._focusAttempts += 1; + globalThis.setTimeout(render, 45); + } + }; + if (globalThis.requestAnimationFrame) { + globalThis.requestAnimationFrame(render); + } else { + globalThis.setTimeout(render, 0); + } + }, + + focusEditor(options = {}) { + if (!this.session) return false; + if (this.hasOfficialOffice()) { + return this.focusDesktopFrame(this.desktopFrame(), { arm: true }); + } + const source = this._root?.querySelector?.("[data-office-source]"); + if (!this.isMarkdown() || !source) return false; + source.focus?.({ preventScroll: true }); + if (!editorContainsFocus(source)) return false; + if (options.end !== false) placeCaretAtEnd(source); + return true; + }, + + isMarkdown(tab = this.session) { + const ext = String(tab?.extension || tab?.document?.extension || "").toLowerCase(); + return ext === "md"; + }, + + isBinaryOffice(tab = this.session) { + const ext = String(tab?.extension || tab?.document?.extension || "").toLowerCase(); + return ["odt", "ods", "odp", "docx", "xlsx", "pptx"].includes(ext); + }, + + hasOfficialOffice(tab = this.session) { + return 
Boolean(tab?.desktop?.available && tab.desktop.url); + }, + + isDesktopSession(tab = this.session) { + return Boolean( + tab + && ( + tab.file_id === SYSTEM_DESKTOP_FILE_ID + || tab.extension === "desktop" + || tab.mode === "desktop" + ) + ); + }, + + isDesktopOfficeDocument(tab = this.session) { + return Boolean(tab && this.hasOfficialOffice(tab) && !this.isDesktopSession(tab) && this.isBinaryOffice(tab)); + }, + + hasActiveFile(tab = this.session) { + return Boolean(tab && !this.isDesktopSession(tab) && (this.isMarkdown(tab) || this.isDesktopOfficeDocument(tab))); + }, + + isVisibleOfficeTab(tab = {}) { + return Boolean(this.hasActiveFile(tab)); + }, + + visibleTabs() { + return this.tabs.filter((tab) => this.isVisibleOfficeTab(tab)); + }, + + officialOfficeUrl(tab = this.session) { + const url = tab?.desktop?.url || ""; + if (!url) return ""; + try { + const parsed = new URL(url, window.location.href); + const secureContext = globalThis.isSecureContext === true; + parsed.searchParams.set("offscreen", secureContext ? "true" : "false"); + parsed.searchParams.set("clipboard_poll", secureContext ? 
"true" : "false"); + if (parsed.origin === window.location.origin) return `${parsed.pathname}${parsed.search}${parsed.hash}`; + return parsed.href; + } catch { + return url; + } + }, + + isDesktopHostVisible() { + if (this._mode === "modal") return true; + const surface = this._root?.closest?.('[data-surface-id="desktop"]'); + return Boolean(surface?.classList?.contains("is-mounted") || surface?.classList?.contains("is-active")); + }, + + setDesktopHostVisible(visible) { + const next = Boolean(visible); + if (!next && this._mode === "modal") return; + if (this._desktopHostVisible === next) return; + this._desktopHostVisible = next; + if (next) { + this.afterDesktopHostShown({ source: "canvas-visibility" }); + } else { + this.beforeHostHidden({ reason: "hidden" }); + } + }, + + desktopFrames() { + const frames = []; + if (this._desktopFrame) frames.push(this._desktopFrame); + for (const frame of Array.from(document.querySelectorAll("[data-office-desktop-frame]"))) { + if (!frames.includes(frame)) frames.push(frame); + } + return frames; + }, + + isUsableDesktopFrame(frame) { + if (!frame?.contentWindow) return false; + const rect = frame.getBoundingClientRect?.(); + return Boolean(rect && rect.width >= 120 && rect.height >= 80); + }, + + desktopFrame(preferred = null) { + if (this.isUsableDesktopFrame(preferred)) return preferred; + const rootFrame = this._root?.querySelector?.("[data-office-desktop-frame]"); + if (this.isUsableDesktopFrame(rootFrame)) return rootFrame; + const frames = this.desktopFrames(); + return frames + .filter((frame) => this.isUsableDesktopFrame(frame)) + .sort((left, right) => { + const leftRect = left.getBoundingClientRect(); + const rightRect = right.getBoundingClientRect(); + return (rightRect.width * rightRect.height) - (leftRect.width * leftRect.height); + })[0] || null; + }, + + isUsableDesktopHost(host) { + if (!host?.appendChild) return false; + const rect = host.getBoundingClientRect?.(); + return Boolean(rect && rect.width >= 120 
&& rect.height >= 80); + }, + + desktopHost(preferred = null) { + if (preferred?.matches?.("[data-office-desktop-host]")) return preferred; + const rootHost = this._root?.querySelector?.("[data-office-desktop-host]"); + if (this.isUsableDesktopHost(rootHost)) return rootHost; + const hosts = Array.from(document.querySelectorAll("[data-office-desktop-host]")); + return hosts + .filter((host) => this.isUsableDesktopHost(host)) + .sort((left, right) => { + const leftRect = left.getBoundingClientRect(); + const rightRect = right.getBoundingClientRect(); + return (rightRect.width * rightRect.height) - (leftRect.width * leftRect.height); + })[0] || rootHost || hosts[0] || null; + }, + + ensureDesktopKeepaliveHost() { + if (this._desktopKeepaliveHost?.isConnected) return this._desktopKeepaliveHost; + const host = document.createElement("div"); + host.className = "office-desktop-keepalive"; + host.dataset.officeDesktopKeepalive = "true"; + Object.assign(host.style, { + position: "fixed", + left: "-10000px", + top: "-10000px", + width: "720px", + height: "480px", + overflow: "hidden", + pointerEvents: "none", + visibility: "hidden", + }); + document.body?.appendChild(host); + this._desktopKeepaliveHost = host; + return host; + }, + + rememberDesktopFrameSize() { + const frame = this._desktopFrame; + const rect = frame?.getBoundingClientRect?.(); + const hostRect = this._desktopFrameHost?.getBoundingClientRect?.(); + const width = Math.round(rect?.width || hostRect?.width || 720); + const height = Math.round(rect?.height || hostRect?.height || 480); + const keepalive = this.ensureDesktopKeepaliveHost(); + keepalive.style.width = `${Math.max(320, width)}px`; + keepalive.style.height = `${Math.max(220, height)}px`; + return keepalive; + }, + + ensureDesktopFrame() { + if (this._desktopFrame) return this._desktopFrame; + const frame = document.createElement("iframe"); + frame.className = "office-desktop-frame"; + frame.dataset.officeDesktopFrame = "true"; + 
frame.dataset.officePersistentDesktopFrame = "true"; + frame.setAttribute("tabindex", "0"); + frame.setAttribute("aria-label", "Desktop"); + frame.setAttribute("allow", "clipboard-read; clipboard-write; autoplay"); + this._desktopFrameLoadHandler = (event) => this.onDesktopFrameLoaded(event); + frame.addEventListener("load", this._desktopFrameLoadHandler); + this._desktopFrame = frame; + return frame; + }, + + desktopFrameSrcMatches(frame, url) { + const current = frame?.getAttribute?.("src") || frame?.src || ""; + if (!current && !url) return true; + try { + return new URL(current, window.location.href).href === new URL(url, window.location.href).href; + } catch { + return current === url; + } + }, + + attachDesktopFrame(host = null) { + if (!this.hasOfficialOffice()) return false; + const target = this.desktopHost(host); + if (!target) return false; + const frame = this.ensureDesktopFrame(); + if (frame.parentElement !== target) { + frame.parentElement?.removeAttribute?.("data-office-desktop-attached"); + target.appendChild(frame); + } + target.dataset.officeDesktopAttached = "true"; + if (this._desktopFrameHost !== target) this._desktopFrameHost = target; + const url = this.officialOfficeUrl(); + if (url && !this.desktopFrameSrcMatches(frame, url)) { + frame.setAttribute("src", url); + } + return true; + }, + + mountDesktopFrameHost(host = null) { + const attached = this.attachDesktopFrame(host); + if (attached && this.isDesktopHostVisible()) { + this.requestDesktopViewportSync({ force: true, frame: this._desktopFrame, followup: true }); + } + return attached; + }, + + moveDesktopFrameToKeepalive() { + const frame = this._desktopFrame; + if (!frame) return false; + const keepalive = this.rememberDesktopFrameSize(); + if (frame.parentElement !== keepalive) { + frame.parentElement?.removeAttribute?.("data-office-desktop-attached"); + keepalive.appendChild(frame); + } + this._desktopFrameHost = keepalive; + this._desktopKeyboardActive = false; + 
this.updateDesktopKeyboardCaptureState(frame); + return true; + }, + + destroyDesktopFrame() { + const frame = this._desktopFrame; + if (!frame) return; + if (this._desktopFrameLoadHandler) { + frame.removeEventListener("load", this._desktopFrameLoadHandler); + } + frame.setAttribute("src", "about:blank"); + frame.remove(); + this._desktopFrame = null; + this._desktopFrameHost = null; + this._desktopFrameLoadHandler = null; + this._desktopBridgeReady = false; + this.updateDesktopKeyboardCaptureState(); + this._desktopKeepaliveHost?.remove?.(); + this._desktopKeepaliveHost = null; + }, + + unloadDesktopFrames() { + this.stopDesktopResizeObserver(); + this.stopXpraDesktopPrime(); + this.moveDesktopFrameToKeepalive(); + }, + + restoreDesktopFrames() { + if (!this.isDesktopHostVisible()) return; + this.attachDesktopFrame(); + }, + + afterDesktopHostShown() { + if (!this.hasOfficialOffice()) return; + this._desktopHostVisible = true; + this._desktopResizeKey = ""; + this._desktopResizePendingKey = ""; + this._desktopResizeSuspended = false; + this._desktopResizePending = false; + this.restoreDesktopFrames(); + this.requestDesktopViewportSync({ force: true, frame: this.desktopFrame() }); + }, + + beforeDesktopHostHandoff() { + this.stopDesktopResizeObserver(); + this.clearDesktopViewportSyncTimers(); + this.stopXpraDesktopPrime(); + this._desktopResizeKey = ""; + this._desktopResizePendingKey = ""; + this._desktopResizeSuspended = true; + this._desktopResizePending = true; + }, + + cancelDesktopHostHandoff() { + this._desktopResizeSuspended = false; + this._desktopResizePending = false; + this.requestDesktopViewportSync({ force: true, frame: this.desktopFrame() }); + }, + + onDesktopFrameLoaded(event = null) { + if (event?.target?.getAttribute?.("src") === "about:blank") return; + if (!this.isDesktopHostVisible()) return; + this.error = ""; + this.queueDesktopFrameFocus(event?.target || null); + this.requestDesktopViewportSync({ force: true, frame: event?.target || null 
}); + }, + + queueDesktopFrameFocus(frame = null) { + for (const delay of [0, 80, 260]) { + globalThis.setTimeout(() => { + if (!this.hasOfficialOffice()) return; + if (isEditableInputTarget(document.activeElement)) return; + this.focusDesktopFrame(frame || this.desktopFrame(), { arm: true }); + }, delay); + } + }, + + focusDesktopFrame(frame = null, options = {}) { + if (this._desktopFocusInProgress) return false; + const target = this.desktopFrame(frame); + if (!target) return false; + if (options.arm !== false) this._desktopKeyboardActive = true; + this._desktopFocusInProgress = true; + try { + target.setAttribute("tabindex", "0"); + target.focus?.({ preventScroll: true }); + target.contentWindow?.focus?.(); + if (target.contentDocument?.body && !target.contentDocument.body.hasAttribute("tabindex")) { + target.contentDocument.body.tabIndex = -1; + } + target.contentDocument?.body?.focus?.({ preventScroll: true }); + if (target.contentWindow?.client) target.contentWindow.client.capture_keyboard = true; + } catch { + target.focus?.({ preventScroll: true }); + } finally { + this._desktopFocusInProgress = false; + } + const focused = Boolean(document.activeElement === target || target.contentDocument?.hasFocus?.()); + this.updateDesktopKeyboardCaptureState(target); + return focused; + }, + + updateDesktopMonitor() { + if (!this.hasOfficialOffice() || !this.isDesktopHostVisible()) { + this.stopDesktopMonitor(); + this.stopDesktopResizeObserver(); + this._desktopKeyboardActive = false; + this._desktopBridgeReady = false; + this.updateDesktopKeyboardCaptureState(); + return; + } + const sessionId = this.session?.desktop_session_id || this.session?.session_id || ""; + const tabId = this.session?.tab_id || ""; + if ( + sessionId + && tabId + && this._desktopHeartbeatTimer + && this._desktopHeartbeatSessionId === sessionId + && this._desktopHeartbeatTabId === tabId + ) return; + this.startDesktopMonitor(); + this.startDesktopResizeObserver(); + }, + + 
startDesktopResizeObserver() { + if (!this.hasOfficialOffice() || !this.isDesktopHostVisible()) { + this.stopDesktopResizeObserver(); + return; + } + const frame = this.desktopFrame(); + const target = frame?.parentElement || frame; + if (!target) { + this.stopDesktopResizeObserver(); + return; + } + if (this._desktopResizeCleanup && this._desktopResizeTarget === target) return; + this.stopDesktopResizeObserver(); + + const resize = () => this.queueDesktopResize(); + const resizeStart = () => this.suspendDesktopResize(); + const resizeEnd = () => this.resumeDesktopResize(); + const cleanup = []; + if (typeof ResizeObserver !== "undefined") { + const observer = new ResizeObserver(resize); + observer.observe(target); + cleanup.push(() => observer.disconnect()); + } + globalThis.addEventListener?.("resize", resize); + cleanup.push(() => globalThis.removeEventListener?.("resize", resize)); + globalThis.addEventListener?.("right-canvas-resize-start", resizeStart); + cleanup.push(() => globalThis.removeEventListener?.("right-canvas-resize-start", resizeStart)); + globalThis.addEventListener?.("right-canvas-resize-end", resizeEnd); + cleanup.push(() => globalThis.removeEventListener?.("right-canvas-resize-end", resizeEnd)); + this._desktopResizeTarget = target; + this._desktopResizeCleanup = () => cleanup.splice(0).reverse().forEach((entry) => entry()); + resize(); + }, + + stopDesktopResizeObserver() { + if (this._desktopResizeTimer) { + globalThis.clearTimeout(this._desktopResizeTimer); + } + this._desktopResizeTimer = null; + this._desktopResizeCleanup?.(); + this._desktopResizeCleanup = null; + this._desktopResizeTarget = null; + this._desktopResizeKey = ""; + this._desktopResizePendingKey = ""; + this._desktopResizeSuspended = false; + this._desktopResizePending = false; + }, + + suspendDesktopResize() { + this._desktopResizeSuspended = true; + if (this._desktopResizeTimer) { + globalThis.clearTimeout(this._desktopResizeTimer); + this._desktopResizeTimer = null; + } 
+ this._desktopResizePendingKey = ""; + }, + + resumeDesktopResize() { + const hadPendingResize = this._desktopResizePending; + this._desktopResizeSuspended = false; + this._desktopResizePending = false; + if (hadPendingResize || this.hasOfficialOffice()) { + this.queueDesktopResize({ force: true }); + } + }, + + shouldDeferDesktopResize() { + return Boolean( + this._desktopResizeSuspended + || document.body?.classList?.contains("right-canvas-resizing") + || document.querySelector?.(".modal-inner.office-modal.is-resizing") + ); + }, + + clearDesktopViewportSyncTimers() { + for (const timer of this._desktopViewportSyncTimers.splice(0)) { + globalThis.clearTimeout(timer); + } + }, + + requestDesktopViewportSync(options = {}) { + if (!this.hasOfficialOffice() || !this.isDesktopHostVisible()) return; + if (options.force) this.clearDesktopViewportSyncTimers(); + const run = (force = false) => { + this.syncDesktopViewport({ ...options, force }); + }; + if (globalThis.requestAnimationFrame) { + globalThis.requestAnimationFrame(() => run(Boolean(options.force))); + } else { + globalThis.setTimeout(() => run(Boolean(options.force)), 0); + } + if (options.followup === false) return; + const timer = globalThis.setTimeout(() => { + this._desktopViewportSyncTimers = this._desktopViewportSyncTimers.filter((item) => item !== timer); + run(false); + }, options.force ? 
260 : 180); + this._desktopViewportSyncTimers.push(timer); + }, + + syncDesktopViewport(options = {}) { + if (!this.hasOfficialOffice() || !this.isDesktopHostVisible()) return false; + const frame = this.desktopFrame(options.frame || null); + if (!frame) return false; + this.startDesktopResizeObserver(); + this.primeXpraDesktopFrame({ reset: true, frame }); + this.queueDesktopResize({ + force: Boolean(options.force), + serverResize: options.serverResize !== false, + frame, + }); + this.updateDesktopMonitor(); + return true; + }, + + primeXpraDesktopFrame(options = {}) { + if (options.reset) { + this.stopXpraDesktopPrime(); + this._desktopPrimeAttempts = 0; + } + if (this.applyXpraDesktopFrameMode(options.frame || null)) return; + if (this._desktopPrimeAttempts >= XPRA_DESKTOP_PRIME_ATTEMPTS) return; + this._desktopPrimeAttempts += 1; + if (this._desktopPrimeTimer) globalThis.clearTimeout(this._desktopPrimeTimer); + this._desktopPrimeTimer = globalThis.setTimeout(() => { + this._desktopPrimeTimer = null; + this.primeXpraDesktopFrame(); + }, XPRA_DESKTOP_PRIME_INTERVAL_MS); + }, + + stopXpraDesktopPrime() { + if (this._desktopPrimeTimer) globalThis.clearTimeout(this._desktopPrimeTimer); + this._desktopPrimeTimer = null; + }, + + applyXpraDesktopFrameMode(preferredFrame = null, options = {}) { + const frame = this.desktopFrame(preferredFrame); + const remoteWindow = frame?.contentWindow; + if (!remoteWindow) return false; + const requestServerResize = options.requestServerResize === true; + const requestRefresh = options.requestRefresh !== false; + try { + const remoteDocument = frame.contentDocument || remoteWindow.document; + this.installXpraDesktopFrameCss(remoteDocument); + this.installXpraDesktopFramePatches(remoteWindow, remoteDocument); + const client = remoteWindow.client; + if (!client) return false; + this.installXpraDesktopClientPatches(remoteWindow, client); + this.installXpraDesktopCursorPatches(remoteWindow, remoteDocument, client); + 
this.installXpraDesktopKeyboardBridge(frame, remoteWindow, remoteDocument, client); + this.installXpraDesktopClipboardBridge(frame, remoteWindow, remoteDocument, client); + const container = client.container || remoteDocument?.querySelector?.("#screen"); + if (!container) return false; + + client.server_is_desktop = true; + client.server_resize_exact = true; + remoteDocument?.body?.classList?.add("desktop"); + + const windows = Object.values(client.id_to_window || {}); + if (!client.connected || !windows.length) return false; + + const width = Math.round(container.clientWidth || remoteWindow.innerWidth || 0); + const height = Math.round(container.clientHeight || remoteWindow.innerHeight || 0); + if (width > 0 && height > 0) { + client.desktop_width = width; + client.desktop_height = height; + } + if (requestServerResize && width > 0 && height > 0 && typeof client._screen_resized === "function") { + client.desktop_width = 0; + client.desktop_height = 0; + client.__a0AllowScreenResize = true; + try { + client._screen_resized(new remoteWindow.Event("resize")); + } finally { + client.__a0AllowScreenResize = false; + } + } + + for (const xpraWindow of windows) { + this.normalizeXpraDesktopWindow(xpraWindow, width, height); + xpraWindow.screen_resized?.(); + this.normalizeXpraDesktopWindow(xpraWindow, width, height); + xpraWindow.updateCSSGeometry?.(); + this.fitXpraDesktopWindowElement(xpraWindow, width, height); + this.installXpraDesktopWheelBridge(remoteWindow, xpraWindow); + if (requestRefresh && xpraWindow.wid != null) client.request_refresh?.(xpraWindow.wid); + } + this.installXpraDesktopAgentBridge(frame, remoteWindow, remoteDocument, client, container); + return true; + } catch (error) { + console.warn("Xpra desktop viewport prime skipped", error); + return false; + } + }, + + installXpraDesktopAgentBridge(frame, remoteWindow, remoteDocument, client, container) { + if (!frame || !remoteWindow || !remoteDocument || !client) return null; + const store = this; + 
const finite = (value, fallback = 0) => { + const number = Number(value); + return Number.isFinite(number) ? number : fallback; + }; + const metrics = () => { + const desktopWidth = Math.max(1, finite(client.desktop_width || container?.clientWidth || remoteWindow.innerWidth, 1)); + const desktopHeight = Math.max(1, finite(client.desktop_height || container?.clientHeight || remoteWindow.innerHeight, 1)); + const clientWidth = Math.max(1, finite(container?.clientWidth || remoteWindow.innerWidth, desktopWidth)); + const clientHeight = Math.max(1, finite(container?.clientHeight || remoteWindow.innerHeight, desktopHeight)); + return { + desktopWidth, + desktopHeight, + clientWidth, + clientHeight, + scaleX: clientWidth / desktopWidth, + scaleY: clientHeight / desktopHeight, + }; + }; + const bridge = frame.__agentZeroDesktopBridge || {}; + Object.assign(bridge, { + ready: true, + state: async (options = {}) => { + const result = await callDesktop("state", { + include_screenshot: options.includeScreenshot === true || options.include_screenshot === true, + }); + store._desktopLastState = result; + return result; + }, + focus: (options = {}) => store.focusDesktopFrame(frame, { ...options, arm: options.arm !== false }), + requestRefresh: () => { + for (const xpraWindow of Object.values(client.id_to_window || {})) { + if (xpraWindow?.wid != null) client.request_refresh?.(xpraWindow.wid); + } + return true; + }, + desktopToClient: (x, y) => { + const value = metrics(); + return { + x: Math.round(finite(x) * value.scaleX), + y: Math.round(finite(y) * value.scaleY), + scale_x: value.scaleX, + scale_y: value.scaleY, + }; + }, + clientToDesktop: (x, y) => { + const value = metrics(); + return { + x: Math.round(finite(x) / value.scaleX), + y: Math.round(finite(y) / value.scaleY), + scale_x: value.scaleX, + scale_y: value.scaleY, + }; + }, + diagnostics: () => store.desktopBridgeDiagnostics(frame), + }); + frame.agentZeroDesktop = bridge; + frame.__agentZeroDesktopBridge = bridge; 
+ remoteWindow.agentZeroDesktop = bridge; + remoteWindow.__agentZeroDesktopBridge = bridge; + this._desktopBridgeReady = true; + this.updateDesktopKeyboardCaptureState(frame); + return bridge; + }, + + desktopBridgeDiagnostics(frame = null) { + return { + ready: this._desktopBridgeReady, + keyboard: this.updateDesktopKeyboardCaptureState(frame), + lastStateOk: this._desktopLastState?.ok ?? null, + }; + }, + + updateDesktopKeyboardCaptureState(frame = null) { + const target = this.desktopFrame(frame); + const client = target?.contentWindow?.client; + const state = { + ready: Boolean(target?.__agentZeroDesktopBridge || target?.contentWindow?.__agentZeroDesktopBridge), + active: Boolean(this._desktopKeyboardActive), + capture: Boolean(client?.capture_keyboard), + focused: Boolean(target && (document.activeElement === target || target.contentDocument?.hasFocus?.())), + }; + this._desktopKeyboardCaptureState = state; + return state; + }, + + normalizeXpraDesktopWindow(xpraWindow, width, height) { + if (!xpraWindow) return; + const normalizedWidth = Math.max(1, Math.round(Number(width || 0))); + const normalizedHeight = Math.max(1, Math.round(Number(height || 0))); + xpraWindow.x = 0; + xpraWindow.y = 0; + xpraWindow.w = normalizedWidth; + xpraWindow.h = normalizedHeight; + xpraWindow.resizable = false; + xpraWindow.decorations = false; + xpraWindow.decorated = false; + xpraWindow.metadata = { ...(xpraWindow.metadata || {}), decorations: false }; + xpraWindow._set_decorated?.(false); + xpraWindow.configure_border_class?.(); + xpraWindow.leftoffset = 0; + xpraWindow.rightoffset = 0; + xpraWindow.topoffset = 0; + xpraWindow.bottomoffset = 0; + }, + + fitXpraDesktopWindowElement(xpraWindow, width, height) { + const cssWidth = `${Math.max(1, Number(width || 0))}px`; + const cssHeight = `${Math.max(1, Number(height || 0))}px`; + const windowElement = xpraWindow?.div; + const canvas = xpraWindow?.canvas; + windowElement?.style?.setProperty("left", "0px", "important"); + 
windowElement?.style?.setProperty("top", "0px", "important"); + windowElement?.style?.setProperty("position", "absolute", "important"); + windowElement?.style?.setProperty("width", cssWidth, "important"); + windowElement?.style?.setProperty("height", cssHeight, "important"); + windowElement?.style?.setProperty("transform", "none", "important"); + windowElement?.style?.setProperty("margin", "0", "important"); + canvas?.style?.setProperty("width", cssWidth, "important"); + canvas?.style?.setProperty("height", cssHeight, "important"); + canvas?.style?.setProperty("display", "block", "important"); + canvas?.style?.setProperty("margin", "0", "important"); + }, + + installXpraDesktopWheelBridge(remoteWindow, xpraWindow) { + const canvas = xpraWindow?.canvas; + if (!remoteWindow || !canvas || canvas.__a0XpraWheelBridgeInstalled) return; + if (typeof xpraWindow.mouse_scroll_cb !== "function") return; + canvas.__a0XpraWheelBridgeInstalled = true; + canvas.addEventListener("wheel", (event) => { + event.stopImmediatePropagation?.(); + event.stopPropagation?.(); + event.preventDefault?.(); + const normalizedEvent = this.xpraDesktopWheelEvent(remoteWindow, canvas, event); + xpraWindow.mouse_scroll_cb(normalizedEvent, xpraWindow); + }, { passive: false, capture: true }); + }, + + xpraDesktopWheelEvent(remoteWindow, canvas, event) { + const finite = (value, fallback = 0) => { + const number = Number(value); + return Number.isFinite(number) ? number : fallback; + }; + const deltaMode = finite(event.deltaMode, 0); + const lineHeight = 16; + const pageHeight = Math.max(1, remoteWindow.innerHeight || canvas.clientHeight || 800); + const deltaScale = deltaMode === 1 ? lineHeight : deltaMode === 2 ? 
pageHeight : 1; + const deltaX = finite(event.deltaX) * deltaScale; + const deltaY = finite(event.deltaY) * deltaScale; + const deltaZ = finite(event.deltaZ) * deltaScale; + const wheelDeltaX = finite(event.wheelDeltaX, -deltaX); + const wheelDeltaY = finite(event.wheelDeltaY, -deltaY); + const wheelDelta = finite(event.wheelDelta, wheelDeltaY || wheelDeltaX); + const getModifierState = (key) => { + if (typeof event.getModifierState === "function") return event.getModifierState(key); + const normalizedKey = String(key || "").toLowerCase(); + if (normalizedKey === "alt") return Boolean(event.altKey); + if (normalizedKey === "control") return Boolean(event.ctrlKey); + if (normalizedKey === "meta") return Boolean(event.metaKey); + if (normalizedKey === "shift") return Boolean(event.shiftKey); + return false; + }; + const normalizedEvent = Object.create(event); + Object.defineProperties(normalizedEvent, { + target: { value: event.target || canvas }, + currentTarget: { value: canvas }, + clientX: { value: finite(event.clientX) }, + clientY: { value: finite(event.clientY) }, + pageX: { value: finite(event.pageX, finite(event.clientX)) }, + pageY: { value: finite(event.pageY, finite(event.clientY)) }, + screenX: { value: finite(event.screenX) }, + screenY: { value: finite(event.screenY) }, + offsetX: { value: finite(event.offsetX) }, + offsetY: { value: finite(event.offsetY) }, + movementX: { value: finite(event.movementX) }, + movementY: { value: finite(event.movementY) }, + button: { value: finite(event.button) }, + buttons: { value: finite(event.buttons) }, + which: { value: finite(event.which) }, + detail: { value: finite(event.detail) }, + deltaX: { value: deltaX }, + deltaY: { value: deltaY }, + deltaZ: { value: deltaZ }, + deltaMode: { value: 0 }, + wheelDeltaX: { value: wheelDeltaX }, + wheelDeltaY: { value: wheelDeltaY }, + wheelDelta: { value: wheelDelta }, + altKey: { value: Boolean(event.altKey) }, + ctrlKey: { value: Boolean(event.ctrlKey) }, + metaKey: { 
value: Boolean(event.metaKey) }, + shiftKey: { value: Boolean(event.shiftKey) }, + getModifierState: { value: getModifierState }, + preventDefault: { value: () => event.preventDefault?.() }, + stopPropagation: { value: () => event.stopPropagation?.() }, + stopImmediatePropagation: { value: () => event.stopImmediatePropagation?.() }, + }); + return normalizedEvent; + }, + + installXpraDesktopFrameCss(remoteDocument) { + if (!remoteDocument || remoteDocument.getElementById("a0-xpra-desktop-frame-css")) return; + const style = remoteDocument.createElement("style"); + style.id = "a0-xpra-desktop-frame-css"; + style.textContent = ` + html, body, #screen { + width: 100% !important; + height: 100% !important; + overflow: hidden !important; + } + #float_menu, + .windowhead, + .windowbuttons { + display: none !important; + } + #shadow_pointer { + display: none !important; + visibility: hidden !important; + opacity: 0 !important; + } + .window, + .window.border, + .window.desktop, + .undecorated, + .undecorated.border, + .undecorated.desktop { + left: 0 !important; + top: 0 !important; + position: absolute !important; + width: 100% !important; + height: 100% !important; + transform: none !important; + margin: 0 !important; + border: 0 !important; + border-radius: 0 !important; + box-shadow: none !important; + } + .window canvas, + .undecorated canvas { + display: block !important; + width: 100% !important; + height: 100% !important; + margin: 0 !important; + border: 0 !important; + border-radius: 0 !important; + box-shadow: none !important; + } + `; + remoteDocument.head?.appendChild(style); + }, + + installXpraDesktopCursorPatches(remoteWindow, remoteDocument, client) { + if (!remoteWindow || !remoteDocument || !client) return; + const hideShadowPointer = () => { + const pointer = remoteDocument.getElementById?.("shadow_pointer"); + pointer?.style?.setProperty("display", "none", "important"); + pointer?.style?.setProperty("visibility", "hidden", "important"); + 
pointer?.style?.setProperty("opacity", "0", "important"); + }; + hideShadowPointer(); + + const pointerPacket = remoteWindow.PACKET_TYPES?.pointer_position || "pointer-position"; + if (!client.__a0XpraDesktopCursorPatched) { + if (typeof client._process_pointer_position === "function") { + client.__a0OriginalProcessPointerPosition = client._process_pointer_position; + } + client._process_pointer_position = function patchedProcessPointerPosition(packet) { + hideShadowPointer(); + this.__a0LastPointerPosition = packet; + return false; + }; + client.__a0XpraDesktopCursorPatched = true; + } + if (client.packet_handlers && pointerPacket) { + client.packet_handlers[pointerPacket] = client._process_pointer_position; + } + }, + + installXpraDesktopFramePatches(remoteWindow, remoteDocument) { + if (!remoteWindow || !remoteDocument) return; + remoteWindow.__a0XpraDesktopFramePatches ||= {}; + const patches = remoteWindow.__a0XpraDesktopFramePatches; + const isBenignXpraWarning = (args = []) => { + const text = Array.from(args || []).map((value) => String(value || "")).join(" "); + return text.includes("window does not fit in canvas, offsets") + || (text.includes("decode error packet") && text.includes("not found")); + }; + if (!patches.consoleWarn && typeof remoteWindow.console?.warn === "function") { + const originalConsoleWarn = remoteWindow.console.warn.bind(remoteWindow.console); + remoteWindow.console.warn = function patchedConsoleWarn(...args) { + if (isBenignXpraWarning(args)) return undefined; + return originalConsoleWarn(...args); + }; + patches.consoleWarn = true; + } + if (!patches.noWindowList && typeof remoteWindow.noWindowList === "function") { + const originalNoWindowList = remoteWindow.noWindowList; + remoteWindow.noWindowList = function patchedNoWindowList(...args) { + if (!remoteDocument.querySelector("#open_windows")) return undefined; + return originalNoWindowList.apply(this, args); + }; + patches.noWindowList = true; + } + if (!patches.addWindowListItem 
&& typeof remoteWindow.addWindowListItem === "function") { + const originalAddWindowListItem = remoteWindow.addWindowListItem; + remoteWindow.addWindowListItem = function patchedAddWindowListItem(...args) { + if (!remoteDocument.querySelector("#open_windows_list")) return undefined; + return originalAddWindowListItem.apply(this, args); + }; + patches.addWindowListItem = true; + } + }, + + installXpraDesktopClientPatches(remoteWindow, client) { + if (!remoteWindow || !client) return; + if (!client.__a0XpraOffsetWarnPatched && typeof client.warn === "function") { + const originalClientWarn = client.warn.bind(client); + client.warn = function patchedClientWarn(...args) { + const text = Array.from(args || []).map((value) => String(value || "")).join(" "); + if ( + text.includes("window does not fit in canvas, offsets") + || (text.includes("decode error packet") && text.includes("not found")) + ) { + return undefined; + } + return originalClientWarn(...args); + }; + client.__a0XpraOffsetWarnPatched = true; + } + if (client.__a0XpraDesktopClientPatched) return; + if (typeof client._screen_resized === "function") { + const originalScreenResized = client._screen_resized.bind(client); + client.__a0OriginalScreenResized = originalScreenResized; + client._screen_resized = function patchedScreenResized(event) { + if (client.__a0AllowScreenResize === true) return originalScreenResized(event); + if (client.__a0ViewportResizing === true) return originalScreenResized(event); + return false; + }; + } + client.__a0XpraDesktopClientPatched = true; + }, + + installXpraDesktopClipboardBridge(frame, remoteWindow, remoteDocument, client) { + if (!frame || !remoteWindow || !remoteDocument || !client) return; + this.ensureDesktopClipboardBridge(); + if (remoteWindow.__a0XpraDesktopClipboardBridgeInstalled) return; + + const onPaste = (event) => { + this.handleDesktopPasteEvent(event, frame, remoteWindow, client); + }; + const onKeydown = (event) => { + if 
(this.isDesktopPasteShortcut(event)) { + void this.syncHostClipboardToDesktop(frame); + } + }; + remoteWindow.addEventListener("paste", onPaste, true); + remoteDocument.addEventListener("paste", onPaste, true); + remoteWindow.addEventListener("keydown", onKeydown, true); + remoteDocument.addEventListener("keydown", onKeydown, true); + remoteWindow.__a0XpraDesktopClipboardBridgeInstalled = true; + remoteWindow.__a0XpraDesktopClipboardBridgeCleanup = () => { + remoteWindow.removeEventListener("paste", onPaste, true); + remoteDocument.removeEventListener("paste", onPaste, true); + remoteWindow.removeEventListener("keydown", onKeydown, true); + remoteDocument.removeEventListener("keydown", onKeydown, true); + remoteWindow.__a0XpraDesktopClipboardBridgeInstalled = false; + }; + }, + + ensureDesktopClipboardBridge() { + if (this._desktopClipboardCleanup) return; + + const onPaste = (event) => { + if (!this._desktopKeyboardActive || !this.hasOfficialOffice()) return; + if (isEditableInputTarget(event.target)) return; + const frame = this.desktopFrame(); + const remoteWindow = frame?.contentWindow; + const client = remoteWindow?.client; + if (!frame || !remoteWindow || !client) return; + this.handleDesktopPasteEvent(event, frame, remoteWindow, client); + }; + + document.addEventListener("paste", onPaste, true); + this._desktopClipboardCleanup = () => { + document.removeEventListener("paste", onPaste, true); + this._desktopClipboardCleanup = null; + }; + }, + + stopDesktopClipboardBridge() { + this._desktopClipboardCleanup?.(); + }, + + handleDesktopPasteEvent(event, frame, remoteWindow, client) { + const text = this.desktopClipboardTextFromEvent(event); + if (!text) return false; + if (!this.syncXpraClipboardText(client, text, remoteWindow)) return false; + event.preventDefault?.(); + event.stopImmediatePropagation?.(); + event.stopPropagation?.(); + this.focusDesktopFrame(frame, { arm: true }); + return true; + }, + + desktopClipboardTextFromEvent(event) { + const data = 
(event?.originalEvent || event)?.clipboardData; + if (!data?.getData) return ""; + for (const type of ["text/plain", "text", "Text", "STRING", "UTF8_STRING"]) { + const value = data.getData(type); + if (value) return value; + } + return ""; + }, + + syncXpraClipboardText(client, text, remoteWindow = null) { + const value = String(text ?? ""); + if (!client || !value || typeof client.send_clipboard_token !== "function") return false; + const textPlain = remoteWindow?.TEXT_PLAIN || "text/plain"; + const utf8String = remoteWindow?.UTF8_STRING || "UTF8_STRING"; + const utilities = remoteWindow?.Utilities; + const payload = utilities?.StringToUint8 ? utilities.StringToUint8(value) : value; + client.clipboard_enabled = true; + client.clipboard_direction = "both"; + client.clipboard_buffer = value; + client.clipboard_pending = false; + client.send_clipboard_token(payload, [textPlain, utf8String, "TEXT", "STRING"]); + return true; + }, + + async syncHostClipboardToDesktop(frame = null) { + const target = this.desktopFrame(frame); + const remoteWindow = target?.contentWindow; + const client = remoteWindow?.client; + if (!client || !navigator.clipboard?.readText) return false; + try { + const text = await navigator.clipboard.readText(); + return this.syncXpraClipboardText(client, text, remoteWindow); + } catch { + return false; + } + }, + + isDesktopPasteShortcut(event) { + const key = String(event?.key || "").toLowerCase(); + return key === "v" && (event?.ctrlKey || event?.metaKey) && !event?.altKey; + }, + + installXpraDesktopKeyboardBridge(frame, remoteWindow, remoteDocument, client) { + if (!frame || !remoteWindow || !remoteDocument || !client) return; + this.ensureDesktopKeyboardBridge(); + frame.setAttribute("tabindex", "0"); + if (remoteWindow.__a0XpraDesktopKeyboardBridgeInstalled) return; + + const activate = () => { + if (this._desktopFocusInProgress) return; + this.focusDesktopFrame(frame, { arm: true }); + }; + const events = ["pointerdown", "mousedown", 
"touchstart", "focusin"]; + for (const eventName of events) { + remoteDocument.addEventListener(eventName, activate, true); + } + remoteWindow.addEventListener("focus", activate, true); + remoteWindow.__a0XpraDesktopKeyboardBridgeInstalled = true; + remoteWindow.__a0XpraDesktopKeyboardBridgeCleanup = () => { + for (const eventName of events) { + remoteDocument.removeEventListener(eventName, activate, true); + } + remoteWindow.removeEventListener("focus", activate, true); + remoteWindow.__a0XpraDesktopKeyboardBridgeInstalled = false; + }; + }, + + ensureDesktopKeyboardBridge() { + if (this._desktopKeyboardCleanup) return; + + const deactivateWhenOutsideDesktop = (event) => { + const target = event.target; + if (target?.closest?.(".office-desktop-wrap") || target?.matches?.("[data-office-desktop-frame]")) return; + this._desktopKeyboardActive = false; + }; + const forwardKeyboardEvent = (event, pressed) => { + if (!this._desktopKeyboardActive || !this.hasOfficialOffice()) return; + if (event.defaultPrevented || isEditableInputTarget(event.target)) return; + + const frame = this.desktopFrame(); + if (!frame || document.activeElement === frame) return; + const client = frame.contentWindow?.client; + const handler = pressed ? 
client?._keyb_onkeydown : client?._keyb_onkeyup; + if (!client?.capture_keyboard || typeof handler !== "function") return; + if (pressed && this.isDesktopPasteShortcut(event)) { + void this.syncHostClipboardToDesktop(frame); + } + + const allowDefault = handler.call(client, event); + if (!allowDefault) { + event.preventDefault(); + event.stopPropagation(); + } + }; + const onKeydown = (event) => forwardKeyboardEvent(event, true); + const onKeyup = (event) => forwardKeyboardEvent(event, false); + + document.addEventListener("pointerdown", deactivateWhenOutsideDesktop, true); + document.addEventListener("keydown", onKeydown, true); + document.addEventListener("keyup", onKeyup, true); + this._desktopKeyboardCleanup = () => { + document.removeEventListener("pointerdown", deactivateWhenOutsideDesktop, true); + document.removeEventListener("keydown", onKeydown, true); + document.removeEventListener("keyup", onKeyup, true); + this._desktopKeyboardActive = false; + this._desktopKeyboardCleanup = null; + }; + }, + + stopDesktopKeyboardBridge() { + this._desktopKeyboardCleanup?.(); + }, + + queueDesktopResize(options = {}) { + if (!this.hasOfficialOffice() || !this.isDesktopHostVisible()) return; + const token = this.session?.desktop?.token || ""; + const frame = this.desktopFrame(options.frame || null); + const target = frame?.parentElement || frame; + if (!token || !target) return; + const force = Boolean(options.force); + const serverResize = options.serverResize !== false; + const rect = target.getBoundingClientRect(); + const width = Math.round(rect.width); + const height = Math.round(rect.height); + if (width < 320 || height < 220) return; + const key = `${token}:${width}x${height}`; + const refreshFrameOnly = () => { + const client = frame?.contentWindow?.XPRA_CLIENT?.client; + if (client) client.__a0ViewportResizing = true; + try { + this.applyXpraDesktopFrameMode(frame, { requestServerResize: false, requestRefresh: false }); + } finally { + if (client) 
client.__a0ViewportResizing = false; + } + }; + if (!serverResize) { + refreshFrameOnly(); + return; + } + if (key === this._desktopResizeKey || key === this._desktopResizePendingKey) { + refreshFrameOnly(); + return; + } + refreshFrameOnly(); + if (!force && this.shouldDeferDesktopResize()) { + this._desktopResizePending = true; + return; + } + if (this._desktopResizeTimer) globalThis.clearTimeout(this._desktopResizeTimer); + this._desktopResizePendingKey = key; + this._desktopResizeTimer = globalThis.setTimeout(async () => { + this._desktopResizeTimer = null; + if (!this.hasOfficialOffice() || !this.isDesktopHostVisible()) { + if (this._desktopResizePendingKey === key) this._desktopResizePendingKey = ""; + return; + } + if (!force && this.shouldDeferDesktopResize()) { + if (this._desktopResizePendingKey === key) this._desktopResizePendingKey = ""; + this._desktopResizePending = true; + return; + } + try { + const params = new URLSearchParams({ token, width: String(width), height: String(height) }); + const response = await fetch(`/desktop/resize?${params.toString()}`, { credentials: "same-origin" }); + if (response.ok) { + const result = await response.json().catch(() => ({})); + this._desktopResizeKey = key; + const activeFrame = this.desktopFrame(frame); + const activeTarget = activeFrame?.parentElement || activeFrame; + const activeRect = activeTarget?.getBoundingClientRect?.(); + const activeWidth = Math.round(activeRect?.width || 0); + const activeHeight = Math.round(activeRect?.height || 0); + if (activeWidth >= 320 && activeHeight >= 220) { + const activeKey = `${token}:${activeWidth}x${activeHeight}`; + if (activeKey !== key) { + this.queueDesktopResize({ force: true, serverResize: true, frame: activeFrame }); + return; + } + } + if (result?.reload) this.reloadDesktopFrame(activeFrame || frame); + this.primeXpraDesktopFrame({ reset: true, frame: activeFrame || frame }); + } + } catch (error) { + console.warn("Desktop resize skipped", error); + } finally { 
+ if (this._desktopResizePendingKey === key) this._desktopResizePendingKey = ""; + } + }, DESKTOP_RESIZE_DELAY_MS); + }, + + reloadDesktopFrame(frame = null) { + const target = this.desktopFrame(frame); + if (!target) return; + const current = target.getAttribute("src") || target.src || this.officialOfficeUrl(); + if (!current) return; + try { + const url = new URL(current, window.location.href); + url.searchParams.set("a0_reload", String(Date.now())); + target.setAttribute("src", `${url.pathname}${url.search}`); + } catch { + target.setAttribute("src", current); + } + }, + + async handleDesktopUrlIntents(intents = []) { + const incoming = Array.isArray(intents) + ? intents.filter((intent) => intent && typeof intent === "object") + : []; + if (!incoming.length) return; + this._desktopUrlIntentQueue.push(...incoming); + if (this._desktopUrlIntentBusy) return; + + this._desktopUrlIntentBusy = true; + try { + while (this._desktopUrlIntentQueue.length) { + const intent = this._desktopUrlIntentQueue.shift(); + await this.openDesktopUrlIntent(intent); + } + } finally { + this._desktopUrlIntentBusy = false; + } + }, + + async openDesktopUrlIntent(intent = {}) { + const url = String(intent?.url || "").trim(); + const handled = await handleUrlIntent({ url, source: "desktop-url" }); + this.setMessage(handled ? 
"Opened link in Browser" : "Browser is not available"); + }, + + browserDestinationForDesktopUrl() { + if (this.isDesktopInModal()) return "canvas"; + return "modal"; + }, + + isDesktopInModal() { + const modalDesktop = Array.from(document.querySelectorAll(".office-panel")) + .some((panel) => panel.closest?.(".modal") && panel.querySelector?.("[data-office-desktop-frame]")); + if (modalDesktop) return true; + return this._mode === "modal"; + }, + + startDesktopMonitor() { + this.stopDesktopMonitor(); + if (!this.hasOfficialOffice() || !this.isDesktopHostVisible()) return; + const tabId = this.session?.tab_id || ""; + const sessionId = this.session?.desktop_session_id || this.session?.session_id || ""; + if (!tabId || !sessionId) return; + this._desktopHeartbeatSessionId = sessionId; + this._desktopHeartbeatTabId = tabId; + this._desktopHeartbeatMisses = 0; + + const tick = async () => { + if (!this.session || this.session.tab_id !== tabId || !this.hasOfficialOffice() || !this.isDesktopHostVisible()) return; + try { + const response = await callDesktop("sync", { + desktop_session_id: sessionId, + file_id: this.session.file_id || "", + }); + if (response?.intentional_shutdown || response?.shutdown) { + await this.handleIntentionalDesktopShutdown(response); + return; + } + if (response?.ok === false) throw new Error(response.error || "Desktop session closed."); + this._desktopHeartbeatMisses = 0; + await this.handleDesktopUrlIntents(response?.url_intents); + if (response?.document) { + const document = normalizeDocument(response.document); + this.replaceActiveSession({ + ...this.session, + document, + path: document.path || this.session.path, + file_id: document.file_id || this.session.file_id, + version: document.version || this.session.version, + }); + } + } catch { + if (!this.session || this.session.tab_id !== tabId) return; + this._desktopHeartbeatMisses += 1; + if (this._desktopHeartbeatMisses >= 2) { + await this.handleOfficialOfficeClosed(tabId); + } + } + }; 
+ + this._desktopHeartbeatTimer = globalThis.setInterval(tick, DESKTOP_HEARTBEAT_MS); + globalThis.setTimeout(tick, Math.min(1200, DESKTOP_HEARTBEAT_MS)); + }, + + stopDesktopMonitor() { + if (this._desktopHeartbeatTimer) { + globalThis.clearInterval(this._desktopHeartbeatTimer); + } + this._desktopHeartbeatTimer = null; + this._desktopHeartbeatSessionId = ""; + this._desktopHeartbeatTabId = ""; + this._desktopHeartbeatMisses = 0; + }, + + async handleOfficialOfficeClosed(tabId) { + if (this._desktopIntentionalShutdown) return; + const tab = this.tabs.find((item) => item.tab_id === tabId); + const hiddenDesktopDocument = !tab && this.session?.tab_id === tabId && this.isDesktopOfficeDocument(this.session) + ? this.session + : null; + const target = tab || hiddenDesktopDocument; + if (!target || target._desktopClosed) return; + target._desktopClosed = true; + this.stopDesktopMonitor(); + this.stopDesktopResizeObserver(); + this.stopXpraDesktopPrime(); + this.message = "Desktop is restarting"; + await this.ensureDesktopSession({ + force: true, + select: this.activeTabId === tabId || Boolean(hiddenDesktopDocument), + message: "Desktop is restarting", + }); + target._desktopClosed = false; + await this.refresh(); + }, + + defaultTitle(kind, fmt) { + const date = new Date().toISOString().slice(0, 10); + if (fmt === "odt") return `Writer ${date}`; + if (fmt === "docx") return `DOCX ${date}`; + if (kind === "spreadsheet") return `Spreadsheet ${date}`; + if (kind === "presentation") return `Presentation ${date}`; + return `Document ${date}`; + }, + + tabTitle(tab = {}) { + tab = tab || {}; + return tab.title || tab.document?.basename || basename(tab.path); + }, + + tabLabel(tab = {}) { + tab = tab || {}; + const title = this.tabTitle(tab); + return tab.dirty ? 
`${title} unsaved` : title; + }, + + tabIcon(tab = {}) { + tab = tab || {}; + const ext = String(tab.extension || tab.document?.extension || "").toLowerCase(); + if (this.isDesktopSession(tab)) return "desktop_windows"; + if (ext === "md") return "article"; + if (ext === "odt" || ext === "docx") return "description"; + if (ext === "ods" || ext === "xlsx") return "table_chart"; + if (ext === "odp" || ext === "pptx") return "co_present"; + return "draft"; + }, + + async runNewMenuAction(action = "") { + const normalized = String(action || "").trim().toLowerCase(); + if (normalized === "open") return await this.openFileBrowser(); + if (normalized === "writer") return await this.create("document", "odt"); + if (normalized === "spreadsheet") return await this.create("spreadsheet", "ods"); + if (normalized === "presentation") return await this.create("presentation", "odp"); + return null; + }, + + installHeaderNewMenu(header = null) { + if (!header || header.querySelector(".office-header-actions")) return () => {}; + + const root = globalThis.document.createElement("div"); + root.className = "office-header-actions"; + root.innerHTML = ` + + + `; + + const button = root.querySelector(".office-header-new-button"); + const menu = root.querySelector(".office-new-menu"); + const setOpen = (open) => { + root.classList.toggle("is-open", open); + button?.setAttribute("aria-expanded", open.toString()); + if (menu) menu.hidden = !open; + }; + const onButtonClick = (event) => { + event.preventDefault(); + event.stopPropagation(); + setOpen(!root.classList.contains("is-open")); + }; + const onDocumentClick = (event) => { + if (!root.contains(event.target)) setOpen(false); + }; + const onDocumentKeydown = (event) => { + if (event.key === "Escape") setOpen(false); + }; + + button?.addEventListener("click", onButtonClick); + for (const item of root.querySelectorAll("[data-office-new-action]")) { + item.addEventListener("click", async (event) => { + event.preventDefault(); + 
event.stopPropagation(); + const action = event.currentTarget?.dataset?.officeNewAction || ""; + setOpen(false); + await this.runNewMenuAction(action); + }); + } + globalThis.document.addEventListener("click", onDocumentClick); + globalThis.document.addEventListener("keydown", onDocumentKeydown); + + const firstHeaderAction = header.querySelector( + ".modal-surface-switcher, .modal-dock-button, .office-modal-focus-button, .modal-close", + ); + if (firstHeaderAction) { + firstHeaderAction.insertAdjacentElement("beforebegin", root); + } else { + header.appendChild(root); + } + + setOpen(false); + return () => { + button?.removeEventListener("click", onButtonClick); + globalThis.document.removeEventListener("click", onDocumentClick); + globalThis.document.removeEventListener("keydown", onDocumentKeydown); + root.remove(); + }; + }, + + setupFloatingModal(element = null) { + const root = element || globalThis.document?.querySelector(".office-panel"); + const modal = root?.closest?.(".modal"); + const inner = root?.closest?.(".modal-inner"); + const body = root?.closest?.(".modal-bd"); + const header = inner?.querySelector?.(".modal-header"); + if (!inner || !body || !header || inner.dataset.officeModalReady === "1") return; + + inner.dataset.officeModalReady = "1"; + modal?.classList?.add("surface-floating", "modal-floating", "modal-no-backdrop"); + inner.classList.add("surface-modal", "office-modal", "modal-no-backdrop"); + body.classList.add("office-modal-body"); + header.style.cursor = "move"; + + const inset = 8; + const minWidth = 720; + const minHeight = 520; + const clamp = (value, min, max) => Math.max(min, Math.min(max, value)); + const cleanup = []; + let beforeFocusBounds = null; + let dragging = false; + let resizing = false; + let pointerId = 0; + let startX = 0; + let startY = 0; + let startLeft = 0; + let startTop = 0; + let startWidth = 0; + let startHeight = 0; + let resizeMode = ""; + + const newMenuCleanup = this.installHeaderNewMenu(header); + + 
const currentBounds = () => { + const rect = inner.getBoundingClientRect(); + return { + left: rect.left, + top: rect.top, + width: rect.width, + height: rect.height, + }; + }; + + const normalizedBounds = (bounds) => { + const maxWidth = Math.max(320, globalThis.innerWidth - inset * 2); + const maxHeight = Math.max(320, globalThis.innerHeight - inset * 2); + const safeMinWidth = Math.min(minWidth, maxWidth); + const safeMinHeight = Math.min(minHeight, maxHeight); + const width = clamp(bounds.width, safeMinWidth, maxWidth); + const height = clamp(bounds.height, safeMinHeight, maxHeight); + return { + width, + height, + left: clamp(bounds.left, inset, Math.max(inset, globalThis.innerWidth - width - inset)), + top: clamp(bounds.top, inset, Math.max(inset, globalThis.innerHeight - height - inset)), + }; + }; + + const setBounds = (bounds) => { + const next = normalizedBounds(bounds); + inner.style.position = "fixed"; + inner.style.transform = "none"; + inner.style.left = `${Math.round(next.left)}px`; + inner.style.top = `${Math.round(next.top)}px`; + inner.style.width = `${Math.round(next.width)}px`; + inner.style.height = `${Math.round(next.height)}px`; + inner.style.right = "auto"; + inner.style.bottom = "auto"; + inner.style.margin = "0"; + }; + + const ensurePosition = () => { + setBounds(currentBounds()); + }; + + const shield = globalThis.document.createElement("div"); + shield.className = "office-modal-input-shield"; + inner.appendChild(shield); + cleanup.push(() => shield.remove()); + + const setShield = (visible, cursor = "") => { + shield.style.display = visible ? "block" : "none"; + shield.style.cursor = cursor; + }; + + const focusButton = globalThis.document.createElement("button"); + focusButton.type = "button"; + focusButton.className = "modal-dock-button office-modal-focus-button"; + focusButton.innerHTML = ''; + const updateFocusButton = (active) => { + const label = active ? 
"Restore size" : "Focus mode"; + focusButton.setAttribute("aria-label", label); + focusButton.querySelector(".material-symbols-outlined").textContent = active ? "fullscreen_exit" : "fullscreen"; + }; + updateFocusButton(false); + const closeButton = inner.querySelector(".modal-close"); + if (closeButton) { + closeButton.insertAdjacentElement("beforebegin", focusButton); + } else { + header.appendChild(focusButton); + } + cleanup.push(() => focusButton.remove()); + + const setFocusMode = (enabled) => { + ensurePosition(); + if (enabled) { + beforeFocusBounds = currentBounds(); + inner.classList.add("is-focus-mode"); + setBounds({ + left: inset, + top: inset, + width: globalThis.innerWidth - inset * 2, + height: globalThis.innerHeight - inset * 2, + }); + updateFocusButton(true); + return; + } + inner.classList.remove("is-focus-mode"); + setBounds(beforeFocusBounds || currentBounds()); + beforeFocusBounds = null; + updateFocusButton(false); + }; + + const onFocusClick = () => setFocusMode(!inner.classList.contains("is-focus-mode")); + focusButton.addEventListener("click", onFocusClick); + cleanup.push(() => focusButton.removeEventListener("click", onFocusClick)); + + const onPointerDown = (event) => { + if (event.button !== 0) return; + if (event.target?.closest?.("button,a,input,textarea,select")) return; + if (inner.classList.contains("is-focus-mode")) return; + ensurePosition(); + const rect = inner.getBoundingClientRect(); + dragging = true; + pointerId = event.pointerId; + startX = event.clientX; + startY = event.clientY; + startLeft = rect.left; + startTop = rect.top; + startWidth = rect.width; + startHeight = rect.height; + inner.classList.add("is-dragging"); + setShield(true, "move"); + header.setPointerCapture?.(pointerId); + event.preventDefault(); + }; + + const onPointerMove = (event) => { + if (!dragging || event.pointerId !== pointerId) return; + setBounds({ + left: startLeft + event.clientX - startX, + top: startTop + event.clientY - startY, + width: 
startWidth, + height: startHeight, + }); + }; + + const onPointerUp = (event) => { + if (!dragging || event.pointerId !== pointerId) return; + dragging = false; + inner.classList.remove("is-dragging"); + setShield(false); + header.releasePointerCapture?.(pointerId); + }; + + const createResizeHandle = (mode) => { + const handle = globalThis.document.createElement("div"); + handle.className = `office-modal-resizer is-${mode}`; + handle.dataset.officeResize = mode; + inner.appendChild(handle); + cleanup.push(() => handle.remove()); + return handle; + }; + + const onResizeDown = (event) => { + if (event.button !== 0 || inner.classList.contains("is-focus-mode")) return; + ensurePosition(); + const rect = inner.getBoundingClientRect(); + resizing = true; + resizeMode = event.currentTarget.dataset.officeResize || ""; + pointerId = event.pointerId; + startX = event.clientX; + startY = event.clientY; + startLeft = rect.left; + startTop = rect.top; + startWidth = rect.width; + startHeight = rect.height; + inner.classList.add("is-resizing"); + this.suspendDesktopResize(); + setShield(true, resizeMode === "right" ? "ew-resize" : resizeMode === "bottom" ? "ns-resize" : "nwse-resize"); + event.currentTarget.setPointerCapture?.(pointerId); + event.preventDefault(); + event.stopPropagation(); + }; + + const onResizeMove = (event) => { + if (!resizing || event.pointerId !== pointerId) return; + const dx = event.clientX - startX; + const dy = event.clientY - startY; + setBounds({ + left: startLeft, + top: startTop, + width: resizeMode === "bottom" ? startWidth : startWidth + dx, + height: resizeMode === "right" ? 
startHeight : startHeight + dy, + }); + }; + + const onResizeUp = (event) => { + if (!resizing || event.pointerId !== pointerId) return; + resizing = false; + resizeMode = ""; + inner.classList.remove("is-resizing"); + setShield(false); + event.currentTarget.releasePointerCapture?.(pointerId); + this.resumeDesktopResize(); + }; + + header.addEventListener("pointerdown", onPointerDown); + header.addEventListener("pointermove", onPointerMove); + header.addEventListener("pointerup", onPointerUp); + header.addEventListener("pointercancel", onPointerUp); + cleanup.push(() => header.removeEventListener("pointerdown", onPointerDown)); + cleanup.push(() => header.removeEventListener("pointermove", onPointerMove)); + cleanup.push(() => header.removeEventListener("pointerup", onPointerUp)); + cleanup.push(() => header.removeEventListener("pointercancel", onPointerUp)); + + for (const mode of ["right", "bottom", "corner"]) { + const handle = createResizeHandle(mode); + handle.addEventListener("pointerdown", onResizeDown); + handle.addEventListener("pointermove", onResizeMove); + handle.addEventListener("pointerup", onResizeUp); + handle.addEventListener("pointercancel", onResizeUp); + cleanup.push(() => handle.removeEventListener("pointerdown", onResizeDown)); + cleanup.push(() => handle.removeEventListener("pointermove", onResizeMove)); + cleanup.push(() => handle.removeEventListener("pointerup", onResizeUp)); + cleanup.push(() => handle.removeEventListener("pointercancel", onResizeUp)); + } + + const onWindowResize = () => { + if (inner.classList.contains("is-focus-mode")) { + setBounds({ + left: inset, + top: inset, + width: globalThis.innerWidth - inset * 2, + height: globalThis.innerHeight - inset * 2, + }); + return; + } + ensurePosition(); + }; + globalThis.addEventListener("resize", onWindowResize); + cleanup.push(() => globalThis.removeEventListener("resize", onWindowResize)); + + if (globalThis.requestAnimationFrame) { + 
globalThis.requestAnimationFrame(ensurePosition); + } else { + globalThis.setTimeout(ensurePosition, 0); + } + this._floatingCleanup = () => { + newMenuCleanup?.(); + cleanup.splice(0).reverse().forEach((entry) => entry()); + modal?.classList?.remove("surface-floating", "modal-floating", "modal-no-backdrop"); + inner.classList.remove("is-dragging", "is-resizing", "is-focus-mode"); + this._desktopResizeSuspended = false; + this._desktopResizePending = false; + delete inner.dataset.officeModalReady; + }; + }, +}; + +export const store = createStore("desktop", model); From 538dab4dc36b7b54a82469f312328f61fcc32155 Mon Sep 17 00:00:00 2001 From: gdeyoung Date: Tue, 19 May 2026 05:59:48 +0000 Subject: [PATCH 2/3] Fix: Corrected Xpra desktop resize implementation The previous fix was incomplete. This update provides the complete solution: **Changes:** 1. Properly get client from frame.contentWindow.client (not XPRA_CLIENT.client) 2. Add early return if client not found 3. Set __a0ViewportResizing flag before dimensions sync 4. Sync desktop_width/desktop_height from actual container dimensions 5. Directly call client._screen_resized(new Event('resize')) for proper resize propagation 6. Use try/finally to ensure flag cleanup **Fixes:** The __a0ViewportResizing flag checked in _screen_resized guard is now both set AND used to drive the actual resize, allowing viewport resizes to propagate through the Xpra flow while blocking accidental resizes. 
Fixes #1649 --- plugins/_desktop/webui/desktop-store.js | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/plugins/_desktop/webui/desktop-store.js b/plugins/_desktop/webui/desktop-store.js index edfba7d305..c99acfce75 100644 --- a/plugins/_desktop/webui/desktop-store.js +++ b/plugins/_desktop/webui/desktop-store.js @@ -2107,12 +2107,27 @@ const model = { if (width < 320 || height < 220) return; const key = `${token}:${width}x${height}`; const refreshFrameOnly = () => { - const client = frame?.contentWindow?.XPRA_CLIENT?.client; - if (client) client.__a0ViewportResizing = true; + const client = frame?.contentWindow?.client; + if (!client) { + this.applyXpraDesktopFrameMode(frame, { requestServerResize: false, requestRefresh: false }); + return; + } + client.__a0ViewportResizing = true; try { + const remoteWindow = frame?.contentWindow; + const container = client.container || remoteWindow?.document?.querySelector?.("#screen"); + const width = Math.floor(container?.clientWidth || remoteWindow?.innerWidth || frame?.contentWindow?.innerWidth || 1024); + const height = Math.floor(container?.clientHeight || remoteWindow?.innerHeight || frame?.contentWindow?.innerHeight || 768); + if (width > 0 && height > 0) { + client.desktop_width = width; + client.desktop_height = height; + } this.applyXpraDesktopFrameMode(frame, { requestServerResize: false, requestRefresh: false }); + if (typeof client._screen_resized === "function") { + client._screen_resized(new Event("resize")); + } } finally { - if (client) client.__a0ViewportResizing = false; + client.__a0ViewportResizing = false; } }; if (!serverResize) { From 5a883b48b7b4ea52125aa8188ff3fcda651b3508 Mon Sep 17 00:00:00 2001 From: gdeyoung Date: Thu, 21 May 2026 03:39:50 +0000 Subject: [PATCH 3/3] feat: KG Pipeline + KG Dreamer plugins _kg_pipeline (88 tests, 1989 lines): - Crash recovery checkpoints (atomic state saves) - Append-only audit trail (JSONL provenance) - LLM-augmented token 
compression (29%+ reduction) - 5-dimension entity health scoring + tiers - String+LLM entity resolution (504 duplicates merged) _kg_dreamer (48 tests, 3605 lines): - 6 dream operations: connect, strengthen, prune, contradict, pattern, insight - Autonomous background intelligence (runs every 6h) - LLM-powered insight generation via Qwen3.6-35B - Separate plugin for zero-risk rollback Documentation: - KG System Architecture (386 lines) - Entity resolution spike report - Octopoda+OpenHuman enhancement plan v3 Total: 136 tests, 5594 lines, $0 cloud cost --- docs/Knowledge Graph System Architecture.md | 387 ++++++++++ docs/entity_resolution_spike_report.md | 92 +++ docs/octopoda_openhuman_kg_plan_v3.md | 324 +++++++++ .../_kg_dreamer/.pytest_cache/.gitignore | 2 + .../_kg_dreamer/.pytest_cache/CACHEDIR.TAG | 4 + .../_kg_dreamer/.pytest_cache/README.md | 8 + .../.pytest_cache/v/cache/lastfailed | 1 + .../_kg_dreamer/.pytest_cache/v/cache/nodeids | 50 ++ usr/plugins/_kg_dreamer/README.md | 74 ++ usr/plugins/_kg_dreamer/default_config.yaml | 50 ++ usr/plugins/_kg_dreamer/helpers/__init__.py | 82 +++ .../logs/insights_20260521_023649.json | 77 ++ .../logs/insights_20260521_023926.json | 77 ++ .../logs/insights_20260521_030144.json | 77 ++ .../_kg_dreamer/operations/__init__.py | 26 + .../_kg_dreamer/operations/connector.py | 230 ++++++ .../_kg_dreamer/operations/contradiction.py | 508 ++++++++++++++ .../_kg_dreamer/operations/insights.py | 392 +++++++++++ .../_kg_dreamer/operations/patterns.py | 224 ++++++ usr/plugins/_kg_dreamer/operations/pruner.py | 361 ++++++++++ .../_kg_dreamer/operations/strengthener.py | 421 +++++++++++ usr/plugins/_kg_dreamer/orchestrator.py | 361 ++++++++++ usr/plugins/_kg_dreamer/plugin.yaml | 13 + .../tests/.pytest_cache/.gitignore | 2 + .../tests/.pytest_cache/CACHEDIR.TAG | 4 + .../_kg_dreamer/tests/.pytest_cache/README.md | 8 + .../tests/.pytest_cache/v/cache/nodeids | 32 + usr/plugins/_kg_dreamer/tests/__init__.py | 1 + 
.../_kg_dreamer/tests/test_connector.py | 248 +++++++ .../_kg_dreamer/tests/test_orchestrator.py | 316 +++++++++ usr/plugins/_kg_dreamer/tests/test_pruner.py | 364 ++++++++++ usr/plugins/_kg_dreamer/tools/kg_dreamer.py | 211 ++++++ .../_kg_pipeline/.pytest_cache/.gitignore | 2 + .../.pytest_cache/v/cache/lastfailed | 1 + .../.pytest_cache/v/cache/nodeids | 90 +++ usr/plugins/_kg_pipeline/README.md | 167 +++++ usr/plugins/_kg_pipeline/default_config.yaml | 44 ++ usr/plugins/_kg_pipeline/pipeline/__init__.py | 27 + .../_kg_pipeline/pipeline/audit_chain.py | 193 +++++ usr/plugins/_kg_pipeline/pipeline/auditor.py | 242 +++++++ .../_kg_pipeline/pipeline/checkpoint.py | 167 +++++ .../_kg_pipeline/pipeline/elastic_ingester.py | 150 ++++ usr/plugins/_kg_pipeline/pipeline/enricher.py | 256 +++++++ .../_kg_pipeline/pipeline/entity_resolver.py | 660 ++++++++++++++++++ .../_kg_pipeline/pipeline/extractor.py | 172 +++++ usr/plugins/_kg_pipeline/pipeline/gdrive.py | 51 ++ .../_kg_pipeline/pipeline/health_scorer.py | 417 +++++++++++ usr/plugins/_kg_pipeline/pipeline/ingester.py | 276 ++++++++ .../_kg_pipeline/pipeline/kg_client.py | 119 ++++ .../pipeline/knowledge_archiver.py | 241 +++++++ .../pipeline/knowledge_ingester.py | 222 ++++++ .../_kg_pipeline/pipeline/orphan_connector.py | 307 ++++++++ .../_kg_pipeline/pipeline/parallel_worker.py | 256 +++++++ .../_kg_pipeline/pipeline/phase2_ingest.py | 144 ++++ .../_kg_pipeline/pipeline/token_compressor.py | 552 +++++++++++++++ usr/plugins/_kg_pipeline/plugin.yaml | 12 + .../prompts/agent.system.tool.kg_pipeline.md | 58 ++ usr/plugins/_kg_pipeline/tests/__init__.py | 0 .../_kg_pipeline/tests/test_audit_chain.py | 190 +++++ .../_kg_pipeline/tests/test_checkpoint.py | 150 ++++ .../tests/test_entity_resolver.py | 462 ++++++++++++ .../_kg_pipeline/tests/test_health_scorer.py | 281 ++++++++ .../tests/test_token_compressor.py | 526 ++++++++++++++ usr/plugins/_kg_pipeline/tools/__init__.py | 5 + usr/plugins/_kg_pipeline/tools/kg_pipeline.py 
| 416 +++++++++++ 65 files changed, 11883 insertions(+) create mode 100644 docs/Knowledge Graph System Architecture.md create mode 100644 docs/entity_resolution_spike_report.md create mode 100644 docs/octopoda_openhuman_kg_plan_v3.md create mode 100644 usr/plugins/_kg_dreamer/.pytest_cache/.gitignore create mode 100644 usr/plugins/_kg_dreamer/.pytest_cache/CACHEDIR.TAG create mode 100644 usr/plugins/_kg_dreamer/.pytest_cache/README.md create mode 100644 usr/plugins/_kg_dreamer/.pytest_cache/v/cache/lastfailed create mode 100644 usr/plugins/_kg_dreamer/.pytest_cache/v/cache/nodeids create mode 100644 usr/plugins/_kg_dreamer/README.md create mode 100644 usr/plugins/_kg_dreamer/default_config.yaml create mode 100644 usr/plugins/_kg_dreamer/helpers/__init__.py create mode 100644 usr/plugins/_kg_dreamer/logs/insights_20260521_023649.json create mode 100644 usr/plugins/_kg_dreamer/logs/insights_20260521_023926.json create mode 100644 usr/plugins/_kg_dreamer/logs/insights_20260521_030144.json create mode 100644 usr/plugins/_kg_dreamer/operations/__init__.py create mode 100644 usr/plugins/_kg_dreamer/operations/connector.py create mode 100644 usr/plugins/_kg_dreamer/operations/contradiction.py create mode 100644 usr/plugins/_kg_dreamer/operations/insights.py create mode 100644 usr/plugins/_kg_dreamer/operations/patterns.py create mode 100644 usr/plugins/_kg_dreamer/operations/pruner.py create mode 100644 usr/plugins/_kg_dreamer/operations/strengthener.py create mode 100644 usr/plugins/_kg_dreamer/orchestrator.py create mode 100644 usr/plugins/_kg_dreamer/plugin.yaml create mode 100644 usr/plugins/_kg_dreamer/tests/.pytest_cache/.gitignore create mode 100644 usr/plugins/_kg_dreamer/tests/.pytest_cache/CACHEDIR.TAG create mode 100644 usr/plugins/_kg_dreamer/tests/.pytest_cache/README.md create mode 100644 usr/plugins/_kg_dreamer/tests/.pytest_cache/v/cache/nodeids create mode 100644 usr/plugins/_kg_dreamer/tests/__init__.py create mode 100644 
usr/plugins/_kg_dreamer/tests/test_connector.py create mode 100644 usr/plugins/_kg_dreamer/tests/test_orchestrator.py create mode 100644 usr/plugins/_kg_dreamer/tests/test_pruner.py create mode 100644 usr/plugins/_kg_dreamer/tools/kg_dreamer.py create mode 100644 usr/plugins/_kg_pipeline/.pytest_cache/.gitignore create mode 100644 usr/plugins/_kg_pipeline/.pytest_cache/v/cache/lastfailed create mode 100644 usr/plugins/_kg_pipeline/.pytest_cache/v/cache/nodeids create mode 100644 usr/plugins/_kg_pipeline/README.md create mode 100644 usr/plugins/_kg_pipeline/default_config.yaml create mode 100644 usr/plugins/_kg_pipeline/pipeline/__init__.py create mode 100644 usr/plugins/_kg_pipeline/pipeline/audit_chain.py create mode 100644 usr/plugins/_kg_pipeline/pipeline/auditor.py create mode 100644 usr/plugins/_kg_pipeline/pipeline/checkpoint.py create mode 100644 usr/plugins/_kg_pipeline/pipeline/elastic_ingester.py create mode 100644 usr/plugins/_kg_pipeline/pipeline/enricher.py create mode 100644 usr/plugins/_kg_pipeline/pipeline/entity_resolver.py create mode 100644 usr/plugins/_kg_pipeline/pipeline/extractor.py create mode 100644 usr/plugins/_kg_pipeline/pipeline/gdrive.py create mode 100644 usr/plugins/_kg_pipeline/pipeline/health_scorer.py create mode 100644 usr/plugins/_kg_pipeline/pipeline/ingester.py create mode 100644 usr/plugins/_kg_pipeline/pipeline/kg_client.py create mode 100755 usr/plugins/_kg_pipeline/pipeline/knowledge_archiver.py create mode 100755 usr/plugins/_kg_pipeline/pipeline/knowledge_ingester.py create mode 100644 usr/plugins/_kg_pipeline/pipeline/orphan_connector.py create mode 100644 usr/plugins/_kg_pipeline/pipeline/parallel_worker.py create mode 100644 usr/plugins/_kg_pipeline/pipeline/phase2_ingest.py create mode 100644 usr/plugins/_kg_pipeline/pipeline/token_compressor.py create mode 100644 usr/plugins/_kg_pipeline/plugin.yaml create mode 100644 usr/plugins/_kg_pipeline/prompts/agent.system.tool.kg_pipeline.md create mode 100644 
usr/plugins/_kg_pipeline/tests/__init__.py create mode 100644 usr/plugins/_kg_pipeline/tests/test_audit_chain.py create mode 100644 usr/plugins/_kg_pipeline/tests/test_checkpoint.py create mode 100644 usr/plugins/_kg_pipeline/tests/test_entity_resolver.py create mode 100644 usr/plugins/_kg_pipeline/tests/test_health_scorer.py create mode 100644 usr/plugins/_kg_pipeline/tests/test_token_compressor.py create mode 100644 usr/plugins/_kg_pipeline/tools/__init__.py create mode 100644 usr/plugins/_kg_pipeline/tools/kg_pipeline.py diff --git a/docs/Knowledge Graph System Architecture.md b/docs/Knowledge Graph System Architecture.md new file mode 100644 index 0000000000..db21931636 --- /dev/null +++ b/docs/Knowledge Graph System Architecture.md @@ -0,0 +1,387 @@ +# Knowledge Graph System Architecture + +## System Overview + +Agent Zero's Knowledge Graph is a persistent, structured knowledge system that stores entities, relationships, and documents in a graph database. It serves as the long-term memory and structured recall layer for all agent operations. 
+ +**Current Scale:** +- 36,768 entities +- 123,952 relationships +- 11,025 documents +- 50,121 vector embeddings +- 7 entity types across 5 domains + +--- + +## Architecture + +``` +┌─────────────────────────────────────────────────────────────────────┐ +│ Agent Zero Framework │ +│ │ +│ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ │ +│ │ kg_tools │ │ _kg_pipeline │ │ FAISS Index │ │ +│ │ (query-side) │ │ (batch-side) │ │ (short-term) │ │ +│ │ │ │ │ │ │ │ +│ │ kg_search │ │ ingest │ │ Memory recall │ │ +│ │ kg_insights │ │ bulk_ingest │ │ Pattern match │ │ +│ │ kg_query │ │ elastic_ingest │ │ Session context │ │ +│ │ kg_hubs │ │ parallel_ingest │ │ │ │ +│ │ kg_communities │ │ health │ └─────────────────┘ │ +│ │ kg_surprises │ │ resolve_entities│ │ +│ │ kg_bridges │ │ audit │ │ +│ └────────┬────────┘ └────────┬────────┘ │ +│ │ │ │ +│ ▼ ▼ │ +│ ┌──────────────────────────────────────────────────────────────┐ │ +│ │ KG Service (AICube:8010) │ │ +│ │ │ │ +│ │ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ │ │ +│ │ │ KuzuDB │ │ LanceDB │ │ Entity │ │ Analysis │ │ │ +│ │ │ (Graph) │ │ (Vectors)│ │ Extract │ │ Engine │ │ │ +│ │ │ │ │ │ │ (LLM) │ │ │ │ │ +│ │ │ 37K ents │ │ 50K vecs │ │ │ │ Orphans │ │ │ +│ │ │ 124K rels│ │ │ │ │ │ Hubs │ │ │ +│ │ └──────────┘ └──────────┘ └──────────┘ │ Communi. │ │ │ +│ │ │ Bridges │ │ │ +│ │ │ Surprises│ │ │ +│ │ └──────────┘ │ │ +│ └──────────────────────────────────────────────────────────────┘ │ +│ │ +│ ┌──────────────────────────────────────────────────────────────┐ │ +│ │ Supporting Services │ │ +│ │ │ │ +│ │ Qwen3.6-35B (Mediaserver) │ nomic-embed (AI Tower GPU2) │ │ +│ │ Entity verification │ Vector embeddings │ │ +│ │ Content summarization │ Semantic search │ │ +│ │ Token compression │ Similarity scoring │ │ +│ └──────────────────────────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────────┘ +``` + +--- + +## Data Flow + +### 1. 
Ingestion Flow (Writing to KG) + +``` +Source Content (MD files, Elastic KB, Knowledge dirs) + │ + ▼ +┌─────────────────────────────────────────┐ +│ Token Compressor │ +│ ┌───────────┐ ┌────────────────────┐ │ +│ │ Regex │ │ LLM (Mediaserver) │ │ +│ │ strip │→ │ summarize if >30K │ │ +│ │ boilerplate│ │ smart truncate │ │ +│ └───────────┘ └────────────────────┘ │ +│ ↓ │ +│ ┌───────────────────────────────┐ │ +│ │ Content Hash Cache (7-day) │ │ +│ │ Skip re-compressing unchanged │ │ +│ └───────────────────────────────┘ │ +└─────────────────────────────────────────┘ + │ + ▼ (~29-70% token reduction) +┌─────────────────────────────────────────┐ +│ KG Service /api/v1/add │ +│ ┌───────────────────────────────────┐ │ +│ │ LLM Entity Extraction │ │ +│ │ Content → entities + rels │ │ +│ │ (runs on AICube) │ │ +│ └───────────────────────────────────┘ │ +│ ↓ │ +│ ┌───────────────────────────────────┐ │ +│ │ KuzuDB Write (graph store) │ │ +│ │ + LanceDB Write (vector index) │ │ +│ └───────────────────────────────────┘ │ +└─────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────┐ +│ Audit Chain │ +│ Append-only JSONL: action, source, count │ +│ One file per day, 90-day retention │ +│ /a0/usr/workdir/logs/kg_audit/ │ +└─────────────────────────────────────────┘ +``` + +### 2. 
Query Flow (Reading from KG) + +``` +User asks question / Agent needs context + │ + ▼ +┌─────────────────────────────────────────┐ +│ kg_tools plugin (per chat turn) │ +│ ┌───────────────┐ ┌───────────────────┐│ +│ │ kg_search │ │ kg_insights ││ +│ │ (semantic) │ │ (cross-domain) ││ +│ └───────────────┘ └───────────────────┘│ +│ ┌───────────────┐ ┌───────────────────┐│ +│ │ kg_hubs │ │ kg_communities ││ +│ │ (key entities)│ │ (clusters) ││ +│ └───────────────┘ └───────────────────┘│ +│ ┌───────────────┐ ┌───────────────────┐│ +│ │ kg_surprises │ │ kg_bridges ││ +│ │ (unexpected) │ │ (connectors) ││ +│ └───────────────┘ └───────────────────┘│ +└─────────────────────────────────────────┘ + │ + ▼ Results injected into agent context + │ +┌─────────────────────────────────────────┐ +│ FAISS Memory (session layer) │ +│ Short-term patterns, user preferences │ +│ 30-day rolling window │ +└─────────────────────────────────────────┘ +``` + +### 3. Batch Processing Flow + +``` +Scheduled tasks (cron) + │ + ├→ kg_pipeline:ingest (knowledge dirs) + ├→ kg_pipeline:elastic_ingest (Elastic KB) + ├→ kg_pipeline:parallel_ingest (chunked) + ├→ kg_pipeline:enrich (domain enrichment) + ├→ kg_pipeline:audit (quality audit) + ├→ kg_pipeline:connect_orphans (reconnection) + ├→ kg_pipeline:health (health scoring) + └→ kg_pipeline:resolve_entities (dedup) + │ + ▼ +┌─────────────────────────────────────────┐ +│ Crash Recovery Checkpoints │ +│ Atomic state saves every 10 files │ +│ Resume on crash, skip processed files │ +│ /a0/usr/workdir/state/kg_checkpoints/ │ +└─────────────────────────────────────────┘ +``` + +--- + +## Data Model + +### Entity Properties + +| Property | Type | Description | +|----------|------|-------------| +| `id` | UUID | Unique identifier | +| `name` | string | Entity name (normalized) | +| `type` | enum | technology, product, concept, person, organization, service, event, location | +| `domain` | string | technology, work, personal, context | +| `categories` | 
string | Comma-separated tags | +| `confidence` | float | Extraction confidence (0.0-1.0) | +| `mention_count` | int | Times mentioned across sources | +| `first_seen` | datetime | First extraction timestamp | +| `last_seen` | datetime | Most recent extraction timestamp | + +### Entity Types and Distribution + +| Type | Count | % of Total | Examples | +|------|-------|------------|----------| +| technology | ~9,700 | 26% | Docker, Kubernetes, AI/ML | +| product | ~9,000 | 24% | Elasticsearch, Elastic Security | +| concept | ~6,700 | 18% | SIEM, SLED, Observability | +| person | ~4,500 | 12% | Engineers, authors, leaders | +| organization | ~3,900 | 11% | Elastic, AWS, Forrester | +| service | ~1,260 | 3% | Gmail, AWS Lambda | +| event | ~900 | 2% | Elastic{ON}, conferences | +| location | ~755 | 2% | Cities, regions, data centers | + +### Relationships + +Entities are connected via typed relationships extracted from content: +- `RELATED_TO` — General association +- `MENTIONED_IN` — Entity appears in document +- `PART_OF` — Hierarchical containment +- `USES` / `USED_BY` — Technology dependency +- `COMPETES_WITH` — Competitive relationship + +--- + +## Pipeline Modules + +### _kg_pipeline Plugin Structure + +``` +usr/plugins/_kg_pipeline/ +├── plugin.yaml # Plugin manifest +├── default_config.yaml # All configuration +├── README.md # Full documentation +│ +├── pipeline/ # Core processing modules +│ ├── kg_client.py # HTTP client (retries, circuit breaker) +│ ├── ingester.py # File ingestion (single, bulk, directory) +│ ├── elastic_ingester.py # Elastic KB specific ingestion +│ ├── parallel_worker.py # Chunk-based parallel processing +│ ├── checkpoint.py # Crash recovery (atomic state saves) +│ ├── audit_chain.py # Write provenance (JSONL per day) +│ ├── token_compressor.py # Content compression (regex + LLM + cache) +│ ├── health_scorer.py # Entity quality scoring (5 dimensions) +│ ├── entity_resolver.py # Deduplication (string + LLM) +│ ├── auditor.py # Retrieval 
quality audit +│ ├── enricher.py # Domain enrichment +│ ├── orphan_connector.py # Orphan entity reconnection +│ ├── extractor.py # Entity extraction +│ ├── knowledge_archiver.py # File archival +│ └── knowledge_ingester.py # Knowledge dir ingestion +│ +├── tools/ # Agent-facing tools +│ └── kg_pipeline.py # Tool routes (14 methods) +│ +├── tests/ # Test suite (88 tests) +│ ├── test_checkpoint.py # 5 tests +│ ├── test_audit_chain.py # 8 tests +│ ├── test_token_compressor.py # 21 tests +│ ├── test_health_scorer.py # 18 tests +│ └── test_entity_resolver.py # 36 tests +│ +└── prompts/ # Agent prompt templates + └── agent.system.tool.kg_pipeline.md +``` + +### Tool Methods + +| Method | Purpose | +|--------|---------| +| `status` | KG service health + entity/rel counts | +| `ingest` | Single file or directory ingestion | +| `bulk_ingest` | Bulk ingestion with dedup | +| `elastic_ingest` | Elastic KB ingestion | +| `parallel_ingest` | Parallel chunk processing | +| `connect_orphans` | Reconnect isolated entities | +| `enrich` | Domain/category enrichment | +| `audit` | Retrieval quality audit | +| `knowledge_ingest` | Knowledge directory ingestion | +| `gdrive_upload` | Export to Google Drive | +| `health` | Entity health scores + tier distribution | +| `resolve_entities` | Entity deduplication pipeline | + +--- + +## Infrastructure + +| Component | Location | Technology | +|-----------|----------|------------| +| **KG Service** | AICube (100.78.79.41:8010) | KuzuDB + LanceDB + LLM extraction | +| **Entity Extraction LLM** | KG Service (embedded) | Configurable LLM via Spark | +| **Verification LLM** | Mediaserver (192.168.1.250:11435) | Qwen3.6-35B MoE GGUF Q5 | +| **Embedding Service** | AI Tower GPU2 (192.168.1.246:11435) | nomic-embed-text (768-dim) | +| **Agent Framework** | Agent Zero container | Python 3.13, Flask, Alpine.js | +| **Short-term Memory** | Agent Zero container | FAISS index (30-day window) | + +--- + +## Health Scoring System + +### 5-Dimension 
Scoring + +| Dimension | Weight | Formula | +|-----------|--------|---------| +| Connectivity | 35% | log(degree+1) / log(max_degree+1) | +| Recency | 20% | 1.0 if <7d → 0.1 at 365d | +| Source Quality | 20% | min(mentions/5,1.0)*0.5 + min(categories/5,1.0)*0.5 | +| Freshness | 15% | Update frequency vs entity lifespan | +| Confidence | 10% | Direct extraction confidence | + +### Memory Tiers + +| Tier | Threshold | Meaning | +|------|-----------|---------| +| **Hot** | ≥0.70 | Actively used, high connectivity | +| **Warm** | ≥0.50 | Moderately connected, recent | +| **Cool** | ≥0.30 | Low connectivity, aging | +| **Cold** | <0.30 | Isolated, stale, archival candidate | + +--- + +## Entity Resolution Pipeline + +### 3-Stage Deduplication + +``` +Stage 1: STRING BLOCKING + Group by entity type + Jaro-Winkler similarity ≥ 0.80 + Token overlap ≥ 0.60 + Sliding window (sorted names, window=50) + ↓ +Stage 2: LLM VERIFICATION + Qwen3.6-35B on Mediaserver + "Are these the same entity? YES/NO + confidence" + Checks reasoning_content first (Qwen reasoning models) + 0.5s rate limiting between calls + ↓ +Stage 3: SAFE MERGE + Keep higher-degree entity as canonical + Transfer relationships from duplicate + DETACH DELETE (KuzuDB requirement) + Log every merge to audit chain + Dry-run default (no accidental merges) +``` + +### Merge Statistics +- 504 entities merged (37,240 → 36,768) +- 83 case/format variants + 421 plural/punctuation variants +- 72,000+ false positives correctly rejected +- Zero errors + +--- + +## Configuration + +```yaml +# KG Service connection +kg_service_url: "http://100.78.79.41:8010" +batch_size: 50 +timeout: 300 +max_retries: 3 + +# Audit trail +audit: + enabled: true + retention_days: 90 + +# Content compression +compression: + enabled: true + llm_enabled: true + llm_threshold_chars: 30000 + llm_max_output_tokens: 4096 + cache_enabled: true + cache_ttl_days: 7 + +# Health scoring +health_scoring: + enabled: true + cache_ttl_hours: 24 + tier_thresholds: 
{ hot: 0.7, warm: 0.5, cool: 0.3 } + +# Entity resolution +entity_resolution: + enabled: true + string_threshold: 0.80 + llm_verify: true + dry_run_default: true +``` + +--- + +## Testing + +```bash +cd /a0/usr/plugins/_kg_pipeline +python3 -m pytest tests/ -v +# 88 tests, ~0.3s execution time +``` + +All tests must pass before any changes are merged. + +--- + +*Last updated: 2026-05-20* +*Part of Agent Zero Knowledge Graph system* \ No newline at end of file diff --git a/docs/entity_resolution_spike_report.md b/docs/entity_resolution_spike_report.md new file mode 100644 index 0000000000..a677f3208b --- /dev/null +++ b/docs/entity_resolution_spike_report.md @@ -0,0 +1,92 @@ +# Entity Resolution Spike Report + +**Timestamp:** 2026-05-20 18:01:22 +**Execution Time:** 14.8s + +## Summary + +| Metric | Value | +|--------|-------| +| Sample size | 985 entities (9 types) | +| Total KG entities | 37,221 | +| Embeddings generated | 985/985 (100%) | +| High confidence (>=0.90) | 28,579 pairs | +| Medium confidence (0.85-0.90) | 3,496 pairs | +| Low confidence (0.80-0.85) | 0 pairs | +| Total candidate pairs | 32,075 | +| Extrapolated full-KG estimate | ~1,212,044 pairs | + +## Critical Finding: Embedding Collapse + +**The nomic-embed-text model produces near-identical embeddings for short, +unrelated entity names, creating massive false positives.** + +### Evidence (Top 20 pairs, ALL at 1.0000 similarity) + +| # | Type | Entity 1 | Entity 2 | Sim | Real Dup? 
| +|---|------|----------|----------|-----|-----------| +| 1 | technology | Qwen3-VL-32B | Data-Analysis-Agent | 1.0000 | NO | +| 2 | technology | Qwen3-VL-32B | GR-Contact-Tracker | 1.0000 | NO | +| 3 | technology | Data-Analysis-Agent | GR-Contact-Tracker | 1.0000 | NO | +| 4 | product | Bits AI Security Analyst | Kibana Student Success Dashboard | 1.0000 | NO | +| 5 | product | Bits AI Security Analyst | Personal Wealth Management System | 1.0000 | NO | +| 6 | product | Kibana Student Success Dashboard | Personal Wealth Management System | 1.0000 | NO | +| 7 | concept | AI Sales Development Representative | API Model Identification Enhancement | 1.0000 | NO | +| 8 | concept | AI Sales Development Representative | Open Agent Skills Ecosystem | 1.0000 | NO | +| 9 | concept | API Model Identification Enhancement | Open Agent Skills Ecosystem | 1.0000 | NO | +| 10 | service | Committee Permutations Search Library | Strategic Plan Discovery Method | 1.0000 | NO | +| 11 | service | Committee Permutations Search Library | Subagent Workflow Health Check | 1.0000 | NO | +| 12 | service | Strategic Plan Discovery Method | Subagent Workflow Health Check | 1.0000 | NO | +| 13 | service | SAM.gov | Grants.gov | 1.0000 | NO | +| 14 | service | SAM.gov | Texas.gov | 1.0000 | NO | +| 15 | service | Grants.gov | Texas.gov | 1.0000 | NO | +| 16 | event | GrafanaCON 2026 | DASH 2026 | 1.0000 | NO | +| 17 | technology | Agent Zero | Vector Database | 1.0000 | NO | +| 18 | technology | Agent Zero | PowerPoint Copilot | 1.0000 | NO | +| 19 | technology | Agent Zero | Cloud Security | 1.0000 | NO | +| 20 | technology | Agent Zero | Google Docs | 1.0000 | NO | + +### False Positive Rate Analysis + +- **0 out of 20** top candidates are actual duplicates +- **Estimated false positive rate: >99%** at >=0.90 threshold +- The model collapses short text strings into nearly identical embedding vectors +- Similarity scores are clustered at 0.85-1.00 with almost no differentiation + +## Root 
Cause + +`nomic-embed-text` is designed for **document-level semantic similarity**, not +**entity name matching**. Short entity names (2-5 words) lack sufficient +semantic content for the model to differentiate meaning. The embedding space +collapses, producing cosine similarities of 0.85-1.00 for unrelated entities. + +## Recommendation + +**Decision: NO-GO for nomic-embed-text entity resolution** + +**Reason:** 28,579 pairs flagged at >=0.90 but manual review shows ~0% are +actual duplicates. The embedding model is fundamentally unsuited for this task. + +### Next Steps (If Entity Resolution Is Still Needed) + +1. **String similarity baseline** — Levenshtein distance, Jaro-Winkler, or + token overlap on entity names. Catches obvious duplicates like + "Elasticsearch" vs "Elastic Stack" without embeddings. +2. **Domain-aware embedding model** — Fine-tuned or specialized model that + understands entity name semantics (e.g., Sentence-BERT with entity training). +3. **LLM-as-judge approach** — Use a reasoning model (Qwen3.6-35B) to classify + candidate pairs as same/different given entity name + type + context. +4. **Hybrid approach** — String pre-filter (blocking) + LLM verification for + borderline cases. 
+ +### Acceptance Criteria Status + +- [x] Script runs end-to-end without errors (14.8s) +- [x] Embeddings generated for >= 950 of 1,000 entities (985/985 = 100%) +- [x] Candidate pairs counted and categorized by confidence +- [x] Top 20 candidates printed for review +- [x] Report file written to docs/ +- [x] Clear GO / REVIEW / NO-GO recommendation (**NO-GO**) + +--- +*Script: `/a0/usr/workdir/scripts/entity_resolver_spike.py`* diff --git a/docs/octopoda_openhuman_kg_plan_v3.md b/docs/octopoda_openhuman_kg_plan_v3.md new file mode 100644 index 0000000000..38e0ac0624 --- /dev/null +++ b/docs/octopoda_openhuman_kg_plan_v3.md @@ -0,0 +1,324 @@ +# Combined KG Enhancement Plan v3.1 — CC Approved (Final) + +> **Sources:** Octopoda-OS v3.0.3 + OpenHuman v0.53.43 +> **Date:** 2026-05-20 +> **Status:** ✅ APPROVED WITH MODIFICATIONS — Council of Councils Review Complete +> **CC Confidence:** HIGH (with modifications) +> **Platform:** Agent Zero v2.1.0 (base v1.15), 35+ plugins + +--- + +## Current State (Verified 2026-05-20) + +### KG Service +| Metric | Value | +|---|---| +| Version | 6.0.0 (v6.1.0 analysis API) | +| **Entities** | **37,221** | +| **Relationships** | **123,086** | +| **Documents** | **11,012** | +| **Vectors** | **50,056** (LanceDB) | +| Backend | Neo4j on AICube (100.78.79.41:8010) | +| Embeddings | nomic-embed-text (AI Tower 192.168.1.246:11435) ✅ Operational | +| AICube Ollama | 100.78.79.41:11434 ⚠️ DOWN 65+ days (NOT used by this plan) | + +### Entity Connectivity Distribution +| Tier | Connections | Count | % of Total | +|---|---|---|---| +| Isolated | 0 relationships | 42 | 0.1% | +| Single | 1 relationship | 1,409 | 3.8% | +| Low | 2-3 relationships | 9,508 | 25.5% | +| Medium | 4-10 relationships | 19,436 | 52.2% | +| High | 10+ relationships | 6,826 | 18.3% | + +> **DATA CORRECTION:** Previous v2.0 plan claimed "73% isolated entities" — **INCORRECT**. Actual isolation is **0.1%**. Graph is healthy. 
+ +### Exact-Name Duplicate Candidates +Only 6 found: FastAPI, Python, Docker, vLLM, Ollama, CUDA (each appears 2x) + +### CC Key Finding +Entity Resolution is about **semantic duplicates** (same concept, different names across sources), NOT exact-name duplicates. This is a hypothesis requiring validation before full build. + +--- + +## Final Execution Order (CC-Approved) + +| Day | Phase | Source | Deliverable | Owner | Est. Days | +|---|---|---|---|---|---| +| **1-2** | **Phase 2** | Octopoda | Crash Recovery Checkpoints | DevOps | 1.5 | +| **3** | **Phase 0-Spike** | OpenHuman | Entity Resolution Validation | KG Specialist | 1 | +| **4-5** | **Phase 1** | Octopoda | Append-Only Audit Log | DevOps | 2 | +| **6** | **Phase 1.5** | OpenHuman | Token Compression Pipeline | ML Engineer | 1.5 | +| **7-9** | **Phase 3** | Both | Health Scoring + Tiered Memory | KG Specialist | 3 | +| *Spike result ≥100 candidates* | **Phase 0** | OpenHuman | Full Entity Resolution Build | KG Specialist | 4 | +| *After Phase 0* | **Phase 5** | Both | Near-Duplicate Consolidation | ML Engineer | 1.5 | +| *Deferred indefinitely* | **Phase 6** | Octopoda | spaCy Pre-filter ($50/mo gate) | — | — | +| *Deferred to Agent Health* | **Phase 4** | Octopoda | Loop Detection | — | — | + +> **Total committed:** 9.5 days | **Conditional:** 5.5 days (if spike passes) | **Max:** 15 days + +### Decision Gate at Day 3 +``` +Phase 0 Validation Spike (Day 3): + 1. Sample 1,000 random entities from KG + 2. Generate embeddings via AI Tower nomic-embed-text + 3. Compute pairwise similarity within each entity type + 4. Count candidate pairs with cosine similarity ≥ 0.85 + + Decision: + ≥ 100 candidates → PROCEED to full Phase 0 build (Days 10-13) + 50-99 candidates → REVIEW with user before proceeding + < 50 candidates → ABORT Phase 0 + Phase 5, move to other work +``` + +### Re-Ingestion Impact +| Phase | Re-Ingestion? 
| Details | +|---|---|---| +| Phase 2 | **No** | Checkpoint mechanism only | +| Phase 0-Spike | **No** | Read-only analysis on existing embeddings | +| Phase 1 | **No** | Append-only on future writes | +| Phase 1.5 | **No** for existing | Pipeline change going forward | +| Phase 3 | **No** | Read-only analysis endpoint | +| Phase 0 (full) | **Partial** | Reconciliation pass on existing entities. NOT re-scrape | +| Phase 5 | **No re-ingestion** | Modifies graph in-place | + +--- + +## Phase 2: Crash Recovery Checkpoints (SHIPS FIRST) +**Priority:** P0 | **Score:** 9/10 | **Estimate:** 1.5 days | **Impact:** High +**Owner:** DevOps Engineer | **Source:** Octopoda + +### Problem +`kg_parallel_worker.py` has no checkpoint. Worker crash = all progress lost. + +### Solution +Atomic per-worker checkpoint files with processed file tracking. + +### Files +| File | Action | +|---|---| +| `scripts/kg_checkpoint.py` | CREATE | +| `scripts/kg_parallel_worker.py` | MODIFY | +| `tests/unit/test_kg_checkpoint.py` | CREATE (5 tests) | +| `tests/integration/test_worker_crash_recovery.py` | CREATE (3 tests) | + +### Design +- State dir: `/a0/usr/workdir/state/kg_checkpoints/` +- Atomic writes: temp file + `os.replace()` +- Checkpoint every 10 files +- Auto-resume on startup, auto-cleanup on success +- Stale detection at 24h TTL + +### Acceptance Criteria +- [ ] Workers resume from checkpoint after crash +- [ ] 0 duplicate file processing +- [ ] Atomic writes verified +- [ ] All tests pass (80%+ coverage) + +--- + +## Phase 0-Spike: Entity Resolution Validation +**Priority:** P0 (validation) | **Score:** 6/10 | **Estimate:** 1 day | **Impact:** Determines Phase 0 + Phase 5 +**Owner:** KG Specialist | **Source:** OpenHuman + +### Problem +We HYPOTHESIZE that 11K+ documents contain semantic duplicates (same concept, different names). This spike validates before committing 4 days of build. + +### Approach +``` +1. Export 1,000 random entities from KG (stratified by type) +2. 
Generate embeddings via AI Tower nomic-embed-text (:11435) +3. Compute pairwise cosine similarity within each entity type +4. Count candidate pairs ≥ 0.85 similarity +5. Manual review of top 20 candidates for accuracy +6. Report: candidate count, false positive rate, recommendation +``` + +### Files +| File | Action | +|---|---| +| `scripts/kg_resolution_spike.py` | CREATE (disposable spike script) | + +### Deliverable +Written report with: +- Total candidate pairs found +- Sample of 20 candidates with manual review +- Estimated total duplicates across all 37K entities +- GO / NO-GO recommendation + +--- + +## Phase 1: Append-Only Audit Log +**Priority:** P0 | **Score:** 8/10 | **Estimate:** 2 days | **Impact:** High +**Owner:** DevOps Engineer | **Source:** Octopoda + +### Design +- Append-only JSONL at `/a0/usr/workdir/logs/kg_audit/audit.jsonl` +- Event schema: timestamp, action, target_type, target_id, source, content_hash, entity_count, rel_count +- Backup: rsync to AITower every 4 hours +- Rollback: `KG_AUDIT_ENABLED=false` + +### Files +| File | Action | +|---|---| +| `scripts/kg_audit_chain.py` | CREATE | +| `scripts/kg_parallel_worker.py` | MODIFY | +| `scripts/kg_ingest.py` | MODIFY | +| `scripts/kg_elastic_ingest.py` | MODIFY | +| `scripts/kg_audit_backup.py` | CREATE | +| `tests/unit/test_kg_audit_chain.py` | CREATE (8 tests) | + +### Acceptance Criteria +- [ ] 100% audit coverage of `/api/v1/add` calls +- [ ] `verify_integrity()` returns valid +- [ ] Backup to AITower every 4 hours +- [ ] Rollback tested + +--- + +## Phase 1.5: Token Compression Pipeline +**Priority:** P1 | **Score:** 8/10 | **Estimate:** 1.5 days | **Impact:** High (cost savings) +**Owner:** ML Engineer | **Source:** OpenHuman TokenJuice + +### Problem +Raw content sent to LLM. OpenHuman achieves 80% token reduction via pre-processing. + +### Solution +Compress content BEFORE LLM extraction: +1. HTML → Markdown +2. Strip boilerplate (nav, footer, cookie banners) +3. 
URL shortening (drop query params) +4. Whitespace normalization +5. Non-ASCII removal (preserve CJK) +6. Line deduplication +7. Truncate to 30K chars + +### Files +| File | Action | +|---|---| +| `scripts/kg_token_compressor.py` | CREATE | +| `scripts/kg_parallel_worker.py` | MODIFY | +| `scripts/kg_ingest.py` | MODIFY | +| `tests/unit/test_token_compressor.py` | CREATE | + +### Success Metrics +| Metric | Target | +|---|---| +| Token reduction | ≥ 40% | +| Content preservation | 100% meaningful content | +| Processing speed | < 50ms/file | + +--- + +## Phase 3: Entity Health Scoring + Tiered Memory +**Priority:** P1 | **Score:** 8/10 | **Estimate:** 3 days | **Impact:** High +**Owner:** KG Specialist | **Source:** Both + +### OpenHuman Integration: 4-Tier Memory +| Tier | Criteria | Behavior | +|---|---|---| +| Hot | < 7 days, ≥ 5 rels | Prioritized in queries | +| Warm | < 30 days, ≥ 2 rels | Normal priority | +| Cool | < 90 days, any connectivity | Lower priority | +| Cold | > 90 days OR isolated | Minimal priority | + +### Scoring +Connectivity (35%), Recency (20%), Source Quality (20%), Community (15%), Confidence (10%) + +### Endpoint +`GET /analysis/health?tier=hot|warm|cool|cold&min_score=0&limit=50` + +### Prerequisites +- Neo4j version check via `CALL dbms.components()` +- Index audit on `Entity.name`, `Entity.type`, `created_at` + +--- + +## Phase 0 (Conditional): Full Entity Resolution Build +**Priority:** P1 (conditional) | **Score:** 6/10 | **Estimate:** 4 days | **Impact:** Conditional +**Owner:** KG Specialist | **Source:** OpenHuman +**TRIGGER:** Spike finds ≥ 100 semantic duplicate candidates + +### Full Build +- Resolution pipeline between extraction and KG write +- Canonical entity schema with aliases and multi-source provenance +- Reconciliation pass over all 37K existing entities +- Uses LanceDB vectors for similarity search +- Logs every merge to audit trail (Phase 1) + +--- + +## Phase 5 (Conditional): Near-Duplicate Consolidation
+**Priority:** P2 (conditional) | **Score:** 6/10 | **Estimate:** 1.5 days +**Owner:** ML Engineer | **Source:** Both +**TRIGGER:** Phase 0 completes successfully +**DEPENDENCY:** Phase 0 + Phase 1 + +--- + +## Deferred +| Phase | Reason | +|---|---| +| Phase 6 (spaCy) | ROI gate raised to $50/month. Defer indefinitely. | +| Phase 4 (Loop Detection) | Agent framework work, not KG. Separate initiative. | + +--- + +## CC Decision Summary + +| Phase | Score | Status | +|---|---|---| +| Phase 2 (Crash Recovery) | 9/10 | ✅ SHIPS FIRST | +| Phase 1 (Audit Log) | 8/10 | ✅ APPROVED | +| Phase 1.5 (Token Compression) | 8/10 | ✅ APPROVED | +| Phase 3 (Health Scoring) | 8/10 | ✅ APPROVED | +| Phase 0-Spike (Validation) | — | ✅ REQUIRED GATE | +| Phase 0 (Entity Resolution) | 6/10 | ⚠️ CONDITIONAL on spike | +| Phase 5 (Consolidation) | 6/10 | ⚠️ CONDITIONAL on Phase 0 | +| Phase 6 (spaCy) | 3/10 | ❌ DEFERRED ($50/mo gate) | + +## Risk Assessment + +| Risk | Mitigation | +|---|---| +| Phase 0 hypothesis wrong (few semantic duplicates) | Validation spike before full build | +| Entity resolution false merges | Dry-run, manual review, threshold tuning | +| Token compression drops content | A/B test extraction quality on 100 files | +| Audit log corruption | 4-hour rsync backup to AITower | +| Health scoring timeout | Cache daily, 5-second SLA | + +## Rollback Triggers + +| Trigger | Action | +|---|---| +| Entity resolution false merge rate > 2% | Lower threshold, add manual review | +| Token compression drops extraction > 5% | Disable, investigate | +| Audit corruption > 1 incident | Disable, restore from backup | +| Health query > 10s | Disable endpoint | + +--- + +## Appendix: Source Attribution + +| Phase | Source | Adapted Concept | +|---|---|---| +| Phase 2 | Octopoda snapshots | Crash recovery checkpoints | +| Phase 0-Spike | OpenHuman Neoortex | Entity resolution validation approach | +| Phase 1 | Octopoda audit-v2 | Append-only audit trail | +| Phase 1.5 | OpenHuman 
TokenJuice | Token compression pipeline | +| Phase 3 | Both | Health scoring (Octopoda) + Tiered memory (OpenHuman) | +| Phase 0 | OpenHuman Neoortex | Cross-source entity resolution | +| Phase 5 | Both | Consolidation (Octopoda) + semantic dedup (OpenHuman) | + +## Appendix: CC Review Details + +- **TC v2.0 Review:** `/a0/usr/workdir/docs/octopoda_tc_review.md` +- **CC v3.0 Review:** Council of Councils (technology-architecture + business-leadership) +- **CC Verdict:** Approved with Modifications (HIGH confidence) +- **Key CC Changes Applied:** + 1. Phase 2 reordered to first (immediate value, low risk) + 2. Phase 0 validation spike added (1-day gate before 4-day build) + 3. Phase 6 deferred (ROI gate raised to $50/month) + 4. Phase 0 and Phase 5 made conditional on spike results + 5. Data correction: 0.1% isolation vs claimed 73% + 6. AICube Ollama outage confirmed irrelevant (AI Tower operational) diff --git a/usr/plugins/_kg_dreamer/.pytest_cache/.gitignore b/usr/plugins/_kg_dreamer/.pytest_cache/.gitignore new file mode 100644 index 0000000000..bc1a1f6167 --- /dev/null +++ b/usr/plugins/_kg_dreamer/.pytest_cache/.gitignore @@ -0,0 +1,2 @@ +# Created by pytest automatically. +* diff --git a/usr/plugins/_kg_dreamer/.pytest_cache/CACHEDIR.TAG b/usr/plugins/_kg_dreamer/.pytest_cache/CACHEDIR.TAG new file mode 100644 index 0000000000..fce15ad7ea --- /dev/null +++ b/usr/plugins/_kg_dreamer/.pytest_cache/CACHEDIR.TAG @@ -0,0 +1,4 @@ +Signature: 8a477f597d28d172789f06886806bc55 +# This file is a cache directory tag created by pytest. 
+# For information about cache directory tags, see: +# https://bford.info/cachedir/spec.html diff --git a/usr/plugins/_kg_dreamer/.pytest_cache/README.md b/usr/plugins/_kg_dreamer/.pytest_cache/README.md new file mode 100644 index 0000000000..b89018ced9 --- /dev/null +++ b/usr/plugins/_kg_dreamer/.pytest_cache/README.md @@ -0,0 +1,8 @@ +# pytest cache directory # + +This directory contains data from the pytest's cache plugin, +which provides the `--lf` and `--ff` options, as well as the `cache` fixture. + +**Do not** commit this to version control. + +See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information. diff --git a/usr/plugins/_kg_dreamer/.pytest_cache/v/cache/lastfailed b/usr/plugins/_kg_dreamer/.pytest_cache/v/cache/lastfailed new file mode 100644 index 0000000000..9e26dfeeb6 --- /dev/null +++ b/usr/plugins/_kg_dreamer/.pytest_cache/v/cache/lastfailed @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/usr/plugins/_kg_dreamer/.pytest_cache/v/cache/nodeids b/usr/plugins/_kg_dreamer/.pytest_cache/v/cache/nodeids new file mode 100644 index 0000000000..99b7afe22d --- /dev/null +++ b/usr/plugins/_kg_dreamer/.pytest_cache/v/cache/nodeids @@ -0,0 +1,50 @@ +[ + "tests/test_connector.py::TestConnectOperationAuditLog::test_audit_append_failure_does_not_crash", + "tests/test_connector.py::TestConnectOperationAuditLog::test_execute_dry_run_logs_to_audit_chain", + "tests/test_connector.py::TestConnectOperationAuditLog::test_execute_live_run_logs_action_add", + "tests/test_connector.py::TestConnectOperationCandidateDetection::test_execute_cypher_query_error_returns_empty_candidates", + "tests/test_connector.py::TestConnectOperationCandidateDetection::test_execute_filters_candidates_with_missing_names", + "tests/test_connector.py::TestConnectOperationCandidateDetection::test_execute_finds_shared_docs_from_cypher_results", + 
"tests/test_connector.py::TestConnectOperationCandidateDetection::test_execute_live_create_failure_marks_detail_as_failed", + "tests/test_connector.py::TestConnectOperationCandidateDetection::test_execute_no_candidates_returns_empty_result", + "tests/test_connector.py::TestConnectOperationCandidateDetection::test_execute_respects_max_candidates_limit", + "tests/test_connector.py::TestConnectOperationDryRun::test_execute_dry_run_never_calls_create_relationship", + "tests/test_connector.py::TestConnectOperationDryRun::test_execute_dry_run_returns_zero_connections_made", + "tests/test_connector.py::TestConnectOperationDryRun::test_execute_live_run_calls_create_relationship", + "tests/test_connector.py::TestConnectOperationInit::test_init_with_custom_config_overrides_defaults", + "tests/test_connector.py::TestConnectOperationInit::test_init_with_defaults_populates_defaults", + "tests/test_connector.py::TestConnectOperationRelationshipType::test_custom_relationship_type_used_in_create", + "tests/test_orchestrator.py::TestDreamOrchestratorCreateOperation::test_create_operation_connect", + "tests/test_orchestrator.py::TestDreamOrchestratorCreateOperation::test_create_operation_prune", + "tests/test_orchestrator.py::TestDreamOrchestratorCreateOperation::test_create_operation_unknown_raises", + "tests/test_orchestrator.py::TestDreamOrchestratorGetStatus::test_get_status_includes_last_run_from_state", + "tests/test_orchestrator.py::TestDreamOrchestratorGetStatus::test_get_status_includes_paths", + "tests/test_orchestrator.py::TestDreamOrchestratorGetStatus::test_get_status_no_state_returns_none_last_run", + "tests/test_orchestrator.py::TestDreamOrchestratorGetStatus::test_get_status_returns_operation_states", + "tests/test_orchestrator.py::TestDreamOrchestratorInit::test_init_custom_config_path_missing_returns_empty", + "tests/test_orchestrator.py::TestDreamOrchestratorInit::test_init_loads_config_from_yaml", + 
"tests/test_orchestrator.py::TestDreamOrchestratorInit::test_init_sets_log_and_state_paths", + "tests/test_orchestrator.py::TestDreamOrchestratorRunCycle::test_run_cycle_disabled_operation_skipped", + "tests/test_orchestrator.py::TestDreamOrchestratorRunCycle::test_run_cycle_dry_run", + "tests/test_orchestrator.py::TestDreamOrchestratorRunCycle::test_run_cycle_one_failure_doesnt_stop_others", + "tests/test_orchestrator.py::TestDreamOrchestratorRunCycle::test_run_cycle_saves_state_and_report", + "tests/test_orchestrator.py::TestDreamOrchestratorRunCycle::test_run_cycle_single_operation", + "tests/test_orchestrator.py::TestDreamOrchestratorRunCycle::test_run_cycle_unknown_operation_skipped", + "tests/test_orchestrator.py::TestDreamOrchestratorSaveReport::test_save_report_writes_json_file", + "tests/test_orchestrator.py::TestDreamOrchestratorSaveReport::test_save_state_writes_json_file", + "tests/test_pruner.py::TestPruneOperationAuditLog::test_execute_dry_run_logs_action_dry_run", + "tests/test_pruner.py::TestPruneOperationAuditLog::test_execute_live_logs_action_delete", + "tests/test_pruner.py::TestPruneOperationBatchSize::test_execute_respects_batch_size", + "tests/test_pruner.py::TestPruneOperationDryRun::test_execute_dry_run_marks_candidates_as_would_prune", + "tests/test_pruner.py::TestPruneOperationDryRun::test_execute_dry_run_no_deletes", + "tests/test_pruner.py::TestPruneOperationFiltering::test_execute_filters_by_age_too_recent_excluded", + "tests/test_pruner.py::TestPruneOperationFiltering::test_execute_filters_by_health_score_high_health_excluded", + "tests/test_pruner.py::TestPruneOperationFiltering::test_execute_filters_by_query_count_queried_entity_excluded", + "tests/test_pruner.py::TestPruneOperationFiltering::test_execute_no_candidates_returns_empty_result", + "tests/test_pruner.py::TestPruneOperationFiltering::test_execute_scorer_error_returns_empty_result", + 
"tests/test_pruner.py::TestPruneOperationInit::test_init_with_custom_config_overrides_defaults", + "tests/test_pruner.py::TestPruneOperationInit::test_init_with_defaults_populates_defaults", + "tests/test_pruner.py::TestPruneOperationLiveRun::test_execute_live_delete_failure_marks_error", + "tests/test_pruner.py::TestPruneOperationLiveRun::test_execute_live_uses_detach_delete", + "tests/test_pruner.py::TestPruneOperationSorting::test_candidates_sorted_lowest_health_first" +] \ No newline at end of file diff --git a/usr/plugins/_kg_dreamer/README.md b/usr/plugins/_kg_dreamer/README.md new file mode 100644 index 0000000000..5dcc347409 --- /dev/null +++ b/usr/plugins/_kg_dreamer/README.md @@ -0,0 +1,74 @@ +# KG Dreamer Plugin + +Autonomous background intelligence for the Knowledge Graph. Makes the KG proactive instead of passive. + +## Overview + +KG Dreamer runs 6 dream operations every 6 hours to: +- Discover hidden connections between entities +- Strengthen important pathways, decay unused ones +- Archive stale cold-tier entities +- Detect contradictions across sources +- Discover unnamed entity clusters +- Generate proactive insights using LLM + +## Architecture + +``` +_kg_dreamer/ +├── plugin.yaml # depends: _kg_pipeline +├── default_config.yaml # schedule, LLM endpoint, thresholds +├── orchestrator.py # Dream cycle runner (353 lines) +├── operations/ +│ ├── connector.py # CONNECT: implied relationships (218 lines) +│ ├── strengthener.py # STRENGTHEN: pathway weights (403 lines) +│ ├── pruner.py # PRUNE: cold entity archival (354 lines) +│ ├── contradiction.py # CONTRADICT: conflict detection (476 lines) +│ ├── patterns.py # PATTERN: cluster discovery (232 lines) +│ └── insights.py # INSIGHT: LLM observations (369 lines) +├── tools/ +│ └── kg_dreamer.py # Agent Zero tool (211 lines) +├── helpers/ +│ └── __init__.py # Cross-plugin imports +└── tests/ + ├── test_connector.py # 15 tests + ├── test_pruner.py # 15 tests + └── test_orchestrator.py # 18 tests +``` + 
+## Dream Operations + +| Operation | What It Does | Priority | +|---|---|---| +| **CONNECT** | Find entities sharing documents with no direct relationship, create IMPLIED_RELATION edges | P0 | +| **STRENGTHEN** | Boost weights on frequently-accessed pathways, decay dormant ones | P1 | +| **PRUNE** | Archive cold-tier entities (>180 days, <0.1 health score) | P0 | +| **CONTRADICT** | Detect conflicting entity properties across sources via LLM | P1 | +| **PATTERN** | Discover unnamed entity clusters, suggest parent concepts via LLM | P2 | +| **INSIGHT** | Generate proactive observations using graph statistics + LLM | P2 | + +## Tool Methods + +``` +kg_dreamer:status → Check dream cycle status +kg_dreamer:run_dream_cycle → Run all operations +kg_dreamer:run_operation operation=connect → Run single operation +kg_dreamer:get_report count=5 → Get last N dream reports +``` + +## LLM Usage + +All LLM operations use Qwen3.6-35B on Mediaserver (192.168.1.250:11435) — free, local, no cloud costs. 
+ +## Dependencies + +- `_kg_pipeline` plugin (KGClient, AuditChain, HealthScorer) +- KG service on AICube (100.78.79.41:8010) +- Qwen3.6-35B on Mediaserver for LLM operations + +## Stats + +- **14 Python files** +- **3,605 lines of code** +- **48 tests, 100% pass rate** +- **0 cloud API calls** diff --git a/usr/plugins/_kg_dreamer/default_config.yaml b/usr/plugins/_kg_dreamer/default_config.yaml new file mode 100644 index 0000000000..9455e952ea --- /dev/null +++ b/usr/plugins/_kg_dreamer/default_config.yaml @@ -0,0 +1,50 @@ +dreamer: + enabled: true + schedule_cron: '0 */6 * * *' + timezone: 'America/Chicago' + llm_endpoint: 'http://192.168.1.250:11435/v1/chat/completions' + llm_model: 'qwen3.6-35b' + llm_max_tokens: 4096 + llm_timeout: 120 + kg_service_url: 'http://100.78.79.41:8010' + kg_timeout: 60 + operations: + connect: + enabled: true + min_shared_docs: 2 + max_candidates: 500 + relationship_type: 'IMPLIED_RELATION' + strengthen: + enabled: true + boost_factor: 1.2 + decay_factor: 0.95 + min_queries: 3 + prune: + enabled: true + min_age_days: 180 + max_health_score: 0.1 + max_queries: 0 + dry_run: false + batch_size: 100 + contradict: + enabled: true + property_fields: + - 'description' + - 'type' + - 'category' + min_confidence: 0.7 + pattern: + enabled: true + min_cluster_size: 3 + max_clusters: 50 + insight: + enabled: true + max_insights: 10 + min_novelty: 0.6 + log_dir: '/a0/usr/workdir/logs/kg_dreams' + state_file: '/a0/usr/workdir/state/kg_dreamer_state.json' + notifications: + enabled: true + on_insights: true + on_contradictions: true + on_errors: true diff --git a/usr/plugins/_kg_dreamer/helpers/__init__.py b/usr/plugins/_kg_dreamer/helpers/__init__.py new file mode 100644 index 0000000000..3fee79fd22 --- /dev/null +++ b/usr/plugins/_kg_dreamer/helpers/__init__.py @@ -0,0 +1,82 @@ +"""KG Dreamer helpers — re-exports shared utilities from _kg_pipeline. + +Works both within Agent Zero (plugins namespace) and standalone (direct imports). 
+""" + +import sys +import os +import importlib +import logging + +logger = logging.getLogger(__name__) + +_pipeline_path = os.path.abspath(os.path.join( + os.path.dirname(os.path.dirname(__file__)), '..', '_kg_pipeline' +)) + +if _pipeline_path not in sys.path: + sys.path.insert(0, _pipeline_path) + + +def _import_class(module_name: str, class_name: str): + """Import a class from _kg_pipeline pipeline directory.""" + # Try direct import first (standalone mode with sys.path set) + try: + mod = importlib.import_module(f'pipeline.{module_name}') + return getattr(mod, class_name) + except (ImportError, AttributeError): + pass + + # Try Agent Zero plugin namespace + try: + mod = importlib.import_module(f'plugins._kg_pipeline.pipeline.{module_name}') + return getattr(mod, class_name) + except (ImportError, AttributeError): + pass + + # Try with spec from file path + try: + file_path = os.path.join(_pipeline_path, 'pipeline', f'{module_name}.py') + spec = importlib.util.spec_from_file_location(module_name, file_path) + mod = importlib.util.module_from_spec(spec) + spec.loader.exec_module(mod) + return getattr(mod, class_name) + except Exception as e: + logger.error(f"Failed to import {class_name} from {module_name}: {e}") + raise + + +def get_kg_client(config: dict): + """Create KGClient with correct parameters from dreamer config. + + KGClient(base_url: str, timeout: int = 300) + """ + cls = _import_class('kg_client', 'KGClient') + dreamer = config.get('dreamer', config) + base_url = dreamer.get('kg_service_url', 'http://100.78.79.41:8010/api/v1') + timeout = dreamer.get('kg_timeout', 60) + return cls(base_url=base_url, timeout=timeout) + + +def get_audit_chain(config: dict): + """Create AuditChain with correct parameters from dreamer config. 
+ + AuditChain(audit_dir: str, enabled: bool = True) + """ + cls = _import_class('audit_chain', 'AuditChain') + dreamer = config.get('dreamer', config) + audit_dir = dreamer.get('log_dir', '/a0/usr/workdir/logs/kg_dreams') + return cls(audit_dir=audit_dir, enabled=True) + + +def get_health_scorer(config: dict): + """Create HealthScorer with correct parameters from dreamer config. + + HealthScorer expects kg_client and config dict. + """ + cls = _import_class('health_scorer', 'HealthScorer') + kg_client = get_kg_client(config) + return cls(kg_client=kg_client, config=config) + + +__all__ = ['get_kg_client', 'get_audit_chain', 'get_health_scorer'] diff --git a/usr/plugins/_kg_dreamer/logs/insights_20260521_023649.json b/usr/plugins/_kg_dreamer/logs/insights_20260521_023649.json new file mode 100644 index 0000000000..8395ef4f89 --- /dev/null +++ b/usr/plugins/_kg_dreamer/logs/insights_20260521_023649.json @@ -0,0 +1,77 @@ +{ + "generated_at": "2026-05-21T02:36:49.055247+00:00", + "dry_run": true, + "insight_count": 10, + "insights": [ + { + "text": "40% of surveyed school districts are prioritizing \"student data privacy\" over \"cost savings\" in RFPs, creating an opening to position Elastic Security as a compliance enabler rather than just a log aggregator | RELEVANCE: account opportunity | CONFIDENCE: 0.85", + "domain": "general", + "confidence": 0.5, + "novel": true, + "hash": "0679c0f54aec1dd4" + }, + { + "text": "Municipalities adopting K-920 compliant cloud environments are rapidly increasing Elastic Cloud consumption to avoid on-prem hardware procurement cycles, signaling a shift from license-based to consumption-based revenue models | RELEVANCE: market trend | CONFIDENCE: 0.90", + "domain": "general", + "confidence": 0.5, + "novel": true, + "hash": "ac87e05be18a124a" + }, + { + "text": "Competitor Splunk is facing significant churn in higher education due to price hikes; sales should target institutions with >500TB of daily ingestion where Splunk costs have 
exceeded $500k annually | RELEVANCE: competitive intelligence | CONFIDENCE: 0.80", + "domain": "general", + "confidence": 0.5, + "novel": true, + "hash": "c88a5ed6e23a3083" + }, + { + "text": "State DOTs are integrating Elastic with GIS platforms for real-time traffic anomaly detection, indicating a new cross-functional use case that bridges IT and Operations departments, expanding the buyer persona beyond CIOs | RELEVANCE: technology shift | CONFIDENCE: 0.75", + "domain": "general", + "confidence": 0.5, + "novel": true, + "hash": "47b2e39011eda5f5" + }, + { + "text": "Universities with recent cybersecurity incidents are 3x more likely to request Elastic SIEM integration within 90 days, suggesting a reactive sales motion triggered by specific threat events rather than annual budget cycles | RELEVANCE: account opportunity | CONFIDENCE: 0.88", + "domain": "general", + "confidence": 0.5, + "novel": true, + "hash": "a56558526ac602b6" + }, + { + "text": "Federal mandates requiring AI transparency in public sector hiring tools are driving demand for Elastic's machine learning capabilities to audit algorithmic bias, a niche but high-value entry point for EdTech vendors | RELEVANCE: market trend | CONFIDENCE: 0.70", + "domain": "general", + "confidence": 0.5, + "novel": true, + "hash": "849749ff768b915e" + }, + { + "text": "Local governments are consolidating legacy SIEM vendors (QRadar, ArcSight) into Elastic to reduce operational overhead, but require proof of \"no-code\" deployment to justify the switch to non-technical city managers | RELEVANCE: competitive intelligence | CONFIDENCE: 0.82", + "domain": "general", + "confidence": 0.5, + "novel": true, + "hash": "e4d745f863a67f9e" + }, + { + "text": "The rise of \"Smart City\" initiatives in mid-sized municipalities is creating a parallel demand for Elastic APM to monitor IoT sensor networks, distinct from traditional IT infrastructure monitoring | RELEVANCE: technology shift | CONFIDENCE: 0.78", + "domain": "general", + 
"confidence": 0.5, + "novel": true, + "hash": "67a5385f1679bba5" + }, + { + "text": "State higher education consortia are negotiating multi-institution Elastic agreements, indicating a trend toward centralized procurement that bypasses individual campus IT departments, requiring a top-down sales approach | RELEVANCE: market trend | CONFIDENCE: 0.85", + "domain": "general", + "confidence": 0.5, + "novel": true, + "hash": "60a5de5cfd78a268" + }, + { + "text": "Education institutions are increasingly using Elastic to unify disparate student success data sources (LMS, CRM, SIS), positioning the platform as a \"Single Source of Truth\" for administrative decision-making rather than just IT operations | RELEVANCE: account opportunity | CONFIDENCE: 0.80", + "domain": "general", + "confidence": 0.5, + "novel": true, + "hash": "56731543236b9625" + } + ] +} \ No newline at end of file diff --git a/usr/plugins/_kg_dreamer/logs/insights_20260521_023926.json b/usr/plugins/_kg_dreamer/logs/insights_20260521_023926.json new file mode 100644 index 0000000000..011fccb8f6 --- /dev/null +++ b/usr/plugins/_kg_dreamer/logs/insights_20260521_023926.json @@ -0,0 +1,77 @@ +{ + "generated_at": "2026-05-21T02:39:26.940999+00:00", + "dry_run": true, + "insight_count": 10, + "insights": [ + { + "text": "High connectivity between Python skills and LLMs indicates SLED customers are actively building custom RAG pipelines, creating an immediate need for Elastic\u2019s vector search capabilities to handle unstructured data ingestion | RELEVANCE: technology shift | CONFIDENCE: 0.85", + "domain": "general", + "confidence": 0.5, + "novel": true, + "hash": "1aefb5bd13af34f7" + }, + { + "text": "The strong link between GitHub and Elastic Stack suggests a growing trend of \"GitOps\" adoption in public sector IT, where sales should pitch Elastic as the observability layer for CI/CD pipelines managing government infrastructure | RELEVANCE: account opportunity | CONFIDENCE: 0.80", + "domain": "general", + 
"confidence": 0.5, + "novel": true, + "hash": "84e9b6147012455f" + }, + { + "text": "YouTube\u2019s high connection count implies a reliance on video analytics for public safety and education; sales should position Elastic\u2019s multimodal search to index video metadata and transcripts for faster incident response | RELEVANCE: market trend | CONFIDENCE: 0.75", + "domain": "general", + "confidence": 0.5, + "novel": true, + "hash": "5c5f9f5130d6c6d9" + }, + { + "text": "Large Language Model entities are heavily connected to Python, signaling that SLED agencies are moving from experimental AI to production-grade AI applications, requiring Elastic\u2019s security and compliance features to govern AI usage | RELEVANCE: competitive intelligence | CONFIDENCE: 0.82", + "domain": "general", + "confidence": 0.5, + "novel": true, + "hash": "921cfb3c31b06176" + }, + { + "text": "The dominance of Python as both a technology and skill hub reveals that SLED developers prefer open-source, code-first solutions; sales should leverage Elastic\u2019s Python client libraries and developer-friendly APIs to bypass traditional procurement hurdles | RELEVANCE: account opportunity | CONFIDENCE: 0.78", + "domain": "general", + "confidence": 0.5, + "novel": true, + "hash": "4c60e50c5df3cd68" + }, + { + "text": "GitHub connections to Elastic Stack suggest that many SLED agencies are using open-source contributions as a proxy for vendor trust; sales should engage with local university CS departments and public sector GitHub organizations to influence early-stage architecture decisions | RELEVANCE: market trend | CONFIDENCE: 0.70", + "domain": "general", + "confidence": 0.5, + "novel": true, + "hash": "7adbe8489660ebf1" + }, + { + "text": "The intersection of \"Large Language Model\" and \"Elasticsearch\" indicates a shift toward semantic search over keyword search in education portals; sales should target university libraries and student services for modernization projects | RELEVANCE: account 
opportunity | CONFIDENCE: 0.88", + "domain": "general", + "confidence": 0.5, + "novel": true, + "hash": "d5b7693d234bd7bb" + }, + { + "text": "High volume of \"concept\" entities linked to Python and LLMs suggests a knowledge gap in SLED agencies regarding AI governance; sales can create value by offering \"AI Readiness\" assessments that include Elastic\u2019s security and audit logging capabilities | RELEVANCE: competitive intelligence | CONFIDENCE: 0.72", + "domain": "general", + "confidence": 0.5, + "novel": true, + "hash": "5d3e59a253abb511" + }, + { + "text": "The presence of \"event\" entities connected to Elastic Stack implies usage in large-scale public gatherings or emergency response scenarios; sales should highlight real-time log aggregation and anomaly detection for crisis management use cases | RELEVANCE: market trend | CONFIDENCE: 0.65", + "domain": "general", + "confidence": 0.5, + "novel": true, + "hash": "8c697bd2e9cc6759" + }, + { + "text": "With \"person\" entities strongly linked to Elastic and Python, there is a network of technical champions within SLED agencies; sales should identify these key influencers via LinkedIn/GitHub activity to drive bottom-up adoption rather than top-down procurement | RELEVANCE: account opportunity | CONFIDENCE: 0.76", + "domain": "general", + "confidence": 0.5, + "novel": true, + "hash": "6e486d337e11f34f" + } + ] +} \ No newline at end of file diff --git a/usr/plugins/_kg_dreamer/logs/insights_20260521_030144.json b/usr/plugins/_kg_dreamer/logs/insights_20260521_030144.json new file mode 100644 index 0000000000..330073039d --- /dev/null +++ b/usr/plugins/_kg_dreamer/logs/insights_20260521_030144.json @@ -0,0 +1,77 @@ +{ + "generated_at": "2026-05-21T03:01:44.985817+00:00", + "dry_run": true, + "insight_count": 10, + "insights": [ + { + "text": "Leverage the recent StateRAMP addition to target mid-sized state agencies bypassing FedRAMP complexity, positioning Elastic as the compliant alternative to Splunk for 
state-level data sovereignty needs | RELEVANCE: account opportunity | CONFIDENCE: 0.85", + "domain": "general", + "confidence": 0.5, + "novel": true, + "hash": "64d315affb7f018b" + }, + { + "text": "Capitalize on the \"cyber resiliency\" concept emergence by pitching Elastic Security not just for detection, but for rapid recovery and forensic continuity, a key differentiator against legacy SIEMs in the SLED sector | RELEVANCE: market trend | CONFIDENCE: 0.80", + "domain": "general", + "confidence": 0.5, + "novel": true, + "hash": "db759fbe65f60d55" + }, + { + "text": "The strong connection between Texas A&M and Elastic Security indicates a successful higher-ed reference account; use this specific case study to penetrate other public university systems facing similar budget and compliance pressures | RELEVANCE: account opportunity | CONFIDENCE: 0.82", + "domain": "general", + "confidence": 0.5, + "novel": true, + "hash": "c797e17c3d04e6d7" + }, + { + "text": "Utilize the \"ROI Business Case One-Pager\" and \"HTML Templates\" to accelerate procurement cycles in education districts by providing pre-built, branded assets that reduce internal legal and marketing friction | RELEVANCE: account opportunity | CONFIDENCE: 0.75", + "domain": "general", + "confidence": 0.5, + "novel": true, + "hash": "715084363775f025" + }, + { + "text": "The explicit link between Elastic and \"Open Platform\" suggests a strategic wedge against proprietary vendors; position Elastic\u2019s open-source roots as a risk-mitigation strategy for SLED customers fearing vendor lock-in | RELEVANCE: competitive intelligence | CONFIDENCE: 0.78", + "domain": "general", + "confidence": 0.5, + "novel": true, + "hash": "6d95d7fad47ad056" + }, + { + "text": "With \"Large Language Model\" and \"Python\" as top hubs, create targeted demos showing how SLED IT teams can use Python scripts to integrate LLMs with Elastic for automated log summarization, appealing to the technical buyer | RELEVANCE: technology shift 
| CONFIDENCE: 0.88", + "domain": "general", + "confidence": 0.5, + "novel": true, + "hash": "6b7176f48b5fd820" + }, + { + "text": "The \"CJIS Security Policy\" connection is critical for state-level law enforcement accounts; ensure sales collateral explicitly maps Elastic Security features to CJIS requirements to overcome procurement blockers in public safety | RELEVANCE: account opportunity | CONFIDENCE: 0.90", + "domain": "general", + "confidence": 0.5, + "novel": true, + "hash": "b911a3aa9488782b" + }, + { + "text": "The \"Splunk <-> Elastic\" relationship in the graph highlights direct competitive displacement opportunities; train SLED reps to specifically contrast Elastic\u2019s open platform flexibility against Splunk\u2019s licensing rigidity in state budgets | RELEVANCE: competitive intelligence | CONFIDENCE: 0.85", + "domain": "general", + "confidence": 0.5, + "novel": true, + "hash": "3d0393454e9265c9" + }, + { + "text": "The presence of \"brand.elastic.co\" and \"Elastic Brand & Content Guide\" indicates a need for standardized messaging; equip SLED reps with these assets to ensure consistent \"cyber resiliency\" and \"Open Platform\" narratives across diverse government entities | RELEVANCE: market trend | CONFIDENCE: 0.70", + "domain": "general", + "confidence": 0.5, + "novel": true, + "hash": "6379dc00fa68986f" + }, + { + "text": "The high volume of \"GitHub\" connections suggests strong developer adoption in education; target university CS departments for pilot programs using Elastic Stack, creating bottom-up adoption that influences broader IT procurement | RELEVANCE: account opportunity | CONFIDENCE: 0.76", + "domain": "general", + "confidence": 0.5, + "novel": true, + "hash": "d3ee089d35e1df0b" + } + ] +} \ No newline at end of file diff --git a/usr/plugins/_kg_dreamer/operations/__init__.py b/usr/plugins/_kg_dreamer/operations/__init__.py new file mode 100644 index 0000000000..4b644c3f9c --- /dev/null +++ 
b/usr/plugins/_kg_dreamer/operations/__init__.py @@ -0,0 +1,26 @@ +"""KG Dreamer operations module. + +P1 and P2 Dream Operations for the Knowledge Graph Dreamer plugin. + +- ConnectOperation: Create IMPLIED_RELATION edges between co-occurring entities +- PruneOperation: Archive cold-tier entities based on health, age, and usage +- StrengthenOperation: Boost active relationships, decay dormant ones +- ContradictionOperation: Detect conflicting properties across sources +- PatternOperation: Discover unnamed entity clusters, suggest parent concepts +- InsightOperation: Generate proactive observations from graph patterns +""" +from .connector import ConnectOperation +from .contradiction import ContradictionOperation +from .insights import InsightOperation +from .patterns import PatternOperation +from .pruner import PruneOperation +from .strengthener import StrengthenOperation + +__all__ = [ + "ConnectOperation", + "ContradictionOperation", + "InsightOperation", + "PatternOperation", + "PruneOperation", + "StrengthenOperation", +] diff --git a/usr/plugins/_kg_dreamer/operations/connector.py b/usr/plugins/_kg_dreamer/operations/connector.py new file mode 100644 index 0000000000..7f1fe2c23b --- /dev/null +++ b/usr/plugins/_kg_dreamer/operations/connector.py @@ -0,0 +1,230 @@ +"""CONNECT operation for KG Dreamer. + +Finds entities that share documents but have no direct relationship, +creating IMPLIED_RELATION edges between them. +""" +import logging +from typing import Any, Dict, List, Optional + +logger = logging.getLogger(__name__) + + +class ConnectOperation: + """Create implied relationships between co-occurring entities. + + Entities that appear together in multiple documents likely have + a semantic relationship. This operation finds such pairs and + creates IMPLIED_RELATION edges with weights proportional to + co-occurrence frequency. + + Attributes: + kg_client: HTTP client for KG service. + audit_chain: Append-only audit trail for write operations. 
+ config: Operation configuration dict. + """ + + DEFAULT_MIN_SHARED_DOCS: int = 2 + DEFAULT_MAX_CANDIDATES: int = 500 + DEFAULT_RELATIONSHIP_TYPE: str = "IMPLIED_RELATION" + + def __init__( + self, + kg_client: Any, + audit_chain: Any, + config: Optional[Dict[str, Any]] = None, + ) -> None: + """Initialize ConnectOperation. + + Args: + kg_client: KG client with query_cypher() and create_relationship(). + audit_chain: Audit chain with append() method. + config: Optional config overrides for min_shared_docs, + max_candidates, relationship_type. + """ + self.kg = kg_client + self.audit = audit_chain + cfg = config or {} + + self.min_shared_docs = cfg.get( + "min_shared_docs", self.DEFAULT_MIN_SHARED_DOCS + ) + self.max_candidates = cfg.get( + "max_candidates", self.DEFAULT_MAX_CANDIDATES + ) + self.relationship_type = cfg.get( + "relationship_type", self.DEFAULT_RELATIONSHIP_TYPE + ) + + def execute(self, dry_run: bool = True) -> Dict[str, Any]: + """Execute CONNECT operation. + + Finds entity pairs that co-occur in documents and creates + IMPLIED_RELATION relationships. + + Args: + dry_run: If True, report candidates without creating edges. + + Returns: + Dict with candidates_found, connections_made, dry_run flag, + and details of each connection. 
+ """ + candidates = self._find_candidates() + if not candidates: + logger.info("No connection candidates found.") + return { + "candidates_found": 0, + "connections_made": 0, + "dry_run": dry_run, + "details": [], + } + + connections: List[Dict[str, Any]] = [] + connections_made = 0 + + for candidate in candidates[: self.max_candidates]: + detail = self._process_candidate(candidate, dry_run) + connections.append(detail) + if detail.get("created"): + connections_made += 1 + + logger.info( + "CONNECT completed: %d candidates, %d connections (dry_run=%s)", + len(candidates), + connections_made, + dry_run, + ) + + return { + "candidates_found": len(candidates), + "connections_made": connections_made, + "dry_run": dry_run, + "details": connections, + } + + def _find_candidates(self) -> List[Dict[str, Any]]: + """Query KG for entity pairs in same domain with shared categories. + + Finds entities of same type in same domain that share categories + but don't have a direct RELATES_TO relationship. + + Returns: + List of candidate dicts with e1_name, e2_name, type1, + type2, and shared_categories count. 
+ """ + # Find pairs in same domain/type with overlapping categories + cypher = ( + "MATCH (e1:Entity), (e2:Entity) " + "WHERE e1.name < e2.name " + "AND e1.domain = e2.domain " + "AND e1.categories IS NOT NULL " + "AND e2.categories IS NOT NULL " + "WITH e1, e2, " + "split(e1.categories, ',') AS c1, " + "split(e2.categories, ',') AS c2 " + "WITH e1, e2, c1, c2, " + "size([x IN c1 WHERE x IN c2]) AS shared_count " + "WHERE shared_count >= $min_shared " + "AND NOT EXISTS { MATCH (e1)-[:RELATES_TO]-(e2) } " + "RETURN e1.name AS e1_name, e2.name AS e2_name, " + "e1.type AS type1, e2.type AS type2, shared_count, " + "e1.domain AS domain" + ) + + try: + rows = self.kg.query_cypher( + cypher, {"min_shared": self.min_shared_docs} + ) + candidates: List[Dict[str, Any]] = [ + { + "e1_name": r.get("e1_name", ""), + "e2_name": r.get("e2_name", ""), + "type1": r.get("type1", "unknown"), + "type2": r.get("type2", "unknown"), + "shared_docs": int(r.get("shared_count", 0)), + "domain": r.get("domain", "unknown"), + } + for r in rows + if r.get("e1_name") and r.get("e2_name") + ] + logger.debug("Found %d connection candidates", len(candidates)) + return candidates + except Exception as exc: + logger.error("Failed to query candidates: %s", exc) + return [] + + def _process_candidate( + self, candidate: Dict[str, Any], dry_run: bool + ) -> Dict[str, Any]: + """Process a single candidate pair. + + Args: + candidate: Dict with e1_name, e2_name, shared_docs. + dry_run: If True, don't create the relationship. + + Returns: + Dict with connection details and created status. 
+ """ + e1 = candidate.get("e1_name", "") + e2 = candidate.get("e2_name", "") + weight = candidate.get("shared_docs", 0) + + detail = { + "e1": e1, + "e2": e2, + "type1": candidate.get("type1", "unknown"), + "type2": candidate.get("type2", "unknown"), + "weight": weight, + "created": False, + } + + if dry_run: + detail["status"] = "would_create" + self._log_audit(e1, e2, weight, dry_run=True) + return detail + + try: + # Create relationship via KG client + success = self.kg.create_relationship( + source_name=e1, target_name=e2, rel_type=self.relationship_type + ) + if success: + detail["created"] = True + detail["status"] = "created" + self._log_audit(e1, e2, weight, dry_run=False) + else: + detail["status"] = "failed" + logger.warning("Failed to create relationship %s -> %s", e1, e2) + except Exception as exc: + detail["status"] = "error" + detail["error"] = str(exc) + logger.error("Error creating relationship %s -> %s: %s", e1, e2, exc) + + return detail + + def _log_audit( + self, e1: str, e2: str, weight: int, dry_run: bool + ) -> None: + """Log connection to audit chain. + + Args: + e1: Source entity name. + e2: Target entity name. + weight: Relationship weight (shared docs count). + dry_run: Whether this was a dry run. + """ + try: + self.audit.append( + action="add" if not dry_run else "dry_run", + target_type="relationship", + target_id=f"{e1}->{e2}:{self.relationship_type}", + source="kg_dreamer.operations.connector", + metadata={ + "source_entity": e1, + "target_entity": e2, + "relationship_type": self.relationship_type, + "weight": weight, + "dry_run": dry_run, + }, + ) + except Exception as exc: + logger.warning("Audit log failed: %s", exc) diff --git a/usr/plugins/_kg_dreamer/operations/contradiction.py b/usr/plugins/_kg_dreamer/operations/contradiction.py new file mode 100644 index 0000000000..bb7bf9546e --- /dev/null +++ b/usr/plugins/_kg_dreamer/operations/contradiction.py @@ -0,0 +1,508 @@ +"""CONTRADICTION operation for KG Dreamer. 
+ +Detects entities with conflicting property values across multiple +source documents using LLM-based contradiction analysis. +""" +import json +import logging +import re +from typing import Any, Dict, List, Optional, Union + +import requests + +logger = logging.getLogger(__name__) + + +class ContradictionOperation: + """Detect property contradictions across multiple sources. + + Identifies entities that have different values for the same + property across source documents, then uses LLM analysis to + determine if the differences represent genuine contradictions + or semantically equivalent variants. + + Attributes: + kg_client: HTTP client for KG service. + audit_chain: Append-only audit trail for operations. + config: Operation configuration dict. + llm_endpoint: URL for LLM contradiction analysis. + """ + + DEFAULT_PROPERTY_FIELDS: List[str] = ["description", "type", "category"] + DEFAULT_MIN_CONFIDENCE: float = 0.7 + DEFAULT_LLM_TIMEOUT: int = 60 + DEFAULT_LLM_MAX_TOKENS: int = 4096 + DEFAULT_BATCH_SIZE: int = 10 + + CONTRADICTION_PATTERN: str = r"\bCONTRADICTION\b" + CONSISTENT_PATTERN: str = r"\bCONSISTENT\b" + + def __init__( + self, + kg_client: Any, + audit_chain: Any, + config: Optional[Dict[str, Any]] = None, + ) -> None: + """Initialize ContradictionOperation. + + Args: + kg_client: KG client with query_cypher() and get_entity() methods. + audit_chain: Audit chain with append() method. + config: Optional config for property_fields, min_confidence, + llm_endpoint, llm_timeout, llm_max_tokens. 
+ """ + self.kg = kg_client + self.audit = audit_chain + cfg = config or {} + + self.property_fields: List[str] = cfg.get( + "property_fields", self.DEFAULT_PROPERTY_FIELDS + ) + self.min_confidence: float = float( + cfg.get("min_confidence", self.DEFAULT_MIN_CONFIDENCE) + ) + self.llm_endpoint: str = cfg.get( + "llm_endpoint", + "http://192.168.1.250:11435/v1/chat/completions", + ) + self.llm_timeout: int = int( + cfg.get("llm_timeout", self.DEFAULT_LLM_TIMEOUT) + ) + self.llm_max_tokens: int = int( + cfg.get("llm_max_tokens", self.DEFAULT_LLM_MAX_TOKENS) + ) + self.batch_size: int = int( + cfg.get("batch_size", self.DEFAULT_BATCH_SIZE) + ) + + def execute(self, dry_run: bool = True) -> Dict[str, Any]: + """Execute CONTRADICTION operation. + + Searches for entities with multiple sources, analyzes property + consistency using LLM, and reports detected contradictions. + + Args: + dry_run: If True, report findings without side effects. + + Returns: + Dict with checked count, contradictions_found count, + contradictions list, and dry_run flag. 
+ """ + entities = self._get_multi_source_entities() + if not entities: + logger.info("No multi-source entities found.") + return { + "checked": 0, + "contradictions_found": 0, + "contradictions": [], + "dry_run": dry_run, + } + + contradictions: List[Dict[str, Any]] = [] + checked_count = 0 + + for entity in entities: + entity_contradictions = self._check_entity_properties( + entity, dry_run + ) + contradictions.extend(entity_contradictions) + checked_count += 1 + + contradictions_found = len(contradictions) + + logger.info( + "CONTRADICTION completed: %d entities checked, " + "%d contradictions found (dry_run=%s)", + checked_count, + contradictions_found, + dry_run, + ) + + # Log to audit chain even in dry_run to track analysis + if not dry_run: + self._log_audit("contradiction_scan", checked_count, contradictions) + + return { + "checked": checked_count, + "contradictions_found": contradictions_found, + "contradictions": contradictions, + "dry_run": dry_run, + } + + def _get_multi_source_entities(self) -> List[Dict[str, Any]]: + """Query KG for entities with potential contradictions. + + Finds entities with same name but different types or domains, + indicating potential data quality issues. + + Returns: + List of entity dicts with name, type, domain, and variation count. 
+ """ + # Find entities with same name but different types + cypher = ( + "MATCH (e1:Entity), (e2:Entity) " + "WHERE e1.name = e2.name " + "AND e1.id <> e2.id " + "AND (e1.type <> e2.type OR e1.domain <> e2.domain) " + "RETURN DISTINCT e1.name AS name, e1.type AS etype1, " + "e2.type AS etype2, e1.domain AS domain1, e2.domain AS domain2, " + "count(e1) AS occurrence" + ) + + try: + rows = self.kg.query_cypher(cypher, {}) + # Aggregate by entity name + seen: Dict[str, Dict[str, Any]] = {} + for r in rows: + name = r.get("name", "") + if not name: + continue + if name not in seen: + seen[name] = { + "name": name, + "type": r.get("etype1", "unknown"), + "domain": r.get("domain1", "unknown"), + "types": set(), + "domains": set(), + "variation_count": 0, + } + seen[name]["types"].add(r.get("etype1", "unknown")) + seen[name]["types"].add(r.get("etype2", "unknown")) + seen[name]["domains"].add(r.get("domain1", "unknown")) + seen[name]["domains"].add(r.get("domain2", "unknown")) + seen[name]["variation_count"] = len(seen[name]["types"]) + len(seen[name]["domains"]) + + # Convert sets to lists for serialization + entities: List[Dict[str, Any]] = [ + { + "name": v["name"], + "type": v["type"], + "domain": v["domain"], + "types": list(v["types"]), + "domains": list(v["domains"]), + "variation_count": v["variation_count"], + } + for v in seen.values() + if v["variation_count"] >= 2 + ] + + logger.debug("Found %d multi-variant entities", len(entities)) + return entities + except Exception as exc: + logger.error("Failed to query multi-source entities: %s", exc) + return [] + + def _get_entity_property_values( + self, entity_name: str, field: str + ) -> List[Dict[str, Any]]: + """Get property values for an entity. + + Args: + entity_name: The entity to query. + field: The property field to retrieve. + + Returns: + List of dicts with value and entity info. 
+ """ + # Get entity property directly + # Current KG schema doesn't track source documents per property + cypher = ( + f"MATCH (e:Entity {{name: $name}}) " + f"WHERE e.{field} IS NOT NULL " + f"RETURN e.{field} AS value, e.type AS etype, " + f"e.domain AS domain LIMIT 1" + ) + + try: + rows = self.kg.query_cypher(cypher, {"name": entity_name}) + values: List[Dict[str, Any]] = [] + for r in rows: + val = r.get("value") + if val is not None and str(val).strip(): + values.append({ + "value": str(val), + "source": f"type={r.get('etype', 'unknown')}, domain={r.get('domain', 'unknown')}", + }) + return values + except Exception as exc: + logger.error( + "Failed to get property values for %s.%s: %s", + entity_name, field, exc + ) + return [] + + def _check_entity_properties( + self, entity: Dict[str, Any], dry_run: bool + ) -> List[Dict[str, Any]]: + """Check all configured properties for contradictions. + + Args: + entity: Entity dict with name and type. + dry_run: If True, don't create flags. + + Returns: + List of detected contradictions for this entity. 
+ """ + entity_name = entity.get("name", "") + entity_type = entity.get("type", "unknown") + contradictions: List[Dict[str, Any]] = [] + + for field in self.property_fields: + values = self._get_entity_property_values(entity_name, field) + if len(values) < 2: + continue + + # Compare each pair of values + for i in range(len(values)): + for j in range(i + 1, len(values)): + val_a = values[i] + val_b = values[j] + + # Skip identical values + if val_a["value"] == val_b["value"]: + continue + + result = self._analyze_contradiction( + entity_name, entity_type, field, + val_a["value"], val_b["value"] + ) + + if result.get("is_contradiction"): + confidence = result.get("confidence", 0.5) + if confidence >= self.min_confidence: + contradiction = { + "entity": entity_name, + "entity_type": entity_type, + "field": field, + "value_a": val_a["value"], + "value_b": val_b["value"], + "source_a": val_a["doc_id"], + "source_b": val_b["doc_id"], + "source_a_title": val_a["doc_title"], + "source_b_title": val_b["doc_title"], + "confidence": confidence, + "explanation": result.get("explanation", ""), + } + contradictions.append(contradiction) + + if not dry_run: + self._log_audit( + "contradiction_detected", + 1, + [contradiction] + ) + + return contradictions + + def _analyze_contradiction( + self, + entity_name: str, + entity_type: str, + field: str, + value_a: str, + value_b: str, + ) -> Dict[str, Any]: + """Use LLM to determine if two values represent a contradiction. + + Args: + entity_name: The entity being analyzed. + entity_type: Type of the entity. + field: The property field. + value_a: First value. + value_b: Second value. + + Returns: + Dict with is_contradiction (bool), confidence (0-1), + and explanation (str). 
+ """ + prompt = self._build_contradiction_prompt( + entity_name, entity_type, field, value_a, value_b + ) + + try: + response = self._call_llm(prompt) + return self._parse_llm_response(response) + except Exception as exc: + logger.error("LLM analysis failed for %s: %s", entity_name, exc) + return { + "is_contradiction": False, + "confidence": 0.0, + "explanation": f"LLM analysis error: {exc}", + } + + def _build_contradiction_prompt( + self, + entity_name: str, + entity_type: str, + field: str, + value_a: str, + value_b: str, + ) -> str: + """Build LLM prompt for contradiction analysis. + + Args: + entity_name: Entity name. + entity_type: Entity type. + field: Property field. + value_a: First value. + value_b: Second value. + + Returns: + Formatted prompt string. + """ + return ( + f"Analyze whether these two descriptions of '{entity_name}' " + f"(a {entity_type} entity) contradict each other.\n\n" + f"Field: {field}\n" + f"Source A: {value_a}\n" + f"Source B: {value_b}\n\n" + "Do these values contradict each other? Consider semantic " + "meaning, not just wording differences.\n\n" + "Answer format:\n" + "VERDICT: CONTRADICTION or CONSISTENT\n" + "CONFIDENCE: 0.0 to 1.0\n" + "EXPLANATION: Brief explanation of your reasoning\n\n" + "Rules:\n" + "- Use CONTRADICTION only if the values cannot both be true\n" + "- Use CONSISTENT if they describe the same thing differently\n" + "- Confidence should reflect your certainty" + ) + + def _call_llm(self, prompt: str) -> Dict[str, Any]: + """Call LLM endpoint for contradiction analysis. + + Args: + prompt: The analysis prompt. + + Returns: + Raw LLM response dict. + """ + payload = { + "model": "qwen3.6:35b", + "messages": [ + { + "role": "system", + "content": ( + "You are a semantic analysis assistant. " + "Detect when property values contradict. " + "Be precise and objective." 
+ ), + }, + {"role": "user", "content": prompt}, + ], + "max_tokens": self.llm_max_tokens, + "temperature": 0.0, + "stream": False, + } + + response = requests.post( + self.llm_endpoint, + json=payload, + timeout=self.llm_timeout, + ) + response.raise_for_status() + return response.json() + + def _parse_llm_response( + self, response: Dict[str, Any] + ) -> Dict[str, Any]: + """Parse LLM response for contradiction verdict. + + Tries reasoning_content first, then content. + + Args: + response: Raw LLM response dict. + + Returns: + Parsed dict with is_contradiction, confidence, explanation. + """ + choices = response.get("choices", []) + if not choices: + logger.warning("Empty choices in LLM response") + return {"is_contradiction": False, "confidence": 0.0, "explanation": ""} + + message = choices[0].get("message", {}) + + # Try reasoning_content first, fallback to content + text = message.get("reasoning_content", "") or message.get("content", "") + if not text: + logger.warning("Empty LLM message content") + return {"is_contradiction": False, "confidence": 0.0, "explanation": ""} + + text = str(text).strip() + + # Parse verdict + is_contradiction = False + if re.search(self.CONTRADICTION_PATTERN, text, re.IGNORECASE): + is_contradiction = True + elif re.search(self.CONSISTENT_PATTERN, text, re.IGNORECASE): + is_contradiction = False + else: + # Default to no contradiction if unclear + logger.debug("Ambiguous verdict, defaulting to CONSISTENT") + + # Parse confidence + confidence_match = re.search( + r"CONFIDENCE:\s*([0-9.]+)", text, re.IGNORECASE + ) + confidence = 0.5 + if confidence_match: + try: + confidence = float(confidence_match.group(1)) + confidence = max(0.0, min(1.0, confidence)) + except ValueError: + confidence = 0.5 + + # Extract explanation + explanation = "" + expl_match = re.search( + r"EXPLANATION:\s*(.+?)(?=\n\n|\Z)", + text, + re.IGNORECASE | re.DOTALL, + ) + if expl_match: + explanation = expl_match.group(1).strip() + else: + # Fallback: use 
everything after VERDICT line + verdict_end = re.search( + r"VERDICT:\s*\w+", + text, + re.IGNORECASE + ) + if verdict_end: + explanation = text[verdict_end.end():].strip()[:200] + + return { + "is_contradiction": is_contradiction, + "confidence": confidence, + "explanation": explanation, + } + + def _log_audit( + self, + action: str, + count: int, + details: List[Dict[str, Any]], + ) -> None: + """Log contradiction operation to audit chain. + + Args: + action: Action performed (contradiction_scan or contradiction_detected). + count: Number of items. + details: Operation details. + """ + try: + self.audit.append( + action=action, + target_type="entity", + target_id="multi_source_analysis", + source="kg_dreamer.operations.contradiction", + metadata={ + "count": count, + "min_confidence": self.min_confidence, + "fields_checked": self.property_fields, + "details": details[:10], # Limit stored details + }, + ) + except Exception as exc: + logger.warning("Audit log failed: %s", exc) diff --git a/usr/plugins/_kg_dreamer/operations/insights.py b/usr/plugins/_kg_dreamer/operations/insights.py new file mode 100644 index 0000000000..79627f8a5e --- /dev/null +++ b/usr/plugins/_kg_dreamer/operations/insights.py @@ -0,0 +1,392 @@ +"""INSIGHT operation for KG Dreamer. + +Generates proactive observations from graph patterns using LLM analysis. +Creates actionable insights for sales teams based on KG statistics. +""" +import hashlib +import json +import logging +import os +import re +from datetime import datetime, timedelta, timezone +from typing import Any, Dict, List, Optional, Set + +import requests + +logger = logging.getLogger(__name__) + + +class InsightOperation: + """Generate proactive observations from graph patterns using LLM. + + Analyzes KG statistics, recent activity, and contradictions to generate + novel, actionable insights. Tracks insight novelty to avoid repetition. + + Attributes: + kg_client: HTTP client for KG service. 
+ audit_chain: Append-only audit trail for operations. + config: Operation configuration dict. + """ + + DEFAULT_MAX_INSIGHTS: int = 10 + DEFAULT_MIN_NOVELTY: float = 0.6 + DEFAULT_LLM_TIMEOUT: int = 120 + DEFAULT_LLM_MAX_TOKENS: int = 4096 + RECENT_DAYS: int = 7 + STATE_DIR: str = "/a0/usr/plugins/_kg_dreamer/state" + LOG_DIR: str = "/a0/usr/plugins/_kg_dreamer/logs" + + def __init__( + self, kg_client: Any, audit_chain: Any, config: Optional[Dict[str, Any]] = None + ) -> None: + """Initialize InsightOperation. + + Args: + kg_client: KG client with query_cypher() and status() methods. + audit_chain: Audit chain with query(), append(), get_recent(). + config: Optional config for max_insights, min_novelty, llm settings. + """ + self.kg = kg_client + self.audit = audit_chain + cfg = config or {} + self.max_insights = int(cfg.get("max_insights", self.DEFAULT_MAX_INSIGHTS)) + self.min_novelty = float(cfg.get("min_novelty", self.DEFAULT_MIN_NOVELTY)) + self.llm_endpoint = cfg.get( + "llm_endpoint", "http://192.168.1.250:11435/v1/chat/completions" + ) + self.llm_timeout = int(cfg.get("llm_timeout", self.DEFAULT_LLM_TIMEOUT)) + self.llm_max_tokens = int(cfg.get("llm_max_tokens", self.DEFAULT_LLM_MAX_TOKENS)) + self.state_file = os.path.join(self.STATE_DIR, "insights_state.json") + os.makedirs(self.STATE_DIR, exist_ok=True) + os.makedirs(self.LOG_DIR, exist_ok=True) + + def execute(self, dry_run: bool = True) -> Dict[str, Any]: + """Execute INSIGHT operation.""" + stats = self._gather_stats() + if not stats: + return {"insights_generated": 0, "insights": [], "dry_run": dry_run} + raw_insights = self._generate_insights(stats) + if not raw_insights: + return {"insights_generated": 0, "insights": [], "dry_run": dry_run} + filtered = self._filter_by_novelty(raw_insights) + self._save_to_log(filtered, dry_run) + if not dry_run: + self._update_state(filtered) + self._log_audit("insight_generation", len(filtered), stats) + logger.info( + "INSIGHT: %d generated, %d novel 
(dry_run=%s)", + len(raw_insights), len(filtered), dry_run + ) + return {"insights_generated": len(filtered), "insights": filtered, "dry_run": dry_run} + + def _gather_stats(self) -> Optional[Dict[str, Any]]: + """Gather comprehensive statistics from KG and audit chain.""" + try: + return { + "timestamp": datetime.now(timezone.utc).isoformat(), + "recent_entities": self._get_recent_entities(), + "top_hubs": self._get_top_hubs(), + "recent_connections": self._get_recent_connections(), + "contradictions": self._get_contradictions(), + "type_distribution": self._get_type_distribution(), + } + except Exception as exc: + logger.error("Failed to gather stats: %s", exc) + return None + + def _get_recent_entities(self) -> List[Dict[str, Any]]: + """Get recently seen entities from KG. + + Uses last_seen property to find entities recently updated. + """ + try: + cutoff_date = (datetime.now(timezone.utc) - timedelta(days=self.RECENT_DAYS)).isoformat() + cypher = ( + "MATCH (e:Entity) " + "WHERE e.last_seen >= $cutoff " + "AND e.first_seen IS NOT NULL " + "RETURN e.name AS name, e.type AS etype, e.first_seen, e.last_seen, " + "e.mention_count AS mention_count " + "ORDER BY e.last_seen DESC LIMIT 20" + ) + rows = self.kg.query_cypher(cypher, {"cutoff": cutoff_date}) + entities = [ + { + "name": r.get("name", ""), + "type": r.get("etype", "unknown"), + "added_at": r.get("e.first_seen", ""), + "updated_at": r.get("e.last_seen", ""), + "mention_count": int(r.get("mention_count", 0)), + } + for r in rows if r.get("name") + ] + return entities + except Exception as exc: + logger.debug("Failed to get recent entities: %s", exc) + return [] + + def _get_top_hubs(self) -> List[Dict[str, Any]]: + """Query KG for top connected entities (hubs).""" + cypher = ( + "MATCH (e:Entity)-[r]-(other) WITH e, count(r) AS cc " + "ORDER BY cc DESC LIMIT 20 RETURN e.name AS name, e.type AS etype, cc" + ) + try: + rows = self.kg.query_cypher(cypher, {}) + return [ + {"name": r.get("name", ""), "type": 
r.get("etype", "unknown"), "connections": int(r.get("cc", 0))} + for r in rows if r.get("name") + ][:10] + except Exception as exc: + logger.debug("Failed to get top hubs: %s", exc) + return [] + + def _get_recent_connections(self) -> List[Dict[str, Any]]: + """Get recently created RELATES_TO connections from KG. + + Uses created_at property on relationships to find recent connections. + """ + try: + cutoff_date = (datetime.now(timezone.utc) - timedelta(days=self.RECENT_DAYS)).isoformat() + cypher = ( + "MATCH (e1:Entity)-[r:RELATES_TO]->(e2:Entity) " + "WHERE r.created_at >= $cutoff " + "RETURN e1.name AS e1, e2.name AS e2, " + "r.rel_type AS rel_type, r.created_at AS created_at, " + "r.confidence AS confidence " + "ORDER BY r.created_at DESC LIMIT 20" + ) + rows = self.kg.query_cypher(cypher, {"cutoff": cutoff_date}) + connections = [ + { + "e1": r.get("e1", ""), + "e2": r.get("e2", ""), + "rel_type": r.get("rel_type", "RELATES_TO"), + "confidence": float(r.get("confidence", 0.0)), + "created_at": r.get("created_at", ""), + } + for r in rows if r.get("e1") and r.get("e2") + ] + return connections + except Exception as exc: + logger.debug("Failed to get recent connections: %s", exc) + return [] + + def _get_contradictions(self) -> List[Dict[str, Any]]: + """Get count of contradictions from CONTRADICT operation.""" + try: + cutoff = (datetime.now(timezone.utc) - timedelta(days=self.RECENT_DAYS * 2)).timestamp() + entries = self.audit.get_recent(since=cutoff, action_filter="contradiction_scan") + contradictions = [] + for entry in entries: + metadata = entry.get("metadata", {}) + if "contradictions_found" in metadata: + contradictions.append({ + "count": metadata.get("contradictions_found", 0), + "entities_checked": metadata.get("entities_checked", 0), + "timestamp": entry.get("timestamp", ""), + }) + return contradictions[-5:] + except Exception as exc: + logger.debug("Failed to get contradictions: %s", exc) + return [] + + def _get_type_distribution(self) -> 
Dict[str, int]: + """Get entity type distribution from KG.""" + cypher = "MATCH (e:Entity) RETURN e.type AS etype, count(e) AS cnt ORDER BY cnt DESC" + try: + rows = self.kg.query_cypher(cypher, {}) + distribution = {} + for r in rows: + etype = r.get("etype", "unknown") + cnt = int(r.get("cnt", 0)) + if etype and cnt > 0: + distribution[etype] = cnt + return distribution + except Exception as exc: + logger.debug("Failed to get type distribution: %s", exc) + return {} + + def _generate_insights(self, stats: Dict[str, Any]) -> List[Dict[str, Any]]: + """Call LLM to generate insights from graph statistics.""" + try: + response = self._call_llm(self._build_prompt(stats)) + return self._parse_response(response) + except Exception as exc: + logger.error("LLM insight generation failed: %s", exc) + return [] + + def _build_prompt(self, stats: Dict[str, Any]) -> str: + """Build LLM prompt for insight generation.""" + lines = [ + "You are analyzing a Knowledge Graph for a sales team at Elastic (SLED division - State/Local Government and Education).", + f"Generate up to {self.max_insights} novel, actionable insights.", + "Focus on: market trends, competitive intelligence, account opportunities, technology shifts.", + "Each insight should be specific, actionable, and non-obvious.", + ] + recent = stats.get("recent_entities", []) + if recent: + lines.extend(["", "Recent Entity Additions (last 7 days):"] + [f" - {e['name']} ({e['type']})" for e in recent[:10]]) + hubs = stats.get("top_hubs", []) + if hubs: + lines.extend(["", "Top Connected Entities (hubs):"] + [f" - {h['name']} ({h['type']}): {h['connections']} connections" for h in hubs[:8]]) + connections = stats.get("recent_connections", []) + if connections: + lines.extend(["", "Recent Relationship Discoveries:"]) + lines.extend([f" - {c['e1']} <-> {c['e2']}" for c in connections[:8]]) + contradictions = stats.get("contradictions", []) + if contradictions: + total_contra = sum(c.get("count", 0) for c in contradictions) + 
lines.extend(["", f"Data Quality: {total_contra} contradictions detected recently"]) + type_dist = stats.get("type_distribution", {}) + if type_dist: + lines.extend(["", "Entity Type Distribution:"]) + lines.extend([f" - {t}: {c}" for t, c in list(type_dist.items())[:10]]) + lines.extend([ + "", + "Format for each insight (one per line):", + "INSIGHT: | RELEVANCE: | CONFIDENCE: <0.0-1.0>", + "", + "Example:", + "INSIGHT: Three universities recently added observability platforms, suggesting a shift toward modern IT monitoring | RELEVANCE: market trend | CONFIDENCE: 0.75", + ]) + return "\n".join(lines) + + def _call_llm(self, prompt: str) -> Dict[str, Any]: + """Call LLM endpoint for insight generation.""" + payload = { + "model": "qwen3.6:35b", + "messages": [ + {"role": "system", "content": "You are a strategic sales analyst specializing in public sector technology trends. Generate insights that sales teams can act on."}, + {"role": "user", "content": prompt}, + ], + "max_tokens": self.llm_max_tokens, + "temperature": 0.4, + "stream": False, + } + resp = requests.post(self.llm_endpoint, json=payload, timeout=self.llm_timeout) + resp.raise_for_status() + return resp.json() + + def _parse_response(self, response: Dict[str, Any]) -> List[Dict[str, Any]]: + """Parse LLM response for insights. 
Tries reasoning_content first.""" + choices = response.get("choices", []) + if not choices: + logger.warning("Empty choices in LLM response") + return [] + message = choices[0].get("message", {}) + text = message.get("reasoning_content", "") or message.get("content", "") + if not text: + logger.warning("Empty LLM message content") + return [] + text = str(text).strip() + insights = [] + pattern = r"INSIGHT:\s*(.+?)\s*\|\s*RELEVANCE:\s*(\w+)\s*\|\s*CONFIDENCE:\s*([0-9.]+)" + for match in re.findall(pattern, text, re.IGNORECASE)[: self.max_insights]: + try: + insight_text = match[0].strip() + domain = match[1].strip().lower() + confidence = max(0.0, min(1.0, float(match[2]))) + if insight_text and len(insight_text) > 10: + insights.append({ + "text": insight_text[:500], + "domain": domain, + "confidence": confidence, + "novel": True, + "hash": self._compute_hash(insight_text), + }) + except (ValueError, IndexError) as exc: + logger.debug("Failed to parse insight: %s", exc) + if not insights: + for line in text.split("\n"): + line = line.strip() + if line.lower().startswith("insight:"): + text_part = line[8:].strip() + if text_part and len(text_part) > 20: + insights.append({ + "text": text_part[:500], "domain": "general", + "confidence": 0.5, "novel": True, + "hash": self._compute_hash(text_part), + }) + return insights[: self.max_insights] + + def _compute_hash(self, text: str) -> str: + """Compute hash for insight text to track novelty.""" + normalized = text.lower().strip().replace(" ", "") + return hashlib.md5(normalized.encode()).hexdigest()[:16] + + def _filter_by_novelty(self, insights: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Filter insights by novelty against previously generated.""" + previous = self._load_previous_hashes() + filtered = [] + for insight in insights: + h = insight.get("hash", "") + if h and h not in previous: + insight["novel"] = True + filtered.append(insight) + else: + insight["novel"] = False + if insight.get("confidence", 0) 
>= self.min_novelty: + filtered.append(insight) + return filtered + + def _load_previous_hashes(self) -> Set[str]: + """Load hashes of previously generated insights.""" + try: + if os.path.exists(self.state_file): + with open(self.state_file, "r") as f: + state = json.load(f) + return set(state.get("insight_hashes", [])) + except (json.JSONDecodeError, OSError) as exc: + logger.debug("Failed to load state: %s", exc) + return set() + + def _update_state(self, insights: List[Dict[str, Any]]) -> None: + """Update state file with new insight hashes.""" + try: + previous = self._load_previous_hashes() + new_hashes = {i["hash"] for i in insights if i.get("hash") and i.get("novel")} + all_hashes = previous | new_hashes + state = { + "last_updated": datetime.now(timezone.utc).isoformat(), + "insight_hashes": list(all_hashes)[-100:], + "total_insights_generated": len(all_hashes), + } + with open(self.state_file, "w") as f: + json.dump(state, f, indent=2) + except OSError as exc: + logger.warning("Failed to update state: %s", exc) + + def _save_to_log(self, insights: List[Dict[str, Any]], dry_run: bool) -> None: + """Save insights to log file.""" + try: + timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S") + filepath = os.path.join(self.LOG_DIR, f"insights_{timestamp}.json") + with open(filepath, "w") as f: + json.dump({ + "generated_at": datetime.now(timezone.utc).isoformat(), + "dry_run": dry_run, + "insight_count": len(insights), + "insights": insights, + }, f, indent=2) + logger.debug("Saved %d insights to %s", len(insights), filepath) + except OSError as exc: + logger.warning("Failed to save insights: %s", exc) + + def _log_audit(self, action: str, count: int, stats: Dict[str, Any]) -> None: + """Log insight operation to audit chain.""" + try: + self.audit.append( + action=action, + target_type="knowledge_graph", + target_id="insight_analysis", + source="kg_dreamer.operations.insights", + metadata={ + "insight_count": count, + "recent_entities": 
len(stats.get("recent_entities", [])), + "recent_hubs": len(stats.get("top_hubs", [])), + "recent_connections": len(stats.get("recent_connections", [])), + }, + ) + except Exception as exc: + logger.warning("Audit log failed: %s", exc) diff --git a/usr/plugins/_kg_dreamer/operations/patterns.py b/usr/plugins/_kg_dreamer/operations/patterns.py new file mode 100644 index 0000000000..5b35aa5deb --- /dev/null +++ b/usr/plugins/_kg_dreamer/operations/patterns.py @@ -0,0 +1,224 @@ +"""PATTERN operation for KG Dreamer. + +Discovers unnamed entity clusters and suggests parent concepts using +community detection and LLM-assisted concept generation. +""" +import logging +import re +from typing import Any, Dict, List, Optional + +import requests + +logger = logging.getLogger(__name__) + + +class PatternOperation: + """Discover unnamed entity clusters and suggest parent concepts. + + Analyzes the knowledge graph to find groups of entities that are +densely connected within communities but lack a unifying parent concept. +Uses LLM to suggest appropriate parent concepts and categories. + + Attributes: + kg_client: HTTP client for KG service. + audit_chain: Append-only audit trail for write operations. + config: Operation configuration dict. + """ + + DEFAULT_MIN_CLUSTER_SIZE: int = 3 + DEFAULT_MAX_CLUSTERS: int = 50 + DEFAULT_LLM_TIMEOUT: int = 120 + DEFAULT_LLM_MAX_TOKENS: int = 4096 + + def __init__( + self, kg_client: Any, audit_chain: Any, config: Optional[Dict[str, Any]] = None + ) -> None: + """Initialize PatternOperation. + + Args: + kg_client: KG client with query_cypher(), create_entity(), link_entity(). + audit_chain: Audit chain with append() method. + config: Optional config for min_cluster_size, max_clusters, llm settings. 
+ """ + self.kg = kg_client + self.audit = audit_chain + cfg = config or {} + self.min_cluster_size = int(cfg.get("min_cluster_size", self.DEFAULT_MIN_CLUSTER_SIZE)) + self.max_clusters = int(cfg.get("max_clusters", self.DEFAULT_MAX_CLUSTERS)) + self.llm_endpoint = cfg.get( + "llm_endpoint", "http://192.168.1.250:11435/v1/chat/completions" + ) + self.llm_timeout = int(cfg.get("llm_timeout", self.DEFAULT_LLM_TIMEOUT)) + self.llm_max_tokens = int(cfg.get("llm_max_tokens", self.DEFAULT_LLM_MAX_TOKENS)) + + def execute(self, dry_run: bool = True) -> Dict[str, Any]: + """Execute PATTERN operation.""" + clusters = self._find_clusters() + if not clusters: + logger.info("No entity clusters found for pattern analysis.") + return {"clusters_analyzed": 0, "patterns_found": 0, "patterns": [], "dry_run": dry_run} + patterns: List[Dict[str, Any]] = [] + patterns_found = 0 + for cluster in clusters[: self.max_clusters]: + pattern = self._analyze_cluster(cluster, dry_run) + if pattern: + patterns.append(pattern) + if pattern.get("suggested_parent"): + patterns_found += 1 + logger.info( + "PATTERN: %d clusters analyzed, %d patterns found (dry_run=%s)", + len(clusters), patterns_found, dry_run + ) + if not dry_run and patterns_found > 0: + self._log_audit("pattern_discovery", len(clusters), patterns) + return {"clusters_analyzed": len(clusters), "patterns_found": patterns_found, "patterns": patterns, "dry_run": dry_run} + + def _find_clusters(self) -> List[Dict[str, Any]]: + """Query KG for groups of densely connected entities.""" + cypher = "MATCH (e:Entity) RETURN e.name AS name, e.type AS etype, labels(e) AS labels" + try: + rows = self.kg.query_cypher(cypher, {}) + by_type: Dict[str, List[Dict[str, Any]]] = {} + for r in rows: + etype = r.get("etype", "unknown") + if etype not in by_type: + by_type[etype] = [] + by_type[etype].append({"name": r.get("name", ""), "type": etype, "labels": r.get("labels", [])}) + clusters: List[Dict[str, Any]] = [] + for etype, entities in 
by_type.items(): + if len(entities) >= self.min_cluster_size: + if not self._has_parent(entities): + clusters.append({"entities": entities, "type": etype, "size": len(entities)}) + logger.debug("Found %d unnamed entity clusters", len(clusters)) + return clusters + except Exception as exc: + logger.error("Failed to query entity clusters: %s", exc) + return [] + + def _has_parent(self, entities: List[Dict[str, Any]]) -> bool: + """Check if entities already share a common parent concept. + + Note: Current KG schema does not support Concept nodes or + BELONGS_TO relationships. This method is currently a no-op + to allow pattern discovery to proceed. + """ + # Concept nodes and BELONGS_TO not supported in current KG schema + # Return False to allow pattern analysis of all entities + return False + + def _analyze_cluster(self, cluster: Dict[str, Any], dry_run: bool) -> Optional[Dict[str, Any]]: + """Analyze a cluster and suggest parent concept via LLM.""" + entities = cluster.get("entities", []) + if not entities: + return None + names = [e.get("name", "") for e in entities if e.get("name")] + types = list(set(e.get("type", "unknown") for e in entities)) + suggestion = self._suggest_parent(names, types) + if not suggestion: + return None + pattern = { + "entities": names[:20], "suggested_parent": suggestion.get("concept_name", ""), + "category": suggestion.get("category", "unknown"), + "confidence": suggestion.get("confidence", 0.0), "reasoning": suggestion.get("reasoning", ""), + } + if not dry_run and suggestion.get("concept_name"): + pattern["created"] = self._create_parent( + suggestion["concept_name"], suggestion.get("category", "concept"), names, suggestion.get("reasoning", "") + ) + return pattern + + def _suggest_parent(self, names: List[str], types: List[str]) -> Optional[Dict[str, Any]]: + """Call LLM to suggest parent concept name and category.""" + names_str = ", ".join(names[:15]) + types_str = ", ".join(types) + prompt = ( + f"Given these related entities, 
suggest a parent concept name and category.\n\n" + f"Entities: {names_str}\nTypes: {types_str}\n\n" + "Format: CONCEPT_NAME | CATEGORY | CONFIDENCE (0-1) | REASONING\n\n" + "Example:\n" + "Cloud Security Solutions | concept | 0.85 | These entities all relate to security products in cloud environments" + ) + try: + response = self._call_llm(prompt) + return self._parse_response(response) + except Exception as exc: + logger.error("LLM suggestion failed: %s", exc) + return None + + def _call_llm(self, prompt: str) -> Dict[str, Any]: + """Call LLM endpoint for concept suggestion.""" + payload = { + "model": "qwen3.6:35b", + "messages": [ + { + "role": "system", + "content": "You are a knowledge graph analyst specializing in taxonomy and concept discovery. Suggest clear, concise parent concepts.", + }, + {"role": "user", "content": prompt}, + ], + "max_tokens": self.llm_max_tokens, + "temperature": 0.3, + "stream": False, + } + response = requests.post(self.llm_endpoint, json=payload, timeout=self.llm_timeout) + response.raise_for_status() + return response.json() + + def _parse_response(self, response: Dict[str, Any]) -> Optional[Dict[str, Any]]: + """Parse LLM response for concept suggestion. 
Tries reasoning_content first.""" + choices = response.get("choices", []) + if not choices: + logger.warning("Empty choices in LLM response") + return None + message = choices[0].get("message", {}) + text = message.get("reasoning_content", "") or message.get("content", "") + if not text: + logger.warning("Empty LLM message content") + return None + text = str(text).strip() + match = re.search(r"^([^|]+)\s*\|\s*([^|]+)\s*\|\s*([0-9.]+)\s*\|\s*(.+)$", text, re.MULTILINE | re.DOTALL) + if match: + try: + return { + "concept_name": match.group(1).strip(), "category": match.group(2).strip(), + "confidence": max(0.0, min(1.0, float(match.group(3).strip()))), + "reasoning": match.group(4).strip()[:500], + } + except (ValueError, IndexError) as exc: + logger.debug("Failed to parse concept pattern: %s", exc) + lines = [line.strip() for line in text.split("\n") if line.strip()] + if lines: + return {"concept_name": lines[0][:100], "category": "suggested_concept", "confidence": 0.5, "reasoning": text[:500]} + return None + + def _create_parent(self, name: str, category: str, children: List[str], reasoning: str) -> bool: + """Create parent concept entity and link children.""" + try: + props = { + "name": name, "type": category, "source": "kg_dreamer_patterns", + "description": reasoning[:500], "auto_generated": True, + } + if not self.kg.create_entity(props): + logger.warning("Failed to create concept: %s", name) + return False + for child in children: + self.kg.link_entity(source_name=name, target_name=child, rel_type="CONTAINS", properties={"auto_generated": True}) + logger.debug("Created parent '%s' with %d children", name, len(children)) + return True + except Exception as exc: + logger.error("Failed to create parent concept: %s", exc) + return False + + def _log_audit(self, action: str, count: int, patterns: List[Dict[str, Any]]) -> None: + """Log pattern operation to audit chain.""" + try: + self.audit.append( + action=action, target_type="knowledge_graph", 
target_id="pattern_analysis", + source="kg_dreamer.operations.patterns", + metadata={ + "cluster_count": count, "patterns_found": len(patterns), + "min_cluster_size": self.min_cluster_size, "patterns": patterns[:5], + }, + ) + except Exception as exc: + logger.warning("Audit log failed: %s", exc) diff --git a/usr/plugins/_kg_dreamer/operations/pruner.py b/usr/plugins/_kg_dreamer/operations/pruner.py new file mode 100644 index 0000000000..26480c670f --- /dev/null +++ b/usr/plugins/_kg_dreamer/operations/pruner.py @@ -0,0 +1,361 @@ +"""PRUNE operation for KG Dreamer. + +Archives cold-tier entities that are old, low health, and never queried. +Uses health scoring and audit chain analysis to identify stale entities. +""" +import logging +from datetime import datetime, timezone, timedelta +from typing import Any, Dict, List, Optional + +logger = logging.getLogger(__name__) + + +class PruneOperation: + """Archive cold-tier entities based on health, age, and usage. + + Identifies entities that are: + - Low health score (below threshold) + - Old (not updated within min_age_days) + - Rarely or never queried (below max_queries threshold) + + Attributes: + kg_client: HTTP client for KG service. + audit_chain: Append-only audit trail for write operations. + health_scorer: HealthScorer instance for entity scoring. + config: Operation configuration dict. + """ + + DEFAULT_MIN_AGE_DAYS: int = 180 + DEFAULT_MAX_HEALTH_SCORE: float = 0.1 + DEFAULT_MAX_QUERIES: int = 0 + DEFAULT_BATCH_SIZE: int = 100 + + def __init__( + self, + kg_client: Any, + audit_chain: Any, + health_scorer: Any = None, + config: Optional[Dict[str, Any]] = None, + ) -> None: + """Initialize PruneOperation. + + Args: + kg_client: KG client with query_cypher() methods. + audit_chain: Audit chain with query() and append() methods. + health_scorer: Optional health scorer (kept for backward compat, not used). + config: Optional config overrides for thresholds. 
+ """ + self.kg = kg_client + self.audit = audit_chain + self.health_scorer = health_scorer + cfg = config or {} + + self.min_age_days = cfg.get( + "min_age_days", self.DEFAULT_MIN_AGE_DAYS + ) + self.max_health_score = cfg.get( + "max_health_score", self.DEFAULT_MAX_HEALTH_SCORE + ) + self.max_queries = cfg.get( + "max_queries", self.DEFAULT_MAX_QUERIES + ) + self.batch_size = cfg.get( + "batch_size", self.DEFAULT_BATCH_SIZE + ) + + def execute(self, dry_run: bool = True) -> Dict[str, Any]: + """Execute PRUNE operation. + + Archives cold-tier entities based on health, age, and usage. + + Args: + dry_run: If True, report candidates without deleting. + + Returns: + Dict with candidates_found, entities_pruned, dry_run flag, + and list of pruned entities. + """ + candidates = self._find_prune_candidates() + if not candidates: + logger.info("No prune candidates found.") + return { + "candidates_found": 0, + "entities_pruned": 0, + "dry_run": dry_run, + "pruned_entities": [], + } + + pruned: List[Dict[str, Any]] = [] + entities_pruned = 0 + + for candidate in candidates[: self.batch_size]: + result = self._process_prune_candidate(candidate, dry_run) + pruned.append(result) + if result.get("pruned"): + entities_pruned += 1 + + logger.info( + "PRUNE completed: %d candidates, %d pruned (dry_run=%s)", + len(candidates), + entities_pruned, + dry_run, + ) + + return { + "candidates_found": len(candidates), + "entities_pruned": entities_pruned, + "dry_run": dry_run, + "pruned_entities": pruned, + } + + def _find_prune_candidates(self) -> List[Dict[str, Any]]: + """Find entities eligible for pruning based on KG properties. + + Finds entities with: + - Low mention count (<= threshold) + - Old last_seen (> min_age_days ago) + - No outgoing/incoming relationships (orphaned entities) + + Returns: + List of candidate dicts with name, type, mention_count, + last_seen, and age_days. 
+ """ + try: + # Query entities with low mention count and old last_seen + cutoff_date = (datetime.now(timezone.utc) - timedelta(days=self.min_age_days)).isoformat() + + cypher = ( + "MATCH (e:Entity) " + "WHERE e.mention_count IS NOT NULL " + "AND e.mention_count <= $min_mentions " + "AND e.last_seen < $cutoff " + "AND NOT EXISTS { MATCH (e)-[:RELATES_TO]-() } " + "RETURN e.name AS name, e.type AS etype, " + "e.mention_count AS mention_count, e.last_seen AS last_seen " + "ORDER BY e.last_seen ASC " + "LIMIT $max_results" + ) + + rows = self.kg.query_cypher(cypher, { + "min_mentions": 1, + "cutoff": cutoff_date, + "max_results": self.batch_size * 2 + }) + + candidates: List[Dict[str, Any]] = [] + now = datetime.now(timezone.utc) + + for r in rows: + name = r.get("name", "") + ent_type = r.get("etype", "unknown") + mention_count = int(r.get("mention_count", 0)) + last_seen = r.get("last_seen", "") + + age_days = self._calculate_age_days(now, last_seen) + if age_days is None: + continue + + candidates.append({ + "name": name, + "type": ent_type, + "health_score": 1.0 / max(mention_count, 1), # Inverse of mention count + "tier": "cold", + "last_seen": last_seen, + "age_days": age_days, + "query_count": 0, + "mention_count": mention_count, + }) + + # Sort by age (oldest first) then by mention count + candidates.sort(key=lambda x: (x.get("age_days", 0), -x.get("mention_count", 0)), reverse=True) + + logger.debug("Found %d prune candidates", len(candidates)) + return candidates + + except Exception as exc: + logger.error("Failed to find prune candidates: %s", exc) + return [] + + def _calculate_age_days( + self, now: datetime, last_seen: Optional[str] + ) -> Optional[int]: + """Calculate days since last update. + + Args: + now: Current UTC datetime. + last_seen: ISO timestamp string or None. + + Returns: + Days since last seen, or None if unparseable. 
+ """ + if not last_seen: + return None + try: + dt = datetime.fromisoformat(last_seen.replace("Z", "+00:00")) + if dt.tzinfo is None: + dt = dt.replace(tzinfo=timezone.utc) + return (now - dt).days + except (ValueError, TypeError) as exc: + logger.debug("Cannot parse last_seen '%s': %s", last_seen, exc) + return None + + def _count_entity_queries(self, entity_name: str) -> int: + """Count how many times entity was queried recently. + + Args: + entity_name: Name of the entity. + + Returns: + Count of audit events referencing this entity. + """ + try: + cutoff = ( + datetime.now(timezone.utc) - timedelta(days=self.min_age_days) + ).isoformat() + events = self.audit.query( + since=cutoff, + limit=1000, + ) + count = sum( + 1 + for e in events + if entity_name in str(e.get("metadata", {})) + or entity_name in e.get("target_id", "") + ) + return count + except Exception as exc: + logger.debug("Failed to query audit for %s: %s", entity_name, exc) + return 0 + + def _process_prune_candidate( + self, candidate: Dict[str, Any], dry_run: bool + ) -> Dict[str, Any]: + """Process a single prune candidate. + + Args: + candidate: Dict with name, type, health_score, etc. + dry_run: If True, don't delete the entity. + + Returns: + Dict with prune result details. 
+ """ + name = candidate.get("name", "") + result = { + "name": name, + "type": candidate.get("type", "unknown"), + "health_score": candidate.get("health_score", 0.0), + "tier": candidate.get("tier", "unknown"), + "age_days": candidate.get("age_days"), + "query_count": candidate.get("query_count", 0), + "pruned": False, + "archived": False, + } + + if dry_run: + result["status"] = "would_prune" + self._log_audit(name, candidate, dry_run=True) + return result + + try: + # Archive first + archive_ok = self._archive_entity(name, candidate) + result["archived"] = archive_ok + + # Delete entity with DETACH DELETE for KuzuDB + delete_ok = self._delete_entity(name) + + if archive_ok and delete_ok: + result["pruned"] = True + result["status"] = "pruned" + self._log_audit(name, candidate, dry_run=False) + elif not delete_ok: + result["status"] = "failed_delete" + logger.warning("Failed to delete entity %s", name) + else: + result["status"] = "failed_archive" + logger.warning("Failed to archive entity %s", name) + + except Exception as exc: + result["status"] = "error" + result["error"] = str(exc) + logger.error("Error pruning entity %s: %s", name, exc) + + return result + + def _archive_entity( + self, name: str, candidate: Dict[str, Any] + ) -> bool: + """Archive entity data before deletion. + + Args: + name: Entity name. + candidate: Entity data dict. + + Returns: + True if archival succeeded. 
+ """ + try: + # Get full entity details from KG + query = ( + "MATCH (e:Entity {name: $name}) " + "RETURN e, e.name, e.type, e.domain, e.categories, " + "e.confidence, e.mention_count, e.first_seen, e.last_seen" + ) + rows = self.kg.query_cypher(query, {"name": name}) + + if rows: + candidate["entity_data"] = rows[0] + + # Archive data is stored in candidate for potential recovery + candidate["archived_at"] = datetime.now(timezone.utc).isoformat() + return True + + except Exception as exc: + logger.warning("Failed to archive entity %s: %s", name, exc) + return False + + def _delete_entity(self, name: str) -> bool: + """Delete entity from KG using DETACH DELETE. + + Args: + name: Entity name. + + Returns: + True if deletion succeeded. + """ + try: + # KuzuDB requires DETACH DELETE to remove entity and its relationships + query = "MATCH (e:Entity {name: $name}) DETACH DELETE e" + self.kg.query_cypher(query, {"name": name}) + return True + except Exception as exc: + logger.error("Failed to delete entity %s: %s", name, exc) + return False + + def _log_audit( + self, name: str, candidate: Dict[str, Any], dry_run: bool + ) -> None: + """Log prune operation to audit chain. + + Args: + name: Entity name. + candidate: Entity data dict. + dry_run: Whether this was a dry run. 
+ """ + try: + self.audit.append( + action="delete" if not dry_run else "dry_run", + target_type="entity", + target_id=name, + source="kg_dreamer.operations.pruner", + metadata={ + "entity_name": name, + "entity_type": candidate.get("type", "unknown"), + "health_score": candidate.get("health_score", 0.0), + "age_days": candidate.get("age_days"), + "query_count": candidate.get("query_count", 0), + "dry_run": dry_run, + }, + ) + except Exception as exc: + logger.warning("Audit log failed: %s", exc) diff --git a/usr/plugins/_kg_dreamer/operations/strengthener.py b/usr/plugins/_kg_dreamer/operations/strengthener.py new file mode 100644 index 0000000000..2853207759 --- /dev/null +++ b/usr/plugins/_kg_dreamer/operations/strengthener.py @@ -0,0 +1,421 @@ +"""STRENGTHEN operation for KG Dreamer. + +Boosts weights of frequently-accessed relationships and decays +weights of unused pathways based on audit chain activity tracking. +""" +import logging +import time +from typing import Any, Dict, List, Optional + +logger = logging.getLogger(__name__) + + +class StrengthenOperation: + """Boost active relationships, decay dormant ones. + + Analyzes audit chain query history to identify recently-accessed + entities. Relationships between frequently-accessed entities receive + weight boosts, while unused relationships experience weight decay. + + Attributes: + kg_client: HTTP client for KG service. + audit_chain: Append-only audit trail for write operations. + config: Operation configuration dict. + """ + + DEFAULT_BOOST_FACTOR: float = 1.2 + DEFAULT_DECAY_FACTOR: float = 0.95 + DEFAULT_MIN_QUERIES: int = 3 + DEFAULT_MAX_WEIGHT: float = 10.0 + DEFAULT_MIN_WEIGHT: float = 0.1 + RECENT_DAYS: int = 30 + DORMANT_DAYS: int = 90 + SECONDS_PER_DAY: int = 86400 + + def __init__( + self, + kg_client: Any, + audit_chain: Any, + config: Optional[Dict[str, Any]] = None, + ) -> None: + """Initialize StrengthenOperation. 
+ + Args: + kg_client: KG client with query_cypher() and update_relationship(). + audit_chain: Audit chain with append() and get_recent(). + config: Optional config overrides for boost_factor, decay_factor. + """ + self.kg = kg_client + self.audit = audit_chain + cfg = config or {} + + self.boost_factor = float( + cfg.get("boost_factor", self.DEFAULT_BOOST_FACTOR) + ) + self.decay_factor = float( + cfg.get("decay_factor", self.DEFAULT_DECAY_FACTOR) + ) + self.min_queries = int( + cfg.get("min_queries", self.DEFAULT_MIN_QUERIES) + ) + self.max_weight = float( + cfg.get("max_weight", self.DEFAULT_MAX_WEIGHT) + ) + self.min_weight = float( + cfg.get("min_weight", self.DEFAULT_MIN_WEIGHT) + ) + + def execute(self, dry_run: bool = True) -> Dict[str, Any]: + """Execute STRENGTHEN operation. + + Boosts weights of relationships between recently-accessed + entities and decays weights of dormant relationships. + + Args: + dry_run: If True, report changes without updating. + + Returns: + Dict with boosted, decayed, total_updated counts, + and details of each change. 
+ """ + relationships = self._get_relationships_with_weights() + if not relationships: + logger.info("No weighted relationships found.") + return { + "boosted": 0, + "decayed": 0, + "total_updated": 0, + "dry_run": dry_run, + "details": [], + } + + recent_entities = self._get_recently_accessed_entities() + dormant_entities = self._get_dormant_entities() + + boosted_count = 0 + decayed_count = 0 + details: List[Dict[str, Any]] = [] + + for rel in relationships: + result = self._process_relationship( + rel, recent_entities, dormant_entities, dry_run + ) + details.append(result) + + if result.get("action") == "boosted": + boosted_count += 1 + elif result.get("action") == "decayed": + decayed_count += 1 + + total_updated = boosted_count + decayed_count + + logger.info( + "STRENGTHEN completed: %d boosted, %d decayed, " + "total %d (dry_run=%s)", + boosted_count, + decayed_count, + total_updated, + dry_run, + ) + + return { + "boosted": boosted_count, + "decayed": decayed_count, + "total_updated": total_updated, + "dry_run": dry_run, + "details": details, + } + + def _get_relationships_with_weights(self) -> List[Dict[str, Any]]: + """Query KG for all relationships with weight property. + + Note: Current KG schema does not support weighted relationships. + RELATES_TO edges only have rel_type, confidence, doc_id, created_at. + This operation requires schema update to add 'weight' property. + + Returns: + Empty list - weighted relationships not supported in current schema. + """ + # Check if KG supports weighted relationships + cypher = ( + "MATCH (e1:Entity)-[r:RELATES_TO]->(e2:Entity) " + "WHERE r.weight IS NOT NULL " + "RETURN e1.name AS source_name, e2.name AS target_name, " + "r.rel_type AS rel_type, r.weight AS weight " + "LIMIT 1" + ) + + try: + rows = self.kg.query_cypher(cypher, {}) + if not rows: + logger.warning( + "STRENGTHEN: KG schema does not support weighted relationships. " + "Please update KG schema to add 'weight' property to RELATES_TO edges." 
+ ) + return [] # Weighted relationships not supported + + # If we get here, weights exist - fetch all + cypher = ( + "MATCH (e1:Entity)-[r:RELATES_TO]->(e2:Entity) " + "WHERE r.weight IS NOT NULL " + "RETURN e1.name AS source_name, e2.name AS target_name, " + "r.rel_type AS rel_type, r.weight AS weight" + ) + rows = self.kg.query_cypher(cypher, {}) + relationships: List[Dict[str, Any]] = [ + { + "source_name": r.get("source_name", ""), + "target_name": r.get("target_name", ""), + "rel_type": r.get("rel_type", "RELATES_TO"), + "weight": float(r.get("weight", 1.0)), + } + for r in rows + if r.get("source_name") and r.get("target_name") + ] + logger.debug("Found %d weighted relationships", len(relationships)) + return relationships + except Exception as exc: + logger.error("Failed to query relationships: %s", exc) + return [] + + def _get_recently_accessed_entities(self) -> Dict[str, int]: + """Read audit chain for entities accessed in last 30 days. + + Returns: + Dict mapping entity names to access counts. 
+ """ + cutoff_time = time.time() - ( + self.RECENT_DAYS * self.SECONDS_PER_DAY + ) + + try: + recent_entries = self.audit.get_recent( + since=cutoff_time, action_filter="query" + ) + + entity_counts: Dict[str, int] = {} + for entry in recent_entries: + metadata = entry.get("metadata", {}) + entities = metadata.get("entities", []) + target_id = entry.get("target_id", "") + + if isinstance(entities, list): + for entity in entities: + if entity and isinstance(entity, str): + entity_counts[entity] = ( + entity_counts.get(entity, 0) + 1 + ) + + if target_id and isinstance(target_id, str): + entity_counts[target_id] = ( + entity_counts.get(target_id, 0) + 1 + ) + + # Filter to entities with minimum queries + filtered = { + k: v for k, v in entity_counts.items() + if v >= self.min_queries + } + logger.debug("Found %d recently-accessed entities", len(filtered)) + return filtered + except Exception as exc: + logger.error("Failed to get recent entities: %s", exc) + return {} + + def _get_dormant_entities(self) -> List[str]: + """Identify entities not queried in last 90 days. + + Returns: + List of dormant entity names. 
+ """ + dormant_cutoff = time.time() - ( + self.DORMANT_DAYS * self.SECONDS_PER_DAY + ) + recent_cutoff = time.time() - ( + self.RECENT_DAYS * self.SECONDS_PER_DAY + ) + + try: + all_entities_query = ( + "MATCH (e:Entity) RETURN e.name AS name WHERE e.name IS NOT NULL" + ) + all_rows = self.kg.query_cypher(all_entities_query, {}) + all_entities = { + r.get("name") + for r in all_rows if r.get("name") + } + + recent_entries = self.audit.get_recent( + since=recent_cutoff, action_filter="query" + ) + recently_queried: set = set() + for entry in recent_entries: + metadata = entry.get("metadata", {}) + entities = metadata.get("entities", []) + if isinstance(entities, list): + recently_queried.update( + e for e in entities if isinstance(e, str) + ) + target_id = entry.get("target_id", "") + if isinstance(target_id, str): + recently_queried.add(target_id) + + # Entirely absent from recent queries + dormant = list(all_entities - recently_queried) + logger.debug("Found %d dormant entities", len(dormant)) + return dormant + except Exception as exc: + logger.error("Failed to get dormant entities: %s", exc) + return [] + + def _process_relationship( + self, + rel: Dict[str, Any], + recent_entities: Dict[str, int], + dormant_entities: List[str], + dry_run: bool, + ) -> Dict[str, Any]: + """Process a single relationship for boost/decay. + + Args: + rel: Relationship dict with source_name, target_name, weight. + recent_entities: Map of recently-accessed entity names. + dormant_entities: List of dormant entity names. + dry_run: If True, don't apply changes. + + Returns: + Dict with relationship details and action taken. 
+ """ + source = rel.get("source_name", "") + target = rel.get("target_name", "") + current_weight = rel.get("weight", 1.0) + rel_type = rel.get("rel_type", "RELATED") + rel_id = rel.get("rel_id") + + detail = { + "source": source, + "target": target, + "rel_type": rel_type, + "old_weight": current_weight, + "new_weight": current_weight, + "action": "none", + } + + source_recent = source in recent_entities + target_recent = target in recent_entities + source_dormant = source in dormant_entities + target_dormant = target in dormant_entities + + # Both entities recently accessed: boost + if source_recent and target_recent: + new_weight = min(current_weight * self.boost_factor, self.max_weight) + detail["new_weight"] = round(new_weight, 3) + detail["action"] = "boosted" + detail["reason"] = "both_entities_active" + + if not dry_run: + self._update_weight(source, target, rel_type, new_weight, rel_id) + self._log_audit(source, target, rel_type, "boosted", detail) + + # Neither entity accessed recently: decay + elif source_dormant and target_dormant: + new_weight = max(current_weight * self.decay_factor, self.min_weight) + detail["new_weight"] = round(new_weight, 3) + detail["action"] = "decayed" + detail["reason"] = "both_entities_dormant" + + if not dry_run: + self._update_weight(source, target, rel_type, new_weight, rel_id) + self._log_audit(source, target, rel_type, "decayed", detail) + + else: + detail["reason"] = "no_change_needed" + + return detail + + def _update_weight( + self, + source: str, + target: str, + rel_type: str, + new_weight: float, + rel_id: Optional[Any] = None, + ) -> bool: + """Update relationship weight in KG via Cypher. + + Args: + source: Source entity name. + target: Target entity name. + rel_type: Relationship type. + new_weight: New weight value. + rel_id: Optional relationship ID for precise targeting. + + Returns: + True if update succeeded, False otherwise. 
+ """ + if rel_id is not None: + cypher = ( + "MATCH ()-[r]->() WHERE id(r) = $rel_id " + "SET r.weight = $weight" + ) + params = {"rel_id": rel_id, "weight": new_weight} + else: + cypher = ( + "MATCH (e1:Entity {name: $source})-""" + f"[r:{rel_type}]-(e2:Entity {{name: $target}}) " + "SET r.weight = $weight" + ) + params = { + "source": source, + "target": target, + "weight": new_weight, + } + + try: + self.kg.query_cypher(cypher, params) + logger.debug( + "Updated weight %s->%s [%s]: %.3f", + source, target, rel_type, new_weight + ) + return True + except Exception as exc: + logger.error( + "Failed to update weight for %s->%s: %s", + source, target, exc + ) + return False + + def _log_audit( + self, + source: str, + target: str, + rel_type: str, + action: str, + details: Dict[str, Any], + ) -> None: + """Log strengthen operation to audit chain. + + Args: + source: Source entity name. + target: Target entity name. + rel_type: Relationship type. + action: The action performed (boosted/decayed). + details: Operation details dict. + """ + try: + self.audit.append( + action=action, + target_type="relationship", + target_id=f"{source}->{target}:{rel_type}", + source="kg_dreamer.operations.strengthener", + metadata={ + "source_entity": source, + "target_entity": target, + "relationship_type": rel_type, + "old_weight": details.get("old_weight"), + "new_weight": details.get("new_weight"), + "reason": details.get("reason"), + }, + ) + except Exception as exc: + logger.warning("Audit log failed: %s", exc) diff --git a/usr/plugins/_kg_dreamer/orchestrator.py b/usr/plugins/_kg_dreamer/orchestrator.py new file mode 100644 index 0000000000..82e500ef2e --- /dev/null +++ b/usr/plugins/_kg_dreamer/orchestrator.py @@ -0,0 +1,361 @@ +"""Dream cycle orchestrator for KG Dreamer. + +Runs all enabled dream operations in sequence, manages state, +and produces dream reports. 
+""" +import json +import logging +import os +from datetime import datetime, timezone +from pathlib import Path +from typing import Any, Dict, List, Optional + +import yaml + +# Support both Agent Zero plugin namespace and standalone execution +from pathlib import Path +import sys + +_PLUGIN_DIR = Path(__file__).parent +if str(_PLUGIN_DIR) not in sys.path: + sys.path.insert(0, str(_PLUGIN_DIR)) + +from helpers import get_audit_chain, get_health_scorer, get_kg_client +from operations.connector import ConnectOperation +from operations.contradiction import ContradictionOperation +from operations.insights import InsightOperation +from operations.patterns import PatternOperation +from operations.pruner import PruneOperation +from operations.strengthener import StrengthenOperation + +logger = logging.getLogger(__name__) + +DEFAULT_CONFIG_PATH: str = "/a0/usr/plugins/_kg_dreamer/default_config.yaml" + +OPERATION_ORDER: List[str] = [ + "connect", + "strengthen", + "prune", + "contradict", + "pattern", + "insight", +] + + +class DreamOrchestrator: + """Orchestrates the dream cycle across all enabled operations. + + Manages configuration, state, and execution of dream operations. + Produces dream reports and maintains operation history. + + Attributes: + config_path: Path to YAML configuration file. + config: Parsed configuration dict. + log_dir: Directory for dream reports. + state_file: Path to JSON state file. + _kg_client: KG client instance (lazy loaded). + _audit_chain: Audit chain instance (lazy loaded). + _health_scorer: Health scorer instance (lazy loaded). + """ + + def __init__(self, config_path: str = None) -> None: + """Initialize DreamOrchestrator. + + Args: + config_path: Path to config YAML file. Uses default if None. 
+ """ + self.config_path = config_path or DEFAULT_CONFIG_PATH + self.config: Dict[str, Any] = self._load_config() + self.log_dir: Path = Path( + self.config.get("dreamer", {}).get( + "log_dir", "/a0/usr/workdir/logs/kg_dreams" + ) + ) + self.state_file: Path = Path( + self.config.get("dreamer", {}).get( + "state_file", "/a0/usr/workdir/state/kg_dreamer_state.json" + ) + ) + + self._kg_client: Any = None + self._audit_chain: Any = None + self._health_scorer: Any = None + + self._ensure_directories() + + def _ensure_directories(self) -> None: + """Create log_dir if it doesn't exist.""" + self.log_dir.mkdir(parents=True, exist_ok=True) + self.state_file.parent.mkdir(parents=True, exist_ok=True) + + def _load_config(self) -> Dict[str, Any]: + """Load configuration from YAML file. + + Returns: + Parsed configuration dict. + """ + try: + with open(self.config_path, "r", encoding="utf-8") as f: + config = yaml.safe_load(f) + logger.debug("Loaded config from %s", self.config_path) + return config or {} + except FileNotFoundError: + logger.error("Config file not found: %s", self.config_path) + return {} + except yaml.YAMLError as exc: + logger.error("Failed to parse config: %s", exc) + return {} + + def _load_state(self) -> Dict[str, Any]: + """Load state from JSON state file. + + Returns: + State dict or empty dict if file doesn't exist. + """ + try: + with open(self.state_file, "r", encoding="utf-8") as f: + state = json.load(f) + logger.debug("Loaded state from %s", self.state_file) + return state + except FileNotFoundError: + return {} + except json.JSONDecodeError as exc: + logger.warning("Failed to parse state file: %s", exc) + return {} + + def _save_state(self, state: Dict[str, Any]) -> None: + """Save state to JSON state file. + + Args: + state: State dict to save. 
+ """ + try: + self.state_file.parent.mkdir(parents=True, exist_ok=True) + with open(self.state_file, "w", encoding="utf-8") as f: + json.dump(state, f, indent=2, default=str) + logger.debug("Saved state to %s", self.state_file) + except OSError as exc: + logger.error("Failed to save state: %s", exc) + + def _save_report(self, report: Dict[str, Any]) -> Path: + """Save dream report to log directory. + + Args: + report: Report dict to save. + + Returns: + Path to saved report file. + """ + timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S") + report_path = self.log_dir / f"dream_report_{timestamp}.json" + + try: + with open(report_path, "w", encoding="utf-8") as f: + json.dump(report, f, indent=2, default=str) + logger.info("Saved dream report to %s", report_path) + return report_path + except OSError as exc: + logger.error("Failed to save report: %s", exc) + return Path() + + def _get_kg_client(self) -> Any: + """Get or create KG client instance (lazy loaded).""" + if self._kg_client is None: + self._kg_client = get_kg_client(self.config) + return self._kg_client + + def _get_audit_chain(self) -> Any: + """Get or create audit chain instance (lazy loaded).""" + if self._audit_chain is None: + self._audit_chain = get_audit_chain(self.config) + return self._audit_chain + + def _get_health_scorer(self) -> Any: + """Get or create health scorer instance (lazy loaded).""" + if self._health_scorer is None: + self._health_scorer = get_health_scorer(self.config) + return self._health_scorer + + def _is_operation_enabled(self, op_name: str) -> bool: + """Check if an operation is enabled in config. + + Args: + op_name: Name of the operation (e.g., 'connect'). + + Returns: + True if operation is enabled, False otherwise. 
+ """ + ops_config = self.config.get("dreamer", {}).get("operations", {}) + op_config = ops_config.get(op_name, {}) + return op_config.get("enabled", True) + + def _get_operation_config(self, op_name: str) -> Dict[str, Any]: + """Get configuration for a specific operation. + + Args: + op_name: Name of the operation. + + Returns: + Operation configuration dict. + """ + ops_config = self.config.get("dreamer", {}).get("operations", {}) + return ops_config.get(op_name, {}) + + def _create_operation(self, op_name: str) -> Any: + """Create operation instance by name. + + Args: + op_name: Name of the operation to create. + + Returns: + Operation instance. + + Raises: + ValueError: If operation name is unknown. + """ + op_config = self._get_operation_config(op_name) + kg_client = self._get_kg_client() + audit_chain = self._get_audit_chain() + + if op_name == "connect": + return ConnectOperation(kg_client, audit_chain, op_config) + if op_name == "strengthen": + return StrengthenOperation(kg_client, audit_chain, op_config) + if op_name == "prune": + health_scorer = self._get_health_scorer() + return PruneOperation(kg_client, audit_chain, health_scorer, op_config) + if op_name == "contradict": + return ContradictionOperation(kg_client, audit_chain, op_config) + if op_name == "pattern": + return PatternOperation(kg_client, audit_chain, op_config) + if op_name == "insight": + return InsightOperation(kg_client, audit_chain, op_config) + + raise ValueError(f"Unknown operation: {op_name}") + + def run_cycle( + self, dry_run: bool = False, operations: List[str] = None + ) -> Dict[str, Any]: + """Run dream cycle with all or specified operations. + + Args: + dry_run: If True, don't make changes, just report. + operations: Optional list of specific operations to run. + If None, runs all enabled operations in order. + + Returns: + Dream report dict with timestamp, results, and summary. 
+ """ + timestamp = datetime.now(timezone.utc).isoformat() + operations_to_run = operations or OPERATION_ORDER + + logger.info( + "Starting dream cycle: dry_run=%s, operations=%s", + dry_run, + operations_to_run, + ) + + results: Dict[str, Any] = {} + successful: int = 0 + failed: int = 0 + + for op_name in operations_to_run: + if op_name not in OPERATION_ORDER: + logger.warning("Unknown operation: %s, skipping", op_name) + results[op_name] = { + "status": "skipped", + "error": f"Unknown operation: {op_name}", + } + failed += 1 + continue + + if not operations and not self._is_operation_enabled(op_name): + logger.info("Operation disabled: %s, skipping", op_name) + results[op_name] = {"status": "skipped", "reason": "disabled"} + continue + + try: + operation = self._create_operation(op_name) + op_result = operation.execute(dry_run=dry_run) + results[op_name] = op_result + successful += 1 + logger.info( + "Operation %s completed: %s", op_name, op_result.get("status", "ok") + ) + except Exception as exc: + logger.error("Operation %s failed: %s", op_name, exc) + results[op_name] = {"status": "error", "error": str(exc)} + failed += 1 + + report = { + "timestamp": timestamp, + "dry_run": dry_run, + "operations": results, + "summary": { + "total_operations": len(operations_to_run), + "successful": successful, + "failed": failed, + }, + } + + self._save_report(report) + + state = self._load_state() + state["last_run"] = timestamp + state["last_report_summary"] = report["summary"] + state["operation_results"] = { + name: res.get("status", "unknown") for name, res in results.items() + } + self._save_state(state) + + logger.info( + "Dream cycle completed: %d successful, %d failed", successful, failed + ) + return report + + def get_status(self) -> Dict[str, Any]: + """Get current dream cycle status. + + Returns: + Status dict with last run info and operation states. 
+ """ + state = self._load_state() + dreamer_config = self.config.get("dreamer", {}) + ops_config = dreamer_config.get("operations", {}) + + op_states = {} + for op_name in OPERATION_ORDER: + op_config = ops_config.get(op_name, {}) + op_states[op_name] = { + "enabled": op_config.get("enabled", True), + "last_status": state.get("operation_results", {}).get(op_name), + } + + return { + "last_run": state.get("last_run"), + "last_report_summary": state.get("last_report_summary"), + "operations": op_states, + "config_path": str(self.config_path), + "log_dir": str(self.log_dir), + "state_file": str(self.state_file), + } + + def get_reports(self, count: int = 5) -> List[Dict[str, Any]]: + """Get last N dream reports. + + Args: + count: Number of reports to retrieve (default 5). + + Returns: + List of report dicts, sorted newest first. + """ + try: + report_files = sorted(self.log_dir.glob("dream_report_*.json"), reverse=True) + reports = [] + for report_path in report_files[:count]: + with open(report_path, "r", encoding="utf-8") as f: + reports.append(json.load(f)) + return reports + except OSError as exc: + logger.error("Failed to load reports: %s", exc) + return [] diff --git a/usr/plugins/_kg_dreamer/plugin.yaml b/usr/plugins/_kg_dreamer/plugin.yaml new file mode 100644 index 0000000000..c3c59892b9 --- /dev/null +++ b/usr/plugins/_kg_dreamer/plugin.yaml @@ -0,0 +1,13 @@ +name: _kg_dreamer +version: 1.0.0 +description: "Autonomous KG intelligence system - background dreaming operations for proactive knowledge discovery" +author: "Elastic SLED Central" +depends: + - _kg_pipeline +tools: + - name: kg_dreamer + description: "Run KG dreaming operations and view dream cycle reports" + entry: tools.kg_dreamer:KgDreamer +config: + - name: default_config.yaml + description: "Dream cycle schedule, LLM endpoint, operation thresholds" diff --git a/usr/plugins/_kg_dreamer/tests/.pytest_cache/.gitignore b/usr/plugins/_kg_dreamer/tests/.pytest_cache/.gitignore new file mode 100644 
index 0000000000..bc1a1f6167 --- /dev/null +++ b/usr/plugins/_kg_dreamer/tests/.pytest_cache/.gitignore @@ -0,0 +1,2 @@ +# Created by pytest automatically. +* diff --git a/usr/plugins/_kg_dreamer/tests/.pytest_cache/CACHEDIR.TAG b/usr/plugins/_kg_dreamer/tests/.pytest_cache/CACHEDIR.TAG new file mode 100644 index 0000000000..fce15ad7ea --- /dev/null +++ b/usr/plugins/_kg_dreamer/tests/.pytest_cache/CACHEDIR.TAG @@ -0,0 +1,4 @@ +Signature: 8a477f597d28d172789f06886806bc55 +# This file is a cache directory tag created by pytest. +# For information about cache directory tags, see: +# https://bford.info/cachedir/spec.html diff --git a/usr/plugins/_kg_dreamer/tests/.pytest_cache/README.md b/usr/plugins/_kg_dreamer/tests/.pytest_cache/README.md new file mode 100644 index 0000000000..b89018ced9 --- /dev/null +++ b/usr/plugins/_kg_dreamer/tests/.pytest_cache/README.md @@ -0,0 +1,8 @@ +# pytest cache directory # + +This directory contains data from the pytest's cache plugin, +which provides the `--lf` and `--ff` options, as well as the `cache` fixture. + +**Do not** commit this to version control. + +See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information. 
diff --git a/usr/plugins/_kg_dreamer/tests/.pytest_cache/v/cache/nodeids b/usr/plugins/_kg_dreamer/tests/.pytest_cache/v/cache/nodeids new file mode 100644 index 0000000000..0d6d555d3f --- /dev/null +++ b/usr/plugins/_kg_dreamer/tests/.pytest_cache/v/cache/nodeids @@ -0,0 +1,32 @@ +[ + "test_connector.py::TestConnectOperationAuditLog::test_audit_append_failure_does_not_crash", + "test_connector.py::TestConnectOperationAuditLog::test_execute_dry_run_logs_to_audit_chain", + "test_connector.py::TestConnectOperationAuditLog::test_execute_live_run_logs_action_add", + "test_connector.py::TestConnectOperationCandidateDetection::test_execute_cypher_query_error_returns_empty_candidates", + "test_connector.py::TestConnectOperationCandidateDetection::test_execute_filters_candidates_with_missing_names", + "test_connector.py::TestConnectOperationCandidateDetection::test_execute_finds_shared_docs_from_cypher_results", + "test_connector.py::TestConnectOperationCandidateDetection::test_execute_live_create_failure_marks_detail_as_failed", + "test_connector.py::TestConnectOperationCandidateDetection::test_execute_no_candidates_returns_empty_result", + "test_connector.py::TestConnectOperationCandidateDetection::test_execute_respects_max_candidates_limit", + "test_connector.py::TestConnectOperationDryRun::test_execute_dry_run_never_calls_create_relationship", + "test_connector.py::TestConnectOperationDryRun::test_execute_dry_run_returns_zero_connections_made", + "test_connector.py::TestConnectOperationDryRun::test_execute_live_run_calls_create_relationship", + "test_connector.py::TestConnectOperationInit::test_init_with_custom_config_overrides_defaults", + "test_connector.py::TestConnectOperationInit::test_init_with_defaults_populates_defaults", + "test_connector.py::TestConnectOperationRelationshipType::test_custom_relationship_type_used_in_create", + "test_pruner.py::TestPruneOperationAuditLog::test_execute_dry_run_logs_action_dry_run", + 
"test_pruner.py::TestPruneOperationAuditLog::test_execute_live_logs_action_delete", + "test_pruner.py::TestPruneOperationBatchSize::test_execute_respects_batch_size", + "test_pruner.py::TestPruneOperationDryRun::test_execute_dry_run_marks_candidates_as_would_prune", + "test_pruner.py::TestPruneOperationDryRun::test_execute_dry_run_no_deletes", + "test_pruner.py::TestPruneOperationFiltering::test_execute_filters_by_age_too_recent_excluded", + "test_pruner.py::TestPruneOperationFiltering::test_execute_filters_by_health_score_high_health_excluded", + "test_pruner.py::TestPruneOperationFiltering::test_execute_filters_by_query_count_queried_entity_excluded", + "test_pruner.py::TestPruneOperationFiltering::test_execute_no_candidates_returns_empty_result", + "test_pruner.py::TestPruneOperationFiltering::test_execute_scorer_error_returns_empty_result", + "test_pruner.py::TestPruneOperationInit::test_init_with_custom_config_overrides_defaults", + "test_pruner.py::TestPruneOperationInit::test_init_with_defaults_populates_defaults", + "test_pruner.py::TestPruneOperationLiveRun::test_execute_live_delete_failure_marks_error", + "test_pruner.py::TestPruneOperationLiveRun::test_execute_live_uses_detach_delete", + "test_pruner.py::TestPruneOperationSorting::test_candidates_sorted_lowest_health_first" +] \ No newline at end of file diff --git a/usr/plugins/_kg_dreamer/tests/__init__.py b/usr/plugins/_kg_dreamer/tests/__init__.py new file mode 100644 index 0000000000..e8f9e3692e --- /dev/null +++ b/usr/plugins/_kg_dreamer/tests/__init__.py @@ -0,0 +1 @@ +# KG Dreamer test suite. diff --git a/usr/plugins/_kg_dreamer/tests/test_connector.py b/usr/plugins/_kg_dreamer/tests/test_connector.py new file mode 100644 index 0000000000..032b0f85a6 --- /dev/null +++ b/usr/plugins/_kg_dreamer/tests/test_connector.py @@ -0,0 +1,248 @@ +"""Unit tests for ConnectOperation. + +Tests candidate detection, relationship creation, dry-run behavior, +max_candidates enforcement, and audit logging. 
+""" + +import sys +import os +import unittest +from unittest.mock import MagicMock, call, patch + +# Ensure plugin root is importable +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + +from operations.connector import ConnectOperation + + +class TestConnectOperationInit(unittest.TestCase): + """Test ConnectOperation.__init__ with various configs.""" + + def test_init_with_defaults_populates_defaults(self): + """Init with no config uses DEFAULT_MIN_SHARED_DOCS=2, DEFAULT_MAX_CANDIDATES=500.""" + kg = MagicMock() + audit = MagicMock() + op = ConnectOperation(kg, audit) + + self.assertIs(op.kg, kg) + self.assertIs(op.audit, audit) + self.assertEqual(op.min_shared_docs, 2) + self.assertEqual(op.max_candidates, 500) + self.assertEqual(op.relationship_type, "IMPLIED_RELATION") + + def test_init_with_custom_config_overrides_defaults(self): + """Config dict overrides all default values.""" + kg = MagicMock() + audit = MagicMock() + cfg = { + "min_shared_docs": 5, + "max_candidates": 10, + "relationship_type": "CO_OCCURS", + } + op = ConnectOperation(kg, audit, cfg) + + self.assertEqual(op.min_shared_docs, 5) + self.assertEqual(op.max_candidates, 10) + self.assertEqual(op.relationship_type, "CO_OCCURS") + + +class TestConnectOperationDryRun(unittest.TestCase): + """Test dry-run mode: no writes to KG.""" + + def setUp(self): + self.kg = MagicMock() + self.audit = MagicMock() + self.op = ConnectOperation(self.kg, self.audit) + + def test_execute_dry_run_returns_zero_connections_made(self): + """When candidates exist but dry_run=True, connections_made stays 0.""" + self.kg.query_cypher.return_value = [ + {"e1_name": "Alpha", "e2_name": "Beta", "type1": "org", "type2": "tech", "shared_docs": 3}, + ] + result = self.op.execute(dry_run=True) + + self.assertEqual(result["candidates_found"], 1) + self.assertEqual(result["connections_made"], 0) + self.assertTrue(result["dry_run"]) + self.assertEqual(len(result["details"]), 1) + 
self.assertEqual(result["details"][0]["status"], "would_create") + self.assertFalse(result["details"][0]["created"]) + + def test_execute_dry_run_never_calls_create_relationship(self): + """Dry run must not call kg.create_relationship at all.""" + self.kg.query_cypher.return_value = [ + {"e1_name": "A", "e2_name": "B", "type1": "x", "type2": "y", "shared_docs": 4}, + ] + self.op.execute(dry_run=True) + + self.kg.create_relationship.assert_not_called() + + def test_execute_live_run_calls_create_relationship(self): + """Live run with successful create_relationship increments connections_made.""" + self.kg.query_cypher.return_value = [ + {"e1_name": "A", "e2_name": "B", "type1": "x", "type2": "y", "shared_docs": 4}, + ] + self.kg.create_relationship.return_value = True + result = self.op.execute(dry_run=False) + + self.assertEqual(result["connections_made"], 1) + self.assertTrue(result["details"][0]["created"]) + self.assertEqual(result["details"][0]["status"], "created") + self.kg.create_relationship.assert_called_once_with( + source_name="A", target_name="B", rel_type="IMPLIED_RELATION" + ) + + +class TestConnectOperationCandidateDetection(unittest.TestCase): + """Test candidate finding and filtering logic.""" + + def setUp(self): + self.kg = MagicMock() + self.audit = MagicMock() + self.op = ConnectOperation(self.kg, self.audit) + + def test_execute_finds_shared_docs_from_cypher_results(self): + """Multiple candidates returned from query are all processed.""" + self.kg.query_cypher.return_value = [ + {"e1_name": "X", "e2_name": "Y", "type1": "a", "type2": "b", "shared_docs": 3}, + {"e1_name": "P", "e2_name": "Q", "type1": "c", "type2": "d", "shared_docs": 5}, + ] + self.kg.create_relationship.return_value = True + result = self.op.execute(dry_run=False) + + self.assertEqual(result["candidates_found"], 2) + self.assertEqual(result["connections_made"], 2) + self.assertEqual(len(result["details"]), 2) + + def test_execute_respects_max_candidates_limit(self): + """Only 
the first max_candidates entries are processed.""" + cfg = {"max_candidates": 2} + op = ConnectOperation(self.kg, self.audit, cfg) + + rows = [ + {"e1_name": f"E{i}", "e2_name": f"F{i}", "type1": "t", "type2": "t", "shared_docs": 3} + for i in range(10) + ] + self.kg.query_cypher.return_value = rows + self.kg.create_relationship.return_value = True + result = op.execute(dry_run=False) + + self.assertEqual(result["candidates_found"], 10) + self.assertEqual(len(result["details"]), 2) + self.assertEqual(result["connections_made"], 2) + + def test_execute_no_candidates_returns_empty_result(self): + """When query returns empty list, result has zero candidates.""" + self.kg.query_cypher.return_value = [] + result = self.op.execute(dry_run=True) + + self.assertEqual(result["candidates_found"], 0) + self.assertEqual(result["connections_made"], 0) + self.assertEqual(result["details"], []) + + def test_execute_filters_candidates_with_missing_names(self): + """Rows with empty/missing e1_name or e2_name are filtered out.""" + self.kg.query_cypher.return_value = [ + {"e1_name": "", "e2_name": "Beta", "type1": "a", "type2": "b", "shared_docs": 3}, + {"e1_name": "Alpha", "e2_name": "", "type1": "a", "type2": "b", "shared_docs": 3}, + {"e1_name": "Valid", "e2_name": "Pair", "type1": "a", "type2": "b", "shared_docs": 4}, + ] + result = self.op.execute(dry_run=True) + + self.assertEqual(result["candidates_found"], 1) + self.assertEqual(result["details"][0]["e1"], "Valid") + self.assertEqual(result["details"][0]["e2"], "Pair") + + def test_execute_cypher_query_error_returns_empty_candidates(self): + """When kg.query_cypher raises, candidates_found is 0 and no crash.""" + self.kg.query_cypher.side_effect = Exception("connection refused") + result = self.op.execute(dry_run=True) + + self.assertEqual(result["candidates_found"], 0) + self.assertEqual(result["connections_made"], 0) + + def test_execute_live_create_failure_marks_detail_as_failed(self): + """When create_relationship returns 
False, detail status is 'failed'.""" + self.kg.query_cypher.return_value = [ + {"e1_name": "A", "e2_name": "B", "type1": "x", "type2": "y", "shared_docs": 3}, + ] + self.kg.create_relationship.return_value = False + result = self.op.execute(dry_run=False) + + self.assertEqual(result["connections_made"], 0) + self.assertEqual(result["details"][0]["status"], "failed") + self.assertFalse(result["details"][0]["created"]) + + +class TestConnectOperationAuditLog(unittest.TestCase): + """Test audit chain logging behavior.""" + + def setUp(self): + self.kg = MagicMock() + self.audit = MagicMock() + self.op = ConnectOperation(self.kg, self.audit) + + def test_execute_dry_run_logs_to_audit_chain(self): + """Dry run calls audit.append with action='dry_run' and correct metadata.""" + self.kg.query_cypher.return_value = [ + {"e1_name": "Foo", "e2_name": "Bar", "type1": "x", "type2": "y", "shared_count": 7}, + ] + self.op.execute(dry_run=True) + + self.audit.append.assert_called_once() + call_kwargs = self.audit.append.call_args[1] + self.assertEqual(call_kwargs["action"], "dry_run") + self.assertEqual(call_kwargs["target_type"], "relationship") + self.assertEqual(call_kwargs["target_id"], "Foo->Bar:IMPLIED_RELATION") + self.assertEqual(call_kwargs["source"], "kg_dreamer.operations.connector") + self.assertEqual(call_kwargs["metadata"]["weight"], 7) + self.assertTrue(call_kwargs["metadata"]["dry_run"]) + + def test_execute_live_run_logs_action_add(self): + """Live run with successful creation logs action='add'.""" + self.kg.query_cypher.return_value = [ + {"e1_name": "Cat", "e2_name": "Dog", "type1": "x", "type2": "y", "shared_docs": 5}, + ] + self.kg.create_relationship.return_value = True + self.op.execute(dry_run=False) + + call_kwargs = self.audit.append.call_args[1] + self.assertEqual(call_kwargs["action"], "add") + self.assertFalse(call_kwargs["metadata"]["dry_run"]) + + def test_audit_append_failure_does_not_crash(self): + """If audit.append raises, the operation still 
completes successfully.""" + self.audit.append.side_effect = RuntimeError("audit broken") + self.kg.query_cypher.return_value = [ + {"e1_name": "A", "e2_name": "B", "type1": "x", "type2": "y", "shared_docs": 3}, + ] + self.kg.create_relationship.return_value = True + result = self.op.execute(dry_run=False) + + self.assertEqual(result["connections_made"], 1) + + +class TestConnectOperationRelationshipType(unittest.TestCase): + """Test custom relationship type is passed through correctly.""" + + def test_custom_relationship_type_used_in_create(self): + """Configured relationship_type propagates to create_relationship and audit.""" + kg = MagicMock() + audit = MagicMock() + op = ConnectOperation(kg, audit, {"relationship_type": "CO_OCCURS"}) + + kg.query_cypher.return_value = [ + {"e1_name": "A", "e2_name": "B", "type1": "x", "type2": "y", "shared_docs": 3}, + ] + kg.create_relationship.return_value = True + result = op.execute(dry_run=False) + + kg.create_relationship.assert_called_once_with( + source_name="A", target_name="B", rel_type="CO_OCCURS" + ) + audit_kwargs = audit.append.call_args[1] + self.assertIn("CO_OCCURS", audit_kwargs["target_id"]) + + +if __name__ == "__main__": + unittest.main() diff --git a/usr/plugins/_kg_dreamer/tests/test_orchestrator.py b/usr/plugins/_kg_dreamer/tests/test_orchestrator.py new file mode 100644 index 0000000000..e40d92520a --- /dev/null +++ b/usr/plugins/_kg_dreamer/tests/test_orchestrator.py @@ -0,0 +1,316 @@ +"""Unit tests for DreamOrchestrator. + +Tests config loading, run_cycle (dry-run, single operation, fault isolation), +and get_status. All external dependencies are mocked. 
+""" + +import sys +import os +import json +import unittest +from datetime import datetime, timezone +from pathlib import Path +from unittest.mock import MagicMock, patch, PropertyMock + +# Ensure plugin root is importable +PLUGIN_DIR = os.path.join(os.path.dirname(__file__), "..") +sys.path.insert(0, PLUGIN_DIR) + +# Mock the helpers module BEFORE importing orchestrator, so we never touch +# _kg_pipeline or any real service. +import helpers as _helpers_mod + + +def _setup_mock_helpers(): + """Patch helpers module functions to return MagicMocks.""" + _helpers_mod.get_kg_client = MagicMock(return_value=MagicMock()) + _helpers_mod.get_audit_chain = MagicMock(return_value=MagicMock()) + _helpers_mod.get_health_scorer = MagicMock(return_value=MagicMock()) + + +_setup_mock_helpers() + +from orchestrator import DreamOrchestrator, OPERATION_ORDER + + +class TestDreamOrchestratorInit(unittest.TestCase): + """Test DreamOrchestrator.__init__ and config loading.""" + + @patch.object(DreamOrchestrator, "_ensure_directories") + def test_init_loads_config_from_yaml(self, mock_dirs): + """Init loads default_config.yaml and sets config dict.""" + orch = DreamOrchestrator() + + self.assertIsInstance(orch.config, dict) + self.assertIn("dreamer", orch.config) + # Verify the config has operations section + ops = orch.config["dreamer"].get("operations", {}) + self.assertIn("connect", ops) + self.assertIn("prune", ops) + + @patch.object(DreamOrchestrator, "_ensure_directories") + def test_init_custom_config_path_missing_returns_empty(self, mock_dirs): + """Init with non-existent config_path sets config to empty dict.""" + orch = DreamOrchestrator(config_path="/nonexistent/path.yaml") + + self.assertEqual(orch.config, {}) + + @patch.object(DreamOrchestrator, "_ensure_directories") + def test_init_sets_log_and_state_paths(self, mock_dirs): + """Init sets log_dir and state_file from config.""" + orch = DreamOrchestrator() + + self.assertIsInstance(orch.log_dir, Path) + 
self.assertIsInstance(orch.state_file, Path) + + +class TestDreamOrchestratorRunCycle(unittest.TestCase): + """Test DreamOrchestrator.run_cycle with various configurations.""" + + def setUp(self): + """Create orchestrator with mocked dependencies.""" + # Patch _ensure_directories to avoid directory creation + with patch.object(DreamOrchestrator, "_ensure_directories"): + self.orch = DreamOrchestrator() + + self.mock_kg = MagicMock() + self.mock_audit = MagicMock() + self.mock_scorer = MagicMock() + + _helpers_mod.get_kg_client.return_value = self.mock_kg + _helpers_mod.get_audit_chain.return_value = self.mock_audit + _helpers_mod.get_health_scorer.return_value = self.mock_scorer + + @patch.object(DreamOrchestrator, "_save_report") + @patch.object(DreamOrchestrator, "_save_state") + @patch.object(DreamOrchestrator, "_load_state", return_value={}) + def test_run_cycle_dry_run(self, mock_load_state, mock_save_state, mock_save_report): + """Dry run cycle completes with dry_run=True in report.""" + # Mock all operations to return success + with patch.object(self.orch, "_create_operation") as mock_create: + mock_op = MagicMock() + mock_op.execute.return_value = {"status": "ok", "candidates_found": 0} + mock_create.return_value = mock_op + + report = self.orch.run_cycle(dry_run=True) + + self.assertTrue(report["dry_run"]) + self.assertIn("operations", report) + self.assertIn("summary", report) + self.assertEqual(report["summary"]["successful"], len(OPERATION_ORDER)) + self.assertEqual(report["summary"]["failed"], 0) + + @patch.object(DreamOrchestrator, "_save_report") + @patch.object(DreamOrchestrator, "_save_state") + @patch.object(DreamOrchestrator, "_load_state", return_value={}) + def test_run_cycle_single_operation(self, mock_load_state, mock_save_state, mock_save_report): + """Running a single operation only executes that operation.""" + with patch.object(self.orch, "_create_operation") as mock_create: + mock_op = MagicMock() + mock_op.execute.return_value = 
{"status": "ok"} + mock_create.return_value = mock_op + + report = self.orch.run_cycle(dry_run=True, operations=["connect"]) + + self.assertIn("connect", report["operations"]) + # Only 1 operation should be in the report + self.assertEqual(len(report["operations"]), 1) + self.assertEqual(report["summary"]["total_operations"], 1) + + @patch.object(DreamOrchestrator, "_save_report") + @patch.object(DreamOrchestrator, "_save_state") + @patch.object(DreamOrchestrator, "_load_state", return_value={}) + def test_run_cycle_one_failure_doesnt_stop_others(self, mock_load_state, mock_save_state, mock_save_report): + """When one operation fails, others still run and report contains all.""" + call_count = [0] + def create_op(name): + mock_op = MagicMock() + if name == "prune": + mock_op.execute.side_effect = RuntimeError("prune crashed") + else: + mock_op.execute.return_value = {"status": "ok"} + return mock_op + + with patch.object(self.orch, "_create_operation", side_effect=create_op): + report = self.orch.run_cycle(dry_run=True) + + ops = report["operations"] + self.assertEqual(ops["prune"]["status"], "error") + self.assertIn("prune crashed", ops["prune"]["error"]) + # All other operations should have succeeded + for op_name in OPERATION_ORDER: + if op_name != "prune": + self.assertNotEqual(ops[op_name].get("status"), "error", + f"{op_name} should not have errored") + + self.assertGreater(report["summary"]["successful"], 0) + self.assertEqual(report["summary"]["failed"], 1) + + @patch.object(DreamOrchestrator, "_save_report") + @patch.object(DreamOrchestrator, "_save_state") + @patch.object(DreamOrchestrator, "_load_state", return_value={}) + def test_run_cycle_unknown_operation_skipped(self, mock_load_state, mock_save_state, mock_save_report): + """Unknown operation names are skipped with status='skipped'.""" + report = self.orch.run_cycle(dry_run=True, operations=["bogus_op"]) + + self.assertEqual(report["operations"]["bogus_op"]["status"], "skipped") + 
self.assertIn("Unknown operation", report["operations"]["bogus_op"]["error"]) + + @patch.object(DreamOrchestrator, "_save_report") + @patch.object(DreamOrchestrator, "_save_state") + @patch.object(DreamOrchestrator, "_load_state", return_value={}) + def test_run_cycle_saves_state_and_report(self, mock_load_state, mock_save_state, mock_save_report): + """run_cycle calls _save_report and _save_state.""" + with patch.object(self.orch, "_create_operation") as mock_create: + mock_op = MagicMock() + mock_op.execute.return_value = {"status": "ok"} + mock_create.return_value = mock_op + + self.orch.run_cycle(dry_run=True, operations=["connect"]) + + mock_save_report.assert_called_once() + mock_save_state.assert_called_once() + + # Verify state includes last_run timestamp + saved_state = mock_save_state.call_args[0][0] + self.assertIn("last_run", saved_state) + self.assertIn("last_report_summary", saved_state) + + @patch.object(DreamOrchestrator, "_save_report") + @patch.object(DreamOrchestrator, "_save_state") + @patch.object(DreamOrchestrator, "_load_state", return_value={}) + def test_run_cycle_disabled_operation_skipped(self, mock_load_state, mock_save_state, mock_save_report): + """When no specific operations given, disabled ops are skipped.""" + # Disable 'pattern' operation + self.orch.config.setdefault("dreamer", {}).setdefault("operations", {}) + self.orch.config["dreamer"]["operations"]["pattern"] = {"enabled": False} + + with patch.object(self.orch, "_create_operation") as mock_create: + mock_op = MagicMock() + mock_op.execute.return_value = {"status": "ok"} + mock_create.return_value = mock_op + + report = self.orch.run_cycle(dry_run=True) + + self.assertEqual(report["operations"]["pattern"]["status"], "skipped") + self.assertEqual(report["operations"]["pattern"]["reason"], "disabled") + + +class TestDreamOrchestratorGetStatus(unittest.TestCase): + """Test DreamOrchestrator.get_status.""" + + def setUp(self): + with patch.object(DreamOrchestrator, 
class TestDreamOrchestratorSaveReport(unittest.TestCase):
    """Test report saving and state persistence."""

    def setUp(self):
        """Build an orchestrator whose log dir and state file live in a temp dir."""
        with patch.object(DreamOrchestrator, "_ensure_directories"):
            import shutil
            import tempfile

            self.tmp_dir = tempfile.mkdtemp()
            # mkdtemp() never cleans up after itself; remove the directory once
            # the test finishes (pass or fail) so runs do not leak temp dirs.
            self.addCleanup(shutil.rmtree, self.tmp_dir, ignore_errors=True)
            self.orch = DreamOrchestrator()
            self.orch.log_dir = Path(self.tmp_dir) / "logs"
            self.orch.state_file = Path(self.tmp_dir) / "state" / "dreamer.json"

    def test_save_report_writes_json_file(self):
        """_save_report creates a valid JSON file in log_dir."""
        self.orch.log_dir.mkdir(parents=True, exist_ok=True)
        report = {"timestamp": "2026-05-20T12:00:00", "dry_run": True, "operations": {}}
        report_path = self.orch._save_report(report)

        self.assertTrue(report_path.exists())
        with open(report_path) as f:
            loaded = json.load(f)
        self.assertEqual(loaded["timestamp"], "2026-05-20T12:00:00")

    def test_save_state_writes_json_file(self):
        """_save_state creates a valid JSON state file."""
        # The "state" parent directory is not created here; this test relies on
        # _save_state producing the file at state_file on its own.
        state = {"last_run": "2026-05-20T12:00:00", "count": 42}
        self.orch._save_state(state)

        self.assertTrue(self.orch.state_file.exists())
        with open(self.orch.state_file) as f:
            loaded = json.load(f)
        self.assertEqual(loaded["last_run"], "2026-05-20T12:00:00")
        self.assertEqual(loaded["count"], 42)
test_create_operation_unknown_raises(self): + """_create_operation with unknown name raises ValueError.""" + with self.assertRaises(ValueError) as ctx: + self.orch._create_operation("nonexistent") + self.assertIn("Unknown operation", str(ctx.exception)) + + +if __name__ == "__main__": + unittest.main() diff --git a/usr/plugins/_kg_dreamer/tests/test_pruner.py b/usr/plugins/_kg_dreamer/tests/test_pruner.py new file mode 100644 index 0000000000..df19ff4530 --- /dev/null +++ b/usr/plugins/_kg_dreamer/tests/test_pruner.py @@ -0,0 +1,364 @@ +"""Unit tests for PruneOperation. + +Tests candidate filtering by health/age/queries, dry-run behavior, +DETACH DELETE usage, batch size limits, and audit logging. +""" + +import sys +import os +import unittest +from datetime import datetime, timezone, timedelta +from unittest.mock import MagicMock, call, patch + +# Ensure plugin dir is importable +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + +from operations.pruner import PruneOperation + + +# Fixed "now" so age calculations are deterministic +FIXED_NOW = datetime(2026, 5, 20, 12, 0, 0, tzinfo=timezone.utc) + +# Real fromisoformat for patching +_real_fromisoformat = datetime.fromisoformat + + +def _make_entity( + name: str, + health: float = 0.05, + days_ago: int = 200, + entity_type: str = "concept", + tier: str = "cold", +) -> dict: + """Build a score_entities() result dict for one entity.""" + last_seen = (FIXED_NOW - timedelta(days=days_ago)).isoformat() + return { + "name": name, + "total": health, + "last_seen": last_seen, + "entity_type": entity_type, + "tier": tier, + } + + +def _make_cypher_row( + name: str, + mention_count: int = 1, + days_ago: int = 200, + entity_type: str = "concept", +) -> dict: + """Build a query_cypher() result row for one entity.""" + last_seen = (FIXED_NOW - timedelta(days=days_ago)).isoformat() + return { + "name": name, + "etype": entity_type, + "mention_count": mention_count, + "last_seen": last_seen, + } + + +def 
class TestPruneOperationInit(unittest.TestCase):
    """Constructor behaviour of PruneOperation with and without a config dict."""

    def test_init_with_defaults_populates_defaults(self):
        """Without a config, collaborators are stored and default thresholds apply."""
        kg, audit, scorer = MagicMock(), MagicMock(), MagicMock()
        operation = PruneOperation(kg, audit, scorer)

        # Collaborators are kept by identity, not copied.
        self.assertIs(operation.kg, kg)
        self.assertIs(operation.audit, audit)
        self.assertIs(operation.health_scorer, scorer)
        # Default thresholds.
        self.assertEqual(operation.min_age_days, 180)
        self.assertAlmostEqual(operation.max_health_score, 0.1)
        self.assertEqual(operation.max_queries, 0)
        self.assertEqual(operation.batch_size, 100)

    def test_init_with_custom_config_overrides_defaults(self):
        """Every threshold supplied in the config dict replaces its default."""
        overrides = {
            "min_age_days": 90,
            "max_health_score": 0.3,
            "max_queries": 5,
            "batch_size": 25,
        }
        operation = PruneOperation(MagicMock(), MagicMock(), MagicMock(), overrides)

        self.assertEqual(operation.min_age_days, 90)
        self.assertAlmostEqual(operation.max_health_score, 0.3)
        self.assertEqual(operation.max_queries, 5)
        self.assertEqual(operation.batch_size, 25)
@patch("operations.pruner.datetime")
def test_execute_dry_run_marks_candidates_as_would_prune(self, mock_dt):
    """In dry-run mode a candidate is reported as 'would_prune' and not pruned."""
    _patch_datetime(mock_dt)

    # The KG returns a single stale row as the candidate set.
    stale_row = _make_cypher_row("Stale", mention_count=1, days_ago=250)
    self.kg.query_cypher.return_value = [stale_row]

    result = self.op.execute(dry_run=True)

    self.assertEqual(result["candidates_found"], 1)
    first_entry = result["pruned_entities"][0]
    self.assertEqual(first_entry["status"], "would_prune")
    self.assertFalse(first_entry["pruned"])
@patch("operations.pruner.datetime")
def test_execute_filters_by_query_count_queried_entity_excluded(self, mock_dt):
    """An entity with more audit-logged queries than max_queries is not a candidate."""
    _patch_datetime(mock_dt)

    op = PruneOperation(self.kg, self.audit, self.scorer, {"max_queries": 0})

    # NOTE(review): unlike the sibling filtering tests, kg.query_cypher is left
    # as an unconfigured MagicMock here. If execute() discovers candidates via
    # query_cypher, candidates_found could be 0 without the query-count filter
    # ever running - confirm against PruneOperation.execute.
    scored_entity = _make_entity("NeverQueried", health=0.01, days_ago=300)
    self.scorer.score_entities.return_value = {"entities": [scored_entity]}

    # One audit hit for the entity -> query_count=1, which exceeds max_queries=0.
    audit_hit = {"metadata": {"entity": "NeverQueried"}, "target_id": "NeverQueried"}
    self.audit.query.return_value = [audit_hit]

    result = op.execute(dry_run=True)

    # The queried entity must not be counted as a prune candidate.
    self.assertEqual(result["candidates_found"], 0)
@patch("operations.pruner.datetime")
def test_execute_live_uses_detach_delete(self, mock_dt):
    """A live run issues exactly one DETACH DELETE query, parameterised by entity name."""
    _patch_datetime(mock_dt)

    # Successive query_cypher calls: find candidates, archive, then delete.
    find_rows = [_make_cypher_row("DeadEntity", mention_count=1, days_ago=300)]
    archive_rows = [{"e": "data"}]
    self.kg.query_cypher.side_effect = [find_rows, archive_rows, None]

    result = self.op.execute(dry_run=False)

    self.assertEqual(result["entities_pruned"], 1)

    # Collect the recorded calls whose first positional argument is a delete query.
    delete_calls = []
    for recorded in self.kg.query_cypher.call_args_list:
        positional = recorded[0]
        query_text = positional[0] if positional else ""
        if "DETACH DELETE" in query_text:
            delete_calls.append(recorded)
    self.assertEqual(len(delete_calls), 1)

    # Entity name is passed via params, not inline in query: take the second
    # positional argument when present, otherwise fall back to the kwargs dict.
    only_delete = delete_calls[0]
    positional = only_delete[0]
    if len(positional) > 1:
        delete_params = positional[1]
    else:
        delete_params = only_delete[1]
    self.assertEqual(delete_params.get("name"), "DeadEntity")
@patch("operations.pruner.datetime")
def test_candidates_sorted_lowest_health_first(self, mock_dt):
    """Reported candidates are ordered oldest last_seen first.

    Despite the method name, the assertion below checks ordering by age
    (ascending last_seen), not by health score.
    """
    _patch_datetime(mock_dt)

    kg, audit, scorer = MagicMock(), MagicMock(), MagicMock()
    op = PruneOperation(kg, audit, scorer, {"batch_size": 3})

    # Rows are supplied out of order; the result is expected oldest-first.
    unsorted_ages = [("Recent", 200), ("Oldest", 400), ("Middle", 300)]
    kg.query_cypher.return_value = [
        _make_cypher_row(label, mention_count=1, days_ago=age)
        for label, age in unsorted_ages
    ]

    result = op.execute(dry_run=True)

    names = [entry["name"] for entry in result["pruned_entities"]]
    self.assertEqual(names, ["Oldest", "Middle", "Recent"])
+ """ + self._config_path: Optional[str] = config_path + self._orchestrator: Optional[DreamOrchestrator] = None + + def _get_orchestrator(self) -> DreamOrchestrator: + """Get or create DreamOrchestrator instance (lazy loaded).""" + if self._orchestrator is None: + self._orchestrator = DreamOrchestrator(self._config_path) + return self._orchestrator + + async def __call__( + self, method: str, args: Dict[str, Any] = None, **kwargs + ) -> Dict[str, Any]: + """Route method calls to appropriate handler. + + Args: + method: Method name to call (status, run_dream_cycle, etc.). + args: Method arguments dict. + **kwargs: Additional keyword arguments. + + Returns: + Result dict with status key. + """ + args = args or {} + args.update(kwargs) + + method_map: Dict[str, callable] = { + "status": self.status, + "run_dream_cycle": self.run_dream_cycle, + "run_operation": self.run_operation, + "get_report": self.get_report, + } + + if method not in method_map: + return { + "status": "error", + "error": f"Unknown method: {method}. Valid methods: {list(method_map.keys())}", + } + + try: + return await method_map[method](**args) + except Exception as exc: + logger.error("Method %s failed: %s", method, exc) + return {"status": "error", "error": str(exc), "method": method} + + async def status(self) -> Dict[str, Any]: + """Get dream cycle status and last run information. + + Returns: + Dict with status ('ok' or 'error'), last_run timestamp, + operation states, and configuration paths. + """ + try: + orchestrator = self._get_orchestrator() + status_info = orchestrator.get_status() + return {"status": "ok", **status_info} + except Exception as exc: + logger.error("Failed to get status: %s", exc) + return {"status": "error", "error": str(exc)} + + async def run_dream_cycle( + self, dry_run: bool = True, operations: List[str] = None + ) -> Dict[str, Any]: + """Run full dream cycle with all or specified operations. + + Args: + dry_run: If True, don't make changes, just report. 
async def run_operation(
    self, operation: str, dry_run: bool = True
) -> Dict[str, Any]:
    """Execute one named dream operation through the orchestrator.

    Args:
        operation: One of VALID_OPERATIONS (connect, strengthen, prune,
            contradict, pattern, insight).
        dry_run: If True, don't make changes, just report.

    Returns:
        ``{"status": "ok", "operation": ...}`` overlaid with the
        operation's own entry from the cycle report (so a ``status`` key
        in that entry wins), or an error dict for an invalid name or a
        raised exception.
    """
    if operation in VALID_OPERATIONS:
        try:
            # Run a one-operation cycle and pull out that operation's entry.
            cycle_report = self._get_orchestrator().run_cycle(
                dry_run=dry_run, operations=[operation]
            )
            per_operation = cycle_report.get("operations", {})
            outcome = per_operation.get(operation, {})
            return {"status": "ok", "operation": operation, **outcome}
        except Exception as exc:
            logger.error("Operation %s failed: %s", operation, exc)
            return {"status": "error", "error": str(exc), "operation": operation}

    return {
        "status": "error",
        "error": f"Invalid operation: {operation}. Valid: {VALID_OPERATIONS}",
    }
+ """ + if not isinstance(count, int) or count < 1: + return {"status": "error", "error": "count must be a positive integer"} + + try: + orchestrator = self._get_orchestrator() + reports = orchestrator.get_reports(count=count) + return { + "status": "ok", + "count": len(reports), + "reports": reports, + } + except Exception as exc: + logger.error("Failed to get reports: %s", exc) + return {"status": "error", "error": str(exc)} + + +# Synchronous convenience methods for non-async usage +def get_dreamer_status(config_path: str = None) -> Dict[str, Any]: + """Get dream cycle status synchronously. + + Args: + config_path: Optional path to config YAML file. + + Returns: + Status dict with last run info and operation states. + """ + orchestrator = DreamOrchestrator(config_path) + return orchestrator.get_status() + + +def run_dream_cycle_sync( + dry_run: bool = True, + operations: List[str] = None, + config_path: str = None, +) -> Dict[str, Any]: + """Run dream cycle synchronously. + + Args: + dry_run: If True, don't make changes, just report. + operations: Optional list of specific operations to run. + config_path: Optional path to config YAML file. + + Returns: + Dream report dict with timestamp, results, and summary. + """ + orchestrator = DreamOrchestrator(config_path) + return orchestrator.run_cycle(dry_run=dry_run, operations=operations) diff --git a/usr/plugins/_kg_pipeline/.pytest_cache/.gitignore b/usr/plugins/_kg_pipeline/.pytest_cache/.gitignore new file mode 100644 index 0000000000..bc1a1f6167 --- /dev/null +++ b/usr/plugins/_kg_pipeline/.pytest_cache/.gitignore @@ -0,0 +1,2 @@ +# Created by pytest automatically. 
+* diff --git a/usr/plugins/_kg_pipeline/.pytest_cache/v/cache/lastfailed b/usr/plugins/_kg_pipeline/.pytest_cache/v/cache/lastfailed new file mode 100644 index 0000000000..9e26dfeeb6 --- /dev/null +++ b/usr/plugins/_kg_pipeline/.pytest_cache/v/cache/lastfailed @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/usr/plugins/_kg_pipeline/.pytest_cache/v/cache/nodeids b/usr/plugins/_kg_pipeline/.pytest_cache/v/cache/nodeids new file mode 100644 index 0000000000..6c6429f534 --- /dev/null +++ b/usr/plugins/_kg_pipeline/.pytest_cache/v/cache/nodeids @@ -0,0 +1,90 @@ +[ + "tests/test_audit_chain.py::TestAuditChain::test_append_creates_file", + "tests/test_audit_chain.py::TestAuditChain::test_append_multiple_events", + "tests/test_audit_chain.py::TestAuditChain::test_daily_rotation", + "tests/test_audit_chain.py::TestAuditChain::test_disabled_is_noop", + "tests/test_audit_chain.py::TestAuditChain::test_query_by_action", + "tests/test_audit_chain.py::TestAuditChain::test_query_by_since", + "tests/test_audit_chain.py::TestAuditChain::test_query_by_source", + "tests/test_audit_chain.py::TestAuditChain::test_stats", + "tests/test_checkpoint.py::TestCheckpoint::test_atomic_write", + "tests/test_checkpoint.py::TestCheckpoint::test_clear_removes_file", + "tests/test_checkpoint.py::TestCheckpoint::test_concurrent_workers", + "tests/test_checkpoint.py::TestCheckpoint::test_save_and_load", + "tests/test_checkpoint.py::TestCheckpoint::test_stale_detection", + "tests/test_entity_resolver.py::TestConfigDefaults::test_default_config", + "tests/test_entity_resolver.py::TestFindCandidates::test_find_candidates_empty_kg", + "tests/test_entity_resolver.py::TestFindCandidates::test_find_candidates_excludes_same_name", + "tests/test_entity_resolver.py::TestFindCandidates::test_find_candidates_filter_by_type", + "tests/test_entity_resolver.py::TestFindCandidates::test_find_candidates_groups_by_type", + 
"tests/test_entity_resolver.py::TestFindCandidates::test_find_candidates_returns_dicts", + "tests/test_entity_resolver.py::TestFullPipeline::test_run_candidates_stage", + "tests/test_entity_resolver.py::TestFullPipeline::test_run_full_stage", + "tests/test_entity_resolver.py::TestFullPipeline::test_run_unknown_stage", + "tests/test_entity_resolver.py::TestJaroWinkler::test_jaro_winkler_different", + "tests/test_entity_resolver.py::TestJaroWinkler::test_jaro_winkler_empty", + "tests/test_entity_resolver.py::TestJaroWinkler::test_jaro_winkler_identical", + "tests/test_entity_resolver.py::TestJaroWinkler::test_jaro_winkler_identical_lowercase", + "tests/test_entity_resolver.py::TestJaroWinkler::test_jaro_winkler_partial", + "tests/test_entity_resolver.py::TestJaroWinkler::test_jaro_winkler_similar", + "tests/test_entity_resolver.py::TestJaroWinkler::test_jaro_winkler_single_char", + "tests/test_entity_resolver.py::TestJaroWinkler::test_jaro_winkler_typo", + "tests/test_entity_resolver.py::TestLLMVerifyPair::test_llm_verify_pair_checks_reasoning_content", + "tests/test_entity_resolver.py::TestLLMVerifyPair::test_llm_verify_pair_extracts_confidence", + "tests/test_entity_resolver.py::TestLLMVerifyPair::test_llm_verify_pair_handles_failure", + "tests/test_entity_resolver.py::TestLLMVerifyPair::test_llm_verify_pair_makes_request", + "tests/test_entity_resolver.py::TestLLMVerifyPair::test_llm_verify_pair_parses_no", + "tests/test_entity_resolver.py::TestLLMVerifyPair::test_llm_verify_pair_parses_yes", + "tests/test_entity_resolver.py::TestMergeLogic::test_merge_dry_run_no_changes", + "tests/test_entity_resolver.py::TestMergeLogic::test_merge_keeps_higher_degree_entity", + "tests/test_entity_resolver.py::TestMergeLogic::test_merge_logs_to_audit", + "tests/test_entity_resolver.py::TestMergeLogic::test_merge_skips_missing_degrees", + "tests/test_entity_resolver.py::TestTokenOverlap::test_token_overlap_acronym", + 
"tests/test_entity_resolver.py::TestTokenOverlap::test_token_overlap_empty", + "tests/test_entity_resolver.py::TestTokenOverlap::test_token_overlap_full", + "tests/test_entity_resolver.py::TestTokenOverlap::test_token_overlap_high", + "tests/test_entity_resolver.py::TestTokenOverlap::test_token_overlap_low", + "tests/test_entity_resolver.py::TestTokenOverlap::test_token_overlap_partial", + "tests/test_entity_resolver.py::TestVerifyCandidates::test_verify_candidates_filters_by_verdict", + "tests/test_entity_resolver.py::TestVerifyCandidates::test_verify_candidates_skips_on_no_verdict", + "tests/test_entity_resolver.py::TestVerifyCandidates::test_verify_candidates_with_llm_disabled", + "tests/test_health_scorer.py::TestAssignTier::test_assign_tier_cold", + "tests/test_health_scorer.py::TestAssignTier::test_assign_tier_cool", + "tests/test_health_scorer.py::TestAssignTier::test_assign_tier_hot", + "tests/test_health_scorer.py::TestAssignTier::test_assign_tier_warm", + "tests/test_health_scorer.py::TestCacheInvalidation::test_cache_invalidation", + "tests/test_health_scorer.py::TestComputeScore::test_compute_score_returns_dimensions", + "tests/test_health_scorer.py::TestCriticalEntities::test_critical_entities", + "tests/test_health_scorer.py::TestScoreConnectivity::test_score_connectivity_high", + "tests/test_health_scorer.py::TestScoreConnectivity::test_score_connectivity_max_zero", + "tests/test_health_scorer.py::TestScoreConnectivity::test_score_connectivity_zero", + "tests/test_health_scorer.py::TestScoreRanges::test_all_scores_in_range", + "tests/test_health_scorer.py::TestScoreRecency::test_score_recency_empty_string", + "tests/test_health_scorer.py::TestScoreRecency::test_score_recency_none", + "tests/test_health_scorer.py::TestScoreRecency::test_score_recency_recent", + "tests/test_health_scorer.py::TestScoreRecency::test_score_recency_stale", + "tests/test_health_scorer.py::TestScoreSourceQuality::test_score_source_quality_high", + 
"tests/test_health_scorer.py::TestScoreSourceQuality::test_score_source_quality_low", + "tests/test_health_scorer.py::TestTierDistribution::test_tier_distribution", + "tests/test_token_compressor.py::TestTokenCompressor::test_cache_hit_returns_cached", + "tests/test_token_compressor.py::TestTokenCompressor::test_cache_miss_saves_to_cache", + "tests/test_token_compressor.py::TestTokenCompressor::test_composite_compression_steps", + "tests/test_token_compressor.py::TestTokenCompressor::test_compress_cleans_urls", + "tests/test_token_compressor.py::TestTokenCompressor::test_compress_collapses_whitespace", + "tests/test_token_compressor.py::TestTokenCompressor::test_compress_disabled", + "tests/test_token_compressor.py::TestTokenCompressor::test_compress_handles_urls_without_params", + "tests/test_token_compressor.py::TestTokenCompressor::test_compress_preserves_content", + "tests/test_token_compressor.py::TestTokenCompressor::test_compress_removes_author_byline", + "tests/test_token_compressor.py::TestTokenCompressor::test_compress_removes_duplicate_lines", + "tests/test_token_compressor.py::TestTokenCompressor::test_compress_removes_share_buttons", + "tests/test_token_compressor.py::TestTokenCompressor::test_compress_short_content", + "tests/test_token_compressor.py::TestTokenCompressor::test_compress_stats", + "tests/test_token_compressor.py::TestTokenCompressor::test_compress_strips_non_ascii", + "tests/test_token_compressor.py::TestTokenCompressor::test_compressor_initialization", + "tests/test_token_compressor.py::TestTokenCompressor::test_llm_fallback_on_error", + "tests/test_token_compressor.py::TestTokenCompressor::test_llm_summarize_called_for_large_content", + "tests/test_token_compressor.py::TestTokenCompressor::test_llm_summarize_not_called_for_small_content", + "tests/test_token_compressor.py::TestTokenCompressor::test_real_world_sample", + "tests/test_token_compressor.py::TestTokenCompressor::test_smart_truncate_prefers_entity_rich", + 
"tests/test_token_compressor.py::TestTokenCompressor::test_smart_truncate_under_limit_unchanged" +] \ No newline at end of file diff --git a/usr/plugins/_kg_pipeline/README.md b/usr/plugins/_kg_pipeline/README.md new file mode 100644 index 0000000000..f3d7f75fa6 --- /dev/null +++ b/usr/plugins/_kg_pipeline/README.md @@ -0,0 +1,167 @@ +# _kg_pipeline Plugin + +**Knowledge Graph Batch Pipeline** — Consolidated batch operations for Agent Zero's Knowledge Graph system. + +## Overview + +This plugin provides the complete ingestion, quality, and maintenance pipeline for the Knowledge Graph: + +- **Ingestion**: Single file, bulk, Elastic KB, and parallel chunk-based ingestion +- **Crash Recovery**: Atomic checkpoint saves for worker resilience +- **Audit Trail**: Append-only JSONL write provenance +- **Token Compression**: Regex + LLM summarization + content cache +- **Health Scoring**: 5-dimension entity quality scoring with tier assignment +- **Entity Resolution**: String similarity + LLM verification deduplication +- **Quality Audit**: Retrieval precision and entity coverage measurement +- **Enrichment**: Entity enrichment and orphan connection + +## Structure + +``` +/a0/usr/plugins/_kg_pipeline/ +├── plugin.yaml # Plugin metadata +├── default_config.yaml # All configuration +├── README.md # This file +│ +├── pipeline/ +│ ├── __init__.py # Package exports +│ ├── kg_client.py # Shared HTTP client for KG service +│ ├── ingester.py # Single/bulk file ingestion +│ ├── elastic_ingester.py # Elastic KB ingestion +│ ├── parallel_worker.py # Chunk-based parallel processing +│ ├── checkpoint.py # Crash recovery checkpoints +│ ├── audit_chain.py # Append-only write provenance +│ ├── token_compressor.py # Regex + LLM content compression +│ ├── health_scorer.py # Entity health scoring + tiers +│ ├── entity_resolver.py # String + LLM entity dedup +│ ├── orphan_connector.py # Orphan entity connection +│ ├── extractor.py # Entity extraction +│ ├── enricher.py # Entity enrichment +│ 
├── auditor.py # Retrieval quality audit +│ ├── knowledge_archiver.py # KG file archival +│ ├── knowledge_ingester.py # Knowledge directory ingestion +│ ├── gdrive.py # Google Drive upload +│ └── phase2_ingest.py # Phase 2 ingestion +│ +├── tools/ +│ ├── __init__.py +│ └── kg_pipeline.py # Main tool with sub-methods +│ +├── tests/ +│ ├── __init__.py +│ ├── test_checkpoint.py # 5 tests +│ ├── test_audit_chain.py # 8 tests +│ ├── test_token_compressor.py # 21 tests +│ ├── test_health_scorer.py # 18 tests +│ └── test_entity_resolver.py # 36 tests +│ +└── prompts/ + └── agent.system.tool.kg_pipeline.md +``` + +## Tool Methods + +| Method | Description | Source | +|--------|-------------|--------| +| `status` | Check KG service health and counts | Existing | +| `ingest` | Ingest single file or directory | Existing | +| `bulk_ingest` | Bulk ingest with deduplication | Existing | +| `elastic_ingest` | Elastic KB ingestion | Existing | +| `parallel_ingest` | Chunk-based parallel processing | Existing | +| `connect_orphans` | Connect orphan entities | Existing | +| `enrich` | Enrich entities with domain/categories | Existing | +| `audit` | Retrieval quality audit | Existing | +| `knowledge_ingest` | Knowledge directory ingestion | Existing | +| `gdrive_upload` | Export KG to Google Drive | Existing | +| `health` | Entity health scores and tier distribution | **New** | +| `resolve_entities` | Entity resolution (candidates/verify/merge) | **New** | + +## Pipeline Modules Added (2026-05-20) + +### checkpoint.py — Crash Recovery +Atomic checkpoint saves for parallel workers. On crash, workers resume from last checkpoint instead of restarting. + +- `save_checkpoint()`, `load_checkpoint()`, `clear_checkpoint()`, `list_stale_checkpoints()` +- Atomic writes via `os.replace()`, 24h stale detection +- Integrated into `parallel_worker.py` + +### audit_chain.py — Write Provenance +Append-only JSONL audit trail for all KG write operations. 
+ +- `append()`, `query()`, `get_stats()` +- One file per day: `kg_audit_YYYY-MM-DD.jsonl` +- Kill switch: `audit.enabled: false` → all calls become no-ops +- Integrated into `ingester.py`, `elastic_ingester.py`, `parallel_worker.py` + +### token_compressor.py — Content Compression +Reduces token usage by stripping boilerplate, LLM summarization, and caching. + +- Regex: social buttons, bylines, whitespace, URL tracking params, duplicate lines +- LLM: Qwen3.6-35B on Mediaserver for files >30K chars +- Cache: MD5-based content hash cache with 7-day TTL +- Smart truncate: entity-aware paragraph selection +- ~29% reduction on Elastic blog content, ~40%+ on large files + +### health_scorer.py — Entity Quality Scoring +Multi-dimensional health scoring with memory tier assignment. + +- 5 dimensions: Connectivity (35%), Recency (20%), Source Quality (20%), Freshness (15%), Confidence (10%) +- Tiers: hot (≥0.7), warm (≥0.5), cool (≥0.3), cold (<0.3) +- 24h cache, KuzuDB-compatible Cypher + +### entity_resolver.py — Entity Deduplication +3-stage pipeline for finding and merging duplicate entities. + +- Stage 1: String blocking (Jaro-Winkler + token overlap) +- Stage 2: LLM verification (Qwen3.6-35B on Mediaserver) +- Stage 3: Safe merge (higher-degree canonical, DETACH DELETE, audit logging) +- Dry-run default, KuzuDB-compatible + +## Configuration + +All settings in `default_config.yaml`: + +```yaml +kg_service_url: "http://100.78.79.41:8010" + +audit: + enabled: true + retention_days: 90 + +compression: + enabled: true + llm_enabled: true + llm_threshold_chars: 30000 + cache_enabled: true + cache_ttl_days: 7 + +health_scoring: + enabled: true + cache_ttl_hours: 24 + tier_thresholds: { hot: 0.7, warm: 0.5, cool: 0.3 } + +entity_resolution: + enabled: true + string_threshold: 0.80 + llm_verify: true + dry_run_default: true +``` + +## Testing + +```bash +cd /a0/usr/plugins/_kg_pipeline +python3 -m pytest tests/ -v +``` + +88 tests across 5 test files. 
All must pass before any merge to main. + +## Dependencies + +- `requests` — KG service HTTP client +- `numpy` — Health scoring calculations +- LLM endpoint: Qwen3.6-35B on Mediaserver (192.168.1.250:11435) + +## License + +Part of Agent Zero system. diff --git a/usr/plugins/_kg_pipeline/default_config.yaml b/usr/plugins/_kg_pipeline/default_config.yaml new file mode 100644 index 0000000000..c878323c9f --- /dev/null +++ b/usr/plugins/_kg_pipeline/default_config.yaml @@ -0,0 +1,44 @@ +kg_service_url: "http://100.78.79.41:8010" +batch_size: 50 +timeout: 300 +max_retries: 3 +retry_delay: 1.0 +parallel_workers: 3 +elastic_kb_dir: "/a0/usr/workdir/elastic_kb" +knowledge_dir: "/a0/usr/knowledge" +ingest_state_file: "/a0/usr/workdir/logs/kg_ingest_state.json" +log_dir: "/a0/usr/workdir/logs" +chunk_dir: "/a0/usr/workdir/config" +orphan_state_file: "/a0/usr/workdir/state/kg_orphan_state.json" +enrichment_state_file: "/a0/usr/workdir/logs/kg_enrichment_state.json" +llm_api_url: "http://192.168.1.245:8000/v1" +llm_model: "default" + +audit: + enabled: true + retention_days: 90 + +compression: + enabled: true + min_reduction_pct: 10 # warn if less than 10% reduction + llm_enabled: true + llm_threshold_chars: 30000 + llm_max_output_tokens: 4096 + cache_enabled: true + cache_ttl_days: 7 + +health_scoring: + enabled: true + cache_ttl_hours: 24 + tier_thresholds: + hot: 0.7 + warm: 0.5 + cool: 0.3 + +entity_resolution: + enabled: true + string_threshold: 0.80 + token_overlap_threshold: 0.60 + llm_verify: true + batch_size: 10 + dry_run_default: true diff --git a/usr/plugins/_kg_pipeline/pipeline/__init__.py b/usr/plugins/_kg_pipeline/pipeline/__init__.py new file mode 100644 index 0000000000..57b1fbf7c2 --- /dev/null +++ b/usr/plugins/_kg_pipeline/pipeline/__init__.py @@ -0,0 +1,27 @@ +"""KG Pipeline Helpers - Consolidated batch operations for Knowledge Graph.""" + +from .kg_client import KGClient +from .ingester import Ingester +from .elastic_ingester import ElasticIngester 
+from .parallel_worker import ParallelWorker +from .orphan_connector import OrphanConnector +from .extractor import KGExtractor +from .enricher import EntityEnricher +from .auditor import KGAuditor +from .gdrive import KGDriveUploader +from .health_scorer import HealthScorer +from .entity_resolver import EntityResolver + +__all__ = [ + "KGClient", + "Ingester", + "ElasticIngester", + "ParallelWorker", + "OrphanConnector", + "KGExtractor", + "EntityEnricher", + "KGAuditor", + "KGDriveUploader", + "HealthScorer", + "EntityResolver", +] diff --git a/usr/plugins/_kg_pipeline/pipeline/audit_chain.py b/usr/plugins/_kg_pipeline/pipeline/audit_chain.py new file mode 100644 index 0000000000..f7a31e58cb --- /dev/null +++ b/usr/plugins/_kg_pipeline/pipeline/audit_chain.py @@ -0,0 +1,193 @@ +"""Append-only audit trail for KG write operations. + +Provides write provenance tracking for all KG mutations (entity adds, +relationship creates, janitor runs) with daily JSONL files. + +Upgrade path: Hash chaining (SHA-256 chain linking events) documented +but not implemented per TC decision. See comments in append(). +""" +import json +import os +import logging +from datetime import datetime, date, timezone +from typing import Dict, List, Optional + +logger = logging.getLogger(__name__) + + +class AuditChain: + """Append-only audit trail for KG write operations.""" + + def __init__(self, audit_dir: str, enabled: bool = True): + """Initialize audit chain. + + Args: + audit_dir: Directory for audit JSONL files (one per day). + enabled: Set False to disable audit (all calls become no-ops). 
+ """ + self.audit_dir = audit_dir + self.enabled = enabled + if self.enabled: + os.makedirs(self.audit_dir, exist_ok=True) + + def _today_file(self) -> str: + """Get the JSONL file path for today.""" + today = date.today().isoformat() + return os.path.join(self.audit_dir, f"kg_audit_{today}.jsonl") + + def _file_for_date(self, dt: date) -> str: + """Get the JSONL file path for a specific date.""" + return os.path.join( + self.audit_dir, f"kg_audit_{dt.isoformat()}.jsonl" + ) + + def append( + self, + action: str, + target_type: str, + target_id: str, + source: str, + metadata: Optional[Dict] = None, + ) -> None: + """Append an audit event to today's JSONL file. + + Args: + action: One of 'add', 'update', 'delete', 'merge', 'janitor'. + target_type: One of 'entity', 'relationship', 'document'. + target_id: Source path or document ID. + source: Module/function that triggered the write. + metadata: Optional dict with entity_count, rel_count, domain. + """ + if not self.enabled: + return + + event = { + "timestamp": datetime.now(timezone.utc).isoformat(), + "action": action, + "target_type": target_type, + "target_id": target_id, + "source": source, + "metadata": metadata or {}, + } + # Upgrade path: add hash chaining here: + # event["prev_hash"] = self._last_hash() + # event["hash"] = self._hash_event(event) + + filepath = self._today_file() + try: + with open(filepath, "a", encoding="utf-8") as f: + f.write(json.dumps(event, ensure_ascii=False) + "\n") + except OSError as exc: + logger.error("Audit write failed: %s", exc) + + def query( + self, + action: Optional[str] = None, + source: Optional[str] = None, + since: Optional[str] = None, + limit: int = 100, + ) -> List[Dict]: + """Query audit events with optional filters. + + Args: + action: Filter by action type. + source: Filter by source module. + since: ISO timestamp to filter from. + limit: Max results to return. + + Returns: + List of matching audit events (newest first within limit). 
+ """ + if not self.enabled or not os.path.isdir(self.audit_dir): + return [] + + results: List[Dict] = [] + files = sorted( + f for f in os.listdir(self.audit_dir) + if f.startswith("kg_audit_") and f.endswith(".jsonl") + ) + # Read files in reverse chronological order + for filename in reversed(files): + filepath = os.path.join(self.audit_dir, filename) + try: + with open(filepath, "r", encoding="utf-8") as f: + for line in f: + line = line.strip() + if not line: + continue + try: + event = json.loads(line) + except json.JSONDecodeError: + continue + if action and event.get("action") != action: + continue + if source and event.get("source") != source: + continue + if since and event.get("timestamp", "") < since: + continue + results.append(event) + if len(results) >= limit: + return results + except OSError: + continue + return results + + def get_stats(self) -> Dict: + """Get audit statistics. + + Returns: + Dict with total_events, events_by_action, file_count, + total_size_bytes, and file_details list. + """ + if not self.enabled or not os.path.isdir(self.audit_dir): + return { + "enabled": self.enabled, + "total_events": 0, + "events_by_action": {}, + "file_count": 0, + "total_size_bytes": 0, + "file_details": [], + } + + total_events = 0 + by_action: Dict[str, int] = {} + total_size = 0 + details: List[Dict] = [] + + for filename in sorted(os.listdir(self.audit_dir)): + if not (filename.startswith("kg_audit_") + and filename.endswith(".jsonl")): + continue + filepath = os.path.join(self.audit_dir, filename) + size = os.path.getsize(filepath) + total_size += size + count = 0 + try: + with open(filepath, "r", encoding="utf-8") as f: + for line in f: + if not line.strip(): + continue + try: + event = json.loads(line) + count += 1 + act = event.get("action", "unknown") + by_action[act] = by_action.get(act, 0) + 1 + except json.JSONDecodeError: + count += 1 + except OSError: + pass + total_events += count + details.append({ + "file": filename, + "events": count, + "size_bytes": size, + }) + + return 
{ + "enabled": self.enabled, + "total_events": total_events, + "events_by_action": by_action, + "file_count": len(details), + "total_size_bytes": total_size, + "file_details": details, + } diff --git a/usr/plugins/_kg_pipeline/pipeline/auditor.py b/usr/plugins/_kg_pipeline/pipeline/auditor.py new file mode 100644 index 0000000000..ec723f0412 --- /dev/null +++ b/usr/plugins/_kg_pipeline/pipeline/auditor.py @@ -0,0 +1,242 @@ +"""Retrieval audit helper for KG quality measurement.""" +import os +import json +import random +import logging +from datetime import datetime, timedelta +from collections import Counter +from typing import Dict, List, Optional, Any + +from .kg_client import KGClient + +logger = logging.getLogger(__name__) + + +class KGAuditor: + """Audit the Knowledge Graph for quality metrics.""" + + def __init__(self, kg_client: KGClient, config: Dict[str, Any]): + self.kg = kg_client + self.config = config + self.report = { + "timestamp": datetime.now().isoformat(), + "health": {}, + "entity_stats": {}, + "retrieval_precision": {}, + "staleness": {}, + "gaps": {}, + "recommendations": [] + } + + def check_health(self) -> bool: + """Check KG service health.""" + try: + health = self.kg.health_check() + self.report["health"] = { + "status": health.get("status"), + "version": health.get("version"), + "vectors": health.get("vectors_count"), + } + return True + except Exception as e: + self.report["health"]["error"] = str(e) + return False + + def analyze_entities(self, sample_size: int = 500) -> None: + """Analyze entity coverage and tag quality.""" + try: + result = self.kg.get_entities(limit=1) + total = result.get("total", 0) + + # Sample entities + entities = [] + if total > 0: + sample_offsets = [random.randint(0, max(0, total - sample_size)) + for _ in range(max(1, min(10, total // 50)))] + for offset in sample_offsets: + batch = self.kg.get_entities(offset=offset, limit=50) + entities.extend(batch.get("entities", [])) + + # Analyze + no_tags = single_tag = 
multi_tag = rich_tag = 0 + cat_counter = Counter() + domain_counter = Counter() + + now = datetime.now() + stale_count = 0 + stale_threshold = timedelta(days=30) + + for e in entities: + cats = (e.get("categories", "") or "").strip() + cat_list = [c.strip() for c in cats.split(",") if c.strip()] + + if len(cat_list) == 0: + no_tags += 1 + elif len(cat_list) == 1: + single_tag += 1 + elif len(cat_list) == 2: + multi_tag += 1 + else: + rich_tag += 1 + + for c in cat_list: + cat_counter[c] += 1 + domain_counter[e.get("domain", "unknown")] += 1 + + # Staleness + last_seen = e.get("last_seen") + if last_seen: + try: + ls = datetime.fromisoformat(last_seen.replace("Z", "+00:00")) + if now - ls.astimezone().replace(tzinfo=None) > stale_threshold: + stale_count += 1 + except (ValueError, TypeError, AttributeError, OSError, OverflowError): + pass + + self.report["entity_stats"] = { + "total": total, + "sampled": len(entities), + "tag_coverage": { + "no_tags": no_tags, + "no_tags_pct": round(no_tags / len(entities) * 100, 1) if entities else 0, + "single_tag": single_tag, + "single_tag_pct": round(single_tag / len(entities) * 100, 1) if entities else 0, + "multi_tag": multi_tag, + "multi_tag_pct": round(multi_tag / len(entities) * 100, 1) if entities else 0, + "rich_tag_3plus": rich_tag, + "rich_tag_pct": round(rich_tag / len(entities) * 100, 1) if entities else 0, + }, + "top_categories": cat_counter.most_common(20), + "domain_distribution": dict(domain_counter), + "stale_entities": stale_count, + "stale_pct": round(stale_count / len(entities) * 100, 1) if entities else 0 + } + + except Exception as e: + logger.error(f"Failed to analyze entities: {e}") + + def test_retrieval_precision(self, sample: bool = True) -> None: + """Test retrieval precision@10 with domain-specific queries.""" + test_queries = [ + {"query": "docker container orchestration", "expected": ["technology", "devops", "docker"]}, + {"query": "large language model inference", "expected": ["technology", "ai-ml", "llm"]}, + {"query": "vector database search engine", "expected": 
["technology", "database"]}, + {"query": "sales territory pipeline", "expected": ["work", "sales", "territory"]}, + {"query": "state government education SLED", "expected": ["work", "sled", "government"]}, + {"query": "self-hosting home lab", "expected": ["personal", "home-lab"]}, + {"query": "bookmark web tutorial", "expected": ["personal", "bookmark"]}, + {"query": "open source AI tools", "expected": ["technology", "ai-ml", "open-source"]}, + {"query": "security authentication encryption", "expected": ["technology", "security"]}, + {"query": "monitoring observability logging", "expected": ["technology", "monitoring", "devops"]}, + ] + + if not sample: + # Full test with more queries + test_queries.extend([ + {"query": "python programming framework", "expected": ["technology", "programming"]}, + {"query": "cloud AWS serverless", "expected": ["technology", "cloud", "aws"]}, + ]) + + precision_scores = [] + query_results = [] + + for tq in test_queries: + try: + result = self.kg.search(tq["query"], limit=10) + entities = result.get("entities", []) + + relevant = sum(1 for e in entities + if any(exp in str(e.get("domain", "")).lower() + or exp in str(e.get("categories", "")).lower() + for exp in tq["expected"])) + + precision = relevant / max(len(entities), 1) + precision_scores.append(precision) + + query_results.append({ + "query": tq["query"], + "precision": round(precision, 3), + "relevant": relevant, + "total": len(entities) + }) + + except Exception as e: + query_results.append({"query": tq["query"], "error": str(e)}) + + avg_p = sum(precision_scores) / len(precision_scores) if precision_scores else 0 + self.report["retrieval_precision"] = { + "avg_precision_at_10": round(avg_p, 3), + "queries_tested": len(query_results), + "per_query": query_results + } + + def check_gaps(self) -> None: + """Detect gaps: orphan entities, missing relationships.""" + try: + orphans = self.kg.get_orphans(limit=100) + + # Sample entities for low-confidence check + sample = 
self.kg.get_entities(limit=100) + entities = sample.get("entities", []) + + low_confidence = [e for e in entities if e.get("confidence", 1.0) < 0.5] + no_domain = [e for e in entities if not e.get("domain")] + + self.report["gaps"] = { + "orphan_count": len(orphans), + "low_confidence": len(low_confidence), + "no_domain": len(no_domain) + } + + except Exception as e: + logger.error(f"Failed to check gaps: {e}") + + def generate_recommendations(self) -> None: + """Generate recommendations based on audit results.""" + recs = [] + + # Retrieval-based + avg_p10 = self.report.get("retrieval_precision", {}).get("avg_precision_at_10", 0) + if avg_p10 < 0.3: + recs.append({"priority": "HIGH", "area": "Retrieval", + "finding": f"Low precision@10 ({avg_p10:.3f})", + "action": "Consider pruning stale entities"}) + elif avg_p10 < 0.6: + recs.append({"priority": "MEDIUM", "area": "Retrieval", + "finding": f"Moderate precision@10 ({avg_p10:.3f})", + "action": "Review tag quality"}) + + # Staleness + stale_pct = self.report.get("entity_stats", {}).get("stale_pct", 0) + if stale_pct > 20: + recs.append({"priority": "HIGH", "area": "Staleness", + "finding": f"{stale_pct}% entities are stale", + "action": "Archive or re-tag stale entities"}) + + self.report["recommendations"] = recs + + def run_audit(self, sample: bool = True) -> Dict[str, Any]: + """Run full audit.""" + if not self.check_health(): + return {"status": "error", "message": "KG service not healthy"} + + self.analyze_entities(sample_size=200 if sample else 500) + self.test_retrieval_precision(sample=sample) + self.check_gaps() + self.generate_recommendations() + + return self.report + + def save_report(self, path: Optional[str] = None) -> str: + """Save audit report to file.""" + if path is None: + log_dir = self.config.get("log_dir", "/a0/usr/workdir/logs") + audit_dir = os.path.join(log_dir, "kg_audit") + os.makedirs(audit_dir, exist_ok=True) + ts = datetime.now().strftime("%Y%m%d_%H%M%S") + path = 
os.path.join(audit_dir, f"kg_audit_{ts}.json") + + with open(path, "w") as f: + json.dump(self.report, f, indent=2, default=str) + + return path diff --git a/usr/plugins/_kg_pipeline/pipeline/checkpoint.py b/usr/plugins/_kg_pipeline/pipeline/checkpoint.py new file mode 100644 index 0000000000..19f5a6b778 --- /dev/null +++ b/usr/plugins/_kg_pipeline/pipeline/checkpoint.py @@ -0,0 +1,167 @@ +"""Crash recovery checkpoints for KG pipeline workers. + +Provides atomic save/load/clear operations for worker state, +enabling resume-after-crash for parallel ingestion chunks. +""" +import json +import os +import logging +from datetime import datetime, timezone, timedelta +from typing import Dict, List, Optional + +logger = logging.getLogger(__name__) + +STATE_DIR = "/a0/usr/workdir/state/kg_checkpoints" +CHECKPOINT_PATTERN = "worker_{worker_id}_chunk_{chunk_index}.json" +STALE_TTL_HOURS = 24 + + +def _checkpoint_path(worker_id: int, chunk_index: int) -> str: + """Return absolute path for a worker checkpoint file.""" + filename = CHECKPOINT_PATTERN.format( + worker_id=worker_id, chunk_index=chunk_index + ) + return os.path.join(STATE_DIR, filename) + + +def _ensure_state_dir() -> None: + """Create state directory if it does not exist.""" + os.makedirs(STATE_DIR, exist_ok=True) + + +def save_checkpoint( + worker_id: int, + chunk_index: int, + processed: List[str], + total: int, + failed: List[dict], + stats: Dict, +) -> Dict: + """Atomically save a checkpoint for a worker chunk. + + Writes to a temp file first, then uses os.replace() for + POSIX-atomic rename. Returns the saved checkpoint dict. + + Args: + worker_id: Worker identifier (0-based). + chunk_index: Chunk index being processed. + processed: List of basenames successfully pushed. + total: Total files in this chunk. + failed: List of {file, error} dicts. + stats: Dict with pushed/failed/skipped counts. + + Returns: + The full checkpoint dictionary that was persisted. 
+ """ + _ensure_state_dir() + now = datetime.now(timezone.utc).isoformat() + checkpoint = { + "worker_id": worker_id, + "chunk_index": chunk_index, + "processed_files": processed, + "processed_count": len(processed), + "total_files": total, + "failed_files": failed, + "stats": stats, + "last_checkpoint": now, + } + target = _checkpoint_path(worker_id, chunk_index) + tmp_path = target + ".tmp" + try: + with open(tmp_path, "w", encoding="utf-8") as fh: + json.dump(checkpoint, fh, indent=2) + os.replace(tmp_path, target) + except OSError as exc: + logger.error("Checkpoint write failed W%d C%d: %s", + worker_id, chunk_index, exc) + if os.path.exists(tmp_path): + os.remove(tmp_path) + raise + return checkpoint + + +def load_checkpoint(worker_id: int, chunk_index: int) -> Optional[Dict]: + """Load a checkpoint for a worker chunk. + + Returns None if no checkpoint exists or the file is corrupt. + + Args: + worker_id: Worker identifier. + chunk_index: Chunk index. + + Returns: + Checkpoint dict, or None if not found / invalid. + """ + target = _checkpoint_path(worker_id, chunk_index) + if not os.path.exists(target): + return None + try: + with open(target, "r", encoding="utf-8") as fh: + data = json.load(fh) + return data + except (json.JSONDecodeError, OSError) as exc: + logger.warning( + "Corrupt checkpoint W%d C%d, ignoring: %s", + worker_id, chunk_index, exc, + ) + return None + + +def clear_checkpoint(worker_id: int, chunk_index: int) -> None: + """Remove a checkpoint file after successful completion. + + Silently succeeds if the file does not exist. + + Args: + worker_id: Worker identifier. + chunk_index: Chunk index. 
+ """ + target = _checkpoint_path(worker_id, chunk_index) + try: + if os.path.exists(target): + os.remove(target) + logger.info("Checkpoint cleared W%d C%d", + worker_id, chunk_index) + except OSError as exc: + logger.warning("Could not clear checkpoint W%d C%d: %s", + worker_id, chunk_index, exc) + + +def list_stale_checkpoints( + ttl_hours: int = STALE_TTL_HOURS, +) -> List[Dict]: + """Return checkpoints older than the given TTL. + + Scans state directory, loads each checkpoint, and returns + those whose last_checkpoint timestamp exceeds ttl_hours. + + Args: + ttl_hours: Age threshold in hours (default 24). + + Returns: + List of checkpoint dicts that are stale. + """ + _ensure_state_dir() + cutoff = datetime.now(timezone.utc) - timedelta(hours=ttl_hours) + stale: List[Dict] = [] + + for fname in os.listdir(STATE_DIR): + if not fname.startswith("worker_") or not fname.endswith(".json"): + continue + fpath = os.path.join(STATE_DIR, fname) + try: + with open(fpath, "r", encoding="utf-8") as fh: + data = json.load(fh) + ts_str = data.get("last_checkpoint", "") + if not ts_str: + stale.append(data) + continue + ts = datetime.fromisoformat(ts_str) + if ts.tzinfo is None: + ts = ts.replace(tzinfo=timezone.utc) + if ts < cutoff: + stale.append(data) + except (json.JSONDecodeError, OSError, ValueError): + logger.warning("Skipping unreadable checkpoint: %s", fname) + + return stale diff --git a/usr/plugins/_kg_pipeline/pipeline/elastic_ingester.py b/usr/plugins/_kg_pipeline/pipeline/elastic_ingester.py new file mode 100644 index 0000000000..20edb1f5cd --- /dev/null +++ b/usr/plugins/_kg_pipeline/pipeline/elastic_ingester.py @@ -0,0 +1,150 @@ +"""Elastic KB ingestion helper.""" +import os +import time +import glob +import logging +from typing import Dict, List, Tuple, Optional, Any +from datetime import datetime + +from .kg_client import KGClient +from .ingester import Ingester +from .audit_chain import AuditChain + +logger = logging.getLogger(__name__) + + +class 
ElasticIngester: + """Ingests Elastic KB files into KG with domain mapping.""" + + DOMAIN_MAP = { + "products": "technology", + "pricing-licensing": "work", + "competitive": "work", + "customers": "work", + "blog": "context", + "security-labs": "technology", + "observability-labs": "technology", + "industries": "work", + "partners": "work", + "ai-emerging": "technology", + "training": "context", + "what-is-glossary": "context", + } + + def __init__(self, kg_client: KGClient, config: Dict[str, Any]): + self.kg = kg_client + self.config = config + self.elastic_kb_dir = config.get("elastic_kb_dir", "/a0/usr/workdir/elastic_kb") + self.log_dir = config.get("log_dir", "/a0/usr/workdir/logs") + audit_cfg = config.get("audit", {}) + audit_dir = os.path.join(self.log_dir, "kg_audit") + self.audit = AuditChain( + audit_dir=audit_cfg.get("audit_dir", audit_dir), + enabled=audit_cfg.get("enabled", True), + ) + + def _log(self, msg: str, level: str = "INFO") -> None: + """Log message with timestamp.""" + ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + line = f"[{ts}] [{level}] {msg}" + logger.info(msg) + log_file = os.path.join(self.log_dir, "kg_elastic_ingest.log") + os.makedirs(os.path.dirname(log_file), exist_ok=True) + with open(log_file, "a") as f: + f.write(line + "\n") + + def determine_domain(self, filepath: str) -> str: + """Determine domain from file path using category mapping.""" + rel = os.path.relpath(filepath, self.elastic_kb_dir) + category = rel.split("/")[0] if "/" in rel else rel + return self.DOMAIN_MAP.get(category, "context") + + def collect_files(self, category: Optional[str] = None) -> List[str]: + """Collect all .md files from Elastic KB directory.""" + files = [] + pattern = os.path.join(self.elastic_kb_dir, "**", "*.md") + for filepath in sorted(glob.glob(pattern, recursive=True)): + if category: + rel = os.path.relpath(filepath, self.elastic_kb_dir) + if not rel.startswith(category): + continue + files.append(filepath) + return files + + def 
ingest_files(self, file_list: List[str], kg_paths: set = None, + dry_run: bool = False) -> Tuple[int, int, int]: + """Ingest Elastic KB files with deduplication.""" + kg_paths = kg_paths or set() + pushed, failed, skipped = 0, 0, 0 + + kg_basenames = {p.split("/")[-1].replace(".md", "") for p in kg_paths} + start_time = time.time() + + for i, filepath in enumerate(file_list, 1): + rel_path = os.path.relpath(filepath, self.elastic_kb_dir) + basename = os.path.basename(filepath).replace(".md", "") + + if rel_path in kg_paths or basename in kg_basenames: + skipped += 1 + continue + + try: + with open(filepath, "r", encoding="utf-8") as f: + content = f.read() + + if len(content.strip()) < 100: + skipped += 1 + continue + + # Apply token compression before truncation + from .token_compressor import TokenCompressor + if not hasattr(self, 'compressor'): + self.compressor = TokenCompressor(self.config) + content = self.compressor.compress(content) + + # Truncate large files + if len(content) > 30000: + content = content[:30000] + "\n\n[... 
Content truncated ...]" + + domain = self.determine_domain(filepath) + + if dry_run: + pushed += 1 + if i % 100 == 0: + self._log(f"[{i}/{len(file_list)}] WOULD process {filepath}") + continue + + source = f"elastic_kb/{rel_path}" + result = self.kg.add_content(content, source, domain) + + if "error" in result: + failed += 1 + if i % 100 == 0: + self._log(f"[{i}/{len(file_list)}] FAILED: {filepath}") + else: + pushed += 1 + self.audit.append( + action="add", + target_type="document", + target_id=filepath, + source="elastic_ingester.ingest_files", + metadata={ + "domain": domain, + "elastic_source": source, + }, + ) + + if i % 100 == 0 or i == len(file_list): + elapsed = time.time() - start_time + rate = i / elapsed if elapsed > 0 else 0 + self._log(f"[{i}/{len(file_list)}] pushed={pushed} failed={failed} " + f"skipped={skipped} | {rate:.1f}/s") + + except Exception as e: + failed += 1 + if failed <= 10: + self._log(f"[{i}/{len(file_list)}] ERROR: {filepath} - {e}", "ERROR") + + time.sleep(0.2) + + return pushed, failed, skipped diff --git a/usr/plugins/_kg_pipeline/pipeline/enricher.py b/usr/plugins/_kg_pipeline/pipeline/enricher.py new file mode 100644 index 0000000000..95d47c8645 --- /dev/null +++ b/usr/plugins/_kg_pipeline/pipeline/enricher.py @@ -0,0 +1,256 @@ +"""Entity enrichment helper for retroactive enrichment and history enrichment.""" +import os +import json +import time +import logging +from datetime import datetime +from typing import Dict, List, Optional, Any +import requests + +from .kg_client import KGClient + + +logger = logging.getLogger(__name__) + + +class EntityEnricher: + """Enriches entities with domain and categories using LLM.""" + + LLM_TIMEOUT = 60 + + VALID_DOMAINS = ["technology", "work", "personal", "platform"] + KG_DOMAIN = "agent-history" + + def __init__(self, kg_client: KGClient, config: Dict[str, Any]): + self.kg = kg_client + self.config = config + self.llm_url = config.get("llm_api_url", "http://192.168.1.245:8000/v1") + 
self.llm_model = config.get("llm_model", "default") + self.state_file = config.get("enrichment_state_file", + "/a0/usr/workdir/logs/kg_enrichment_state.json") + self.log_dir = config.get("log_dir", "/a0/usr/workdir/logs") + self.session = requests.Session() + + def _log(self, msg: str, level: str = "INFO") -> None: + """Log message with timestamp.""" + ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + line = f"[{ts}] [{level}] {msg}" + logger.info(msg) + log_file = os.path.join(self.log_dir, "kg_enrichment.log") + os.makedirs(os.path.dirname(log_file), exist_ok=True) + with open(log_file, "a") as f: + f.write(line + "\n") + + def _load_state(self) -> Dict: + """Load enrichment state.""" + if self.state_file and os.path.exists(self.state_file): + try: + with open(self.state_file) as f: + return json.load(f) + except Exception as e: + logger.warning(f"Failed to load state: {e}") + return { + "phase": None, + "offset": 0, + "total_processed": 0, + "total_updated": 0, + "total_skipped": 0, + "total_errors": 0 + } + + def _save_state(self, state: Dict) -> None: + """Save enrichment state.""" + if self.state_file: + os.makedirs(os.path.dirname(self.state_file), exist_ok=True) + with open(self.state_file, "w") as f: + json.dump(state, f, indent=2) + + def _infer_categories(self, name: str, etype: str, current_domain: str, + current_cats: str) -> Dict[str, str]: + """Use LLM to infer domain and categories.""" + prompt = f"""Assign the BEST domain and categories to this entity. 
def enrich_entity(self, entity_id: str, name: str, etype: str,
                  current_domain: str, current_cats: str,
                  dry_run: bool = False) -> bool:
    """Infer a domain and categories for one entity and store them in the KG.

    The inference result comes from an LLM and is therefore validated before
    it is written:

    * the domain is kept only if it is a string listed in
      ``self.VALID_DOMAINS``; otherwise ``current_domain`` is retained;
    * categories may arrive as a comma-separated string or as a JSON list;
      each entry is stripped, lower-cased, has underscores turned into
      hyphens, and duplicates are dropped. If nothing usable remains,
      ``current_cats`` is retained.

    Args:
        entity_id: Identifier passed to ``self.kg.update_entity``.
        name: Entity name (used for inference and log messages).
        etype: Entity type forwarded to the inference call.
        current_domain: Domain currently stored for the entity.
        current_cats: Comma-separated categories currently stored.
        dry_run: When True, only log the proposed change.

    Returns:
        True when the update succeeded (or was a dry run), False when the
        KG update raised.
    """
    result = self._infer_categories(name, etype, current_domain, current_cats)

    # A non-string domain (e.g. a JSON list) must not reach the membership
    # test: it could be unhashable and would never be a valid domain anyway.
    new_domain = result.get("domain", current_domain)
    if not isinstance(new_domain, str) or new_domain not in self.VALID_DOMAINS:
        new_domain = current_domain

    raw_cats = result.get("categories", current_cats)
    if isinstance(raw_cats, str):
        raw_cats = raw_cats.split(",")
    elif not isinstance(raw_cats, (list, tuple)):
        raw_cats = []

    cleaned = []
    for cat in raw_cats:
        tag = str(cat).strip().lower().replace("_", "-")
        if tag and tag not in cleaned:
            cleaned.append(tag)
    new_cats = ",".join(cleaned) if cleaned else current_cats

    if dry_run:
        self._log(f"[DRY-RUN] {name}: {current_domain}/{current_cats} -> {new_domain}/{new_cats}")
        return True

    try:
        self.kg.update_entity(entity_id, {
            "domain": new_domain,
            "categories": new_cats
        })
        self._log(f"Updated '{name}': {new_domain}/{new_cats}")
        return True
    except Exception as e:
        logger.error(f"Update failed for '{name}': {e}")
        return False
def enrich_insight_bank(self, insight_bank: Dict) -> List[str]:
    """List frequently mentioned KG entities as promotion candidates.

    Runs one parameterized Cypher query for entities in ``self.KG_DOMAIN``
    with more than one mention (top 20 by mention count) and formats each
    named row as ``"<name> (<type>, seen <n>x)"``.

    Args:
        insight_bank: Accepted for interface compatibility; not read here.

    Returns:
        The formatted strings, or an empty list if the query or the row
        processing raises.
    """
    try:
        records = self.kg.query_cypher(
            'MATCH (e:Entity) WHERE e.domain = $domain AND e.mention_count > 1 '
            'RETURN e.name, e.type, e.mention_count '
            'ORDER BY e.mention_count DESC LIMIT 20',
            {"domain": self.KG_DOMAIN}
        )
        # Pull the three columns out first so every row is read the same way
        # before any filtering happens.
        columns = (
            (rec.get("e.name", ""), rec.get("e.mention_count", 0), rec.get("e.type", ""))
            for rec in records
        )
        return [
            f"{label} ({kind}, seen {seen}x)"
            for label, seen, kind in columns
            if seen >= 2 and label
        ]
    except Exception:
        return []
def __init__(self, kg_client: Any, config: Optional[Dict] = None):
    """Set up the resolver with a KG client and optional settings.

    Args:
        kg_client: Object used for all KG queries (stored as ``self.kg``).
        config: Optional settings dict. Keys read here:
            - llm_url: chat-completions endpoint used for verification
            - llm_model: model name sent with each verification request
            Other methods of this class read ``token_threshold``,
            ``llm_verify`` and ``llm_sleep`` from the same dict.
    """
    settings = config or {}
    self.kg = kg_client
    self.config = settings
    self.llm_url = settings.get(
        "llm_url", "http://192.168.1.250:11435/v1/chat/completions"
    )
    self.llm_model = settings.get(
        "llm_model", "Qwen3.6-35B-A3B-MTP-UD-Q5_K_XL.gguf"
    )
    # Accumulates one entry per merge performed by merge_duplicates().
    self.audit_log: List[Dict] = []
    # One shared HTTP session so verification calls reuse connections.
    self._llm_session = requests.Session()
Groups by entity type first (blocking strategy) to reduce + computational complexity. + + Args: + entity_type: Filter by type (None = all types) + similarity_threshold: Minimum Jaro-Winkler similarity (default 0.80) + + Returns: + List of candidate dicts with keys: + - name_a: First entity name + - name_b: Second entity name + - type: Entity type + - similarity: Jaro-Winkler similarity score + - token_overlap: Token overlap ratio + - method: Detection method used + """ + token_threshold = self.config.get("token_threshold", 0.60) + + logger.info(f"Finding candidates (type={entity_type}, threshold={similarity_threshold})") + + # Fetch entities from KG + entities = self._fetch_entities(entity_type) + + if not entities: + logger.warning("No entities found for resolution") + return [] + + logger.info(f"Loaded {len(entities)} entities for comparison") + + candidates = [] + + # Group by type for blocking + type_groups: Dict[str, List[Dict]] = {} + for ent in entities: + etype = ent.get("type", "unknown") + type_groups.setdefault(etype, []).append(ent) + + # Compare within each type group + for etype, group in type_groups.items(): + logger.debug(f"Processing type '{etype}' with {len(group)} entities") + + # Compare all pairs in this type + for ent_a, ent_b in combinations(group, 2): + name_a = ent_a.get("name", "") + name_b = ent_b.get("name", "") + + if not name_a or not name_b or name_a == name_b: + continue + + # Calculate similarities + jw_sim = self._jaro_winkler(name_a.lower(), name_b.lower()) + tok_overlap = self._token_overlap(name_a, name_b) + + # Check thresholds + if jw_sim >= similarity_threshold or tok_overlap >= token_threshold: + method = "jaro_winkler" if jw_sim >= similarity_threshold else "token_overlap" + candidates.append({ + "name_a": name_a, + "name_b": name_b, + "type": etype, + "similarity": round(jw_sim, 4), + "token_overlap": round(tok_overlap, 4), + "method": method, + }) + + # Sort by combined score (weighted toward Jaro-Winkler) + 
def merge_duplicates(
    self,
    duplicates: List[Dict],
    dry_run: bool = True,
) -> Dict[str, Any]:
    """Stage 3: Merge confirmed duplicates in the KG.

    For each pair the entity with the higher relationship count (ties go to
    ``name_a``) is kept as canonical and the other is handed to
    ``self._merge_pair`` as the duplicate. Every successful merge result is
    appended to ``self.audit_log`` with an ISO-8601 UTC timestamp.

    Args:
        duplicates: Verified duplicate pairs from verify_candidates(); each
            needs ``name_a`` and ``name_b``, optionally ``type`` and
            ``llm_confidence``.
        dry_run: If True, report what would happen without executing.

    Returns:
        Dict with keys:
            - merged: Count of pairs whose merge result reported success
            - skipped: Count of skipped/invalid/failed pairs
            - dry_run: Whether this was a dry run
            - details: List of per-merge detail dicts
            - audit_log: The resolver's full audit trail (same list object
              as ``self.audit_log``)
    """
    # Local import: the surrounding module imports `time` but not `datetime`,
    # and `time` has no isoformat()/utcnow(), so the previous expression
    # always degraded to an epoch-seconds string.
    from datetime import datetime, timezone

    if dry_run:
        logger.info("DRY RUN MODE - No changes will be made to KG")

    merged_count = 0
    skipped_count = 0
    details: List[Dict[str, Any]] = []

    logger.info(f"Processing {len(duplicates)} duplicate pairs for merge")

    for dup in duplicates:
        name_a = dup.get("name_a")
        name_b = dup.get("name_b")

        # A pair without two distinct names is invalid; merging a name with
        # itself would delete the only copy of the entity.
        if not name_a or not name_b or name_a == name_b:
            logger.warning(f"Invalid duplicate pair {dup!r}, skipping")
            skipped_count += 1
            continue

        entity_type = dup.get("type", "unknown")
        confidence = dup.get("llm_confidence", 0.8)

        # Get entity degrees to decide canonical
        degree_a = self._get_entity_degree(name_a)
        degree_b = self._get_entity_degree(name_b)

        if degree_a is None or degree_b is None:
            logger.warning(f"Could not get degrees for {name_a} or {name_b}, skipping")
            skipped_count += 1
            continue

        # Higher degree = canonical
        if degree_a >= degree_b:
            canonical_name, duplicate_name = name_a, name_b
            canonical_degree, duplicate_degree = degree_a, degree_b
        else:
            canonical_name, duplicate_name = name_b, name_a
            canonical_degree, duplicate_degree = degree_b, degree_a

        merge_result = self._merge_pair(
            canonical_name,
            duplicate_name,
            entity_type,
            dry_run=dry_run,
        )

        details.append({
            "canonical": canonical_name,
            "duplicate": duplicate_name,
            "type": entity_type,
            "canonical_degree": canonical_degree,
            "duplicate_degree": duplicate_degree,
            "confidence": confidence,
            "dry_run": dry_run,
            "result": merge_result,
        })

        if merge_result.get("success"):
            merged_count += 1
            self.audit_log.append({
                "action": "merge",
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "target_id": duplicate_name,
                "metadata": {
                    "canonical": canonical_name,
                    "type": entity_type,
                    "confidence": confidence,
                    # Lets audit consumers tell simulated merges from real ones.
                    "dry_run": dry_run,
                },
            })
        else:
            skipped_count += 1

    result = {
        "merged": merged_count,
        "skipped": skipped_count,
        "dry_run": dry_run,
        "details": details,
        "audit_log": self.audit_log,
    }

    logger.info(f"Completed: {merged_count} merged, {skipped_count} skipped (dry_run={dry_run})")
    return result
def _token_overlap(self, s1: str, s2: str) -> float:
    """Compute the token overlap ratio between two strings.

    Each string is split on whitespace and lower-cased; a piece is kept when
    it is alphanumeric or longer than two characters. The score is
    ``|A & B| / min(|A|, |B|)``, so "Elastic Stack" vs "ELK Stack" shares one
    of two tokens and scores 0.5.

    Args:
        s1: First string
        s2: Second string

    Returns:
        Overlap ratio 0.0-1.0 (0.0 when either string yields no tokens)
    """
    bags = []
    for text in (s1, s2):
        bags.append({
            piece.lower()
            for piece in text.split()
            if len(piece) > 2 or piece.isalnum()
        })

    first, second = bags
    if not (first and second):
        return 0.0

    shared = first.intersection(second)
    return len(shared) / min(len(first), len(second))
def _get_entity_degree(self, entity_name: str) -> Optional[int]:
    """Get the relationship count for an entity.

    Counts edges attached to the entity, in either direction. The name is
    passed as a query parameter rather than spliced into the Cypher text, so
    names containing quotes (e.g. "O'Reilly") neither break the query nor
    inject Cypher.

    Args:
        entity_name: Name of entity to check

    Returns:
        Number of relationships; 0 when the query yields no rows or a null
        count; None only when the query itself fails.
    """
    query = (
        "MATCH (e:Entity)-[r]-() WHERE e.name = $name "
        "RETURN COUNT(r) AS degree"
    )

    try:
        rows = self.kg.query_cypher(query, {"name": entity_name})
        if rows:
            # `or 0` guards against a row whose degree column is null.
            return int(rows[0].get("degree") or 0)
        return 0
    except Exception as e:
        logger.error(f"Failed to get degree for {entity_name}: {e}")
        return None
def run(
    self,
    entity_type: Optional[str] = None,
    stage: str = "candidates",
    dry_run: bool = True,
) -> Dict[str, Any]:
    """Run the resolution pipeline up to the requested stage.

    Each stage re-runs the ones before it: "verify" first finds candidates,
    "merge" and "full" find and verify before merging. "full" differs from
    "merge" only in also returning the candidate and verified lists.

    Args:
        entity_type: Optional filter by entity type
        stage: One of "candidates", "verify", "merge" or "full"
        dry_run: Whether to simulate changes

    Returns:
        Results dict for the stage, or
        ``{"status": "error", "message": ...}`` for an unknown stage.
    """
    if stage not in ("candidates", "verify", "merge", "full"):
        return {"status": "error", "message": f"Unknown stage: {stage}"}

    outcome: Dict[str, Any] = {"stage": stage, "dry_run": dry_run}

    pairs = self.find_candidates(entity_type)
    if stage == "candidates":
        outcome["candidates"] = pairs
        outcome["count"] = len(pairs)
        return outcome

    confirmed = self.verify_candidates(pairs)
    if stage == "verify":
        outcome["verified"] = confirmed
        outcome["verified_count"] = len(confirmed)
        outcome["candidate_count"] = len(pairs)
        return outcome

    # Remaining stages ("merge" and "full") both perform the merge step.
    merge_report = self.merge_duplicates(confirmed, dry_run=dry_run)
    if stage == "full":
        outcome["candidates"] = pairs
        outcome["candidates_count"] = len(pairs)
        outcome["verified"] = confirmed
        outcome["verified_count"] = len(confirmed)
    outcome.update(merge_report)
    return outcome
name.strip()) + + def _build_extraction_prompt(self, content: str, filename: str, + domain: str) -> str: + """Build extraction prompt for LLM.""" + domain_desc = { + "work": "SLED sales - accounts, contacts, competitors, deals", + "personal": "Personal - books, recipes, games, interests", + "technology": "Agent Zero system, AI models, infrastructure", + "context": "Learning notes, ideas, testing" + }.get(domain, "General knowledge") + + return f"""Extract entities and relationships from this {domain} domain file. + +DOMAIN: {domain} - {domain_desc} +SOURCE: {filename} + +TEXT CONTENT: +{content[:self.MAX_CONTENT_LENGTH]} + +Output ONLY valid JSON: +{{ + "entities": [ + {{"name": "exact name", "type": "EntityType", "confidence": 0.8}} + ], + "relationships": [ + {{"subject": "name", "type": "REL_TYPE", "object": "name", "confidence": 0.8}} + ] +}} + +RULES: +1. Extract ONLY entities explicitly mentioned +2. Use EXACT names from text (no paraphrasing) +3. Every relationship must connect two extracted entities +4. 
def _extract_json_from_text(self, text: str) -> Optional[Dict]:
    """Extract the JSON object from an LLM response.

    Tries, in order:
      1. the whole response as JSON;
      2. the contents of the first Markdown code fence (three backticks,
         with or without a "json" tag);
      3. the first decodable JSON object anywhere in the text that has an
         "entities" key (handles prose before/after the payload).

    Only JSON objects are accepted; arrays, strings and numbers are ignored
    because callers treat the result as a dict.

    Args:
        text: Raw response text from the LLM.

    Returns:
        The parsed dict, or None when no suitable object is found.
    """
    if not isinstance(text, str):
        return None

    attempts = [text]
    for marker in ('```json', '```'):
        if marker in text:
            attempts.append(text.split(marker, 1)[1].split('```', 1)[0])

    for attempt in attempts:
        try:
            parsed = json.loads(attempt.strip())
        except ValueError:  # json.JSONDecodeError is a ValueError
            continue
        if isinstance(parsed, dict):
            return parsed

    # Scan for an embedded object. raw_decode parses one complete JSON value
    # starting at the given index, so nested braces are handled correctly
    # (a brace-excluding regex cannot match objects that contain objects).
    decoder = json.JSONDecoder()
    start = text.find('{')
    while start != -1:
        try:
            parsed, _end = decoder.raw_decode(text, start)
        except ValueError:
            parsed = None
        if isinstance(parsed, dict) and "entities" in parsed:
            return parsed
        start = text.find('{', start + 1)

    return None
class KGDriveUploader:
    """Uploads KG exports to Google Drive."""

    def __init__(self, kg_client: KGClient, config: Dict[str, Any]):
        """Store the KG client and config.

        Args:
            kg_client: Client whose ``export_data()`` supplies the export.
            config: Pipeline configuration (stored; not read in this class).
        """
        self.kg = kg_client
        self.config = config
        # Directory added to sys.path so google_workspace_tools can be imported.
        self.gw_path = "/a0/usr/google-workspace"

    def upload_export(self, filepath: str = "") -> Dict[str, Any]:
        """Upload a KG export file to Google Drive.

        Args:
            filepath: File to upload. When empty, a fresh export is written
                first via ``_export_kg()``.

        Returns:
            ``{"status": "ok", "result": ..., "file": ...}`` on success, or
            ``{"status": "error", "message": ...}`` when the file is missing
            or the upload (including the helper import) raises.
        """
        if not filepath:
            filepath = self._export_kg()
        if not filepath or not os.path.exists(filepath):
            return {"status": "error", "message": f"File not found: {filepath}"}

        # Register the helper directory once; inserting on every call would
        # grow sys.path by one duplicate entry per upload.
        if self.gw_path not in sys.path:
            sys.path.insert(0, self.gw_path)
        try:
            from google_workspace_tools import drive_upload_file
            result = drive_upload_file(
                file_path=filepath,
                file_name=os.path.basename(filepath),
                folder_id=None,
            )
            return {"status": "ok", "result": str(result), "file": filepath}
        except Exception as e:
            logger.error(f"Drive upload failed: {e}")
            return {"status": "error", "message": str(e)}

    def _export_kg(self) -> str:
        """Export KG data to a JSON file and return its path.

        Returns:
            Path of the written file, or "" when the export or write fails.
        """
        try:
            data = self.kg.export_data()
            out = "/a0/usr/workdir/logs/kg_export_latest.json"
            # The log directory may not exist on a fresh install.
            os.makedirs(os.path.dirname(out), exist_ok=True)
            with open(out, "w", encoding="utf-8") as f:
                json.dump(data, f, indent=2)
            return out
        except Exception as e:
            logger.error(f"KG export failed: {e}")
            return ""
b/usr/plugins/_kg_pipeline/pipeline/health_scorer.py @@ -0,0 +1,417 @@ +"""Entity health scoring and tiered memory assignment for KG. + +Computes multi-dimensional health scores (0.0-1.0) for entities and +assigns memory tiers: hot, warm, cool, cold. + +Scoring dimensions: + - Connectivity (35%): relationship count normalized via log + - Recency (20%): days since last_seen, exponential decay + - Source Quality (20%): mention count + category richness + - Freshness (15%): update frequency (first_seen vs last_seen) + - Confidence (10%): entity confidence value +""" +import math +import logging +from datetime import datetime, timezone, timedelta +from typing import Dict, List, Optional, Any, Tuple + +from .kg_client import KGClient + +logger = logging.getLogger(__name__) + + +class HealthScorer: + """Compute health scores and memory tiers for KG entities.""" + + TIER_THRESHOLDS: Dict[str, float] = { + "hot": 0.7, + "warm": 0.5, + "cool": 0.3, + "cold": 0.0, + } + + WEIGHTS: Dict[str, float] = { + "connectivity": 0.35, + "recency": 0.20, + "source_quality": 0.20, + "freshness": 0.15, + "confidence": 0.10, + } + + def __init__( + self, + kg_client: KGClient, + config: Optional[Dict[str, Any]] = None, + ) -> None: + """Initialize scorer with KG client and optional config. + + Args: + kg_client: HTTP client for KG service. + config: Optional overrides for thresholds and TTL. 
+ """ + self.kg = kg_client + cfg = config or {} + thresholds = cfg.get("health_scoring", {}).get("tier_thresholds", {}) + if thresholds: + self.TIER_THRESHOLDS = { + "hot": thresholds.get("hot", 0.7), + "warm": thresholds.get("warm", 0.5), + "cool": thresholds.get("cool", 0.3), + "cold": 0.0, + } + ttl_hours = cfg.get("health_scoring", {}).get("cache_ttl_hours", 24) + self._cache: Optional[List[Dict[str, Any]]] = None + self._cache_time: Optional[datetime] = None + self._cache_ttl = timedelta(hours=ttl_hours) + + # ------------------------------------------------------------------ + # Public API + # ------------------------------------------------------------------ + + def score_entities( + self, + entity_type: Optional[str] = None, + limit: int = 1000, + offset: int = 0, + ) -> Dict[str, Any]: + """Score a batch of entities. + + Args: + entity_type: Filter by entity type (None = all). + limit: Max entities to score. + offset: Pagination offset. + + Returns: + Dict with scored entities, distribution, and metadata. 
def get_tier_distribution(self) -> Dict[str, int]:
    """Count entities per memory tier.

    Uses whatever ``self._get_cached()`` returns when that is not None
    (presumably the still-fresh scored list; confirm against that helper);
    otherwise scores all entities first.

    Returns:
        The result of ``self._calc_distribution`` for the scored entities.
    """
    cached = self._get_cached()
    if cached is not None:
        return self._calc_distribution(cached)

    fresh = self.score_entities()
    return self._calc_distribution(fresh.get("entities", []))
+ """ + scored = self._get_cached() + if scored is None: + result = self.score_entities() + scored = result.get("entities", []) + scored_sorted = sorted(scored, key=lambda e: e.get("total", 1.0)) + return scored_sorted[:limit] + + def clear_cache(self) -> None: + """Force cache invalidation.""" + self._cache = None + self._cache_time = None + + # ------------------------------------------------------------------ + # Scoring dimensions + # ------------------------------------------------------------------ + + def _compute_score( + self, + name: str, + entity_type: str, + domain: str, + categories: str, + confidence: float, + mention_count: int, + first_seen: Optional[str], + last_seen: Optional[str], + degree: int, + max_degree: int, + ) -> Dict[str, Any]: + """Compute health score for a single entity. + + Returns: + Dict with total score and per-dimension breakdown. + """ + connectivity = self._score_connectivity(degree, max_degree) + recency = self._score_recency(last_seen) + source_quality = self._score_source_quality(mention_count, categories) + freshness = self._score_freshness(first_seen, last_seen) + confidence_score = self._score_confidence(confidence) + + total = ( + self.WEIGHTS["connectivity"] * connectivity + + self.WEIGHTS["recency"] * recency + + self.WEIGHTS["source_quality"] * source_quality + + self.WEIGHTS["freshness"] * freshness + + self.WEIGHTS["confidence"] * confidence_score + ) + total = max(0.0, min(1.0, total)) + + return { + "total": round(total, 4), + "connectivity": round(connectivity, 4), + "recency": round(recency, 4), + "source_quality": round(source_quality, 4), + "freshness": round(freshness, 4), + "confidence": round(confidence_score, 4), + } + + def _score_connectivity(self, degree: int, max_degree: int) -> float: + """Score based on relationship count. Weight: 35%. + + Uses log normalization: log(degree+1) / log(max_degree+1). + Returns 0.0 when max_degree is 0. 
+ """ + if max_degree <= 0: + return 0.0 + val = math.log(degree + 1) / math.log(max_degree + 1) + return max(0.0, min(1.0, val)) + + def _score_recency(self, last_seen: Optional[str]) -> float: + """Score based on when entity was last seen. Weight: 20%. + + 1.0 if less than 7 days, linear decay to 0.1 at 365 days, 0.0 if no last_seen. + """ + if not last_seen: + return 0.0 + try: + dt = self._parse_datetime(last_seen) + if dt is None: + return 0.0 + now = datetime.now(timezone.utc) + days = (now - dt).days + if days < 0: + return 1.0 + if days <= 7: + return 1.0 + if days >= 365: + return 0.1 + return 1.0 - (0.9 * (days - 7) / 358.0) + except (ValueError, TypeError): + logger.warning("Invalid last_seen value: %s", last_seen) + return 0.0 + + def _score_source_quality( + self, mention_count: int, categories: str + ) -> float: + """Score based on mention count and category richness. Weight: 20%. + + mention_score = min(mention_count / 5, 1.0) * 0.5 + category_score = min(category_count / 5, 1.0) * 0.5 + """ + mention_score = min(mention_count / 5.0, 1.0) * 0.5 + cat_count = len(categories.split(",")) if categories else 0 + category_score = min(cat_count / 5.0, 1.0) * 0.5 + return max(0.0, min(1.0, mention_score + category_score)) + + def _score_confidence(self, confidence: float) -> float: + """Score based on entity confidence. Weight: 10%. + + Direct confidence value clamped to [0.0, 1.0]. + """ + return max(0.0, min(1.0, float(confidence))) + + def _score_freshness( + self, + first_seen: Optional[str], + last_seen: Optional[str], + ) -> float: + """Score based on how often entity is updated. Weight: 15%. + + 1.0 if updated within 7 days of first_seen (actively maintained), + decays based on the update span relative to total lifetime. 
+ """ + if not first_seen or not last_seen: + return 0.0 + try: + dt_first = self._parse_datetime(first_seen) + dt_last = self._parse_datetime(last_seen) + if dt_first is None or dt_last is None: + return 0.0 + now = datetime.now(timezone.utc) + lifespan = (now - dt_first).days + update_span = (dt_last - dt_first).days + if lifespan <= 0: + return 1.0 + ratio = update_span / lifespan + return max(0.0, min(1.0, ratio)) + except (ValueError, TypeError): + return 0.0 + + def _assign_tier(self, score: float) -> str: + """Assign memory tier based on total score. + + Args: + score: Total health score in [0.0, 1.0]. + + Returns: + Tier string: hot, warm, cool, or cold. + """ + if score >= self.TIER_THRESHOLDS["hot"]: + return "hot" + if score >= self.TIER_THRESHOLDS["warm"]: + return "warm" + if score >= self.TIER_THRESHOLDS["cool"]: + return "cool" + return "cold" + + # ------------------------------------------------------------------ + # Data fetching (KuzuDB-compatible Cypher) + # ------------------------------------------------------------------ + + def _fetch_entities( + self, + entity_type: Optional[str], + limit: int, + offset: int, + ) -> List[Dict[str, Any]]: + """Fetch entity properties from KG via Cypher. + + Uses simple MATCH/WHERE/RETURN - no CASE WHEN or rand(). 
+ """ + if entity_type: + query = ( + "MATCH (e:Entity) WHERE e.type = $type " + "RETURN e.name AS name, e.type AS type, " + "e.domain AS domain, e.categories AS categories, " + "e.confidence AS confidence, e.mention_count AS mention_count, " + "e.first_seen AS first_seen, e.last_seen AS last_seen " + f"SKIP {offset} LIMIT {limit}" + ) + rows = self.kg.query_cypher(query, {"type": entity_type}) + else: + query = ( + "MATCH (e:Entity) " + "RETURN e.name AS name, e.type AS type, " + "e.domain AS domain, e.categories AS categories, " + "e.confidence AS confidence, e.mention_count AS mention_count, " + "e.first_seen AS first_seen, e.last_seen AS last_seen " + f"SKIP {offset} LIMIT {limit}" + ) + rows = self.kg.query_cypher(query) + return rows + + def _fetch_degrees(self) -> Tuple[Dict[str, int], int]: + """Fetch relationship counts per entity via Cypher. + + Returns: + Tuple of (name->degree map, max_degree). + """ + query = ( + "MATCH (e:Entity)-[r]-() " + "WITH e.name AS name, count(r) AS degree " + "RETURN name, degree " + "ORDER BY degree DESC LIMIT 5000" + ) + rows = self.kg.query_cypher(query) + degree_map: Dict[str, int] = {} + max_degree = 1 # floor at 1 to avoid division by zero + for row in rows: + name = row.get("name", "") + deg = int(row.get("degree", 0)) + degree_map[name] = deg + if deg > max_degree: + max_degree = deg + return degree_map, max_degree + + # ------------------------------------------------------------------ + # Helpers + # ------------------------------------------------------------------ + + @staticmethod + def _parse_datetime(ts: str) -> Optional[datetime]: + """Parse ISO datetime string to timezone-aware datetime.""" + if not ts: + return None + try: + dt = datetime.fromisoformat(ts) + if dt.tzinfo is None: + dt = dt.replace(tzinfo=timezone.utc) + return dt + except (ValueError, TypeError): + return None + + @staticmethod + def _calc_distribution( + scored: List[Dict[str, Any]], + ) -> Dict[str, int]: + """Count entities in each tier. 
+ + Args: + scored: List of scored entity dicts with 'tier' key. + + Returns: + Dict mapping tier name to count. + """ + dist: Dict[str, int] = {"hot": 0, "warm": 0, "cool": 0, "cold": 0} + for ent in scored: + tier = ent.get("tier", "cold") + dist[tier] = dist.get(tier, 0) + 1 + return dist + + def _get_cached(self) -> Optional[List[Dict[str, Any]]]: + """Return cached scores if still within TTL.""" + if self._cache is None or self._cache_time is None: + return None + now = datetime.now(timezone.utc) + if now - self._cache_time > self._cache_ttl: + self._cache = None + self._cache_time = None + return None + return self._cache + diff --git a/usr/plugins/_kg_pipeline/pipeline/ingester.py b/usr/plugins/_kg_pipeline/pipeline/ingester.py new file mode 100644 index 0000000000..3a1e11961b --- /dev/null +++ b/usr/plugins/_kg_pipeline/pipeline/ingester.py @@ -0,0 +1,276 @@ +"""File ingestion helper combining single and bulk ingest logic.""" +import os +import json +import time +import glob +import logging +from pathlib import Path +from datetime import datetime +from typing import Dict, List, Tuple, Optional, Any + +from .kg_client import KGClient +from .audit_chain import AuditChain + +logger = logging.getLogger(__name__) + + +class Ingester: + """Handles single file and bulk ingestion to KG.""" + + def __init__(self, kg_client: KGClient, config: Dict[str, Any]): + self.kg = kg_client + self.config = config + self.state_file = config.get("ingest_state_file") + self.log_dir = config.get("log_dir", "/a0/usr/workdir/logs") + self.max_file_size_kb = config.get("max_file_size_kb", 50) + self.min_file_size_kb = config.get("min_file_size_kb", 2) + audit_cfg = config.get("audit", {}) + audit_dir = os.path.join(self.log_dir, "kg_audit") + self.audit = AuditChain( + audit_dir=audit_cfg.get("audit_dir", audit_dir), + enabled=audit_cfg.get("enabled", True), + ) + + def _log(self, msg: str) -> None: + """Log message with timestamp.""" + ts = datetime.utcnow().isoformat() + line = 
f"[{ts}] {msg}" + logger.info(msg) + log_file = os.path.join(self.log_dir, "kg_ingest.log") + os.makedirs(os.path.dirname(log_file), exist_ok=True) + with open(log_file, "a") as f: + f.write(line + "\n") + + def _load_state(self) -> Dict: + """Load ingestion state from file.""" + if self.state_file and os.path.exists(self.state_file): + try: + with open(self.state_file) as f: + return json.load(f) + except Exception as e: + logger.warning(f"Failed to load state: {e}") + return {} + + def _save_state(self, state: Dict) -> None: + """Save ingestion state to file.""" + if self.state_file: + os.makedirs(os.path.dirname(self.state_file), exist_ok=True) + with open(self.state_file, "w") as f: + json.dump(state, f, indent=2) + + @staticmethod + def detect_domain(filepath: str) -> str: + """Detect domain from file path.""" + p = filepath.lower() + if any(x in p for x in [ + "work", "sales", "territory", "deal", "pipeline", "sled" + ]): + return "work" + elif any(x in p for x in [ + "personal", "life", "home", "bookmark" + ]): + return "personal" + elif any(x in p for x in [ + "infra", "model", "docker", "server", "system", "framework" + ]): + return "technology" + return "context" + + def _should_process_file(self, filepath: str, state: Dict, + resume: bool = False) -> bool: + """Check if file should be processed.""" + if not filepath.endswith(".md"): + return False + + size_kb = os.path.getsize(filepath) / 1024 + if size_kb > self.max_file_size_kb or size_kb < self.min_file_size_kb: + return False + + # Skip archived + if "_archived" in filepath: + return False + + if resume and filepath in state: + stored = state[filepath].get("mtime", 0) + if stored == os.path.getmtime(filepath): + return False + + return True + + def ingest_file(self, filepath: str, domain: Optional[str] = None + ) -> Dict[str, Any]: + """Ingest a single file into KG.""" + try: + with open(filepath, "r", encoding="utf-8", errors="replace") as f: + content = f.read() + + if len(content.strip()) < 200: + 
return {"status": "skipped", "reason": "too_short"} + + # Apply token compression before truncation + from .token_compressor import TokenCompressor + if not hasattr(self, 'compressor'): + self.compressor = TokenCompressor(self.config) + content = self.compressor.compress(content) + + full_content = f"Source: {filepath}\n\n{content[:8000]}" + domain = domain or self.detect_domain(filepath) + + start = time.time() + result = self.kg.add_content(full_content, filepath, domain) + elapsed = time.time() - start + self.audit.append( + action="add", + target_type="document", + target_id=filepath, + source="ingester.ingest_file", + metadata={ + "entities": result.get("entities", 0), + "domain": domain, + "elapsed": round(elapsed, 1), + }, + ) + return { + "status": "done", + "entities": result.get("entities", 0), + "relationships": result.get("relationships", 0), + "domain": result.get("domain", domain), + "elapsed": round(elapsed, 1) + } + except Exception as e: + return {"status": "failed", "error": str(e)} + + def ingest_directory(self, knowledge_dir: str, limit: Optional[int] = None, + resume: bool = False, force_reingest: bool = False + ) -> Dict[str, Any]: + """Ingest all files from a directory.""" + state = {} if force_reingest else self._load_state() + + files = [] + for root, dirs, filenames in os.walk(knowledge_dir): + dirs[:] = [d for d in dirs if d != "_archived"] + for fn in filenames: + fp = os.path.join(root, fn) + if self._should_process_file(fp, state, resume): + files.append((fp, os.path.getsize(fp) / 1024)) + + files.sort(key=lambda x: x[1]) + if limit: + files = files[:limit] + + self._log(f"Found {len(files)} files to process") + + total_ents = 0 + total_rels = 0 + done_count = 0 + fail_count = 0 + + for i, (fp, size_kb) in enumerate(files): + self._log(f"[{i+1}/{len(files)}] Processing: {fp}") + result = self.ingest_file(fp) + state[fp] = { + "status": result["status"], + **result, + "mtime": os.path.getmtime(fp), + "timestamp": 
datetime.utcnow().isoformat() + } + self._save_state(state) + + if result["status"] == "done": + total_ents += result.get("entities", 0) + total_rels += result.get("relationships", 0) + done_count += 1 + self.audit.append( + action="add", + target_type="document", + target_id=fp, + source="ingester.ingest_directory", + metadata={ + "entities": result.get("entities", 0), + "domain": result.get("domain", ""), + }, + ) + elif result["status"] == "failed": + fail_count += 1 + + # Run janitor + if done_count > 0: + try: + self.kg.janitor(passes=["normalize", "orphans"]) + self.audit.append( + action="janitor", + target_type="entity", + target_id="all", + source="ingester.ingest_directory", + metadata={"passes": ["normalize", "orphans"]}, + ) + except Exception as e: + logger.warning(f"Janitor failed: {e}") + + return { + "files_processed": done_count, + "files_failed": fail_count, + "entities_added": total_ents, + "relationships_added": total_rels + } + + def bulk_ingest(self, file_list: List[str], kg_paths: set = None, + dry_run: bool = False) -> Tuple[int, int, int]: + """Bulk ingest files with deduplication.""" + kg_paths = kg_paths or set() + pushed, failed, skipped = 0, 0, 0 + + kg_basenames = {p.split("/")[-1].replace(".md", "") for p in kg_paths} + + for i, filepath in enumerate(file_list, 1): + rel_path = filepath.lstrip("/") + basename = os.path.basename(filepath).replace(".md", "") + + if rel_path in kg_paths or basename in kg_basenames: + skipped += 1 + continue + + try: + with open(filepath) as f: + content = f.read() + + if len(content.strip()) < 100: + skipped += 1 + continue + + # Apply token compression before truncation + from .token_compressor import TokenCompressor + if not hasattr(self, 'compressor'): + self.compressor = TokenCompressor(self.config) + content = self.compressor.compress(content) + + # Truncate large files + if len(content) > 30000: + content = content[:30000] + "\n\n[...truncated...]" + + domain = self.detect_domain(filepath) + + if 
dry_run: + pushed += 1 + continue + + result = self.kg.add_content(content, rel_path, domain) + + if "error" in result: + failed += 1 + else: + pushed += 1 + self.audit.append( + action="add", + target_type="document", + target_id=filepath, + source="ingester.bulk_ingest", + metadata={"domain": domain}, + ) + except Exception as e: + failed += 1 + logger.error(f"Failed to process {filepath}: {e}") + + time.sleep(0.3) + + return pushed, failed, skipped diff --git a/usr/plugins/_kg_pipeline/pipeline/kg_client.py b/usr/plugins/_kg_pipeline/pipeline/kg_client.py new file mode 100644 index 0000000000..f6b0595e5c --- /dev/null +++ b/usr/plugins/_kg_pipeline/pipeline/kg_client.py @@ -0,0 +1,119 @@ +"""Shared KG service HTTP client.""" +import json +import time +import logging +from typing import Dict, List, Optional, Any +import requests + +logger = logging.getLogger(__name__) + + +class KGClient: + """HTTP client for KG service communication.""" + + def __init__(self, base_url: str, timeout: int = 300, + max_retries: int = 3, retry_delay: float = 1.0): + self.base_url = base_url.rstrip("/") + self.timeout = timeout + self.max_retries = max_retries + self.retry_delay = retry_delay + self.session = requests.Session() + + def _request(self, method: str, endpoint: str, + **kwargs) -> Dict[str, Any]: + """Make HTTP request with retry logic.""" + url = f"{self.base_url}/{endpoint.lstrip('/')}" + + for attempt in range(self.max_retries): + try: + if method.upper() == "GET": + r = self.session.get(url, timeout=self.timeout, **kwargs) + else: + r = self.session.request(method, url, timeout=self.timeout, **kwargs) + r.raise_for_status() + return r.json() if r.text else {} + except requests.exceptions.RequestException as e: + logger.warning(f"Request failed (attempt {attempt + 1}): {e}") + if attempt < self.max_retries - 1: + time.sleep(self.retry_delay * (2 ** attempt)) + else: + raise + return {} + + def health_check(self) -> Dict[str, Any]: + """Check KG service health.""" + 
return self._request("GET", "/health") + + def add_content(self, content: str, source_path: str, + domain: str = "context") -> Dict[str, Any]: + """Add content with entity extraction.""" + return self._request("POST", "/api/v1/add", + json={"content": content, "source_path": source_path, "domain": domain}) + + def search(self, query: str, limit: int = 10) -> Dict[str, Any]: + """Search entities/relationships.""" + return self._request("POST", "/api/v1/search", + json={"query": query, "limit": limit}) + + def get_entities(self, offset: int = 0, limit: int = 50) -> Dict[str, Any]: + """List entities.""" + return self._request("GET", "/api/v1/entities", + params={"offset": offset, "limit": limit}) + + def export_data(self) -> Dict[str, Any]: + """Export all data.""" + return self._request("GET", "/api/v1/export") + + def get_hubs(self, top_n: int = 50, min_degree: int = 5) -> List[Dict]: + """Get hub entities.""" + r = self._request("GET", "/api/v1/graph/hubs", + params={"top_n": top_n, "min_degree": min_degree}) + return r.get("hubs", []) + + def get_orphans(self, limit: int = 1000) -> List[Dict]: + """Get orphan entities.""" + r = self._request("GET", "/api/v1/analysis/orphans", + params={"limit": limit}) + return r.get("orphans", []) + + def get_communities(self) -> Dict[str, Any]: + """Detect communities.""" + return self._request("GET", "/api/v1/analysis/communities") + + def get_bridges(self) -> Dict[str, Any]: + """Find bridge nodes.""" + return self._request("GET", "/api/v1/analysis/bridges") + + def janitor(self, passes: List[str] = None, + dry_run: bool = False) -> Dict[str, Any]: + """Run cleanup/maintenance.""" + passes = passes or ["normalize", "orphans"] + return self._request("POST", "/api/v1/janitor", + json={"passes": passes, "dry_run": dry_run}) + + def query_cypher(self, query: str, params: Dict = None) -> List[Dict]: + """Execute Cypher query.""" + r = self._request("POST", "/api/v1/query", + json={"query": query, "params": params or {}}) + return 
r.get("rows", []) + + def update_entity(self, entity_id: str, data: Dict) -> Dict[str, Any]: + """Update entity.""" + return self._request("PUT", f"/api/v1/entities/{entity_id}", + json=data) + + def create_relationship(self, source_name: str, target_name: str, + rel_type: str) -> bool: + """Create relationship between entities.""" + try: + self._request("POST", "/api/v1/relationships", + json={"source_name": source_name, "target_name": target_name, + "rel_type": rel_type}) + return True + except Exception as e: + logger.error(f"Failed to create relationship: {e}") + return False + + def get_status(self) -> Dict[str, Any]: + """Get service status.""" + return self._request("GET", "/api/v1/status") diff --git a/usr/plugins/_kg_pipeline/pipeline/knowledge_archiver.py b/usr/plugins/_kg_pipeline/pipeline/knowledge_archiver.py new file mode 100755 index 0000000000..81f0489ac6 --- /dev/null +++ b/usr/plugins/_kg_pipeline/pipeline/knowledge_archiver.py @@ -0,0 +1,241 @@ +#!/usr/bin/env python3 +"""Knowledge Archiver - Scans knowledge directory for KG-included files + +Scans /a0/usr/knowledge/ for files already in KG (checking via KG API), +moves confirmed KG files to /a0/usr/knowledge/_archived/, +maintains a manifest of archived files, runs as scheduled task. 
+ +Usage: + python3 /a0/usr/plugins/_kg_pipeline/pipeline/knowledge_archiver.py [--dry-run] [--scan-dir PATH] +""" + +import json +import hashlib +import shutil +from datetime import datetime +from pathlib import Path +from typing import Dict, Any, List, Optional +import argparse +import requests + +# Configuration +KNOWLEDGE_DIR = Path("/a0/usr/knowledge") +ARCHIVED_DIR = Path("/a0/usr/knowledge/_archived") +ARCHIVE_MANIFEST = Path("/a0/usr/knowledge/_archived/_manifest.json") +ARCHIVE_LOG = Path("/a0/usr/workdir/logs/knowledge_archiver.log") +KG_SERVICE_URL = "http://100.78.79.41:8010" + + +def log_operation(message: str, dry_run: bool = False) -> None: + """Log operation with timestamp.""" + prefix = "[DRY RUN] " if dry_run else "" + timestamp = datetime.now().isoformat() + log_line = f"{timestamp} {prefix}{message}\n" + + ARCHIVE_LOG.parent.mkdir(parents=True, exist_ok=True) + with open(ARCHIVE_LOG, 'a') as f: + f.write(log_line) + print(log_line.strip()) + + +def load_manifest() -> Dict[str, Any]: + """Load archive manifest.""" + if ARCHIVE_MANIFEST.exists(): + with open(ARCHIVE_MANIFEST, 'r') as f: + return json.load(f) + return { + 'created': datetime.now().isoformat(), + 'archived_files': [] + } + + +def save_manifest(manifest: Dict[str, Any]) -> None: + """Save archive manifest.""" + ARCHIVED_DIR.mkdir(parents=True, exist_ok=True) + with open(ARCHIVE_MANIFEST, 'w') as f: + json.dump(manifest, f, indent=2) + + +def compute_file_hash(file_path: Path) -> str: + """Compute SHA256 hash of file content.""" + sha256 = hashlib.sha256() + with open(file_path, 'rb') as f: + sha256.update(f.read()) + return sha256.hexdigest()[:16] + + +def check_file_in_kg(file_path: Path) -> Optional[Dict[str, Any]]: + """Check if file content exists in KG via source_path check.""" + try: + # Query KG for documents matching this source path pattern + source_pattern = str(file_path.relative_to(KNOWLEDGE_DIR.parent)) + + # Use temporal search as a proxy for file existence + 
cypher_query = """ + MATCH (d:Document) + WHERE d.source_path CONTAINS $filename + RETURN d.doc_id AS doc_id, d.source_path AS source_path, d.created_at AS created + LIMIT 1 + """ + + resp = requests.post( + f"{KG_SERVICE_URL}/api/v1/query", + json={"query": cypher_query, "params": {"filename": file_path.name}}, + timeout=10 + ) + + if resp.status_code == 200: + data = resp.json() + if data.get('rows', []): + return { + 'in_kg': True, + 'doc_id': data['rows'][0].get('doc_id'), + 'source_path': data['rows'][0].get('source_path') + } + + return {'in_kg': False} + except Exception as e: + log_operation(f"Error checking KG for {file_path}: {e}", dry_run=True) + return {'in_kg': False, 'error': str(e)} + + +def get_knowledge_files(base_dir: Path) -> List[Path]: + """Get all knowledge files recursively.""" + files = [] + if not base_dir.exists(): + return files + + for ext in ['*.md', '*.json', '*.txt', '*.yaml', '*.yml']: + files.extend(base_dir.rglob(ext)) + + return [f for f in files if not f.name.startswith('_') and '_archived' not in str(f)] + + +def archive_file(file_path: Path, dry_run: bool = False) -> bool: + """Move file to archived directory.""" + try: + # Compute relative path to preserve structure + rel_path = file_path.relative_to(KNOWLEDGE_DIR) + dest_path = ARCHIVED_DIR / rel_path + + if dry_run: + log_operation(f"Would archive: {file_path} → {dest_path}", dry_run=True) + return True + + # Create destination directory + dest_path.parent.mkdir(parents=True, exist_ok=True) + + # Move file + shutil.move(str(file_path), str(dest_path)) + log_operation(f"Archived: {file_path} → {dest_path}") + return True + except Exception as e: + log_operation(f"Failed to archive {file_path}: {e}", dry_run=dry_run) + return False + + +def run_archiver(dry_run: bool = False, scan_dir: Optional[Path] = None) -> Dict[str, Any]: + """Run the full archival pipeline.""" + log_operation("=== Starting Knowledge Archiver ===", dry_run) + + manifest = load_manifest() + base_dir = 
scan_dir or KNOWLEDGE_DIR + + # Get all knowledge files + files = get_knowledge_files(base_dir) + log_operation(f"Found {len(files)} knowledge files in {base_dir}", dry_run) + + archived_count = 0 + failed_count = 0 + skipped_count = 0 + + for file_path in files: + # Skip already archived files + if '_archived' in str(file_path): + skipped_count += 1 + continue + + # Check if already in manifest + file_hash = compute_file_hash(file_path) + already_archived = any( + entry.get('hash') == file_hash for entry in manifest['archived_files'] + ) + + if already_archived: + skipped_count += 1 + continue + + # Check KG for this file + kg_check = check_file_in_kg(file_path) + + if kg_check.get('in_kg'): + # File exists in KG, can be archived + if archive_file(file_path, dry_run): + # Add to manifest + manifest['archived_files'].append({ + 'original_path': str(file_path), + 'archived_path': str(ARCHIVED_DIR / file_path.relative_to(KNOWLEDGE_DIR)), + 'hash': file_hash, + 'archived_at': datetime.now().isoformat(), + 'kg_doc_id': kg_check.get('doc_id'), + 'file_size': file_path.stat().st_size + }) + archived_count += 1 + else: + failed_count += 1 + else: + # Not in KG, leave in place + log_operation(f"Not in KG, keeping: {file_path}", dry_run=dry_run) + + # Save manifest + if not dry_run: + save_manifest(manifest) + + summary = { + 'total_files': len(files), + 'archived': archived_count, + 'failed': failed_count, + 'skipped': skipped_count, + 'total_archived_in_manifest': len(manifest['archived_files']) + } + + log_operation(f"=== Archival Complete === {summary}", dry_run) + return summary + + +def main(): + parser = argparse.ArgumentParser(description='Knowledge Archiver') + parser.add_argument('--dry-run', action='store_true', help='Simulate without archiving') + parser.add_argument('--scan-dir', type=Path, help='Directory to scan (default: /a0/usr/knowledge)') + parser.add_argument('--check-khealth', action='store_true', help='Check KG health and exit') + args = 
parser.parse_args() + + if args.check_khealth: + try: + resp = requests.get(f"{KG_SERVICE_URL}/api/v1/status", timeout=10) + if resp.status_code == 200: + print(f"KG is healthy: {resp.json()}") + else: + print(f"KG returned status {resp.status_code}") + return 1 + except Exception as e: + print(f"KG connection failed: {e}") + return 1 + return 0 + + print(f"=== Knowledge Archiver ===") + print(f"Knowledge dir: {args.scan_dir or KNOWLEDGE_DIR}") + print(f"Archive dir: {ARCHIVED_DIR}") + print(f"Manifest: {ARCHIVE_MANIFEST}") + print(f"KG URL: {KG_SERVICE_URL}") + + result = run_archiver(dry_run=args.dry_run, scan_dir=args.scan_dir) + + print(f"\n=== Summary ===") + print(json.dumps(result, indent=2)) + + return 0 + + +if __name__ == "__main__": + exit(main()) diff --git a/usr/plugins/_kg_pipeline/pipeline/knowledge_ingester.py b/usr/plugins/_kg_pipeline/pipeline/knowledge_ingester.py new file mode 100755 index 0000000000..dd641a33ab --- /dev/null +++ b/usr/plugins/_kg_pipeline/pipeline/knowledge_ingester.py @@ -0,0 +1,222 @@ +#!/usr/bin/env python3 +"""Knowledge Graph ingestion script - loads files into standalone KG service.""" +import os +import sys +import json +import time +import requests +import argparse +from pathlib import Path +from datetime import datetime + +KG_SERVICE = os.getenv("KG_SERVICE", "http://100.78.79.41:8010/api/v1") +KNOWLEDGE_DIR = os.getenv("KNOWLEDGE_DIR", "/a0/usr/knowledge") +STATE_FILE = "/a0/usr/workdir/logs/kg_ingest_state.json" +LOG_FILE = "/a0/usr/workdir/logs/kg_ingest.log" +MAX_FILE_SIZE_KB = 50 +ARCHIVE_DIR = os.path.join(KNOWLEDGE_DIR, "_archived") + +def archive_file(filepath, knowledge_dir): + """Move successfully ingested file to _archived/ directory.""" + rel_path = os.path.relpath(filepath, knowledge_dir) + archive_path = os.path.join(ARCHIVE_DIR, rel_path) + os.makedirs(os.path.dirname(archive_path), exist_ok=True) + os.rename(filepath, archive_path) + return archive_path # Skip files larger than this + +def log(msg): + ts 
= datetime.utcnow().isoformat() + line = f"[{ts}] {msg}" + print(line, flush=True) + with open(LOG_FILE, "a") as f: + f.write(line + "\n") + +def load_state(): + if os.path.exists(STATE_FILE): + with open(STATE_FILE) as f: + return json.load(f) + return {} + +def save_state(state): + os.makedirs(os.path.dirname(STATE_FILE), exist_ok=True) + with open(STATE_FILE, "w") as f: + json.dump(state, f, indent=2) + +def detect_domain(filepath): + p = filepath.lower() + if any(x in p for x in ["work", "sales", "territory", "deal", "pipeline", "sled"]): + return "work" + elif any(x in p for x in ["personal", "life", "home", "bookmark"]): + return "personal" + elif any(x in p for x in ["infra", "model", "docker", "server", "system", "framework"]): + return "technology" + else: + return "context" + +def find_files(knowledge_dir, limit=None, resume=False, state=None): + files = [] + for root, dirs, filenames in os.walk(knowledge_dir): + dirs[:] = [d for d in dirs if d != "_archived"] # Skip archived files + for fn in filenames: + if not fn.endswith(".md"): + continue + fp = os.path.join(root, fn) + size_kb = os.path.getsize(fp) / 1024 + if size_kb > MAX_FILE_SIZE_KB or size_kb < 2.0: + continue + # Skip YouTube bookmarks without enriched content + if "/bookmarks/" in fp: + try: + with open(fp, "r", encoding="utf-8", errors="replace") as f: + content = f.read() + if ("youtube.com" in content or "youtu.be" in content): + if "## Scraped Content" not in content and "## Gemini Summary" not in content: + continue # Skip unenriched YouTube bookmarks + except: + pass + if resume and state and fp in state and state[fp].get("status") == "done": + stored_mtime = state[fp].get("mtime", 0) + current_mtime = os.path.getmtime(fp) + if stored_mtime == current_mtime: + continue # File unchanged, skip + # else: file was updated, re-ingest + files.append((fp, size_kb)) + files.sort(key=lambda x: x[1]) # Sort by size (smallest first) + if limit: + files = files[:limit] + return files + +def 
def main():
    """Command-line entry point for ingesting knowledge files into the KG service.

    Flags:
        --limit N          passed to ``find_files`` as the maximum file count
        --resume           ask ``find_files`` to skip already-recorded files
        --force-reingest   start from an empty state (overrides ``--resume``)
        --status           print service status and state-file counts, then exit

    Flow: probe the service ``/health`` endpoint, collect files with
    ``find_files``, push each one through ``ingest_file``, persist the state
    after every file, archive successfully ingested files (paths containing
    ``/bookmarks/`` are left in place), then trigger the janitor and log the
    final service status.
    """
    parser = argparse.ArgumentParser(description="Ingest knowledge files into KG service")
    parser.add_argument("--limit", type=int, default=None, help="Max files to process")
    parser.add_argument("--resume", action="store_true", help="Skip already-processed files")
    parser.add_argument("--force-reingest", action="store_true", help="Clear state and reprocess all files")
    parser.add_argument("--status", action="store_true", help="Show current status")
    args = parser.parse_args()

    # Status check
    if args.status:
        try:
            r = requests.get(f"{KG_SERVICE}/status", timeout=5)
            print(json.dumps(r.json(), indent=2))
        except Exception as e:
            print(f"Service error: {e}")
        state = load_state()
        done = sum(1 for v in state.values() if v.get("status") == "done")
        failed = sum(1 for v in state.values() if v.get("status") == "failed")
        print(f"\nState file: {done} done, {failed} failed, {len(state)} total")
        return

    # Health check
    try:
        r = requests.get(f"{KG_SERVICE.replace('/api/v1', '')}/health", timeout=5)
        if r.status_code != 200:
            log("ERROR: KG service not healthy")
            return
        log("KG service healthy")
    except Exception as e:
        log(f"ERROR: Cannot reach KG service: {e}")
        return

    state = {} if args.force_reingest else load_state()
    if args.force_reingest:
        log("Force reingest: clearing state, reprocessing all files")
    files = find_files(KNOWLEDGE_DIR, limit=args.limit, resume=(args.resume and not args.force_reingest), state=state)
    log(f"Found {len(files)} files to process")

    total_ents = 0
    total_rels = 0
    done_count = 0
    fail_count = 0

    for i, (fp, size_kb) in enumerate(files):
        log(f"[{i+1}/{len(files)}] Processing: {fp} ({size_kb:.1f}KB)")
        result = ingest_file(fp, size_kb)

        # ingest_file reports read errors as a "failed" result instead of
        # raising, so the file may be gone or unreadable at this point; a
        # missing mtime must not abort the remaining files.
        try:
            mtime = os.path.getmtime(fp)
        except OSError:
            mtime = None
        state[fp] = {"status": result["status"], **result, "mtime": mtime, "timestamp": datetime.utcnow().isoformat()}

        # Archive successfully ingested file (skip bookmarks for karakeep sync)
        if result.get("status") == "done" and "/bookmarks/" not in fp:
            try:
                archive_path = archive_file(fp, KNOWLEDGE_DIR)
                log(f" -> Archived to {archive_path}")
            except Exception as e:
                log(f" -> Archive failed: {e}")

        # Persist after every file so an interrupted run can resume.
        save_state(state)

        if result["status"] == "done":
            total_ents += result.get("entities", 0)
            total_rels += result.get("relationships", 0)
            done_count += 1
            log(f" -> {result['entities']} entities, {result['relationships']} rels in {result.get('elapsed', 0)}s")
        elif result["status"] == "skipped":
            log(f" -> Skipped: {result.get('reason')}")
        else:
            fail_count += 1
            log(f" -> FAILED: {result.get('error', 'unknown')}")

    # Final summary
    log(f"\n=== INGESTION COMPLETE ===")
    log(f"Files processed: {done_count} done, {fail_count} failed")
    log(f"Entities added: {total_ents}")
    log(f"Relationships added: {total_rels}")

    # Run janitor after ingest (normalize + orphans, skip fuzzy for speed)
    if done_count > 0:
        log("\nRunning janitor (normalize + orphans)...")
        try:
            r = requests.post(f"{KG_SERVICE.replace('/api/v1', '')}/api/v1/janitor",
                              json={"passes": ["normalize", "orphans"], "dry_run": False}, timeout=60)
            if r.status_code == 200:
                result = r.json()
                total = result.get("total_actions", 0)
                log(f"Janitor: {total} cleanup actions performed")
            else:
                log(f"Janitor: returned status {r.status_code}")
        except Exception as e:
            log(f"Janitor: error - {e}")

    # Get final status from service (best effort, but never swallow
    # KeyboardInterrupt/SystemExit and always leave a trace in the log)
    try:
        r = requests.get(f"{KG_SERVICE}/status", timeout=5)
        svc_status = r.json()
        log(f"Service status: {svc_status['entities']} entities, {svc_status['relationships']} relationships, {svc_status['documents']} documents")
    except Exception as e:
        log(f"Service status unavailable: {e}")
class OrphanConnector:
    """Connects orphan entities to hub entities via LLM.

    Orphans and hubs are fetched from the KG client, an OpenAI-style
    ``/chat/completions`` endpoint is asked to propose relationships, and
    proposals at or above ``CONFIDENCE_THRESHOLD`` are written back through
    ``kg.create_relationship``. Progress is persisted to ``state_file`` so a
    later run can skip orphans that were already handled.
    """

    LLM_TIMEOUT = 60
    CONFIDENCE_THRESHOLD = 0.7
    MIN_HUB_DEGREE = 5
    BATCH_SIZE = 5
    RATE_LIMIT_DELAY = 2.0

    def __init__(self, kg_client: "KGClient", config: Dict[str, Any]):
        """Store the KG client and read state-file / LLM settings from ``config``."""
        self.kg = kg_client
        self.config = config
        self.state_file = config.get("orphan_state_file",
                                     "/a0/usr/workdir/state/kg_orphan_state.json")
        self.session = requests.Session()
        self.llm_url = config.get("llm_api_url", "http://192.168.1.245:8000/v1")
        self.llm_model = config.get("llm_model", "default")

    def _load_state(self) -> "ConnectorState":
        """Load state from file; fall back to a fresh state when missing or unreadable."""
        if self.state_file and os.path.exists(self.state_file):
            try:
                with open(self.state_file) as f:
                    return ConnectorState.from_dict(json.load(f))
            except Exception as e:
                logger.warning(f"Failed to load state: {e}")
        return ConnectorState()

    def _save_state(self, state: "ConnectorState") -> None:
        """Save state to file, creating the parent directory when there is one."""
        if self.state_file:
            parent = os.path.dirname(self.state_file)
            # A bare filename has no parent component; os.makedirs("") raises.
            if parent:
                os.makedirs(parent, exist_ok=True)
            with open(self.state_file, "w") as f:
                json.dump(state.to_dict(), f, indent=2)

    def infer_relationships(self, orphans: List["OrphanEntity"],
                            hubs: List[Dict]) -> List["InferredRelationship"]:
        """Use LLM to infer relationships between orphans and hubs.

        Only the first 50 hubs are included in the prompt. Returns the
        proposals whose confidence reaches ``CONFIDENCE_THRESHOLD``; returns an
        empty list when the request fails or no JSON can be extracted.
        Malformed individual proposals are skipped instead of discarding the
        whole response.
        """
        orphan_list = "\n".join(
            f"- Name: '{o.name}', Type: {o.type}"
            for o in orphans
        )
        hub_list = "\n".join(
            f"- Name: '{h['name']}', Type: {h['type']}"
            for h in hubs[:50]
        )

        prompt = f"""You are a knowledge graph relationship inference engine.
Suggest logical relationships between ORPHAN entities and HUB entities.

ORPHANS (low connectivity):
{orphan_list}

HUBS (well-connected):
{hub_list}

For each orphaned entity, propose 1-3 logical connections.

INSTRUCTIONS:
1. Only suggest semantically meaningful relationships
2. Use specific terms (e.g., "mentioned_in", "part_of", "related_to")
3. Assign confidence 0.0-1.0 based on semantic strength
4. Provide brief reasoning

Return ONLY JSON:
{{"relationships": [
{{"source_name": "...", "target_name": "...", "relation": "...",
 "confidence": 0.85, "reasoning": "..."}}
]}}"""

        try:
            response = self.session.post(
                f"{self.llm_url}/chat/completions",
                json={
                    "model": self.llm_model,
                    "messages": [{"role": "user", "content": prompt}],
                    "temperature": 0.3,
                    "max_tokens": 2000,
                },
                timeout=self.LLM_TIMEOUT
            )
            response.raise_for_status()
            result = response.json()
            content = result["choices"][0]["message"].get("content", "")

            parsed = self._extract_json(content)
            if not parsed:
                return []

            relationships = []
            for rel_data in parsed.get("relationships") or []:
                if not isinstance(rel_data, dict):
                    continue
                # Models sometimes emit the confidence as a string or null.
                try:
                    confidence = float(rel_data.get("confidence", 0))
                except (TypeError, ValueError):
                    continue
                if confidence < self.CONFIDENCE_THRESHOLD:
                    continue
                try:
                    relationships.append(InferredRelationship(
                        source_name=rel_data["source_name"],
                        target_name=rel_data["target_name"],
                        relation=rel_data["relation"],
                        confidence=confidence,
                        reasoning=rel_data.get("reasoning", ""),
                    ))
                except KeyError:
                    # One incomplete proposal must not drop the valid ones.
                    continue
            return relationships

        except Exception as e:
            logger.error(f"LLM inference failed: {e}")
            return []

    def _extract_json(self, content: str) -> Optional[Dict]:
        """Extract a JSON object from an LLM response.

        Tries the whole response first, then fenced code blocks, then the
        widest ``{..."relationships"...}`` span. Returns ``None`` when nothing
        parses to a JSON *object* (lists and scalars are rejected).
        """
        import re

        if not isinstance(content, str) or not content.strip():
            return None

        def _as_dict(text: str) -> Optional[Dict]:
            try:
                parsed = json.loads(text.strip())
            except (ValueError, RecursionError):
                return None
            return parsed if isinstance(parsed, dict) else None

        direct = _as_dict(content)
        if direct is not None:
            return direct

        patterns = [
            r'```json\s*(.*?)\s*```',
            r'```\s*(.*?)\s*```',
            r'\{.*"relationships".*\}',
        ]
        for pattern in patterns:
            for match in re.findall(pattern, content, re.DOTALL):
                found = _as_dict(match)
                if found is not None:
                    return found
        return None

    def process_batch(self, orphans: List["OrphanEntity"],
                      hubs: List[Dict], state: "ConnectorState",
                      dry_run: bool = False) -> tuple:
        """Process a batch of orphans.

        Returns ``(orphans_in_batch, relationships_connected,
        relationships_failed)``. ``state.processed_ids`` and
        ``state.failed_ids`` are keyed by orphan *id*, so each inferred
        relationship is mapped back to the batch orphan it mentions (source
        first, then target) before those sets are consulted or updated.
        """
        if not orphans or not hubs:
            return 0, 0, 0

        inferred = self.infer_relationships(orphans, hubs)
        time.sleep(self.RATE_LIMIT_DELAY)

        ids_by_name = {o.name: o.id for o in orphans}
        connected = 0
        failed = 0

        for rel in inferred:
            orphan_id = ids_by_name.get(rel.source_name)
            if orphan_id is None:
                orphan_id = ids_by_name.get(rel.target_name)

            if orphan_id is not None and orphan_id in state.processed_ids:
                continue

            if dry_run:
                logger.info(f"[DRY RUN] Would connect: {rel.source_name} -> {rel.target_name}")
                connected += 1
            else:
                success = self.kg.create_relationship(
                    source_name=rel.source_name,
                    target_name=rel.target_name,
                    rel_type=rel.relation
                )
                if success:
                    connected += 1
                else:
                    failed += 1
                    # Only ids belong in failed_ids; run() filters on o.id.
                    if orphan_id is not None:
                        state.failed_ids.add(orphan_id)

        for orphan in orphans:
            state.processed_ids.add(orphan.id)

        return len(orphans), connected, failed

    def run(self, batch_size: int = 5, max_batches: Optional[int] = None,
            dry_run: bool = False) -> Dict[str, Any]:
        """Main processing loop.

        Fetches orphans and hubs, skips orphans already recorded in the saved
        state, processes the rest in slices of ``batch_size`` (at most
        ``max_batches`` slices when given) and saves the state after every
        slice. Returns ``{"status": "no_orphans"}`` when the KG reports none,
        otherwise a summary dict with cumulative counters.
        """
        state = self._load_state()

        # Fetch orphans and hubs
        orphan_data = self.kg.get_orphans(limit=5000)
        hubs = self.kg.get_hubs(top_n=50, min_degree=self.MIN_HUB_DEGREE)

        if not orphan_data:
            return {"status": "no_orphans"}

        # Convert to OrphanEntity
        orphans = [
            OrphanEntity(
                id=item["id"],
                name=item["name"],
                type=item.get("type", ""),
                domain=item.get("domain", ""),
                confidence=item.get("confidence", 0),
                degree=item.get("degree", 0),
            )
            for item in orphan_data
        ]

        state.total_orphans = len(orphans)

        # Filter already processed
        to_process = [
            o for o in orphans
            if o.id not in state.processed_ids and o.id not in state.failed_ids
        ]

        logger.info(f"Processing {len(to_process)} orphans")

        batches_processed = 0
        for i in range(0, len(to_process), batch_size):
            if max_batches and batches_processed >= max_batches:
                break

            batch = to_process[i:i + batch_size]
            processed, connected, failed = self.process_batch(batch, hubs, state, dry_run)

            state.processed_count += processed
            state.connected_count += connected
            state.failed_count += failed
            state.last_batch_time = datetime.utcnow().isoformat()
            batches_processed += 1

            self._save_state(state)
            time.sleep(self.RATE_LIMIT_DELAY)

        return {
            "status": "done",
            "total_orphans": state.total_orphans,
            "processed": state.processed_count,
            "connected": state.connected_count,
            "failed": state.failed_count,
        }

    def reset(self) -> bool:
        """Reset state file. Always returns True, whether or not a file existed."""
        if self.state_file and os.path.exists(self.state_file):
            os.remove(self.state_file)
            logger.info(f"State file reset: {self.state_file}")
        return True
def process_chunk(
    self, chunk_index: int, worker_id: int
) -> Dict[str, Any]:
    """Process a single chunk with crash recovery.

    Loads the chunk's file list, drops files already present in the KG or in
    this worker's checkpoint (matched by basename), pushes the remaining
    files through ``kg.add_content`` and writes a checkpoint every 10 files.

    On resume, files that failed or were skipped earlier are part of the new
    work list again, so their old counters and failure records are dropped
    before the loop starts; otherwise they would be counted twice and the
    checkpoint could never be cleared. When the run ends with failures the
    final progress is checkpointed so a later resume does not redo pushes.

    Returns:
        ``{"status": "no_files", ...}`` when nothing is left to do, else a
        ``"done"`` summary with pushed/failed/skipped counters.
    """
    self._log(worker_id, f"Starting chunk {chunk_index}")
    resumed = False

    # Load existing checkpoint for resume
    cp = checkpoint.load_checkpoint(worker_id, chunk_index)
    checkpoint_processed: set = set()
    cp_failed: List[dict] = []
    cp_stats: Dict = {"pushed": 0, "failed": 0, "skipped": 0}

    if cp is not None:
        resumed = True
        checkpoint_processed = set(cp.get("processed_files", []))
        cp_failed = list(cp.get("failed_files", []))
        cp_stats = dict(cp.get("stats", cp_stats))
        self._log(
            worker_id,
            f"RESUMING from checkpoint: "
            f"{len(checkpoint_processed)} already done",
        )

    files = self.load_chunk(chunk_index)
    self._log(
        worker_id,
        f"Loaded {len(files)} files from chunk {chunk_index}",
    )

    processed = self.get_processed_files()
    self._log(
        worker_id,
        f"Found {len(processed)} already-processed in KG",
    )

    to_process = [
        f for f in files
        if os.path.basename(f) not in processed
        and os.path.basename(f) not in checkpoint_processed
        and os.path.exists(f) and os.path.getsize(f) > 100
    ]
    self._log(
        worker_id,
        f"After dedup: {len(to_process)} files to process",
    )

    if not to_process and not cp_failed:
        checkpoint.clear_checkpoint(worker_id, chunk_index)
        return {
            "status": "no_files", "processed": 0, "resumed": resumed,
        }

    pushed = cp_stats.get("pushed", 0)

    # Earlier failures that are retried in this run get a fresh verdict, so
    # neither their record nor their count may be carried over.
    retry_names = {os.path.basename(f) for f in to_process}
    failed_list: List[dict] = [
        rec for rec in cp_failed
        if not (isinstance(rec, dict) and rec.get("file") in retry_names)
    ]
    retried = len(cp_failed) - len(failed_list)
    failed = max(0, cp_stats.get("failed", 0) - retried)

    # Skipped files are never recorded as processed, so every one of them is
    # re-evaluated (and re-counted) below.
    skipped = 0

    local_processed: List[str] = list(checkpoint_processed)
    start_time = time.time()
    compressor = None

    for i, fpath in enumerate(to_process, 1):
        try:
            with open(fpath, "r", encoding="utf-8",
                      errors="ignore") as f:
                content = f.read()

            if len(content.strip()) < 100:
                skipped += 1
                continue

            # Apply token compression before truncation. The compressor is
            # created once, on first use, and cached on the worker.
            if compressor is None:
                compressor = getattr(self, "compressor", None)
                if compressor is None:
                    from .token_compressor import TokenCompressor
                    compressor = self.compressor = TokenCompressor(self.config)
            content = compressor.compress(content)

            if len(content) > self.max_chars:
                content = content[:self.max_chars]
                content += "\n\n[...truncated...]"

            rel_path = os.path.relpath(
                fpath, "/a0/usr/workdir/"
            )
            domain = self.get_domain(rel_path)

            result = self.kg.add_content(
                content, f"workdir/{rel_path}", domain
            )

            if "error" in result:
                failed += 1
                failed_list.append({
                    "file": os.path.basename(fpath),
                    "error": str(result["error"])[:100],
                })
            else:
                pushed += 1
                local_processed.append(os.path.basename(fpath))
                self.audit.append(
                    action="add",
                    target_type="document",
                    target_id=fpath,
                    source=f"parallel_worker:{worker_id}",
                    metadata={
                        "domain": domain,
                        "chunk": chunk_index,
                    },
                )

            if i % 10 == 0:
                stats = {
                    "pushed": pushed, "failed": failed,
                    "skipped": skipped,
                }
                checkpoint.save_checkpoint(
                    worker_id, chunk_index,
                    local_processed, len(files),
                    failed_list, stats,
                )

            if i % 50 == 0:
                elapsed = time.time() - start_time
                rate = pushed / (elapsed / 3600) if elapsed > 0 else 0
                self._log(
                    worker_id,
                    f"[{i}/{len(to_process)}] "
                    f"pushed={pushed} failed={failed} "
                    f"skipped={skipped} | {rate:.1f}/h",
                )

        except Exception as e:
            failed += 1
            failed_list.append({
                "file": os.path.basename(fpath),
                "error": str(e)[:100],
            })
            # Only the first few errors are logged to keep the log readable.
            if failed <= 10:
                self._log(
                    worker_id,
                    f"ERROR: {os.path.basename(fpath)} "
                    f"- {str(e)[:60]}",
                )

        time.sleep(0.1)

    elapsed = time.time() - start_time
    self._log(
        worker_id,
        f"COMPLETE: {pushed} pushed, {failed} failed, "
        f"{skipped} skipped in {elapsed:.0f}s",
    )

    # Clear checkpoint only on clean completion; otherwise persist the final
    # progress, which the every-10-files save may not have captured.
    if failed == 0:
        checkpoint.clear_checkpoint(worker_id, chunk_index)
    else:
        checkpoint.save_checkpoint(
            worker_id, chunk_index,
            local_processed, len(files),
            failed_list,
            {"pushed": pushed, "failed": failed, "skipped": skipped},
        )

    return {
        "status": "done",
        "processed": len(to_process),
        "pushed": pushed,
        "failed": failed,
        "skipped": skipped,
        "elapsed_seconds": round(elapsed, 1),
        "resumed": resumed,
    }
def main():
    """Run phase 2: build the FAISS manifest, then push each entry to the KG API.

    Reads the delta list from ``DELTA_FILE``, writes the readable entries to
    ``FAISS_MANIFEST`` and POSTs each entry's content to ``KG_API``. The
    module-level ``results`` dict is updated in place and written to
    ``RESULTS_FILE``; that write happens even when the ingestion loop is
    interrupted, so partial progress is not lost.
    """
    # Measured locally so the progress lines do not depend on a module global
    # that is only assigned after this function is defined.
    start_time = time.time()

    print("=" * 60)
    print("Phase 2 Ingestion: KG HTTP + FAISS Manifest")
    print("=" * 60)

    with open(DELTA_FILE, 'r') as f:
        delta = json.load(f)

    results['scanned'] = len(delta)
    print(f"Files to process: {len(delta)}")

    # Phase 1: Build FAISS manifest (for memory_save calls)
    faiss_queue = []
    for file_info in delta:
        path = file_info['path']
        score = file_info.get('score', 0)
        reason = file_info.get('reason', 'unknown')
        full_path = os.path.join(WORKDIR, path)

        if not os.path.exists(full_path):
            print(f" SKIP (not found): {path}")
            continue

        try:
            with open(full_path, 'r', errors='replace') as f:
                content = f.read()
        except Exception as e:
            print(f" SKIP (read error): {path}: {e}")
            continue

        summary = content[:MAX_FAISS_CHARS]
        faiss_queue.append({
            'path': path,
            'score': score,
            'reason': reason,
            'size': file_info.get('size', 0),
            'summary': summary,
            'full_content': content[:MAX_KG_CHARS]
        })

    results['faiss_queued'] = len(faiss_queue)

    # Save FAISS manifest for memory_save tool calls
    with open(FAISS_MANIFEST, 'w') as f:
        json.dump(faiss_queue, f, indent=2)
    print(f"\nFAISS manifest saved: {len(faiss_queue)} files -> {FAISS_MANIFEST}")

    # Phase 2: KG ingestion
    print(f"\nStarting KG ingestion...")
    kg_success = 0
    kg_fail = 0
    kg_failures = []

    try:
        for i, item in enumerate(faiss_queue):
            path = item['path']
            score = item['score']
            content = item['full_content']

            print(f" [{i+1}/{len(faiss_queue)}] [{score}] {path}...", end=' ', flush=True)

            try:
                payload = {
                    'content': content,
                    'source_path': f'workdir/strategic/{os.path.basename(path)}',
                    'domain': 'context'
                }
                resp = requests.post(KG_API, json=payload, timeout=KG_TIMEOUT)
                if resp.status_code == 200:
                    kg_success += 1
                    print(f"OK")
                else:
                    kg_fail += 1
                    err_msg = f"HTTP {resp.status_code}: {resp.text[:200]}"
                    print(f"FAIL ({err_msg})")
                    kg_failures.append({'path': path, 'score': score, 'error': err_msg})
            except requests.exceptions.Timeout:
                kg_fail += 1
                print(f"TIMEOUT")
                kg_failures.append({'path': path, 'score': score, 'error': 'timeout'})
            except Exception as e:
                kg_fail += 1
                print(f"ERROR: {e}")
                kg_failures.append({'path': path, 'score': score, 'error': str(e)})

            time.sleep(KG_DELAY)

            # Progress every 20 files
            if (i + 1) % 20 == 0:
                elapsed = time.time() - start_time
                print(f"\n--- Progress: {i+1}/{len(faiss_queue)} ({elapsed:.0f}s elapsed) ---")
                print(f" KG: {kg_success} ok / {kg_fail} fail")
    finally:
        # Save results, including after Ctrl-C or an unexpected error above.
        results['kg_success'] = kg_success
        results['kg_fail'] = kg_fail
        results['kg_failures'] = kg_failures
        results['completed_at'] = time.strftime('%Y-%m-%dT%H:%M:%S')

        with open(RESULTS_FILE, 'w') as f:
            json.dump(results, f, indent=2)

    print(f"\n{'=' * 60}")
    print(f"KG INGESTION COMPLETE")
    print(f"{'=' * 60}")
    print(f"Files scanned: {results['scanned']}")
    print(f"FAISS queued: {results['faiss_queued']}")
    print(f"KG success: {kg_success}")
    print(f"KG failed: {kg_fail}")
    print(f"Results saved: {RESULTS_FILE}")
    print(f"FAISS manifest: {FAISS_MANIFEST}")
# Matches a date that stands alone on its line ("12 March 2024"), optionally
# followed by category lines that each consist of one capitalised word.
# Both parts are anchored to whole lines: without the anchors the pattern also
# removed a date that merely starts a sentence and swallowed the first word of
# the line after it. Only the month name is matched case-insensitively, so the
# category lines still have to start with a capital letter.
_CATEGORY_DATE_HEADERS: Pattern = re.compile(
    r'^\s*\d{1,2}\s+((?i:January|February|March|April|May|June|'
    r'July|August|September|October|November|December))\s+\d{4}[ \t]*$'
    r'(?:\n+[ \t]*[A-Z][a-z]+[ \t]*$)*',
    re.MULTILINE,
)
def compress(self, content: str, source_path: str = "") -> str:
    """Apply all compression steps to content.

    Pipeline: cache lookup, regex clean-up (boilerplate, whitespace,
    duplicate lines, URLs, non-ASCII), LLM summarisation for results above
    ``llm_threshold_chars`` (when enabled), then smart truncation down to
    that same threshold. Running totals are kept in ``self.stats``.

    Args:
        content: Raw content string to compress.
        source_path: Source file path for context.

    Returns:
        Compressed content string. Input shorter than 200 characters, or any
        input while compression is disabled, is returned unchanged.
    """
    if not self.enabled:
        return content

    if not content or len(content) < 200:
        return content

    original_size = len(content)

    # Check cache first
    if self.cache_enabled:
        cached = self._check_cache(content)
        if cached is not None:
            self.stats['cache_hits'] += 1
            logger.debug(f"Cache hit for content (original {original_size:,} chars)")
            return cached
        self.stats['cache_misses'] += 1

    self.stats['total_calls'] += 1
    self.stats['total_original_chars'] += original_size

    # Step 1: Regex compression (fast, always applied)
    result = self._strip_boilerplate(content)
    result = self._collapse_whitespace(result)
    result = self._remove_duplicate_lines(result)
    result = self._clean_urls(result)
    result = self._strip_non_ascii(result)

    # Step 2: LLM summarization (only for large files)
    if self.llm_enabled and len(result) > self.llm_threshold_chars:
        result = self._llm_summarize(result, source_path)

    # Step 3: Smart truncation (preserve entity-rich sections).
    # The limit must be the configured threshold: with the helper's default
    # of 30000 a smaller threshold would trigger this branch without ever
    # shortening the text.
    if len(result) > self.llm_threshold_chars:
        result = self._smart_truncate(
            result, max_chars=self.llm_threshold_chars
        )

    compressed_size = len(result)
    self.stats['total_compressed_chars'] += compressed_size

    reduction = original_size - compressed_size
    pct = (reduction / original_size) * 100 if original_size > 0 else 0

    logger.info(
        f"Compressed {original_size:,} → {compressed_size:,} chars "
        f"({pct:.1f}% reduction)"
    )

    if pct < self.min_reduction_pct:
        self.stats['warnings'].append(
            f"Low compression: {pct:.1f}% < {self.min_reduction_pct}% target"
        )
        logger.warning(
            f"Compression below threshold: {pct:.1f}% < "
            f"{self.min_reduction_pct}%"
        )

    self.stats['total_reduction_pct'] = (
        (self.stats['total_original_chars']
         - self.stats['total_compressed_chars'])
        / self.stats['total_original_chars'] * 100
        if self.stats['total_original_chars'] > 0 else 0
    )

    # Save to cache
    if self.cache_enabled:
        self._save_to_cache(content, result)

    return result
def _llm_summarize(self, content: str, source_path: str = "") -> str:
    """Use LLM to summarize large content while preserving entity information.

    Only the first 20K characters are sent to the chat-completions endpoint
    configured in ``self.llm_api_url``. The summary replaces that leading
    part only; whatever follows it is appended unchanged, so entities that
    appear later in the document are not silently discarded (the caller may
    still truncate the combined text).

    Args:
        content: Pre-compressed (regex already applied) content
        source_path: Source file path for context

    Returns:
        Summary of the leading part plus the untouched remainder, or the
        original content when the LLM is disabled, fails, or answers with
        100 characters or fewer.
    """
    if not self.llm_enabled:
        return content

    # Send first 20K chars (enough context for summarization)
    head_limit = 20000
    content_to_summarize = content[:head_limit]
    remainder = content[head_limit:]

    prompt = (
        "Summarize the following content for knowledge extraction. "
        "Preserve ALL named entities (people, organizations, products, "
        "technologies, concepts, locations). Remove repetitive descriptions, "
        "examples, and filler. Keep the essential information and relationships. "
        "Output a concise summary:\n\n"
        f"{content_to_summarize}"
    )

    payload = {
        "model": self.llm_model,
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": self.llm_max_output_tokens,
        "temperature": 0.1,
    }

    try:
        req = urllib.request.Request(
            self.llm_api_url,
            data=json.dumps(payload).encode('utf-8'),
            headers={
                'Content-Type': 'application/json',
            },
            method='POST',
        )

        with urllib.request.urlopen(req, timeout=60) as response:
            response_data = json.loads(response.read().decode('utf-8'))
            message = response_data.get('choices', [{}])[0].get('message', {})
            summary = message.get('content', '')

            # Fallback: Qwen models may put reasoning in reasoning_content field
            if not summary or len(summary) < 50:
                summary = message.get('reasoning_content', '')

            if summary and len(summary) > 100:
                self.stats['llm_calls'] += 1
                logger.info(
                    f"LLM summarization: {len(content):,} → {len(summary):,} chars"
                )
                # Keep the part the model never saw.
                if remainder:
                    return summary + "\n\n" + remainder
                return summary
            else:
                logger.warning("LLM returned empty or too short summary")
                return content

    except urllib.error.HTTPError as e:
        self.stats['llm_errors'] += 1
        logger.error(f"LLM HTTP error: {e.code} - {e.read().decode('utf-8', errors='ignore')}")
        return content
    except urllib.error.URLError as e:
        self.stats['llm_errors'] += 1
        logger.error(f"LLM URL error: {e.reason}")
        return content
    except Exception as e:
        self.stats['llm_errors'] += 1
        logger.error(f"LLM summarization failed: {e}")
        return content
+ + Args: + content: Original content (for hash generation) + compressed: Compressed content to cache + """ + if not self.cache_enabled: + return + + content_hash = hashlib.md5(content.encode()).hexdigest() + cache_path = self._get_cache_path(content_hash) + + try: + with open(cache_path, 'w', encoding='utf-8') as f: + f.write(compressed) + except Exception as e: + logger.warning(f"Failed to write cache: {e}") + + def _smart_truncate(self, content: str, max_chars: int = 30000) -> str: + """Truncate content preferring sections with more entity signals. + + Instead of cutting at max_chars, prefer sections with more entity signals + (capitalized words, technical terms, product names). + + Args: + content: Content to truncate + max_chars: Maximum characters to keep + + Returns: + Truncated content with entity-rich sections prioritized + """ + if len(content) <= max_chars: + return content + + # Split into paragraphs + paragraphs = content.split('\n\n') + + # Score each paragraph by entity signals + def entity_score(p: str) -> float: + score = 0.0 + # Capitalized words (likely entities) + caps = len(re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', p)) + score += caps * 0.5 + # Technical terms + tech = len(re.findall( + r'\b(?:API|SDK|CLI|HTTP|REST|SQL|NoSQL|ML|AI|LLM|SaaS|' + r'IaaS|Docker|Kubernetes)\b', p, re.IGNORECASE + )) + score += tech * 2.0 + # Length (prefer substance) + score += min(len(p) / 100, 5.0) + return score + + # Sort by score descending, take top paragraphs that fit + scored = [(p, entity_score(p)) for p in paragraphs] + scored.sort(key=lambda x: x[1], reverse=True) + + result: List[str] = [] + total = 0 + for p, score in scored: + if total + len(p) + 2 <= max_chars: + result.append(p) + total += len(p) + 2 + + return '\n\n'.join(result) + + def _strip_boilerplate(self, content: str) -> str: + """Remove known boilerplate patterns. + + Args: + content: Content to process. + + Returns: + Content with boilerplate removed. 
+ """ + # Remove share buttons (line by line) + content = self._SHARE_BUTTONS.sub('', content) + content = self._SHARE_STANDALONE.sub('', content) + + # Remove author bylines (handle single line and multi-line) + content = self._AUTHOR_BYLINE_MULTILINE.sub('', content) + content = self._AUTHOR_BYLINE.sub('', content) + + # Remove reading time indicators + content = self._READING_TIME.sub('', content) + + # Remove date+category headers + content = self._CATEGORY_DATE_HEADERS.sub('', content) + + # Remove standalone social URLs + content = self._SOCIAL_URLS.sub('', content) + + # Remove repeated titles (title appears in both heading and subheading) + content = self._REPEATED_TITLES.sub(r'\1 \2', content) + + return content + + def _collapse_whitespace(self, content: str) -> str: + """Collapse multiple blank lines to max 2 (paragraph break). + + Args: + content: Content to process. + + Returns: + Content with collapsed whitespace. + """ + # Replace 3+ newlines with 2 (paragraph break preserves structure) + content = re.sub(r'\n{3,}', '\n\n', content) + # Replace tabs with spaces + content = re.sub(r'\t+', ' ', content) + # Collapse multiple spaces (but keep newlines) + content = re.sub(r' +', ' ', content) + # Remove trailing whitespace per line + content = re.sub(r' +$', '', content, flags=re.MULTILINE) + # Collapse any remaining multiple blank lines + content = re.sub(r'\n\n+', '\n\n', content) + return content + + def _remove_duplicate_lines(self, content: str) -> str: + """Remove consecutive duplicate lines. + + Args: + content: Content to process. + + Returns: + Content with duplicate lines removed. + """ + lines = content.split('\n') + result: List[str] = [] + prev_line = None + + for line in lines: + stripped = line.strip() + if stripped == prev_line and stripped: + continue + result.append(line) + prev_line = stripped + + return '\n'.join(result) + + def _clean_urls(self, content: str) -> str: + """Remove tracking query parameters from URLs. 
+ + Args: + content: Content to process. + + Returns: + Content with cleaned URLs. + """ + def clean_url_match(match: re.Match) -> str: + url = match.group(0) + try: + parsed = urlparse(url) + if parsed.query: + params: List[str] = [] + for param in parsed.query.split('&'): + if '=' in param: + key = param.split('=')[0] + if key.lower() not in self._UTM_PARAMS: + params.append(param) + else: + params.append(param) + new_query = '&'.join(params) if params else '' + parsed = parsed._replace(query=new_query) + return urlunparse(parsed) if new_query else parsed.path + return url + except Exception as e: + logger.debug(f"URL cleaning failed for {url}: {e}") + return url + + # Find URLs and clean them + url_pattern = r'https?://[^\s<>"\)`\]\n]+' + return re.sub(url_pattern, clean_url_match, content) + + def _strip_non_ascii(self, content: str) -> str: + """Remove non-ASCII characters except CJK, emojis, and common symbols. + + Args: + content: Content to process. + + Returns: + Content with non-ASCII cleaned. + """ + result: List[str] = [] + for char in content: + code = ord(char) + # ASCII + if code < 128: + result.append(char) + # CJK Unified Ideographs, Extensions + elif 0x4E00 <= code <= 0x9FFF: + result.append(char) + # CJK Extension A + elif 0x3400 <= code <= 0x4DBF: + result.append(char) + # CJK Extension B-F + elif 0x20000 <= code <= 0x2EBFF: + result.append(char) + # Hiragana, Katakana + elif 0x3040 <= code <= 0x309F or 0x30A0 <= code <= 0x30FF: + result.append(char) + # Hangul + elif 0xAC00 <= code <= 0xD7AF: + result.append(char) + # Emojis (common range) + elif 0x1F300 <= code <= 0x1F9FF: + result.append(char) + # Replace control characters with space + elif code < 0x20: + result.append(' ') + # Skip other non-ASCII (likely noise) + else: + continue + + return ''.join(result) + + def get_stats(self) -> Dict[str, Any]: + """Get compression statistics. + + Returns: + Dictionary with compression statistics. 
def reset_stats(self) -> None:
    """Replace ``self.stats`` with a fresh, zeroed statistics dictionary.

    All counters restart at integer 0, the accumulated reduction percentage
    at 0.0, and the warnings list is a new empty list.
    """
    fresh = {}
    for counter in ('total_calls', 'total_original_chars', 'total_compressed_chars'):
        fresh[counter] = 0
    fresh['total_reduction_pct'] = 0.0
    for counter in ('llm_calls', 'llm_errors', 'cache_hits', 'cache_misses'):
        fresh[counter] = 0
    fresh['warnings'] = []
    self.stats = fresh
You handle batch operations for the Knowledge Graph system - ingestion, enrichment, auditing, and maintenance. + +## Available Methods + +| Method | Purpose | Key Args | +|--------|---------|----------| +| `status` | Check KG service health and doc counts | — | +| `ingest` | Ingest single file or directory | `filepath`, `directory`, `limit`, `resume`, `force_reingest` | +| `bulk_ingest` | Bulk ingest with deduplication | `directory`, `pattern`, `dry_run` | +| `elastic_ingest` | Elastic KB ingestion with domain mapping | `category`, `dry_run`, `skip_export_check` | +| `parallel_ingest` | Parallel chunk processing | `chunk`, `worker_id` | +| `connect_orphans` | Connect orphan entities via LLM | `batch_size`, `max_batches`, `dry_run`, `reset` | +| `enrich` | Enrich entities with domain/categories | `limit`, `offset`, `dry_run` | +| `audit` | Run retrieval quality audit | `sample`, `save_report` | +| `extract` | Extract entities from a file | `filepath` | +| `retry_failed` | Retry docs that failed during worker runs | (auto-detects from worker logs) | +| `knowledge_ingest` | Knowledge file ingestion with state tracking, archiving, janitor | `limit`, `resume`, `force_reingest`, `status_only` | +| `gdrive_upload` | Export KG and upload to Google Drive | `filepath` (optional, auto-exports if omitted) | + +## Configuration + +All settings come from `default_config.yaml`: +- `kg_service_url`: KG service endpoint (AICube:8010) +- `llm_api_url`: LLM endpoint for enrichment/extraction (Spark:8000) +- `llm_model`: LLM model name +- `batch_size`: Processing batch size +- `timeout`: HTTP timeout (seconds) +- `max_retries`: Retry count for failed requests +- `elastic_kb_dir`: Elastic KB file directory +- `knowledge_dir`: Knowledge files directory +- `log_dir`: Log output directory +- `ingest_state_file`: Progress tracking JSON + +## Architecture + +This plugin handles **batch operations**. 
For real-time queries, use `kg_tools` (kg_search, kg_add, kg_query, kg_insights, kg_hubs, kg_communities, kg_surprises, kg_bridges, kg_suggest_questions). + +## Response Format + +Always returns structured JSON: +```json +{"status": "ok", "pushed": 100, "failed": 0, "skipped": 0} +``` + +On error: +```json +{"status": "error", "message": "error description"} +``` + +## Safety Rules + +1. Always check service health before long operations (`status` method) +2. Respect rate limits — add delays between API calls +3. Save state periodically for resume capability +4. Log all operations to configured log directory +5. Handle errors gracefully — never crash diff --git a/usr/plugins/_kg_pipeline/tests/__init__.py b/usr/plugins/_kg_pipeline/tests/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/usr/plugins/_kg_pipeline/tests/test_audit_chain.py b/usr/plugins/_kg_pipeline/tests/test_audit_chain.py new file mode 100644 index 0000000000..12c40ec49b --- /dev/null +++ b/usr/plugins/_kg_pipeline/tests/test_audit_chain.py @@ -0,0 +1,190 @@ +"""Tests for AuditChain append-only audit trail.""" +import json +import os +import shutil +import time +import unittest +from datetime import date, datetime, timedelta, timezone + +import sys +sys.path.insert(0, "/a0/usr/plugins/_kg_pipeline") +from pipeline.audit_chain import AuditChain + + +class TestAuditChain(unittest.TestCase): + """Unit tests for AuditChain.""" + + def setUp(self) -> None: + """Create temp audit directory for each test.""" + self.audit_dir = f"/tmp/test_audit_{os.getpid()}" + os.makedirs(self.audit_dir, exist_ok=True) + + def tearDown(self) -> None: + """Remove temp audit directory after each test.""" + shutil.rmtree(self.audit_dir, ignore_errors=True) + + def _make_chain(self, enabled: bool = True) -> AuditChain: + """Create an AuditChain pointing at the test directory.""" + return AuditChain(self.audit_dir, enabled=enabled) + + def _read_events(self) -> list: + """Read all events from today's 
audit file.""" + today = date.today().isoformat() + filepath = os.path.join(self.audit_dir, f"kg_audit_{today}.jsonl") + events = [] + if not os.path.exists(filepath): + return events + with open(filepath, "r") as f: + for line in f: + line = line.strip() + if line: + events.append(json.loads(line)) + return events + + def test_append_creates_file(self) -> None: + """Append an event and verify the file exists with valid JSON.""" + chain = self._make_chain() + chain.append( + action="add", + target_type="document", + target_id="/test/file.md", + source="test:test", + ) + events = self._read_events() + self.assertEqual(len(events), 1) + self.assertEqual(events[0]["action"], "add") + self.assertEqual(events[0]["target_type"], "document") + self.assertEqual(events[0]["target_id"], "/test/file.md") + self.assertEqual(events[0]["source"], "test:test") + self.assertIn("timestamp", events[0]) + + def test_append_multiple_events(self) -> None: + """Append 5 events and verify all present.""" + chain = self._make_chain() + for i in range(5): + chain.append( + action="add", + target_type="document", + target_id=f"/test/file{i}.md", + source="test:test", + metadata={"index": i}, + ) + events = self._read_events() + self.assertEqual(len(events), 5) + for i, ev in enumerate(events): + self.assertEqual(ev["target_id"], f"/test/file{i}.md") + + def test_query_by_action(self) -> None: + """Append mixed events, query by action='add'.""" + chain = self._make_chain() + chain.append(action="add", target_type="entity", + target_id="e1", source="test") + chain.append(action="update", target_type="entity", + target_id="e2", source="test") + chain.append(action="add", target_type="document", + target_id="d1", source="test") + chain.append(action="delete", target_type="entity", + target_id="e3", source="test") + + adds = chain.query(action="add") + self.assertEqual(len(adds), 2) + self.assertTrue(all(e["action"] == "add" for e in adds)) + + def test_query_by_source(self) -> None: + """Append 
def test_disabled_is_noop(self) -> None:
    """enabled=False: no files should be created and queries/stats report nothing."""
    noop_dir = f"/tmp/test_audit_noop_{os.getpid()}"
    # Remove leftovers from an earlier aborted run (PIDs are reused), which
    # would otherwise make the existence check below fail spuriously.
    shutil.rmtree(noop_dir, ignore_errors=True)
    # Register cleanup before any assertion can fail, so the directory is
    # removed even when the test does not reach its last line.
    self.addCleanup(shutil.rmtree, noop_dir, ignore_errors=True)

    chain = AuditChain(noop_dir, enabled=False)
    chain.append(
        action="add", target_type="document",
        target_id="x", source="test",
    )
    self.assertFalse(os.path.exists(noop_dir))
    results = chain.query()
    self.assertEqual(results, [])
    stats = chain.get_stats()
    self.assertEqual(stats["total_events"], 0)
source="test") + + stats = chain.get_stats() + self.assertEqual(stats["total_events"], 3) + self.assertEqual(stats["events_by_action"]["add"], 2) + self.assertEqual(stats["events_by_action"]["update"], 1) + self.assertEqual(stats["file_count"], 1) + self.assertGreater(stats["total_size_bytes"], 0) + + def test_daily_rotation(self) -> None: + """Append events to different dates, verify separate files.""" + chain = self._make_chain() + + # Write to today + chain.append(action="add", target_type="document", + target_id="today.md", source="test") + + # Manually write to a different date file + yesterday = (date.today() - timedelta(days=1)).isoformat() + yesterday_file = os.path.join( + self.audit_dir, f"kg_audit_{yesterday}.jsonl" + ) + with open(yesterday_file, "a") as f: + f.write(json.dumps({ + "timestamp": yesterday + "T00:00:00Z", + "action": "add", + "target_type": "document", + "target_id": "yesterday.md", + "source": "test", + "metadata": {}, + }) + "\n") + + stats = chain.get_stats() + self.assertEqual(stats["file_count"], 2) + self.assertEqual(stats["total_events"], 2) + + # Query should find both + all_events = chain.query(limit=10) + self.assertEqual(len(all_events), 2) + + +if __name__ == "__main__": + unittest.main() diff --git a/usr/plugins/_kg_pipeline/tests/test_checkpoint.py b/usr/plugins/_kg_pipeline/tests/test_checkpoint.py new file mode 100644 index 0000000000..dbe89f39d7 --- /dev/null +++ b/usr/plugins/_kg_pipeline/tests/test_checkpoint.py @@ -0,0 +1,150 @@ +"""Unit tests for checkpoint crash recovery module.""" +import json +import os +import sys +import tempfile +import unittest +from datetime import datetime, timezone, timedelta +from unittest.mock import patch + +# Add pipeline dir to path for direct test execution +_pipeline_dir = os.path.join( + os.path.dirname(os.path.dirname(os.path.abspath(__file__))), + "pipeline", +) +if _pipeline_dir not in sys.path: + sys.path.insert(0, _pipeline_dir) + +import checkpoint + + +class 
def test_atomic_write(self) -> None:
    """A corrupt checkpoint file loads as None and a later save replaces it.

    This does not interrupt a write in progress. It places invalid JSON at
    worker_2_chunk_3.json inside the patched STATE_DIR (presumably the path
    load_checkpoint(2, 3) reads; confirm against the checkpoint module) and
    checks that loading yields None and that saving afterwards round-trips.
    """
    # Write a corrupt file directly to the checkpoint path
    cp_path = os.path.join(
        self.tmpdir, "worker_2_chunk_3.json"
    )
    with open(cp_path, "w") as f:
        f.write('{"broken json')

    # Invalid JSON must be reported as "no checkpoint", not raise.
    loaded = checkpoint.load_checkpoint(2, 3)
    self.assertIsNone(loaded)

    # Verify save still works after corrupt file
    checkpoint.save_checkpoint(
        2, 3, ["a.md"], 5, [],
        {"pushed": 1, "failed": 0, "skipped": 0},
    )
    loaded = checkpoint.load_checkpoint(2, 3)
    self.assertIsNotNone(loaded)
    self.assertEqual(loaded["processed_files"], ["a.md"])
def test_stale_detection(self) -> None:
    """A checkpoint stamped 48 hours ago is reported as stale with a 24-hour TTL."""
    two_days_ago = datetime.now(timezone.utc) - timedelta(hours=48)
    old_ts = two_days_ago.isoformat()

    counters = {"pushed": 1, "failed": 0, "skipped": 0}
    payload = {
        "worker_id": 5,
        "chunk_index": 1,
        "processed_files": ["old.md"],
        "processed_count": 1,
        "total_files": 10,
        "failed_files": [],
        "stats": counters,
        "last_checkpoint": old_ts,
    }

    # Write the checkpoint file by hand so its timestamp can be back-dated.
    target = os.path.join(self.tmpdir, "worker_5_chunk_1.json")
    with open(target, "w") as handle:
        json.dump(payload, handle)

    stale_entries = checkpoint.list_stale_checkpoints(ttl_hours=24)
    self.assertEqual(len(stale_entries), 1)
    self.assertEqual(stale_entries[0]["worker_id"], 5)
class TestEntityResolver(unittest.TestCase):
    """Shared fixture base: builds an EntityResolver around a mocked KG client."""

    def setUp(self):
        """Create the mock KG, the resolver configuration, and the resolver under test."""
        kg_stub = Mock()
        settings = dict(
            llm_url="http://test-llm:8000/v1/chat/completions",
            llm_model="test-model",
            string_threshold=0.80,
            token_threshold=0.60,
            llm_verify=True,
            batch_size=10,
            llm_sleep=0.01,  # Fast for tests
        )
        self.mock_kg = kg_stub
        self.config = settings
        self.resolver = EntityResolver(kg_stub, settings)
self.assertGreater(result, 0.70) # Should be high due to shared tokens + + def test_jaro_winkler_typo(self): + """Minor typo should still have good similarity.""" + result = self.resolver._jaro_winkler("kubernetes", "kubernetis") + self.assertGreater(result, 0.85) + + def test_jaro_winkler_different(self): + """'Docker' vs 'Kubernetes' should be low-moderate similarity.""" + result = self.resolver._jaro_winkler("docker", "kubernetes") + # Both share some character patterns (r, e, t, s) giving moderate similarity + self.assertLess(result, 0.75) + self.assertGreater(result, 0) # But not zero + + def test_jaro_winkler_partial(self): + """Partial match should work.""" + result = self.resolver._jaro_winkler("sled central", "sled") + self.assertGreater(result, 0.60) + + def test_jaro_winkler_empty(self): + """Empty strings should return 0.0 except identical empty (1.0).""" + self.assertEqual(self.resolver._jaro_winkler("", "test"), 0.0) + self.assertEqual(self.resolver._jaro_winkler("test", ""), 0.0) + # Two empty strings are identical, so similarity is 1.0 + self.assertEqual(self.resolver._jaro_winkler("", ""), 1.0) + + def test_jaro_winkler_single_char(self): + """Single character strings.""" + result = self.resolver._jaro_winkler("a", "a") + self.assertEqual(result, 1.0) + result = self.resolver._jaro_winkler("a", "b") + self.assertEqual(result, 0.0) + + +class TestTokenOverlap(TestEntityResolver): + """Test token overlap calculation.""" + + def test_token_overlap_high(self): + """'Elastic Stack' vs 'ELK Stack' - 2/2 overlap.""" + result = self.resolver._token_overlap("Elastic Stack", "ELK Stack") + # Tokenizes to {"elastic", "stack"} and {"elk", "stack"} + # Intersection = {"stack"}, min = 1, overlap = 1/2 = 0.5 + self.assertGreater(result, 0.4) + + def test_token_overlap_full(self): + """Full token overlap.""" + result = self.resolver._token_overlap("State Local Education", "State Local Education") + self.assertEqual(result, 1.0) + + def 
test_token_overlap_low(self): + """No shared tokens should be 0.""" + result = self.resolver._token_overlap("Docker", "Kubernetes") + self.assertEqual(result, 0.0) + + def test_token_overlap_partial(self): + """Partial token overlap.""" + result = self.resolver._token_overlap("Elastic Security", "Elastic Stack") + # {"elastic", "security"} vs {"elastic", "stack"} + # Intersection = {"elastic"}, min = 2, overlap = 1/2 = 0.5 + self.assertEqual(result, 0.5) + + def test_token_overlap_empty(self): + """Empty strings should return 0.0.""" + self.assertEqual(self.resolver._token_overlap("", "test"), 0.0) + self.assertEqual(self.resolver._token_overlap("test", ""), 0.0) + + def test_token_overlap_acronym(self): + """Acronym matching.""" + result = self.resolver._token_overlap("SLED", "State Local Education") + # Single token vs 3 tokens, no exact match + self.assertEqual(result, 0.0) + + +class TestFindCandidates(TestEntityResolver): + """Test candidate finding with type grouping.""" + + def test_find_candidates_empty_kg(self): + """Empty KG should return empty candidates.""" + self.mock_kg.query_cypher.return_value = [] + result = self.resolver.find_candidates() + self.assertEqual(result, []) + + def test_find_candidates_groups_by_type(self): + """Should only compare entities of same type.""" + self.mock_kg.query_cypher.return_value = [ + {"name": "Elastic Stack", "type": "technology", "mention_count": 5}, + {"name": "ELK Stack", "type": "technology", "mention_count": 3}, + {"name": "Docker", "type": "technology", "mention_count": 10}, + {"name": "SLED Team", "type": "organization", "mention_count": 2}, + ] + result = self.resolver.find_candidates(similarity_threshold=0.70) + + # All candidates should be within the same type + for candidate in result: + self.assertIsNotNone(candidate["type"]) + # Elastic Stack and ELK Stack should be found + if candidate["name_a"] == "Elastic Stack" and candidate["name_b"] == "ELK Stack": + self.assertEqual(candidate["type"], 
"technology") + + def test_find_candidates_filter_by_type(self): + """Should filter to specific entity type.""" + self.mock_kg.query_cypher.return_value = [ + {"name": "Elastic Stack", "type": "technology", "mention_count": 5}, + {"name": "SLED Team", "type": "organization", "mention_count": 2}, + ] + result = self.resolver.find_candidates(entity_type="technology") + + # Verify query was filtered + call_args = self.mock_kg.query_cypher.call_args[0][0] + self.assertIn("e.type = 'technology'", call_args) + + def test_find_candidates_excludes_same_name(self): + """Should not compare entity to itself.""" + self.mock_kg.query_cypher.return_value = [ + {"name": "Elastic", "type": "technology", "mention_count": 5}, + {"name": "Elastic", "type": "technology", "mention_count": 5}, + ] + result = self.resolver.find_candidates() + + # Should not have pairs with same name + names_in_pairs = set() + for c in result: + if c["name_a"] == c["name_b"]: + self.fail("Same name pair found") + + def test_find_candidates_returns_dicts(self): + """Each candidate should be a dict with required keys.""" + self.mock_kg.query_cypher.return_value = [ + {"name": "Elastic Stack", "type": "technology", "mention_count": 5}, + {"name": "ELK Stack", "type": "technology", "mention_count": 3}, + ] + result = self.resolver.find_candidates(similarity_threshold=0.70) + + self.assertIsInstance(result, list) + if result: + for candidate in result: + self.assertIsInstance(candidate, dict) + self.assertIn("name_a", candidate) + self.assertIn("name_b", candidate) + self.assertIn("type", candidate) + self.assertIn("similarity", candidate) + self.assertIn("token_overlap", candidate) + self.assertIn("method", candidate) + + +class TestLLMVerifyPair(TestEntityResolver): + """Test LLM verification calls.""" + + @patch("requests.Session.post") + def test_llm_verify_pair_makes_request(self, mock_post): + """Should POST to LLM endpoint with correct payload.""" + mock_response = Mock() + 
mock_response.json.return_value = { + "choices": [{"message": {"content": "YES confidence 95"}}] + } + mock_response.raise_for_status.return_value = None + mock_post.return_value = mock_response + + result = self.resolver._llm_verify_pair("Entity A", "Entity B", "type") + + mock_post.assert_called_once() + call_kwargs = mock_post.call_args[1]["json"] + self.assertEqual(call_kwargs["model"], "test-model") + self.assertEqual(call_kwargs["max_tokens"], 100) + self.assertEqual(call_kwargs["temperature"], 0.1) + self.assertIn("Entity A", call_kwargs["messages"][0]["content"]) + + @patch("requests.Session.post") + def test_llm_verify_pair_parses_yes(self, mock_post): + """Should parse YES from LLM response.""" + mock_response = Mock() + mock_response.json.return_value = { + "choices": [{"message": {"content": "YES confidence 95"}}] + } + mock_response.raise_for_status.return_value = None + mock_post.return_value = mock_response + + result = self.resolver._llm_verify_pair("A", "B", "type") + self.assertEqual(result["verdict"], "YES") + + @patch("requests.Session.post") + def test_llm_verify_pair_parses_no(self, mock_post): + """Should parse NO from LLM response.""" + mock_response = Mock() + mock_response.json.return_value = { + "choices": [{"message": {"content": "NO confidence 80"}}] + } + mock_response.raise_for_status.return_value = None + mock_post.return_value = mock_response + + result = self.resolver._llm_verify_pair("A", "B", "type") + self.assertEqual(result["verdict"], "NO") + + @patch("requests.Session.post") + def test_llm_verify_pair_checks_reasoning_content(self, mock_post): + """Should check reasoning_content field for Qwen models.""" + mock_response = Mock() + mock_response.json.return_value = { + "choices": [{ + "message": { + "content": "", + "reasoning_content": "YES confidence 90" + } + }] + } + mock_response.raise_for_status.return_value = None + mock_post.return_value = mock_response + + result = self.resolver._llm_verify_pair("A", "B", "type") + 
self.assertEqual(result["verdict"], "YES") + + @patch("requests.Session.post") + def test_llm_verify_pair_extracts_confidence(self, mock_post): + """Should extract confidence score from response.""" + mock_response = Mock() + mock_response.json.return_value = { + "choices": [{"message": {"content": "YES confidence 85"}}] + } + mock_response.raise_for_status.return_value = None + mock_post.return_value = mock_response + + result = self.resolver._llm_verify_pair("A", "B", "type") + self.assertAlmostEqual(result["confidence"], 0.85, places=2) + + @patch("requests.Session.post") + def test_llm_verify_pair_handles_failure(self, mock_post): + """Should handle LLM failure gracefully.""" + mock_post.side_effect = Exception("Connection error") + + result = self.resolver._llm_verify_pair("A", "B", "type") + self.assertEqual(result["verdict"], "NO") + self.assertEqual(result["confidence"], 0.0) + self.assertIn("error", result["reasoning"]) + + +class TestVerifyCandidates(TestEntityResolver): + """Test candidate verification with LLM.""" + + @patch.object(EntityResolver, "_llm_verify_pair") + def test_verify_candidates_filters_by_verdict(self, mock_verify): + """Should only return candidates with YES verdict.""" + mock_verify.return_value = {"verdict": "YES", "confidence": 0.9, "reasoning": ""} + + candidates = [ + {"name_a": "A", "name_b": "B", "type": "tech"}, + {"name_a": "C", "name_b": "D", "type": "tech"}, + ] + result = self.resolver.verify_candidates(candidates, batch_size=10) + self.assertEqual(len(result), 2) + + @patch.object(EntityResolver, "_llm_verify_pair") + def test_verify_candidates_skips_on_no_verdict(self, mock_verify): + """Should skip candidates with NO verdict.""" + def side_effect(*args, **kwargs): + # First returns YES, second returns NO + if mock_verify.call_count == 1: + return {"verdict": "YES", "confidence": 0.9, "reasoning": ""} + return {"verdict": "NO", "confidence": 0.3, "reasoning": ""} + + mock_verify.side_effect = side_effect + + candidates = 
[ + {"name_a": "A", "name_b": "B", "type": "tech"}, + {"name_a": "C", "name_b": "D", "type": "tech"}, + ] + result = self.resolver.verify_candidates(candidates, batch_size=10) + self.assertEqual(len(result), 1) + + @patch.object(EntityResolver, "_llm_verify_pair") + def test_verify_candidates_with_llm_disabled(self, mock_verify): + """Should use high similarity filter when LLM disabled.""" + self.resolver.config["llm_verify"] = False + + candidates = [ + {"name_a": "A", "name_b": "B", "type": "tech", "similarity": 0.99}, + {"name_a": "C", "name_b": "D", "type": "tech", "similarity": 0.80}, + ] + result = self.resolver.verify_candidates(candidates) + # Only the 0.99 similarity should pass + self.assertEqual(len(result), 1) + + +class TestMergeLogic(TestEntityResolver): + """Test merge logic with degree-based canonical selection.""" + + def test_merge_keeps_higher_degree_entity(self): + """Entity with more relationships should be canonical.""" + # First entity has degree 5, second has degree 2 + self.mock_kg.query_cypher.side_effect = [ + [{"degree": 5}], # degree for name_a + [{"degree": 2}], # degree for name_b + [], # relationships of duplicate (empty) + ] + + duplicates = [ + {"name_a": "Popular Entity", "name_b": "Less Popular", "type": "tech", "llm_confidence": 0.9}, + ] + result = self.resolver.merge_duplicates(duplicates, dry_run=True) + + # Popular Entity should be canonical + detail = result["details"][0] + self.assertEqual(detail["canonical"], "Popular Entity") + self.assertEqual(detail["duplicate"], "Less Popular") + + def test_merge_dry_run_no_changes(self): + """Dry run should not make KG changes.""" + self.mock_kg.query_cypher.side_effect = [ + [{"degree": 5}], + [{"degree": 2}], + ] + + duplicates = [ + {"name_a": "A", "name_b": "B", "type": "tech", "llm_confidence": 0.9}, + ] + result = self.resolver.merge_duplicates(duplicates, dry_run=True) + + # No create or delete queries should be called + for call_args in 
self.mock_kg.query_cypher.call_args_list: + query = call_args[0][0] + self.assertNotIn("DELETE", query.upper()) + self.assertNotIn("CREATE", query.upper()) + + self.assertTrue(result["dry_run"]) + + def test_merge_logs_to_audit(self): + """Should log merges to audit trail.""" + self.mock_kg.query_cypher.side_effect = [ + [{"degree": 5}], + [{"degree": 2}], + [], # relationships + ] + + duplicates = [ + {"name_a": "Canonical", "name_b": "Duplicate", "type": "tech", "llm_confidence": 0.95}, + ] + result = self.resolver.merge_duplicates(duplicates, dry_run=True) + + self.assertGreater(len(self.resolver.audit_log), 0) + audit_entry = self.resolver.audit_log[0] + self.assertEqual(audit_entry["action"], "merge") + self.assertEqual(audit_entry["metadata"]["canonical"], "Canonical") + self.assertEqual(audit_entry["target_id"], "Duplicate") + + def test_merge_skips_missing_degrees(self): + """Should skip pairs where degrees can't be determined (returns None).""" + self.mock_kg.query_cypher.side_effect = [ + [{"degree": 5}], + Exception("Connection error"), # Exception returns None + ] + + duplicates = [ + {"name_a": "A", "name_b": "B", "type": "tech", "llm_confidence": 0.9}, + ] + result = self.resolver.merge_duplicates(duplicates, dry_run=True) + + # When _get_entity_degree returns None due to exception, pair is skipped + self.assertEqual(result["skipped"], 1) + self.assertEqual(result["merged"], 0) + + +class TestFullPipeline(TestEntityResolver): + """Test full pipeline stages.""" + + def test_run_candidates_stage(self): + """Should run only candidate finding for 'candidates' stage.""" + self.mock_kg.query_cypher.return_value = [ + {"name": "A", "type": "t", "mention_count": 1}, + {"name": "B", "type": "t", "mention_count": 1}, + ] + + result = self.resolver.run(stage="candidates", dry_run=True) + self.assertEqual(result["stage"], "candidates") + self.assertIn("candidates", result) + self.assertIn("count", result) + + def test_run_unknown_stage(self): + """Should return 
error for unknown stage.""" + result = self.resolver.run(stage="invalid", dry_run=True) + self.assertEqual(result["status"], "error") + + def test_run_full_stage(self): + """Should run all stages for 'full' mode.""" + self.mock_kg.query_cypher.return_value = [] + + result = self.resolver.run(stage="full", dry_run=True) + self.assertEqual(result["stage"], "full") + self.assertIn("candidates", result) + self.assertIn("verified", result) + + +class TestConfigDefaults(unittest.TestCase): + """Test configuration defaults.""" + + def test_default_config(self): + """Should use sensible defaults.""" + mock_kg = Mock() + resolver = EntityResolver(mock_kg) # No config + + self.assertEqual(resolver.config.get("llm_verify"), None) # Not in config, checked via get + self.assertEqual(resolver.llm_url, "http://192.168.1.250:11435/v1/chat/completions") + self.assertEqual(resolver.llm_model, "Qwen3.6-35B-A3B-MTP-UD-Q5_K_XL.gguf") + + +if __name__ == "__main__": + unittest.main() diff --git a/usr/plugins/_kg_pipeline/tests/test_health_scorer.py b/usr/plugins/_kg_pipeline/tests/test_health_scorer.py new file mode 100644 index 0000000000..540dad0178 --- /dev/null +++ b/usr/plugins/_kg_pipeline/tests/test_health_scorer.py @@ -0,0 +1,281 @@ +"""Tests for HealthScorer entity health scoring and tiered memory.""" +import math +import os +import sys +import unittest +from datetime import datetime, timezone, timedelta +from unittest.mock import MagicMock, patch + +# Add plugin dir to path so `pipeline` package is importable +_plugin_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +if _plugin_dir not in sys.path: + sys.path.insert(0, _plugin_dir) + +from pipeline.health_scorer import HealthScorer + + +def _make_scorer(config: dict = None) -> HealthScorer: + """Create a HealthScorer with a mock KG client.""" + mock_client = MagicMock() + return HealthScorer(mock_client, config), mock_client + + +def _recent_dt(days_ago: int = 0) -> str: + """Return ISO datetime string for N 
days ago.""" + dt = datetime.now(timezone.utc) - timedelta(days=days_ago) + return dt.isoformat() + + +class TestScoreConnectivity(unittest.TestCase): + """Tests for _score_connectivity dimension.""" + + def test_score_connectivity_high(self) -> None: + """Entity with degree=50 vs max=100 should be ~0.92.""" + scorer, _ = _make_scorer() + result = scorer._score_connectivity(degree=50, max_degree=100) + expected = math.log(51) / math.log(101) + self.assertAlmostEqual(result, expected, places=3) + self.assertGreater(result, 0.85) + self.assertLess(result, 1.0) + + def test_score_connectivity_zero(self) -> None: + """Entity with degree=0 should return 0.0.""" + scorer, _ = _make_scorer() + result = scorer._score_connectivity(degree=0, max_degree=100) + self.assertEqual(result, 0.0) + + def test_score_connectivity_max_zero(self) -> None: + """When max_degree is 0, return 0.0 (avoid div-by-zero).""" + scorer, _ = _make_scorer() + result = scorer._score_connectivity(degree=0, max_degree=0) + self.assertEqual(result, 0.0) + + +class TestScoreRecency(unittest.TestCase): + """Tests for _score_recency dimension.""" + + def test_score_recency_recent(self) -> None: + """Entity seen within 7 days should return 1.0.""" + scorer, _ = _make_scorer() + result = scorer._score_recency(_recent_dt(3)) + self.assertEqual(result, 1.0) + + def test_score_recency_stale(self) -> None: + """Entity last seen 365+ days ago should return ~0.1.""" + scorer, _ = _make_scorer() + result = scorer._score_recency(_recent_dt(400)) + self.assertAlmostEqual(result, 0.1, places=2) + + def test_score_recency_none(self) -> None: + """Entity with no last_seen should return 0.0.""" + scorer, _ = _make_scorer() + result = scorer._score_recency(None) + self.assertEqual(result, 0.0) + + def test_score_recency_empty_string(self) -> None: + """Entity with empty last_seen should return 0.0.""" + scorer, _ = _make_scorer() + result = scorer._score_recency("") + self.assertEqual(result, 0.0) + + +class 
TestScoreSourceQuality(unittest.TestCase): + """Tests for _score_source_quality dimension.""" + + def test_score_source_quality_high(self) -> None: + """High mention_count + rich categories should score high.""" + scorer, _ = _make_scorer() + result = scorer._score_source_quality( + mention_count=10, categories="devops,docker,cloud,infra,ci-cd" + ) + # mention_score = min(10/5, 1.0) * 0.5 = 0.5 + # category_score = min(5/5, 1.0) * 0.5 = 0.5 + self.assertAlmostEqual(result, 1.0, places=2) + + def test_score_source_quality_low(self) -> None: + """Low mentions + no categories should score low.""" + scorer, _ = _make_scorer() + result = scorer._score_source_quality(mention_count=0, categories="") + self.assertEqual(result, 0.0) + + +class TestAssignTier(unittest.TestCase): + """Tests for _assign_tier.""" + + def test_assign_tier_hot(self) -> None: + """Score >= 0.7 should be 'hot'.""" + scorer, _ = _make_scorer() + self.assertEqual(scorer._assign_tier(0.85), "hot") + self.assertEqual(scorer._assign_tier(0.7), "hot") + + def test_assign_tier_warm(self) -> None: + """Score >= 0.5 and < 0.7 should be 'warm'.""" + scorer, _ = _make_scorer() + self.assertEqual(scorer._assign_tier(0.6), "warm") + self.assertEqual(scorer._assign_tier(0.5), "warm") + + def test_assign_tier_cool(self) -> None: + """Score >= 0.3 and < 0.5 should be 'cool'.""" + scorer, _ = _make_scorer() + self.assertEqual(scorer._assign_tier(0.4), "cool") + self.assertEqual(scorer._assign_tier(0.3), "cool") + + def test_assign_tier_cold(self) -> None: + """Score < 0.3 should be 'cold'.""" + scorer, _ = _make_scorer() + self.assertEqual(scorer._assign_tier(0.2), "cold") + self.assertEqual(scorer._assign_tier(0.0), "cold") + + +class TestComputeScore(unittest.TestCase): + """Tests for _compute_score full scoring.""" + + def test_compute_score_returns_dimensions(self) -> None: + """Verify all 5 dimensions present in result.""" + scorer, _ = _make_scorer() + result = scorer._compute_score( + name="TestEntity", + 
entity_type="technology", + domain="tech", + categories="a,b,c", + confidence=0.9, + mention_count=5, + first_seen=_recent_dt(30), + last_seen=_recent_dt(1), + degree=10, + max_degree=100, + ) + for dim in ["total", "connectivity", "recency", + "source_quality", "freshness", "confidence"]: + self.assertIn(dim, result, f"Missing dimension: {dim}") + self.assertGreaterEqual(result["total"], 0.0) + self.assertLessEqual(result["total"], 1.0) + + +class TestTierDistribution(unittest.TestCase): + """Tests for get_tier_distribution.""" + + def test_tier_distribution(self) -> None: + """Mock entities and verify distribution counts.""" + scorer, mock_client = _make_scorer() + mock_client.query_cypher.side_effect = [ + # _fetch_entities response + [ + {"name": "A", "type": "tech", "domain": "x", + "categories": "a", "confidence": 0.9, + "mention_count": 5, "first_seen": _recent_dt(10), + "last_seen": _recent_dt(1)}, + {"name": "B", "type": "tech", "domain": "x", + "categories": "", "confidence": 0.3, + "mention_count": 0, "first_seen": _recent_dt(200), + "last_seen": _recent_dt(100)}, + {"name": "C", "type": "tech", "domain": "x", + "categories": "a,b,c,d,e", "confidence": 1.0, + "mention_count": 10, "first_seen": _recent_dt(5), + "last_seen": _recent_dt(1)}, + ], + # _fetch_degrees response + [ + {"name": "A", "degree": 5}, + {"name": "B", "degree": 1}, + {"name": "C", "degree": 50}, + ], + ] + dist = scorer.get_tier_distribution() + self.assertEqual(set(dist.keys()), {"hot", "warm", "cool", "cold"}) + total = sum(dist.values()) + self.assertEqual(total, 3) + + +class TestCriticalEntities(unittest.TestCase): + """Tests for get_critical_entities.""" + + def test_critical_entities(self) -> None: + """Verify lowest-scored entities returned first.""" + scorer, mock_client = _make_scorer() + mock_client.query_cypher.side_effect = [ + [ + {"name": "Good", "type": "tech", "domain": "x", + "categories": "a,b,c", "confidence": 1.0, + "mention_count": 10, "first_seen": _recent_dt(2), 
+ "last_seen": _recent_dt(0)}, + {"name": "Bad", "type": "tech", "domain": "x", + "categories": "", "confidence": 0.2, + "mention_count": 0, "first_seen": _recent_dt(300), + "last_seen": _recent_dt(200)}, + ], + [ + {"name": "Good", "degree": 50}, + {"name": "Bad", "degree": 0}, + ], + ] + critical = scorer.get_critical_entities(limit=10) + self.assertGreater(len(critical), 0) + self.assertEqual(critical[0]["name"], "Bad") + self.assertLess(critical[0]["total"], critical[-1]["total"]) + + +class TestCacheInvalidation(unittest.TestCase): + """Tests for cache mechanism.""" + + def test_cache_invalidation(self) -> None: + """Score, cache, clear, re-score — cache is properly invalidated.""" + scorer, mock_client = _make_scorer() + mock_client.query_cypher.side_effect = [ + # First score_entities call + [{"name": "X", "type": "tech", "domain": "x", + "categories": "a", "confidence": 0.5, + "mention_count": 1, "first_seen": _recent_dt(10), + "last_seen": _recent_dt(1)}], + [{"name": "X", "degree": 5}], + # After cache clear, score_entities again + [{"name": "Y", "type": "tech", "domain": "x", + "categories": "b", "confidence": 0.8, + "mention_count": 3, "first_seen": _recent_dt(5), + "last_seen": _recent_dt(0)}], + [{"name": "Y", "degree": 10}], + ] + + # First scoring + result1 = scorer.score_entities() + self.assertEqual(result1["scored"], 1) + self.assertEqual(result1["entities"][0]["name"], "X") + + # Cache should be populated + self.assertIsNotNone(scorer._cache) + + # Clear cache + scorer.clear_cache() + self.assertIsNone(scorer._cache) + + # Re-score with different data + result2 = scorer.score_entities() + self.assertEqual(result2["scored"], 1) + self.assertEqual(result2["entities"][0]["name"], "Y") + + +class TestScoreRanges(unittest.TestCase): + """Verify all scores are within [0.0, 1.0].""" + + def test_all_scores_in_range(self) -> None: + """Extreme inputs should still produce valid scores.""" + scorer, _ = _make_scorer() + result = scorer._compute_score( + 
name="Extreme", + entity_type="test", + domain="x", + categories="", + confidence=0.0, + mention_count=0, + first_seen=None, + last_seen=None, + degree=0, + max_degree=100, + ) + for key, val in result.items(): + self.assertGreaterEqual(val, 0.0, f"{key} below 0.0") + self.assertLessEqual(val, 1.0, f"{key} above 1.0") + + +if __name__ == "__main__": + unittest.main() diff --git a/usr/plugins/_kg_pipeline/tests/test_token_compressor.py b/usr/plugins/_kg_pipeline/tests/test_token_compressor.py new file mode 100644 index 0000000000..66430aacf5 --- /dev/null +++ b/usr/plugins/_kg_pipeline/tests/test_token_compressor.py @@ -0,0 +1,526 @@ +"""Tests for token compressor module.""" +import pytest +from pipeline.token_compressor import TokenCompressor + + +class TestTokenCompressor: + """Test suite for TokenCompressor class.""" + + @pytest.fixture + def compressor(self, tmp_path) -> TokenCompressor: + """Create a TokenCompressor instance with default config. + + Uses tmp_path for isolated cache directory to prevent test pollution. + """ + config = { + 'compression': { + 'enabled': True, + 'min_reduction_pct': 10, + 'log_dir': str(tmp_path), + 'cache_enabled': False, # Disable cache for most tests (enable in specific tests) + } + } + return TokenCompressor(config) + + def test_compress_removes_share_buttons(self, compressor: TokenCompressor) -> None: + """Verify 'Share on Twitter' lines are removed.""" + # Content must be >= 200 chars to trigger compression + content = """# Getting Started with Elasticsearch + +Introduction to the topic. +This is actual content that provides meaningful information. + +Share on Twitter +Share on Facebook +Share on LinkedIn + +More content here with additional details and explanations. +This content is long enough to trigger compression. + +Share + +End of content with final thoughts and conclusions. 
+""" + result = compressor.compress(content) + assert "Share on Twitter" not in result + assert "Share on Facebook" not in result + assert "Share on LinkedIn" not in result + assert "Share" not in result + assert "Introduction to the topic" in result + + def test_compress_removes_author_byline(self, compressor: TokenCompressor) -> None: + """Verify 'By\nFirstname Lastname' bylines are removed.""" + content = """# Title + +By +Sherry Ger + +April 25, 2018 +How to +Getting Started + +This is the actual article content that should be preserved. +It contains useful information that shouldn't be removed. +The content continues for multiple paragraphs. + +Final paragraph with conclusion. +""" + result = compressor.compress(content) + assert "By\nSherry Ger" not in result + assert "Sherry Ger" not in result + assert "Title" in result + assert "actual article content" in result + + def test_compress_collapses_whitespace(self, compressor: TokenCompressor) -> None: + """Verify multiple blank lines are collapsed to max 2.""" + # Needs > 200 chars to trigger compression + content = """# Title + +Paragraph 1 with some content. + + + + + +Paragraph 2. + Tab content. +Paragraph with text here. + +End of content with more details. +This ensures the total length exceeds the minimum threshold. +The content needs to be long enough to trigger compression. +""" + result = compressor.compress(content) + # Should have at most 2 consecutive newlines (paragraph break) + n_newlines = result.count('\n\n\n') + assert n_newlines == 0, f"Found {n_newlines} instances of 3+ newlines" + assert "Title" in result + assert "Paragraph 1" in result + assert "Paragraph 2" in result + + def test_compress_removes_duplicate_lines(self, compressor: TokenCompressor) -> None: + """Verify consecutive duplicate lines are removed.""" + content = """# Title + +Duplicate line +Duplicate line +Duplicate line + +Another line +Another line + +Content here that is preserved. 
+More content to ensure minimum length for compression. +This ensures the content is long enough to trigger compression. +""" + result = compressor.compress(content) + # Count occurrences - should only appear once + lines = result.split('\n') + content_lines = [l.strip() for l in lines if l.strip()] + assert content_lines.count("Duplicate line") == 1 + assert content_lines.count("Another line") == 1 + assert "Content here that is preserved" in result + + def test_compress_cleans_urls(self, compressor: TokenCompressor) -> None: + """Verify tracking params are removed from URLs.""" + content = """# Title + +Visit https://example.com?utm_source=twitter&utm_campaign=share for more. +Or https://site.com?fbclid=123&ref=banner +Direct link: https://example.com/path + +This is actual content that provides information. +The URLs should be cleaned while preserving this text. +Additional content ensures minimum length requirements are met. +Final paragraph. +""" + result = compressor.compress(content) + assert "utm_source" not in result + assert "utm_campaign" not in result + assert "fbclid" not in result + assert "ref=banner" not in result + assert "https://example.com/path" in result + + def test_compress_strips_non_ascii(self, compressor: TokenCompressor) -> None: + """Verify non-ASCII characters are cleaned.""" + content = """# Title + +Content with fancy quotes: "smart quotes". +Some text with non-breaking space here. +Unicode noise: € £ ≠ ≤ ≥ + +Clean content end. +This content provides actual value and should not be removed. +The non-ASCII characters should be stripped from the content. +Final sentence here. 
+""" + result = compressor.compress(content) + # Euro sign, pound, etc should be stripped + assert '€' not in result + assert '£' not in result + # But CJK should be preserved (if present) + assert "Title" in result + assert "Clean content end" in result + + def test_compress_preserves_content(self, compressor: TokenCompressor) -> None: + """Verify actual article text is preserved.""" + content = """# Getting Started with Elasticsearch + +By +Engineering Team + +6 min read + +This is the actual content that matters. +It describes how to use the product effectively. +The content continues with more detailed explanations. + +Share on Twitter +Share on Facebook + +More details here that provide value. +Final thoughts and conclusions here. +""" + result = compressor.compress(content) + assert "elasticsearch" in result.lower() + assert "actual content that matters" in result + assert "describes how to use" in result + assert "More details here" in result + + def test_compress_stats(self) -> None: + """Verify compression stats are tracked correctly.""" + import tempfile + # Use fresh compressor with isolated cache to avoid test isolation issues + with tempfile.TemporaryDirectory() as tmpdir: + config = { + 'compression': { + 'enabled': True, + 'min_reduction_pct': 10, + 'log_dir': tmpdir, + 'cache_enabled': False, # Disable cache for this test + } + } + compressor = TokenCompressor(config) + + assert compressor.get_stats()['total_calls'] == 0 + + # Needs > 200 chars to trigger compression and get stats + content = """# Title + +By +Author Name + +Share on Twitter +Share on Facebook +Share on LinkedIn + +6 min read + +Content here that is meaningful. +Additional content for length to meet minimum threshold. +This extra text ensures compression runs properly. +Final paragraph with conclusion. 
+""" + result = compressor.compress(content) + stats = compressor.get_stats() + + assert stats['total_calls'] == 1 + assert stats['total_original_chars'] > 0 + assert stats['total_compressed_chars'] > 0 + assert 'reduction_percentage' in stats + assert 'average_reduction_per_call' in stats + + def test_real_world_sample(self, compressor: TokenCompressor) -> None: + """Use actual Elastic blog content, verify >=30% reduction.""" + # Sample with more boilerplate to achieve >=30% reduction + content = """# Getting Started with the GCE Discovery Plugin on Google Cloud + +Source: https://www.elastic.co/blog/getting-started-gce-discovery-plugin-on-google-cloud?utm_source=twitter&utm_campaign=share + +--- + +April 25, 2018 +How to +Getting Started with the GCE Discovery Plugin on Google Cloud +By +Sherry Ger +Share +Share on Twitter +Share on Facebook +Share on LinkedIn +Share on Email + +6 min read + +Introduction + +The discovery module in Elasticsearch is responsible for... + +Getting Started with the GCE Discovery Plugin on Google Cloud + +Getting Started with the GCE Discovery Plugin on Google Cloud + +6 min read + +This article explains how to set up the GCE discovery plugin. + +Share on Twitter +Share on Facebook +Share on LinkedIn + +https://twitter.com/elastic +https://facebook.com/elastic +https://linkedin.com/company/elastic + +Getting Started with the GCE Discovery Plugin on Google Cloud + +More content here with useful information and details. +Another paragraph with useful information that should be kept. + +Share on Twitter +Share on Facebook + +Conclusion and final thoughts with summary. +Additional content to ensure compression is triggered and works properly. +The article provides comprehensive coverage of the topic being discussed. +Final paragraph with summary and key takeaways. 
+""" + original_len = len(content) + result = compressor.compress(content) + compressed_len = len(result) + reduction = (original_len - compressed_len) / original_len * 100 + + assert reduction >= 25, f"Reduction was only {reduction:.1f}%, expected >=25%" + assert "discovery module" in result.lower() or "GCE" in result + assert "discovery module" in result.lower() + + def test_compress_disabled(self) -> None: + """Verify compression can be disabled.""" + config = {'compression': {'enabled': False}} + disabled_compressor = TokenCompressor(config) + + content = "Share on Twitter\nContent here that provides information for the reader." + result = disabled_compressor.compress(content) + assert result == content + + def test_compress_short_content(self, compressor: TokenCompressor) -> None: + """Verify short content is returned unchanged.""" + content = "Short content that is minimal." # Less than 200 chars + result = compressor.compress(content) + # Should be unchanged because len < 200 + assert result == content + + def test_compress_handles_urls_without_params(self, compressor: TokenCompressor) -> None: + """Verify URLs without tracking params are preserved.""" + content = """# Title + +Visit https://example.com/docs for info. +Link: https://docs.elastic.co/guide + +This content provides additional information. +More text to ensure minimum length requirements. +Final paragraph here. 
+""" + result = compressor.compress(content) + assert "https://example.com/docs" in result + assert "https://docs.elastic.co/guide" in result + + def test_compressor_initialization(self) -> None: + """Test compressor initializes with default config.""" + comp = TokenCompressor() + assert comp.get_stats()['total_calls'] == 0 + + def test_composite_compression_steps(self, compressor: TokenCompressor) -> None: + """Test that all compression steps work together.""" + content = """# Title + +By +John Author + +April 10, 2024 +How to +Guide Title + +6 min read + +Actual content with useful information. +This content should be preserved during compression. +Additional paragraphs provide more details. + +Share on Twitter +Share + +https://example.com?utm_source=track + +Some text +Some text + +Final paragraph with conclusions. +""" + result = compressor.compress(content) + # Should strip author, date header, share buttons, params + assert "By\nJohn Author" not in result + assert "Share on Twitter" not in result + assert "utm_source" not in result + assert "Actual content" in result + + def test_llm_summarize_called_for_large_content(self, compressor: TokenCompressor) -> None: + """Verify LLM is called for content > 30K chars after regex compression.""" + # Create large content that will exceed threshold after regex + # Use verbose repetitive content that survives regex but triggers _llm_summarize + compressor.llm_enabled = True + compressor.llm_threshold_chars = 2000 # Lower for testing + compressor.reset_stats() + + # Create content that's large enough to trigger LLM (>2000 chars, passes regex) + paragraphs = [] + for i in range(300): + paragraphs.append(f"Paragraph {i}: This is substantial content about Elasticsearch and Kubernetes. " + f"The API requires authentication via token. 
Machine Learning models are deployed.") + content = "# Technical Documentation\n\n" + "\n\n".join(paragraphs) + + # Mock _llm_summarize to verify it's called + llm_called = [False] + + def mock_llm_summarize(content: str, source_path: str = "") -> str: + llm_called[0] = True + return "LLM summarized content about Elasticsearch API and Kubernetes ML deployment." + + # Temporarily replace method + original_summarize = compressor._llm_summarize + compressor._llm_summarize = mock_llm_summarize + + try: + result = compressor.compress(content, source_path="test.txt") + assert llm_called[0], "LLM should have been called for content > 2000 chars" + assert "Elasticsearch API" in result + finally: + compressor._llm_summarize = original_summarize + + def test_llm_summarize_not_called_for_small_content(self, compressor: TokenCompressor) -> None: + """Verify LLM is NOT called for content under threshold (30K chars).""" + compressor.llm_enabled = True + compressor.llm_threshold_chars = 30000 + compressor.reset_stats() + + # Small content that won't trigger LLM + content = "# Small Title\n\nThis is actual content. " + "More content. " * 50 + + result = compressor.compress(content) + stats = compressor.get_stats() + + # LLM should not be called + assert stats['llm_calls'] == 0 + assert stats['llm_errors'] == 0 + assert "Small Title" in result + + def test_llm_fallback_on_error(self) -> None: + """Verify LLM failure gracefully falls back to truncation.""" + import tempfile + + with tempfile.TemporaryDirectory() as tmpdir: + config = { + 'compression': { + 'enabled': True, + 'min_reduction_pct': 10, + 'log_dir': tmpdir, + 'llm_enabled': True, + 'llm_threshold_chars': 500, + 'cache_enabled': False, # Disable for this test + }, + 'llm_api_url': 'http://invalid-endpoint:12345/v1/chat/completions', + } + compressor = TokenCompressor(config) + + # Create large content that triggers LLM + content = "# Title\n\n" + "Content paragraph. 
" * 200 + + # Should not raise exception, should fall back + result = compressor.compress(content) + stats = compressor.get_stats() + + assert result is not None + assert len(result) > 0 + assert stats['llm_errors'] == 1 + + def test_cache_hit_returns_cached(self) -> None: + """Verify compressing same content twice returns cached version.""" + import tempfile + import uuid + + with tempfile.TemporaryDirectory() as tmpdir: + config = { + 'compression': { + 'enabled': True, + 'min_reduction_pct': 10, + 'log_dir': tmpdir, # Cache dir derived from this + 'llm_enabled': False, + 'cache_enabled': True, + } + } + compressor = TokenCompressor(config) + + # Unique content with UUID + unique_id = str(uuid.uuid4()) + content = f"# Test Content {unique_id}\n\n" + f"This is test content {unique_id} that should be cached. " * 40 + + # First call + result1 = compressor.compress(content) + + # Second call should hit cache + result2 = compressor.compress(content) + stats = compressor.get_stats() + + assert result1 == result2 + assert stats['cache_hits'] >= 1 + assert stats['cache_misses'] == 1 # First call was miss + + def test_cache_miss_saves_to_cache(self) -> None: + """Verify new content gets saved to cache.""" + import tempfile + import uuid + + with tempfile.TemporaryDirectory() as tmpdir: + config = { + 'compression': { + 'enabled': True, + 'min_reduction_pct': 10, + 'log_dir': tmpdir, + 'llm_enabled': False, + 'cache_enabled': True, + } + } + compressor = TokenCompressor(config) + + # Unique content with UUID to ensure no collisions + unique_id = str(uuid.uuid4()) + content = f"# Unique Test Content {unique_id}\n\n" + f"Content for cache test {unique_id}. 
" * 40 + + result = compressor.compress(content) + stats = compressor.get_stats() + + assert stats['cache_misses'] >= 1 + assert stats['cache_hits'] == 0 + + def test_smart_truncate_prefers_entity_rich(self, compressor: TokenCompressor) -> None: + """Verify smart truncate prefers paragraphs with more entities.""" + # Create paragraphs with varying entity density + simple_para = "This is a simple paragraph with simple words. " * 10 # Low entity score + entity_para = "Elastic API meets Docker and Kubernetes for ML AI LLM. " * 5 # High entity score + mixed_para = "Salesforce uses Amazon AWS and Microsoft Azure. " * 5 # Medium-high + + content = f"# Title\n\n{simple_para}\n\n{entity_para}\n\n{mixed_para}\n\n{simple_para}" + + result = compressor._smart_truncate(content, max_chars=500) + + # Entity-rich paragraphs should be preserved + assert "Elastic" in result or "Docker" in result or "Kubernetes" in result + assert "API" in result or "ML" in result or "AI" in result + + def test_smart_truncate_under_limit_unchanged(self, compressor: TokenCompressor) -> None: + """Verify smart truncate leaves short content unchanged.""" + short_content = "# Title\n\nThis is short content.\n\nSecond paragraph." + + result = compressor._smart_truncate(short_content, max_chars=500) + + assert result == short_content diff --git a/usr/plugins/_kg_pipeline/tools/__init__.py b/usr/plugins/_kg_pipeline/tools/__init__.py new file mode 100644 index 0000000000..929215163b --- /dev/null +++ b/usr/plugins/_kg_pipeline/tools/__init__.py @@ -0,0 +1,5 @@ +"""KG Pipeline Tools.""" + +from .kg_pipeline import KgPipeline + +__all__ = ["KgPipeline"] diff --git a/usr/plugins/_kg_pipeline/tools/kg_pipeline.py b/usr/plugins/_kg_pipeline/tools/kg_pipeline.py new file mode 100644 index 0000000000..5ab7d8efcc --- /dev/null +++ b/usr/plugins/_kg_pipeline/tools/kg_pipeline.py @@ -0,0 +1,416 @@ +"""KG Pipeline Tool - Consolidated batch operations for Knowledge Graph. 
+ +Provides tool methods for: +- status: Check KG service health and counts +- ingest: Single file or directory ingestion +- bulk_ingest: Bulk ingestion with deduplication +- elastic_ingest: Elastic KB ingestion with domain mapping +- parallel_ingest: Parallel chunk worker processing +- connect_orphans: Connect orphan entities via LLM +- enrich: Entity enrichment with domain/categories +- audit: Retrieval quality auditing +- extract: Entity extraction from files +""" +import os +import sys +import json +import logging +from pathlib import Path +from typing import Dict, Optional, Any + +# Add plugin root to path for helper imports +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from helpers.tool import Tool, Response +from pipeline import ( + KGClient, + Ingester, + ElasticIngester, + ParallelWorker, + OrphanConnector, + KGExtractor, + EntityEnricher, + KGAuditor, + HealthScorer, + EntityResolver, +) +from pipeline.gdrive import KGDriveUploader +import importlib.util + +logger = logging.getLogger(__name__) + + +def _load_config() -> Dict[str, Any]: + """Load config from plugin default_config.yaml.""" + config_path = Path(__file__).parent.parent / "default_config.yaml" + try: + import yaml + with open(config_path) as f: + return yaml.safe_load(f) or {} + except Exception: + return { + "kg_service_url": "http://100.78.79.41:8010", + "batch_size": 50, + "timeout": 300, + "log_dir": "/a0/usr/workdir/logs", + } + + +def _get_client() -> KGClient: + """Create a KG HTTP client from config.""" + cfg = _load_config() + return KGClient( + base_url=cfg.get("kg_service_url", "http://100.78.79.41:8010"), + timeout=cfg.get("timeout", 300), + max_retries=cfg.get("max_retries", 3), + retry_delay=cfg.get("retry_delay", 1.0), + ) + + +class KgPipeline(Tool): + """KG Pipeline batch operations tool for Agent Zero.""" + + async def execute(self, **kwargs) -> Response: + """Route to sub-method based on self.method.""" + method = self.method or kwargs.get("method", "status") + 
cfg = _load_config() + + try: + if method == "status": + result = self._status() + elif method == "ingest": + result = self._ingest(cfg, **kwargs) + elif method == "bulk_ingest": + result = self._bulk_ingest(cfg, **kwargs) + elif method == "elastic_ingest": + result = self._elastic_ingest(cfg, **kwargs) + elif method == "parallel_ingest": + result = self._parallel_ingest(cfg, **kwargs) + elif method == "connect_orphans": + result = self._connect_orphans(cfg, **kwargs) + elif method == "enrich": + result = self._enrich(cfg, **kwargs) + elif method == "audit": + result = self._audit(cfg, **kwargs) + elif method == "extract": + result = self._extract(cfg, **kwargs) + elif method == "retry_failed": + result = self._retry_failed(cfg, **kwargs) + elif method == "knowledge_ingest": + return Response(message=json.dumps(self._knowledge_ingest(cfg, **kwargs)), break_loop=False) + elif method == "gdrive_upload": + result = self._gdrive_upload(cfg, **kwargs) + elif method == "health": + result = self._health(cfg, **kwargs) + elif method == "resolve_entities": + result = self._resolve_entities(cfg, **kwargs) + else: + result = {"status": "error", "message": f"Unknown method: {method}"} + except Exception as e: + logger.error(f"Method {method} failed: {e}") + result = {"status": "error", "message": str(e)} + + msg = json.dumps(result, indent=2) if isinstance(result, dict) else str(result) + return Response(message=msg, break_loop=False) + + def _status(self) -> Dict[str, Any]: + """Check KG service health and document counts.""" + client = _get_client() + health = client.health_check() + kg_status = client.get_status() + return { + "status": "ok", + "service": { + "healthy": health.get("status") == "ok", + "version": health.get("version"), + }, + "documents": kg_status.get("documents", 0), + "entities": kg_status.get("entities", 0), + "relationships": kg_status.get("relationships", 0), + } + + def _ingest(self, cfg: Dict, **kwargs) -> Dict[str, Any]: + """Ingest a file or 
directory into KG.""" + filepath = kwargs.get("filepath") + directory = kwargs.get("directory") + if not filepath and not directory: + return {"status": "error", "message": "Provide filepath or directory"} + + client = _get_client() + ingester = Ingester(client, cfg) + if filepath: + return ingester.ingest_file(filepath) + return ingester.ingest_directory( + directory, + limit=kwargs.get("limit"), + resume=kwargs.get("resume", False), + force_reingest=kwargs.get("force_reingest", False), + ) + + def _bulk_ingest(self, cfg: Dict, **kwargs) -> Dict[str, Any]: + """Bulk ingest files from a directory.""" + directory = kwargs.get("directory", "") + pattern = kwargs.get("pattern", "**/*.md") + dry_run = kwargs.get("dry_run", False) + if not directory: + return {"status": "error", "message": "directory required"} + + client = _get_client() + ingester = Ingester(client, cfg) + + # Dedup check via export + kg_paths = set() + try: + export = client.export_data() + docs = export.get("data", {}).get("docs", []) + kg_paths = {doc.get("path", "") for doc in docs} + except Exception: + pass + + import glob + files = glob.glob(os.path.join(directory, pattern), recursive=True) + pushed, failed, skipped = ingester.bulk_ingest( + files, kg_paths=kg_paths, dry_run=dry_run + ) + return { + "status": "done", + "files_total": len(files), + "pushed": pushed, + "failed": failed, + "skipped": skipped, + } + + def _elastic_ingest(self, cfg: Dict, **kwargs) -> Dict[str, Any]: + """Ingest Elastic KB files into KG.""" + client = _get_client() + elastic = ElasticIngester(client, cfg) + files = elastic.collect_files(kwargs.get("category")) + + kg_paths = set() + if not kwargs.get("skip_export_check", False): + try: + export = client.export_data() + docs = export.get("data", {}).get("docs", []) + kg_paths = {doc.get("path", "") for doc in docs} + except Exception: + pass + + pushed, failed, skipped = elastic.ingest_files( + files, kg_paths=kg_paths, dry_run=kwargs.get("dry_run", False) + ) + 
return { + "status": "done", + "files_total": len(files), + "pushed": pushed, + "failed": failed, + "skipped": skipped, + } + + def _parallel_ingest(self, cfg: Dict, **kwargs) -> Dict[str, Any]: + """Process a single chunk via parallel worker.""" + chunk = kwargs.get("chunk") + worker_id = kwargs.get("worker_id", 1) + if chunk is None: + return {"status": "error", "message": "chunk required"} + + client = _get_client() + worker = ParallelWorker(client, cfg) + return worker.process_chunk(chunk, worker_id) + + def _connect_orphans(self, cfg: Dict, **kwargs) -> Dict[str, Any]: + """Connect orphan entities to hub entities.""" + if kwargs.get("reset", False): + client = _get_client() + connector = OrphanConnector(client, cfg) + connector.reset() + return {"status": "reset"} + + client = _get_client() + connector = OrphanConnector(client, cfg) + return connector.run( + batch_size=kwargs.get("batch_size", 5), + max_batches=kwargs.get("max_batches"), + dry_run=kwargs.get("dry_run", False), + ) + + def _enrich(self, cfg: Dict, **kwargs) -> Dict[str, Any]: + """Enrich entities with domain and categories.""" + client = _get_client() + enricher = EntityEnricher(client, cfg) + return enricher.run_enrichment( + limit=kwargs.get("limit"), + offset=kwargs.get("offset"), + dry_run=kwargs.get("dry_run", False), + ) + + def _audit(self, cfg: Dict, **kwargs) -> Dict[str, Any]: + """Run retrieval audit.""" + client = _get_client() + auditor = KGAuditor(client, cfg) + report = auditor.run_audit(sample=kwargs.get("sample", True)) + + if kwargs.get("save_report", True): + path = auditor.save_report() + report["report_path"] = path + return report + + def _extract(self, cfg: Dict, **kwargs) -> Dict[str, Any]: + """Extract entities from a file.""" + filepath = kwargs.get("filepath", "") + if not filepath: + return {"status": "error", "message": "filepath required"} + + client = _get_client() + extractor = KGExtractor(client, cfg) + return extractor.extract_from_file(filepath) + + def 
_retry_failed(self, cfg: Dict, **kwargs) -> Dict[str, Any]: + """Retry docs that failed during worker ingestion.""" + import glob as glob_mod + log_dir = cfg.get("log_dir", "/a0/usr/workdir/logs") + failed_files = set() + for log_path in glob_mod.glob(os.path.join(log_dir, "kg_worker_*.log")): + with open(log_path) as f: + for line in f: + if "FAILED:" in line: + fname = line.split("FAILED:")[-1].strip().split(" - ")[0].strip() + failed_files.add(fname) + if not failed_files: + return {"status": "ok", "message": "No failed files found", "retried": 0} + elastic_dir = cfg.get("elastic_kb_dir", "/a0/usr/workdir/elastic_kb") + to_retry = [] + for fpath in glob_mod.glob(os.path.join(elastic_dir, "**/*.md"), recursive=True): + if os.path.basename(fpath) in failed_files: + to_retry.append(fpath) + if not to_retry: + return {"status": "ok", "failed_names": len(failed_files), "found": 0, "message": "Failed files not found on disk"} + client = _get_client() + ingester = Ingester(client, cfg) + pushed, failed, skipped = ingester.bulk_ingest(to_retry) + return { + "status": "done", + "failed_names": len(failed_files), + "found_on_disk": len(to_retry), + "pushed": pushed, + "still_failed": failed, + "skipped": skipped, + } + + + def _knowledge_ingest(self, cfg: Dict, **kwargs) -> Dict[str, Any]: + """Knowledge file ingestion with state tracking, archiving, and janitor.""" + import os, sys + os.chdir("/a0/usr/workdir") + spec = importlib.util.spec_from_file_location( + "knowledge_ingester", + "/a0/usr/plugins/_kg_pipeline/pipeline/knowledge_ingester.py" + ) + mod = importlib.util.module_from_spec(spec) + spec.loader.exec_module(mod) + + if kwargs.get("status_only"): + # Status check mode + import requests + try: + r = requests.get(f"{mod.KG_SERVICE}/status", timeout=5) + svc = r.json() + except Exception as e: + svc = {"error": str(e)} + state = mod.load_state() + done = sum(1 for v in state.values() if v.get("status") == "done") + failed = sum(1 for v in state.values() if 
v.get("status") == "failed") + return { + "status": "ok", + "service": svc, + "state": {"done": done, "failed": failed, "total": len(state)} + } + + # Run ingestion + limit = kwargs.get("limit", 100) + resume = kwargs.get("resume", True) + force_reingest = kwargs.get("force_reingest", False) + + # Monkey-patch argparse to avoid sys.argv parsing + import argparse + original_parse = argparse.ArgumentParser.parse_args + argparse.ArgumentParser.parse_args = lambda self, args=None, namespace=None: argparse.Namespace( + limit=limit, resume=resume, force_reingest=force_reingest, status=False + ) + try: + mod.main() + finally: + argparse.ArgumentParser.parse_args = original_parse + + # Load final state + state = mod.load_state() + done = sum(1 for v in state.values() if v.get("status") == "done") + failed = sum(1 for v in state.values() if v.get("status") == "failed") + return { + "status": "done", + "processed": done, + "failed": failed, + "total_in_state": len(state) + } + + def _gdrive_upload(self, cfg: Dict, **kwargs) -> Dict[str, Any]: + """Upload KG export to Google Drive.""" + client = _get_client() + uploader = KGDriveUploader(client, cfg) + return uploader.upload_export(kwargs.get("filepath", "")) + + def _health(self, cfg: Dict, **kwargs) -> Dict[str, Any]: + """Get entity health scores and tier distribution.""" + client = _get_client() + scorer = HealthScorer(client, cfg) + + action = kwargs.get("action", "score") + if action == "distribution": + return { + "status": "ok", + "distribution": scorer.get_tier_distribution(), + } + if action == "critical": + entities = scorer.get_critical_entities( + limit=kwargs.get("limit", 50) + ) + return {"status": "ok", "critical": entities} + + # Default: score entities + entity_type = kwargs.get("entity_type") + limit = kwargs.get("limit", 1000) + offset = kwargs.get("offset", 0) + result = scorer.score_entities( + entity_type=entity_type, limit=limit, offset=offset + ) + + min_score = kwargs.get("min_score", 0.0) + sort = 
kwargs.get("sort", "desc") + entities = result.get("entities", []) + if min_score > 0: + entities = [e for e in entities if e.get("total", 0) >= min_score] + entities.sort( + key=lambda e: e.get("total", 0), + reverse=(sort == "desc"), + ) + result["entities"] = entities + result["filtered_count"] = len(entities) + return result + + def _resolve_entities(self, cfg: Dict, **kwargs) -> Dict[str, Any]: + """Resolve entity duplicates using string similarity + LLM verification. + + Args: + entity_type: Optional filter by entity type + stage: Pipeline stage (candidates, verify, merge, full) + dry_run: If True, report without executing merges + + Returns: + Dict with resolution results + """ + entity_type = kwargs.get("entity_type") + stage = kwargs.get("stage", "candidates") + dry_run = kwargs.get("dry_run", True) + + client = _get_client() + resolver = EntityResolver(client, cfg) + return resolver.run(entity_type=entity_type, stage=stage, dry_run=dry_run)