diff --git a/lib/ai_bot_classifier.d.ts b/lib/ai_bot_classifier.d.ts
new file mode 100644
index 0000000..f2f1868
--- /dev/null
+++ b/lib/ai_bot_classifier.d.ts
@@ -0,0 +1,25 @@
+/** One known AI bot signature. */
+export interface AiBotEntry {
+  pattern: RegExp;
+  name: string;
+  provider: string;
+  category: "indexing" | "retrieval" | "agent";
+  description?: string;
+  ip_ranges_url?: string;
+}
+
+/** Result of classifying a user-agent; the optional fields are set only on a match. */
+export interface AiBotClassification {
+  $is_ai_bot: boolean;
+  $ai_bot_name?: string;
+  $ai_bot_provider?: string;
+  $ai_bot_category?: "indexing" | "retrieval" | "agent";
+}
+
+export function classify_user_agent(
+  userAgent: string | null | undefined,
+): AiBotClassification;
+export function create_classifier(options?: {
+  additional_bots?: AiBotEntry[];
+}): (userAgent: string | null | undefined) => AiBotClassification;
+export function get_bot_database(): AiBotEntry[];
diff --git a/lib/ai_bot_classifier.js b/lib/ai_bot_classifier.js
new file mode 100644
index 0000000..b94d259
--- /dev/null
+++ b/lib/ai_bot_classifier.js
@@ -0,0 +1,194 @@
+// lib/ai_bot_classifier.js
+
+/**
+ * Built-in catalogue of AI crawler / agent user-agent signatures.
+ * Entries are tested in array order; the first matching pattern wins.
+ */
+const AI_BOT_DATABASE = [
+  // === OpenAI ===
+  {
+    pattern: /GPTBot\//i,
+    name: "GPTBot",
+    provider: "OpenAI",
+    category: "indexing",
+    description: "OpenAI web crawler for model training data",
+    ip_ranges_url: "https://openai.com/gptbot.json",
+  },
+  {
+    pattern: /ChatGPT-User\//i,
+    name: "ChatGPT-User",
+    provider: "OpenAI",
+    category: "retrieval",
+    description: "ChatGPT real-time retrieval for user queries (RAG)",
+    ip_ranges_url: "https://openai.com/chatgpt-user.json",
+  },
+  {
+    pattern: /OAI-SearchBot\//i,
+    name: "OAI-SearchBot",
+    provider: "OpenAI",
+    category: "indexing",
+    description: "OpenAI search indexing crawler",
+    ip_ranges_url: "https://openai.com/searchbot.json",
+  },
+
+  // === Anthropic ===
+  {
+    pattern: /ClaudeBot\//i,
+    name: "ClaudeBot",
+    provider: "Anthropic",
+    category: "indexing",
+    description: "Anthropic web crawler for model training",
+    // ip_ranges_url left unset rather than null so the entry matches the
+    // optional-string typing; Anthropic publishes ranges but URL may vary
+  },
+  {
+    pattern: /Claude-User\//i,
+    name: "Claude-User",
+    provider: "Anthropic",
+    category: "retrieval",
+    description: "Claude real-time retrieval for user queries",
+  },
+
+  // === Google ===
+  // NOTE(review): Google's docs appear to describe Google-Extended as a
+  // robots.txt-only token with no UA string of its own; confirm this can match.
+  {
+    pattern: /Google-Extended\//i,
+    name: "Google-Extended",
+    provider: "Google",
+    category: "indexing",
+    description: "Google AI training data crawler (separate from Googlebot)",
+  },
+
+  // === Perplexity ===
+  {
+    pattern: /PerplexityBot\//i,
+    name: "PerplexityBot",
+    provider: "Perplexity",
+    category: "retrieval",
+    description: "Perplexity AI search crawler",
+  },
+
+  // === ByteDance ===
+  {
+    pattern: /Bytespider\//i,
+    name: "Bytespider",
+    provider: "ByteDance",
+    category: "indexing",
+    description: "ByteDance/TikTok AI crawler",
+  },
+
+  // === Common Crawl ===
+  {
+    pattern: /CCBot\//i,
+    name: "CCBot",
+    provider: "Common Crawl",
+    category: "indexing",
+    description: "Common Crawl bot (data used by many AI models)",
+  },
+
+  // === Apple ===
+  {
+    pattern: /Applebot-Extended\//i,
+    name: "Applebot-Extended",
+    provider: "Apple",
+    category: "indexing",
+    description: "Apple AI/Siri training data crawler",
+  },
+
+  // === Meta ===
+  {
+    pattern: /Meta-ExternalAgent\//i,
+    name: "Meta-ExternalAgent",
+    provider: "Meta",
+    category: "indexing",
+    description: "Meta/Facebook AI training data crawler",
+  },
+
+  // === Cohere ===
+  {
+    pattern: /cohere-ai\//i,
+    name: "cohere-ai",
+    provider: "Cohere",
+    category: "indexing",
+    description: "Cohere AI training data crawler",
+  },
+];
+
+/**
+ * Match a user-agent string against an ordered list of bot entries.
+ * Shared by classify_user_agent and every classifier built by create_classifier.
+ * @param {Array} bots - Bot entries, highest priority first
+ * @param {*} userAgent - Candidate user-agent; anything but a non-empty string is "not a bot"
+ * @returns {Object} Classification result with $is_ai_bot and optional bot details
+ */
+function classify_against(bots, userAgent) {
+  if (!userAgent || typeof userAgent !== "string") {
+    return { $is_ai_bot: false };
+  }
+
+  for (const bot of bots) {
+    // Caller-supplied patterns may carry the g or y flag, which makes
+    // RegExp#test resume from lastIndex; rewind so every call starts at 0.
+    bot.pattern.lastIndex = 0;
+    if (bot.pattern.test(userAgent)) {
+      return {
+        $is_ai_bot: true,
+        $ai_bot_name: bot.name,
+        $ai_bot_provider: bot.provider,
+        $ai_bot_category: bot.category,
+      };
+    }
+  }
+
+  return { $is_ai_bot: false };
+}
+
+/**
+ * Classify a user-agent string against the AI bot database.
+ * @param {string} userAgent - The user-agent string to classify
+ * @returns {Object} Classification result with $is_ai_bot and optional bot details
+ */
+function classify_user_agent(userAgent) {
+  return classify_against(AI_BOT_DATABASE, userAgent);
+}
+
+/**
+ * Create a classifier with optional additional bot patterns.
+ * @param {Object} [options]
+ * @param {Array} [options.additional_bots] - Additional bot patterns to check (checked first)
+ * @returns {Function} A classify_user_agent function
+ */
+function create_classifier(options) {
+  const additional = (options && options.additional_bots) || [];
+  // Snapshot once so later changes to the caller's array cannot alter matching.
+  const combined = [...additional, ...AI_BOT_DATABASE];
+
+  return function (userAgent) {
+    return classify_against(combined, userAgent);
+  };
+}
+
+/**
+ * Return a copy of the built-in bot database for inspection.
+ * Entry objects are fresh, so editing them does not affect classification;
+ * the RegExp instances themselves are shared with the live database.
+ * @returns {Array} Bot entries (description defaults to ""; ip_ranges_url only when set)
+ */
+function get_bot_database() {
+  return AI_BOT_DATABASE.map((bot) => {
+    const entry = {
+      pattern: bot.pattern,
+      name: bot.name,
+      provider: bot.provider,
+      category: bot.category,
+      description: bot.description || "",
+    };
+    if (bot.ip_ranges_url) {
+      entry.ip_ranges_url = bot.ip_ranges_url;
+    }
+    return entry;
+  });
+}
+
+module.exports = { classify_user_agent, create_classifier, get_bot_database };
diff --git a/lib/ai_bot_middleware.d.ts b/lib/ai_bot_middleware.d.ts
new file mode 100644
index 0000000..8e65921
--- /dev/null
+++ b/lib/ai_bot_middleware.d.ts
@@ -0,0 +1,31 @@
+import { IncomingMessage } from "http";
+import { AiBotEntry } from "./ai_bot_classifier";
+
+/** Options accepted by enable_bot_classification. */
+export interface BotClassificationOptions {
+  /** Event property holding the user-agent string (default "$user_agent"). */
+  user_agent_property?: string;
+  /** Prefix for the emitted classification properties (default "$"). */
+  property_prefix?: string;
+  /** Extra bot signatures, checked before the built-in database. */
+  additional_bots?: AiBotEntry[];
+}
+
+/** Runtime switch for an installed classification wrapper. */
+export interface BotClassificationController {
+  enable(): void;
+  disable(): void;
+}
+
+export function enable_bot_classification(
+  mixpanel: any,
+  options?: BotClassificationOptions,
+): BotClassificationController;
+
+export function track_request(
+  mixpanel: any,
+  req: IncomingMessage,
+  eventName: string,
+  properties?: Record<string, unknown>,
+  callback?: (err?: Error) => void,
+): void;
diff --git a/lib/ai_bot_middleware.js b/lib/ai_bot_middleware.js
new file mode 100644
index 0000000..4efb16a
--- /dev/null
+++ b/lib/ai_bot_middleware.js
@@ -0,0 +1,139 @@
+// lib/ai_bot_middleware.js
+
+const {
+  classify_user_agent,
+  create_classifier,
+} = require("./ai_bot_classifier");
+
+/**
+ * Rename classification keys for a custom property prefix.
+ * $is_ai_bot -> {prefix}is_ai_bot; $ai_bot_name -> {prefix}name (same for provider/category).
+ * @param {Object} classification - Result of a classifier call
+ * @param {string} prefix - Replacement for the leading "$" / "$ai_bot_"
+ * @returns {Object} New object holding the renamed keys
+ */
+function apply_prefix(classification, prefix) {
+  const renamed = {};
+  for (const [key, value] of Object.entries(classification)) {
+    const suffix = key.startsWith("$ai_bot_")
+      ? key.substring("$ai_bot_".length)
+      : key.substring(1);
+    renamed[prefix + suffix] = value;
+  }
+  return renamed;
+}
+
+/**
+ * Enable AI bot classification on a Mixpanel client instance.
+ * Wraps send_event_request so events whose properties carry the user-agent property gain classification properties (track(); presumably import() too, TODO confirm).
+ * A repeat call on the same client does not wrap again: it returns the controller from the first call and ignores the new options.
+ *
+ * @param {Object} mixpanel - Mixpanel client from Mixpanel.init()
+ * @param {Object} [options]
+ * @param {string} [options.user_agent_property='$user_agent'] - Property name containing the UA string
+ * @param {string} [options.property_prefix='$'] - Prefix for classification properties
+ * @param {Array} [options.additional_bots] - Additional bot patterns
+ * @returns {Object} Controller with enable()/disable() methods
+ */
+function enable_bot_classification(mixpanel, options) {
+  if (mixpanel._ai_bot_classification_enabled) {
+    // Already wrapped: hand back the existing controller instead of undefined
+    // so callers can always use enable()/disable() on the return value.
+    return mixpanel._ai_bot_classification_controller;
+  }
+  mixpanel._ai_bot_classification_enabled = true;
+
+  const opts = options || {};
+  const uaProp = opts.user_agent_property || "$user_agent";
+  const prefix = opts.property_prefix || "$";
+  const classify = opts.additional_bots
+    ? create_classifier({ additional_bots: opts.additional_bots })
+    : classify_user_agent;
+
+  let enabled = true;
+
+  // Wrap send_event_request, the path track() takes (track_batch does not pass through it)
+  const originalSendEvent = mixpanel.send_event_request;
+  mixpanel.send_event_request = function (
+    endpoint,
+    event,
+    properties,
+    callback,
+  ) {
+    let enrichedProperties = properties;
+    if (enabled && properties && properties[uaProp]) {
+      const classification = classify(properties[uaProp]);
+      // Copy rather than mutate: the caller keeps ownership of `properties`.
+      enrichedProperties = Object.assign(
+        {},
+        properties,
+        prefix === "$" ? classification : apply_prefix(classification, prefix),
+      );
+    }
+    return originalSendEvent.call(
+      mixpanel,
+      endpoint,
+      event,
+      enrichedProperties,
+      callback,
+    );
+  };
+
+  const controller = {
+    enable: function () {
+      enabled = true;
+    },
+    disable: function () {
+      enabled = false;
+    },
+  };
+  // Remembered on the client so repeat calls can return the same controller.
+  mixpanel._ai_bot_classification_controller = controller;
+  return controller;
+}
+
+/**
+ * Extract the first address from an X-Forwarded-For header value.
+ * The value is a comma-separated list; only its first entry is returned.
+ * @param {string|string[]|undefined} header - Raw header value
+ * @returns {string|undefined} First non-empty entry, or undefined when absent
+ */
+function first_forwarded_ip(header) {
+  const raw = Array.isArray(header) ? header[0] : header;
+  if (typeof raw !== "string") {
+    return undefined;
+  }
+  const first = raw.split(",")[0].trim();
+  return first || undefined;
+}
+
+/**
+ * Helper: Track an event with automatic user-agent and IP extraction from an HTTP request.
+ *
+ * NOTE(review): X-Forwarded-For is client-supplied unless a trusted proxy overwrites it; confirm the deployment before relying on the derived ip.
+ *
+ * @param {Object} mixpanel - Mixpanel client
+ * @param {Object} req - Node.js HTTP IncomingMessage (or Express Request)
+ * @param {string} eventName - Event name
+ * @param {Object} [properties] - Additional properties
+ * @param {Function} [callback] - Callback
+ */
+function track_request(mixpanel, req, eventName, properties, callback) {
+  const enrichedProperties = Object.assign({}, properties || {});
+  const headers = req.headers || {};
+  const ua = headers["user-agent"];
+  if (ua) {
+    enrichedProperties.$user_agent = ua;
+  }
+  const ip =
+    req.ip ||
+    first_forwarded_ip(headers["x-forwarded-for"]) ||
+    (req.socket && req.socket.remoteAddress) ||
+    (req.connection && req.connection.remoteAddress);
+  if (ip) {
+    enrichedProperties.ip = ip;
+  }
+  mixpanel.track(eventName, enrichedProperties, callback);
+}
+
+module.exports = { enable_bot_classification, track_request };
diff --git a/lib/mixpanel-node.d.ts b/lib/mixpanel-node.d.ts
index 2982f73..8919e66 100644
--- a/lib/mixpanel-node.d.ts
+++ b/lib/mixpanel-node.d.ts
@@ -406,3 +406,11 @@ declare namespace mixpanel {
 }
 
 export = mixpanel;
+
+import * as AiBotMiddleware from "./ai_bot_middleware";
+import * as AiBotClassifier from "./ai_bot_classifier";
+
+declare module "mixpanel" {
+  export const ai: typeof AiBotMiddleware;
+  export const AiBotClassifier: typeof AiBotClassifier;
+}
diff --git a/lib/mixpanel-node.js b/lib/mixpanel-node.js
index d7cdb85..c60f2ef 100644
--- a/lib/mixpanel-node.js
+++ b/lib/mixpanel-node.js
@@ -535,4 +535,6 @@ const create_client = function (token, config) {
 // module exporting
 module.exports = {
   init: create_client,
+  ai: require("./ai_bot_middleware"),
+  AiBotClassifier: require("./ai_bot_classifier"),
 };
diff --git a/test/ai_bot_classifier.js b/test/ai_bot_classifier.js
new file mode 100644
index 0000000..b2e9d7f
--- /dev/null
+++ b/test/ai_bot_classifier.js
@@ -0,0 +1,268 @@
+// test/ai_bot_classifier.js
+
+// These tests define the expected behavior of the classifier
+// Write these BEFORE implementing lib/ai_bot_classifier.js
+
+describe("AiBotClassifier", () => {
+  let classify;
+
+  beforeEach(() => {
+    const { classify_user_agent } = require("../lib/ai_bot_classifier");
+    classify = classify_user_agent;
+  });
+
+  // === CORE CLASSIFICATION: one positive case per built-in database entry ===
+
+  describe("classify_user_agent", () => {
+    // --- OpenAI Bots ---
+
+    it("should classify GPTBot user agent", () => {
+      const result = classify(
+        "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.2; +https://openai.com/gptbot)",
+      );
+      expect(result.$is_ai_bot).toBe(true);
+      expect(result.$ai_bot_name).toBe("GPTBot");
+      expect(result.$ai_bot_provider).toBe("OpenAI");
+      expect(result.$ai_bot_category).toBe("indexing");
+    });
+
+    it("should classify ChatGPT-User agent", () => {
+      const result = classify(
+        "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; ChatGPT-User/1.0; +https://openai.com/bot)",
+      );
+      expect(result.$is_ai_bot).toBe(true);
+      expect(result.$ai_bot_name).toBe("ChatGPT-User");
+      expect(result.$ai_bot_provider).toBe("OpenAI");
+      expect(result.$ai_bot_category).toBe("retrieval");
+    });
+
+    it("should classify OAI-SearchBot agent", () => {
+      const result = classify(
+        "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; OAI-SearchBot/1.0; +https://openai.com/searchbot)",
+      );
+      expect(result.$is_ai_bot).toBe(true);
+      expect(result.$ai_bot_name).toBe("OAI-SearchBot");
+      expect(result.$ai_bot_provider).toBe("OpenAI");
+      expect(result.$ai_bot_category).toBe("indexing");
+    });
+
+    // --- Anthropic Bots ---
+
+    it("should classify ClaudeBot agent", () => {
+      const result = classify(
+        "Mozilla/5.0 (compatible; ClaudeBot/1.0; +claudebot@anthropic.com)",
+      );
+      expect(result.$is_ai_bot).toBe(true);
+      expect(result.$ai_bot_name).toBe("ClaudeBot");
+      expect(result.$ai_bot_provider).toBe("Anthropic");
+      expect(result.$ai_bot_category).toBe("indexing");
+    });
+
+    it("should classify Claude-User agent", () => {
+      const result = classify("Mozilla/5.0 (compatible; Claude-User/1.0)");
+      expect(result.$is_ai_bot).toBe(true);
+      expect(result.$ai_bot_name).toBe("Claude-User");
+      expect(result.$ai_bot_provider).toBe("Anthropic");
+      expect(result.$ai_bot_category).toBe("retrieval");
+    });
+
+    // --- Google Bots ---
+
+    it("should classify Google-Extended agent", () => {
+      const result = classify("Mozilla/5.0 (compatible; Google-Extended/1.0)");
+      expect(result.$is_ai_bot).toBe(true);
+      expect(result.$ai_bot_name).toBe("Google-Extended");
+      expect(result.$ai_bot_provider).toBe("Google");
+      expect(result.$ai_bot_category).toBe("indexing");
+    });
+
+    // --- Perplexity ---
+
+    it("should classify PerplexityBot agent", () => {
+      const result = classify("Mozilla/5.0 (compatible; PerplexityBot/1.0)");
+      expect(result.$is_ai_bot).toBe(true);
+      expect(result.$ai_bot_name).toBe("PerplexityBot");
+      expect(result.$ai_bot_provider).toBe("Perplexity");
+      expect(result.$ai_bot_category).toBe("retrieval");
+    });
+
+    // --- ByteDance ---
+
+    it("should classify Bytespider agent", () => {
+      const result = classify("Mozilla/5.0 (compatible; Bytespider/1.0)");
+      expect(result.$is_ai_bot).toBe(true);
+      expect(result.$ai_bot_name).toBe("Bytespider");
+      expect(result.$ai_bot_provider).toBe("ByteDance");
+      expect(result.$ai_bot_category).toBe("indexing");
+    });
+
+    // --- Common Crawl ---
+
+    it("should classify CCBot agent", () => {
+      const result = classify("CCBot/2.0 (https://commoncrawl.org/faq/)");
+      expect(result.$is_ai_bot).toBe(true);
+      expect(result.$ai_bot_name).toBe("CCBot");
+      expect(result.$ai_bot_provider).toBe("Common Crawl");
+      expect(result.$ai_bot_category).toBe("indexing");
+    });
+
+    // --- Apple ---
+
+    it("should classify Applebot-Extended agent", () => {
+      const result = classify(
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Applebot-Extended/0.1",
+      );
+      expect(result.$is_ai_bot).toBe(true);
+      expect(result.$ai_bot_name).toBe("Applebot-Extended");
+      expect(result.$ai_bot_provider).toBe("Apple");
+      expect(result.$ai_bot_category).toBe("indexing");
+    });
+
+    // --- Meta ---
+
+    it("should classify Meta-ExternalAgent agent", () => {
+      const result = classify(
+        "Mozilla/5.0 (compatible; Meta-ExternalAgent/1.0)",
+      );
+      expect(result.$is_ai_bot).toBe(true);
+      expect(result.$ai_bot_name).toBe("Meta-ExternalAgent");
+      expect(result.$ai_bot_provider).toBe("Meta");
+      expect(result.$ai_bot_category).toBe("indexing");
+    });
+
+    // --- Cohere ---
+
+    it("should classify cohere-ai agent", () => {
+      const result = classify("cohere-ai/1.0 (https://cohere.com)");
+      expect(result.$is_ai_bot).toBe(true);
+      expect(result.$ai_bot_name).toBe("cohere-ai");
+      expect(result.$ai_bot_provider).toBe("Cohere");
+      expect(result.$ai_bot_category).toBe("indexing");
+    });
+
+    // === NEGATIVE CASES: browsers, classic search crawlers, CLI tools, empty/nullish input ===
+
+    it("should NOT classify regular Chrome browser as AI bot", () => {
+      const result = classify(
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+      );
+      expect(result.$is_ai_bot).toBe(false);
+      expect(result.$ai_bot_name).toBeUndefined();
+    });
+
+    it("should NOT classify Googlebot (regular) as AI bot", () => {
+      const result = classify(
+        "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
+      );
+      expect(result.$is_ai_bot).toBe(false);
+    });
+
+    it("should NOT classify Bingbot (regular) as AI bot", () => {
+      const result = classify(
+        "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)",
+      );
+      expect(result.$is_ai_bot).toBe(false);
+    });
+
+    it("should NOT classify curl as AI bot", () => {
+      const result = classify("curl/7.64.1");
+      expect(result.$is_ai_bot).toBe(false);
+    });
+
+    it("should handle empty user agent", () => {
+      const result = classify("");
+      expect(result.$is_ai_bot).toBe(false);
+    });
+
+    it("should handle undefined user agent", () => {
+      const result = classify(undefined);
+      expect(result.$is_ai_bot).toBe(false);
+    });
+
+    it("should handle null user agent", () => {
+      const result = classify(null);
+      expect(result.$is_ai_bot).toBe(false);
+    });
+
+    // === CASE SENSITIVITY: a lower-cased token must still map to the canonical name ===
+
+    it("should match case-insensitively", () => {
+      const result = classify("Mozilla/5.0 (compatible; gptbot/1.2)");
+      expect(result.$is_ai_bot).toBe(true);
+      expect(result.$ai_bot_name).toBe("GPTBot");
+    });
+
+    // === RETURN SHAPE: exact key set for a match and for a non-match ===
+
+    it("should return all expected fields for a match", () => {
+      const result = classify("GPTBot/1.2");
+      expect(result).toHaveProperty("$is_ai_bot", true);
+      expect(result).toHaveProperty("$ai_bot_name");
+      expect(result).toHaveProperty("$ai_bot_provider");
+      expect(result).toHaveProperty("$ai_bot_category");
+      expect(typeof result.$ai_bot_name).toBe("string");
+      expect(typeof result.$ai_bot_provider).toBe("string");
+      expect(["indexing", "retrieval", "agent"]).toContain(
+        result.$ai_bot_category,
+      );
+    });
+
+    it("should return only $is_ai_bot:false for non-matches", () => {
+      const result = classify("Mozilla/5.0 Chrome/120");
+      expect(Object.keys(result)).toEqual(["$is_ai_bot"]);
+      expect(result.$is_ai_bot).toBe(false);
+    });
+  });
+
+  // === BOT DATABASE: only the first entry's shape is checked ===
+
+  describe("get_bot_database", () => {
+    it("should expose the bot database for inspection", () => {
+      const { get_bot_database } = require("../lib/ai_bot_classifier");
+      const db = get_bot_database();
+      expect(Array.isArray(db)).toBe(true);
+      expect(db.length).toBeGreaterThan(0);
+      expect(db[0]).toHaveProperty("pattern");
+      expect(db[0]).toHaveProperty("name");
+      expect(db[0]).toHaveProperty("provider");
+      expect(db[0]).toHaveProperty("category");
+    });
+  });
+
+  // === CUSTOM BOTS: extra patterns are honoured and take priority over built-ins ===
+
+  describe("custom bot registration", () => {
+    it("should allow adding custom bot patterns", () => {
+      const { create_classifier } = require("../lib/ai_bot_classifier");
+      const classifier = create_classifier({
+        additional_bots: [
+          {
+            pattern: /MyCustomBot\//i,
+            name: "MyCustomBot",
+            provider: "CustomCorp",
+            category: "indexing",
+          },
+        ],
+      });
+      const result = classifier("Mozilla/5.0 (compatible; MyCustomBot/1.0)");
+      expect(result.$is_ai_bot).toBe(true);
+      expect(result.$ai_bot_name).toBe("MyCustomBot");
+    });
+
+    it("should check custom bots before built-in bots", () => {
+      const { create_classifier } = require("../lib/ai_bot_classifier");
+      const classifier = create_classifier({
+        additional_bots: [
+          {
+            pattern: /GPTBot\//i,
+            name: "GPTBot-Custom",
+            provider: "CustomProvider",
+            category: "retrieval",
+          },
+        ],
+      });
+      const result = classifier("GPTBot/1.2");
+      expect(result.$ai_bot_name).toBe("GPTBot-Custom");
+    });
+  });
+});
diff --git a/test/ai_bot_middleware.js b/test/ai_bot_middleware.js
new file mode 100644
index 0000000..a3a4173
--- /dev/null
+++ b/test/ai_bot_middleware.js
@@ -0,0 +1,269 @@
+// test/ai_bot_middleware.js
+
+describe("AI Bot Middleware Integration", () => {
+  let Mixpanel, mixpanel;
+
+  beforeEach(() => {
+    Mixpanel = require("../lib/mixpanel-node");
+  });
+
+  describe("enable_bot_classification", () => {
+    it("should enrich track() calls with bot classification when $user_agent is present", () => {
+      const { enable_bot_classification } = require("../lib/ai_bot_middleware");
+      mixpanel = Mixpanel.init("test-token");
+      enable_bot_classification(mixpanel);
+
+      vi.spyOn(mixpanel, "send_request");
+
+      mixpanel.track("page_view", {
+        distinct_id: "user123",
+        $user_agent:
+          "Mozilla/5.0 (compatible; GPTBot/1.2; +https://openai.com/gptbot)",
+      });
+
+      expect(mixpanel.send_request).toHaveBeenCalledWith(
+        expect.objectContaining({
+          data: expect.objectContaining({
+            properties: expect.objectContaining({
+              $is_ai_bot: true,
+              $ai_bot_name: "GPTBot",
+              $ai_bot_provider: "OpenAI",
+              $ai_bot_category: "indexing",
+            }),
+          }),
+        }),
+        undefined,
+      );
+    });
+
+    it("should NOT add bot properties when $user_agent is not present", () => {
+      const { enable_bot_classification } = require("../lib/ai_bot_middleware");
+      mixpanel = Mixpanel.init("test-token");
+      enable_bot_classification(mixpanel);
+
+      vi.spyOn(mixpanel, "send_request");
+
+      mixpanel.track("page_view", { distinct_id: "user123" });
+
+      const callData = mixpanel.send_request.mock.calls[0][0].data;
+      expect(callData.properties.$is_ai_bot).toBeUndefined();
+    });
+
+    it("should set $is_ai_bot:false when $user_agent is present but not an AI bot", () => {
+      const { enable_bot_classification } = require("../lib/ai_bot_middleware");
+      mixpanel = Mixpanel.init("test-token");
+      enable_bot_classification(mixpanel);
+
+      vi.spyOn(mixpanel, "send_request");
+
+      mixpanel.track("page_view", {
+        distinct_id: "user123",
+        $user_agent: "Mozilla/5.0 Chrome/120.0.0.0",
+      });
+
+      const callData = mixpanel.send_request.mock.calls[0][0].data;
+      expect(callData.properties.$is_ai_bot).toBe(false);
+      expect(callData.properties.$ai_bot_name).toBeUndefined();
+    });
+
+    it("should preserve existing properties alongside bot classification", () => {
+      const { enable_bot_classification } = require("../lib/ai_bot_middleware");
+      mixpanel = Mixpanel.init("test-token");
+      enable_bot_classification(mixpanel);
+
+      vi.spyOn(mixpanel, "send_request");
+
+      mixpanel.track("page_view", {
+        distinct_id: "user123",
+        $user_agent: "GPTBot/1.2",
+        page_url: "/products",
+        custom_prop: "value",
+      });
+
+      const props = mixpanel.send_request.mock.calls[0][0].data.properties;
+      expect(props.page_url).toBe("/products");
+      expect(props.custom_prop).toBe("value");
+      expect(props.$is_ai_bot).toBe(true);
+    });
+
+    it("should preserve callback functionality", () => {
+      const { enable_bot_classification } = require("../lib/ai_bot_middleware");
+      mixpanel = Mixpanel.init("test-token");
+      enable_bot_classification(mixpanel);
+
+      vi.spyOn(mixpanel, "send_request");
+
+      const callback = vi.fn();
+      mixpanel.track("page_view", { $user_agent: "GPTBot/1.2" }, callback);
+
+      expect(mixpanel.send_request).toHaveBeenCalledWith(
+        expect.anything(),
+        callback,
+      );
+    });
+
+    it("should support callback as second argument (no properties)", () => {
+      const { enable_bot_classification } = require("../lib/ai_bot_middleware");
+      mixpanel = Mixpanel.init("test-token");
+      enable_bot_classification(mixpanel);
+
+      vi.spyOn(mixpanel, "send_request");
+
+      const callback = vi.fn();
+      mixpanel.track("page_view", callback);
+
+      // Callback passed as the 2nd argument, so there is no properties object.
+      // NOTE: this only asserts that a request is still sent; the payload is not inspected.
+      expect(mixpanel.send_request).toHaveBeenCalled();
+    });
+
+    it("should NOT enrich track_batch events (known limitation — track_batch bypasses send_event_request)", () => {
+      const { enable_bot_classification } = require("../lib/ai_bot_middleware");
+      mixpanel = Mixpanel.init("test-token");
+      enable_bot_classification(mixpanel);
+
+      vi.spyOn(mixpanel, "send_request");
+
+      mixpanel.track_batch([
+        {
+          event: "page_view",
+          properties: { $user_agent: "GPTBot/1.2", distinct_id: "bot1" },
+        },
+        {
+          event: "page_view",
+          properties: { $user_agent: "Chrome/120", distinct_id: "user1" },
+        },
+      ]);
+
+      // track_batch goes through send_batch_requests -> send_request, NOT send_event_request,
+      // so bot classification is not applied; this pins the limitation rather than endorsing it
+      const call = mixpanel.send_request.mock.calls[0][0];
+      expect(call.data[0].properties.$is_ai_bot).toBeUndefined();
+      expect(call.data[1].properties.$is_ai_bot).toBeUndefined();
+    });
+
+    it("should not modify the original properties object", () => {
+      const { enable_bot_classification } = require("../lib/ai_bot_middleware");
+      mixpanel = Mixpanel.init("test-token");
+      enable_bot_classification(mixpanel);
+
+      vi.spyOn(mixpanel, "send_request");
+
+      const props = { distinct_id: "user123", $user_agent: "GPTBot/1.2" };
+      const originalKeys = Object.keys(props);
+      mixpanel.track("page_view", props);
+
+      // The caller's object must keep exactly the key set it had before track() (no mutation)
+      expect(Object.keys(props).sort()).toEqual(originalKeys.sort());
+    });
+  });
+
+  describe("configuration options", () => {
+    it("should accept custom user_agent_property name", () => {
+      const { enable_bot_classification } = require("../lib/ai_bot_middleware");
+      mixpanel = Mixpanel.init("test-token");
+      enable_bot_classification(mixpanel, {
+        user_agent_property: "ua_string",
+      });
+
+      vi.spyOn(mixpanel, "send_request");
+
+      mixpanel.track("page_view", {
+        distinct_id: "user123",
+        ua_string: "GPTBot/1.2",
+      });
+
+      const props = mixpanel.send_request.mock.calls[0][0].data.properties;
+      expect(props.$is_ai_bot).toBe(true);
+    });
+
+    it("should accept custom property prefix", () => {
+      const { enable_bot_classification } = require("../lib/ai_bot_middleware");
+      mixpanel = Mixpanel.init("test-token");
+      enable_bot_classification(mixpanel, {
+        property_prefix: "bot_",
+      });
+
+      vi.spyOn(mixpanel, "send_request");
+
+      mixpanel.track("page_view", {
+        $user_agent: "GPTBot/1.2",
+      });
+
+      const props = mixpanel.send_request.mock.calls[0][0].data.properties;
+      expect(props.bot_is_ai_bot).toBe(true);
+      expect(props.bot_name).toBe("GPTBot");
+    });
+
+    it("should allow disabling classification without removing middleware", () => {
+      const { enable_bot_classification } = require("../lib/ai_bot_middleware");
+      mixpanel = Mixpanel.init("test-token");
+      const controller = enable_bot_classification(mixpanel);
+
+      vi.spyOn(mixpanel, "send_request");
+
+      controller.disable();
+      mixpanel.track("page_view", { $user_agent: "GPTBot/1.2" });
+
+      const props = mixpanel.send_request.mock.calls[0][0].data.properties;
+      expect(props.$is_ai_bot).toBeUndefined();
+
+      controller.enable();
+      mixpanel.track("page_view", { $user_agent: "GPTBot/1.2" });
+
+      const props2 = mixpanel.send_request.mock.calls[1][0].data.properties;
+      expect(props2.$is_ai_bot).toBe(true);
+    });
+  });
+
+  describe("helper: track_request", () => {
+    it("should provide a helper that extracts user-agent from HTTP request", () => {
+      const {
+        enable_bot_classification,
+        track_request,
+      } = require("../lib/ai_bot_middleware");
+      mixpanel = Mixpanel.init("test-token");
+      enable_bot_classification(mixpanel);
+
+      vi.spyOn(mixpanel, "send_request");
+
+      // Simulate an Express/Node.js request object (req.ip and x-forwarded-for carry the same address)
+      const mockReq = {
+        headers: {
+          "user-agent": "GPTBot/1.2",
+          "x-forwarded-for": "1.2.3.4",
+        },
+        ip: "1.2.3.4",
+        url: "/api/products",
+      };
+
+      track_request(mixpanel, mockReq, "page_view", {
+        distinct_id: "user123",
+        page_url: "/api/products",
+      });
+
+      const props = mixpanel.send_request.mock.calls[0][0].data.properties;
+      expect(props.$user_agent).toBe("GPTBot/1.2");
+      expect(props.$is_ai_bot).toBe(true);
+      expect(props.ip).toBe("1.2.3.4");
+      expect(props.page_url).toBe("/api/products");
+    });
+
+    it("should handle request with no user-agent header", () => {
+      const {
+        enable_bot_classification,
+        track_request,
+      } = require("../lib/ai_bot_middleware");
+      mixpanel = Mixpanel.init("test-token");
+      enable_bot_classification(mixpanel);
+
+      vi.spyOn(mixpanel, "send_request");
+
+      const mockReq = { headers: {}, ip: "1.2.3.4" };
+      track_request(mixpanel, mockReq, "page_view", { distinct_id: "user123" });
+
+      const props = mixpanel.send_request.mock.calls[0][0].data.properties;
+      expect(props.$is_ai_bot).toBeUndefined();
+    });
+  });
+});