Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions lib/ai_bot_classifier.d.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
/**
 * One record describing a known AI bot: the pattern used to recognise it and
 * the labels reported when it matches.
 */
export interface AiBotEntry {
/** Regular expression tested against the user-agent string. */
pattern: RegExp;
/** Bot name, reported as `$ai_bot_name` on a match. */
name: string;
/** Operating organisation, reported as `$ai_bot_provider` on a match. */
provider: string;
/** Kind of bot activity, reported as `$ai_bot_category` on a match. */
category: "indexing" | "retrieval" | "agent";
/** Optional human-readable summary of the bot. */
description?: string;
/** Optional URL associated with the bot's IP ranges. */
ip_ranges_url?: string;
}

/**
 * Result of classifying a user-agent string. The three `$ai_bot_*` fields are
 * only populated when `$is_ai_bot` is `true`.
 */
export interface AiBotClassification {
/** `true` when a bot pattern matched the user agent, otherwise `false`. */
$is_ai_bot: boolean;
/** `name` of the matching bot entry. */
$ai_bot_name?: string;
/** `provider` of the matching bot entry. */
$ai_bot_provider?: string;
/** `category` of the matching bot entry. */
$ai_bot_category?: "indexing" | "retrieval" | "agent";
}

/**
 * Classify a user-agent string against the built-in AI bot database.
 *
 * @param userAgent - User-agent string; empty, `null`, `undefined` or
 *   non-string values are reported as not a bot.
 * @returns The classification; only `$is_ai_bot: false` is set when nothing matches.
 */
export function classify_user_agent(
userAgent: string | null | undefined,
): AiBotClassification;
/**
 * Build a classifier that checks caller-supplied bot entries before the
 * built-in database.
 *
 * The implementation tolerates a missing `options` argument and treats an
 * empty, `null`, `undefined` or non-string user agent as "not a bot", so both
 * are accepted here, mirroring the signature of `classify_user_agent`.
 *
 * @param options - Optional settings.
 * @param options.additional_bots - Extra entries, tested before the built-in ones.
 * @returns A function that classifies a user-agent string.
 */
export function create_classifier(options?: {
  additional_bots?: AiBotEntry[];
}): (userAgent: string | null | undefined) => AiBotClassification;
/**
 * Return a fresh array of copies of the built-in bot entries.
 * `description` is always present on the copies (empty string when unset).
 */
export function get_bot_database(): AiBotEntry[];
175 changes: 175 additions & 0 deletions lib/ai_bot_classifier.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
// lib/ai_bot_classifier.js

const AI_BOT_DATABASE = [
// === OpenAI ===
{
pattern: /GPTBot\//i,
name: "GPTBot",
provider: "OpenAI",
category: "indexing",
description: "OpenAI web crawler for model training data",
ip_ranges_url: "https://openai.com/gptbot.json",
},
{
pattern: /ChatGPT-User\//i,
name: "ChatGPT-User",
provider: "OpenAI",
category: "retrieval",
description: "ChatGPT real-time retrieval for user queries (RAG)",
ip_ranges_url: "https://openai.com/chatgpt-user.json",
},
{
pattern: /OAI-SearchBot\//i,
name: "OAI-SearchBot",
provider: "OpenAI",
category: "indexing",
description: "OpenAI search indexing crawler",
ip_ranges_url: "https://openai.com/searchbot.json",
},

// === Anthropic ===
{
pattern: /ClaudeBot\//i,
name: "ClaudeBot",
provider: "Anthropic",
category: "indexing",
description: "Anthropic web crawler for model training",
ip_ranges_url: null, // Anthropic publishes ranges but URL may vary
},
{
pattern: /Claude-User\//i,
name: "Claude-User",
provider: "Anthropic",
category: "retrieval",
description: "Claude real-time retrieval for user queries",
},

// === Google ===
{
pattern: /Google-Extended\//i,
name: "Google-Extended",
provider: "Google",
category: "indexing",
description: "Google AI training data crawler (separate from Googlebot)",
},

// === Perplexity ===
{
pattern: /PerplexityBot\//i,
name: "PerplexityBot",
provider: "Perplexity",
category: "retrieval",
description: "Perplexity AI search crawler",
},

// === ByteDance ===
{
pattern: /Bytespider\//i,
name: "Bytespider",
provider: "ByteDance",
category: "indexing",
description: "ByteDance/TikTok AI crawler",
},

// === Common Crawl ===
{
pattern: /CCBot\//i,
name: "CCBot",
provider: "Common Crawl",
category: "indexing",
description: "Common Crawl bot (data used by many AI models)",
},

// === Apple ===
{
pattern: /Applebot-Extended\//i,
name: "Applebot-Extended",
provider: "Apple",
category: "indexing",
description: "Apple AI/Siri training data crawler",
},

// === Meta ===
{
pattern: /Meta-ExternalAgent\//i,
name: "Meta-ExternalAgent",
provider: "Meta",
category: "indexing",
description: "Meta/Facebook AI training data crawler",
},

// === Cohere ===
{
pattern: /cohere-ai\//i,
name: "cohere-ai",
provider: "Cohere",
category: "indexing",
description: "Cohere AI training data crawler",
},
];

/**
 * Look up a user-agent string in the built-in AI bot database.
 *
 * Entries are tried in database order and the first matching pattern wins.
 *
 * @param {string} userAgent - Raw user-agent value to inspect
 * @returns {Object} `{ $is_ai_bot: false }` when the input is empty, not a
 *   string, or matches nothing; otherwise `$is_ai_bot: true` together with the
 *   `$ai_bot_name`, `$ai_bot_provider` and `$ai_bot_category` of the match
 */
function classify_user_agent(userAgent) {
  const hasUsableInput = typeof userAgent === "string" && userAgent.length > 0;
  const matched = hasUsableInput
    ? AI_BOT_DATABASE.find((entry) => entry.pattern.test(userAgent))
    : undefined;

  if (!matched) {
    return { $is_ai_bot: false };
  }

  const { name, provider, category } = matched;
  return {
    $is_ai_bot: true,
    $ai_bot_name: name,
    $ai_bot_provider: provider,
    $ai_bot_category: category,
  };
}

/**
 * Create a classifier with optional additional bot patterns.
 *
 * The additional entries are placed in front of the built-in database, so
 * they are tested first and win over a built-in entry that would also match.
 *
 * @param {Object} [options]
 * @param {Array} [options.additional_bots] - Additional bot patterns to check (checked first)
 * @returns {Function} A function with the same contract as classify_user_agent:
 *   returns `{ $is_ai_bot: false }` for empty, non-string or unmatched input,
 *   otherwise the name/provider/category of the first matching entry
 */
function create_classifier(options) {
  const additional = (options && options.additional_bots) || [];
  const combined = [...additional, ...AI_BOT_DATABASE];

  return function (userAgent) {
    if (!userAgent || typeof userAgent !== "string") {
      return { $is_ai_bot: false };
    }

    for (const bot of combined) {
      // A caller-supplied pattern may carry the `g` or `y` flag, in which case
      // RegExp#test resumes from `lastIndex` left by the previous call and the
      // same user agent would alternately match and not match. Always start
      // from the beginning of the string.
      bot.pattern.lastIndex = 0;
      if (bot.pattern.test(userAgent)) {
        return {
          $is_ai_bot: true,
          $ai_bot_name: bot.name,
          $ai_bot_provider: bot.provider,
          $ai_bot_category: bot.category,
        };
      }
    }

    return { $is_ai_bot: false };
  };
}

/**
 * Return a new array holding a shallow copy of every built-in bot entry.
 *
 * Only `pattern`, `name`, `provider`, `category` and `description` are copied;
 * a missing (or otherwise falsy) description becomes an empty string.
 *
 * @returns {Array} Copies of the built-in entries, in database order
 */
function get_bot_database() {
  const snapshot = [];
  for (const entry of AI_BOT_DATABASE) {
    const { pattern, name, provider, category, description } = entry;
    snapshot.push({
      pattern,
      name,
      provider,
      category,
      description: description || "",
    });
  }
  return snapshot;
}

module.exports = { classify_user_agent, create_classifier, get_bot_database };
30 changes: 30 additions & 0 deletions lib/ai_bot_middleware.d.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import { IncomingMessage } from "http";

/** Options accepted by `enable_bot_classification`. */
export interface BotClassificationOptions {
/** Name of the event property holding the user-agent string; defaults to `$user_agent`. */
user_agent_property?: string;
/** Prefix used for the added classification properties; defaults to `$`. */
property_prefix?: string;
/** Extra bot entries, tested before the built-in database. */
additional_bots?: Array<{
/** Regular expression tested against the user-agent string. */
pattern: RegExp;
/** Bot name reported on a match. */
name: string;
/** Provider reported on a match. */
provider: string;
/** Category reported on a match. */
category: "indexing" | "retrieval" | "agent";
}>;
}

/**
 * Switch returned by `enable_bot_classification` for turning classification
 * on and off without removing the wrapper from the client.
 */
export interface BotClassificationController {
/** Resume adding classification properties to events. */
enable(): void;
/** Stop adding classification properties; events are forwarded with their properties unchanged. */
disable(): void;
}

/**
 * Wrap `send_event_request` on a Mixpanel client so that events carrying a
 * user-agent property are enriched with AI bot classification properties.
 *
 * @param mixpanel - Mixpanel client instance to wrap.
 * @param options - Optional property names, prefix and extra bot entries.
 * @returns A controller for toggling classification on and off.
 */
export function enable_bot_classification(
mixpanel: any,
options?: BotClassificationOptions,
): BotClassificationController;

/**
 * Track an event, adding `$user_agent` and `ip` properties taken from an
 * HTTP request when they are available.
 *
 * @param mixpanel - Mixpanel client whose `track` method is called.
 * @param req - Incoming HTTP request to read the user agent and address from.
 * @param eventName - Name of the event to track.
 * @param properties - Extra event properties; copied, not mutated.
 * @param callback - Passed through to `mixpanel.track`.
 */
export function track_request(
mixpanel: any,
req: IncomingMessage,
eventName: string,
properties?: Record<string, any>,
callback?: (err?: Error) => void,
): void;
103 changes: 103 additions & 0 deletions lib/ai_bot_middleware.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
// lib/ai_bot_middleware.js

const {
classify_user_agent,
create_classifier,
} = require("./ai_bot_classifier");

/**
 * Enable AI bot classification on a Mixpanel client instance.
 * Wraps send_event_request to auto-classify track() and import() calls when $user_agent property is present.
 *
 * The wrapper never mutates the caller's properties object: classification
 * results are merged into a copy, which is what gets forwarded to the original
 * send_event_request.
 *
 * Calling this more than once on the same client does not wrap it again; the
 * controller created by the first call is returned and the `options` of the
 * later call are ignored.
 *
 * @param {Object} mixpanel - Mixpanel client from Mixpanel.init()
 * @param {Object} [options]
 * @param {string} [options.user_agent_property='$user_agent'] - Property name containing the UA string
 * @param {string} [options.property_prefix='$'] - Prefix for classification properties
 * @param {Array} [options.additional_bots] - Additional bot patterns
 * @returns {Object} Controller with enable()/disable() methods
 */
function enable_bot_classification(mixpanel, options) {
  if (mixpanel._ai_bot_classification_enabled) {
    // Already wrapped: hand back the existing controller instead of
    // `undefined`, so callers can always use the return value.
    return mixpanel._ai_bot_classification_controller;
  }
  mixpanel._ai_bot_classification_enabled = true;

  const opts = options || {};
  const uaProp = opts.user_agent_property || "$user_agent";
  const prefix = opts.property_prefix || "$";
  const classify = opts.additional_bots
    ? create_classifier({ additional_bots: opts.additional_bots })
    : classify_user_agent;

  let enabled = true;

  // Wrap send_event_request — the single chokepoint for all event data
  const originalSendEvent = mixpanel.send_event_request;
  mixpanel.send_event_request = function (
    endpoint,
    event,
    properties,
    callback,
  ) {
    let enrichedProperties = properties;
    if (enabled && properties && properties[uaProp]) {
      const classification = classify(properties[uaProp]);
      // Map classification properties with the configured prefix
      if (prefix === "$") {
        // Default prefix: classification keys are already in their final form.
        enrichedProperties = Object.assign({}, properties, classification);
      } else {
        enrichedProperties = Object.assign({}, properties);
        for (const [key, value] of Object.entries(classification)) {
          // $is_ai_bot -> {prefix}is_ai_bot; $ai_bot_name -> {prefix}name
          const newKey = key.startsWith("$ai_bot_")
            ? prefix + key.substring("$ai_bot_".length)
            : prefix + key.substring(1);
          enrichedProperties[newKey] = value;
        }
      }
    }
    // Pass the original's return value through so the wrapper stays transparent.
    return originalSendEvent.call(
      mixpanel,
      endpoint,
      event,
      enrichedProperties,
      callback,
    );
  };

  const controller = {
    enable: function () {
      enabled = true;
    },
    disable: function () {
      enabled = false;
    },
  };
  // Remember the controller so a repeat call can return it.
  mixpanel._ai_bot_classification_controller = controller;
  return controller;
}

/**
* Helper: Track an event with automatic user-agent and IP extraction from an HTTP request.
*
* @param {Object} mixpanel - Mixpanel client
* @param {Object} req - Node.js HTTP IncomingMessage (or Express Request)
* @param {string} eventName - Event name
* @param {Object} [properties] - Additional properties
* @param {Function} [callback] - Callback
*/
function track_request(mixpanel, req, eventName, properties, callback) {
const enrichedProperties = Object.assign({}, properties || {});
const ua = req.headers && req.headers["user-agent"];
if (ua) {
enrichedProperties.$user_agent = ua;
}
const ip =
req.ip ||
(req.headers && req.headers["x-forwarded-for"]) ||
(req.connection && req.connection.remoteAddress);
if (ip) {
enrichedProperties.ip = ip;
}
mixpanel.track(eventName, enrichedProperties, callback);
}

module.exports = { enable_bot_classification, track_request };
8 changes: 8 additions & 0 deletions lib/mixpanel-node.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -406,3 +406,11 @@ declare namespace mixpanel {
}

export = mixpanel;

import * as AiBotMiddleware from "./ai_bot_middleware";
// The alias must not be named `AiBotClassifier`: inside the augmentation below
// that name would resolve to the `AiBotClassifier` export being declared, which
// makes `typeof AiBotClassifier` refer to itself instead of to this module.
import * as AiBotClassifierModule from "./ai_bot_classifier";

declare module "mixpanel" {
  /** AI bot middleware helpers (`enable_bot_classification`, `track_request`). */
  export const ai: typeof AiBotMiddleware;
  /** AI bot classifier functions (`classify_user_agent`, `create_classifier`, `get_bot_database`). */
  export const AiBotClassifier: typeof AiBotClassifierModule;
}
2 changes: 2 additions & 0 deletions lib/mixpanel-node.js
Original file line number Diff line number Diff line change
Expand Up @@ -535,4 +535,6 @@ const create_client = function (token, config) {
// module exporting
// Package entry points: the client factory plus the two AI bot helper modules.
module.exports = {
init: create_client,
// Exports of ./ai_bot_middleware (enable_bot_classification, track_request).
ai: require("./ai_bot_middleware"),
// Exports of ./ai_bot_classifier (classify_user_agent, create_classifier, get_bot_database).
AiBotClassifier: require("./ai_bot_classifier"),
};
Loading