From f8ba915b26568ef3d96caba3142174a4e667afb6 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Thu, 29 Jan 2026 17:25:45 +0000 Subject: [PATCH 1/7] Filter bot traffic from Sentry spans using tracesSampler Add a tracesSampler function to drop spans from HeadlessChrome, bots, crawlers, and other automated traffic. This prevents bot-induced span throughput anomalies while maintaining 100% sampling for real users. Fixes DOCS-A4C Co-Authored-By: Claude --- src/instrumentation-client.ts | 40 +++++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/src/instrumentation-client.ts b/src/instrumentation-client.ts index f9336e7a14d76..5e6cea1733818 100644 --- a/src/instrumentation-client.ts +++ b/src/instrumentation-client.ts @@ -4,8 +4,44 @@ import * as Spotlight from '@spotlightjs/spotlight'; Sentry.init({ dsn: process.env.NEXT_PUBLIC_SENTRY_DSN, - // Adjust this value in production, or use tracesSampler for greater control - tracesSampleRate: 1, + // Use tracesSampler to filter out bot/crawler traffic + tracesSampler: _samplingContext => { + // Check if running in browser environment + if (typeof navigator === 'undefined' || !navigator.userAgent) { + return 1; // Default to sampling if userAgent not available + } + + const userAgent = navigator.userAgent.toLowerCase(); + + // Patterns to identify bots, crawlers, and headless browsers + const botPatterns = [ + 'headlesschrome', + 'headless', + 'bot', + 'crawler', + 'spider', + 'scraper', + 'googlebot', + 'bingbot', + 'yandexbot', + 'slackbot', + 'facebookexternalhit', + 'twitterbot', + 'linkedinbot', + 'whatsapp', + 'telegrambot', + 'phantomjs', + 'selenium', + 'puppeteer', + 'playwright', + ]; + + // Check if userAgent matches any bot pattern + const isBot = botPatterns.some(pattern => userAgent.includes(pattern)); + + // Drop spans for bots (return 0), keep for real users (return 1) + return isBot ? 
0 : 1; + }, // Enable logs to be sent to Sentry enableLogs: true, From a5d0f0dd463e410bb85647bf22dc446081af44c7 Mon Sep 17 00:00:00 2001 From: paulj Date: Thu, 29 Jan 2026 14:30:53 -0500 Subject: [PATCH 2/7] Refactor bot detection to use regex for better performance - Hoist bot patterns to module scope to avoid recreation on each trace - Use single regex test instead of array iteration with includes() - Add monitoring tool patterns: lighthouse, pagespeed, gtmetrix, pingdom, uptimerobot Co-Authored-By: Claude Co-authored-by: Cursor --- src/instrumentation-client.ts | 35 +++++++---------------------------- 1 file changed, 7 insertions(+), 28 deletions(-) diff --git a/src/instrumentation-client.ts b/src/instrumentation-client.ts index 5e6cea1733818..6f97ff769e80b 100644 --- a/src/instrumentation-client.ts +++ b/src/instrumentation-client.ts @@ -1,6 +1,11 @@ import * as Sentry from '@sentry/nextjs'; import * as Spotlight from '@spotlightjs/spotlight'; +// Regex to identify bots, crawlers, and headless browsers +// Note: 'bot' catches googlebot, slackbot, twitterbot, etc +const BOT_PATTERN = + /bot|crawler|spider|scraper|headless|facebookexternalhit|whatsapp|phantomjs|selenium|puppeteer|playwright|lighthouse|pagespeed|gtmetrix|pingdom|uptimerobot/i; + Sentry.init({ dsn: process.env.NEXT_PUBLIC_SENTRY_DSN, @@ -11,35 +16,9 @@ Sentry.init({ return 1; // Default to sampling if userAgent not available } - const userAgent = navigator.userAgent.toLowerCase(); - - // Patterns to identify bots, crawlers, and headless browsers - const botPatterns = [ - 'headlesschrome', - 'headless', - 'bot', - 'crawler', - 'spider', - 'scraper', - 'googlebot', - 'bingbot', - 'yandexbot', - 'slackbot', - 'facebookexternalhit', - 'twitterbot', - 'linkedinbot', - 'whatsapp', - 'telegrambot', - 'phantomjs', - 'selenium', - 'puppeteer', - 'playwright', - ]; - - // Check if userAgent matches any bot pattern - const isBot = botPatterns.some(pattern => userAgent.includes(pattern)); + const isBot = BOT_PATTERN.test(navigator.userAgent); - // Drop spans for bots (return 0), keep for real users (return 1) + // Drop traces for bots (return 0), keep for real users (return 1) return isBot ? 0 : 1; }, From 4d71b469e2263cd77af277cc7e9861cb23f22a36 Mon Sep 17 00:00:00 2001 From: paulj Date: Thu, 29 Jan 2026 14:56:54 -0500 Subject: [PATCH 3/7] Allow AI agents in tracesSampler for docs consumption visibility Add allowlist for AI agents (ClaudeBot, GPTBot, Cursor, Codex, Copilot, etc.) to ensure we have tracing data for agentic tools consuming our markdown docs. These are checked before the bot filter so they won't be dropped by the generic 'bot' pattern. 
Co-Authored-By: Claude 
Co-authored-by: Cursor 
---
 src/instrumentation-client.ts | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/src/instrumentation-client.ts b/src/instrumentation-client.ts
index 6f97ff769e80b..61af02925a287 100644
--- a/src/instrumentation-client.ts
+++ b/src/instrumentation-client.ts
@@ -1,22 +1,36 @@
 import * as Sentry from '@sentry/nextjs';
 import * as Spotlight from '@spotlightjs/spotlight';
 
-// Regex to identify bots, crawlers, and headless browsers
-// Note: 'bot' catches googlebot, slackbot, twitterbot, etc
+// AI agents we want to track for docs/markdown consumption visibility
+// These fetch markdown content and we need performance data on serving it to agentic tools
+const AI_AGENT_PATTERN =
+  /claudebot|claude-web|anthropic|gptbot|chatgpt|openai|cursor|codex|copilot|perplexity|cohere|gemini/i;
+
+// Bots/crawlers to filter out (SEO crawlers, social media, testing tools, monitors)
+// Note: 'bot' is broad but AI agents are allowlisted above
 const BOT_PATTERN =
   /bot|crawler|spider|scraper|headless|facebookexternalhit|whatsapp|phantomjs|selenium|puppeteer|playwright|lighthouse|pagespeed|gtmetrix|pingdom|uptimerobot/i;
 
 Sentry.init({
   dsn: process.env.NEXT_PUBLIC_SENTRY_DSN,
 
-  // Use tracesSampler to filter out bot/crawler traffic
+  // Use tracesSampler to filter out bot/crawler traffic while keeping AI agents
   tracesSampler: _samplingContext => {
     // Check if running in browser environment
     if (typeof navigator === 'undefined' || !navigator.userAgent) {
       return 1; // Default to sampling if userAgent not available
     }
 
-    const isBot = BOT_PATTERN.test(navigator.userAgent);
+    const userAgent = navigator.userAgent;
+
+    // Always sample AI agents - we want visibility into how agentic tools consume our docs
+    const isAIAgent = AI_AGENT_PATTERN.test(userAgent);
+    if (isAIAgent) {
+      return 1;
+    }
+
+    // Filter out traditional bots/crawlers
+    const isBot = BOT_PATTERN.test(userAgent);
 
     // Drop traces for bots (return 0), keep for real users (return 1)
     return isBot ? 0 : 1;

From 683eac86faa6cfe8e585e716ea34beabc80784cc Mon Sep 17 00:00:00 2001
From: paulj 
Date: Thu, 29 Jan 2026 15:04:42 -0500
Subject: [PATCH 4/7] Fix false positive: use explicit bot names instead of generic 'bot'

The generic 'bot' pattern incorrectly matched Cubot phone user agents
(e.g., "CUBOT GT99"), dropping traces for legitimate mobile users.

Replace with explicit bot names: googlebot, bingbot, slackbot, etc.
Co-Authored-By: Claude 
Co-authored-by: Cursor 
---
 src/instrumentation-client.ts | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/instrumentation-client.ts b/src/instrumentation-client.ts
index 61af02925a287..5d15a7c84772e 100644
--- a/src/instrumentation-client.ts
+++ b/src/instrumentation-client.ts
@@ -7,9 +7,9 @@ const AI_AGENT_PATTERN =
   /claudebot|claude-web|anthropic|gptbot|chatgpt|openai|cursor|codex|copilot|perplexity|cohere|gemini/i;
 
 // Bots/crawlers to filter out (SEO crawlers, social media, testing tools, monitors)
-// Note: 'bot' is broad but AI agents are allowlisted above
+// Using explicit bot names to avoid false positives (e.g., "bot" would match Cubot phones)
 const BOT_PATTERN =
-  /bot|crawler|spider|scraper|headless|facebookexternalhit|whatsapp|phantomjs|selenium|puppeteer|playwright|lighthouse|pagespeed|gtmetrix|pingdom|uptimerobot/i;
+  /googlebot|bingbot|yandexbot|baiduspider|duckduckbot|slackbot|twitterbot|linkedinbot|telegrambot|discordbot|applebot|ahrefsbot|semrushbot|dotbot|mj12bot|crawler|spider|scraper|headless|facebookexternalhit|whatsapp|phantomjs|selenium|puppeteer|playwright|lighthouse|pagespeed|gtmetrix|pingdom|uptimerobot/i;
 
 Sentry.init({
   dsn: process.env.NEXT_PUBLIC_SENTRY_DSN,

From b6c90b3e49869beeb81581c2c919c9cdfb308e48 Mon Sep 17 00:00:00 2001
From: paulj 
Date: Thu, 29 Jan 2026 15:09:06 -0500
Subject: [PATCH 5/7] Dial back sampling for real users to 30%

---
 src/instrumentation-client.ts | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/instrumentation-client.ts b/src/instrumentation-client.ts
index 5d15a7c84772e..64aef6f2e8f12 100644
--- a/src/instrumentation-client.ts
+++ b/src/instrumentation-client.ts
@@ -18,12 +18,12 @@ Sentry.init({
   tracesSampler: _samplingContext => {
     // Check if running in browser environment
     if (typeof navigator === 'undefined' || !navigator.userAgent) {
-      return 1; // Default to sampling if userAgent not available
+      return 0.3; // Default to sampling if userAgent not available
     }
 
     const userAgent = navigator.userAgent;
 
-    // Always sample AI agents - we want visibility into how agentic tools consume our docs
+    // Always sample AI agents - we want full visibility into agentic docs consumption
     const isAIAgent = AI_AGENT_PATTERN.test(userAgent);
     if (isAIAgent) {
       return 1;
@@ -32,8 +32,8 @@ Sentry.init({
     // Filter out traditional bots/crawlers
     const isBot = BOT_PATTERN.test(userAgent);
 
-    // Drop traces for bots (return 0), keep for real users (return 1)
-    return isBot ? 0 : 1;
+    // Drop traces for bots, sample 30% of real users
+    return isBot ? 
0 : 0.3;
   },
 
   // Enable logs to be sent to Sentry

From dbdc4a42f21adb2dff631b96b70e3c1d7866e278 Mon Sep 17 00:00:00 2001
From: paulj 
Date: Thu, 29 Jan 2026 15:19:25 -0500
Subject: [PATCH 6/7] Update regex patterns for readability

---
 src/instrumentation-client.ts | 62 ++++++++++++++++++++++++++++++++---
 1 file changed, 58 insertions(+), 4 deletions(-)

diff --git a/src/instrumentation-client.ts b/src/instrumentation-client.ts
index 64aef6f2e8f12..0086e263d7838 100644
--- a/src/instrumentation-client.ts
+++ b/src/instrumentation-client.ts
@@ -3,13 +3,67 @@ import * as Spotlight from '@spotlightjs/spotlight';
 
 // AI agents we want to track for docs/markdown consumption visibility
 // These fetch markdown content and we need performance data on serving it to agentic tools
-const AI_AGENT_PATTERN =
-  /claudebot|claude-web|anthropic|gptbot|chatgpt|openai|cursor|codex|copilot|perplexity|cohere|gemini/i;
+const AI_AGENT_PATTERN = new RegExp(
+  [
+    'claudebot',
+    'claude-web',
+    'anthropic',
+    'gptbot',
+    'chatgpt',
+    'openai',
+    'cursor',
+    'codex',
+    'copilot',
+    'perplexity',
+    'cohere',
+    'gemini',
+  ].join('|'),
+  'i'
+);
 
 // Bots/crawlers to filter out (SEO crawlers, social media, testing tools, monitors)
 // Using explicit bot names to avoid false positives (e.g., "bot" would match Cubot phones)
-const BOT_PATTERN =
-  /googlebot|bingbot|yandexbot|baiduspider|duckduckbot|slackbot|twitterbot|linkedinbot|telegrambot|discordbot|applebot|ahrefsbot|semrushbot|dotbot|mj12bot|crawler|spider|scraper|headless|facebookexternalhit|whatsapp|phantomjs|selenium|puppeteer|playwright|lighthouse|pagespeed|gtmetrix|pingdom|uptimerobot/i;
+const BOT_PATTERN = new RegExp(
+  [
+    // Search engine crawlers
+    'googlebot',
+    'bingbot',
+    'yandexbot',
+    'baiduspider',
+    'duckduckbot',
+    'applebot',
+    // SEO tools
+    'ahrefsbot',
+    'semrushbot',
+    'dotbot',
+    'mj12bot',
+    // Social media
+    'slackbot',
+    'twitterbot',
+    'linkedinbot',
+    'telegrambot',
+    'discordbot',
+    'facebookexternalhit',
+    'whatsapp',
+    // Generic patterns
+    'crawler',
+    'spider',
+    'scraper',
+    'headless',
+    // Testing/automation tools
+    'phantomjs',
+    'selenium',
+    'puppeteer',
+    'playwright',
+    // Performance/monitoring tools
+    'lighthouse',
+    'pagespeed',
+    'gtmetrix',
+    'pingdom',
+    'uptimerobot',
+  ].join('|'),
+  'i'
+);
 
 Sentry.init({
   dsn: process.env.NEXT_PUBLIC_SENTRY_DSN,

From 9a141fa47ea5723899e83a70ec5db84d0fc5eabc Mon Sep 17 00:00:00 2001
From: paulj 
Date: Thu, 29 Jan 2026 19:11:33 -0500
Subject: [PATCH 7/7] Move bot filtering to the edge and server runtimes

---
 src/instrumentation-client.ts |  87 +--------------------------
 src/instrumentation.ts        |   6 +-
 src/tracesSampler.ts          | 109 ++++++++++++++++++++++++++++++++++
 3 files changed, 115 insertions(+), 87 deletions(-)
 create mode 100644 src/tracesSampler.ts

diff --git a/src/instrumentation-client.ts b/src/instrumentation-client.ts
index 0086e263d7838..5154224013030 100644
--- a/src/instrumentation-client.ts
+++ b/src/instrumentation-client.ts
@@ -1,94 +1,11 @@
 import * as Sentry from '@sentry/nextjs';
 import * as Spotlight from '@spotlightjs/spotlight';
 
-// AI agents we want to track for docs/markdown consumption visibility
-// These fetch markdown content and we need performance data on serving it to agentic tools
-const AI_AGENT_PATTERN = new RegExp(
-  [
-    'claudebot',
-    'claude-web',
-    'anthropic',
-    'gptbot',
-    'chatgpt',
-    'openai',
-    'cursor',
-    'codex',
-    'copilot',
-    'perplexity',
-    'cohere',
-    'gemini',
-  ].join('|'),
-  'i'
-);
-
-// Bots/crawlers to filter out (SEO crawlers, social media, testing tools, monitors)
-// Using explicit bot names to avoid false positives (e.g., "bot" would match Cubot phones) -const BOT_PATTERN = new RegExp( - [ - // Search engine crawlers - 'googlebot', - 'bingbot', - 'yandexbot', - 'baiduspider', - 'duckduckbot', - 'applebot', - // SEO tools - 'ahrefsbot', - 'semrushbot', - 'dotbot', - 'mj12bot', - // Social media - 'slackbot', - 'twitterbot', - 'linkedinbot', - 'telegrambot', - 'discordbot', - 'facebookexternalhit', - 'whatsapp', - // Generic patterns - 'crawler', - 'spider', - 'scraper', - 'headless', - // Testing/automation tools - 'phantomjs', - 'selenium', - 'puppeteer', - 'playwright', - // Performance/monitoring tools - 'lighthouse', - 'pagespeed', - 'gtmetrix', - 'pingdom', - 'uptimerobot', - ].join('|'), - 'i' -); - Sentry.init({ dsn: process.env.NEXT_PUBLIC_SENTRY_DSN, - // Use tracesSampler to filter out bot/crawler traffic while keeping AI agents - tracesSampler: _samplingContext => { - // Check if running in browser environment - if (typeof navigator === 'undefined' || !navigator.userAgent) { - return 0.3; // Default to sampling if userAgent not available - } - - const userAgent = navigator.userAgent; - - // Always sample AI agents - we want full visibility into agentic docs consumption - const isAIAgent = AI_AGENT_PATTERN.test(userAgent); - if (isAIAgent) { - return 1; - } - - // Filter out traditional bots/crawlers - const isBot = BOT_PATTERN.test(userAgent); - - // Drop traces for bots, sample 30% of real users - return isBot ? 0 : 0.3; - }, + // Adjust this value in production, or use tracesSampler for greater control + tracesSampleRate: 0.3, // Enable logs to be sent to Sentry enableLogs: true, diff --git a/src/instrumentation.ts b/src/instrumentation.ts index 074fa7bbd0a8b..dcd08ba8f6cc0 100644 --- a/src/instrumentation.ts +++ b/src/instrumentation.ts @@ -1,10 +1,12 @@ import * as Sentry from '@sentry/nextjs'; +import {tracesSampler} from './tracesSampler'; + export function register() { if (process.env.NEXT_RUNTIME === 'nodejs') { Sentry.init({ dsn: process.env.NEXT_PUBLIC_SENTRY_DSN, - tracesSampleRate: 1, + tracesSampler, enableLogs: true, debug: false, environment: process.env.NODE_ENV === 'development' ? 'development' : undefined, @@ -29,7 +31,7 @@ export function register() { if (process.env.NEXT_RUNTIME === 'edge') { Sentry.init({ dsn: process.env.NEXT_PUBLIC_SENTRY_DSN, - tracesSampleRate: 1, + tracesSampler, enableLogs: true, debug: false, environment: process.env.NODE_ENV === 'development' ? 
'development' : undefined,
diff --git a/src/tracesSampler.ts b/src/tracesSampler.ts
new file mode 100644
index 0000000000000..4305e85073286
--- /dev/null
+++ b/src/tracesSampler.ts
@@ -0,0 +1,109 @@
+// Sampling context passed to tracesSampler
+// Using inline type to avoid dependency on internal Sentry types
+interface SamplingContext {
+  attributes?: Record<string, unknown>;
+  name?: string;
+  normalizedRequest?: {
+    headers?: Record<string, string | undefined>;
+  };
+  parentSampled?: boolean;
+}
+
+// AI agents we want to track for docs/markdown consumption visibility
+// These fetch markdown content and we need performance data on serving it to agentic tools
+const AI_AGENT_PATTERN = new RegExp(
+  [
+    'claudebot',
+    'claude-web',
+    'anthropic',
+    'gptbot',
+    'chatgpt',
+    'openai',
+    'cursor',
+    'codex',
+    'copilot',
+    'perplexity',
+    'cohere',
+    'gemini',
+  ].join('|'),
+  'i'
+);
+
+// Bots/crawlers to filter out (SEO crawlers, social media, testing tools, monitors)
+// Uses specific bot names where possible, plus generic patterns for common crawler terms
+const BOT_PATTERN = new RegExp(
+  [
+    // Search engine crawlers
+    'googlebot',
+    'bingbot',
+    'yandexbot',
+    'baiduspider',
+    'duckduckbot',
+    'applebot',
+    // SEO tools
+    'ahrefsbot',
+    'semrushbot',
+    'dotbot',
+    'mj12bot',
+    // Social media
+    'slackbot',
+    'twitterbot',
+    'linkedinbot',
+    'telegrambot',
+    'discordbot',
+    'facebookexternalhit',
+    'whatsapp',
+    // Generic patterns
+    'crawler',
+    'spider',
+    'scraper',
+    'headless',
+    // Testing/automation tools
+    'phantomjs',
+    'selenium',
+    'puppeteer',
+    'playwright',
+    // Performance/monitoring tools
+    'lighthouse',
+    'pagespeed',
+    'gtmetrix',
+    'pingdom',
+    'uptimerobot',
+  ].join('|'),
+  'i'
+);
+
+// Default sample rate for real users
+const DEFAULT_SAMPLE_RATE = 0.3;
+
+/**
+ * Determines trace sample rate based on user agent.
+ * - AI agents: 100% (we want full visibility into agentic docs consumption)
+ * - Bots/crawlers: 0% (filter out noise)
+ * - Real users: 30%
+ *
+ * AI agents are checked first, so if something matches both AI and bot patterns, we sample it.
+ */
+export function tracesSampler(samplingContext: SamplingContext): number {
+  // Try to get user agent from normalizedRequest headers (Sentry SDK provides this)
+  // Falls back to OTel semantic convention attributes if normalizedRequest is not available
+  const userAgent =
+    samplingContext.normalizedRequest?.headers?.['user-agent'] ??
+    (samplingContext.attributes?.['http.user_agent'] as string | undefined) ??
+    (samplingContext.attributes?.['user_agent.original'] as string | undefined);
+
+  if (!userAgent) {
+    return DEFAULT_SAMPLE_RATE;
+  }
+
+  if (AI_AGENT_PATTERN.test(userAgent)) {
+    return 1;
+  }
+
+  if (BOT_PATTERN.test(userAgent)) {
+    return 0;
+  }
+
+  // Sample real users at default rate
+  return DEFAULT_SAMPLE_RATE;
+}
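
A minimal sketch of how the tracesSampler from PATCH 7/7 could be exercised in a unit test, including the Cubot regression fixed in PATCH 4/7. The spec file, the vitest runner, and the sample user-agent strings are illustrative assumptions and are not part of the series above.

// src/tracesSampler.spec.ts — sanity checks for the user-agent based sampler
import {expect, test} from 'vitest';

import {tracesSampler} from './tracesSampler';

// Build the minimal sampling-context shape the sampler reads from
const contextFor = (userAgent: string) => ({
  normalizedRequest: {headers: {'user-agent': userAgent}},
});

test('AI agents are sampled at 100%', () => {
  const ua = 'Mozilla/5.0 AppleWebKit/537.36 (compatible; GPTBot/1.2; +https://openai.com/gptbot)';
  expect(tracesSampler(contextFor(ua))).toBe(1);
});

test('search crawlers are dropped', () => {
  const ua = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)';
  expect(tracesSampler(contextFor(ua))).toBe(0);
});

test('Cubot phones count as real users, not bots', () => {
  const ua = 'Mozilla/5.0 (Linux; Android 9; CUBOT GT99) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Mobile Safari/537.36';
  expect(tracesSampler(contextFor(ua))).toBe(0.3);
});

test('missing user agent falls back to the default 30% rate', () => {
  expect(tracesSampler({})).toBe(0.3);
});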