diff --git a/src/a365/exporter/Agent365Exporter.ts b/src/a365/exporter/Agent365Exporter.ts index 2fbf2fe..46fb56b 100644 --- a/src/a365/exporter/Agent365Exporter.ts +++ b/src/a365/exporter/Agent365Exporter.ts @@ -25,7 +25,17 @@ import { } from "./utils.js"; import { getA365Logger } from "../logging.js"; import { OpenTelemetryConstants } from "../constants.js"; -import { isSdkStatsEnabled, recordSuccess, shortHost } from "../../sdkstats/index.js"; +import { + isSdkStatsEnabled, + recordSuccess, + recordFailure, + recordRetry, + recordThrottle, + recordException, + recordDuration, + classifyStatusCode, + shortHost, +} from "../../sdkstats/index.js"; const DEFAULT_MAX_RETRIES = 3; @@ -285,6 +295,7 @@ export class Agent365Exporter implements SpanExporter { } for (let attempt = 0; attempt <= DEFAULT_MAX_RETRIES; attempt++) { + const requestStart = Date.now(); try { const response = await fetch(url, { method: "POST", @@ -299,10 +310,28 @@ export class Agent365Exporter implements SpanExporter { "unknown"; lastCorrelationId = correlationId; - if (response.status >= 200 && response.status < 300) { - if (recordA365Stats) { - recordSuccess(endpointCategory, host); + if (recordA365Stats) { + recordDuration(endpointCategory, host, Date.now() - requestStart); + const kind = classifyStatusCode(response.status); + switch (kind) { + case "success": + recordSuccess(endpointCategory, host); + break; + case "retry": + recordRetry(endpointCategory, host, response.status); + break; + case "throttle": + recordThrottle(endpointCategory, host, response.status); + break; + case "failure": + recordFailure(endpointCategory, host, response.status); + break; + case "ignored": + break; } + } + + if (response.status >= 200 && response.status < 300) { return { ok: true, correlationId }; } @@ -329,6 +358,10 @@ export class Agent365Exporter implements SpanExporter { ); return { ok: false, correlationId }; } catch (error) { + if (recordA365Stats) { + recordDuration(endpointCategory, host, Date.now() - requestStart); + recordException(endpointCategory, host, classifyExceptionType(error)); + } this.logger.error("[Agent365Exporter] Request error:", error); if (attempt < DEFAULT_MAX_RETRIES) { await sleep(200 * (attempt + 1)); @@ -488,6 +521,22 @@ function sleep(ms: number): Promise { return new Promise((resolve) => setTimeout(resolve, ms)); } +/** + * Classify a thrown fetch error into a stable SDKStats `exceptionType` + * label so the dimension cardinality stays bounded. Mirrors the buckets + * the AzMon exporter's statsbeat uses (`ExceptionType` enum in + * `@azure/monitor-opentelemetry-exporter`). + */ +function classifyExceptionType(error: unknown): string { + if (error instanceof Error) { + const name = error.name; + if (name === "AbortError" || name === "TimeoutError") return "Timeout exception"; + if (name === "TypeError") return "Network exception"; + return name || "Client exception"; + } + return "Client exception"; +} + /** * Parse the Retry-After header value into milliseconds. * Supports both delay-seconds (e.g. "120") and HTTP-date formats (RFC 7231 §7.1.3). diff --git a/src/sdkstats/index.ts b/src/sdkstats/index.ts index f123d66..af039af 100644 --- a/src/sdkstats/index.ts +++ b/src/sdkstats/index.ts @@ -36,13 +36,24 @@ export { SdkStatsManager } from "./manager.js"; export { REQUEST_SUCCESS_NAME, + REQUEST_FAILURE_NAME, + REQUEST_DURATION_NAME, + RETRY_COUNT_NAME, + THROTTLE_COUNT_NAME, + EXCEPTION_COUNT_NAME, NETWORK_METRIC_NAMES, recordSuccess, + recordFailure, + recordRetry, + recordThrottle, + recordException, + recordDuration, + classifyStatusCode, drain, shortHost, _resetAllForTest as _resetNetworkStatsForTest, } from "./networkStats.js"; -export type { NetworkMetricName, NetworkKey } from "./networkStats.js"; +export type { NetworkMetricName, NetworkKey, StatusCodeKind } from "./networkStats.js"; export { NetworkStatsSpanExporter, diff --git a/src/sdkstats/metrics.ts b/src/sdkstats/metrics.ts index 55a8622..1f428ff 100644 --- a/src/sdkstats/metrics.ts +++ b/src/sdkstats/metrics.ts @@ -18,7 +18,16 @@ import type { ObservableResult } from "@opentelemetry/api"; import { MICROSOFT_OPENTELEMETRY_VERSION } from "../types.js"; import { getSdkStatsFeatureFlags, getSdkStatsInstrumentationFlags } from "./state.js"; -import { REQUEST_SUCCESS_NAME, drain, type NetworkMetricName } from "./networkStats.js"; +import { + REQUEST_SUCCESS_NAME, + REQUEST_FAILURE_NAME, + REQUEST_DURATION_NAME, + RETRY_COUNT_NAME, + THROTTLE_COUNT_NAME, + EXCEPTION_COUNT_NAME, + drain, + type NetworkMetricName, +} from "./networkStats.js"; /** * Feature SDKStats `type` dimension values, per the Application Insights @@ -40,6 +49,15 @@ interface NetworkGaugeSpec { metric: NetworkMetricName; unit: string; description: string; + /** + * Names of the per-key dimensions for this metric, in the order they + * are encoded in the {@link NetworkKey} returned by `drain()`. The + * first two entries are always `endpoint` and `host`; the third (if + * present) is `statusCode` (failure/retry/throttle) or + * `exceptionType` (exception). `Request_Success_Count` and + * `Request_Duration` have no third dimension. + */ + keyAttributes: readonly string[]; } const NETWORK_GAUGE_SPECS: readonly NetworkGaugeSpec[] = [ @@ -47,6 +65,37 @@ const NETWORK_GAUGE_SPECS: readonly NetworkGaugeSpec[] = [ metric: REQUEST_SUCCESS_NAME, unit: "count", description: "Number of successful HTTP exports per endpoint", + keyAttributes: ["endpoint", "host"], + }, + { + metric: REQUEST_FAILURE_NAME, + unit: "count", + description: "Number of failed HTTP exports per endpoint, broken down by status code", + keyAttributes: ["endpoint", "host", "statusCode"], + }, + { + metric: REQUEST_DURATION_NAME, + unit: "ms", + description: "Average HTTP export request duration per endpoint", + keyAttributes: ["endpoint", "host"], + }, + { + metric: RETRY_COUNT_NAME, + unit: "count", + description: "Number of HTTP exports that returned a retryable status code", + keyAttributes: ["endpoint", "host", "statusCode"], + }, + { + metric: THROTTLE_COUNT_NAME, + unit: "count", + description: "Number of HTTP exports that returned a throttling status code (402/439)", + keyAttributes: ["endpoint", "host", "statusCode"], + }, + { + metric: EXCEPTION_COUNT_NAME, + unit: "count", + description: "Number of HTTP exports that raised an exception (no HTTP response)", + keyAttributes: ["endpoint", "host", "exceptionType"], }, ]; @@ -210,11 +259,14 @@ export class SdkStatsMetrics { private makeNetworkCallback(spec: NetworkGaugeSpec): (result: ObservableResult) => void { return (result: ObservableResult): void => { for (const [key, value] of drain(spec.metric)) { - const attrs: Record = { - ...this.commonAttributes, - endpoint: key[0], - host: key[1], - }; + const attrs: Record = { ...this.commonAttributes }; + for (let i = 0; i < spec.keyAttributes.length; i++) { + const name = spec.keyAttributes[i]; + const part = key[i]; + if (part !== undefined) { + attrs[name] = part; + } + } result.observe(value, attrs); } }; diff --git a/src/sdkstats/networkStats.ts b/src/sdkstats/networkStats.ts index 6a524bf..a373c6b 100644 --- a/src/sdkstats/networkStats.ts +++ b/src/sdkstats/networkStats.ts @@ -4,13 +4,15 @@ /** * Network SDKStats accumulator for SDK self-telemetry. * - * Per-export success counts for telemetry exporters. Exporters call - * {@link recordSuccess} after each successful transmit; the - * {@link SdkStatsMetrics} observable-gauge callbacks drain the - * accumulated counts on each export interval. + * Per-export counters and timings for telemetry exporters. Exporters call + * the {@link recordSuccess} / {@link recordFailure} / {@link recordRetry} / + * {@link recordThrottle} / {@link recordException} / {@link recordDuration} + * functions after each transmit; the {@link SdkStatsMetrics} + * observable-gauge callbacks drain the accumulated counts on each export + * interval. * * Mirrors `src/microsoft/opentelemetry/_sdkstats/_utils.py` from the Python - * distro (microsoft/opentelemetry-distro-python#144). + * distro. */ // Metric names must match the AzMon SDKStats backend's recognized @@ -19,15 +21,27 @@ // Sending envelopes under any other name returns HTTP 200 but the // backend doesn't index them, so they're invisible in the SDKStats // dashboards. The constants below intentionally match the wire-format -// names — do NOT rename them to lowercase. +// names — do NOT rename them. export const REQUEST_SUCCESS_NAME = "Request_Success_Count"; +export const REQUEST_FAILURE_NAME = "Request_Failure_Count"; +export const REQUEST_DURATION_NAME = "Request_Duration"; +export const RETRY_COUNT_NAME = "Retry_Count"; +export const THROTTLE_COUNT_NAME = "Throttle_Count"; +export const EXCEPTION_COUNT_NAME = "Exception_Count"; /** * Names of registered network SDKStats metrics, in registration order. * * @internal */ -export const NETWORK_METRIC_NAMES = [REQUEST_SUCCESS_NAME] as const; +export const NETWORK_METRIC_NAMES = [ + REQUEST_SUCCESS_NAME, + REQUEST_FAILURE_NAME, + REQUEST_DURATION_NAME, + RETRY_COUNT_NAME, + THROTTLE_COUNT_NAME, + EXCEPTION_COUNT_NAME, +] as const; export type NetworkMetricName = (typeof NETWORK_METRIC_NAMES)[number]; @@ -36,21 +50,37 @@ export type NetworkMetricName = (typeof NETWORK_METRIC_NAMES)[number]; * * Per the Application Insights SDKStats spec the per-key dimensions are * `endpoint` (category, e.g. "otlp", "a365") and `host` (stamp-specific - * region or hostname). + * region or hostname), optionally followed by a third dimension — + * `statusCode` (failure/retry/throttle) or `exceptionType` (exception). * * @internal */ -export type NetworkKey = readonly [string, string]; +export type NetworkKey = readonly string[]; // Single-threaded JS execution → no lock needed (Python uses one because of // the GIL + threads; Node.js doesn't share JS objects across worker threads). -const REQUESTS_MAP: Record> = { +type CounterMetricName = Exclude; + +const REQUESTS_MAP: Record> = { [REQUEST_SUCCESS_NAME]: new Map(), + [REQUEST_FAILURE_NAME]: new Map(), + [RETRY_COUNT_NAME]: new Map(), + [THROTTLE_COUNT_NAME]: new Map(), + [EXCEPTION_COUNT_NAME]: new Map(), }; +// Duration is tracked as running sum + count so the observable-gauge +// callback can report the per-interval average (per spec). +interface DurationAccumulator { + sum: number; + count: number; +} +const DURATION_MAP: Map = new Map(); + // `Map` keys are compared by identity for arrays/objects, so we serialize // the key tuple to a string. The `\u0000` separator can't appear in a URL -// hostname or HTTP status string, so this is unambiguous. +// hostname, HTTP status code, or exception-type string, so this is +// unambiguous. const KEY_SEPARATOR = "\u0000"; function encodeKey(key: NetworkKey): string { @@ -58,11 +88,10 @@ function encodeKey(key: NetworkKey): string { } function decodeKey(encoded: string): NetworkKey { - const parts = encoded.split(KEY_SEPARATOR); - return [parts[0], parts[1]] as const; + return encoded.split(KEY_SEPARATOR); } -function bump(metric: NetworkMetricName, key: NetworkKey, value = 1): void { +function bump(metric: CounterMetricName, key: NetworkKey, value = 1): void { const bucket = REQUESTS_MAP[metric]; const encoded = encodeKey(key); bucket.set(encoded, (bucket.get(encoded) ?? 0) + value); @@ -72,6 +101,62 @@ export function recordSuccess(endpoint: string, host: string): void { bump(REQUEST_SUCCESS_NAME, [endpoint, host]); } +export function recordFailure(endpoint: string, host: string, statusCode: number | string): void { + bump(REQUEST_FAILURE_NAME, [endpoint, host, String(statusCode)]); +} + +export function recordRetry(endpoint: string, host: string, statusCode: number | string): void { + bump(RETRY_COUNT_NAME, [endpoint, host, String(statusCode)]); +} + +export function recordThrottle(endpoint: string, host: string, statusCode: number | string): void { + bump(THROTTLE_COUNT_NAME, [endpoint, host, String(statusCode)]); +} + +export function recordException(endpoint: string, host: string, exceptionType: string): void { + bump(EXCEPTION_COUNT_NAME, [endpoint, host, exceptionType]); +} + +export function recordDuration(endpoint: string, host: string, durationMs: number): void { + if (!Number.isFinite(durationMs) || durationMs < 0) return; + const encoded = encodeKey([endpoint, host]); + const existing = DURATION_MAP.get(encoded); + if (existing) { + existing.sum += durationMs; + existing.count += 1; + } else { + DURATION_MAP.set(encoded, { sum: durationMs, count: 1 }); + } +} + +/** + * Classification of an HTTP status code per the Application Insights + * SDKStats Network specification. + * + * - `success`: 200 (and 206 if all envelopes were accepted — handled by + * the caller, this helper only returns the bucket for a single response + * code). + * - `retry`: 401, 403, 408, 429, 500, 502, 503, 504. + * - `throttle`: 402, 439. + * - `failure`: everything else (excluding redirects 307/308 which are + * followed transparently and never reported). + */ +export type StatusCodeKind = "success" | "retry" | "throttle" | "failure" | "ignored"; + +const RETRY_STATUSES = new Set([401, 403, 408, 429, 500, 502, 503, 504]); +const THROTTLE_STATUSES = new Set([402, 439]); +// 206 is handled by the caller (per-envelope breakdown). 307/308 are +// followed by the HTTP client transparently and are not reported. +const IGNORED_STATUSES = new Set([206, 307, 308]); + +export function classifyStatusCode(status: number): StatusCodeKind { + if (status >= 200 && status < 300 && status !== 206) return "success"; + if (IGNORED_STATUSES.has(status)) return "ignored"; + if (THROTTLE_STATUSES.has(status)) return "throttle"; + if (RETRY_STATUSES.has(status)) return "retry"; + return "failure"; +} + /** * Compute the stamp-specific short host for the SDKStats `host` dimension. * @@ -117,10 +202,23 @@ export function shortHost(input: string): string { * * Used by the observable-gauge callbacks so each observation reports only * the delta accumulated during the export interval. + * + * For {@link REQUEST_DURATION_NAME} the reported value is the average of + * all durations recorded since the previous drain, per the spec + * ("Request_Duration ... avg request duration during the scheduled + * interval"). */ export function drain(metric: NetworkMetricName): Map { - const bucket = REQUESTS_MAP[metric]; const snapshot = new Map(); + if (metric === REQUEST_DURATION_NAME) { + for (const [encoded, acc] of DURATION_MAP) { + if (acc.count === 0) continue; + snapshot.set(decodeKey(encoded), acc.sum / acc.count); + } + DURATION_MAP.clear(); + return snapshot; + } + const bucket = REQUESTS_MAP[metric]; for (const [encoded, value] of bucket) { snapshot.set(decodeKey(encoded), value); } @@ -133,6 +231,10 @@ export function drain(metric: NetworkMetricName): Map { */ export function _resetAllForTest(): void { for (const name of NETWORK_METRIC_NAMES) { - REQUESTS_MAP[name].clear(); + if (name === REQUEST_DURATION_NAME) { + DURATION_MAP.clear(); + } else { + REQUESTS_MAP[name].clear(); + } } } diff --git a/src/sdkstats/otlpWrapper.ts b/src/sdkstats/otlpWrapper.ts index 15fe960..8387909 100644 --- a/src/sdkstats/otlpWrapper.ts +++ b/src/sdkstats/otlpWrapper.ts @@ -4,13 +4,26 @@ /** * Network SDKStats wrappers for OTLP exporters. * - * The upstream OTLP HTTP exporters do not surface HTTP status codes — only - * the {@link ExportResult} enum and any raised exception. The decorators - * here capture that signal so the network SDKStats pipeline can record - * success counts per endpoint. + * Decorates upstream OTLP HTTP exporters so the network SDKStats pipeline + * can record per-export success/failure/retry/throttle/exception counts + * and request duration per the Application Insights SDKStats Network + * specification (`endpoint="otlp"`, per-host). * - * Mirrors `src/microsoft/opentelemetry/_sdkstats/_otlp_wrapper.py` from the - * Python distro (microsoft/opentelemetry-distro-python#144). + * ## Upstream signal availability + * + * The upstream `@opentelemetry/otlp-exporter-base` delegate only exposes + * `ExportResult` (SUCCESS/FAILED) plus an optional `error`. For + * non-retryable HTTP responses the error is an `OTLPExporterError` + * carrying the HTTP `code`, so we can record `Request_Failure_Count` + * with the actual status. For HTTP responses the OTLP/HTTP spec + * classifies as retryable (429, 502, 503, 504) the upstream constructs + * a synthetic error with no `code`, so the original status is lost; we + * record `Retry_Count` with `statusCode="unknown"` in that case. + * Network errors, timeouts, and other thrown exceptions are recorded as + * `Exception_Count` with a bounded set of `exceptionType` labels. + * + * Mirrors `src/microsoft/opentelemetry/_sdkstats/_otlp_wrapper.py` from + * the Python distro (microsoft/opentelemetry-distro-python#144). */ import type { ExportResult } from "@opentelemetry/core"; @@ -25,11 +38,124 @@ import type { import type { ReadableSpan, SpanExporter } from "@opentelemetry/sdk-trace-base"; import type { LogRecordExporter, ReadableLogRecord } from "@opentelemetry/sdk-logs"; -import { recordSuccess, shortHost } from "./networkStats.js"; +import { + recordSuccess, + recordFailure, + recordRetry, + recordException, + recordDuration, + shortHost, +} from "./networkStats.js"; /** Per spec, `endpoint` is a category label, not the destination URL. */ const OTLP_ENDPOINT_CATEGORY = "otlp"; +/** + * Sentinel `statusCode` dimension used when the upstream OTLP delegate + * has discarded the original HTTP status code (currently the retryable + * 429/502/503/504 path). Keeps the dimension present per spec. + */ +const OTLP_UNKNOWN_STATUS = "unknown"; + +/** + * Bounded set of `exceptionType` labels for OTLP `Exception_Count`. + * Cardinality must stay bounded so the SDKStats backend can index it. + */ +const EXC_TIMEOUT = "Timeout exception"; +const EXC_NETWORK = "Network exception"; +const EXC_CLIENT = "Client exception"; + +const RETRYABLE_NETWORK_ERROR_CODES = new Set([ + "ECONNRESET", + "ECONNREFUSED", + "EPIPE", + "ETIMEDOUT", + "EAI_AGAIN", + "ENOTFOUND", + "ENETUNREACH", + "EHOSTUNREACH", +]); + +/** + * Per the OTLP/HTTP response specification, retryable HTTP status codes + * are 429, 502, 503, and 504. The upstream delegate normally routes + * these through its `retryable` branch (no status code surfaced), but + * we classify defensively here for the rare case the failure branch + * still carries a retryable code (e.g. retries exhausted). + */ +const OTLP_HTTP_RETRYABLE_STATUSES = new Set([429, 502, 503, 504]); + +interface ErrorWithCode { + code?: unknown; + name?: unknown; + message?: unknown; +} + +function asErrorWithCode(err: unknown): ErrorWithCode | undefined { + return typeof err === "object" && err !== null ? (err as ErrorWithCode) : undefined; +} + +/** + * Treat `error.code` as an HTTP status code only when it is an integer + * in a plausible HTTP response range. Guards against arbitrary numeric + * `code` fields on non-HTTP errors. + */ +function asHttpStatus(code: unknown): number | undefined { + if (typeof code !== "number" || !Number.isInteger(code)) return undefined; + if (code < 100 || code > 599) return undefined; + return code; +} + +function classifyExceptionType(error: unknown): string { + const err = asErrorWithCode(error); + if (!err) return EXC_CLIENT; + const name = typeof err.name === "string" ? err.name : ""; + if (name === "AbortError" || name === "TimeoutError") return EXC_TIMEOUT; + if (typeof err.message === "string" && err.message === "Request timed out") return EXC_TIMEOUT; + if (name === "TypeError") return EXC_NETWORK; + if (typeof err.code === "string" && RETRYABLE_NETWORK_ERROR_CODES.has(err.code)) { + return EXC_NETWORK; + } + return EXC_CLIENT; +} + +/** + * Record the appropriate SDKStats counter for an OTLP export failure, + * given the host and the error surfaced by the upstream delegate. + * + * See file-level "Upstream signal availability" comment for rationale. + */ +function recordOtlpFailure(host: string, error: unknown): void { + const err = asErrorWithCode(error); + const httpStatus = err ? asHttpStatus(err.code) : undefined; + + if (httpStatus !== undefined) { + // The OTLP/HTTP "throttle" classification additionally requires a + // Retry-After header that the upstream delegate does not expose to + // us, so we conservatively bucket 429 as retry rather than throttle. + if (OTLP_HTTP_RETRYABLE_STATUSES.has(httpStatus)) { + recordRetry(OTLP_ENDPOINT_CATEGORY, host, httpStatus); + } else { + recordFailure(OTLP_ENDPOINT_CATEGORY, host, httpStatus); + } + return; + } + + // Upstream delegate's synthetic message for HTTP retryable responses + // (429/502/503/504) discards the status code. Record as retry with an + // "unknown" status code so the dimension stays present per spec. + if ( + err && + typeof err.message === "string" && + err.message === "Export failed with retryable status" + ) { + recordRetry(OTLP_ENDPOINT_CATEGORY, host, OTLP_UNKNOWN_STATUS); + return; + } + + recordException(OTLP_ENDPOINT_CATEGORY, host, classifyExceptionType(error)); +} + /** * Resolve the short-host string for a given OTLP signal. * @@ -55,8 +181,9 @@ function resolveShortHost(signal: "traces" | "metrics" | "logs"): string { /** * Common bookkeeping for an export attempt. * - * On `ExportResultCode.SUCCESS` we record a success count. Other outcomes - * (failure, exception, duration) will be added in a future PR. + * Records success/exception counters and request duration regardless of + * outcome, per the SDKStats spec ("Request_Duration ... avg request + * duration for all requests during the scheduled interval"). */ function wrapExport( host: string, @@ -64,14 +191,28 @@ function wrapExport( resultCallback: (result: ExportResult) => void, _items: T, ): void { + const start = Date.now(); + let settled = false; const settle = (result: ExportResult): void => { + if (settled) return; + settled = true; + recordDuration(OTLP_ENDPOINT_CATEGORY, host, Date.now() - start); if (result.code === ExportResultCode.SUCCESS) { recordSuccess(OTLP_ENDPOINT_CATEGORY, host); + } else { + recordOtlpFailure(host, result.error); } resultCallback(result); }; - inner(settle); + try { + inner(settle); + } catch (err) { + settle({ + code: ExportResultCode.FAILED, + error: err instanceof Error ? err : new Error(String(err)), + }); + } } /** @@ -148,7 +289,7 @@ export class NetworkStatsLogExporter implements LogRecordExporter { } forceFlush(): Promise { - return this.inner.forceFlush ? this.inner.forceFlush() : Promise.resolve(); + return this.inner.forceFlush?.() ?? Promise.resolve(); } shutdown(): Promise { diff --git a/test/internal/unit/a365/agent365NetworkStats.test.ts b/test/internal/unit/a365/agent365NetworkStats.test.ts index 3ed6e75..9b91f67 100644 --- a/test/internal/unit/a365/agent365NetworkStats.test.ts +++ b/test/internal/unit/a365/agent365NetworkStats.test.ts @@ -7,7 +7,12 @@ import type { ReadableSpan } from "@opentelemetry/sdk-trace-base"; import { Agent365Exporter } from "../../../../src/a365/exporter/Agent365Exporter.js"; import { + EXCEPTION_COUNT_NAME, + REQUEST_DURATION_NAME, + REQUEST_FAILURE_NAME, REQUEST_SUCCESS_NAME, + RETRY_COUNT_NAME, + THROTTLE_COUNT_NAME, _resetAllForTest, drain, } from "../../../../src/sdkstats/networkStats.js"; @@ -119,4 +124,86 @@ describe("Agent365Exporter network SDKStats", () => { expect(drain(REQUEST_SUCCESS_NAME).size).toBe(0); }); + + it("records request_failure_count with statusCode on a non-retryable, non-throttle 4xx", async () => { + fetchSpy.mockResolvedValue({ status: 404, headers: new Map() }); + + const exporter = new Agent365Exporter({ tokenResolver: () => "tok" }); + await exportSpan(exporter); + + const failures = drain(REQUEST_FAILURE_NAME); + expect(failures.size).toBe(1); + const [key, count] = [...failures.entries()][0]; + expect(key[0]).toBe("a365"); + expect(key[2]).toBe("404"); + expect(count).toBe(1); + }); + + it("records retry_count once per retryable response (with statusCode)", async () => { + fetchSpy.mockResolvedValue({ status: 503, headers: new Map() }); + vi.spyOn(globalThis, "setTimeout").mockImplementation(((cb: () => void) => { + cb(); + return 0 as unknown as NodeJS.Timeout; + }) as typeof setTimeout); + + const exporter = new Agent365Exporter({ tokenResolver: () => "tok" }); + await exportSpan(exporter); + + const retries = drain(RETRY_COUNT_NAME); + expect(retries.size).toBe(1); + const [key, count] = [...retries.entries()][0]; + expect(key[2]).toBe("503"); + // DEFAULT_MAX_RETRIES = 3 → 4 total attempts, all 503. + expect(count).toBe(4); + }); + + it("records throttle_count with statusCode on 439 (pure throttle status)", async () => { + // Per the SDKStats spec, THROTTLE_STATUSES = {402, 439}. 429 is classified + // as retry (not throttle) — classifyStatusCode checks THROTTLE_STATUSES + // first, then RETRY_STATUSES, and 429 only appears in the retry set. + fetchSpy.mockResolvedValue({ status: 439, headers: new Map() }); + vi.spyOn(globalThis, "setTimeout").mockImplementation(((cb: () => void) => { + cb(); + return 0 as unknown as NodeJS.Timeout; + }) as typeof setTimeout); + + const exporter = new Agent365Exporter({ tokenResolver: () => "tok" }); + await exportSpan(exporter); + + const throttles = drain(THROTTLE_COUNT_NAME); + expect(throttles.size).toBe(1); + const [key] = [...throttles.entries()][0]; + expect(key[2]).toBe("439"); + }); + + it("records exception_count when fetch rejects", async () => { + fetchSpy.mockRejectedValue(new Error("boom")); + vi.spyOn(globalThis, "setTimeout").mockImplementation(((cb: () => void) => { + cb(); + return 0 as unknown as NodeJS.Timeout; + }) as typeof setTimeout); + + const exporter = new Agent365Exporter({ tokenResolver: () => "tok" }); + await exportSpan(exporter); + + const exceptions = drain(EXCEPTION_COUNT_NAME); + expect(exceptions.size).toBe(1); + const [key, count] = [...exceptions.entries()][0]; + expect(key[0]).toBe("a365"); + // 4 attempts (initial + 3 retries) each throw. + expect(count).toBe(4); + }); + + it("records request_duration on each attempt regardless of outcome", async () => { + fetchSpy.mockResolvedValue({ status: 200, headers: new Map() }); + + const exporter = new Agent365Exporter({ tokenResolver: () => "tok" }); + await exportSpan(exporter); + + const durations = drain(REQUEST_DURATION_NAME); + expect(durations.size).toBe(1); + const [key, avg] = [...durations.entries()][0]; + expect(key[0]).toBe("a365"); + expect(avg).toBeGreaterThanOrEqual(0); + }); }); diff --git a/test/internal/unit/sdkstats/metrics.test.ts b/test/internal/unit/sdkstats/metrics.test.ts index fd6c55c..030dbe2 100644 --- a/test/internal/unit/sdkstats/metrics.test.ts +++ b/test/internal/unit/sdkstats/metrics.test.ts @@ -5,9 +5,19 @@ import { describe, it, beforeEach, expect } from "vitest"; import { MeterProvider } from "@opentelemetry/sdk-metrics"; import { + EXCEPTION_COUNT_NAME, + REQUEST_DURATION_NAME, + REQUEST_FAILURE_NAME, REQUEST_SUCCESS_NAME, + RETRY_COUNT_NAME, + THROTTLE_COUNT_NAME, _resetAllForTest as _resetNetworkStatsForTest, + recordDuration, + recordException, + recordFailure, + recordRetry, recordSuccess, + recordThrottle, } from "../../../../src/sdkstats/networkStats.js"; import { FEATURE_TYPE_FEATURE, @@ -234,5 +244,58 @@ describe("sdkstats/metrics", () => { await meterProvider.shutdown(); _resetNetworkStatsForTest(); }); + + it("emits failure/retry/throttle/exception observations with the appropriate dimension and an avg duration", async () => { + _resetNetworkStatsForTest(); + recordFailure("a365", "westus", 404); + recordRetry("a365", "westus", 503); + recordThrottle("a365", "westus", 439); + recordException("a365", "westus", "Timeout exception"); + recordDuration("a365", "westus", 100); + recordDuration("a365", "westus", 200); + + const { PeriodicExportingMetricReader } = await import("@opentelemetry/sdk-metrics"); + const exporter = new InMemoryMetricExporter(AggregationTemporality.CUMULATIVE); + const reader = new PeriodicExportingMetricReader({ + exporter, + exportIntervalMillis: 60_000, + }); + const meterProvider = new MeterProvider({ readers: [reader] }); + new SdkStatsMetrics(meterProvider); + + await meterProvider.forceFlush(); + + const byName = (name: string) => + exporter + .getMetrics() + .flatMap((rm) => rm.scopeMetrics.flatMap((sm) => sm.metrics)) + .filter((m) => m.descriptor.name === name) + .flatMap((m) => m.dataPoints); + + const failures = byName(REQUEST_FAILURE_NAME); + expect(failures).toHaveLength(1); + expect(failures[0].attributes.statusCode).toBe("404"); + + const retries = byName(RETRY_COUNT_NAME); + expect(retries[0].attributes.statusCode).toBe("503"); + + const throttles = byName(THROTTLE_COUNT_NAME); + expect(throttles[0].attributes.statusCode).toBe("439"); + + const exceptions = byName(EXCEPTION_COUNT_NAME); + expect(exceptions[0].attributes.exceptionType).toBe("Timeout exception"); + + const durations = byName(REQUEST_DURATION_NAME); + expect(durations).toHaveLength(1); + expect(durations[0].value).toBe(150); + expect(durations[0].attributes.endpoint).toBe("a365"); + expect(durations[0].attributes.host).toBe("westus"); + // Duration has no statusCode / exceptionType dimension. + expect(durations[0].attributes.statusCode).toBeUndefined(); + expect(durations[0].attributes.exceptionType).toBeUndefined(); + + await meterProvider.shutdown(); + _resetNetworkStatsForTest(); + }); }); }); diff --git a/test/internal/unit/sdkstats/networkStats.test.ts b/test/internal/unit/sdkstats/networkStats.test.ts index 7a74a53..703c1fc 100644 --- a/test/internal/unit/sdkstats/networkStats.test.ts +++ b/test/internal/unit/sdkstats/networkStats.test.ts @@ -4,11 +4,22 @@ import { beforeEach, describe, expect, it } from "vitest"; import { + EXCEPTION_COUNT_NAME, NETWORK_METRIC_NAMES, + REQUEST_DURATION_NAME, + REQUEST_FAILURE_NAME, REQUEST_SUCCESS_NAME, + RETRY_COUNT_NAME, + THROTTLE_COUNT_NAME, _resetAllForTest, + classifyStatusCode, drain, + recordDuration, + recordException, + recordFailure, + recordRetry, recordSuccess, + recordThrottle, shortHost, } from "../../../../src/sdkstats/networkStats.js"; @@ -17,9 +28,106 @@ describe("sdkstats/networkStats", () => { _resetAllForTest(); }); - it("exposes the Request_Success_Count metric name", () => { - expect(NETWORK_METRIC_NAMES).toEqual([REQUEST_SUCCESS_NAME]); + it("exposes all six SDKStats network metric names", () => { + expect(NETWORK_METRIC_NAMES).toEqual([ + REQUEST_SUCCESS_NAME, + REQUEST_FAILURE_NAME, + REQUEST_DURATION_NAME, + RETRY_COUNT_NAME, + THROTTLE_COUNT_NAME, + EXCEPTION_COUNT_NAME, + ]); expect(REQUEST_SUCCESS_NAME).toBe("Request_Success_Count"); + expect(REQUEST_FAILURE_NAME).toBe("Request_Failure_Count"); + expect(REQUEST_DURATION_NAME).toBe("Request_Duration"); + expect(RETRY_COUNT_NAME).toBe("Retry_Count"); + expect(THROTTLE_COUNT_NAME).toBe("Throttle_Count"); + expect(EXCEPTION_COUNT_NAME).toBe("Exception_Count"); + }); + + it("records failure/retry/throttle counts keyed by (endpoint, host, statusCode)", () => { + recordFailure("a365", "westus", 400); + recordFailure("a365", "westus", 400); + recordFailure("a365", "westus", 404); + recordRetry("a365", "westus", 503); + recordThrottle("a365", "westus", 429); + + const failures = drain(REQUEST_FAILURE_NAME); + expect(failures.size).toBe(2); + expect(failures.get([...failures.keys()].find((k) => k[2] === "400")!)).toBe(2); + expect(failures.get([...failures.keys()].find((k) => k[2] === "404")!)).toBe(1); + + const retries = drain(RETRY_COUNT_NAME); + expect([...retries.entries()]).toEqual([[["a365", "westus", "503"], 1]]); + + const throttles = drain(THROTTLE_COUNT_NAME); + expect([...throttles.entries()]).toEqual([[["a365", "westus", "429"], 1]]); + }); + + it("records exception counts keyed by (endpoint, host, exceptionType)", () => { + recordException("otlp", "collector", "Timeout exception"); + recordException("otlp", "collector", "Timeout exception"); + recordException("otlp", "collector", "Network exception"); + + const exceptions = drain(EXCEPTION_COUNT_NAME); + expect(exceptions.size).toBe(2); + const entries = [...exceptions.entries()].sort(([a], [b]) => a[2].localeCompare(b[2])); + expect(entries).toEqual([ + [["otlp", "collector", "Network exception"], 1], + [["otlp", "collector", "Timeout exception"], 2], + ]); + }); + + it("recordDuration averages recorded durations per (endpoint, host) on drain", () => { + recordDuration("a365", "westus", 100); + recordDuration("a365", "westus", 300); + recordDuration("a365", "eastus", 50); + + const durations = drain(REQUEST_DURATION_NAME); + expect(durations.size).toBe(2); + const map = new Map([...durations.entries()].map(([k, v]) => [k[1], v])); + expect(map.get("westus")).toBe(200); + expect(map.get("eastus")).toBe(50); + + // Second drain is empty (atomic reset). + expect(drain(REQUEST_DURATION_NAME).size).toBe(0); + }); + + it("recordDuration ignores negative or non-finite values", () => { + recordDuration("a365", "westus", -1); + recordDuration("a365", "westus", NaN); + recordDuration("a365", "westus", Infinity); + expect(drain(REQUEST_DURATION_NAME).size).toBe(0); + }); + + describe("classifyStatusCode", () => { + it("buckets 2xx (except 206) as success", () => { + expect(classifyStatusCode(200)).toBe("success"); + expect(classifyStatusCode(204)).toBe("success"); + expect(classifyStatusCode(206)).toBe("ignored"); + }); + + it("buckets retryable statuses correctly", () => { + for (const s of [401, 403, 408, 429, 500, 502, 503, 504]) { + expect(classifyStatusCode(s)).toBe("retry"); + } + }); + + it("buckets throttle statuses correctly", () => { + expect(classifyStatusCode(402)).toBe("throttle"); + expect(classifyStatusCode(439)).toBe("throttle"); + }); + + it("treats 307/308 redirects as ignored", () => { + expect(classifyStatusCode(307)).toBe("ignored"); + expect(classifyStatusCode(308)).toBe("ignored"); + }); + + it("treats other 4xx/5xx as failure", () => { + expect(classifyStatusCode(400)).toBe("failure"); + expect(classifyStatusCode(404)).toBe("failure"); + expect(classifyStatusCode(501)).toBe("failure"); + }); }); it("accumulates success counts per (endpoint, host) and reports keys as two-element tuples", () => { diff --git a/test/internal/unit/sdkstats/otlpWrapper.test.ts b/test/internal/unit/sdkstats/otlpWrapper.test.ts index 3d5dfa1..68a0536 100644 --- a/test/internal/unit/sdkstats/otlpWrapper.test.ts +++ b/test/internal/unit/sdkstats/otlpWrapper.test.ts @@ -14,7 +14,11 @@ import { NetworkStatsSpanExporter, } from "../../../../src/sdkstats/otlpWrapper.js"; import { + EXCEPTION_COUNT_NAME, + REQUEST_DURATION_NAME, + REQUEST_FAILURE_NAME, REQUEST_SUCCESS_NAME, + RETRY_COUNT_NAME, _resetAllForTest, drain, } from "../../../../src/sdkstats/networkStats.js"; @@ -80,12 +84,119 @@ describe("sdkstats/otlpWrapper", () => { expect([...success.entries()]).toEqual([[[ENDPOINT, HOST], 1]]); }); - it("does not record success on FAILED result", async () => { + it("records an exception on FAILED result with no error, and records duration", async () => { const inner = makeFakeSpanExporter({ code: ExportResultCode.FAILED }); const wrapper = new NetworkStatsSpanExporter(inner); await new Promise((resolve) => wrapper.export([], () => resolve())); expect(drain(REQUEST_SUCCESS_NAME).size).toBe(0); + + const exceptions = drain(EXCEPTION_COUNT_NAME); + expect(exceptions.size).toBe(1); + const [key, count] = [...exceptions.entries()][0]; + expect(key).toEqual([ENDPOINT, HOST, "Client exception"]); + expect(count).toBe(1); + + // Duration is recorded regardless of outcome. + expect(drain(REQUEST_DURATION_NAME).size).toBe(1); + }); + + it("records Request_Failure_Count with the HTTP status code when the error carries one", async () => { + const httpError = Object.assign(new Error("Bad Request"), { + name: "OTLPExporterError", + code: 400, + }); + const inner = makeFakeSpanExporter({ code: ExportResultCode.FAILED, error: httpError }); + const wrapper = new NetworkStatsSpanExporter(inner); + await new Promise((resolve) => wrapper.export([], () => resolve())); + + expect(drain(REQUEST_SUCCESS_NAME).size).toBe(0); + expect(drain(EXCEPTION_COUNT_NAME).size).toBe(0); + const failures = drain(REQUEST_FAILURE_NAME); + expect([...failures.entries()]).toEqual([[[ENDPOINT, HOST, "400"], 1]]); + }); + + it("records Retry_Count when the HTTP status code is a retryable OTLP code (429/502/503/504)", async () => { + for (const status of [429, 502, 503, 504]) { + _resetAllForTest(); + const httpError = Object.assign(new Error(""), { + name: "OTLPExporterError", + code: status, + }); + const inner = makeFakeSpanExporter({ + code: ExportResultCode.FAILED, + error: httpError, + }); + const wrapper = new NetworkStatsSpanExporter(inner); + await new Promise((resolve) => wrapper.export([], () => resolve())); + const retries = drain(RETRY_COUNT_NAME); + expect([...retries.entries()]).toEqual([[[ENDPOINT, HOST, String(status)], 1]]); + expect(drain(REQUEST_FAILURE_NAME).size).toBe(0); + } + }); + + it("records Retry_Count with statusCode='unknown' when upstream surfaces a synthetic retryable error", async () => { + const retryableError = Object.assign(new Error("Export failed with retryable status"), { + name: "OTLPExporterError", + }); + const inner = makeFakeSpanExporter({ + code: ExportResultCode.FAILED, + error: retryableError, + }); + const wrapper = new NetworkStatsSpanExporter(inner); + await new Promise((resolve) => wrapper.export([], () => resolve())); + + const retries = drain(RETRY_COUNT_NAME); + expect([...retries.entries()]).toEqual([[[ENDPOINT, HOST, "unknown"], 1]]); + expect(drain(EXCEPTION_COUNT_NAME).size).toBe(0); + }); + + it("records Exception_Count with a bounded type for timeouts and network errors", async () => { + const cases: Array<[Error, string]> = [ + [Object.assign(new Error("aborted"), { name: "AbortError" }), "Timeout exception"], + [Object.assign(new Error("timed out"), { name: "TimeoutError" }), "Timeout exception"], + [new Error("Request timed out"), "Timeout exception"], + [new TypeError("fetch failed"), "Network exception"], + [Object.assign(new Error("conn refused"), { code: "ECONNREFUSED" }), "Network exception"], + [Object.assign(new Error("dns"), { code: "ENOTFOUND" }), "Network exception"], + ]; + + for (const [err, expected] of cases) { + _resetAllForTest(); + const inner = makeFakeSpanExporter({ code: ExportResultCode.FAILED, error: err }); + const wrapper = new NetworkStatsSpanExporter(inner); + await new Promise((resolve) => wrapper.export([], () => resolve())); + const exc = drain(EXCEPTION_COUNT_NAME); + expect([...exc.keys()][0]).toEqual([ENDPOINT, HOST, expected]); + } + }); + + it("ignores non-HTTP numeric codes (e.g. string-coded Node errors)", async () => { + // Some Node errors expose a string `code` (e.g. 'ECONNRESET'); other + // errors may expose a numeric code that is not an HTTP status. Both + // should fall through to Exception_Count rather than be misread as + // an HTTP failure/retry. + const weirdError = Object.assign(new Error("not http"), { code: 12345 }); + const inner = makeFakeSpanExporter({ code: ExportResultCode.FAILED, error: weirdError }); + const wrapper = new NetworkStatsSpanExporter(inner); + await new Promise((resolve) => wrapper.export([], () => resolve())); + + expect(drain(REQUEST_FAILURE_NAME).size).toBe(0); + expect(drain(RETRY_COUNT_NAME).size).toBe(0); + const exc = drain(EXCEPTION_COUNT_NAME); + expect([...exc.keys()][0]).toEqual([ENDPOINT, HOST, "Client exception"]); + }); + + it("records a request duration on SUCCESS", async () => { + const inner = makeFakeSpanExporter({ code: ExportResultCode.SUCCESS }); + const wrapper = new NetworkStatsSpanExporter(inner); + await new Promise((resolve) => wrapper.export([], () => resolve())); + + const durations = drain(REQUEST_DURATION_NAME); + expect(durations.size).toBe(1); + const [key, avg] = [...durations.entries()][0]; + expect(key).toEqual([ENDPOINT, HOST]); + expect(avg).toBeGreaterThanOrEqual(0); }); it("forwards forceFlush and shutdown", async () => { @@ -98,6 +209,29 @@ describe("sdkstats/otlpWrapper", () => { expect(flushSpy).toHaveBeenCalledOnce(); expect(shutdownSpy).toHaveBeenCalledOnce(); }); + + it("records exception + duration and surfaces FAILED when inner throws synchronously", async () => { + const boom = new Error("inner blew up"); + const throwingInner: SpanExporter = { + export(): void { + throw boom; + }, + shutdown: () => Promise.resolve(), + forceFlush: () => Promise.resolve(), + }; + const wrapper = new NetworkStatsSpanExporter(throwingInner); + + const result = await new Promise((resolve) => + wrapper.export([], (r) => resolve(r)), + ); + expect(result.code).toBe(ExportResultCode.FAILED); + expect(result.error).toBe(boom); + + const exceptions = drain(EXCEPTION_COUNT_NAME); + expect(exceptions.size).toBe(1); + expect([...exceptions.keys()][0]).toEqual([ENDPOINT, HOST, "Client exception"]); + expect(drain(REQUEST_DURATION_NAME).size).toBe(1); + }); }); describe("NetworkStatsMetricExporter", () => {