Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 11 additions & 5 deletions src/a365/exporter/Agent365Exporter.ts
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,12 @@ import {
classifyStatusCode,
shortHost,
} from "../../sdkstats/index.js";
import {
A365_ENDPOINT_CATEGORY,
EXC_TIMEOUT,
EXC_NETWORK,
EXC_CLIENT,
} from "../../sdkstats/constants.js";

const DEFAULT_MAX_RETRIES = 3;

Expand Down Expand Up @@ -288,7 +294,7 @@ export class Agent365Exporter implements SpanExporter {
// the URL or re-checking env on every iteration. `endpoint` is the
// category label per spec — A365 transmits report endpoint="a365".
const recordA365Stats = isSdkStatsEnabled();
const endpointCategory = "a365";
const endpointCategory = A365_ENDPOINT_CATEGORY;
let host = url;
if (recordA365Stats) {
host = shortHost(url);
Expand Down Expand Up @@ -530,11 +536,11 @@ function sleep(ms: number): Promise<void> {
function classifyExceptionType(error: unknown): string {
  // Maps a thrown value to the `exceptionType` label recorded for an
  // export attempt. Labels come from the shared SDKStats constants so the
  // A365 exporter and the OTLP wrapper emit the same strings.

  // Anything that is not an Error instance (thrown strings, plain
  // objects, ...) has no usable name and falls into the generic bucket.
  if (!(error instanceof Error)) {
    return EXC_CLIENT;
  }

  const name = error.name;
  if (name === "AbortError" || name === "TimeoutError") return EXC_TIMEOUT;
  if (name === "TypeError") return EXC_NETWORK;

  // NOTE(review): any other non-empty error name is passed through
  // verbatim, so the set of labels is not strictly bounded by the EXC_*
  // constants. Confirm this is intended given the bounded-cardinality
  // requirement on `exceptionType`.
  return name || EXC_CLIENT;
}

/**
Expand Down
118 changes: 118 additions & 0 deletions src/sdkstats/constants.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

/**
* Shared constants for the SDKStats Network pipeline.
*
* Centralizes the wire-format metric names, HTTP status-code buckets,
* endpoint category labels, and bounded `exceptionType` strings used by
* the network statsbeat accumulator ({@link ./networkStats}), the OTLP
* exporter wrapper ({@link ./otlpWrapper}), and the A365 exporter
* ({@link ../a365/exporter/Agent365Exporter}).
*
* Ideally the wire-format metric names would be imported directly from
* the `StatsbeatCounter` enum in `@azure/monitor-opentelemetry-exporter`
* so we have a single source of truth. That enum is currently shipped at
* `dist/{esm,commonjs}/export/statsbeat/types.{js,d.ts}`, but the
* package's `package.json#exports` field only publishes `.` and
* `./package.json`, so under our `moduleResolution: NodeNext` config a
* direct `import { StatsbeatCounter } from
* "@azure/monitor-opentelemetry-exporter/dist/esm/export/statsbeat/types.js"`
* fails with `TS2307: Cannot find module … or its corresponding type
* declarations`. Until the exporter exposes the enum from its public
* entry point (tracked upstream in
* https://github.com/Azure/azure-sdk-for-js, sdk/monitor/monitor-opentelemetry-exporter)
* we mirror the values here and keep them in lockstep — sending envelopes
* under any other name returns HTTP 200 but the AzMon SDKStats backend
* doesn't index them.
*/

// ---------------------------------------------------------------------------
// Wire-format metric names. Must match the `StatsbeatCounter` enum in
// `@azure/monitor-opentelemetry-exporter/dist/{esm,commonjs}/export/statsbeat/types.js`.
// ---------------------------------------------------------------------------

// Each value below is the exact string sent on the wire. They mirror the
// upstream `StatsbeatCounter` enum and must only change together with it.
export const REQUEST_SUCCESS_NAME = "Request_Success_Count";
export const REQUEST_FAILURE_NAME = "Request_Failure_Count";
export const REQUEST_DURATION_NAME = "Request_Duration";
export const RETRY_COUNT_NAME = "Retry_Count";
export const THROTTLE_COUNT_NAME = "Throttle_Count";
export const EXCEPTION_COUNT_NAME = "Exception_Count";

/**
 * Names of registered network SDKStats metrics, in registration order.
 *
 * Declared `as const` so the element type is the union of the literal
 * metric names rather than `string`.
 *
 * @internal
 */
export const NETWORK_METRIC_NAMES = [
  REQUEST_SUCCESS_NAME,
  REQUEST_FAILURE_NAME,
  REQUEST_DURATION_NAME,
  RETRY_COUNT_NAME,
  THROTTLE_COUNT_NAME,
  EXCEPTION_COUNT_NAME,
] as const;

/** Union of the literal metric names listed in {@link NETWORK_METRIC_NAMES}. */
export type NetworkMetricName = (typeof NETWORK_METRIC_NAMES)[number];

// ---------------------------------------------------------------------------
// HTTP status-code buckets per the Application Insights SDKStats Network
// specification. Used by `classifyStatusCode` and by exporter wrappers that
// need a defensive secondary classification.
// ---------------------------------------------------------------------------

/** HTTP status codes that belong to the "retry" bucket. */
export const RETRY_STATUSES: ReadonlySet<number> = new Set<number>([
  401,
  403,
  408,
  429,
  500,
  502,
  503,
  504,
]);

/** HTTP status codes that belong to the "throttle" bucket. */
export const THROTTLE_STATUSES: ReadonlySet<number> = new Set<number>([
  402,
  439,
]);

/**
 * HTTP status codes that are not reported at all.
 *
 * - 206 is broken down per envelope by the caller.
 * - 307 / 308 are redirects the HTTP client follows transparently.
 */
export const IGNORED_STATUSES: ReadonlySet<number> = new Set<number>([
  206,
  307,
  308,
]);

/**
 * HTTP status codes the OTLP/HTTP response specification marks as
 * retryable: 429, 502, 503 and 504.
 *
 * The upstream OTLP delegate usually handles these in its `retryable`
 * branch, where no status code is surfaced. Wrappers still check this
 * set defensively, because the failure branch can occasionally carry a
 * retryable code (for example once retries are exhausted).
 */
export const OTLP_HTTP_RETRYABLE_STATUSES: ReadonlySet<number> = new Set<number>([
  429,
  502,
  503,
  504,
]);

// ---------------------------------------------------------------------------
// Endpoint category labels. Per spec, `endpoint` is a category label, not
// the destination URL.
// ---------------------------------------------------------------------------

/** `endpoint` category label reported by the OTLP exporter wrapper. */
export const OTLP_ENDPOINT_CATEGORY = "otlp";
/** `endpoint` category label reported by the A365 exporter. */
export const A365_ENDPOINT_CATEGORY = "a365";

/**
 * Sentinel `statusCode` dimension used when the upstream OTLP delegate
 * has discarded the original HTTP status code (currently the retryable
 * 429/502/503/504 path). Keeps the dimension present per spec.
 */
export const OTLP_UNKNOWN_STATUS = "unknown";

// ---------------------------------------------------------------------------
// Bounded set of `exceptionType` labels for `Exception_Count`.
// Cardinality must stay bounded so the SDKStats backend can index it.
// ---------------------------------------------------------------------------

/** Label for timeout/abort failures (e.g. errors named `AbortError` or `TimeoutError`). */
export const EXC_TIMEOUT = "Timeout exception";
/** Label for network-level failures (e.g. transient socket or DNS errors). */
export const EXC_NETWORK = "Network exception";
/** Fallback label for failures that match neither of the buckets above. */
export const EXC_CLIENT = "Client exception";

/**
 * Node.js system error codes (the `code` property on an error) that are
 * treated as transient network failures. An exception carrying one of
 * these codes is classified into the `Network exception` bucket.
 */
export const RETRYABLE_NETWORK_ERROR_CODES: ReadonlySet<string> = new Set<string>([
  // Connection-level failures.
  "ECONNRESET", "ECONNREFUSED", "EPIPE", "ETIMEDOUT",
  // DNS resolution failures.
  "EAI_AGAIN", "ENOTFOUND",
  // Routing failures.
  "ENETUNREACH", "EHOSTUNREACH",
]);
52 changes: 23 additions & 29 deletions src/sdkstats/networkStats.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,35 +15,35 @@
* distro.
*/

// Metric names must match the AzMon SDKStats backend's recognized
// schema (see `StatsbeatCounter` enum in
// `@azure/monitor-opentelemetry-exporter/dist/esm/export/statsbeat/types.js`).
// Sending envelopes under any other name returns HTTP 200 but the
// backend doesn't index them, so they're invisible in the SDKStats
// dashboards. The constants below intentionally match the wire-format
// names — do NOT rename them.
export const REQUEST_SUCCESS_NAME = "Request_Success_Count";
export const REQUEST_FAILURE_NAME = "Request_Failure_Count";
export const REQUEST_DURATION_NAME = "Request_Duration";
export const RETRY_COUNT_NAME = "Retry_Count";
export const THROTTLE_COUNT_NAME = "Throttle_Count";
export const EXCEPTION_COUNT_NAME = "Exception_Count";

/**
* Names of registered network SDKStats metrics, in registration order.
*
* @internal
*/
export const NETWORK_METRIC_NAMES = [
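// Wire-format metric names and the HTTP status-code buckets used below
// live in `./constants.js` so the OTLP/A365 exporter wrappers can share
// a single source of truth. The metric names, `NETWORK_METRIC_NAMES`, and
// the `NetworkMetricName` type are re-exported here for backwards
// compatibility with existing imports of these symbols from
// `./networkStats.js`; the status-code sets are imported for local use
// only and are not re-exported.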
import {
REQUEST_SUCCESS_NAME,
REQUEST_FAILURE_NAME,
REQUEST_DURATION_NAME,
RETRY_COUNT_NAME,
THROTTLE_COUNT_NAME,
EXCEPTION_COUNT_NAME,
] as const;

export type NetworkMetricName = (typeof NETWORK_METRIC_NAMES)[number];
NETWORK_METRIC_NAMES,
RETRY_STATUSES,
THROTTLE_STATUSES,
IGNORED_STATUSES,
type NetworkMetricName,
} from "./constants.js";

export {
REQUEST_SUCCESS_NAME,
REQUEST_FAILURE_NAME,
REQUEST_DURATION_NAME,
RETRY_COUNT_NAME,
THROTTLE_COUNT_NAME,
EXCEPTION_COUNT_NAME,
NETWORK_METRIC_NAMES,
};
export type { NetworkMetricName };

/**
* Composite key for an aggregated network SDKStats counter.
Expand Down Expand Up @@ -143,12 +143,6 @@ export function recordDuration(endpoint: string, host: string, durationMs: numbe
*/
export type StatusCodeKind = "success" | "retry" | "throttle" | "failure" | "ignored";

const RETRY_STATUSES = new Set([401, 403, 408, 429, 500, 502, 503, 504]);
const THROTTLE_STATUSES = new Set([402, 439]);
// 206 is handled by the caller (per-envelope breakdown). 307/308 are
// followed by the HTTP client transparently and are not reported.
const IGNORED_STATUSES = new Set([206, 307, 308]);

export function classifyStatusCode(status: number): StatusCodeKind {
if (status >= 200 && status < 300 && status !== 206) return "success";
if (IGNORED_STATUSES.has(status)) return "ignored";
Expand Down
47 changes: 9 additions & 38 deletions src/sdkstats/otlpWrapper.ts
Original file line number Diff line number Diff line change
Expand Up @@ -46,44 +46,15 @@ import {
recordDuration,
shortHost,
} from "./networkStats.js";

/** Per spec, `endpoint` is a category label, not the destination URL. */
const OTLP_ENDPOINT_CATEGORY = "otlp";

/**
* Sentinel `statusCode` dimension used when the upstream OTLP delegate
* has discarded the original HTTP status code (currently the retryable
* 429/502/503/504 path). Keeps the dimension present per spec.
*/
const OTLP_UNKNOWN_STATUS = "unknown";

/**
* Bounded set of `exceptionType` labels for OTLP `Exception_Count`.
* Cardinality must stay bounded so the SDKStats backend can index it.
*/
const EXC_TIMEOUT = "Timeout exception";
const EXC_NETWORK = "Network exception";
const EXC_CLIENT = "Client exception";

const RETRYABLE_NETWORK_ERROR_CODES = new Set([
"ECONNRESET",
"ECONNREFUSED",
"EPIPE",
"ETIMEDOUT",
"EAI_AGAIN",
"ENOTFOUND",
"ENETUNREACH",
"EHOSTUNREACH",
]);

/**
* Per the OTLP/HTTP response specification, retryable HTTP status codes
* are 429, 502, 503, and 504. The upstream delegate normally routes
* these through its `retryable` branch (no status code surfaced), but
* we classify defensively here for the rare case the failure branch
* still carries a retryable code (e.g. retries exhausted).
*/
const OTLP_HTTP_RETRYABLE_STATUSES = new Set([429, 502, 503, 504]);
import {
OTLP_ENDPOINT_CATEGORY,
OTLP_UNKNOWN_STATUS,
OTLP_HTTP_RETRYABLE_STATUSES,
RETRYABLE_NETWORK_ERROR_CODES,
EXC_TIMEOUT,
EXC_NETWORK,
EXC_CLIENT,
} from "./constants.js";

interface ErrorWithCode {
code?: unknown;
Expand Down
7 changes: 4 additions & 3 deletions test/internal/unit/a365/agent365NetworkStats.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import {
_resetAllForTest,
drain,
} from "../../../../src/sdkstats/networkStats.js";
import { A365_ENDPOINT_CATEGORY } from "../../../../src/sdkstats/constants.js";
import { _resetA365LoggerForTest } from "../../../../src/a365/logging.js";

const TENANT_ID = "tenant-11111111-1111-1111-1111-111111111111";
Expand Down Expand Up @@ -134,7 +135,7 @@ describe("Agent365Exporter network SDKStats", () => {
const failures = drain(REQUEST_FAILURE_NAME);
expect(failures.size).toBe(1);
const [key, count] = [...failures.entries()][0];
expect(key[0]).toBe("a365");
expect(key[0]).toBe(A365_ENDPOINT_CATEGORY);
expect(key[2]).toBe("404");
expect(count).toBe(1);
});
Expand Down Expand Up @@ -189,7 +190,7 @@ describe("Agent365Exporter network SDKStats", () => {
const exceptions = drain(EXCEPTION_COUNT_NAME);
expect(exceptions.size).toBe(1);
const [key, count] = [...exceptions.entries()][0];
expect(key[0]).toBe("a365");
expect(key[0]).toBe(A365_ENDPOINT_CATEGORY);
// 4 attempts (initial + 3 retries) each throw.
expect(count).toBe(4);
});
Expand All @@ -203,7 +204,7 @@ describe("Agent365Exporter network SDKStats", () => {
const durations = drain(REQUEST_DURATION_NAME);
expect(durations.size).toBe(1);
const [key, avg] = [...durations.entries()][0];
expect(key[0]).toBe("a365");
expect(key[0]).toBe(A365_ENDPOINT_CATEGORY);
expect(avg).toBeGreaterThanOrEqual(0);
});
});
25 changes: 13 additions & 12 deletions test/internal/unit/sdkstats/metrics.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import {
recordSuccess,
recordThrottle,
} from "../../../../src/sdkstats/networkStats.js";
import { A365_ENDPOINT_CATEGORY, EXC_TIMEOUT } from "../../../../src/sdkstats/constants.js";
import {
FEATURE_TYPE_FEATURE,
FEATURE_TYPE_INSTRUMENTATION,
Expand Down Expand Up @@ -175,7 +176,7 @@ describe("sdkstats/metrics", () => {
setSdkStatsInstrumentation(SdkStatsInstrumentation.MONGODB);
// Drop a network counter so a request_success_count observation will fire.
_resetNetworkStatsForTest();
recordSuccess("a365", "contoso.example.com");
recordSuccess(A365_ENDPOINT_CATEGORY, "contoso.example.com");

const { PeriodicExportingMetricReader } = await import("@opentelemetry/sdk-metrics");
const exporter = new InMemoryMetricExporter(AggregationTemporality.CUMULATIVE);
Expand Down Expand Up @@ -205,8 +206,8 @@ describe("sdkstats/metrics", () => {
describe("network gauges (default mode)", () => {
it("emits one observation per drained key, attaches endpoint + host, and clears after collection", async () => {
_resetNetworkStatsForTest();
recordSuccess("a365", "a365.example.com");
recordSuccess("a365", "a365.example.com");
recordSuccess(A365_ENDPOINT_CATEGORY, "a365.example.com");
recordSuccess(A365_ENDPOINT_CATEGORY, "a365.example.com");

const { PeriodicExportingMetricReader } = await import("@opentelemetry/sdk-metrics");
const exporter = new InMemoryMetricExporter(AggregationTemporality.CUMULATIVE);
Expand All @@ -229,7 +230,7 @@ describe("sdkstats/metrics", () => {
const success = byName(REQUEST_SUCCESS_NAME);
expect(success).toHaveLength(1);
expect(success[0].value).toBe(2);
expect(success[0].attributes.endpoint).toBe("a365");
expect(success[0].attributes.endpoint).toBe(A365_ENDPOINT_CATEGORY);
expect(success[0].attributes.host).toBe("a365.example.com");
expect(success[0].attributes.statusCode).toBeUndefined();

Expand All @@ -247,12 +248,12 @@ describe("sdkstats/metrics", () => {

it("emits failure/retry/throttle/exception observations with the appropriate dimension and an avg duration", async () => {
_resetNetworkStatsForTest();
recordFailure("a365", "westus", 404);
recordRetry("a365", "westus", 503);
recordThrottle("a365", "westus", 439);
recordException("a365", "westus", "Timeout exception");
recordDuration("a365", "westus", 100);
recordDuration("a365", "westus", 200);
recordFailure(A365_ENDPOINT_CATEGORY, "westus", 404);
recordRetry(A365_ENDPOINT_CATEGORY, "westus", 503);
recordThrottle(A365_ENDPOINT_CATEGORY, "westus", 439);
recordException(A365_ENDPOINT_CATEGORY, "westus", EXC_TIMEOUT);
recordDuration(A365_ENDPOINT_CATEGORY, "westus", 100);
recordDuration(A365_ENDPOINT_CATEGORY, "westus", 200);

const { PeriodicExportingMetricReader } = await import("@opentelemetry/sdk-metrics");
const exporter = new InMemoryMetricExporter(AggregationTemporality.CUMULATIVE);
Expand Down Expand Up @@ -283,12 +284,12 @@ describe("sdkstats/metrics", () => {
expect(throttles[0].attributes.statusCode).toBe("439");

const exceptions = byName(EXCEPTION_COUNT_NAME);
expect(exceptions[0].attributes.exceptionType).toBe("Timeout exception");
expect(exceptions[0].attributes.exceptionType).toBe(EXC_TIMEOUT);

const durations = byName(REQUEST_DURATION_NAME);
expect(durations).toHaveLength(1);
expect(durations[0].value).toBe(150);
expect(durations[0].attributes.endpoint).toBe("a365");
expect(durations[0].attributes.endpoint).toBe(A365_ENDPOINT_CATEGORY);
expect(durations[0].attributes.host).toBe("westus");
// Duration has no statusCode / exceptionType dimension.
expect(durations[0].attributes.statusCode).toBeUndefined();
Expand Down
Loading
Loading