diff --git a/docs/pipelines.md b/docs/pipelines.md index b20159eb..d066731e 100644 --- a/docs/pipelines.md +++ b/docs/pipelines.md @@ -389,13 +389,21 @@ and carries: | `maxDelay` | `8s` | Cap on the scaled delay | | `maxAttempts` | `3` | Total attempts including the first send; `1` disables retries | | `jitter` | `0.2` | Symmetric jitter fraction in `[0.0, 1.0]` | -| `retryableStatuses` | `{429, 500, 502, 503, 504}` | Status codes that trigger a retry on an `HttpException` | +| `retryableStatuses` | `{408, 429, 500, 502, 503, 504}` | Status codes that trigger a retry on an `HttpException` | | `retryableMethods` | `{GET, HEAD, OPTIONS, PUT, DELETE}` | Methods retryable by RFC 9110; others need a replayable body | | `scheduler` | `null` | Optional caller scheduler; `null` uses a daemon scheduler | -`408` (Request Timeout) is intentionally excluded from the default `retryableStatuses` — a -server-side 408 usually means the client was slow to send and is unlikely to improve on retry. -Callers that disagree can opt in via the builder. +These are the SDK's canonical retry defaults: the stage-based `DefaultRetryStep` (and its +`HttpRetryOptions`) share the same base delay (`200ms`), max delay (`8s`), multiplier (`2.0`), +jitter (`0.2`), retryable-status policy, and total send budget (3 attempts). Both stacks compute +their exponential schedule through the one `BackoffCalculator`, so the two cannot drift apart; +the only intentional difference is that the stage-based step has no `totalTimeout` deadline. +`HttpRetryOptions` counts *retries* (`maxRetries`, default `2`) while `RetrySettings` counts +*total attempts* (`maxAttempts`, default `3`) — both default to the same 3 sends. + +`408` (Request Timeout) **is** retryable by default, matching +`RetryUtils.isRetryable`/`HttpException.retryable` and the stage-based step. Callers wanting a +stricter posture can pass a tighter `retryableStatuses` set to the builder. --- diff --git a/sdk-core/api/sdk-core.api b/sdk-core/api/sdk-core.api index 93270501..9c5cb560 100644 --- a/sdk-core/api/sdk-core.api +++ b/sdk-core/api/sdk-core.api @@ -793,7 +793,6 @@ public class org/dexpace/sdk/core/http/pipeline/steps/DefaultRedirectStep : org/ public class org/dexpace/sdk/core/http/pipeline/steps/DefaultRetryStep : org/dexpace/sdk/core/http/pipeline/steps/RetryStep { public static final field Companion Lorg/dexpace/sdk/core/http/pipeline/steps/DefaultRetryStep$Companion; public static final field DEFAULT_MAX_RETRIES I - public static final field MAX_SHIFT_TRY_COUNT I public fun ()V public fun (Lorg/dexpace/sdk/core/http/pipeline/steps/HttpRetryOptions;)V public fun (Lorg/dexpace/sdk/core/http/pipeline/steps/HttpRetryOptions;Lorg/dexpace/sdk/core/util/Clock;)V @@ -2163,7 +2162,10 @@ public final class org/dexpace/sdk/core/pipeline/step/retry/RetryAfterParser { public final class org/dexpace/sdk/core/pipeline/step/retry/RetrySettings { public static final field Companion Lorg/dexpace/sdk/core/pipeline/step/retry/RetrySettings$Companion; + public static final field DEFAULT_DELAY_MULTIPLIER D public static final field DEFAULT_INITIAL_DELAY Ljava/time/Duration; + public static final field DEFAULT_JITTER D + public static final field DEFAULT_MAX_ATTEMPTS I public static final field DEFAULT_MAX_DELAY Ljava/time/Duration; public static final field DEFAULT_RETRYABLE_METHODS Ljava/util/Set; public static final field DEFAULT_RETRYABLE_STATUSES Ljava/util/Set; diff --git a/sdk-core/src/main/kotlin/org/dexpace/sdk/core/http/pipeline/steps/DefaultRetryStep.kt b/sdk-core/src/main/kotlin/org/dexpace/sdk/core/http/pipeline/steps/DefaultRetryStep.kt index 380436cf..8df0f34a 100644 --- a/sdk-core/src/main/kotlin/org/dexpace/sdk/core/http/pipeline/steps/DefaultRetryStep.kt +++ b/sdk-core/src/main/kotlin/org/dexpace/sdk/core/http/pipeline/steps/DefaultRetryStep.kt @@ -13,12 +13,13 @@ import org.dexpace.sdk.core.http.request.Method import org.dexpace.sdk.core.http.request.Request import org.dexpace.sdk.core.http.response.Response import org.dexpace.sdk.core.instrumentation.ClientLogger +import org.dexpace.sdk.core.pipeline.step.retry.BackoffCalculator import org.dexpace.sdk.core.pipeline.step.retry.RetryAfterParser +import org.dexpace.sdk.core.pipeline.step.retry.RetrySettings import org.dexpace.sdk.core.util.Clock import java.io.IOException import java.io.InterruptedIOException import java.time.Duration -import java.util.concurrent.ThreadLocalRandom /** * Default [RetryStep]. Drives an iterative retry loop with classified failure detection, @@ -40,9 +41,8 @@ import java.util.concurrent.ThreadLocalRandom * 4. Sleeps via [Clock.sleep]; an interrupt during sleep throws [InterruptedIOException] * with the original [InterruptedException] and any accumulated prior failures attached * as suppressed — retries are NOT resumed after an interrupt. - * 5. Caps `tryCount` at [MAX_SHIFT_TRY_COUNT] before computing `1L shl tryCount` so the - * left-shift can never overflow (the resulting delay is clamped to [HttpRetryOptions.maxDelay] - * anyway, so the cap is invisible to callers). + * 5. Computes the exponential delay through the shared [BackoffCalculator], which saturates + * rather than overflows on extreme attempt counts and clamps to [HttpRetryOptions.maxDelay]. * * ## Body replayability * @@ -69,8 +69,12 @@ import java.util.concurrent.ThreadLocalRandom * parsed from the response (response path only). A negative or unparseable value * falls through; a value of zero produces an immediate retry. * 3. [HttpRetryOptions.fixedDelay] — if set, every retry waits exactly this duration. - * 4. Exponential backoff: `baseDelay * (1L shl tryCount)` clamped to `maxDelay`, with - * ±5% random jitter via [ThreadLocalRandom]. + * 4. Exponential backoff computed by the shared [BackoffCalculator]: + * `baseDelay * 2.0^tryCount` clamped to `maxDelay`, with symmetric ±10% jitter + * ([RetrySettings.DEFAULT_JITTER]). This is the same calculator the recovery-aware + * `pipeline.step.retry.RetryStep` uses, so both stacks share one backoff formula and one + * set of defaults. The deadline-shrinking that the calculator also offers is disabled here + * (this stage-based step carries no total-timeout budget). * * ## Failure handling * @@ -146,6 +150,33 @@ public open class DefaultRetryStep */ private val options: HttpRetryOptions = clampOptions(options) + /** + * The [options]' exponential parameters expressed as a [RetrySettings] view so the shared + * [BackoffCalculator] can compute this stack's schedule. Built once per step instance: + * - `initialDelay` / `maxDelay` come from the options. + * - `delayMultiplier` (2.0) and `jitter` (0.2) are the canonical shared constants — the + * options object does not expose its own multiplier/jitter, so the SDK defaults apply. + * If [HttpRetryOptions] ever gains configurable multiplier/jitter knobs, this view must + * read them from the options instead of the constants, or the new knobs are silently + * ignored on this stack. + * - `totalTimeout = ZERO` disables the deadline cap: the stage-based step has no budget. + * The `fixedDelay` path never consults this view; it short-circuits in [backoffOrFixed]. + * + * Building this view also validates the delay magnitudes eagerly: [RetrySettings.builder] + * rejects a negative `baseDelay`/`maxDelay` and one larger than the calculator's + * ~292-year nanosecond ceiling. [HttpRetryOptions] performs no such range check, so a + * pathological delay surfaces as an [IllegalArgumentException] here, at step construction, + * rather than later at delay-computation time. + */ + private val backoffSettings: RetrySettings = + RetrySettings.builder() + .initialDelay(this.options.baseDelay) + .maxDelay(this.options.maxDelay) + .delayMultiplier(RetrySettings.DEFAULT_DELAY_MULTIPLIER) + .jitter(RetrySettings.DEFAULT_JITTER) + .totalTimeout(Duration.ZERO) + .build() + /** * Sends [request] through the downstream pipeline with automatic retry on retryable failures. * @@ -437,51 +468,16 @@ public open class DefaultRetryStep } /** - * Returns [HttpRetryOptions.fixedDelay] if set, otherwise the exponential-backoff - * delay for [tryCount]. The shift count is capped at [MAX_SHIFT_TRY_COUNT] so the - * `1L shl tryCount` term never overflows; the result is always clamped to [HttpRetryOptions.maxDelay] - * anyway, so the cap is invisible in practice. - */ - private fun backoffOrFixed(tryCount: Int): Duration = options.fixedDelay ?: exponentialBackoff(tryCount) - - /** - * `baseDelay * (1L shl tryCount)` clamped to `maxDelay`, plus a ±5% jitter sampled - * from [ThreadLocalRandom]. Pure function of [tryCount] and the configured options. - */ - private fun exponentialBackoff(tryCount: Int): Duration { - val baseNanos = options.baseDelay.toNanos() - if (baseNanos == 0L) return Duration.ZERO - val maxNanos = options.maxDelay.toNanos() - val safeShift = tryCount.coerceAtMost(MAX_SHIFT_TRY_COUNT) - // 1L shl 30 ~= 1e9 — multiplying by 800ms (8e8 ns) overflows. Cap on the long - // multiply itself: if `baseNanos * (1L shl safeShift)` would overflow, clamp. - val multiplier = 1L shl safeShift - val scaled = - if (baseNanos > 0 && multiplier > Long.MAX_VALUE / baseNanos) { - Long.MAX_VALUE - } else { - baseNanos * multiplier - } - val clamped = scaled.coerceAtMost(maxNanos) - val jittered = applyJitter(clamped) - // Guarantee a non-negative result — jitter could push us under zero if the caller - // configured pathological options (e.g. baseDelay equal to negative epsilon). - return Duration.ofNanos(jittered.coerceAtLeast(0L)) - } - - /** - * Applies a ±5% jitter to [nanos]. Sample is drawn from [ThreadLocalRandom] which is - * per-thread, so there is no cross-thread contention on the retry hot path. + * Returns [HttpRetryOptions.fixedDelay] if set, otherwise the exponential-backoff delay + * for [tryCount]. The backoff is computed by the shared [BackoffCalculator] from + * [backoffSettings] so this stack and the recovery-aware `RetryStep` share one formula. + * + * [tryCount] is 0-indexed here (`0` = the delay before the first retry), whereas + * [BackoffCalculator.computeDelay] is 1-indexed (`1` = first retry); the `+ 1` bridges + * the two so both produce `baseDelay`, `2·baseDelay`, `4·baseDelay`, … capped at `maxDelay`. */ - private fun applyJitter(nanos: Long): Long { - if (nanos == 0L) return 0L - // 5% of nanos, used as the magnitude bound on the random sample. - val jitterMagnitude = nanos / JITTER_DIVISOR - if (jitterMagnitude == 0L) return nanos - // ThreadLocalRandom.nextLong(origin, bound) is inclusive of origin, exclusive of bound. - val offset = ThreadLocalRandom.current().nextLong(-jitterMagnitude, jitterMagnitude + 1L) - return nanos + offset - } + private fun backoffOrFixed(tryCount: Int): Duration = + options.fixedDelay ?: BackoffCalculator.computeDelay(tryCount + 1, backoffSettings) // --------------- Retry-After parsing --------------- @@ -581,22 +577,14 @@ public open class DefaultRetryStep public companion object { /** - * Default [HttpRetryOptions.maxRetries] applied when the caller passes a negative - * value. Matches Azure Core's `RetryOptions` default. + * Default retry count applied when the caller passes a negative + * [HttpRetryOptions.maxRetries], and the value baked into the no-arg + * [HttpRetryOptions] default. `2` retries on top of the initial send is the SDK's + * canonical budget — `initial + DEFAULT_MAX_RETRIES == 3`, matching + * [RetrySettings.DEFAULT_MAX_ATTEMPTS] so both retry stacks default to the same + * number of total sends. */ - public const val DEFAULT_MAX_RETRIES: Int = 3 - - /** - * Upper bound on `tryCount` used for the `1L shl tryCount` term in - * [DefaultRetryStep.exponentialBackoff]. `1L shl 30` ~= 1.07e9 — the scaled delay is - * always clamped to [HttpRetryOptions.maxDelay] long before this bound is hit, so the - * cap is a paranoid guard against integer overflow rather than a behavior knob. - */ - public const val MAX_SHIFT_TRY_COUNT: Int = 30 - - // Jitter is ±5% of the computed delay; expressed as the divisor (nanos / 20) to - // avoid an extra multiplication on the hot path. See [applyJitter]. - private const val JITTER_DIVISOR = 20L + public const val DEFAULT_MAX_RETRIES: Int = 2 // Nanoseconds in one millisecond — used to convert monotonic-clock deltas to ms // for retry log events. diff --git a/sdk-core/src/main/kotlin/org/dexpace/sdk/core/http/pipeline/steps/HttpRetryOptions.kt b/sdk-core/src/main/kotlin/org/dexpace/sdk/core/http/pipeline/steps/HttpRetryOptions.kt index 0936712d..8248ac25 100644 --- a/sdk-core/src/main/kotlin/org/dexpace/sdk/core/http/pipeline/steps/HttpRetryOptions.kt +++ b/sdk-core/src/main/kotlin/org/dexpace/sdk/core/http/pipeline/steps/HttpRetryOptions.kt @@ -8,6 +8,7 @@ package org.dexpace.sdk.core.http.pipeline.steps import org.dexpace.sdk.core.http.common.HttpHeaderName +import org.dexpace.sdk.core.pipeline.step.retry.RetrySettings import org.dexpace.sdk.core.util.RetryUtils import java.time.Duration import java.util.Collections @@ -40,12 +41,15 @@ public fun interface HttpRetryDelayProvider { } /** - * Configuration for [DefaultRetryStep]. Defaults mirror Azure Core's retry policy: - * - [maxRetries] = 3 - * - [baseDelay] = 800ms (exponentially scaled per attempt) - * - [maxDelay] = 8s (cap on the scaled delay) + * Configuration for [DefaultRetryStep]. The numeric defaults are the SDK's canonical retry + * defaults, shared with the recovery-aware [RetrySettings] so both retry stacks compute the + * same backoff schedule (via [org.dexpace.sdk.core.pipeline.step.retry.BackoffCalculator]): + * - [maxRetries] = 2 (initial send + 2 retries = 3 total attempts, matching + * [RetrySettings.DEFAULT_MAX_ATTEMPTS]). + * - [baseDelay] = 200ms (= [RetrySettings.DEFAULT_INITIAL_DELAY]; exponentially scaled per attempt). + * - [maxDelay] = 8s (= [RetrySettings.DEFAULT_MAX_DELAY]; cap on the scaled delay). * - [fixedDelay] = null (exponential backoff is used; when non-null it overrides the - * backoff entirely and every retry waits exactly [fixedDelay]) + * backoff entirely and every retry waits exactly [fixedDelay]). * - [retryAfterHeaders] = `Retry-After`, `retry-after-ms`, `x-ms-retry-after-ms` — * parsed in declared order; the first present wins. Drop the Microsoft-specific * variants by passing a tighter list for stricter posture. @@ -54,15 +58,20 @@ public fun interface HttpRetryDelayProvider { * anywhere in the cause chain, per [RetryUtils.isRetryable]. * - [delayFromCondition] = null delay (falls through to `Retry-After` parsing, then backoff). * + * The exponential schedule, multiplier (2.0), and symmetric jitter (0.2) are sourced from the + * shared [RetrySettings] constants and applied by `BackoffCalculator`, so this stack and the + * recovery-aware stack cannot drift apart. The one intentional difference is the deadline: this + * stage-based step has no total-timeout budget, so it never shrinks a delay against one. + * * The companion [HttpRetryOptions.fixed] factory builds an options instance whose delay * never grows — useful for test injection or high-throughput retry against flaky endpoints. */ public class HttpRetryOptions @JvmOverloads constructor( - public val maxRetries: Int = 3, - public val baseDelay: Duration = Duration.ofMillis(DEFAULT_BASE_DELAY_MS), - public val maxDelay: Duration = Duration.ofSeconds(DEFAULT_MAX_DELAY_SECONDS), + public val maxRetries: Int = DEFAULT_MAX_RETRIES, + public val baseDelay: Duration = RetrySettings.DEFAULT_INITIAL_DELAY, + public val maxDelay: Duration = RetrySettings.DEFAULT_MAX_DELAY, public val fixedDelay: Duration? = null, public val retryAfterHeaders: List = DEFAULT_RETRY_AFTER_HEADERS, public val shouldRetryCondition: HttpRetryConditionPredicate = @@ -72,10 +81,9 @@ public class HttpRetryOptions public val delayFromCondition: HttpRetryDelayProvider = HttpRetryDelayProvider { null }, ) { public companion object { - // Default exponential-backoff parameters tuned to favour fast first-retry while - // bounding cumulative latency. Aligned with Azure Core's RetryOptions defaults. - private const val DEFAULT_BASE_DELAY_MS = 800L - private const val DEFAULT_MAX_DELAY_SECONDS = 8L + // The default retry count is the canonical SDK budget, kept in one place on + // DefaultRetryStep (initial send + DEFAULT_MAX_RETRIES == RetrySettings.DEFAULT_MAX_ATTEMPTS). + private const val DEFAULT_MAX_RETRIES = DefaultRetryStep.DEFAULT_MAX_RETRIES /** * The three `Retry-After` header forms parsed by [DefaultRetryStep]. Order matters — diff --git a/sdk-core/src/main/kotlin/org/dexpace/sdk/core/pipeline/step/retry/RetrySettings.kt b/sdk-core/src/main/kotlin/org/dexpace/sdk/core/pipeline/step/retry/RetrySettings.kt index d54f5bb7..26c1e124 100644 --- a/sdk-core/src/main/kotlin/org/dexpace/sdk/core/pipeline/step/retry/RetrySettings.kt +++ b/sdk-core/src/main/kotlin/org/dexpace/sdk/core/pipeline/step/retry/RetrySettings.kt @@ -28,13 +28,17 @@ private val MAX_NANO_REPRESENTABLE_DELAY: Duration = Duration.ofNanos(Long.MAX_V * timeout, capped `maxDelay`, `maxAttempts`. * * ## Defaults (Square + gax tuned) + * + * These are the canonical SDK retry defaults, shared with the stage-based + * [DefaultRetryStep][org.dexpace.sdk.core.http.pipeline.steps.DefaultRetryStep] so both retry + * stacks compute the same backoff schedule and honour the same retryable-status policy. * - [totalTimeout] = 30 s — hard budget for all attempts combined. * - [initialDelay] = 200 ms — first retry waits this long (subject to jitter). * - [delayMultiplier] = 2.0 — each subsequent delay is multiplied by this factor. * - [maxDelay] = 8 s — cap on the scaled delay; further attempts plateau here. * - [maxAttempts] = 3 — total attempts including the initial call (matches Square's default). * - [jitter] = 0.2 — symmetric jitter fraction. Effective delay ∈ `[delay*(1−j/2), delay*(1+j/2)]`. - * - [retryableStatuses] = `{429, 500, 502, 503, 504}` — canonical retryable codes. + * - [retryableStatuses] = `{408, 429, 500, 502, 503, 504}` — canonical retryable codes. * - [retryableMethods] = `{GET, HEAD, OPTIONS, PUT, DELETE}` — safe-by-method per RFC 9110. * `POST`/`PATCH`/etc. retry only when the request body is replayable. * - [scheduler] = `null` — fall back to the lazy daemon scheduler created by [RetryStep]. @@ -227,36 +231,61 @@ public class RetrySettings // Defaults — see class kdoc for rationale. private const val DEFAULT_TOTAL_TIMEOUT_SECONDS = 30L private const val DEFAULT_INITIAL_DELAY_MS = 200L - private const val DEFAULT_DELAY_MULTIPLIER = 2.0 private const val DEFAULT_MAX_DELAY_SECONDS = 8L - private const val DEFAULT_MAX_ATTEMPTS = 3 - private const val DEFAULT_JITTER = 0.2 + + /** + * Canonical exponential-backoff multiplier shared by both retry stacks: each delay is + * `previous * 2.0`. Exposed so the stage-based [DefaultRetryStep][org.dexpace.sdk.core.http.pipeline.steps.DefaultRetryStep] + * and the recovery-aware [RetryStep] compute the same schedule from one constant. + */ + public const val DEFAULT_DELAY_MULTIPLIER: Double = 2.0 + + /** + * Canonical symmetric jitter fraction shared by both retry stacks. The effective delay + * is drawn uniformly from `[delay * (1 - j/2), delay * (1 + j/2)]`, i.e. ±10% at the + * default `0.2`. + */ + public const val DEFAULT_JITTER: Double = 0.2 + + /** + * Canonical retry budget shared by both stacks: **3 total attempts**, i.e. the initial + * send plus 2 retries. The recovery-aware [RetryStep] counts total attempts directly + * via [maxAttempts]; the stage-based step counts retries, so its equivalent default is + * [DefaultRetryStep.DEFAULT_MAX_RETRIES][org.dexpace.sdk.core.http.pipeline.steps.DefaultRetryStep.DEFAULT_MAX_RETRIES] + * `= 2`. Both yield the same 3 sends. + */ + public const val DEFAULT_MAX_ATTEMPTS: Int = 3 /** Default total timeout: 30 s. */ @JvmField public val DEFAULT_TOTAL_TIMEOUT: Duration = Duration.ofSeconds(DEFAULT_TOTAL_TIMEOUT_SECONDS) - /** Default initial delay: 200 ms. */ + /** Default initial delay: 200 ms. Shared with the stage-based step's base delay. */ @JvmField public val DEFAULT_INITIAL_DELAY: Duration = Duration.ofMillis(DEFAULT_INITIAL_DELAY_MS) - /** Default max delay cap: 8 s. */ + /** Default max delay cap: 8 s. Shared with the stage-based step. */ @JvmField public val DEFAULT_MAX_DELAY: Duration = Duration.ofSeconds(DEFAULT_MAX_DELAY_SECONDS) /** - * Default retryable HTTP statuses: `429` (rate limit), `500` (internal), `502` - * (bad gateway), `503` (service unavailable), `504` (gateway timeout). + * Default retryable HTTP statuses: `408` (request timeout), `429` (rate limit), `500` + * (internal), `502` (bad gateway), `503` (service unavailable), `504` (gateway + * timeout). * - * Note: `408` (Request Timeout) is intentionally NOT in the default set — a 408 - * from a server usually means the client took too long to send the request, - * which is unlikely to improve on retry. Callers that disagree can opt in via - * the builder. + * This is the recovery-aware step's allow-list; [RetryStep] intersects it with + * [HttpException.retryable][org.dexpace.sdk.core.http.response.exception.HttpException.retryable] + * (which is itself derived from [RetryUtils.isRetryable][org.dexpace.sdk.core.util.RetryUtils.isRetryable]). + * With this default set the recovery-aware step retries the same common statuses the + * stage-based [DefaultRetryStep][org.dexpace.sdk.core.http.pipeline.steps.DefaultRetryStep] + * does via `RetryUtils` — `408` included, so the two stacks agree on the 408 stance. + * Callers wanting a stricter posture can pass a tighter set to the builder. */ @JvmField public val DEFAULT_RETRYABLE_STATUSES: Set = Collections.unmodifiableSet( linkedSetOf( + SC_REQUEST_TIMEOUT, SC_TOO_MANY_REQUESTS, SC_INTERNAL_ERROR, SC_BAD_GATEWAY, @@ -277,6 +306,7 @@ public class RetrySettings ) // Spelled-out status constants to satisfy detekt's MagicNumber rule. + private const val SC_REQUEST_TIMEOUT = 408 private const val SC_TOO_MANY_REQUESTS = 429 private const val SC_INTERNAL_ERROR = 500 private const val SC_BAD_GATEWAY = 502 diff --git a/sdk-core/src/test/kotlin/org/dexpace/sdk/core/http/pipeline/steps/RetryStepTest.kt b/sdk-core/src/test/kotlin/org/dexpace/sdk/core/http/pipeline/steps/RetryStepTest.kt index 0877bf2b..41a4ab58 100644 --- a/sdk-core/src/test/kotlin/org/dexpace/sdk/core/http/pipeline/steps/RetryStepTest.kt +++ b/sdk-core/src/test/kotlin/org/dexpace/sdk/core/http/pipeline/steps/RetryStepTest.kt @@ -295,7 +295,7 @@ class RetryStepTest { fun `Retry-After negative falls back to default backoff`() { val clock = FixedClock() // Configure a deterministic backoff: 100ms base with 100ms cap removes the - // exponential growth window and the jitter range is at most 5ms. + // exponential growth window and the symmetric jitter range is at most ±10ms (±10%). val opts = HttpRetryOptions( baseDelay = Duration.ofMillis(100), @@ -313,10 +313,10 @@ class RetryStepTest { val before = clock.now() pipeline.send(getRequest()) - // 100ms ± 5ms jitter — Duration#compareTo lets us range-check. + // 100ms ± 10ms symmetric jitter — Duration#compareTo lets us range-check. val elapsed = Duration.between(before, clock.now()) - assertTrue(elapsed >= Duration.ofMillis(95), "elapsed=$elapsed below 95ms") - assertTrue(elapsed <= Duration.ofMillis(105), "elapsed=$elapsed above 105ms") + assertTrue(elapsed >= Duration.ofMillis(90), "elapsed=$elapsed below 90ms") + assertTrue(elapsed <= Duration.ofMillis(110), "elapsed=$elapsed above 110ms") } @Test @@ -340,15 +340,15 @@ class RetryStepTest { val before = clock.now() pipeline.send(getRequest()) val elapsed = Duration.between(before, clock.now()) - assertTrue(elapsed >= Duration.ofMillis(95)) - assertTrue(elapsed <= Duration.ofMillis(105)) + assertTrue(elapsed >= Duration.ofMillis(90)) + assertTrue(elapsed <= Duration.ofMillis(110)) } @Test fun `custom retryAfterHeaders list ignores non-listed variants`() { val clock = FixedClock() // Only listen to the standard `Retry-After`; the `retry-after-ms` header is - // therefore invisible and we fall back to backoff (100ms ± 5ms). + // therefore invisible and we fall back to backoff (100ms ± 10ms symmetric jitter). val opts = HttpRetryOptions( baseDelay = Duration.ofMillis(100), @@ -368,8 +368,8 @@ class RetryStepTest { val before = clock.now() pipeline.send(getRequest()) val elapsed = Duration.between(before, clock.now()) - assertTrue(elapsed >= Duration.ofMillis(95), "elapsed=$elapsed below 95ms") - assertTrue(elapsed <= Duration.ofMillis(105), "elapsed=$elapsed above 105ms") + assertTrue(elapsed >= Duration.ofMillis(90), "elapsed=$elapsed below 90ms") + assertTrue(elapsed <= Duration.ofMillis(110), "elapsed=$elapsed above 110ms") } // ----------------- Exception classifier ----------------- @@ -888,7 +888,7 @@ class RetryStepTest { } @Test - fun `exponential backoff grows roughly 100 200 400 with 5pct jitter`() { + fun `exponential backoff grows roughly 100 200 400 with symmetric jitter`() { val clock = FixedClock() val opts = HttpRetryOptions( @@ -910,14 +910,14 @@ class RetryStepTest { val before = clock.now() pipeline.send(getRequest()) - // Expected: 100 + 200 + 400 = 700ms, ± 5% on each → ± 35ms total. + // Expected: 100 + 200 + 400 = 700ms, with symmetric ±10% jitter per term → ± 70ms total. val elapsed = Duration.between(before, clock.now()) - assertTrue(elapsed >= Duration.ofMillis(665), "elapsed=$elapsed below lower bound") - assertTrue(elapsed <= Duration.ofMillis(735), "elapsed=$elapsed above upper bound") + assertTrue(elapsed >= Duration.ofMillis(630), "elapsed=$elapsed below lower bound") + assertTrue(elapsed <= Duration.ofMillis(770), "elapsed=$elapsed above upper bound") } @Test - fun `jitter stays within +- 5 percent across many samples`() { + fun `jitter stays within +- 10 percent across many samples`() { // Drive the same try count repeatedly through the step and check the spread is // within the documented bound. Run via a small reflective-ish hack: invoke the // backoff path indirectly by sending many one-retry sequences and reading the @@ -944,9 +944,10 @@ class RetryStepTest { pipeline.send(getRequest()) samples.add(Duration.between(before, clock.now()).toMillis()) } - // 5% of 1000ms is 50ms. + // Symmetric jitter is ±10% of the delay; half-range of 1000ms is 100ms, so every + // sample lands in [900, 1100]. for (ms in samples) { - assertTrue(ms in 950..1050, "sample $ms outside [950,1050]") + assertTrue(ms in 900..1100, "sample $ms outside [900,1100]") } // Distribution sanity: at least 10% of samples should be on either side of the // mean to confirm the jitter is non-trivially applied. Tight assertion to catch @@ -1476,9 +1477,9 @@ class RetryStepTest { val before = clock.now() pipeline.send(getRequest()) val elapsed = Duration.between(before, clock.now()) - // Within the 5% jitter window of 100ms. - assertTrue(elapsed >= Duration.ofMillis(95), "elapsed=$elapsed below 95ms") - assertTrue(elapsed <= Duration.ofMillis(105), "elapsed=$elapsed above 105ms") + // Within the ±10% symmetric jitter window of 100ms. + assertTrue(elapsed >= Duration.ofMillis(90), "elapsed=$elapsed below 90ms") + assertTrue(elapsed <= Duration.ofMillis(110), "elapsed=$elapsed above 110ms") } @Test @@ -1548,8 +1549,9 @@ class RetryStepTest { val before = clock.now() pipeline.send(getRequest()) val elapsed = Duration.between(before, clock.now()) - assertTrue(elapsed >= Duration.ofMillis(75), "elapsed=$elapsed below 75ms") - assertTrue(elapsed <= Duration.ofMillis(85), "elapsed=$elapsed above 85ms") + // ±10% symmetric jitter on 80ms → [72, 88]. + assertTrue(elapsed >= Duration.ofMillis(72), "elapsed=$elapsed below 72ms") + assertTrue(elapsed <= Duration.ofMillis(88), "elapsed=$elapsed above 88ms") } // ----------------- Exception-side hook coverage ----------------- @@ -1749,13 +1751,13 @@ class RetryStepTest { assertEquals("attempt 2", ex.suppressed[1].message) } - // ----------------- MAX_SHIFT_TRY_COUNT cap ----------------- + // ----------------- Backoff overflow / boundary handling ----------------- @Test fun `baseDelay zero produces an immediate retry through the early-return branch`() { - // `if (baseNanos == 0L) return Duration.ZERO` in exponentialBackoff. The fixed-delay - // tests already drive baseDelay=Duration.ZERO, but this one explicitly exercises the - // exponential path with a zero base — the branch must short-circuit before any shift. + // A zero base delay must yield a zero exponential delay (immediate retry). The shared + // BackoffCalculator returns Duration.ZERO when initialDelay is zero, so the exponential + // path short-circuits to an immediate retry just as the fixed-delay path does. val clock = FixedClock() val opts = HttpRetryOptions( @@ -1780,12 +1782,13 @@ class RetryStepTest { @Test fun `exponential backoff overflow path is clamped to maxDelay rather than wrapping negative`() { - // Drive the overflow check: with baseDelay near Long.MAX_VALUE/2 and tryCount=30, - // baseNanos * (1L shl 30) overflows. The step must clamp to Long.MAX_VALUE and then - // to maxDelay rather than returning a negative duration. + // Drive the saturating-multiply path: with a base delay near the nanosecond ceiling and a + // high attempt count, the unscaled exponential value saturates instead of overflowing. + // The shared BackoffCalculator clamps the saturated value to maxDelay rather than + // returning a negative duration. val clock = FixedClock() - // 9_223_372_036 seconds is well over Long.MAX_VALUE / 2^30 in nanos, forcing the - // overflow branch. + // 9_223_372_036 seconds is just under the Long-nanosecond ceiling, so the scaled value + // saturates to maxDelay on every attempt. val opts = HttpRetryOptions( maxRetries = 35, @@ -1934,13 +1937,12 @@ class RetryStepTest { // ----------------- maxRetries clamping ----------------- @Test - fun `negative maxRetries is clamped to default 3`() { + fun `negative maxRetries is clamped to the default retry count`() { val fake = FakeHttpClient() .enqueue { status(503) } .enqueue { status(503) } .enqueue { status(503) } - .enqueue { status(503) } .enqueue { status(503) } // extra in case clamp went wrong val pipeline = @@ -1950,8 +1952,8 @@ class RetryStepTest { val response = pipeline.send(getRequest()) assertEquals(503, response.status.code) - // Default is 3 retries → 1 original + 3 retries = 4 calls. - assertEquals(4, fake.callCount) + // The clamp applies the canonical DEFAULT_MAX_RETRIES (2) → 1 original + 2 retries = 3 calls. + assertEquals(DefaultRetryStep.DEFAULT_MAX_RETRIES + 1, fake.callCount) } // ----------------- Diagnostic field coverage ----------------- diff --git a/sdk-core/src/test/kotlin/org/dexpace/sdk/core/pipeline/step/retry/RetryDefaultsReconciliationTest.kt b/sdk-core/src/test/kotlin/org/dexpace/sdk/core/pipeline/step/retry/RetryDefaultsReconciliationTest.kt new file mode 100644 index 00000000..8cc532b1 --- /dev/null +++ b/sdk-core/src/test/kotlin/org/dexpace/sdk/core/pipeline/step/retry/RetryDefaultsReconciliationTest.kt @@ -0,0 +1,261 @@ +/* + * Copyright (c) 2026 dexpace and Omar Aljarrah + * + * Licensed under the MIT License. See LICENSE in the project root. + * SPDX-License-Identifier: MIT + */ + +package org.dexpace.sdk.core.pipeline.step.retry + +import org.dexpace.sdk.core.client.HttpClient +import org.dexpace.sdk.core.http.common.Headers +import org.dexpace.sdk.core.http.common.Protocol +import org.dexpace.sdk.core.http.pipeline.HttpPipelineBuilder +import org.dexpace.sdk.core.http.pipeline.steps.DefaultRetryStep +import org.dexpace.sdk.core.http.pipeline.steps.HttpRetryOptions +import org.dexpace.sdk.core.http.request.Method +import org.dexpace.sdk.core.http.request.Request +import org.dexpace.sdk.core.http.response.Response +import org.dexpace.sdk.core.http.response.Status +import org.dexpace.sdk.core.http.response.exception.HttpException +import org.dexpace.sdk.core.io.Io +import org.dexpace.sdk.core.testing.FakeHttpClient +import org.dexpace.sdk.core.testing.FixedClock +import org.dexpace.sdk.core.util.Clock +import org.dexpace.sdk.io.OkioIoProvider +import org.junit.jupiter.api.Assertions.assertEquals +import org.junit.jupiter.api.Assertions.assertThrows +import org.junit.jupiter.api.Assertions.assertTrue +import org.junit.jupiter.api.BeforeEach +import org.junit.jupiter.api.Test +import java.time.Duration +import java.time.Instant + +/** + * Guards that the two retry stacks — the stage-based [DefaultRetryStep]/[HttpRetryOptions] and + * the recovery-aware [RetryStep]/[RetrySettings] — share ONE set of documented defaults and ONE + * backoff computation ([BackoffCalculator]). If a future change re-introduces divergent defaults + * (max attempts, base/initial delay, max delay, multiplier, jitter) or a second backoff formula, + * these assertions fail. + */ +class RetryDefaultsReconciliationTest { + @BeforeEach + fun setUp() { + Io.installProvider(OkioIoProvider) + } + + // region -- shared default constants -- + + @Test + fun `both stacks share the same base delay default`() { + assertEquals( + RetrySettings.DEFAULT_INITIAL_DELAY, + HttpRetryOptions().baseDelay, + "stage-based baseDelay default must equal the recovery-aware initialDelay default", + ) + assertEquals(Duration.ofMillis(200), HttpRetryOptions().baseDelay) + } + + @Test + fun `both stacks share the same max delay default`() { + assertEquals( + RetrySettings.DEFAULT_MAX_DELAY, + HttpRetryOptions().maxDelay, + "stage-based maxDelay default must equal the recovery-aware maxDelay default", + ) + assertEquals(Duration.ofSeconds(8), HttpRetryOptions().maxDelay) + } + + @Test + fun `both stacks default to the same total send budget`() { + // The two knobs count differently by design (retries vs total attempts), but the default + // budget must be the same number of sends: initial + DEFAULT_MAX_RETRIES == DEFAULT_MAX_ATTEMPTS. + assertEquals( + RetrySettings.DEFAULT_MAX_ATTEMPTS, + DefaultRetryStep.DEFAULT_MAX_RETRIES + 1, + "default total sends must agree across stacks (1 initial + retries == maxAttempts)", + ) + } + + // endregion + + // region -- shared backoff sequence -- + + /** + * The canonical no-jitter schedule for `initial=100ms, multiplier=2.0, maxDelay=1s`: + * attempt N (1-indexed) → `100ms * 2^(N-1)`, capped at 1s. Indexed by attempt-1. + */ + private val expectedSeriesMs = longArrayOf(100, 200, 400, 800, 1000, 1000) + + @Test + fun `BackoffCalculator produces the canonical exponential sequence`() { + val settings = + RetrySettings.builder() + .initialDelay(Duration.ofMillis(BASE_MS)) + .delayMultiplier(RetrySettings.DEFAULT_DELAY_MULTIPLIER) + .maxDelay(Duration.ofSeconds(1)) + .jitter(0.0) + .totalTimeout(Duration.ZERO) + .maxAttempts(10) + .build() + for (attempt in 1..expectedSeriesMs.size) { + assertEquals( + Duration.ofMillis(expectedSeriesMs[attempt - 1]), + BackoffCalculator.computeDelay(attempt, settings), + "attempt $attempt", + ) + } + } + + @Test + fun `stage-based step follows the same exponential sequence via BackoffCalculator`() { + // Drive the stage-based step and observe the cumulative FixedClock delta across a fixed + // number of retries. The stage-based step now computes its schedule through the shared + // BackoffCalculator with the canonical symmetric jitter (DEFAULT_JITTER = 0.2), so the + // cumulative elapsed must land within the ±10% jitter window of the canonical series sum. + for (retries in 1..4) { + val clock = FixedClock() + val fake = FakeHttpClient() + repeat(retries) { fake.enqueue { status(503) } } + fake.enqueue { status(200) } + + val opts = + HttpRetryOptions( + maxRetries = retries, + baseDelay = Duration.ofMillis(BASE_MS), + maxDelay = Duration.ofSeconds(1), + ) + val pipeline = + HttpPipelineBuilder(fake) + .append(DefaultRetryStep(opts, clock)) + .build() + + val before = clock.now() + pipeline.send(getRequest()) + val elapsedMs = Duration.between(before, clock.now()).toMillis() + + // Cumulative expected = sum of the first `retries` series entries. + var sum = 0L + for (i in 0 until retries) sum += expectedSeriesMs[i] + // Symmetric ±10% jitter (DEFAULT_JITTER = 0.2 → half-range 10%) per term; bound the sum. + val tolerance = (sum * (RetrySettings.DEFAULT_JITTER / 2.0)).toLong() + 1 + val low = sum - tolerance + val high = sum + tolerance + assertTrue( + elapsedMs in low..high, + "retries=$retries: elapsed=$elapsedMs not in [$low,$high] (sum=$sum)", + ) + } + } + + // endregion + + // region -- consistent 408 policy -- + + @Test + fun `stage-based step retries a 408 by default`() { + val fake = + FakeHttpClient() + .enqueue { status(408) } + .enqueue { status(200) } + val pipeline = + HttpPipelineBuilder(fake) + .append(DefaultRetryStep(HttpRetryOptions(maxRetries = 2), zeroDelayClock())) + .build() + val response = pipeline.send(getRequest()) + assertEquals(200, response.status.code) + assertEquals(2, fake.callCount, "stage-based step must retry 408") + } + + @Test + fun `recovery-aware default retryable statuses contains 408`() { + assertTrue( + RetrySettings.DEFAULT_RETRYABLE_STATUSES.contains(STATUS_REQUEST_TIMEOUT), + "recovery-aware default must agree with the stage-based 408 stance", + ) + } + + @Test + fun `recovery-aware step retries a 408 HttpException by default`() { + val client = + object : HttpClient { + private var calls = 0 + + override fun execute(request: Request): Response { + calls += 1 + if (calls == 1) { + throw object : HttpException( + status = Status.REQUEST_TIMEOUT, + headers = Headers.builder().build(), + body = null, + ) {} + } + return Response.builder() + .request(request) + .protocol(Protocol.HTTP_1_1) + .status(Status.OK) + .headers(Headers.builder().build()) + .build() + } + } + // No explicit scheduler: with zero delays the step never schedules a deferred wait, and + // leaving it null routes through RetryStep's process-wide lazy daemon scheduler rather + // than leaking a caller-owned executor (which RetryStep never shuts down) per run. + val settings = + RetrySettings.builder() + .initialDelay(Duration.ZERO) + .maxDelay(Duration.ZERO) + .delayMultiplier(1.0) + .jitter(0.0) + .totalTimeout(Duration.ZERO) + .build() + val step = RetryStep(client, settings, getRequest()) + val response = step.attempt() + assertEquals(200, response.status.code, "recovery-aware step must retry a 408") + } + + // endregion + + // region -- eager delay-magnitude validation -- + + @Test + fun `constructing the stage-based step validates delay magnitudes eagerly`() { + // HttpRetryOptions does not range-check baseDelay/maxDelay, but routing them through the + // shared RetrySettings (to drive BackoffCalculator) does. A negative delay must therefore + // fail fast at DefaultRetryStep construction, not later at delay-computation time. + assertThrows(IllegalArgumentException::class.java) { + DefaultRetryStep( + HttpRetryOptions(baseDelay = Duration.ofMillis(-1)), + zeroDelayClock(), + ) + } + assertThrows(IllegalArgumentException::class.java) { + DefaultRetryStep( + HttpRetryOptions(maxDelay = Duration.ofMillis(-1)), + zeroDelayClock(), + ) + } + } + + // endregion + + private fun getRequest(): Request = + Request.builder() + .method(Method.GET) + .url("https://api.example.com/x") + .build() + + private fun zeroDelayClock(): Clock = + object : Clock { + override fun now(): Instant = Instant.EPOCH + + override fun monotonic(): Long = 0L + + override fun sleep(duration: Duration) = Unit + } + + private companion object { + private const val BASE_MS = 100L + private const val STATUS_REQUEST_TIMEOUT = 408 + } +} diff --git a/sdk-core/src/test/kotlin/org/dexpace/sdk/core/pipeline/step/retry/RetrySettingsTest.kt b/sdk-core/src/test/kotlin/org/dexpace/sdk/core/pipeline/step/retry/RetrySettingsTest.kt index a8ec06e8..574167b8 100644 --- a/sdk-core/src/test/kotlin/org/dexpace/sdk/core/pipeline/step/retry/RetrySettingsTest.kt +++ b/sdk-core/src/test/kotlin/org/dexpace/sdk/core/pipeline/step/retry/RetrySettingsTest.kt @@ -61,4 +61,30 @@ class RetrySettingsTest { .build() assertEquals(Duration.ofSeconds(8), settings.maxDelay) } + + // --- Canonical shared defaults (one source of truth for both retry stacks) -------------- + + @Test + fun `defaults match the canonical shared constants`() { + val settings = RetrySettings.defaults() + assertEquals(Duration.ofSeconds(30), settings.totalTimeout, "totalTimeout") + assertEquals(RetrySettings.DEFAULT_INITIAL_DELAY, settings.initialDelay, "initialDelay") + assertEquals(Duration.ofMillis(200), settings.initialDelay, "initialDelay value") + assertEquals(RetrySettings.DEFAULT_MAX_DELAY, settings.maxDelay, "maxDelay") + assertEquals(Duration.ofSeconds(8), settings.maxDelay, "maxDelay value") + assertEquals(RetrySettings.DEFAULT_DELAY_MULTIPLIER, settings.delayMultiplier, "delayMultiplier") + assertEquals(2.0, settings.delayMultiplier, "delayMultiplier value") + assertEquals(RetrySettings.DEFAULT_JITTER, settings.jitter, "jitter") + assertEquals(0.2, settings.jitter, "jitter value") + assertEquals(RetrySettings.DEFAULT_MAX_ATTEMPTS, settings.maxAttempts, "maxAttempts") + assertEquals(3, settings.maxAttempts, "maxAttempts value") + } + + @Test + fun `default retryable statuses include 408 and the common retryable codes`() { + // The reconciled stance: 408 IS retryable, matching RetryUtils / HttpException.retryable + // and the stage-based DefaultRetryStep. The two stacks must agree on the 408 question. + val statuses = RetrySettings.DEFAULT_RETRYABLE_STATUSES + assertEquals(setOf(408, 429, 500, 502, 503, 504), statuses.toSet()) + } }