Skip to content

Commit a128599

Browse files
committed
[SVLS-8230] Fix SnapStart cold_start tag using restore_time
SnapStart restore invocations were misclassified as proactive_initialization because sandbox_init_time (from snapshot creation) always exceeded the 10s threshold. Fix by tracking restore_time from PlatformRestoreStart telemetry and using it for proactive init detection in SnapStart functions. When restore_time is None (telemetry not yet delivered), assume cold start since the restore and invoke happened close together. https://datadoghq.atlassian.net/browse/SVLS-8230
1 parent 672f268 commit a128599

3 files changed

Lines changed: 74 additions & 5 deletions

File tree

bottlecap/src/config/aws.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
use std::env;
22
use tokio::time::Instant;
33

4+
use crate::tags::lambda::tags::SNAP_START_VALUE;
5+
46
const AWS_DEFAULT_REGION: &str = "AWS_DEFAULT_REGION";
57
const AWS_ACCESS_KEY_ID: &str = "AWS_ACCESS_KEY_ID";
68
const AWS_SECRET_ACCESS_KEY: &str = "AWS_SECRET_ACCESS_KEY";
@@ -46,6 +48,11 @@ impl AwsConfig {
4648
self.initialization_type
4749
.eq(LAMBDA_MANAGED_INSTANCES_INIT_TYPE)
4850
}
51+
52+
#[must_use]
53+
pub fn is_snapstart(&self) -> bool {
54+
self.initialization_type.eq(SNAP_START_VALUE)
55+
}
4956
}
5057

5158
#[allow(clippy::module_name_repetitions)]

bottlecap/src/lifecycle/invocation/processor.rs

Lines changed: 33 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,8 @@ pub struct Processor {
8989
/// Tracks whether the first invocation after init has been received in Managed Instance mode.
9090
/// Used to determine if we should search for the empty context on an invocation.
9191
awaiting_first_invocation: bool,
92+
/// Time of the `SnapStart` restore event, set when `PlatformRestoreStart` is received.
93+
restore_time: Option<DateTime<Utc>>,
9294
}
9395

9496
impl Processor {
@@ -128,6 +130,7 @@ impl Processor {
128130
dynamic_tags: HashMap::new(),
129131
active_invocations: 0,
130132
awaiting_first_invocation: false,
133+
restore_time: None,
131134
}
132135
}
133136

@@ -243,12 +246,35 @@ impl Processor {
243246

244247
// If it's empty, then we are in a cold start
245248
if self.context_buffer.is_empty() {
246-
let now = Instant::now();
247-
let time_since_sandbox_init = now.duration_since(self.aws_config.sandbox_init_time);
248-
if time_since_sandbox_init.as_millis() > PROACTIVE_INITIALIZATION_THRESHOLD_MS.into() {
249-
proactive_initialization = true;
249+
if self.aws_config.is_snapstart() {
250+
match self.restore_time {
251+
None => {
252+
// PlatformRestoreStart hasn't arrived yet — restore and invoke
253+
// happened close together, so this is a cold start (not proactive).
254+
cold_start = true;
255+
}
256+
Some(restore_time) => {
257+
let now = Utc::now();
258+
let time_since_restore = now.signed_duration_since(restore_time);
259+
if time_since_restore.num_milliseconds()
260+
> PROACTIVE_INITIALIZATION_THRESHOLD_MS as i64
261+
{
262+
proactive_initialization = true;
263+
} else {
264+
cold_start = true;
265+
}
266+
}
267+
}
250268
} else {
251-
cold_start = true;
269+
let now = Instant::now();
270+
let time_since_sandbox_init = now.duration_since(self.aws_config.sandbox_init_time);
271+
if time_since_sandbox_init.as_millis()
272+
> PROACTIVE_INITIALIZATION_THRESHOLD_MS.into()
273+
{
274+
proactive_initialization = true;
275+
} else {
276+
cold_start = true;
277+
}
252278
}
253279

254280
// Resolve runtime only once
@@ -374,6 +400,8 @@ impl Processor {
374400
/// This is used to create a `snapstart_restore` span. Since this telemetry event does not
375401
/// provide a `request_id`, we try to guess which invocation is the restore, similar to init.
376402
pub fn on_platform_restore_start(&mut self, time: DateTime<Utc>) {
403+
self.restore_time = Some(time);
404+
377405
let start_time: i64 = SystemTime::from(time)
378406
.duration_since(UNIX_EPOCH)
379407
.expect("time went backwards")

integration-tests/tests/snapstart.test.ts

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,23 @@ describe('Snapstart Integration Tests', () => {
101101
);
102102
expect(coldStartSpan).toBeUndefined();
103103
});
104+
105+
it('should have aws.lambda span with cold_start=true', () => {
106+
const result = getRestoreInvocation();
107+
expect(result).toBeDefined();
108+
const trace = result.traces![0];
109+
const awsLambdaSpan = trace.spans.find((span: any) =>
110+
span.attributes.operation_name === 'aws.lambda'
111+
);
112+
expect(awsLambdaSpan).toBeDefined();
113+
expect(awsLambdaSpan).toMatchObject({
114+
attributes: {
115+
custom: {
116+
cold_start: 'true'
117+
}
118+
}
119+
});
120+
});
104121
});
105122

106123
describe('second invocation (warm)', () => {
@@ -146,6 +163,23 @@ describe('Snapstart Integration Tests', () => {
146163
);
147164
expect(coldStartSpan).toBeUndefined();
148165
});
166+
167+
it('should have aws.lambda span with cold_start=false', () => {
168+
const result = getWarmInvocation();
169+
expect(result).toBeDefined();
170+
const trace = result.traces![0];
171+
const awsLambdaSpan = trace.spans.find((span: any) =>
172+
span.attributes.operation_name === 'aws.lambda'
173+
);
174+
expect(awsLambdaSpan).toBeDefined();
175+
expect(awsLambdaSpan).toMatchObject({
176+
attributes: {
177+
custom: {
178+
cold_start: 'false'
179+
}
180+
}
181+
});
182+
});
149183
});
150184

151185
describe('trace isolation', () => {

0 commit comments

Comments
 (0)