Skip to content

Commit e391ecc

Browse files
committed
fix: handle DynamoDB update failures after successful S3 upload
Previously, if the DynamoDB update at the end of the EC2 user-data script failed (network blip, permissions error, etc.) after the S3 upload had already succeeded, the build was left stuck in PROCESSING with orphaned S3 artifacts and no error recorded.

Changes:
- Wrap the Step-1 update_item (ADD arch_s3_keys + decrement pending_arches) in try/except; on failure, explicitly mark the build FAILED instead of letting bash `set -e` silently exit.
- Wrap the Step-2 update_item (SET status=COMPLETED) in a retry loop (3 attempts with exponential back-off); if all retries fail, call _mark_failed so the caller gets a definitive FAILED status rather than being stuck.
- Add a _mark_failed() helper with its own 3-attempt retry loop.
- Exit the Python script with sys.exit(0) on handled errors so the bash trap fires only after DynamoDB has been updated.
1 parent 4a6037f commit e391ecc

1 file changed

Lines changed: 71 additions & 23 deletions

File tree

  • infrastructure/lambdas/process_build

infrastructure/lambdas/process_build/index.py

Lines changed: 71 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -364,51 +364,99 @@ def _generate_user_data(build_id, arch, python_version, requirements, single_fil
364364
# --- Atomically record completion in DynamoDB ---
365365
# ADD arch_s3_keys (StringSet) and decrement pending_arches.
366366
# The last architecture to complete (pending_arches reaches 0) sets COMPLETED.
367+
# Errors here are caught inside the Python script so DynamoDB is always updated,
368+
# even if the update itself fails (the build is then marked FAILED instead of
369+
# being left stuck in PROCESSING with orphaned S3 files).
367370
export _BUILD_S3_KEYS="$S3_KEYS"
368371
export _BUILD_COMPLETED_AT="$(date +%s)"
369372
370373
python3 << 'PYEOF'
371-
import boto3, os
374+
import boto3, os, time, sys
372375
from decimal import Decimal
373376
374377
region = os.environ['AWS_DEFAULT_REGION']
375378
table_name = "{TABLE_NAME}"
376379
build_id = "{build_id}"
380+
arch = "{arch}"
377381
378382
s3_keys_str = os.environ.get('_BUILD_S3_KEYS', '')
379383
completed_at = int(os.environ.get('_BUILD_COMPLETED_AT', '0'))
380384
key_list = [k.strip() for k in s3_keys_str.split(',') if k.strip()]
381385
382386
table = boto3.resource('dynamodb', region_name=region).Table(table_name)
383387
384-
resp = table.update_item(
385-
Key={{'buildId': build_id}},
386-
UpdateExpression='ADD arch_s3_keys :k, pending_arches :n',
387-
ExpressionAttributeValues={{
388-
':k': set(key_list),
389-
':n': Decimal('-1'),
390-
}},
391-
ReturnValues='ALL_NEW',
392-
)
388+
389+
def _mark_failed(reason):
390+
"""Best-effort: mark the build FAILED in DynamoDB."""
391+
for attempt in range(3):
392+
try:
393+
table.update_item(
394+
Key={{'buildId': build_id}},
395+
UpdateExpression='SET #s = :f, error_message = :e',
396+
ExpressionAttributeNames={{'#s': 'status'}},
397+
ExpressionAttributeValues={{':f': 'FAILED', ':e': reason}},
398+
)
399+
print('Marked build FAILED: ' + reason)
400+
return
401+
except Exception as ex:
402+
print('WARNING: _mark_failed attempt ' + str(attempt + 1) + ' failed: ' + str(ex))
403+
time.sleep(2 ** attempt)
404+
print('ERROR: could not update DynamoDB after repeated failures')
405+
406+
407+
try:
408+
# Step 1: atomically add this arch's S3 keys and decrement the pending counter.
409+
resp = table.update_item(
410+
Key={{'buildId': build_id}},
411+
UpdateExpression='ADD arch_s3_keys :k, pending_arches :n',
412+
ExpressionAttributeValues={{
413+
':k': set(key_list),
414+
':n': Decimal('-1'),
415+
}},
416+
ReturnValues='ALL_NEW',
417+
)
418+
except Exception as e:
419+
# S3 upload already succeeded; emit an error but mark the build FAILED so
420+
# it never stays stuck in PROCESSING.
421+
msg = arch + ' uploaded to S3 but DynamoDB key-registration failed: ' + str(e)
422+
print('ERROR: ' + msg)
423+
_mark_failed(msg)
424+
sys.exit(0) # Exit cleanly so bash set -e doesn't re-trigger the cleanup trap
425+
393426
pending = int(resp['Attributes'].get('pending_arches', 1))
394427
if pending <= 0:
428+
# Step 2: all architectures finished — set the final COMPLETED status.
395429
all_keys_set = resp['Attributes'].get('arch_s3_keys', set())
396430
all_keys = ','.join(sorted(all_keys_set))
397431
fc = len(all_keys_set)
398-
table.update_item(
399-
Key={{'buildId': build_id}},
400-
UpdateExpression='SET #s = :s, s3_keys = :k, completed_at = :t, file_count = :fc',
401-
ExpressionAttributeNames={{'#s': 'status'}},
402-
ExpressionAttributeValues={{
403-
':s': 'COMPLETED',
404-
':k': all_keys,
405-
':t': completed_at,
406-
':fc': fc,
407-
}},
408-
)
409-
print('Build COMPLETED: ' + str(fc) + ' file(s), keys: ' + all_keys)
432+
433+
for attempt in range(3):
434+
try:
435+
table.update_item(
436+
Key={{'buildId': build_id}},
437+
UpdateExpression='SET #s = :s, s3_keys = :k, completed_at = :t, file_count = :fc',
438+
ExpressionAttributeNames={{'#s': 'status'}},
439+
ExpressionAttributeValues={{
440+
':s': 'COMPLETED',
441+
':k': all_keys,
442+
':t': completed_at,
443+
':fc': fc,
444+
}},
445+
)
446+
print('Build COMPLETED: ' + str(fc) + ' file(s), keys: ' + all_keys)
447+
break
448+
except Exception as e:
449+
if attempt < 2:
450+
print('WARNING: COMPLETED update attempt ' + str(attempt + 1) + ' failed, retrying: ' + str(e))
451+
time.sleep(2 ** attempt)
452+
else:
453+
# pending_arches is already 0 so no other instance will retry;
454+
# mark FAILED so the caller gets a definitive answer.
455+
msg = 'S3 upload succeeded but failed to set COMPLETED status: ' + str(e)
456+
print('ERROR: ' + msg)
457+
_mark_failed(msg)
410458
else:
411-
print('{arch} done, ' + str(pending) + ' arch(es) still pending')
459+
print(arch + ' done, ' + str(pending) + ' arch(es) still pending')
412460
PYEOF
413461
414462
echo ""

0 commit comments

Comments
 (0)