From fb5bfe68852154f017ecc18ca30f616440347d7e Mon Sep 17 00:00:00 2001 From: Christopher Reid Date: Tue, 17 Mar 2026 18:11:31 -0400 Subject: [PATCH] feat: support loading custom job definitions, add sample job definitions to fix templating Signed-off-by: Christopher Reid --- .../nv.svc.farm/files/job_definitions/df.json | 19 ++++++++++ .../files/job_definitions/echo.json | 22 +++++++++++ .../job_definitions/nvidia-smi-check.json | 24 ++++++++++++ .../slack-webhook-message.json | 38 +++++++++++++++++++ .../files/job_definitions/sleep-test.json | 18 +++++++++ .../files/scripts/load_job_definitions.sh | 36 ++++++++++++++++++ .../jobs/configmap-job-definitions.yaml | 12 +++++- helm/nv.svc.farm/values.yaml | 13 +++++++ 8 files changed, 180 insertions(+), 2 deletions(-) create mode 100644 helm/nv.svc.farm/files/job_definitions/df.json create mode 100644 helm/nv.svc.farm/files/job_definitions/echo.json create mode 100644 helm/nv.svc.farm/files/job_definitions/nvidia-smi-check.json create mode 100644 helm/nv.svc.farm/files/job_definitions/slack-webhook-message.json create mode 100644 helm/nv.svc.farm/files/job_definitions/sleep-test.json create mode 100644 helm/nv.svc.farm/files/scripts/load_job_definitions.sh diff --git a/helm/nv.svc.farm/files/job_definitions/df.json b/helm/nv.svc.farm/files/job_definitions/df.json new file mode 100644 index 0000000..0875d9f --- /dev/null +++ b/helm/nv.svc.farm/files/job_definitions/df.json @@ -0,0 +1,19 @@ + +{ + "name": "df", + "job_type": "base", + "command": "/bin/df", + "args": [], + "task_function": "", + "env": {}, + "log_to_stdout": true, + "extension_paths": [], + "allowed_args": {}, + "headless": true, + "active": true, + "unresolved_command_path": "/bin/df", + "success_return_codes": [0], + "capacity_requirements": {}, + "working_directory": "", + "container": "busybox:1.35" +} diff --git a/helm/nv.svc.farm/files/job_definitions/echo.json b/helm/nv.svc.farm/files/job_definitions/echo.json new file mode 100644 index 
0000000..be28d22 --- /dev/null +++ b/helm/nv.svc.farm/files/job_definitions/echo.json @@ -0,0 +1,22 @@ +{ + "name": "echo", + "job_type": "base", + "command": "sh", + "args": [ + "-c", + "echo \"${1:-This is the default echo message. To modify it, specify in your task submission.}\"", + "sh" + ], + "allowed_args": { + "message": { + "arg": "2" + } + }, + "headless": true, + "active": true, + "unresolved_command_path": "sh", + "success_return_codes": [0], + "capacity_requirements": {}, + "working_directory": "", + "container": "ubuntu:24.04" +} diff --git a/helm/nv.svc.farm/files/job_definitions/nvidia-smi-check.json b/helm/nv.svc.farm/files/job_definitions/nvidia-smi-check.json new file mode 100644 index 0000000..52177da --- /dev/null +++ b/helm/nv.svc.farm/files/job_definitions/nvidia-smi-check.json @@ -0,0 +1,24 @@ +{ + "name": "nvidia-smi-check", + "job_type": "base", + "command": "nvidia-smi", + "args": [], + "task_function": "", + "env": {}, + "log_to_stdout": true, + "extension_paths": [], + "allowed_args": {}, + "headless": true, + "active": true, + "unresolved_command_path": "nvidia-smi", + "success_return_codes": [0], + "capacity_requirements": { + "resource_limits": { + "cpu": 1, + "memory": "4096Mi", + "nvidia.com/gpu": 1 + } + }, + "working_directory": "", + "container": "nvidia/cuda:12.2.0-base-ubuntu20.04" +} diff --git a/helm/nv.svc.farm/files/job_definitions/slack-webhook-message.json b/helm/nv.svc.farm/files/job_definitions/slack-webhook-message.json new file mode 100644 index 0000000..5d007bc --- /dev/null +++ b/helm/nv.svc.farm/files/job_definitions/slack-webhook-message.json @@ -0,0 +1,38 @@ +{ + "name": "slack-webhook-message", + "job_type": "base", + "command": "sh", + "args": [ + "-c", + "curl -X POST \"$SLACK_WEBHOOK_URL\" -H \"Content-Type: application/json\" -d {\\\"text\\\":\\\"\"${1:-This is the default Slack webhook message. 
To modify it, specify in your task submission.}\"\\\"}", + "sh" + ], + "allowed_args": { + "message": { + "arg": "2" + } + }, + "task_function": "", + "env": {}, + "log_to_stdout": true, + "extension_paths": [], + "headless": true, + "active": true, + "unresolved_command_path": "sh", + "success_return_codes": [0], + "capacity_requirements": { + "env": [ + { + "name": "SLACK_WEBHOOK_URL", + "valueFrom": { + "secretKeyRef": { + "key": "SLACK_WEBHOOK_URL", + "name": "slack-webhook-url" + } + } + } + ] + }, + "working_directory": "", + "container": "curlimages/curl:8.15.0" +} diff --git a/helm/nv.svc.farm/files/job_definitions/sleep-test.json b/helm/nv.svc.farm/files/job_definitions/sleep-test.json new file mode 100644 index 0000000..463ec59 --- /dev/null +++ b/helm/nv.svc.farm/files/job_definitions/sleep-test.json @@ -0,0 +1,18 @@ +{ + "name": "sleep-test", + "job_type": "base", + "command": "sleep", + "args": ["60"], + "task_function": "", + "env": {}, + "log_to_stdout": true, + "extension_paths": [], + "allowed_args": {}, + "headless": true, + "active": true, + "unresolved_command_path": "sleep", + "success_return_codes": [0], + "capacity_requirements": {}, + "working_directory": "", + "container": "busybox:1.35" +} diff --git a/helm/nv.svc.farm/files/scripts/load_job_definitions.sh b/helm/nv.svc.farm/files/scripts/load_job_definitions.sh new file mode 100644 index 0000000..85036c2 --- /dev/null +++ b/helm/nv.svc.farm/files/scripts/load_job_definitions.sh @@ -0,0 +1,36 @@ +#!/usr/bin/env sh +set -e +set -u + +echo "Starting job definition load script." 
+
+echo "JOBS_API_URL: ${JOBS_API_URL}"
+echo "JOB_DEFINITIONS_DIR: ${JOB_DEFINITIONS_DIR}"
+# printf, not echo: POSIX sh leaves echo's handling of "\n" implementation-defined.
+printf '\nlisting all files under %s\n\n' "${JOB_DEFINITIONS_DIR}"
+ls -la "${JOB_DEFINITIONS_DIR}"
+
+# Use a while loop instead of an array for portability
+find "${JOB_DEFINITIONS_DIR}" -type f -name "*.json" | while IFS= read -r file; do
+  printf '\n--\nuploading job definition: %s\n%s\n\n' "${file}" "$(cat "$file")"
+  # POST the definition to /save; the response body is written to /tmp/http_response.
+  wget --quiet --output-document=/tmp/http_response --server-response \
+    --header="accept: application/json" \
+    --header="Content-Type: application/json" \
+    --header="X-API-KEY: ${JOBS_API_KEY}" \
+    --post-file="$file" \
+    "${JOBS_API_URL}/save"
+
+  printf 'response:\n%s\n' "$(cat /tmp/http_response)"
+done
+
+printf '\n--\nchecking job definitions: %s/load\n' "${JOBS_API_URL}"
+# NOTE(review): unlike /save, no X-API-KEY header is sent here - confirm /load needs none.
+wget --quiet --output-document=/tmp/http_response --server-response \
+  --header="accept: application/json" \
+  --header="Content-Type: application/json" \
+  "${JOBS_API_URL}/load"
+
+printf 'response:\n%s\n' "$(cat /tmp/http_response)"
+
+printf '\n\nDone.\n'
diff --git a/helm/nv.svc.farm/templates/jobs/configmap-job-definitions.yaml b/helm/nv.svc.farm/templates/jobs/configmap-job-definitions.yaml index 76bd01f..73cacc4 100644 --- a/helm/nv.svc.farm/templates/jobs/configmap-job-definitions.yaml +++ b/helm/nv.svc.farm/templates/jobs/configmap-job-definitions.yaml @@ -11,8 +11,9 @@ {{- end }} {{- end }} +{{- $extraJobDefinitions := $mergedJobsSvc.serviceConfig.extraJobDefinitions | default dict -}} -{{- if $jobList -}} +{{- if or $jobList $extraJobDefinitions -}} apiVersion: v1 kind: ConfigMap metadata: @@ -22,7 +23,14 @@ metadata: data: {{ (.Files.Glob "files/scripts/load_job_definitions.sh").AsConfig | indent 2 }} {{- range $item := $jobList }} - {{ ($.Files.Glob $item ).AsConfig | nindent 2}} + {{- $fileContents := $.Files.Glob $item }} + {{- if $fileContents }} + {{ $fileContents.AsConfig | nindent 2}} + {{- end }} +{{- end }} +{{- range $name, $content := $extraJobDefinitions }} + {{ $name }}: | + {{ $content | nindent 4 }} {{- end }} {{- end }} diff --git a/helm/nv.svc.farm/values.yaml b/helm/nv.svc.farm/values.yaml index 029f8c0..f90a85b 100644 --- a/helm/nv.svc.farm/values.yaml +++ b/helm/nv.svc.farm/values.yaml @@ -437,6 +437,19 @@ apps: definitions: - df - nvidia-smi-check + # -- Map of custom job definitions to load alongside the bundled ones. + # Keys are filenames ending in .json (e.g. my-job.json; load_job_definitions.sh only uploads *.json files), values are the raw JSON content. + # @section -- Jobs + # extraJobDefinitions: + # my-job.json: | + # { + # "name": "my-job", + # "job_type": "base", + # "command": "echo", + # "args": ["hello"], + # "container": "ubuntu:24.04" + # } + extraJobDefinitions: {} # Jobs Application Specific Settings/Overrides # @section -- Jobs settings: