Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 75 additions & 0 deletions examples/ray/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# Running Ray on Azure Kubernetes Service (AKS)

This example demonstrates how to deploy and run a [Ray](https://www.ray.io/) application on AKS using the KubeRay operator.

## Prerequisites

- An AKS cluster (Kubernetes 1.26+)
- [kubectl](https://kubernetes.io/docs/tasks/tools/) configured to access your cluster
- [Helm](https://helm.sh/docs/intro/install/) 3.x installed

## Overview

Ray is an open-source framework for scaling AI and Python workloads. This example deploys:

1. The **KubeRay operator** to manage Ray clusters on Kubernetes
2. A **RayCluster** custom resource with a head node and worker nodes
3. A sample **Ray job** to verify the deployment

## Deploy the KubeRay operator

```bash
helm repo add kuberay https://ray-project.github.io/kuberay-helm/
helm repo update

helm install kuberay-operator kuberay/kuberay-operator \
--namespace kuberay-system \
--create-namespace
```

Verify the operator is running:

```bash
kubectl get pods -n kuberay-system
```

## Deploy the RayCluster

```bash
kubectl apply -f ray-cluster.yaml
```

Check that the cluster and its pods are ready (re-run the commands until every pod reports `Running`):

```bash
kubectl get rayclusters
kubectl get pods -l ray.io/cluster=ray-cluster
```

## Submit a sample job

```bash
kubectl apply -f ray-job.yaml
```

Check the job status and view its logs:

```bash
kubectl get rayjobs
kubectl logs -l job-name=ray-sample-job
```

## Clean up

```bash
kubectl delete -f ray-job.yaml
kubectl delete -f ray-cluster.yaml
helm uninstall kuberay-operator -n kuberay-system
kubectl delete namespace kuberay-system
```

## Resources

- [Ray documentation](https://docs.ray.io/)
- [KubeRay documentation](https://ray-project.github.io/kuberay/)
- [AKS documentation](https://learn.microsoft.com/azure/aks)
36 changes: 36 additions & 0 deletions examples/ray/aks-classic/deploy.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#!/bin/bash
#
# Provisions the environment described by the Terraform configuration in the
# current working directory, fetches kubectl credentials for the resulting AKS
# cluster, and lists the cluster nodes plus the Kueue and KubeRay pods.
#
# Requirements: az (already signed in), terraform and kubectl on PATH.
# Exit status: non-zero as soon as any step fails (set -e).
set -euo pipefail

# Fail fast when a required CLI is missing. Without this check a missing `az`
# is reported as "please sign in", and a missing `kubectl` is only discovered
# after the whole `terraform apply` has already run.
for tool in az terraform kubectl; do
    if ! command -v "$tool" > /dev/null 2>&1; then
        echo "Required command '$tool' was not found on PATH. Please install it before running this script." >&2
        exit 1
    fi
done

# Check if the user is logged into Azure CLI
if ! az account show > /dev/null 2>&1; then
    echo "Please sign in to Azure CLI using 'az login' before running this script."
    exit 1
fi

# Initialize Terraform
terraform init

# Create a Terraform plan and save it, so that apply executes exactly this plan
terraform plan -out main.tfplan

# Apply the saved Terraform plan
terraform apply main.tfplan

# Retrieve the Terraform outputs needed to locate the cluster
resource_group_name=$(terraform output -raw resource_group_name)
aks_cluster_name=$(terraform output -raw kubernetes_cluster_name)

# Get AKS credentials for the cluster (replaces any existing kubeconfig entry
# of the same name)
az aks get-credentials \
    --resource-group "$resource_group_name" \
    --name "$aks_cluster_name" \
    --overwrite-existing

echo "=== Cluster nodes ==="
kubectl get nodes

echo "=== Verifying installations ==="
kubectl get pods -n kueue-system
kubectl get pods -n kuberay-system

echo "=== Setup complete ==="
122 changes: 122 additions & 0 deletions examples/ray/aks-classic/main.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
# Resource group. Its name is the configured prefix followed by a random pet
# name.
resource "random_pet" "rg_name" {
  prefix = var.resource_group_name_prefix
}

resource "azurerm_resource_group" "rg" {
  name     = random_pet.rg_name.id
  location = var.resource_group_location
}

# Random pet names for the AKS cluster and for its DNS prefix.
resource "random_pet" "azurerm_kubernetes_cluster_name" {
  prefix = "cluster"
}

resource "random_pet" "azurerm_kubernetes_cluster_dns_prefix" {
  prefix = "dns"
}

# AKS cluster, created in the resource group above with randomly generated
# cluster and DNS-prefix names.
resource "azurerm_kubernetes_cluster" "k8s" {
  name                = random_pet.azurerm_kubernetes_cluster_name.id
  resource_group_name = azurerm_resource_group.rg.name
  location            = azurerm_resource_group.rg.location
  dns_prefix          = random_pet.azurerm_kubernetes_cluster_dns_prefix.id
  kubernetes_version  = var.kubernetes_version

  # Default node pool; size and VM SKU come from variables.
  default_node_pool {
    name       = "systempool"
    node_count = var.system_node_pool_node_count
    vm_size    = var.system_node_pool_vm_size
    tags       = { owner = var.resource_group_owner }
  }

  identity {
    type = "SystemAssigned"
  }

  network_profile {
    network_plugin = "azure"
  }

  # Node SSH access uses the public half of the key pair generated through
  # azapi_resource_action.ssh_public_key_gen.
  linux_profile {
    admin_username = var.username

    ssh_key {
      key_data = azapi_resource_action.ssh_public_key_gen.output.publicKey
    }
  }

  # Web app routing block with no DNS zones attached.
  web_app_routing {
    dns_zone_ids = []
  }
}

# Blocks dependent resources until the AKS cluster reports provisioningState
# "Succeeded".
#
# The local-exec command runs on the machine executing Terraform and calls
# `az aks show`, so it needs the Azure CLI there. It re-checks the state every
# 30 seconds and exits with status 1 once 10 waits have elapsed without
# success.
#
# Inside the heredoc only the `${...}` sequences are Terraform interpolations
# (resource group and cluster name). `$retries`, `$(...)` and `$((...))` are
# passed through to the shell unchanged.
resource "null_resource" "wait_for_aks" {
depends_on = [azurerm_kubernetes_cluster.k8s]

provisioner "local-exec" {
command = <<EOT
max_retries=10
retries=0
while [ "$(az aks show --resource-group ${azurerm_resource_group.rg.name} --name ${azurerm_kubernetes_cluster.k8s.name} --query "provisioningState" -o tsv)" != "Succeeded" ]; do
if [ $retries -ge $max_retries ]; then
echo "Max retries exceeded. Exiting..."
exit 1
fi
echo "Waiting for AKS cluster to be fully provisioned... (Attempt: $((retries+1)))"
retries=$((retries+1))
sleep 30
done
EOT
}
}

# Patches the managed cluster so that the "systempool" agent pool carries the
# CriticalAddonsOnly=true:NoSchedule taint. Applied only after the wait step
# has seen the cluster reach the "Succeeded" state.
resource "azapi_update_resource" "k8s-default-node-pool-systempool-taint" {
  depends_on = [null_resource.wait_for_aks]

  type        = "Microsoft.ContainerService/managedClusters@2024-09-02-preview"
  resource_id = azurerm_kubernetes_cluster.k8s.id

  body = jsonencode({
    properties = {
      agentPoolProfiles = [
        {
          name       = "systempool"
          nodeTaints = ["CriticalAddonsOnly=true:NoSchedule"]
        }
      ]
    }
  })
}

# Additional node pool ("cpupool") sized from the ray_node_pool_* variables.
# It is created after the taint has been applied to the system pool.
resource "azurerm_kubernetes_cluster_node_pool" "workload" {
  depends_on = [azapi_update_resource.k8s-default-node-pool-systempool-taint]

  kubernetes_cluster_id = azurerm_kubernetes_cluster.k8s.id
  name                  = "cpupool"
  node_count            = var.ray_node_pool_node_count
  vm_size               = var.ray_node_pool_vm_size
}

# Installs the "kueue" chart into the kueue-system namespace (created on
# demand). Installation starts once the workload node pool exists, and
# `wait = true` makes Terraform wait for the release to finish deploying.
resource "helm_release" "kueue" {
  depends_on = [azurerm_kubernetes_cluster_node_pool.workload]

  name       = "kueue"
  chart      = "kueue"
  repository = var.helm_registry
  version    = var.kueue_version

  namespace        = "kueue-system"
  create_namespace = true

  wait = true
}

# Installs the "kuberay-operator" chart into the kuberay-system namespace
# (created on demand). Installation starts once the workload node pool exists,
# and `wait = true` makes Terraform wait for the release to finish deploying.
resource "helm_release" "kuberay_operator" {
  depends_on = [azurerm_kubernetes_cluster_node_pool.workload]

  name       = "kuberay-operator"
  chart      = "kuberay-operator"
  repository = var.helm_registry
  version    = var.kuberay_operator_version

  namespace        = "kuberay-system"
  create_namespace = true

  wait = true
}
46 changes: 46 additions & 0 deletions examples/ray/aks-classic/outputs.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# --- Identification -------------------------------------------------------

# Name of the resource group that holds the cluster.
output "resource_group_name" {
  value = azurerm_resource_group.rg.name
}

# Name of the AKS cluster.
output "kubernetes_cluster_name" {
  value = azurerm_kubernetes_cluster.k8s.name
}

# --- Cluster access (all marked sensitive) --------------------------------
# Individual fields of the first kube_config entry exported by the cluster.

output "client_certificate" {
  sensitive = true
  value     = azurerm_kubernetes_cluster.k8s.kube_config[0].client_certificate
}

output "client_key" {
  sensitive = true
  value     = azurerm_kubernetes_cluster.k8s.kube_config[0].client_key
}

output "cluster_ca_certificate" {
  sensitive = true
  value     = azurerm_kubernetes_cluster.k8s.kube_config[0].cluster_ca_certificate
}

output "cluster_password" {
  sensitive = true
  value     = azurerm_kubernetes_cluster.k8s.kube_config[0].password
}

output "cluster_username" {
  sensitive = true
  value     = azurerm_kubernetes_cluster.k8s.kube_config[0].username
}

output "host" {
  sensitive = true
  value     = azurerm_kubernetes_cluster.k8s.kube_config[0].host
}

# The complete raw kubeconfig for the cluster.
output "kube_config" {
  sensitive = true
  value     = azurerm_kubernetes_cluster.k8s.kube_config_raw
}

# --- Node pools -----------------------------------------------------------

# Name of the cluster's default node pool.
output "system_node_pool_name" {
  value = azurerm_kubernetes_cluster.k8s.default_node_pool[0].name
}
41 changes: 41 additions & 0 deletions examples/ray/aks-classic/providers.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Terraform and provider version requirements for this configuration.
terraform {
  required_version = ">=1.0"

  required_providers {
    azapi = {
      source  = "azure/azapi"
      version = "~>1.5"
    }
    azurerm = {
      source  = "hashicorp/azurerm"
      version = "~>4.13"
    }
    helm = {
      source  = "hashicorp/helm"
      version = "~>2.12"
    }
    # Declared explicitly because this configuration uses null_resource.
    # Without this entry the provider is still installed implicitly, but
    # with no version constraint.
    null = {
      source  = "hashicorp/null"
      version = "~>3.0"
    }
    random = {
      source  = "hashicorp/random"
      version = "~>3.0"
    }
    time = {
      source  = "hashicorp/time"
      version = "0.9.1"
    }
  }
}

# Azure Resource Manager provider, bound to the subscription supplied through
# var.subscription_id.
provider "azurerm" {
  subscription_id = var.subscription_id

  features {}
}

# Helm provider. It talks to the AKS cluster created in this configuration,
# using the host and the base64-decoded certificate material from the
# cluster's first kube_config entry.
provider "helm" {
  kubernetes {
    host                   = azurerm_kubernetes_cluster.k8s.kube_config[0].host
    cluster_ca_certificate = base64decode(azurerm_kubernetes_cluster.k8s.kube_config[0].cluster_ca_certificate)
    client_certificate     = base64decode(azurerm_kubernetes_cluster.k8s.kube_config[0].client_certificate)
    client_key             = base64decode(azurerm_kubernetes_cluster.k8s.kube_config[0].client_key)
  }
}
25 changes: 25 additions & 0 deletions examples/ray/aks-classic/ssh.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@

# Random name for the SSH public key resource: "ssh" followed by a pet name,
# joined without a separator.
resource "random_pet" "ssh_key_name" {
  prefix    = "ssh"
  separator = ""
}

# SSH public key resource, placed in the resource group.
resource "azapi_resource" "ssh_public_key" {
  type      = "Microsoft.Compute/sshPublicKeys@2022-11-01"
  name      = random_pet.ssh_key_name.id
  parent_id = azurerm_resource_group.rg.id
  location  = azurerm_resource_group.rg.location
}

# Invokes the generateKeyPair action (HTTP POST) on the key resource above.
# Both publicKey and privateKey are requested in the exported response values.
resource "azapi_resource_action" "ssh_public_key_gen" {
  type        = "Microsoft.Compute/sshPublicKeys@2022-11-01"
  resource_id = azapi_resource.ssh_public_key.id
  method      = "POST"
  action      = "generateKeyPair"

  response_export_values = ["publicKey", "privateKey"]
}

# Public half of the generated key pair.
output "key_data" {
  value = azapi_resource_action.ssh_public_key_gen.output.publicKey
}
Loading
Loading