From a3114720842d5b9cedc6e3cc470a60772ef5a41e Mon Sep 17 00:00:00 2001 From: Anirudh Dagar Date: Fri, 29 May 2026 13:41:57 +0200 Subject: [PATCH 1/3] Add setup docs and ship CFN templates in-package --- .gitignore | 1 - docs/conf.py | 7 + docs/tutorials/autogluon-cloud.md | 164 +----------------- docs/tutorials/index.md | 7 + docs/tutorials/setup.md | 132 ++++++++++++++ setup.py | 21 --- src/autogluon/cloud/cli.py | 5 +- .../cloud/templates/ag_cloud_ray_aws.yaml | 128 ++++++++++++++ .../cloud/templates/ag_cloud_sagemaker.yaml | 58 +++++++ 9 files changed, 342 insertions(+), 181 deletions(-) create mode 100644 docs/tutorials/setup.md create mode 100644 src/autogluon/cloud/templates/ag_cloud_ray_aws.yaml create mode 100644 src/autogluon/cloud/templates/ag_cloud_sagemaker.yaml diff --git a/.gitignore b/.gitignore index 91805c5a..13e25623 100644 --- a/.gitignore +++ b/.gitignore @@ -130,7 +130,6 @@ dmypy.json src/autogluon/cloud/version.py VERSION.minor -src/autogluon/cloud/templates/*.yaml .idea .vscode .DS_Store diff --git a/docs/conf.py b/docs/conf.py index d678eab0..27fe1162 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -16,11 +16,18 @@ "sphinx_togglebutton", # sphinx-togglebutton.readthedocs.io "sphinx.ext.autodoc", # www.sphinx-doc.org/en/master/usage/extensions/autodoc.html "sphinx.ext.autosummary", # www.sphinx-doc.org/en/master/usage/extensions/autosummary.html + "sphinx.ext.extlinks", # www.sphinx-doc.org/en/master/usage/extensions/extlinks.html "sphinx.ext.napoleon", # www.sphinx-doc.org/en/master/usage/extensions/napoleon.html "sphinx.ext.viewcode", # www.sphinx-doc.org/en/master/usage/extensions/viewcode.html "sphinxcontrib.googleanalytics", # github.com/sphinx-contrib/googleanalytics ] +# Pin links to repo files at the current release tag so released docs don't dangle on master. 
+# Usage in markdown: {repo-file}`src/autogluon/cloud/templates/ag_cloud_sagemaker.yaml` +extlinks = { + "repo-file": (f"https://github.com/autogluon/autogluon-cloud/blob/v{release}/%s", "%s"), +} + # See https://myst-parser.readthedocs.io/en/latest/syntax/optional.html myst_enable_extensions = ["colon_fence", "deflist", "dollarmath", "html_image", "substitution"] diff --git a/docs/tutorials/autogluon-cloud.md b/docs/tutorials/autogluon-cloud.md index 057a0496..a61fcea2 100644 --- a/docs/tutorials/autogluon-cloud.md +++ b/docs/tutorials/autogluon-cloud.md @@ -5,29 +5,7 @@ The containers can be used to train models with CPU and GPU instances and deploy We offer the [autogluon.cloud](https://github.com/autogluon/autogluon-cloud) module to utilize those containers and [Amazon SageMaker](https://aws.amazon.com/sagemaker/) underneath to train/deploy AutoGluon backed models with simple APIs. -```{attention} -Costs for running cloud compute are managed by Amazon SageMaker, and storage costs are managed by AWS S3. AutoGluon-Cloud is a wrapper to these services at no additional charge. While AutoGluon-Cloud makes an effort to simplify the usage of these services, it is ultimately the user's responsibility to monitor compute usage within their account to avoid unexpected charges. -``` - - -## Installation -`autogluon.cloud` does not come with the default `autogluon` installation. You can install it via: - -```bash -pip install autogluon.cloud -``` - -Also ensure that the latest version of sagemaker python API is installed via: - -```bash -pip install -U sagemaker -``` - -This is required to ensure the information about newly released containers is available. - -## Prepare an IAM Role with Necessary Permissions -Currently, AutoGluon-Cloud can use two cloud backends: **Amazon SageMaker** and **Ray (AWS)**. -Here is an overview of the features supported by each backend. 
+AutoGluon-Cloud supports two backends: | Feature | SageMaker | Ray (AWS) | |--------------------------------|---------------|--------------| @@ -37,143 +15,13 @@ Here is an overview of the features supported by each backend. | **Inference endpoints** | ✅ | ❌ | | **Batch inference** | ✅ | ❌ | -AutoGluon-Cloud needs to interact with various AWS resources. For this purpose, we recommend to set up a dedicated IAM role with the necessary permissions. This can be done using one of the following options. - -::::{tab-set} -:::{tab-item} CloudFormation (AWS CLI) -:sync: setup-cli -1. Download and review the CloudFormation template from the [AutoGluon-Cloud repository](https://github.com/autogluon/autogluon-cloud/tree/master/cloudformation) - ```bash - BACKEND="sagemaker" # Supported options "sagemaker", "ray_aws" - wget https://raw.githubusercontent.com/autogluon/autogluon-cloud/refs/heads/master/cloudformation/ag_cloud_$BACKEND.yaml - ``` - ```{note} - Make sure you review the IAM policy defined in the CloudFormation template, and make necessary changes according to your use case before applying it. - ``` - -2. Deploy the CloudFormation stack - ```bash - aws cloudformation create-stack \ - --stack-name ag-cloud \ # use your preferred stack name - --template-body file://ag_cloud_$BACKEND.yaml \ - --capabilities CAPABILITY_NAMED_IAM # give permission to create IAM roles - ``` - -3. Review the outputs produced by the stack - ```bash - aws cloudformation describe-stacks --stack-name ag-cloud --query "Stacks[0].Outputs" - ``` - The output should contain the **name of the S3 bucket** and the **ARN of the IAM role** created for AutoGluon-Cloud. 
- ```json - [ - { - "OutputKey": "BucketName", - "OutputValue": "ag-cloud-bucket-abcd1234", - "Description": "S3 bucket where AutoGluon-Cloud will save trained predictors" - }, - { - "OutputKey": "RoleARN", - "OutputValue": "arn:aws:iam::222222222222:role/ag-cloud-execution-role", - "Description": "ARN of the created IAM role for AutoGluon-Cloud to run on SageMaker" - } - ] - ``` - -::: -:::{tab-item} CloudFormation (AWS Console) -:sync: setup-cfn -1. Download and review the CloudFormation template for the backend of your choice from the [AutoGluon-Cloud repository](https://github.com/autogluon/autogluon-cloud/tree/master/cloudformation) - - Template for [SageMaker](https://raw.githubusercontent.com/autogluon/autogluon-cloud/refs/heads/master/cloudformation/ag_cloud_sagemaker.yaml) - - Template for [Ray (AWS)](https://raw.githubusercontent.com/autogluon/autogluon-cloud/refs/heads/master/cloudformation/ag_cloud_ray_aws.yaml) - - ```{note} - Make sure you review the IAM policy defined in the CloudFormation template, and make necessary changes according to your use case before applying it. - ``` - -2. Log in to the AWS Console. Make sure to select the region where you would like to use AutoGluon-Cloud. -4. Go to CloudFormation > Stacks > Create stack and create a stack using the CloudFormation template downloaded in Step 1. - -5. After the stack is created, go to the *Outputs* tab and view the **name of the S3 bucket** and the **ARN of the IAM role** created for AutoGluon-Cloud - ![img](img/stack-outputs.png) - -::: -:::{tab-item} Manual -:sync: setup-manual -1. Create an S3 bucket for AutoGluon-Cloud to store predictors. Replace `S3_BUCKET_NAME` with your preferred name for the bucket. - ```bash - aws s3 mb s3://S3_BUCKET_NAME - ``` - -2. 
Generate trust relationship and IAM policy with our utils via the following command - ```python - from autogluon.cloud import TabularCloudPredictor # Can be other CloudPredictor as well - - TabularCloudPredictor.generate_default_permission( - backend="BACKEND_YOU_WANT", # We currently support "sagemaker" and "ray_aws" - account_id="YOUR_ACCOUNT_ID", # The AWS account ID you plan to use for CloudPredictor. - cloud_output_bucket="S3_BUCKET_NAME" # S3 bucket name where intermediate artifacts will be uploaded and trained models should be saved. You need to create this bucket beforehand. - ) - ``` - ```{note} - Make sure you review the trust relationship and IAM policy files, and make necessary changes according to your use case before applying them. - ``` - In the following steps, make sure to replace `AUTOGLUON-ROLE-NAME` with your desired role name, `AUTOGLUON-POLICY-NAME` with your desired policy name, and `222222222222` with your AWS account number. - -3. Create the IAM role. - ```bash - aws iam create-role --role-name AUTOGLUON-ROLE-NAME --assume-role-policy-document file://ag_cloud_sagemaker_trust_relationship.json - ``` - This method will return the **role ARN** that looks similar to `arn:aws:iam::222222222222:role/AUTOGLUON-ROLE-NAME`. Keep it for further reference. - -4. Create the IAM policy. - ```bash - aws iam create-policy --policy-name AUTOGLUON-POLICY-NAME --policy-document file://ag_cloud_sagemaker_iam_policy.json - ``` - This method will return the **policy ARN** that looks similar to `arn:aws:iam::222222222222:policy/AUTOGLUON-POLICY-NAME`. Keep it for further reference. - -5. Attach the IAM policy to the role. 
- ```bash - aws iam attach-role-policy --role-name AUTOGLUON-ROLE-NAME --policy-arn "arn:aws:iam::222222222222:policy/AUTOGLUON-POLICY-NAME" - ``` -::: -:::: - -Make sure to remember: -- **ARN of the IAM role** created for AutoGluon-Cloud -- **Name of the S3 bucket**, where AutoGluon-Cloud will store the training artifacts - -After completing the setup, assume the IAM role using AWS CLI or boto3. - -::::{tab-set} -:::{tab-item} Python / boto3 -:sync: assume-boto3 -```python -import boto3 - -# Replace this with the ARN of your AutoGluon-Cloud IAM role -ROLE_ARN = "arn:aws:iam::222222222222:role/AUTOGLUON-ROLE-NAME" - -session = boto3.Session() -credentials = session.client("sts").assume_role( - RoleArn=ROLE_ARN, - RoleSessionName="AutoGluonCloudSession" -)["Credentials"] +```{attention} +Costs for running cloud compute are managed by Amazon SageMaker, and storage costs are managed by AWS S3. AutoGluon-Cloud is a wrapper to these services at no additional charge. While AutoGluon-Cloud makes an effort to simplify the usage of these services, it is ultimately the user's responsibility to monitor compute usage within their account to avoid unexpected charges. +``` -boto3.setup_default_session( - aws_access_key_id=credentials["AccessKeyId"], - aws_secret_access_key=credentials["SecretAccessKey"], - aws_session_token=credentials["SessionToken"], -) +```{note} +This tutorial assumes you have already set up AutoGluon-Cloud on AWS. If you haven't, see [Setup](setup.md) first. ``` -Now when you use `autogluon.cloud` in the same Python script / Jupyter notebook, the correct IAM role will be used. -::: -:::{tab-item} AWS CLI -:sync: assume-cli -See section "Assume the IAM role" in this [tutorial](https://repost.aws/knowledge-center/iam-assume-role-cli). -::: -:::: - -For more details on setting up IAM roles and policies, refer to this [tutorial](https://aws.amazon.com/premiumsupport/knowledge-center/iam-assume-role-cli/). 
## Training Using `autogluon.cloud` to train AutoGluon backed models is simple and not too much different from training an AutoGluon predictor directly. diff --git a/docs/tutorials/index.md b/docs/tutorials/index.md index eff892f6..01ed1845 100644 --- a/docs/tutorials/index.md +++ b/docs/tutorials/index.md @@ -3,6 +3,12 @@ ::::{grid} 2 :gutter: 3 +:::{grid-item-card} Setup + :link: setup.html + + Set up AutoGluon-Cloud on AWS in one command with `bootstrap`, or register existing AWS resources you already have. +::: + :::{grid-item-card} AutoGluon-Cloud :link: autogluon-cloud.html @@ -21,6 +27,7 @@ maxdepth: 2 hidden: true --- +Setup Essentials Foundation Models Image Modality diff --git a/docs/tutorials/setup.md b/docs/tutorials/setup.md new file mode 100644 index 00000000..8b060520 --- /dev/null +++ b/docs/tutorials/setup.md @@ -0,0 +1,132 @@ +# Set Up AutoGluon-Cloud on AWS + +AutoGluon-Cloud needs two AWS resources to operate: + +- An **IAM role** that SageMaker assumes to run training and inference jobs. +- An **S3 bucket** where training artifacts and trained models are stored. + +The fastest way to set both up is the `autogluon-cloud bootstrap` command shipped with the package. If you already have a role and bucket, use `register` instead. This page walks through both paths and the day-2 commands (`status`, `teardown`). + +## Install + +```bash +pip install -U autogluon.cloud +``` + +This installs the `autogluon-cloud` CLI alongside the Python API. 
+ + +## Quickstart: `bootstrap` + +If you have AWS credentials configured (via `aws configure`, `AWS_*` env vars, SSO, or an instance profile), run: + +::::{tab-set} +:::{tab-item} CLI +:sync: setup-cli +```bash +autogluon-cloud bootstrap +``` +::: +:::{tab-item} Python +:sync: setup-py +```python +from autogluon.cloud import bootstrap + +bootstrap() +``` +::: +:::: + +This deploys a CloudFormation stack (`ag-cloud-sagemaker` by default), creates the IAM role and S3 bucket, and saves both to `~/.autogluon/cloud.yaml`. Subsequent `CloudPredictor` calls pick the saved values up automatically. + +```{note} +Review the CloudFormation template before deploying: {repo-file}`src/autogluon/cloud/templates/ag_cloud_sagemaker.yaml`. +``` + + +## Already have a role and bucket? Use `register` + +If your platform team has provisioned an IAM role and S3 bucket for you, skip CloudFormation entirely and just tell AutoGluon-Cloud about them: + +::::{tab-set} +:::{tab-item} CLI +:sync: setup-cli +```bash +autogluon-cloud register \ + --role arn:aws:iam::222222222222:role/MyAutoGluonRole \ + --bucket my-autogluon-bucket \ + --region us-east-1 +``` +::: +:::{tab-item} Python +:sync: setup-py +```python +from autogluon.cloud import register + +register( + role="arn:aws:iam::222222222222:role/MyAutoGluonRole", + bucket="my-autogluon-bucket", + region="us-east-1", +) +``` +::: +:::: + +`register` makes no AWS calls — it only persists the values to `~/.autogluon/cloud.yaml`. The IAM role must trust `sagemaker.amazonaws.com` and have permissions equivalent to AWS's `AmazonSageMakerFullAccess` managed policy plus read/write access to your bucket. 
+ + +## Check your setup: `status` + +Verify the saved resources still exist and are accessible: + +::::{tab-set} +:::{tab-item} CLI +:sync: setup-cli +```bash +autogluon-cloud status +``` +::: +:::{tab-item} Python +:sync: setup-py +```python +from autogluon.cloud import status + +reports = status() +``` +::: +:::: + +Each backend's bucket, role, and (if applicable) CloudFormation stack are checked. `ok` means the resource exists; `ok (unverified ...)` means the caller lacks the IAM permission to verify (the resource is probably fine, but `status` couldn't confirm). + + +## Tear down: `teardown` + +When you're done with AutoGluon-Cloud and want to remove everything it created: + +::::{tab-set} +:::{tab-item} CLI +:sync: setup-cli +```bash +autogluon-cloud teardown +``` +::: +:::{tab-item} Python +:sync: setup-py +```python +from autogluon.cloud import teardown + +teardown() +``` +::: +:::: + +This deletes the CloudFormation stack(s) created by `bootstrap` and removes `~/.autogluon/cloud.yaml`. Backends added via `register` (no stack) only have their config entry removed — your existing role and bucket are left untouched. + +```{warning} +CloudFormation refuses to delete non-empty S3 buckets. If your bucket holds training artifacts you want to discard, empty it first with `aws s3 rm s3://<your-bucket-name> --recursive`. Buckets created by `bootstrap` have versioning enabled, so that command only adds delete markers; you must also remove all object versions and delete markers (for example with the **Empty** action in the S3 console) before the stack can be deleted. +``` + + +## Where the config lives + +`bootstrap` and `register` both write to `~/.autogluon/cloud.yaml`. The file is keyed by backend, so you can have separate entries for different backends side by side. Override the directory with the `AG_CONFIG_DIR` environment variable. diff --git a/setup.py b/setup.py index 21a696d8..30b18b19 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,4 @@ import os -import shutil from setuptools import setup @@ -17,25 +16,6 @@ def create_version_file(*, version): f.write("__version__ = '{}'\n".format(version)) -def sync_templates(): - """Copy canonical CloudFormation templates into the package so they ship in the wheel. 
- - The templates at repo-root `/cloudformation/` are the source of truth (referenced - from docs by public URL). At build time we mirror them under the `autogluon.cloud` - package so `importlib.resources` can locate them at runtime without depending on the - repo layout. - """ - src_dir = "cloudformation" - dst_dir = os.path.join("src", AUTOGLUON, CLOUD, "templates") - if not os.path.isdir(src_dir): - # Building from an sdist that already has the copies — nothing to do. - return - os.makedirs(dst_dir, exist_ok=True) - for filename in os.listdir(src_dir): - if filename.endswith(".yaml"): - shutil.copyfile(os.path.join(src_dir, filename), os.path.join(dst_dir, filename)) - - def update_version(version, use_file_if_exists=True, create_file=False): """ To release a new stable version on PyPi, simply tag the release on github, and the Github CI will automatically publish @@ -165,7 +145,6 @@ def default_setup_args(*, version): if __name__ == "__main__": create_version_file(version=version) - sync_templates() setup_args = default_setup_args(version=version) setup( install_requires=install_requires, diff --git a/src/autogluon/cloud/cli.py b/src/autogluon/cloud/cli.py index 92d57eff..f75f664f 100644 --- a/src/autogluon/cloud/cli.py +++ b/src/autogluon/cloud/cli.py @@ -79,7 +79,10 @@ def bootstrap( default_stack = f"ag-cloud-{backend.replace('_', '-')}" stack_name = Prompt.ask("Stack name", default=default_stack) effective_stack = stack_name - template_url = f"https://github.com/autogluon/autogluon-cloud/blob/master/cloudformation/ag_cloud_{backend}.yaml" + template_url = ( + f"https://github.com/autogluon/autogluon-cloud/blob/master/" + f"src/autogluon/cloud/templates/ag_cloud_{backend}.yaml" + ) _console.print( f"This will use CloudFormation to create AWS resources (IAM roles, S3 bucket, etc.) 
" diff --git a/src/autogluon/cloud/templates/ag_cloud_ray_aws.yaml b/src/autogluon/cloud/templates/ag_cloud_ray_aws.yaml new file mode 100644 index 00000000..64873eb1 --- /dev/null +++ b/src/autogluon/cloud/templates/ag_cloud_ray_aws.yaml @@ -0,0 +1,128 @@ +AWSTemplateFormatVersion: '2010-09-09' +Description: 'CloudFormation template for AutoGluon-Cloud to run on an AWS Ray cluster' + +Resources: + AGCloudRayBucket: + Type: 'AWS::S3::Bucket' + Properties: + BucketName: !Sub + - '${AWS::StackName}-bucket-${suffix}' + - suffix: !Select [0, !Split ['-', !Select [2, !Split ['/', !Ref 'AWS::StackId']]]] + VersioningConfiguration: + Status: Enabled + PublicAccessBlockConfiguration: + BlockPublicAcls: true + BlockPublicPolicy: true + IgnorePublicAcls: true + RestrictPublicBuckets: true + + AGCloudRayExecutionRole: + Type: 'AWS::IAM::Role' + Properties: + RoleName: !Sub '${AWS::StackName}-execution-role' + AssumeRolePolicyDocument: + Version: '2012-10-17' + Statement: + - Effect: Allow + Principal: + Service: ec2.amazonaws.com + AWS: !Sub 'arn:aws:iam::${AWS::AccountId}:root' + Action: 'sts:AssumeRole' + Policies: + - PolicyName: !Sub '${AWS::StackName}-custom-policy' + PolicyDocument: + Version: '2012-10-17' + Statement: + - Effect: Allow + Action: 'ec2:RunInstances' + Resource: + - 'arn:aws:ec2:*::image/ami-*' + - !Sub 'arn:aws:ec2:*:${AWS::AccountId}:instance/*' + - !Sub 'arn:aws:ec2:*:${AWS::AccountId}:network-interface/*' + - !Sub 'arn:aws:ec2:*:${AWS::AccountId}:subnet/*' + - !Sub 'arn:aws:ec2:*:${AWS::AccountId}:key-pair/*' + - !Sub 'arn:aws:ec2:*:${AWS::AccountId}:volume/*' + - !Sub 'arn:aws:ec2:*:${AWS::AccountId}:security-group/*' + + - Effect: Allow + Action: + - ec2:TerminateInstances + - ec2:DeleteTags + - ec2:StartInstances + - ec2:CreateTags + - ec2:StopInstances + Resource: + - !Sub 'arn:aws:ec2:*:${AWS::AccountId}:instance/*' + + - Effect: Allow + Action: + - ec2:Describe* + - ec2:AuthorizeSecurityGroupIngress + Resource: '*' + + - Effect: Allow + Action: 
+ - ec2:CreateSecurityGroup + Resource: + - !Sub 'arn:aws:ec2:*:${AWS::AccountId}:security-group/*' + - !Sub 'arn:aws:ec2:*:${AWS::AccountId}:vpc/*' + + - Effect: Allow + Action: + - ec2:CreateKeyPair + - ec2:DeleteKeyPair + Resource: + - !Sub 'arn:aws:ec2:*:${AWS::AccountId}:key-pair/ag_ray_cluster*' + + - Effect: Allow + Action: + - iam:GetInstanceProfile + - iam:CreateInstanceProfile + - iam:CreateRole + - iam:GetRole + - iam:AttachRolePolicy + - iam:DetachRolePolicy + - iam:AddRoleToInstanceProfile + - iam:PassRole + Resource: '*' + + - Effect: Allow + Action: + - iam:CreatePolicy + - iam:DeletePolicy + Resource: + - !Sub 'arn:aws:iam::${AWS::AccountId}:policy/AGRayClusterPolicy*' + + - Effect: Allow + Action: + - s3:PutObject + - s3:PutObjectAcl + - s3:GetObject + - s3:GetObjectAcl + - s3:AbortMultipartUpload + Resource: + - !Sub 'arn:aws:s3:::${AGCloudRayBucket}/*' + - !Sub 'arn:aws:s3:::${AGCloudRayBucket}' + - 'arn:aws:s3:::*SageMaker*' + - 'arn:aws:s3:::*Sagemaker*' + - 'arn:aws:s3:::*sagemaker*' + + - Effect: Allow + Action: 's3:ListBucket' + Resource: '*' + + - Effect: Allow + Action: + - iam:ListPolicies + - iam:ListEntitiesForPolicy + - iam:ListPolicyVersions + Resource: '*' + +Outputs: + BucketName: + Description: S3 bucket where AutoGluon-Cloud will store data for Ray + Value: !Ref AGCloudRayBucket + + RoleARN: + Description: ARN of the created IAM role for AutoGluon-Cloud with Ray + Value: !GetAtt AGCloudRayExecutionRole.Arn diff --git a/src/autogluon/cloud/templates/ag_cloud_sagemaker.yaml b/src/autogluon/cloud/templates/ag_cloud_sagemaker.yaml new file mode 100644 index 00000000..cfbad129 --- /dev/null +++ b/src/autogluon/cloud/templates/ag_cloud_sagemaker.yaml @@ -0,0 +1,58 @@ +AWSTemplateFormatVersion: '2010-09-09' +Description: 'CloudFormation template for AutoGluon-Cloud to run on SageMaker' + +Resources: + AGCloudSageMakerBucket: + Type: 'AWS::S3::Bucket' + Properties: + BucketName: !Sub + - '${AWS::StackName}-bucket-${suffix}' + - suffix: 
!Select [0, !Split ['-', !Select [2, !Split ['/', !Ref 'AWS::StackId']]]] + VersioningConfiguration: + Status: Enabled + PublicAccessBlockConfiguration: + BlockPublicAcls: true + BlockPublicPolicy: true + IgnorePublicAcls: true + RestrictPublicBuckets: true + + AGCloudSageMakerExecutionRole: + Type: 'AWS::IAM::Role' + Properties: + RoleName: !Sub '${AWS::StackName}-execution-role' + AssumeRolePolicyDocument: + Version: '2012-10-17' + Statement: + - Sid: '' + Effect: Allow + Principal: + Service: sagemaker.amazonaws.com + AWS: !Sub 'arn:aws:iam::${AWS::AccountId}:root' + Action: 'sts:AssumeRole' + ManagedPolicyArns: + - 'arn:aws:iam::aws:policy/AmazonSageMakerFullAccess' + Policies: + - PolicyName: !Sub '${AWS::StackName}-custom-policy' + PolicyDocument: + Version: '2012-10-17' + Statement: + - Sid: S3Access + Effect: Allow + Action: + - s3:GetBucketLocation + - s3:ListBucket + - s3:GetObject + - s3:PutObject + - s3:AbortMultipartUpload + Resource: + - !Sub 'arn:aws:s3:::${AGCloudSageMakerBucket}' + - !Sub 'arn:aws:s3:::${AGCloudSageMakerBucket}/*' + +Outputs: + BucketName: + Description: S3 bucket where AutoGluon-Cloud will save trained predictors + Value: !Ref AGCloudSageMakerBucket + + RoleARN: + Description: ARN of the created IAM role for AutoGluon-Cloud to run on SageMaker + Value: !GetAtt AGCloudSageMakerExecutionRole.Arn From 06c49f3919fc1c8797beb840f25ecc8570a47053 Mon Sep 17 00:00:00 2001 From: Oleksandr Shchur Date: Fri, 29 May 2026 12:37:05 +0000 Subject: [PATCH 2/3] Update setup --- docs/tutorials/setup.md | 50 ++++++++++++++++++++++++++++------------- 1 file changed, 34 insertions(+), 16 deletions(-) diff --git a/docs/tutorials/setup.md b/docs/tutorials/setup.md index 8b060520..cc9374e4 100644 --- a/docs/tutorials/setup.md +++ b/docs/tutorials/setup.md @@ -1,11 +1,29 @@ # Set Up AutoGluon-Cloud on AWS -AutoGluon-Cloud needs two AWS resources to operate: +AutoGluon-Cloud trains and deploys models on AWS SageMaker on your behalf. 
To do that, every `CloudPredictor` or `FoundationModel` you create needs two AWS resources: - An **IAM role** that SageMaker assumes to run training and inference jobs. -- An **S3 bucket** where training artifacts and trained models are stored. +- An **S3 bucket** to stage training data and store trained models. -The fastest way to set both up is the `autogluon-cloud bootstrap` command shipped with the package. If you already have a role and bucket, use `register` instead. This page walks through both paths and the day-2 commands (`status`, `teardown`). +You have two options for supplying them: + +1. **Save them once** to `~/.autogluon/cloud.yaml`, and AutoGluon-Cloud will pick them up automatically on every call. This is the recommended path — set it up with [`bootstrap`](#bootstrap) or [`register`](#register) below. +2. **Pass them explicitly** to each `CloudPredictor` / `FoundationModel`, e.g. `CloudPredictor(role="arn:aws:iam::...", cloud_output_path="s3://my-bucket/...")`. Useful if you need different roles or buckets per call, or if you don't want a config file on disk. + +The rest of this page covers option 1. + +## Commands + +AutoGluon-Cloud ships four commands for managing the saved configuration: + +| Command | What it does | When to use it | +|---|---|---| +| [`bootstrap`](#bootstrap) | Provisions a role and bucket via CloudFormation, then saves them. | First-time setup with no existing AWS resources. | +| [`register`](#register) | Saves an existing role and bucket without provisioning anything. | Your platform team already gave you a role and bucket. | +| [`status`](#status) | Verifies the saved resources still exist and are accessible. | Sanity-check before training, or after IAM/S3 changes. | +| [`teardown`](#teardown) | Deletes resources created by `bootstrap` and the saved config. | Cleanup when you're done with AutoGluon-Cloud. 
| +Each command is available both as a CLI subcommand (`autogluon-cloud <command>`) and as a Python function (`from autogluon.cloud import <command>`). The sections below show both forms. ## Install @@ -16,9 +34,11 @@ pip install -U autogluon.cloud This installs the `autogluon-cloud` CLI alongside the Python API. -## Quickstart: `bootstrap` +## `bootstrap` -If you have AWS credentials configured (via `aws configure`, `AWS_*` env vars, SSO, or an instance profile), run: +Provisions an IAM role and S3 bucket via CloudFormation, then saves them to `~/.autogluon/cloud.yaml`. Use this if you don't already have AWS resources for AutoGluon-Cloud. + +`bootstrap` uses the [standard boto3 credential resolution order](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html#configuring-credentials) to find your AWS credentials, so anything that works for the AWS CLI or boto3 will work here (`aws configure`, `AWS_*` environment variables, an active SSO session, or an instance profile). Run: ::::{tab-set} :::{tab-item} CLI @@ -37,16 +57,16 @@ bootstrap() ::: :::: -This deploys a CloudFormation stack (`ag-cloud-sagemaker` by default), creates the IAM role and S3 bucket, and saves both to `~/.autogluon/cloud.yaml`. Subsequent `CloudPredictor` calls pick the saved values up automatically. +The CloudFormation stack is named `ag-cloud-sagemaker` by default. Subsequent `CloudPredictor` calls pick the saved values up automatically. ```{note} Review the CloudFormation template before deploying: {repo-file}`src/autogluon/cloud/templates/ag_cloud_sagemaker.yaml`. ``` -## Already have a role and bucket? Use `register` +## `register` -If your platform team has provisioned an IAM role and S3 bucket for you, skip CloudFormation entirely and just tell AutoGluon-Cloud about them: +Tells AutoGluon-Cloud to use an IAM role and S3 bucket you already have. Use this when your platform team has provisioned them for you and you want to skip CloudFormation. 
::::{tab-set} :::{tab-item} CLI @@ -72,12 +92,12 @@ register( ::: :::: -`register` makes no AWS calls — it only persists the values to `~/.autogluon/cloud.yaml`. The IAM role must trust `sagemaker.amazonaws.com` and have permissions equivalent to AWS's `AmazonSageMakerFullAccess` managed policy plus read/write access to your bucket. +`register` makes no AWS calls — it only persists the values to `~/.autogluon/cloud.yaml`. The IAM role must trust `sagemaker.amazonaws.com` and have permissions equivalent to AWS's [`AmazonSageMakerFullAccess`](https://docs.aws.amazon.com/aws-managed-policy/latest/reference/AmazonSageMakerFullAccess.html) managed policy plus read/write access to your bucket. -## Check your setup: `status` +## `status` -Verify the saved resources still exist and are accessible: +Verifies that the saved IAM role, S3 bucket, and (if applicable) CloudFormation stack still exist and are accessible. ::::{tab-set} :::{tab-item} CLI @@ -96,12 +116,12 @@ reports = status() ::: :::: -Each backend's bucket, role, and (if applicable) CloudFormation stack are checked. `ok` means the resource exists; `ok (unverified ...)` means the caller lacks the IAM permission to verify (the resource is probably fine, but `status` couldn't confirm). +`ok` means the resource exists; `ok (unverified ...)` means the caller lacks the IAM permission to verify (the resource is probably fine, but `status` couldn't confirm). -## Tear down: `teardown` +## `teardown` -When you're done with AutoGluon-Cloud and want to remove everything it created: +Deletes the CloudFormation stacks created by `bootstrap` and removes `~/.autogluon/cloud.yaml`. Backends added via `register` only have their config entry removed — your existing role and bucket are left untouched. ::::{tab-set} :::{tab-item} CLI @@ -120,8 +140,6 @@ teardown() ::: :::: -This deletes the CloudFormation stack(s) created by `bootstrap` and removes `~/.autogluon/cloud.yaml`. 
Backends added via `register` (no stack) only have their config entry removed — your existing role and bucket are left untouched. - ```{warning} CloudFormation refuses to delete non-empty S3 buckets. If your bucket holds training artifacts you want to discard, empty it first with `aws s3 rm s3://<your-bucket-name> --recursive`. Buckets created by `bootstrap` have versioning enabled, so that command only adds delete markers; you must also remove all object versions and delete markers (for example with the **Empty** action in the S3 console) before the stack can be deleted. ``` From b190b7cdc4941c4b9beb71799d8b25f6af56edce Mon Sep 17 00:00:00 2001 From: Oleksandr Shchur Date: Fri, 29 May 2026 12:40:49 +0000 Subject: [PATCH 3/3] Update essentials tutorial --- docs/tutorials/autogluon-cloud.md | 27 +++++++-------------------- 1 file changed, 7 insertions(+), 20 deletions(-) diff --git a/docs/tutorials/autogluon-cloud.md b/docs/tutorials/autogluon-cloud.md index a61fcea2..e965401b 100644 --- a/docs/tutorials/autogluon-cloud.md +++ b/docs/tutorials/autogluon-cloud.md @@ -1,33 +1,20 @@ -# Train and Deploy AutoGluon Models on Amazon SageMaker with AutoGluon-Cloud +# Train and Deploy AutoGluon Models with AutoGluon-Cloud -To help with AutoGluon models training, AWS developed a set of training and inference [deep learning containers](https://github.com/aws/deep-learning-containers/blob/master/available_images.md#autogluon-training-containers). -The containers can be used to train models with CPU and GPU instances and deployed as a SageMaker endpoint or used as a batch transform job. +AutoGluon-Cloud lets you train, deploy, and run inference with AutoGluon models on AWS using the same APIs you'd use locally. Under the hood, it runs your jobs on [Amazon SageMaker](https://aws.amazon.com/sagemaker/) using AWS's official [AutoGluon deep learning containers](https://github.com/aws/deep-learning-containers/blob/master/available_images.md#autogluon-training-containers) — so you don't manage any infrastructure yourself. 
-We offer the [autogluon.cloud](https://github.com/autogluon/autogluon-cloud) module to utilize those containers and [Amazon SageMaker](https://aws.amazon.com/sagemaker/) underneath to train/deploy AutoGluon backed models with simple APIs. +It supports the `tabular`, `timeseries`, and `multimodal` predictors. The examples below use `TabularCloudPredictor`; the others share the same API. -AutoGluon-Cloud supports two backends: - -| Feature | SageMaker | Ray (AWS) | -|--------------------------------|---------------|--------------| -| **Supported modalities** | `tabular`, `timeseries`, `multimodal` | `tabular` | -| **Training (single instance)** | ✅ | ✅ | -| **Training (distributed)** | ❌ | ✅ | -| **Inference endpoints** | ✅ | ❌ | -| **Batch inference** | ✅ | ❌ | - -```{attention} -Costs for running cloud compute are managed by Amazon SageMaker, and storage costs are managed by AWS S3. AutoGluon-Cloud is a wrapper to these services at no additional charge. While AutoGluon-Cloud makes an effort to simplify the usage of these services, it is ultimately the user's responsibility to monitor compute usage within their account to avoid unexpected charges. +```{note} +This tutorial assumes you've already set up AutoGluon-Cloud on AWS. If you haven't, see [Setup](setup.md) first. ``` -```{note} -This tutorial assumes you have already set up AutoGluon-Cloud on AWS. If you haven't, see [Setup](setup.md) first. +```{attention} +SageMaker compute and S3 storage are billed to your AWS account. AutoGluon-Cloud is a free wrapper, but it's your responsibility to monitor usage to avoid unexpected charges. ``` ## Training Using `autogluon.cloud` to train AutoGluon backed models is simple and not too much different from training an AutoGluon predictor directly. -Currently, `autogluon.cloud` supports training/deploying `tabular`, `multimodal` and `timeseries` predictors. In the example below, we use `TabularCloudPredictor` for demonstration. 
You can substitute it with other `CloudPredictors` easily as they share the same APIs. - ```python from autogluon.cloud import TabularCloudPredictor train_data = "train.csv" # can be a DataFrame as well