awslabs · KeitaW · Mar 12, 2026
diff --git a/3.test_cases/osmo/AMRNavigation/Dockerfile.cosmos-transfer b/3.test_cases/osmo/AMRNavigation/Dockerfile.cosmos-transfer
@@ -0,0 +1,8 @@
+FROM nvcr.io/nvidia/pytorch:24.05-py3
+# Python packages: boto3 (AWS SDK, presumably for S3 data passing), Pillow and torchvision (image augmentation)
+RUN pip install --no-cache-dir boto3 Pillow torchvision
+# Copy the shared amr_utils directory and the stage 5 domain-augmentation script
+COPY src/amr_utils/ /scripts/amr_utils/
+COPY src/stage5_domain_augment.py /scripts/
+# Containers start in /scripts, where the stage script was copied
+WORKDIR /scripts
diff --git a/3.test_cases/osmo/AMRNavigation/Dockerfile.isaac-sim b/3.test_cases/osmo/AMRNavigation/Dockerfile.isaac-sim
@@ -0,0 +1,16 @@
+FROM nvcr.io/nvidia/isaac-sim:5.1.0
+
+# No S3 dependency is installed here: boto3 is pre-installed in Isaac Sim.
+# Install Pillow (occupancy map visualization) and scipy (dilation) via /isaac-sim/python.sh
+RUN /isaac-sim/python.sh -m pip install --no-cache-dir Pillow scipy
+
+# Copy the shared amr_utils directory and the stage 1-4 pipeline scripts
+COPY src/amr_utils/ /isaac-sim/scripts/amr_utils/
+COPY src/stage1_scene_setup.py /isaac-sim/scripts/
+COPY src/stage2_occupancy_map.py /isaac-sim/scripts/
+COPY src/stage3_trajectory_gen.py /isaac-sim/scripts/
+COPY src/stage4_render.py /isaac-sim/scripts/
+
+WORKDIR /isaac-sim
+
+# No ENTRYPOINT: one image serves stages 1-4, and each stage specifies its own command
diff --git a/3.test_cases/osmo/AMRNavigation/Dockerfile.xmobility b/3.test_cases/osmo/AMRNavigation/Dockerfile.xmobility
@@ -0,0 +1,33 @@
+FROM nvcr.io/nvidia/pytorch:25.01-py3
+
+# Install X-Mobility + lerobot dependencies
+RUN pip install --no-cache-dir \
+    boto3 \
+    diffusers==0.29.2 \
+    einops==0.7.0 \
+    gin-config==0.5.0 \
+    pytorch-lightning==2.5.0.post0 \
+    timm==1.0.14 \
+    transformers==4.48.1 \
+    wandb==0.19.4 \
+    huggingface_hub \
+    torcheval \
+    moviepy \
+    polars \
+    tensorboardX \
+    av jsonlines datasets deepdiff draccus imageio \
+    opencv-python-headless gymnasium flask pyserial \
+    rerun-sdk termcolor cmake
+
+# Pin numpy to base image version (torch compiled against it)
+RUN pip install --no-cache-dir "numpy==1.26.4"
+# Install lerobot itself without dependencies (--no-deps); they are installed explicitly above
+RUN pip install --no-cache-dir lerobot==0.3.3 --no-deps
+# Clone X-Mobility source. Shallow clone: this Dockerfile never uses the git history.
+# NOTE(review): this tracks the default branch head; pin a tag/commit for reproducible builds.
+RUN git clone --depth 1 https://github.com/NVlabs/X-MOBILITY.git /workspace/xmobility
+# Copy the shared amr_utils directory and the stage 6 train/evaluate script
+COPY src/amr_utils/ /scripts/amr_utils/
+COPY src/stage6_train_evaluate.py /scripts/
+# Containers start in the X-Mobility checkout
+WORKDIR /workspace/xmobility
diff --git a/3.test_cases/osmo/AMRNavigation/README.md b/3.test_cases/osmo/AMRNavigation/README.md
@@ -0,0 +1,110 @@
+# Warehouse AMR Navigation Pipeline on NVIDIA OSMO
+
+MobilityGen-style synthetic data generation pipeline for warehouse AMR (Autonomous Mobile Robot) navigation, running on Amazon EKS with NVIDIA OSMO orchestration and KAI Scheduler.
+
+## Overview
+
+A 6-stage AMR pipeline orchestrated as an OSMO DAG: from scene generation through rendering, domain augmentation, and X-Mobility foundation model training.
+
+Stage 6 trains NVIDIA's [X-Mobility](https://github.com/NVlabs/X-MOBILITY) navigation foundation model (~1B params) on MobilityGen-generated datasets, using Karpenter-managed capacity reservations for training compute.
+
+### 6-Stage Pipeline Architecture
+
+```
+scene-setup --> occupancy-map --> trajectory-gen --> render --+--> domain-augment --> train-evaluate
+                                                             |                            ^
+                                                             +----------------------------+
+```
+
+| Stage | Script | Image | GPU Pool | Purpose |
+|-------|--------|-------|----------|---------|
+| 1. Scene Setup | `stage1_scene_setup.py` | isaac-sim-amr | G-series (rendering) | Build warehouse USD scene |
+| 2. Occupancy Map | `stage2_occupancy_map.py` | isaac-sim-amr | G-series (rendering) | 2D occupancy grid from prim geometry |
+| 3. Trajectory Gen | `stage3_trajectory_gen.py` | isaac-sim-amr | G-series (rendering) | A* path planning + camera poses |
+| 4. Render | `stage4_render.py` | isaac-sim-amr | G-series (rendering) | RGB/depth/segmentation rendering |
+| 5. Domain Augment | `stage5_domain_augment.py` | cosmos-transfer-amr | G-series (rendering) | Visual augmentation (torchvision, Cosmos Transfer-compatible) |
+| 6. Train+Eval | `stage6_train_evaluate.py` | xmobility-amr | P-series (training) | X-Mobility foundation model training (8 GPUs) |
+
+**OSMO orchestration features used:**
+- DAG task dependencies via `inputs:`
+- KAI Scheduler assignment via `schedulerName: kai-scheduler`
+- Priority scheduling via `priority: medium`
+- Checkpoint/reschedule semantics via `exitAction: reschedule` on training stage
+- Heterogeneous compute: G-series (rendering) and P-series (training) NodePools
+
+**Data passing**: S3 bucket via IRSA. Path: `s3://<bucket>/amr-pipeline/<run-id>/<stage>/`
+
+## Prerequisites
+
+- Amazon EKS cluster with GPU nodes (G5/G6 for rendering, P-series for training)
+- NVIDIA GPU Operator + KAI Scheduler + OSMO Platform installed
+- Karpenter with 4 OSMO NodePools (osmo-rendering, osmo-gpu-od, osmo-cpu-batch, osmo-cpu-system)
+- [NVIDIA NGC](https://ngc.nvidia.com/) account and API key
+- X-Mobility datasets from [HuggingFace](https://huggingface.co/datasets/nvidia/X-Mobility), pre-cached in S3
+- S3 bucket for inter-stage data + IRSA ServiceAccount
+- Docker, `kubectl`, and AWS CLI configured
+
+## Quick Start
+
+```bash
+# 1. Create the namespace and NGC image pull secret (requires NGC_API_KEY)
+./kubernetes/0.setup-ngc-secret.sh
+
+# 2. Build all 3 images
+./kubernetes/1.build-container.sh
+
+# 3. Verify OSMO is ready
+./kubernetes/4.verify-osmo.sh
+
+# 4. Submit pipeline
+export S3_BUCKET="my-amr-pipeline-bucket"
+./kubernetes/3.submit-pipeline.sh
+```
+
+See [kubernetes/README.md](kubernetes/README.md) for detailed per-stage instructions.
+
+## Configuration
+
+- `configs/default_config.yaml` - Pipeline-level settings
+- `configs/pipeline_config.yaml` - Per-stage pipeline parameters
+
+## Container Images
+
+| Image | Dockerfile | Base | Stages |
+|-------|-----------|------|--------|
+| `isaac-sim-amr` | `Dockerfile.isaac-sim` | Isaac Sim 5.1.0 | 1-4 (scene, occupancy, trajectory, render) |
+| `cosmos-transfer-amr` | `Dockerfile.cosmos-transfer` | PyTorch 24.05 | 5 (domain augmentation) |
+| `xmobility-amr` | `Dockerfile.xmobility` | PyTorch 25.01 + X-Mobility | 6 (X-Mobility foundation model training) |
+
+## S3 Output Structure
+
+```
+s3://<bucket>/amr-pipeline/<run-id>/
+  scene/              # warehouse_scene.usd + metadata.json
+  occupancy/          # occupancy_map.npy + .png + metadata.json
+  trajectories/       # trajectory_XXXX.json files + metadata.json
+  raw-v1/             # rgb/ depth/ semantic_segmentation/
+  augmented-v2/       # rgb/ depth/ semantic_segmentation/
+  xmobility-datasets/ # X-Mobility training data (pre-cached from HuggingFace)
+  checkpoints/        # pretrain/ and train/ checkpoints
+  results/            # metrics.json + final model
+```
+
+## Instance Recommendations
+
+| Instance | GPUs | GPU Memory | vCPUs | RAM | Use |
+|----------|------|-----------|-------|-----|-----|
+| g5.4xlarge | 1 | 24 GB | 16 | 64 GB | Stages 1-5 (rendering, augmentation) |
+| g6.2xlarge | 1 | 24 GB | 8 | 32 GB | Stages 1-5 (latest gen) |
+| p5.48xlarge | 8 | 640 GB | 192 | 2 TB | Stage 6 (X-Mobility training) |
+
+## Troubleshooting
+
+| Issue | Solution |
+|-------|----------|
+| Vulkan ICD not found | Ensure GPU Operator toolkit is enabled (`toolkit.enabled=true`) |
+| OOM kills | Use g5.4xlarge+ (64 GB RAM) for Isaac Sim stages |
+| Shader compilation timeout | Increase `activeDeadlineSeconds` to 3600 on first run |
+| S3 access denied | Verify IRSA ServiceAccount annotation matches IAM role ARN |
+| Stage stuck on download | Check S3 bucket region matches cluster region |
+| Training NaN loss | Reduce learning rate or check augmented data integrity |
diff --git a/3.test_cases/osmo/AMRNavigation/configs/default_config.yaml b/3.test_cases/osmo/AMRNavigation/configs/default_config.yaml
@@ -0,0 +1,43 @@
+# MobilityGen SDG (synthetic data generation) configuration
+scene: "omniverse://localhost/NVIDIA/Assets/Isaac/4.2/Isaac/Environments/Simple_Warehouse/full_warehouse.usd"
+num_trajectories: 5
+num_frames: 100
+image_width: 640
+image_height: 480
+
+# Sensor outputs
+sensors:
+  rgb: true
+  depth: true
+  semantic_segmentation: true
+
+# AMR Navigation Pipeline (6-stage) settings
+# Used by pipeline_config.yaml and 3.submit-pipeline.sh; values duplicate the stageN_* sections of pipeline_config.yaml, so keep both in sync
+pipeline:
+  s3_bucket: "amr-pipeline-data"
+  run_id: "run-001"
+
+  scene_setup:
+    num_aisles: 4
+    aisle_length: 20.0
+
+  occupancy_map:
+    resolution: 0.1
+    depth_threshold: 0.5
+
+  trajectory_gen:
+    num_trajectories: 10
+    num_frames: 100
+    camera_height: 1.0
+
+  render:
+    image_width: 640
+    image_height: 480
+
+  augment:
+    num_variants: 1
+
+  train_evaluate:
+    epochs: 20
+    batch_size: 16
+    learning_rate: 0.0001
diff --git a/3.test_cases/osmo/AMRNavigation/configs/pipeline_config.yaml b/3.test_cases/osmo/AMRNavigation/configs/pipeline_config.yaml
@@ -0,0 +1,37 @@
+# AMR Navigation Pipeline Configuration
+# Override per-stage parameters here
+
+pipeline:
+  s3_bucket: "amr-pipeline-data"
+  run_id: "run-001"
+  namespace: "isaac-sim"
+
+images:
+  isaac_sim: "${ISAAC_SIM_IMAGE_URI}"
+  cosmos_transfer: "${COSMOS_IMAGE_URI}"
+  xmobility: "${XMOBILITY_IMAGE_URI}"
+
+stage1_scene_setup:
+  num_aisles: 4
+  aisle_length: 20.0
+
+stage2_occupancy_map:
+  resolution: 0.1
+  depth_threshold: 0.5
+
+stage3_trajectory_gen:
+  num_trajectories: 10
+  num_frames: 100
+  camera_height: 1.0
+
+stage4_render:
+  image_width: 640
+  image_height: 480
+
+stage5_augment:
+  num_variants: 1
+
+stage6_train_evaluate:
+  epochs: 20
+  batch_size: 16
+  learning_rate: 0.0001
diff --git a/3.test_cases/osmo/AMRNavigation/kubernetes/0.setup-ngc-secret.sh b/3.test_cases/osmo/AMRNavigation/kubernetes/0.setup-ngc-secret.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+# Create (or update) the namespace and NGC image pull secret for Isaac Sim container access
+# in ${NAMESPACE} (default: isaac-sim).
+# Prerequisites:
+#   - kubectl configured for your EKS cluster
+#   - NGC_API_KEY environment variable set (https://ngc.nvidia.com/setup/api-key)
+
+set -euo pipefail
+
+NAMESPACE="${NAMESPACE:-isaac-sim}"
+
+if [[ -z "${NGC_API_KEY:-}" ]]; then
+    echo "ERROR: NGC_API_KEY environment variable is not set." >&2
+    echo "Get your API key from: https://ngc.nvidia.com/setup/api-key" >&2
+    exit 1
+fi
+# 'create --dry-run=client -o yaml | apply' makes both steps safe to re-run (create or update)
+echo "Creating namespace ${NAMESPACE}..."
+kubectl create namespace "${NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f -
+# '$oauthtoken' is the literal NGC username; single quotes keep the shell from expanding it
+echo "Creating NGC image pull secret..."
+kubectl create secret docker-registry ngc-secret \
+    --docker-server=nvcr.io \
+    --docker-username='$oauthtoken' \
+    --docker-password="${NGC_API_KEY}" \
+    -n "${NAMESPACE}" \
+    --dry-run=client -o yaml | kubectl apply -f -
+
+echo "NGC secret created in namespace ${NAMESPACE}."
diff --git a/3.test_cases/osmo/AMRNavigation/kubernetes/1.build-container.sh b/3.test_cases/osmo/AMRNavigation/kubernetes/1.build-container.sh
@@ -0,0 +1,74 @@
+#!/bin/bash
+# Build the three AMR pipeline container images and push them to Amazon ECR
+#
+# Prerequisites:
+#   - Docker installed and running
+#   - AWS CLI configured with ECR push permissions
+#   - NGC_API_KEY set (to pull base images from nvcr.io)
+# Optional overrides: AWS_REGION, IMAGE_TAG, ISAAC_SIM_REPO, COSMOS_REPO, XMOBILITY_REPO
+
+set -euo pipefail
+
+REGION="${AWS_REGION:-us-west-2}"
+ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)
+IMAGE_TAG="${IMAGE_TAG:-latest}"
+REGISTRY="${ACCOUNT_ID}.dkr.ecr.${REGION}.amazonaws.com"
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BUILD_DIR="$(dirname "${SCRIPT_DIR}")"
+
+# Image URIs for all 3 pipeline images
+ISAAC_SIM_REPO="${ISAAC_SIM_REPO:-isaac-sim-amr}"
+COSMOS_REPO="${COSMOS_REPO:-cosmos-transfer-amr}"
+XMOBILITY_REPO="${XMOBILITY_REPO:-xmobility-amr}"
+
+ISAAC_SIM_IMAGE_URI="${REGISTRY}/${ISAAC_SIM_REPO}:${IMAGE_TAG}"
+COSMOS_IMAGE_URI="${REGISTRY}/${COSMOS_REPO}:${IMAGE_TAG}"
+XMOBILITY_IMAGE_URI="${REGISTRY}/${XMOBILITY_REPO}:${IMAGE_TAG}"
+
+# Build image $2 from ${BUILD_DIR}/$3 (build context: ${BUILD_DIR}) and push it; $1 is the log label
+build_and_push() {
+    local label=$1 image_uri=$2 dockerfile=$3
+    printf '\n--- Building %s ---\n' "${label}"
+    docker build -t "${image_uri}" -f "${BUILD_DIR}/${dockerfile}" "${BUILD_DIR}"
+    docker push "${image_uri}"
+}
+
+echo "=== Building AMR Pipeline containers ==="
+echo "  Region:    ${REGION}"
+echo "  Account:   ${ACCOUNT_ID}"
+echo ""
+echo "  Images:"
+echo "    Isaac Sim AMR: ${ISAAC_SIM_IMAGE_URI}"
+echo "    Cosmos:        ${COSMOS_IMAGE_URI}"
+echo "    X-Mobility:    ${XMOBILITY_IMAGE_URI}"
+
+# Authenticate to NGC (base images); warn instead of silently skipping when the key is missing
+if [[ -n "${NGC_API_KEY:-}" ]]; then
+    echo "Logging in to NGC registry..."
+    printf '%s\n' "${NGC_API_KEY}" | docker login nvcr.io --username '$oauthtoken' --password-stdin
+else
+    echo "WARNING: NGC_API_KEY is not set; skipping nvcr.io login (base image pulls may fail)." >&2
+fi
+
+# Authenticate to ECR
+aws ecr get-login-password --region "${REGION}" | \
+    docker login --username AWS --password-stdin "${REGISTRY}"
+
+# Create ECR repos if they don't exist (describe is only a probe, so its JSON output is discarded)
+for repo in "${ISAAC_SIM_REPO}" "${COSMOS_REPO}" "${XMOBILITY_REPO}"; do
+    aws ecr describe-repositories --repository-names "${repo}" --region "${REGION}" >/dev/null 2>&1 || \
+        aws ecr create-repository --repository-name "${repo}" --region "${REGION}"
+done
+
+build_and_push "Isaac Sim AMR image (stages 1-4)" "${ISAAC_SIM_IMAGE_URI}" Dockerfile.isaac-sim
+build_and_push "Cosmos Transfer image (stage 5)" "${COSMOS_IMAGE_URI}" Dockerfile.cosmos-transfer
+build_and_push "X-Mobility image (stage 6)" "${XMOBILITY_IMAGE_URI}" Dockerfile.xmobility
+
+echo ""
+echo "=== All images pushed ==="
+echo ""
+echo "Export for use with submit scripts:"
+echo "  export ISAAC_SIM_IMAGE_URI=${ISAAC_SIM_IMAGE_URI}"
+echo "  export COSMOS_IMAGE_URI=${COSMOS_IMAGE_URI}"
+echo "  export XMOBILITY_IMAGE_URI=${XMOBILITY_IMAGE_URI}"