From 5e4fdcd6cbfc414199d6736a04fe8cd883a60094 Mon Sep 17 00:00:00 2001 From: Keita Watanabe Date: Fri, 27 Mar 2026 23:17:24 +0000 Subject: [PATCH] Fix ml. prefix and wrong MIG profiles for p6-b200.48xlarge MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The p6-b200.48xlarge key was missing the ml. prefix in both INSTANCE_TYPE_MIG_PROFILES (training) and INSTANCE_MIG_PROFILES (inference), causing MIG validation to always reject B200 instances. The instance type flowing through the system from the Kubernetes node label (node.kubernetes.io/instance-type) is always ml.p6-b200.48xlarge, so the dict lookup never matched. Additionally, the inference constant had the wrong MIG profiles for B200 — it used GB200 values (47gb, 93gb, 186gb) instead of the correct B200 values (45gb, 90gb, 180gb), likely a copy-paste from the ml.p6e-gb200.36xlarge entry. Fixes: - training/constants.py: 'p6-b200.48xlarge' -> 'ml.p6-b200.48xlarge' - inference/constant.py: key prefix + correct B200 profiles - test: update to use ml. prefixed instance type --- src/sagemaker/hyperpod/inference/constant.py | 12 ++++++------ src/sagemaker/hyperpod/training/constants.py | 2 +- .../inference/test_hp_jumpstart_endpoint.py | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/sagemaker/hyperpod/inference/constant.py b/src/sagemaker/hyperpod/inference/constant.py index edf6fa78..f2ddde56 100644 --- a/src/sagemaker/hyperpod/inference/constant.py +++ b/src/sagemaker/hyperpod/inference/constant.py @@ -39,13 +39,13 @@ "mig-4g.71gb", "mig-7g.141gb" ], - "p6-b200.48xlarge": [ + "ml.p6-b200.48xlarge": [ "mig-1g.23gb", - "mig-1g.47gb", - "mig-2g.47gb", - "mig-3g.93gb", - "mig-4g.93gb", - "mig-7g.186gb" + "mig-1g.45gb", + "mig-2g.45gb", + "mig-3g.90gb", + "mig-4g.90gb", + "mig-7g.180gb" ], "ml.p6e-gb200.36xlarge": [ "mig-1g.23gb", diff --git a/src/sagemaker/hyperpod/training/constants.py b/src/sagemaker/hyperpod/training/constants.py index 29f58fa8..0c51ed10 100644 --- a/src/sagemaker/hyperpod/training/constants.py +++ b/src/sagemaker/hyperpod/training/constants.py @@ -131,7 +131,7 @@ 'ml.p5.48xlarge': ['mig-1g.10gb', 'mig-1g.20gb', 'mig-2g.20gb', 'mig-3g.40gb', 'mig-4g.40gb', 'mig-7g.80gb'], 'ml.p5e.48xlarge': ['mig-1g.18gb', 'mig-1g.35gb', 'mig-2g.35gb', 'mig-3g.71gb', 'mig-4g.71gb', 'mig-7g.141gb'], 'ml.p5en.48xlarge': ['mig-1g.18gb', 'mig-1g.35gb', 'mig-2g.35gb', 'mig-3g.71gb', 'mig-4g.71gb', 'mig-7g.141gb'], - 'p6-b200.48xlarge': ['mig-1g.23gb', 'mig-1g.45gb', 'mig-2g.45gb', 'mig-3g.90gb', 'mig-4g.90gb', 'mig-7g.180gb'], + 'ml.p6-b200.48xlarge': ['mig-1g.23gb', 'mig-1g.45gb', 'mig-2g.45gb', 'mig-3g.90gb', 'mig-4g.90gb', 'mig-7g.180gb'], 'ml.p6e-gb200.36xlarge': ['mig-1g.23gb', 'mig-1g.47gb', 'mig-2g.47gb', 'mig-3g.93gb', 'mig-4g.93gb', 'mig-7g.186gb'], 'ml.g7e.2xlarge': ['mig-1g.24gb', 'mig-2g.48gb', 'mig-4g.96gb'], 'ml.g7e.4xlarge': ['mig-1g.24gb', 'mig-2g.48gb', 'mig-4g.96gb'], diff --git a/test/unit_tests/inference/test_hp_jumpstart_endpoint.py b/test/unit_tests/inference/test_hp_jumpstart_endpoint.py index a418dea9..e6079eb9 100644 --- a/test/unit_tests/inference/test_hp_jumpstart_endpoint.py +++ b/test/unit_tests/inference/test_hp_jumpstart_endpoint.py @@ -397,7 +397,7 @@ def test_validate_mig_profile_edge_cases(self): ("ml.p5.48xlarge", "mig-3g.40gb"), ("ml.p5e.48xlarge", "mig-1g.18gb"), ("ml.p5en.48xlarge", "mig-7g.141gb"), - ("p6-b200.48xlarge", "mig-1g.23gb"), + ("ml.p6-b200.48xlarge", "mig-1g.23gb"), ("ml.p6e-gb200.36xlarge", "mig-7g.186gb"), ]